[ { "title": "Improved Modelling of Federated Datasets using Mixtures-of-Dirichlet-Multinomials", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35220", "id": "01M0N8VgfB", "proceeding": "https://proceedings.mlr.press/v235/scott24a.html", "pdf": "https://openreview.net/pdf?id=01M0N8VgfB", "openreview": "https://openreview.net/forum?id=01M0N8VgfB", "author_site": "Jonathan Scott, Aine E Cahill", "tldr": "", "abstract": "In practice, training using federated learning can be orders of magnitude slower than standard centralized training. This severely limits the amount of experimentation and tuning that can be done, making it challenging to obtain good performance on a given task. Server-side proxy data can be used to run training simulations, for instance for hyperparameter tuning. This can greatly speed up the training pipeline by reducing the number of tuning runs to be performed overall on the true clients. However, it is challenging to ensure that these simulations accurately reflect the dynamics of the real federated training. In particular, the proxy data used for simulations often comes as a single centralized dataset without a partition into distinct clients, and partitioning this data in a naive way can lead to simulations that poorly reflect real federated training. In this paper we address the challenge of how to partition centralized data in a way that reflects the statistical heterogeneity of the true federated clients. We propose a fully federated, theoretically justified, algorithm that efficiently learns the distribution of the true clients and observe improved server-side simulations when using the inferred distribution to create simulated clients from the centralized data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonathan Scott;\u00c1ine Cahill", "authorids": "~Jonathan_Scott1;~\u00c1ine_Cahill1", "gender": ";F", "homepage": ";", "dblp": "35/4737;", "google_scholar": ";", "orcid": ";", "linkedin": "jonny-scott-71b245103/;aine-cahill/", "or_profile": "~Jonathan_Scott1;~\u00c1ine_Cahill1", "aff": "Institute of Science and Technology;Apple", "aff_domain": "ist.ac.at;apple.com", "position": "PhD student;Machine learning research engineer", "bibtex": "@inproceedings{\nscott2024improved,\ntitle={Improved Modelling of Federated Datasets using Mixtures-of-Dirichlet-Multinomials},\nauthor={Jonathan Scott and {\\'A}ine Cahill},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=01M0N8VgfB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1282465, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CpvyCUAYMA0J:scholar.google.com/&scioq=Improved+Modelling+of+Federated+Datasets+using+Mixtures-of-Dirichlet-Multinomials&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": "ist.ac.at;apple.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Institute of Science and Technology;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": ";https://www.apple.com", "aff_unique_abbr": ";Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";United States" }, { "title": "One Meta-tuned Transformer is What You Need for Few-shot Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35219", "id": "01ahsMovBx", "proceeding": 
"https://proceedings.mlr.press/v235/yang24ah.html", "pdf": "https://openreview.net/pdf?id=01ahsMovBx", "openreview": "https://openreview.net/forum?id=01ahsMovBx", "author_site": "Xu Yang, Huaxiu Yao, Ying WEI", "tldr": "", "abstract": "Pre-trained vision transformers have revolutionized few-shot image classification, and it has been recently demonstrated that the previous common practice of meta-learning in synergy with these pre-trained transformers still holds significance. In this work, we design a new framework centered exclusively on self-attention, called MetaFormer, which extends the vision transformers beyond patch token interactions to encompass relationships between samples and tasks simultaneously for further advancing their downstream task performance. Leveraging the intrinsical property of ViTs in handling local patch relationships, we propose Masked Sample Attention (MSA) to efficiently embed the sample relationships into the network, where an adaptive mask is attached for enhancing task-specific feature consistency and providing flexibility in switching between few-shot learning setups. To encapsulate task relationships while filtering out background noise, Patch-grained Task Attention (PTA) is designed to maintain a dynamic knowledge pool consolidating diverse patterns from historical tasks. MetaFormer demonstrates coherence and compatibility with off-the-shelf pre-trained vision transformers and shows significant improvements in both inductive and transductive few-shot learning scenarios, outperforming state-of-the-art methods by up to 8.77% and 6.25% on 12 in-domain and 10 cross-domain datasets, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xu Yang;Huaxiu Yao;Ying Wei", "authorids": "~Xu_Yang10;~Huaxiu_Yao1;~Ying_Wei1", "gender": ";M;F", "homepage": ";http://huaxiuyao.mystrikingly.com;https://wei-ying.net/", "dblp": ";197/1635;14/4899-1", "google_scholar": ";A20BZnQAAAAJ;5UpFdKsAAAAJ", "orcid": ";;", "linkedin": ";huaxiuyao/;", "or_profile": "~Xu_Yang10;~Huaxiu_Yao1;~Ying_Wei1", "aff": ";Department of Computer Science, University of North Carolina at Chapel Hill;Nanyang Technological University", "aff_domain": ";cs.unc.edu;ntu.edu.sg", "position": ";Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024one,\ntitle={One Meta-tuned Transformer is What You Need for Few-shot Learning},\nauthor={Xu Yang and Huaxiu Yao and Ying Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=01ahsMovBx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6254580, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10233150446107380038&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";cs.unc.edu;ntu.edu.sg", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of North Carolina at Chapel Hill;Nanyang Technological University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.unc.edu;https://www.ntu.edu.sg", "aff_unique_abbr": "UNC Chapel Hill;NTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Chapel Hill;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Singapore" }, { "title": "SSL4Q: Semi-Supervised Learning of Quantum Data with Application to Quantum State Classification", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/35218", "id": "04Fx1u2BUD", "proceeding": "https://proceedings.mlr.press/v235/tang24i.html", "pdf": "https://openreview.net/pdf?id=04Fx1u2BUD", "openreview": "https://openreview.net/forum?id=04Fx1u2BUD", "author_site": "Yehui Tang, Nianzu Yang, Mabiao Long, Junchi Yan", "tldr": "", "abstract": "The accurate classification of quantum states is crucial for advancing quantum computing, as it allows for the effective analysis and correct functioning of quantum devices by analyzing the statistics of the data from quantum measurements. Traditional supervised methods, which rely on extensive labeled measurement outcomes, are used to categorize unknown quantum states with different properties. However, the labeling process demands computational and memory resources that increase exponentially with the number of qubits. We propose SSL4Q, manage to achieve (for the first time) semi-supervised learning specifically designed for quantum state classification. SSL4Q's architecture is tailored to ensure permutation invariance for unordered quantum measurements and maintain robustness in the face of measurement uncertainties. Our empirical studies encompass simulations on two types of quantum systems: the Heisenberg Model and the Variational Quantum Circuit (VQC) Model, with system size reaching up to 50 qubits. The numerical results demonstrate SSL4Q's superiority over traditional supervised models in scenarios with limited labels, highlighting its potential in efficiently classifying quantum states with reduced computational and resource overhead.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yehui Tang;Nianzu Yang;Mabiao Long;Junchi Yan", "authorids": "~Yehui_Tang3;~Nianzu_Yang1;~Mabiao_Long1;~Junchi_Yan2", "gender": ";M;M;", "homepage": ";https://yangnianzu0515.github.io/;https://github.com/AlbertLong007;", "dblp": ";296/8412.html;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yehui_Tang3;~Nianzu_Yang1;~Mabiao_Long1;~Junchi_Yan2", "aff": ";Shanghai Jiaotong University;Shanghai Jiaotong University;", "aff_domain": ";sjtu.edu.cn;sjtu.edu.cn;", "position": ";PhD student;Undergrad student;", "bibtex": "@inproceedings{\ntang2024sslq,\ntitle={{SSL}4Q: Semi-Supervised Learning of Quantum Data with Application to Quantum State Classification},\nauthor={Yehui Tang and Nianzu Yang and Mabiao Long and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=04Fx1u2BUD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1688067, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18173815587833849468&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";sjtu.edu.cn;sjtu.edu.cn;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "PIVOT: Iterative Visual Prompting Elicits Actionable Knowledge for VLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35217", "id": "051jaf8MQy", "proceeding": "https://proceedings.mlr.press/v235/nasiriany24a.html", "pdf": "https://openreview.net/pdf?id=051jaf8MQy", "openreview": 
"https://openreview.net/forum?id=051jaf8MQy", "author_site": "Soroush Nasiriany, Fei Xia, Wenhao Yu, Ted Xiao, Jacky Liang, Ishita Dasgupta, Annie Xie, Danny Driess, Ayzaan Wahid, Zhuo Xu, Quan Vuong, Tingnan Zhang, Tsang-Wei Lee, Kuang-Huei Lee, Peng Xu, Sean Kirmani, Yuke Zhu, Andy Zeng, Karol Hausman, Nicolas Heess, Chelsea Finn, Sergey Levine, brian ichter", "tldr": "", "abstract": "Vision language models (VLMs) have shown impressive capabilities across a variety of tasks, from logical reasoning to visual understanding. This opens the door to richer interaction with the world, for example robotic control. However, VLMs produce only textual outputs, while robotic control and other spatial tasks require outputting continuous coordinates, actions, or trajectories. How can we enable VLMs to handle such settings without fine-tuning on task-specific data? In this paper, we propose a novel visual prompting approach for VLMs that we call Prompting with Iterative Visual Optimization (PIVOT), which casts tasks as iterative visual question answering. In each iteration, the image is annotated with a visual representation of proposals that the VLM can refer to (e.g., candidate robot actions, localizations, or trajectories). The VLM then selects the best ones for the task. These proposals are iteratively refined, allowing the VLM to eventually zero in on the best available answer. We investigate PIVOT on real-world robotic navigation, real-world manipulation from images, instruction following in simulation, and additional spatial inference tasks such as localization. We find, perhaps surprisingly, that our approach enables zero-shot control of robotic systems without any robot training data, navigation in a variety of environments, and other capabilities. Although current performance is far from perfect, our work highlights potentials and limitations of this new regime and shows a promising approach for Internet-Scale VLMs in robotic and spatial reasoning domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Soroush Nasiriany;Fei Xia;Wenhao Yu;Ted Xiao;Jacky Liang;Ishita Dasgupta;Annie Xie;Danny Driess;Ayzaan Wahid;Zhuo Xu;Quan Vuong;Tingnan Zhang;Tsang-Wei Edward Lee;Kuang-Huei Lee;Peng Xu;Sean Kirmani;Yuke Zhu;Andy Zeng;Karol Hausman;Nicolas Heess;Chelsea Finn;Sergey Levine;brian ichter", "authorids": "~Soroush_Nasiriany1;~Fei_Xia1;~Wenhao_Yu1;~Ted_Xiao1;~Jacky_Liang1;~Ishita_Dasgupta1;~Annie_Xie1;~Danny_Driess1;~Ayzaan_Wahid1;~Zhuo_Xu1;~Quan_Vuong2;~Tingnan_Zhang1;~Tsang-Wei_Edward_Lee1;~Kuang-Huei_Lee1;~Peng_Xu9;~Sean_Kirmani1;~Yuke_Zhu1;~Andy_Zeng3;~Karol_Hausman2;~Nicolas_Heess1;~Chelsea_Finn1;~Sergey_Levine1;~brian_ichter1", "gender": ";M;M;M;M;;;;M;M;M;M;M;M;M;M;M;;F;M;;M;M", "homepage": "http://snasiriany.me/;;https://wenhaoyu.weebly.com/;https://www.tedxiao.me;https://www.jacky.io;;https://cs.stanford.edu/~anniexie/;https://dannydriess.github.io/;https://ayzaan.com;;https://quanvuong.github.io;;;https://kuanghuei.github.io/;;https://kirmani.io/;https://cs.utexas.edu/~yukez/;;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/;;http://andyzeng.github.io/;https://karolhausman.github.io/", "dblp": "185/5645;;;198/0598;;169/6218;215/3608;;;;;https://dblp.uni-trier.de/pers/hd/z/Zhang:Tingnan;236/6317.html;66/11466;;;133/1772;76/9181;131/1783;80/7594;;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy;135/8164", "google_scholar": 
"bBLqsgkAAAAJ;pqP5_PgAAAAJ;1bF2s2kAAAAJ;;K29Sv1EAAAAJ;;;https://scholar.google.de/citations?user=wxnzyjwAAAAJ;;;NSWI3OwAAAAJ;RM2vMNcAAAAJ;;rE7-N30AAAAJ;460NWeQAAAAJ;iyEuK8kAAAAJ;mWGyYMsAAAAJ;79k7bGEAAAAJ;vfPE6hgAAAAJ;8R35rCwAAAAJ;-w5DuHgAAAAJ;q7nFtUcAAAAJ;yy0UFOwAAAAJ", "orcid": ";0000-0003-4343-1444;;;;;;;;;;;;;;;;;;;;;", "linkedin": ";;;;jackyliang42;idasgupta6/;;;;zhuo-xu-joe/;;;;;;skirmani;;;;;;;karolhausman/", "or_profile": "~Soroush_Nasiriany1;~Fei_Xia1;~Wenhao_Yu1;~Ted_Xiao1;~Jacky_Liang1;~Ishita_Dasgupta1;~Annie_Xie1;~Danny_Driess1;~Ayzaan_Wahid1;~Zhuo_Xu1;~Quan_Vuong2;~Tingnan_Zhang1;~Tsang-Wei_Edward_Lee1;~Kuang-Huei_Lee1;~Peng_Xu9;~Sean_Kirmani1;~Yuke_Zhu1;~Nicolas_Heess1;~Chelsea_Finn1;~Sergey_Levine1;~brian_ichter1;~Andy_Zeng1;~Karol_Hausman1", "aff": "University of Texas, Austin;Google;Google;;Google;Google DeepMind;Stanford University;Google;Robotics at Google;Google DeepMind;physical intelligence;Google;;Google;Google;Google DeepMind;Computer Science Department, University of Texas, Austin;Google DeepMind;Google;Google;Google;Google;Google Brain", "aff_domain": "utexas.edu;google.com;google.com;;google.com;deepmind.com;stanford.edu;google.com;google.com;google.com;physicalintelligence.company;google.com;;google.com;google.com;google.com;cs.utexas.edu;google.com;google.com;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;Software Engineer;;Researcher;Researcher;PhD student;Researcher;Software Engineer;Researcher;Researcher;Software Engineer;;Researcher;Researcher;Researcher;Assistant Professor;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nnasiriany2024pivot,\ntitle={{PIVOT}: Iterative Visual Prompting Elicits Actionable Knowledge for {VLM}s},\nauthor={Soroush Nasiriany and Fei Xia and Wenhao Yu and Ted Xiao and Jacky Liang and Ishita Dasgupta and Annie Xie and Danny Driess and Ayzaan Wahid and Zhuo Xu and Quan Vuong and Tingnan Zhang and Tsang-Wei Edward Lee and Kuang-Huei Lee and Peng Xu and Sean Kirmani and Yuke Zhu and Andy Zeng and Karol Hausman and Nicolas Heess and Chelsea Finn and Sergey Levine and brian ichter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=051jaf8MQy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7524357, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 23, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=630371738178080282&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "utexas.edu;google.com;google.com;;google.com;deepmind.com;stanford.edu;google.com;google.com;google.com;physicalintelligence.company;google.com;;google.com;google.com;google.com;cs.utexas.edu;google.com;google.com;google.com;google.com;google.com;google.com", "author_num": 23, "aff_unique_index": "0;1;1;1;1;2;1;1;1;3;1;1;1;1;0;1;1;1;1;1;1", "aff_unique_norm": "University of Texas at Austin;Google;Stanford University;Physical Intelligence", "aff_unique_dep": ";Google;;", "aff_unique_url": "https://www.utexas.edu;https://www.google.com;https://www.stanford.edu;", "aff_unique_abbr": "UT Austin;Google;Stanford;", "aff_campus_unique_index": "0;1;1;1;3;1;1;1;1;1;0;1;1;1;1;1", "aff_campus_unique": "Austin;Mountain View;;Stanford", "aff_country_unique_index": "0;0;0;0;1;0;0;0;1;0;0;0;1;0;1;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom;" }, { "title": "Regularized Q-learning through Robust 
Averaging", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35216", "id": "07f24ya6eX", "proceeding": "https://proceedings.mlr.press/v235/schmitt-forster24a.html", "pdf": "https://openreview.net/pdf?id=07f24ya6eX", "openreview": "https://openreview.net/forum?id=07f24ya6eX", "author_site": "Peter Schmitt-F\u00f6rster, Tobias Sutter", "tldr": "", "abstract": "We propose a new Q-learning variant, called 2RA Q-learning, that addresses some weaknesses of existing Q-learning methods in a principled manner. One such weakness is an underlying estimation bias which cannot be controlled and often results in poor performance. We propose a distributionally robust estimator for the maximum expected value term, which allows us to precisely control the level of estimation bias introduced. The distributionally robust estimator admits a closed-form solution such that the proposed algorithm has a computational cost per iteration comparable to Watkins' Q-learning. For the tabular case, we show that 2RA Q-learning converges to the optimal policy and analyze its asymptotic mean-squared error. Lastly, we conduct numerical experiments for various settings, which corroborate our theoretical findings and indicate that 2RA Q-learning often performs better than existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peter Schmitt-F\u00f6rster;Tobias Sutter", "authorids": "~Peter_Schmitt-F\u00f6rster1;~Tobias_Sutter1", "gender": "M;M", "homepage": "https://www.mlo.uni-konstanz.de/team/peter-schmitt-foerster/;https://sites.google.com/view/suttert/home", "dblp": "378/5588;01/10961", "google_scholar": ";https://scholar.google.ch/citations?user=11gxHJIAAAAJ", "orcid": ";0000-0003-1226-6845", "linkedin": ";", "or_profile": "~Peter_Schmitt-F\u00f6rster1;~Tobias_Sutter1", "aff": "Universit\u00e4t Konstanz;Universit\u00e4t Konstanz", "aff_domain": "uni-konstanz.de;uni-konstanz.de", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nschmitt-f{\\\"o}rster2024regularized,\ntitle={Regularized Q-learning through Robust Averaging},\nauthor={Peter Schmitt-F{\\\"o}rster and Tobias Sutter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=07f24ya6eX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1565032, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mh8cVvpLdBwJ:scholar.google.com/&scioq=Regularized+Q-learning+through+Robust+Averaging&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "uni-konstanz.de;uni-konstanz.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Universit\u00e4t Konstanz", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-konstanz.de", "aff_unique_abbr": "Uni Konstanz", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "ProtoGate: Prototype-based Neural Networks with Global-to-local Feature Selection for Tabular Biomedical Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35215", "id": "07fSWltF6M", "proceeding": "https://proceedings.mlr.press/v235/jiang24c.html", "pdf": "https://openreview.net/pdf?id=07fSWltF6M", "openreview": "https://openreview.net/forum?id=07fSWltF6M", "author_site": "Xiangjian Jiang, Andrei Margeloiu, Nikola Simidjievski, Mateja Jamnik", "tldr": "", 
"abstract": "Tabular biomedical data poses challenges in machine learning because it is often high-dimensional and typically low-sample-size (HDLSS). Previous research has attempted to address these challenges via local feature selection, but existing approaches often fail to achieve optimal performance due to their limitation in identifying globally important features and their susceptibility to the co-adaptation problem. In this paper, we propose ProtoGate, a prototype-based neural model for feature selection on HDLSS data. ProtoGate first selects instance-wise features via adaptively balancing global and local feature selection. Furthermore, ProtoGate employs a non-parametric prototype-based prediction mechanism to tackle the co-adaptation problem, ensuring the feature selection results and predictions are consistent with underlying data clusters. We conduct comprehensive experiments to evaluate the performance and interpretability of ProtoGate on synthetic and real-world datasets. The results show that ProtoGate generally outperforms state-of-the-art methods in prediction accuracy by a clear margin while providing high-fidelity feature selection and explainable predictions. Code is available at https://github.com/SilenceX12138/ProtoGate.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiangjian Jiang;Andrei Margeloiu;Nikola Simidjievski;Mateja Jamnik", "authorids": "~Xiangjian_Jiang1;~Andrei_Margeloiu1;~Nikola_Simidjievski1;~Mateja_Jamnik1", "gender": "M;M;Unspecified;F", "homepage": "https://silencex12138.github.io/;;https://simidjievskin.github.io/;http://www.cl.cam.ac.uk/~mj201", "dblp": "300/4620;280/0265;;41/1392", "google_scholar": "1y8DKBYAAAAJ;35Ygi8wAAAAJ;;d5QiyJkAAAAJ", "orcid": ";;;0000-0003-2772-2532", "linkedin": "xiangjian-jiang-034b1a222/;andreimargeloiu/;;", "or_profile": "~Xiangjian_Jiang1;~Andrei_Margeloiu1;~Nikola_Simidjievski1;~Mateja_Jamnik1", "aff": "University of Cambridge;University of Cambridge;University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk;cam.ac.uk;cam.ac.uk", "position": "PhD student;PhD student;Principal Researcher;Professor in Artificial Intelligence", "bibtex": "@inproceedings{\njiang2024protogate,\ntitle={ProtoGate: Prototype-based Neural Networks with Global-to-local Feature Selection for Tabular Biomedical Data},\nauthor={Xiangjian Jiang and Andrei Margeloiu and Nikola Simidjievski and Mateja Jamnik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=07fSWltF6M}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2578618, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5421269393088521412&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "cam.ac.uk;cam.ac.uk;cam.ac.uk;cam.ac.uk", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Optimal Transport for Structure Learning Under Missing Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35214", "id": "09Robz3Ppy", "proceeding": "https://proceedings.mlr.press/v235/vo24b.html", "pdf": 
"https://openreview.net/pdf?id=09Robz3Ppy", "openreview": "https://openreview.net/forum?id=09Robz3Ppy", "author_site": "Vy Vo, He Zhao, Trung Le, Edwin V. Bonilla, Dinh Phung", "tldr": "", "abstract": "Causal discovery in the presence of missing data introduces a chicken-and-egg dilemma. While the goal is to recover the true causal structure, robust imputation requires considering the dependencies or, preferably, causal relations among variables. Merely filling in missing values with existing imputation methods and subsequently applying structure learning on the complete data is empirically shown to be sub-optimal. To address this problem, we propose a score-based algorithm for learning causal structures from missing data based on optimal transport. This optimal transport viewpoint diverges from existing score-based approaches that are dominantly based on expectation maximization. We formulate structure learning as a density fitting problem, where the goal is to find the causal model that induces a distribution of minimum Wasserstein distance with the observed data distribution. Our framework is shown to recover the true causal graphs more effectively than competing methods in most simulations and real-data settings. Empirical evidence also shows the superior scalability of our approach, along with the flexibility to incorporate any off-the-shelf causal discovery methods for complete data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vy Vo;He Zhao;Trung Le;Edwin V. Bonilla;Dinh Phung", "authorids": "~Vy_Vo2;~He_Zhao1;~Trung_Le2;~Edwin_V._Bonilla1;~Dinh_Phung2", "gender": "F;;M;;", "homepage": "https://isvy08.github.io/;;;;", "dblp": "176/4660;;;;", "google_scholar": "3CpFpFkAAAAJ;;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Vy_Vo2;~He_Zhao1;~Trung_Le2;~Edwin_V._Bonilla1;~Dinh_Phung2", "aff": "Monash University;;Monash University;;", "aff_domain": "monash.edu;;monash.edu;;", "position": "PhD student;;Assistant Professor;;", "bibtex": "@inproceedings{\nvo2024optimal,\ntitle={Optimal Transport for Structure Learning Under Missing Data},\nauthor={Vy Vo and He Zhao and Trung Le and Edwin V. 
Bonilla and Dinh Phung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=09Robz3Ppy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 885946, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15766865138964530265&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "monash.edu;;monash.edu;;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for Instruction Fine-Tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35213", "id": "0AZAjkXhit", "proceeding": "https://proceedings.mlr.press/v235/zhao24b.html", "pdf": "https://openreview.net/pdf?id=0AZAjkXhit", "openreview": "https://openreview.net/forum?id=0AZAjkXhit", "author_site": "Hao Zhao, Maksym Andriushchenko, Francesco Croce, Nicolas Flammarion", "tldr": "", "abstract": "There is a consensus that instruction fine-tuning of LLMs requires high-quality data, but what are they? LIMA (NeurIPS 2023) and AlpaGasus (ICLR 2024) are state-of-the-art methods for selecting such high-quality examples, either via manual curation or using GPT-3.5-Turbo as a quality scorer. We show that the extremely simple baseline of selecting the 1,000 instructions with the longest responses---that intuitively contain more learnable information and are harder to overfit---from standard datasets can consistently outperform these sophisticated methods according to GPT-4 and PaLM-2 as judges, while remaining competitive on the Open LLM benchmarks that test factual knowledge. We demonstrate this for several LLMs (Llama-2-7B, Llama-2-13B, Mistral-7B-v0.1) and datasets (Alpaca-52k, Evol-Instruct-70k). In addition, a lightweight refinement of such long instructions can further improve the abilities of the fine-tuned LLMs, and allows us to obtain competitive results on MT-Bench and the 2nd highest-ranked Llama-2-7B-based model on AlpacaEval 2.0, while training on only 1,000 examples and no extra preference data. We also conduct a thorough analysis of our models to ensure that their enhanced performance is not simply due to GPT-4's preference for longer responses. Overall, our findings suggest that fine-tuning on the longest responses should be the default baseline for any work on instruction fine-tuning.
We provide our code in this GitHub repository.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Zhao;Maksym Andriushchenko;Francesco Croce;Nicolas Flammarion", "authorids": "~Hao_Zhao3;~Maksym_Andriushchenko1;~Francesco_Croce1;~Nicolas_Flammarion1", "gender": "M;M;M;M", "homepage": "https://marcelluszhao.github.io/;https://www.andriushchenko.me/;;", "dblp": ";200/8865;52/4288;164/7417", "google_scholar": "vwWiKP8AAAAJ;ZNtuJYoAAAAJ;https://scholar.google.de/citations?view_op=list_works;", "orcid": ";;;", "linkedin": "hao-zhao-039a761a3/;;;", "or_profile": "~Hao_Zhao3;~Maksym_Andriushchenko1;~Francesco_Croce1;~Nicolas_Flammarion1", "aff": "EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;epfl.ch;epfl.ch;epfl.ch", "position": "MS student;PhD Student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nzhao2024long,\ntitle={Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for Instruction Fine-Tuning},\nauthor={Hao Zhao and Maksym Andriushchenko and Francesco Croce and Nicolas Flammarion},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0AZAjkXhit}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1976864, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17874591835662979790&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 7, "email": "epfl.ch;epfl.ch;epfl.ch;epfl.ch", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Large Scale Dataset Distillation with Domain Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35212", "id": "0FWPKHMCSc", "proceeding": "https://proceedings.mlr.press/v235/loo24a.html", "pdf": "https://openreview.net/pdf?id=0FWPKHMCSc", "openreview": "https://openreview.net/forum?id=0FWPKHMCSc", "author_site": "Noel Loo, Alaa Maalouf, Ramin Hasani, Mathias Lechner, Alexander Amini, Daniela Rus", "tldr": "", "abstract": "Dataset Distillation seeks to summarize a large dataset by generating a reduced set of synthetic samples. While there has been much success at distilling small datasets such as CIFAR-10 on smaller neural architectures, Dataset Distillation methods fail to scale to larger high-resolution datasets and architectures. In this work, we introduce **D**ataset **D**istillation with **D**omain **S**hift (**D3S**), a scalable distillation algorithm, made by reframing the dataset distillation problem as a *domain shift* one. In doing so, we derive a universal bound on the distillation loss, and provide a method for efficiently approximately optimizing it. We achieve state-of-the-art results on Tiny-ImageNet, ImageNet-1k, and ImageNet-21K over a variety of recently proposed baselines, including high cross-architecture generalization. 
Additionally, our ablation studies provide lessons on the importance of validation-time hyperparameters on distillation performance, motivating the need for standardization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Noel Loo;Alaa Maalouf;Ramin Hasani;Mathias Lechner;Alexander Amini;Daniela Rus", "authorids": "~Noel_Loo1;~Alaa_Maalouf1;~Ramin_Hasani1;~Mathias_Lechner1;~Alexander_Amini1;~Daniela_Rus1", "gender": ";M;Unspecified;;F;M", "homepage": "https://yolky.github.io/;;https://mlech26l.github.io/pages/;https://www.mit.edu/~amini;https://www.csail.mit.edu/person/daniela-rus;http://www.raminhasani.com", "dblp": "279/6288;242/8928.html;209/9862;;r/DanielaRus;190/3168", "google_scholar": "vokGv-gAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.at/citations?hl=en;EWB-8-oAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.at/citations?user=YarJF3QAAAAJ", "orcid": ";;;;;0000-0002-9889-5222", "linkedin": "noel-loo-23a2a112b;alaa-maalouf/?originalSubdomain=il;;;;raminhasani/", "or_profile": "~Noel_Loo1;~Alaa_Maalouf1;~Mathias_Lechner1;~Alexander_Amini1;~Daniela_Rus1;~Ramin_M._Hasani1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;;mit.edu;mit.edu;mit.edu", "position": "PhD student;Postdoc;;PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nloo2024large,\ntitle={Large Scale Dataset Distillation with Domain Shift},\nauthor={Noel Loo and Alaa Maalouf and Ramin Hasani and Mathias Lechner and Alexander Amini and Daniela Rus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0FWPKHMCSc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7505093, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13832757112008448634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "email": "mit.edu;mit.edu;;mit.edu;mit.edu;mit.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generalized Sobolev Transport for Probability Measures on a Graph", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35211", "id": "0GC0NG6Orr", "proceeding": "https://proceedings.mlr.press/v235/le24a.html", "pdf": "https://openreview.net/pdf?id=0GC0NG6Orr", "openreview": "https://openreview.net/forum?id=0GC0NG6Orr", "author_site": "Tam Le, Truyen Nguyen, Kenji Fukumizu", "tldr": "", "abstract": "We study the optimal transport (OT) problem for measures supported on a graph metric space. Recently, Le et al. (2022) leverage the graph structure and propose a variant of OT, namely Sobolev transport (ST), which yields a closed-form expression for a fast computation. However, ST is essentially coupled with the $L^p$ geometric structure within its definition which makes it nontrivial to utilize ST for other prior structures. In contrast, the classic OT has the flexibility to adapt to various geometric structures by modifying the underlying cost function. 
An important instance is the Orlicz-Wasserstein (OW), which moves beyond the $L^p$ structure by leveraging the *Orlicz geometric structure*. Compared to the standard $p$-order Wasserstein distance, OW remarkably helps to advance certain machine learning approaches. Nevertheless, OW brings up a new challenge in its computation due to its two-level optimization formulation. In this work, we leverage a specific class of convex functions for Orlicz structure to propose the generalized Sobolev transport (GST). GST encompasses the ST as its special case, and can be utilized for prior structures beyond the $L^p$ geometry. In connection with the OW, we show that one only needs to simply solve a univariate optimization problem to compute the GST, unlike the complex two-level optimization problem in OW. We empirically illustrate that GST is several orders of magnitude faster than OW. Moreover, we provide preliminary evidence of the advantages of GST for document classification and for several tasks in topological data analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tam Le;Truyen Nguyen;Kenji Fukumizu", "authorids": "~Tam_Le2;~Truyen_Nguyen1;~Kenji_Fukumizu1", "gender": "M;M;M", "homepage": "https://tamle-ml.github.io/;https://sites.google.com/site/truyennguyen3/;http://www.ism.ac.jp/~fukumizu/", "dblp": "137/4218;94/9501.html;96/464", "google_scholar": "ZyrRB_8AAAAJ;D6LuRHsAAAAJ;", "orcid": ";;0000-0002-3488-2625", "linkedin": "lttam;;", "or_profile": "~Tam_Le2;~Truyen_Nguyen1;~Kenji_Fukumizu1", "aff": "The Institute of Statistical Mathematics (ISM);Onto Innovation ;The Institute of Statistical Mathematics, Japan, Tokyo Institute of Technology", "aff_domain": "ism.ac.jp;ontoinnovation.com;ism.ac.jp", "position": "Assistant Professor;Senior Research Scientist;Full Professor", "bibtex": "@inproceedings{\nle2024generalized,\ntitle={Generalized Sobolev Transport for Probability Measures on a Graph},\nauthor={Tam Le and Truyen Nguyen and Kenji Fukumizu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0GC0NG6Orr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 834487, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2433687021827499857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ism.ac.jp;ontoinnovation.com;ism.ac.jp", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Institute of Statistical Mathematics;Onto Innovation", "aff_unique_dep": ";", "aff_unique_url": "https://www.ism.ac.jp;https://www.ontoinnovation.com", "aff_unique_abbr": "ISM;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Japan;United States" }, { "title": "OxyGenerator: Reconstructing Global Ocean Deoxygenation Over a Century with Deep Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35210", "id": "0HUInAsdoo", "proceeding": "https://proceedings.mlr.press/v235/lu24n.html", "pdf": "https://openreview.net/pdf?id=0HUInAsdoo", "openreview": "https://openreview.net/forum?id=0HUInAsdoo", "author_site": "Bin Lu, Ze Zhao, Luyu Han, Xiaoying Gan, Yuntao Zhou, Lei Zhou, Luoyi Fu, Xinbing Wang, Chenghu Zhou, Jing Zhang", "tldr": "", "abstract": "Accurately reconstructing the global ocean deoxygenation over a century is crucial for assessing and protecting marine ecosystems.
Existing expert-dominated numerical simulations fail to catch up with the dynamic variation caused by global warming and human activities. Moreover, due to the high cost of data collection, the historical observations are severely sparse, posing a big challenge for precise reconstruction. In this work, we propose OxyGenerator, the first deep learning based model, to reconstruct the global ocean deoxygenation from 1920 to 2023. Specifically, to address the heterogeneity across large temporal and spatial scales, we propose zoning-varying graph message-passing to capture the complex oceanographic correlations between missing values and sparse observations. Additionally, to further calibrate the uncertainty, we incorporate inductive bias from dissolved oxygen (DO) variations and chemical effects. Compared with in-situ DO observations, OxyGenerator significantly outperforms CMIP6 numerical simulations, reducing MAPE by 38.77%, demonstrating a promising potential to understand the \u201cbreathless ocean\u201d in a data-driven manner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bin Lu;Ze Zhao;Luyu Han;Xiaoying Gan;Yuntao Zhou;Lei Zhou;Luoyi Fu;Xinbing Wang;Chenghu Zhou;Jing Zhang", "authorids": "~Bin_Lu2;~Ze_Zhao1;~Luyu_Han1;~Xiaoying_Gan1;~Yuntao_Zhou1;~Lei_Zhou3;~Luoyi_Fu1;~Xinbing_Wang1;~Chenghu_Zhou3;~Jing_Zhang40", "gender": "M;M;;F;;M;F;M;M;M", "homepage": "https://robinlu1209.github.io/;https://zhaodazhuang2333.github.io/;;https://iiot.sjtu.edu.cn/#/xygan;;;http://www.cs.sjtu.edu.cn/~fu-ly/index.html;http://www.cs.sjtu.edu.cn/~wang-xb/;http://www.igsnrr.cas.cn/gkjj/ysfc/ysfc_zhouchenghu/;", "dblp": "48/7036-5;;;13/29.html;;72/5749;;96/1149.html;85/1324.html;", "google_scholar": "HsI7HPoAAAAJ;https://scholar.google.com/citations?view_op=list_works;;VRypOjcAAAAJ;;;https://scholar.google.com.tw/citations?user=xHs9mCUAAAAJ;https://scholar.google.com.tw/citations?user=CT5yZbwAAAAJ;;", "orcid": "0000-0001-6452-7029;0000-0002-1599-3434;0000-0003-2955-7841;0000-0001-5200-1409;0000-0001-9714-5385;;;0000-0002-0357-8356;;0000-0001-5403-5442", "linkedin": ";;;;;;;;;", "or_profile": "~Bin_Lu2;~Ze_Zhao1;~Luyu_Han1;~Xiaoying_Gan1;~Yuntao_Zhou1;~Lei_Zhou3;~Luoyi_Fu1;~Xinbing_Wang1;~Chenghu_Zhou3;~Jing_Zhang40", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;IGSNRR, Chinese Academy of Sciences, Beijing, China;East China Normal University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;cs.sjtu.edu.cn;lreis.ac.cn;ecnu.edu.cn", "position": "PhD student;MS student;Undergrad student;Full Professor;Associate Professor;Full Professor;Associate Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlu2024oxygenerator,\ntitle={OxyGenerator: Reconstructing Global Ocean Deoxygenation Over a Century with Deep Learning},\nauthor={Bin Lu and Ze Zhao and Luyu Han and Xiaoying Gan and Yuntao Zhou and Lei Zhou and Luoyi Fu and Xinbing Wang and Chenghu Zhou and Jing Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0HUInAsdoo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 5, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=12593527277240921570&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;cs.sjtu.edu.cn;lreis.ac.cn;ecnu.edu.cn", "author_num": 10, "aff_unique_index": "0;0;0;0;0;0;0;0;1;2", "aff_unique_norm": "Shanghai Jiao Tong University;Chinese Academy of Sciences;East China Normal University", "aff_unique_dep": ";IGSNRR;", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.cas.cn;http://www.ecnu.edu.cn", "aff_unique_abbr": "SJTU;CAS;ECNU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Boosting Reinforcement Learning with Strongly Delayed Feedback Through Auxiliary Short Delays", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35209", "id": "0IDaPnY5d5", "proceeding": "https://proceedings.mlr.press/v235/wu24af.html", "pdf": "https://openreview.net/pdf?id=0IDaPnY5d5", "openreview": "https://openreview.net/forum?id=0IDaPnY5d5", "author_site": "Qingyuan Wu, Simon Zhan, Yixuan Wang, Yuhui Wang, Chung-Wei Lin, Chen Lv, Qi Zhu, J\u00fcrgen Schmidhuber, Chao Huang", "tldr": "", "abstract": "Reinforcement learning (RL) is challenging in the common case of delays between events and their sensory perceptions. State-of-the-art (SOTA) state augmentation techniques either suffer from state space explosion or performance degeneration in stochastic environments. To address these challenges, we present a novel *Auxiliary-Delayed Reinforcement Learning (AD-RL)* method that leverages auxiliary tasks involving short delays to accelerate RL with long delays, without compromising performance in stochastic environments. Specifically, AD-RL learns a value function for short delays and uses bootstrapping and policy improvement techniques to adjust it for long delays. We theoretically show that this can greatly reduce the sample complexity. On deterministic and stochastic benchmarks, our method significantly outperforms the SOTAs in both sample efficiency and policy performance. 
Code is available at https://github.com/QingyuanWuNothing/AD-RL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qingyuan Wu;Simon Sinong Zhan;Yixuan Wang;Yuhui Wang;Chung-Wei Lin;Chen Lv;Qi Zhu;J\u00fcrgen Schmidhuber;Chao Huang", "authorids": "~Qingyuan_Wu1;~Simon_Sinong_Zhan1;~Yixuan_Wang1;~Yuhui_Wang1;~Chung-Wei_Lin1;~Chen_Lv1;~Qi_Zhu2;~J\u00fcrgen_Schmidhuber1;~Chao_Huang5", "gender": "M;M;M;M;M;M;;M;M", "homepage": ";https://simonzhan.github.io/;https://wangyixu14.github.io/;https://wangyuhuix.github.io/;https://www.csie.ntu.edu.tw/~cwlin/;https://lvchen.wixsite.com/automan;http://zhulab.ece.northwestern.edu/;http://people.idsia.ch/~juergen/;https://chaohuang2018.github.io/main/", "dblp": ";330/3557;44/4317;;;;66/5923-2.html;s/JurgenSchmidhuber;18/4087-15", "google_scholar": "CYfMzb8AAAAJ;uO4dG0wAAAAJ;7qP5C-kAAAAJ;https://scholar.google.com.tw/citations?hl=zh-CN;fh0S7TAAAAAJ;UKVs2CEAAAAJ;TN09YMcAAAAJ;https://scholar.google.ch/citations?user=gLnCTgIAAAAJ;GbY72eIAAAAJ", "orcid": ";;;;;0000-0001-6897-4512;;;0000-0002-9300-1787", "linkedin": ";;;;;chen-lv-7964b590/;;;", "or_profile": "~Qingyuan_Wu1;~Simon_Sinong_Zhan1;~Yixuan_Wang1;~Yuhui_Wang1;~Chung-Wei_Lin1;~Chen_Lv1;~Qi_Zhu2;~J\u00fcrgen_Schmidhuber1;~Chao_Huang5", "aff": "University of Liverpool;Northwestern University;Northwestern University, Northwestern University;King Abdullah University of Science and Technology;National Taiwan University;Nanyang Technological University;Northwestern University;IDSIA;University of Liverpool", "aff_domain": "liverpool.ac.uk;u.northwestern.edu;u.northwestern.edu;kaust.edu.sa;ntu.edu.tw;ntu.edu.sg;northwestern.edu;idsia.ch;liverpool.ac.uk", "position": "PhD student;PhD student;PhD student;Postdoc;Associate Professor;Assistant Professor;Associate Professor;Scientific Director;Assistant Professor", "bibtex": "@inproceedings{\nwu2024boosting,\ntitle={Boosting Reinforcement Learning with Strongly Delayed Feedback Through Auxiliary Short Delays},\nauthor={Qingyuan Wu and Simon Sinong Zhan and Yixuan Wang and Yuhui Wang and Chung-Wei Lin and Chen Lv and Qi Zhu and J{\\\"u}rgen Schmidhuber and Chao Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0IDaPnY5d5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1262013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1774352490525470738&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "liverpool.ac.uk;u.northwestern.edu;u.northwestern.edu;kaust.edu.sa;ntu.edu.tw;ntu.edu.sg;northwestern.edu;idsia.ch;liverpool.ac.uk", "author_num": 9, "aff_unique_index": "0;1;1;2;3;4;1;5;0", "aff_unique_norm": "University of Liverpool;Northwestern University;King Abdullah University of Science and Technology;National Taiwan University;Nanyang Technological University;Institute of Digital Technologies", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.liverpool.ac.uk;https://www.northwestern.edu;https://www.kast.kau.edu.sa;https://www.ntu.edu.tw;https://www.ntu.edu.sg;https://www.idsia.ch", "aff_unique_abbr": "Liv Uni;NU;KAUST;NTU;NTU;IDSIA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;1;1;2;3;4;1;5;0", "aff_country_unique": "United Kingdom;United States;Saudi Arabia;China;Singapore;Switzerland" }, { "title": "PointMC: Multi-instance Point Cloud Registration based on Maximal 
Cliques", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35208", "id": "0JV5WpLQgv", "proceeding": "https://proceedings.mlr.press/v235/wu24k.html", "pdf": "https://openreview.net/pdf?id=0JV5WpLQgv", "openreview": "https://openreview.net/forum?id=0JV5WpLQgv", "author_site": "Yue Wu, Xidao hu, Yongzhe Yuan, Xiaolong Fan, Maoguo Gong, Hao Li, Mingyang Zhang, Qiguang Miao, Wenping Ma", "tldr": "", "abstract": "Multi-instance point cloud registration is the problem of estimating multiple rigid transformations between two point clouds. Existing solutions rely on global spatial consistency of ambiguity and the time-consuming clustering of highdimensional correspondence features, making it difficult to handle registration scenarios where multiple instances overlap. To address these problems, we propose a maximal clique based multiinstance point cloud registration framework called PointMC. The key idea is to search for maximal cliques on the correspondence compatibility graph to estimate multiple transformations, and cluster these transformations into clusters corresponding to different instances to efficiently and accurately estimate all poses. PointMC leverages a correspondence embedding module that relies on local spatial consistency to effectively eliminate outliers, and the extracted discriminative features empower the network to circumvent missed pose detection in scenarios involving multiple overlapping instances. We conduct comprehensive experiments on both synthetic and real-world datasets, and the results show that the proposed PointMC yields remarkable performance improvements.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Wu;Xidao hu;Yongzhe Yuan;Xiaolong Fan;Maoguo Gong;Hao Li;Mingyang Zhang;Qiguang Miao;Wenping Ma", "authorids": "~Yue_Wu14;~Xidao_hu1;~Yongzhe_Yuan2;~Xiaolong_Fan1;~Maoguo_Gong2;~Hao_Li12;~Mingyang_Zhang4;~Qiguang_Miao1;~Wenping_Ma3", "gender": "M;M;M;M;M;M;M;M;F", "homepage": "https://ywuchina.github.io/;https://scholar.google.com/citations?user=IqZR2i4AAAAJ&hl=zh-CN;https://yyzmars.github.io/;;https://scholar.google.com/citations?user=h4PExPwAAAAJ&hl=en;;https://faculty.xidian.edu.cn/LIHAO/zh_CN/index.htm;http://see.xidian.edu.cn/faculty/mggong/;", "dblp": "41/5979-4;;328/7811;17/1250;;03/4610;;80/3438;", "google_scholar": "https://scholar.google.com.hk/citations?user=axaj7H4AAAAJ;IqZR2i4AAAAJ;;yPZmXBQAAAAJ;h4PExPwAAAAJ;2TQfvt8AAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=D-TS1fAAAAAJ;https://scholar.google.com.hk/citations?user=I1pPv1QAAAAJ", "orcid": ";;;0000-0001-8484-2069;;0000-0002-2872-388X;;0000-0002-0415-8556;", "linkedin": ";;;;;;;;", "or_profile": "~Yue_Wu14;~Xidao_hu1;~Yongzhe_Yuan2;~Xiaolong_Fan1;~Mingyang_Zhang4;~Qiguang_Miao1;~Li_Hao3;~Gong_Maoguo1;~wenping_ma1", "aff": "Xidian University;Xi'an University of Electronic Science and Technology;Xidian University;Xidian University;Xidian University;Xidian University;Xidian University;Xidian University;Xidian University", "aff_domain": "xidian.edu;xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;cs.xidian.edu;xidian.edu;xidian.edu;xidian.edu.cn", "position": "Associate Professor;MS student;PhD student;Lecturer;Associate Professor;Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwu2024pointmc,\ntitle={Point{MC}: Multi-instance Point Cloud Registration based on Maximal Cliques},\nauthor={Yue Wu and Xidao hu and Yongzhe Yuan and Xiaolong Fan 
and Maoguo Gong and Hao Li and Mingyang Zhang and Qiguang Miao and Wenping Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0JV5WpLQgv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5834449, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18207780054773060821&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 5, "email": "xidian.edu;xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;cs.xidian.edu;xidian.edu;xidian.edu;xidian.edu.cn", "author_num": 9, "aff_unique_index": "0;1;0;0;0;0;0;0;0", "aff_unique_norm": "Xidian University;Xi'an University of Electronic Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.xidian.edu.cn/;http://www.xidian.edu.cn/", "aff_unique_abbr": "Xidian;Xidian University", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "See More Details: Efficient Image Super-Resolution by Experts Mining", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35207", "id": "0JXGusc7E2", "proceeding": "https://proceedings.mlr.press/v235/zamfir24a.html", "pdf": "https://openreview.net/pdf?id=0JXGusc7E2", "openreview": "https://openreview.net/forum?id=0JXGusc7E2", "author_site": "Eduard Zamfir, Zongwei Wu, Nancy Mehta, Yulun Zhang, Radu Timofte", "tldr": "", "abstract": "Reconstructing high-resolution (HR) images from low-resolution (LR) inputs poses a significant challenge in image super-resolution (SR). While recent approaches have demonstrated the efficacy of intricate operations customized for various objectives, the straightforward stacking of these disparate operations can result in a substantial computational burden, hampering their practical utility. In response, we introduce SeemoRe, an efficient SR model employing expert mining. Our approach strategically incorporates experts at different levels, adopting a collaborative methodology. At the macro scale, our experts address rank-wise and spatial-wise informative features, providing a holistic understanding. Subsequently, the model delves into the subtleties of rank choice by leveraging a mixture of low-rank experts. By tapping into experts specialized in distinct key factors crucial for accurate SR, our model excels in uncovering intricate intra-feature details. 
This collaborative approach is reminiscent of the concept of \"see more\", allowing our model to achieve optimal performance with minimal computational costs in efficient settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eduard Zamfir;Zongwei Wu;Nancy Mehta;Yulun Zhang;Radu Timofte", "authorids": "~Eduard_Zamfir1;~Zongwei_Wu1;~Nancy_Mehta1;~Yulun_Zhang1;~Radu_Timofte1", "gender": "M;F;M;M;M", "homepage": "https://sites.google.com/view/zwwu/accueil;;http://yulunzhang.com/;https://www.informatik.uni-wuerzburg.de/computervision/;https://eduardzamfir.github.io", "dblp": "127/8689;305/3859;166/2763-1.html;24/8616;326/5425", "google_scholar": "3QSALjX498QC;WwdYdlUAAAAJ;ORmLjWoAAAAJ;https://scholar.google.ch/citations?user=u3MwH5kAAAAJ;5-FIWKoAAAAJ", "orcid": ";0000-0002-1249-8577;0000-0002-2288-5079;0000-0002-1478-0402;", "linkedin": ";;yulun-zhang-1116b5b9/;https://ch.linkedin.com/in/radutimofte;eduard-zamfir-167660161/", "or_profile": "~Zongwei_Wu1;~Nancy_Mehta1;~Yulun_Zhang1;~Radu_Timofte1;~Eduard_Sebastian_Zamfir1", "aff": "Bayerische Julius-Maximilians-Universit\u00e4t W\u00fcrzburg;Bayerische Julius-Maximilians-Universit\u00e4t W\u00fcrzburg;Swiss Federal Institute of Technology;Bayerische Julius-Maximilians-Universit\u00e4t W\u00fcrzburg;Bayerische Julius-Maximilians-Universit\u00e4t W\u00fcrzburg", "aff_domain": "uni-wuerzburg.de;uni-wuerzburg.de;ethz.ch;uni-wuerzburg.de;uni-wuerzburg.de", "position": "Researcher;Postdoc;Postdoc;Full Professor;PhD student", "bibtex": "@inproceedings{\nzamfir2024see,\ntitle={See More Details: Efficient Image Super-Resolution by Experts Mining},\nauthor={Eduard Zamfir and Zongwei Wu and Nancy Mehta and Yulun Zhang and Radu Timofte},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0JXGusc7E2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4069620, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5739720995823401338&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uni-wuerzburg.de;uni-wuerzburg.de;ethz.ch;uni-wuerzburg.de;uni-wuerzburg.de", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of W\u00fcrzburg;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-wuerzburg.de;https://www.ethz.ch", "aff_unique_abbr": "JMU;ETH Zurich", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "W\u00fcrzburg;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Germany;Switzerland" }, { "title": "Purify Unlearnable Examples via Rate-Constrained Variational Autoencoders", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35206", "id": "0LBNdbmQCM", "proceeding": "https://proceedings.mlr.press/v235/yu24m.html", "pdf": "https://openreview.net/pdf?id=0LBNdbmQCM", "openreview": "https://openreview.net/forum?id=0LBNdbmQCM", "author_site": "Yi Yu, Yufei Wang, Song Xia, Wenhan Yang, Shijian Lu, Yap-peng Tan, Alex Kot", "tldr": "", "abstract": "Unlearnable examples (UEs) seek to maximize testing error by making subtle modifications to training examples that are correctly labeled. Defenses against these poisoning attacks can be categorized based on whether specific interventions are adopted during training.
The first approach is training-time defense, such as adversarial training, which can mitigate poisoning effects but is computationally intensive. The other approach is pre-training purification, e.g., image short squeezing, which consists of several simple compressions but often encounters challenges in dealing with various UEs. Our work provides a novel disentanglement mechanism to build an efficient pre-training purification method. Firstly, we uncover that rate-constrained variational autoencoders (VAEs) demonstrate a clear tendency to suppress the perturbations in UEs. We subsequently conduct a theoretical analysis for this phenomenon. Building upon these insights, we introduce a disentangling variational autoencoder (D-VAE), capable of disentangling the perturbations with learnable class-wise embeddings. Based on this network, a two-stage purification approach is naturally developed. The first stage focuses on roughly eliminating perturbations, while the second stage produces refined, poison-free results, ensuring effectiveness and robustness across various scenarios. Extensive experiments demonstrate the remarkable performance of our method across CIFAR-10, CIFAR-100, and a 100-class ImageNet-subset. Code is available at https://github.com/yuyi-sd/D-VAE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Yu;Yufei Wang;Song Xia;Wenhan Yang;Shijian Lu;Yap-peng Tan;Alex Kot", "authorids": "~Yi_Yu5;~Yufei_Wang5;~Song_Xia1;~Wenhan_Yang6;~Shijian_Lu1;~Yap-peng_Tan1;~Alex_Kot1", "gender": ";M;M;M;M;M;", "homepage": "https://github.com/yuyi-sd;https://github.com/wyf0912/;;https://flyywh.github.io/;https://personal.ntu.edu.sg/shijian.lu/;https://personal.ntu.edu.sg/eyptan/;https://www.ntu.edu.sg/home/eackot/", "dblp": "99/111-11.html;;;156/2359.html;42/2718;93/4472.html;", "google_scholar": "https://scholar.google.com/citations?hl=en;jLd1l_sAAAAJ;x_CkEE8AAAAJ;S8nAnakAAAAJ;https://scholar.google.com.sg/scholar?hl=en;https://scholar.google.com.sg/citations?user=t9EqYQIAAAAJ;", "orcid": "0000-0003-2730-9553;;0009-0002-1224-470X;;;0000-0002-0645-9109;", "linkedin": "%E7%9B%8A-%E4%BD%99-6b453a229;;;;;;", "or_profile": "~Yi_Yu5;~Yufei_Wang5;~Song_Xia1;~Wenhan_Yang6;~Shijian_Lu1;~Yap-peng_Tan1;~Alex_Kot1", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University;Peng Cheng Laboratory;Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;pcl.ac.cn;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;PhD student;PhD student;Researcher;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyu2024purify,\ntitle={Purify Unlearnable Examples via Rate-Constrained Variational Autoencoders},\nauthor={Yi Yu and Yufei Wang and Song Xia and Wenhan Yang and Shijian Lu and Yap-peng Tan and Alex Kot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0LBNdbmQCM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4331998, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7031633929742760154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;pcl.ac.cn;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "author_num": 7, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "Nanyang Technological
University;Peng Cheng Laboratory", "aff_unique_dep": ";Peng Cheng Laboratory", "aff_unique_url": "https://www.ntu.edu.sg;http://www.pcl.ac.cn", "aff_unique_abbr": "NTU;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "Singapore;China" }, { "title": "Global Reinforcement Learning: Beyond Linear and Convex Rewards via Submodular Semi-gradient Methods", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35205", "id": "0M2tNui8jX", "proceeding": "https://proceedings.mlr.press/v235/de-santi24b.html", "pdf": "https://openreview.net/pdf?id=0M2tNui8jX", "openreview": "https://openreview.net/forum?id=0M2tNui8jX", "author_site": "Riccardo De Santi, Manish Prajapat, Andreas Krause", "tldr": "", "abstract": "In classic Reinforcement Learning (RL), the agent maximizes an additive objective of the visited states, e.g., a value function. Unfortunately, objectives of this type cannot model many real-world applications such as experiment design, exploration, imitation learning, and risk-averse RL to name a few. This is due to the fact that additive objectives disregard interactions between states that are crucial for certain tasks. To tackle this problem, we introduce *Global* RL (GRL), where rewards are *globally* defined over trajectories instead of *locally* over states. Global rewards can capture *negative interactions* among states, e.g., in exploration, via submodularity, *positive interactions*, e.g., synergetic effects, via supermodularity, and mixed interactions via combinations of them. By exploiting ideas from submodular optimization, we propose a novel algorithmic scheme that converts any GRL problem to a sequence of classic RL problems and solves it efficiently with curvature-dependent approximation guarantees.
We also provide hardness of approximation results and empirically demonstrate the effectiveness of our method on several GRL instances.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Riccardo De Santi;Manish Prajapat;Andreas Krause", "authorids": "~Riccardo_De_Santi1;~Manish_Prajapat1;~Andreas_Krause1", "gender": "M;M;M", "homepage": "http://www.riccardodesanti.com;https://www.linkedin.com/in/manish-prajapat-eth/;https://las.inf.ethz.ch/krausea", "dblp": "313/1635;227/2093;87/1831-1.html", "google_scholar": "K7qyOj0AAAAJ;qnobH84AAAAJ;https://scholar.google.ch/citations?user=eDHv58AAAAAJ", "orcid": ";0000-0002-3867-4575;0000-0001-7260-9673", "linkedin": "riccardo-de-santi-426139135/;manish-prajapat-eth/;krausea/", "or_profile": "~Riccardo_De_Santi1;~Manish_Prajapat1;~Andreas_Krause1", "aff": "ETHZ - ETH Zurich;Swiss Federal Institute of Technology;ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nsanti2024global,\ntitle={Global Reinforcement Learning: Beyond Linear and Convex Rewards via Submodular Semi-gradient Methods},\nauthor={Riccardo De Santi and Manish Prajapat and Andreas Krause},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0M2tNui8jX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 697147, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14211822236680593999&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ethz.ch;ethz.ch;ethz.ch", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Autonomous Sparse Mean-CVaR Portfolio Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35204", "id": "0NacraIYrA", "proceeding": "https://proceedings.mlr.press/v235/lin24w.html", "pdf": "https://openreview.net/pdf?id=0NacraIYrA", "openreview": "https://openreview.net/forum?id=0NacraIYrA", "author_site": "Yizun Lin, Yangyu Zhang, Zhao-Rong Lai, Cheng Li", "tldr": "", "abstract": "The $\\ell_0$-constrained mean-CVaR model poses a significant challenge due to its NP-hard nature, typically tackled through combinatorial methods characterized by high computational demands. From a markedly different perspective, we propose an innovative autonomous sparse mean-CVaR portfolio model, capable of approximating the original $\\ell_0$-constrained mean-CVaR model with arbitrary accuracy. The core idea is to convert the $\\ell_0$ constraint into an indicator function and subsequently handle it through a tailed approximation. We then propose a proximal alternating linearized minimization algorithm, coupled with a nested fixed-point proximity algorithm (both convergent), to iteratively solve the model. Autonomy in sparsity refers to retaining a significant portion of assets within the selected asset pool during adjustments in pool size.
Consequently, our framework offers a theoretically guaranteed approximation of the $\\ell_0$-constrained mean-CVaR model, improving computational efficiency while providing a robust asset selection scheme.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yizun Lin;Yangyu Zhang;Zhao-Rong Lai;Cheng Li", "authorids": "linyizun@jnu.edu.cn;~Yangyu_Zhang1;~Zhao-Rong_Lai1;~Cheng_Li24", "gender": ";M;M;F", "homepage": ";;https://cybsec.jnu.edu.cn/2023/1120/c39593a781893/page.htm;https://scholar.google.com/citations?user=ZFroLyIAAAAJ&hl=zh-TW", "dblp": ";;142/3902;16/6465-18", "google_scholar": ";;https://scholar.google.com.hk/citations?user=psPB6TsAAAAJ;ZFroLyIAAAAJ", "orcid": ";0009-0008-6905-5073;;0000-0002-9019-0922", "linkedin": ";;;", "or_profile": "linyizun@jnu.edu.cn;~Yangyu_Zhang1;~Zhao-Rong_Lai1;~Cheng_Li24", "aff": ";South China University of Technology;Jinan University;Jinan University", "aff_domain": ";scut.edu.cn;jnu.edu.cn;jnu.edu.cn", "position": ";MS student;Associate Professor;PhD student", "bibtex": "@inproceedings{\nlin2024autonomous,\ntitle={Autonomous Sparse Mean-{CV}aR Portfolio Optimization},\nauthor={Yizun Lin and Yangyu Zhang and Zhao-Rong Lai and Cheng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0NacraIYrA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 964004, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2838553108429915254&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": ";scut.edu.cn;jnu.edu.cn;jnu.edu.cn", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "South China University of Technology;Jinan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.scut.edu.cn;https://www.jnu.edu.cn", "aff_unique_abbr": "SCUT;JNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Enhancing Size Generalization in Graph Neural Networks through Disentangled Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35203", "id": "0NdU4y9dWC", "proceeding": "https://proceedings.mlr.press/v235/huang24ac.html", "pdf": "https://openreview.net/pdf?id=0NdU4y9dWC", "openreview": "https://openreview.net/forum?id=0NdU4y9dWC", "author_site": "Zheng Huang, Qihui Yang, Dawei Zhou, Yujun Yan", "tldr": "", "abstract": "Although most graph neural networks (GNNs) can operate on graphs of any size, their classification performance often declines on graphs larger than those encountered during training. Existing methods insufficiently address the removal of size information from graph representations, resulting in sub-optimal performance and reliance on backbone models. In response, we propose DISGEN, a novel and model-agnostic framework designed to disentangle size factors from graph representations. DISGEN employs size- and task-invariant augmentations and introduces a decoupling loss that minimizes shared information in hidden representations, with theoretical guarantees for its effectiveness. Our empirical results show that DISGEN outperforms the state-of-the-art models by up to 6% on real-world datasets, underscoring its effectiveness in enhancing the size generalizability of GNNs. 
Our code is available at: https://github.com/GraphmindDartmouth/DISGEN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zheng Huang;Qihui Yang;Dawei Zhou;Yujun Yan", "authorids": "~Zheng_Huang2;~Qihui_Yang1;~Dawei_Zhou1;~Yujun_Yan1", "gender": "M;;M;F", "homepage": "https://scholar.google.com/citations?user=kEdNHHwAAAAJ&hl=en&authuser=4;https://isaacyqh.github.io/;https://sites.google.com/view/dawei-zhou/home?authuser=0;https://sites.google.com/umich.edu/yujunyan/home", "dblp": ";;39/3130-3.html;219/1736", "google_scholar": ";_ye8JacAAAAJ;8dakqOgAAAAJ;5TQUP58AAAAJ", "orcid": ";;0000-0002-7065-2990;0000-0003-3776-4293", "linkedin": ";isaacyqh/;dawei-zhou-31035668/;", "or_profile": "~Zheng_Huang2;~Qihui_Yang1;~Dawei_Zhou1;~Yujun_Yan1", "aff": "Dartmouth College;The Chinese University of Hong Kong;Virginia Polytechnic Institute and State University;Dartmouth College", "aff_domain": "dartmouth.edu;cuhk.edu.hk;vt.edu;dartmouth.edu", "position": "PhD student;Intern;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024enhancing,\ntitle={Enhancing Size Generalization in Graph Neural Networks through Disentangled Representation Learning},\nauthor={Zheng Huang and Qihui Yang and Dawei Zhou and Yujun Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0NdU4y9dWC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1434799, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9518718982339596681&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "dartmouth.edu;cuhk.edu.hk;vt.edu;dartmouth.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Dartmouth College;Chinese University of Hong Kong;Virginia Tech", "aff_unique_dep": ";;", "aff_unique_url": "https://www.dartmouth.edu;https://www.cuhk.edu.hk;https://www.vt.edu", "aff_unique_abbr": "Dartmouth;CUHK;VT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Self-Rewarding Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35202", "id": "0NphYCmgua", "proceeding": "https://proceedings.mlr.press/v235/yuan24d.html", "pdf": "https://openreview.net/pdf?id=0NphYCmgua", "openreview": "https://openreview.net/forum?id=0NphYCmgua", "author_site": "Weizhe Yuan, Richard Yuanzhe Pang, Kyunghyun Cho, Xian Li, Sainbayar Sukhbaatar, Jing Xu, JASON WESTON", "tldr": "", "abstract": "We posit that to achieve superhuman agents, future models require superhuman feedback in order to provide an adequate training signal. Current approaches commonly train reward models from human preferences, which may then be bottlenecked by human performance level, and secondly these reward models require additional human preference data to further improve. In this work, we study Self-Rewarding Language Models, where the language model itself is used via LLM-as-a-Judge prompting to provide its own rewards during training. We show that during Iterative DPO training, not only does instruction following ability improve, but also the ability to provide high-quality rewards to itself.
Fine-tuning Llama 2 70B on three iterations of our approach yields a model that outperforms many existing systems on the AlpacaEval 2.0 leaderboard, including Claude 2, Gemini Pro, and GPT-4 0613. While there is much left still to explore, this work opens the door to the possibility of models that can continually improve in both axes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weizhe Yuan;Richard Yuanzhe Pang;Kyunghyun Cho;Xian Li;Sainbayar Sukhbaatar;Jing Xu;Jason E Weston", "authorids": "~Weizhe_Yuan1;~Richard_Yuanzhe_Pang1;~Kyunghyun_Cho1;~Xian_Li1;~Sainbayar_Sukhbaatar1;~Jing_Xu5;~Jason_E_Weston1", "gender": "F;M;M;;M;F;", "homepage": "http://yyy-apple.github.io/;https://yzpang.me;http://kyunghyuncho.me;;;;", "dblp": "207/1964;250/9059;41/9736;82/1763-3.html;56/10550;;", "google_scholar": "2k5j4eMAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ;v_sIgawAAAAJ;ri1sE34AAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;0000-0001-8289-1852;", "linkedin": "weizhey/;yuanzhe-richard-pang/;;;;jing-xu-818022a1;", "or_profile": "~Weizhe_Yuan1;~Richard_Yuanzhe_Pang1;~Kyunghyun_Cho1;~Xian_Li1;~Sainbayar_Sukhbaatar1;~Jing_Xu5;~Jason_E_Weston1", "aff": "New York University;New York University;Genentech;Facebook AI;Meta AI;FAIR;", "aff_domain": "nyu.edu;nyu.edu;gene.com;fb.com;meta.com;meta.com;", "position": "PhD student;PhD student;Senior Director of Frontier Research;Principal Researcher;Research Scientist;Researcher;", "bibtex": "@inproceedings{\nyuan2024selfrewarding,\ntitle={Self-Rewarding Language Models},\nauthor={Weizhe Yuan and Richard Yuanzhe Pang and Kyunghyun Cho and Xian Li and Sainbayar Sukhbaatar and Jing Xu and Jason E Weston},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0NphYCmgua}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 799842, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "nyu.edu;nyu.edu;gene.com;fb.com;meta.com;meta.com;", "author_num": 7, "aff_unique_index": "0;0;1;2;2;2", "aff_unique_norm": "New York University;Genentech;Meta", "aff_unique_dep": ";;Facebook AI", "aff_unique_url": "https://www.nyu.edu;https://www.genentech.com;https://www.facebook.com", "aff_unique_abbr": "NYU;Genentech;Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "End-to-End Neuro-Symbolic Reinforcement Learning with Textual Explanations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35201", "id": "0P3kaNluGj", "proceeding": "https://proceedings.mlr.press/v235/luo24j.html", "pdf": "https://openreview.net/pdf?id=0P3kaNluGj", "openreview": "https://openreview.net/forum?id=0P3kaNluGj", "author_site": "Lirui Luo, Guoxi Zhang, Hongming Xu, Yaodong Yang, Cong Fang, Qing Li", "tldr": "", "abstract": "Neuro-symbolic reinforcement learning (NS-RL) has emerged as a promising paradigm for explainable decision-making, characterized by the interpretability of symbolic policies. NS-RL entails structured state representations for tasks with visual observations, but previous methods cannot refine the structured states with rewards due to a lack of efficiency. Accessibility also remains an issue, as extensive domain knowledge is required to interpret symbolic policies. 
In this paper, we present a neuro-symbolic framework for jointly learning structured states and symbolic policies, whose key idea is to distill the vision foundation model into an efficient perception module and refine it during policy learning. Moreover, we design a pipeline to prompt GPT-4 to generate textual explanations for the learned policies and decisions, significantly reducing users' cognitive load to understand the symbolic policies. We verify the efficacy of our approach on nine Atari tasks and present GPT-generated explanations for policies and decisions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lirui Luo;Guoxi Zhang;Hongming Xu;Yaodong Yang;Cong Fang;Qing Li", "authorids": "~Lirui_Luo1;~Guoxi_Zhang1;~Hongming_Xu2;~Yaodong_Yang1;~Cong_Fang1;~Qing_Li1", "gender": ";Not Specified;;M;M;M", "homepage": "https://scholar.google.com/citations?hl=en&user=_yXEHHcAAAAJ;https://guoxizhang.com;https://github.com/sbx126;https://www.yangyaodong.com;https://congfang-ml.github.io/;http://liqing-ustc.github.io/", "dblp": ";211/5754;;170/1496-1;140/6568;181/2689-3", "google_scholar": "https://scholar.google.com/citations?hl=en;eCizWkgAAAAJ;;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;N2M9RPoAAAAJ;iwdFZBEAAAAJ", "orcid": ";;;0000-0001-8132-5613;;", "linkedin": ";;https://www.linkedin.cn/incareer/in/hongming-xu-5b333165;yaodong-yang;;", "or_profile": "~Lirui_Luo1;~Guoxi_Zhang1;~Hongming_Xu2;~Yaodong_Yang1;~Cong_Fang1;~Qing_Li1", "aff": "Peking University;;Beijing Institute for General Artificial Intelligence;Peking University;Peking University;Beijing Institute for General Artificial Intelligence (BIGAI)", "aff_domain": "stu.pku.edu.cn;;bigai.ai;pku.edu.cn;pku.edu.cn;bigai.ai", "position": "PhD student;;Researcher;Assistant Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nluo2024endtoend,\ntitle={End-to-End Neuro-Symbolic Reinforcement Learning with Textual Explanations},\nauthor={Lirui Luo and Guoxi Zhang and Hongming Xu and Yaodong Yang and Cong Fang and Qing Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0P3kaNluGj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4500153, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6314330915578266078&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "stu.pku.edu.cn;;bigai.ai;pku.edu.cn;pku.edu.cn;bigai.ai", "author_num": 6, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Peking University;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;http://www.bigaiai.org/", "aff_unique_abbr": "Peking U;BIGAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "SLOG: An Inductive Spectral Graph Neural Network Beyond Polynomial Filter", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35200", "id": "0SrNCSklZx", "proceeding": "https://proceedings.mlr.press/v235/xu24aa.html", "pdf": "https://openreview.net/pdf?id=0SrNCSklZx", "openreview": "https://openreview.net/forum?id=0SrNCSklZx", "author_site": "Haobo Xu, Yuchen Yan, Dingsu Wang, Zhe Xu, Zhichen Zeng, Tarek Abdelzaher, Jiawei Han, Hanghang Tong", "tldr": "", "abstract": "Graph neural networks (GNNs) have exhibited superb power in many graph 
related tasks. Existing GNNs can be categorized into spatial GNNs and spectral GNNs. The spatial GNNs primarily capture the local information around each node, while the spectral GNNs are able to operate on the frequency signals of the entire graph. However, most, if not all, existing spectral GNNs are faced with two limitations: (1) the polynomial limitation that for most spectral GNNs, the expressive power in the spectral domain is limited to polynomial filters; and (2) the transductive limitation that most spectral GNNs can only be applied to the transductive setting on relatively small-scale graphs. In this paper, we propose a novel spectral graph neural network named SLOG to overcome the above two limitations. For the polynomial limitation, SLOG proposes a novel real-valued filter with geometric interpretability, mathematical feasibility and adaptive filtering ability to go beyond polynomial filters. For the transductive limitation, SLOG combines the subgraph sampling technique in spatial GNNs and the signal processing technique in spectral GNNs together to tailor itself to the inductive setting on large-scale graphs. Extensive experimental results on 16 datasets demonstrate the superiority of SLOG in inductive homophilic and heterophilic node classification tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haobo Xu;Yuchen Yan;Dingsu Wang;Zhe Xu;Zhichen Zeng;Tarek F. Abdelzaher;Jiawei Han;Hanghang Tong", "authorids": "~Haobo_Xu2;~Yuchen_Yan1;~Dingsu_Wang1;~Zhe_Xu5;~Zhichen_Zeng1;~Tarek_F._Abdelzaher1;~Jiawei_Han1;~Hanghang_Tong3", "gender": ";;;M;;;M;", "homepage": ";;;https://pricexu.github.io/;https://zhichenz98.github.io/;;http://hanj.cs.illinois.edu/;http://tonghanghang.org", "dblp": ";;;97/3701-7;345/6632-1;;h/JiaweiHan.html;58/1757", "google_scholar": ";;;7IhVDFsAAAAJ;rFdX368AAAAJ;;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;RaINcuUAAAAJ", "orcid": ";;;0000-0002-6675-1398;0000-0002-5534-3401;;0000-0002-3629-2696;0000-0003-4405-3887", "linkedin": ";;;;;;;htong/", "or_profile": "~Haobo_Xu2;~Yuchen_Yan1;~Dingsu_Wang1;~Zhe_Xu5;~Zhichen_Zeng1;~Tarek_F._Abdelzaher1;~Jiawei_Han1;~Hanghang_Tong3", "aff": ";;;University of Illinois, Urbana Champaign;University of Illinois Urbana-Champaign;;University of Illinois at Urbana-Champaign (UIUC);University of Illinois, Urbana Champaign", "aff_domain": ";;;illinois.edu;illinois.edu;;illinois.edu;illinois.edu", "position": ";;;PhD student;PhD student;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nxu2024slog,\ntitle={{SLOG}: An Inductive Spectral Graph Neural Network Beyond Polynomial Filter},\nauthor={Haobo Xu and Yuchen Yan and Dingsu Wang and Zhe Xu and Zhichen Zeng and Tarek F.
Abdelzaher and Jiawei Han and Hanghang Tong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0SrNCSklZx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 761838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2679992641964334351&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;;illinois.edu;illinois.edu;;illinois.edu;illinois.edu", "author_num": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Calibration of Human Pose Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35199", "id": "0THUA66D8Z", "proceeding": "https://proceedings.mlr.press/v235/gu24a.html", "pdf": "https://openreview.net/pdf?id=0THUA66D8Z", "openreview": "https://openreview.net/forum?id=0THUA66D8Z", "author_site": "Kerui Gu, Rongyu Chen, Xuanlong Yu, Angela Yao", "tldr": "", "abstract": "2D human pose estimation predicts keypoint locations and the corresponding confidence. Calibration-wise, the confidence should be aligned with the pose accuracy. Yet existing pose estimation methods tend to estimate confidence with heuristics such as the maximum value of heatmaps. This work shows, through theoretical analysis and empirical verification, a calibration gap in current pose estimation frameworks. Our derivations directly lead to closed-form adjustments in the confidence based on additionally inferred instance size and visibility. Given the black-box nature of deep neural networks, however, it is not possible to close the gap with only closed-form adjustments. We go one step further and propose a Calibrated ConfidenceNet (CCNet) to explicitly learn network-specific adjustments with a confidence prediction branch. 
The proposed CCNet, as a lightweight post-hoc addition, improves the calibration of standard off-the-shelf pose estimation frameworks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kerui Gu;Rongyu Chen;Xuanlong Yu;Angela Yao", "authorids": "~Kerui_Gu1;~Rongyu_Chen1;~Xuanlong_Yu1;~Angela_Yao1", "gender": "M;M;M;", "homepage": "https://www.comp.nus.edu.sg/~keruigu/;https://gloryyrolg.github.io/;https://xuanlong-yu.github.io/;http://www.angelayao.com", "dblp": "315/5511;279/0280;304/8099;64/8484", "google_scholar": "if-RXSEAAAAJ;gP_jm9UAAAAJ;o3Q56qsAAAAJ;https://scholar.google.ch/citations?user=-LJCZMMAAAAJ", "orcid": ";0009-0004-3358-0053;;", "linkedin": ";rongyu-glory-chen-6a3482189/;;", "or_profile": "~Kerui_Gu1;~Rongyu_Chen1;~Xuanlong_Yu1;~Angela_Yao1", "aff": "National University of Singapore;Alibaba Group;Intellindust;National University of Singapore", "aff_domain": "nus.edu.sg;alibaba-inc.com;intellindust.com;nus.edu.sg", "position": "PhD student;Intern;Researcher;Associate Professor", "bibtex": "@inproceedings{\ngu2024on,\ntitle={On the Calibration of Human Pose Estimation},\nauthor={Kerui Gu and Rongyu Chen and Xuanlong Yu and Angela Yao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0THUA66D8Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4110689, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=931565726236933624&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "nus.edu.sg;alibaba-inc.com;intellindust.com;nus.edu.sg", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "National University of Singapore;Alibaba Group;Intellindust", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.alibaba.com;", "aff_unique_abbr": "NUS;Alibaba;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;China;" }, { "title": "On the Role of Edge Dependency in Graph Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35198", "id": "0XDO74NlOd", "proceeding": "https://proceedings.mlr.press/v235/chanpuriya24a.html", "pdf": "https://openreview.net/pdf?id=0XDO74NlOd", "openreview": "https://openreview.net/forum?id=0XDO74NlOd", "author_site": "Sudhanshu Chanpuriya, Cameron Musco, Konstantinos Sotiropoulos, Charalampos Tsourakakis", "tldr": "", "abstract": "We investigate the trade-off between the representation power of graph generative models and model *overlap*, i.e., the degree to which the model generates diverse outputs versus regurgitating its training data. In particular, we delineate a nested hierarchy of graph generative models categorized into three levels of complexity: edge independent, node independent, and arbitrarily dependent models. This hierarchy encapsulates a wide range of prevalent methods. We derive theoretical bounds on the number of triangles and other short-length cycles producible by each level of the hierarchy, finding that more complex dependency structure allows an improved trade-off between representation power and overlap. We provide instances demonstrating the asymptotic optimality of our bounds. Furthermore, we introduce new generative models for each of the three hierarchical levels, leveraging dense subgraph discovery. 
Our evaluation, conducted on real-world datasets, focuses on assessing the output quality and overlap of our proposed models in comparison to other popular models. Our results indicate that our simple, interpretable models provide competitive baselines to popular generative models. Through this investigation, we offer a structured and robust evaluation scheme, thereby facilitating the development of models capable of generating accurate and edge-diverse graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sudhanshu Chanpuriya;Cameron N Musco;Konstantinos Sotiropoulos;Charalampos Tsourakakis", "authorids": "~Sudhanshu_Chanpuriya1;~Cameron_N_Musco1;~Konstantinos_Sotiropoulos1;~Charalampos_Tsourakakis1", "gender": ";M;M;M", "homepage": ";https://people.cs.umass.edu/~cmusco/;http://cs-people.bu.edu/ksotirop/;https://tsourakakis.com/", "dblp": ";149/2327;182/6845;https://dblp.uni-trier.de/pers/hd/t/Tsourakakis:Charalampos_E=", "google_scholar": ";EeYGZCwAAAAJ;https://scholar.google.com/citations?hl=en;IkEXPUEAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Sudhanshu_Chanpuriya1;~Cameron_N_Musco1;~Konstantinos_Sotiropoulos1;~Charalampos_Tsourakakis1", "aff": ";University of Massachusetts, Amherst;Meta Facebook;Boston University", "aff_domain": ";umass.edu;meta.com;bu.edu", "position": ";Assistant Professor;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nchanpuriya2024on,\ntitle={On the Role of Edge Dependency in Graph Generative Models},\nauthor={Sudhanshu Chanpuriya and Cameron N Musco and Konstantinos Sotiropoulos and Charalampos Tsourakakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0XDO74NlOd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 853056, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11830894031227101403&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": ";umass.edu;meta.com;bu.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Massachusetts Amherst;Meta;Boston University", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.umass.edu;https://meta.com;https://www.bu.edu", "aff_unique_abbr": "UMass Amherst;Meta;BU", "aff_campus_unique_index": "0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Towards Modular LLMs by Building and Reusing a Library of LoRAs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35197", "id": "0ZFWfeVsaD", "proceeding": "https://proceedings.mlr.press/v235/ostapenko24a.html", "pdf": "https://openreview.net/pdf?id=0ZFWfeVsaD", "openreview": "https://openreview.net/forum?id=0ZFWfeVsaD", "author_site": "Oleksiy Ostapenko, Zhan Su, Edoardo Ponti, Laurent Charlin, Nicolas Le Roux, Lucas Caccia, Alessandro Sordoni", "tldr": "", "abstract": "Given the increasing number of parameter-efficient adapters of large language models (LLMs), how can we reuse them to improve LLM performance on new tasks? We study how to best build a *library* of adapters given multi-task data and devise techniques for both *zero-shot* and *supervised* task generalization through *routing* in such a library.
We benchmark existing approaches to build this library and introduce model-based clustering, $\\texttt{MBC}$, a method that groups tasks based on the similarity of their adapter parameters, indirectly optimizing for transfer across the multi-task dataset. In order to reuse the library, we present a novel zero-shot routing mechanism, $\\texttt{Arrow}$, which enables dynamic selection of the most relevant adapters for new inputs without the need for retraining. We experiment with several LLMs, such as Phi-2 and Mistral, on a wide array of held-out tasks, verifying that MBC-based adapters and Arrow routing lead to superior generalization to new tasks. Thus, we take steps towards creating modular, adaptable LLMs that can match or outperform traditional joint training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Oleksiy Ostapenko;Zhan Su;Edoardo Ponti;Laurent Charlin;Nicolas Le Roux;Lucas Caccia;Alessandro Sordoni", "authorids": "~Oleksiy_Ostapenko1;~Zhan_Su1;~Edoardo_Ponti1;~Laurent_Charlin1;~Nicolas_Le_Roux2;~Lucas_Caccia1;~Alessandro_Sordoni2", "gender": "M;M;;M;M;M;M", "homepage": ";https://shuishen112.github.io/zhansu.github.io/;https://ducdauge.github.io/;http://www.cs.toronto.edu/~lcharlin/;https://www.cs.mcgill.ca/~lpagec/;;http://nicolas.le-roux.name", "dblp": ";02/6524;178/8829;48/5717;;57/7642;http://dblp.uni-trier.de/pers/hd/r/Roux:Nicolas_Le", "google_scholar": "mqLVUGgAAAAJ;VzEpVpoAAAAJ;https://scholar.google.ca/citations?user=tklL2q0AAAAJ;Cul0g2YAAAAJ;fuvIITUAAAAJ;;https://scholar.google.fr/citations?user=LmKtwk8AAAAJ", "orcid": ";0000-0001-5189-9165;0000-0002-6308-1050;0000-0002-6545-9459;;;", "linkedin": ";;edoardo-maria-ponti/;;;;", "or_profile": "~Oleksiy_Ostapenko1;~Zhan_Su1;~Edoardo_Ponti1;~Laurent_Charlin1;~Lucas_Caccia1;~Alessandro_Sordoni1;~Nicolas_Le_Roux1", "aff": "University of Montreal;University of Copenhagen;NVIDIA;Mila - Quebec Artificial Intelligence Institute;Microsoft;Microsoft;Microsoft", "aff_domain": "umontreal.ca;ku.dk;nvidia.com;mila.quebec;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;PhD student;Researcher;Principal Researcher;Postdoc;Researcher;Researcher", "bibtex": "@inproceedings{\nostapenko2024towards,\ntitle={Towards Modular {LLM}s by Building and Reusing a Library of Lo{RA}s},\nauthor={Oleksiy Ostapenko and Zhan Su and Edoardo Ponti and Laurent Charlin and Nicolas Le Roux and Lucas Caccia and Alessandro Sordoni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0ZFWfeVsaD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 703235, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6091637381885993898&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "umontreal.ca;ku.dk;nvidia.com;mila.quebec;microsoft.com;microsoft.com;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;3;4;4;4", "aff_unique_norm": "University of Montreal;University of Copenhagen;NVIDIA;Quebec Artificial Intelligence Institute;Microsoft", "aff_unique_dep": ";;NVIDIA Corporation;Artificial Intelligence;Microsoft Corporation", "aff_unique_url": "https://www.umontreal.ca;https://www.ku.dk;https://www.nvidia.com;https://mila.quebec;https://www.microsoft.com", "aff_unique_abbr": "UM;UCPH;NVIDIA;Mila;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;2;2;2",
"aff_country_unique": "Canada;Denmark;United States" }, { "title": "TVE: Learning Meta-attribution for Transferable Vision Explainer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35196", "id": "0ZTuy5CrL7", "proceeding": "https://proceedings.mlr.press/v235/wang24j.html", "pdf": "https://openreview.net/pdf?id=0ZTuy5CrL7", "openreview": "https://openreview.net/forum?id=0ZTuy5CrL7", "author_site": "Guanchu (Gary) Wang, Yu-Neng Chuang, Fan Yang, Mengnan Du, Chia-Yuan Chang, Shaochen (Henry) Zhong, Zirui Liu, Zhaozhuo Xu, Kaixiong Zhou, Xuanting Cai, Xia Hu", "tldr": "", "abstract": "Explainable machine learning significantly improves the transparency of deep neural networks. However, existing work is constrained to explaining the behavior of individual model predictions, and lacks the ability to transfer the explanation across various models and tasks. This limitation results in explaining various tasks being time- and resource-consuming. To address this problem, we introduce a **Transferable Vision Explainer** (TVE) that can effectively explain various vision models in downstream tasks. Specifically, the transferability of TVE is realized through a pre-training process on large-scale datasets towards learning the meta-attribution. This meta-attribution leverages the versatility of generic backbone encoders to comprehensively encode the attribution knowledge for the input instance, which enables TVE to seamlessly transfer to explaining various downstream tasks, without the need for training on task-specific data. Empirical studies involve explaining three different architectures of vision models across three diverse downstream datasets. The experiment results indicate TVE is effective in explaining these tasks without the need for additional training on downstream data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guanchu Wang;Yu-Neng Chuang;Fan Yang;Mengnan Du;Chia-Yuan Chang;Shaochen Zhong;Zirui Liu;Zhaozhuo Xu;Kaixiong Zhou;Xuanting Cai;Xia Hu", "authorids": "~Guanchu_Wang1;~Yu-Neng_Chuang1;~Fan_Yang27;~Mengnan_Du1;~Chia-Yuan_Chang3;~Shaochen_Zhong1;~Zirui_Liu1;~Zhaozhuo_Xu2;~Kaixiong_Zhou1;~Xuanting_Cai1;~Xia_Hu4", "gender": "M;M;M;;Not Specified;M;M;;M;M;M", "homepage": "https://guanchuwang.github.io/home;;https://yangfan.sites.wfu.edu/;https://mengnandu.com/;https://z76316.github.io/;https://openreview.net/profile?id=~Shaochen_Zhong1;https://zirui-ray-liu.github.io/;https://ottovonxu.github.io/;https://kaixiong-zhou.github.io/;;https://cs.rice.edu/~xh37/index.html", "dblp": "213/0985;207/7875;;183/5606;03/1382-2.html;326/7286.html;196/8629-1.html;195/4352;178/7315;;256/9406.html", "google_scholar": "_QL5218AAAAJ;;RXFeW-8AAAAJ;0i-Js2gAAAAJ;EO595aMAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;7tDlVAsAAAAJ;zMspIjIAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";;0000-0003-3442-754X;;0009-0001-1889-612X;;;;0000-0001-5226-8736;;", "linkedin": ";ync/;;;chia-yuan-chang/;shaochen-henry-zhong-96a941249/;;;;xuanting-c-093b983a/;", "or_profile": "~Guanchu_Wang1;~Yu-Neng_Chuang1;~Fan_Yang27;~Mengnan_Du1;~Chia-Yuan_Chang3;~Shaochen_Zhong1;~Zirui_Liu1;~Zhaozhuo_Xu2;~Kaixiong_Zhou1;~Xuanting_Cai1;~Xia_Hu2", "aff": "Rice University;Rice University;Wake Forest University;New Jersey Institute of Technology;Texas A&M University - College Station;Rice University;Rice University;Rice University;Massachusetts Institute 
of Technology;Meta Facebook;Rice University", "aff_domain": "rice.edu;rice.edu;wfu.edu;njit.edu;tamu.edu;rice.edu;rice.edu;rice.edu;mit.edu;facebook.com;rice.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;PhD student;PhD student;PhD student;PhD student;Postdoc;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nwang2024tve,\ntitle={{TVE}: Learning Meta-attribution for Transferable Vision Explainer},\nauthor={Guanchu Wang and Yu-Neng Chuang and Fan Yang and Mengnan Du and Chia-Yuan Chang and Shaochen Zhong and Zirui Liu and Zhaozhuo Xu and Kaixiong Zhou and Xuanting Cai and Xia Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0ZTuy5CrL7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5250966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xpuA93j14mwJ:scholar.google.com/&scioq=TVE:+Learning+Meta-attribution+for+Transferable+Vision+Explainer&hl=en&as_sdt=0,31", "gs_version_total": 10, "email": "rice.edu;rice.edu;wfu.edu;njit.edu;tamu.edu;rice.edu;rice.edu;rice.edu;mit.edu;facebook.com;rice.edu", "author_num": 11, "aff_unique_index": "0;0;1;2;3;0;0;0;4;5;0", "aff_unique_norm": "Rice University;Wake Forest University;New Jersey Institute of Technology;Texas A&M University;Massachusetts Institute of Technology;Meta", "aff_unique_dep": ";;;;;Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://www.wfu.edu;https://www.njit.edu;https://www.tamu.edu;https://web.mit.edu;https://meta.com", "aff_unique_abbr": "Rice;WFU;NJIT;TAMU;MIT;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Early Time Classification with Accumulated Accuracy Gap Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35195", "id": "0b7txvPYlr", "proceeding": "https://proceedings.mlr.press/v235/ringel24a.html", "pdf": "https://openreview.net/pdf?id=0b7txvPYlr", "openreview": "https://openreview.net/forum?id=0b7txvPYlr", "author_site": "Liran Ringel, Regev Cohen, Daniel Freedman, Michael Elad, Yaniv Romano", "tldr": "", "abstract": "Early time classification algorithms aim to label a stream of features without processing the full input stream, while maintaining accuracy comparable to that achieved by applying the classifier to the entire input. In this paper, we introduce a statistical framework that can be applied to any sequential classifier, formulating a calibrated stopping rule. This data-driven rule attains finite-sample, distribution-free control of the accuracy gap between full and early-time classification. We start by presenting a novel method that builds on the Learn-then-Test calibration framework to control this gap marginally, on average over i.i.d. instances. As this algorithm tends to yield an excessively high accuracy gap for early halt times, our main contribution is the proposal of a framework that controls a stronger notion of error, where the accuracy gap is controlled conditionally on the accumulated halt times. Numerical experiments demonstrate the effectiveness, applicability, and usefulness of our method. 
We show that our proposed early stopping mechanism reduces the number of timesteps used for classification by up to 94% while achieving rigorous accuracy gap control.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liran Ringel;Regev Cohen;Daniel Freedman;Michael Elad;Yaniv Romano", "authorids": "~Liran_Ringel1;~Regev_Cohen1;~Daniel_Freedman2;~Michael_Elad3;~Yaniv_Romano1", "gender": "M;M;;M;M", "homepage": ";;;https://sites.google.com/view/yaniv-romano/;https://elad.cs.technion.ac.il/", "dblp": "367/7044;;59/1865;142/0021;e/MichaelElad", "google_scholar": "ie9I6MIAAAAJ;naMCufgAAAAJ;https://scholar.google.com/citations?hl=en;L_m67ywAAAAJ;UpZbV44AAAAJ", "orcid": "0009-0006-5822-6856;;;;0000-0001-8131-6928", "linkedin": "liran-ringel;regev-cohen/;;;michael-elad-5553852a3/", "or_profile": "~Liran_Ringel1;~Regev_Cohen1;~Daniel_Freedman2;~Yaniv_Romano1;~Michael_Elad1", "aff": "Computer Science Department, Technion-Israel Institute of Technology;Google;Verily;Technion, Technion;Verily", "aff_domain": "cs.technion.ac.il;google.com;google.com;technion.ac.il;verily.com", "position": "MS student;Researcher;Researcher;Assistant Professor;Principal Researcher", "bibtex": "@inproceedings{\nringel2024early,\ntitle={Early Time Classification with Accumulated Accuracy Gap Control},\nauthor={Liran Ringel and Regev Cohen and Daniel Freedman and Michael Elad and Yaniv Romano},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0b7txvPYlr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5446687, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3770247521689827073&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cs.technion.ac.il;google.com;google.com;technion.ac.il;verily.com", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Technion-Israel Institute of Technology;Google;Verily;Technion - Israel Institute of Technology", "aff_unique_dep": "Computer Science Department;Google;;", "aff_unique_url": "https://www.technion.ac.il;https://www.google.com;https://www.verily.com;https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion;Google;Verily;Technion", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "Israel;United States" }, { "title": "The Entropy Enigma: Success and Failure of Entropy Minimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35194", "id": "0bGsVoumFL", "proceeding": "https://proceedings.mlr.press/v235/press24a.html", "pdf": "https://openreview.net/pdf?id=0bGsVoumFL", "openreview": "https://openreview.net/forum?id=0bGsVoumFL", "author_site": "Ori Press, Ravid Shwartz-Ziv, Yann LeCun, Matthias Bethge", "tldr": "", "abstract": "Entropy minimization (EM) is frequently used to increase the accuracy of classification models when they're faced with new data at test time. EM is a self-supervised learning method that optimizes classifiers to assign even higher probabilities to their top predicted classes. In this paper, we analyze why EM works when adapting a model for a few steps and why it eventually fails after adapting for many steps. We show that, at first, EM causes the model to embed test images close to training images, thereby increasing model accuracy.
After many steps of optimization, EM makes the model embed test images far away from the embeddings of training images, which results in a degradation of accuracy. Building upon our insights, we present a method for solving a practical problem: estimating a model's accuracy on a given arbitrary dataset without having access to its labels. Our method estimates accuracy by looking at how the embeddings of input images change as the model is optimized to minimize entropy. Experiments on 23 challenging datasets show that our method sets the SoTA with a mean absolute error of 5.75%, an improvement of 29.62% over the previous SoTA on this task. Our code is available at: https://github.com/oripress/EntropyEnigma", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ori Press;Ravid Shwartz-Ziv;Yann LeCun;Matthias Bethge", "authorids": "~Ori_Press1;~Ravid_Shwartz-Ziv2;~Yann_LeCun1;~Matthias_Bethge1", "gender": ";;M;M", "homepage": ";;http://yann.lecun.com;https://bethgelab.org", "dblp": ";;l/YannLeCun;77/3005", "google_scholar": ";;WLN3QrAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ori_Press1;~Ravid_Shwartz-Ziv2;~Yann_LeCun1;~Matthias_Bethge1", "aff": ";;New York University;University of Tuebingen", "aff_domain": ";;nyu.edu;uni-tuebingen.de", "position": ";;Full Professor;Full Professor", "bibtex": "@inproceedings{\npress2024the,\ntitle={The Entropy Enigma: Success and Failure of Entropy Minimization},\nauthor={Ori Press and Ravid Shwartz-Ziv and Yann LeCun and Matthias Bethge},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0bGsVoumFL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3642217, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14137736424752701394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";;nyu.edu;uni-tuebingen.de", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "New York University;University of Tuebingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.uni-tuebingen.de/", "aff_unique_abbr": "NYU;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Germany" }, { "title": "Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35193", "id": "0bmXrtTDUu", "proceeding": "https://proceedings.mlr.press/v235/sardana24a.html", "pdf": "https://openreview.net/pdf?id=0bmXrtTDUu", "openreview": "https://openreview.net/forum?id=0bmXrtTDUu", "author_site": "Nikhil Sardana, Jacob Portes, Alexandre (Sasha) Doubov, Jonathan Frankle", "tldr": "", "abstract": "Large language model (LLM) scaling laws are empirical formulas that estimate changes in model quality as a result of increasing parameter count and training data. However, these formulas, including the popular DeepMind Chinchilla scaling laws, neglect to include the cost of inference. We modify the Chinchilla scaling laws to calculate the optimal LLM parameter count and pre-training data size to train and deploy a model of a given quality and inference demand.
We conduct our analysis both in terms of a compute budget and real-world costs and find that LLM researchers expecting reasonably large inference demand ($\\sim$1B requests) should train models smaller and longer than Chinchilla-optimal. Furthermore, we train 47 models of varying sizes and parameter counts to validate our formula and find that model quality continues to improve as we scale tokens per parameter to extreme ranges (up to 10,000). Finally, we ablate the procedure used to fit the Chinchilla scaling law coefficients and find that developing scaling laws only from data collected at typical token/parameter ratios overestimates the impact of additional tokens at these extreme ranges.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikhil Sardana;Jacob Portes;Sasha Doubov;Jonathan Frankle", "authorids": "~Nikhil_Sardana1;~Jacob_Portes1;~Sasha_Doubov1;~Jonathan_Frankle1", "gender": ";M;M;M", "homepage": ";https://jacobfulano.github.io;;http://www.jfrankle.com", "dblp": ";322/0370;281/9848;169/9776", "google_scholar": ";CzH4cSEAAAAJ;;MlLJapIAAAAJ", "orcid": ";0000-0003-3102-012X;;", "linkedin": ";jacob-portes-82804062/;;jfrankle/", "or_profile": "~Nikhil_Sardana1;~Jacob_Portes1;~Sasha_Doubov1;~Jonathan_Frankle1", "aff": ";Databricks;Databricks, Databricks;Databricks", "aff_domain": ";databricks.com;databricks.com;databricks.com", "position": ";Researcher;Researcher;Chief AI Scientist", "bibtex": "@inproceedings{\nsardana2024beyond,\ntitle={Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws},\nauthor={Nikhil Sardana and Jacob Portes and Sasha Doubov and Jonathan Frankle},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0bmXrtTDUu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4053179, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10834714692308557379&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": ";databricks.com;databricks.com;databricks.com", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Databricks", "aff_unique_dep": "", "aff_unique_url": "https://databricks.com", "aff_unique_abbr": "Databricks", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "KernelWarehouse: Rethinking the Design of Dynamic Convolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35192", "id": "0e8SEDSpNT", "proceeding": "https://proceedings.mlr.press/v235/li24cg.html", "pdf": "https://openreview.net/pdf?id=0e8SEDSpNT", "openreview": "https://openreview.net/forum?id=0e8SEDSpNT", "author_site": "Chao Li, Anbang Yao", "tldr": "", "abstract": "Dynamic convolution learns a linear mixture of $n$ static kernels weighted with their input-dependent attentions, demonstrating performance superior to normal convolution. However, it increases the number of convolutional parameters by $n$ times, and thus is not parameter efficient. As a result, no research progress has allowed researchers to explore the setting $n > 100$ (an order of magnitude larger than the typical setting $n < 10$) for pushing forward the performance boundary of dynamic convolution while enjoying parameter efficiency.
To fill this gap, in this paper, we propose KernelWarehouse, a more general form of dynamic convolution, which redefines the basic concepts of \u201ckernels\u201d, \u201cassembling kernels\u201d and \u201cattention function\u201d through the lens of exploiting convolutional parameter dependencies within the same layer and across neighboring layers of a ConvNet. We verify the effectiveness of KernelWarehouse on ImageNet and MS-COCO datasets using various ConvNet architectures. Intriguingly, KernelWarehouse is also applicable to Vision Transformers, and it can even reduce the model size of a backbone while improving the model accuracy. For instance, KernelWarehouse ($n = 4$) achieves 5.61%|3.90%|4.38% absolute top-1 accuracy gain on the ResNet18|MobileNetV2|DeiT-Tiny backbone, and KernelWarehouse ($n = 1/4$) with 65.10% model size reduction still achieves 2.29% gain on the ResNet18 backbone. The code and models are available at https://github.com/OSVAI/KernelWarehouse.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chao Li;Anbang Yao", "authorids": "~Chao_Li16;~Anbang_Yao1", "gender": "M;", "homepage": "https://github.com/chaoli-ai/chaoli.github.io;https://yaoanbang.github.io/", "dblp": ";http://dblp.uni-trier.de/pers/hd/y/Yao:Anbang", "google_scholar": ";b9hCmPYAAAAJ", "orcid": ";0000-0002-3878-8679", "linkedin": ";anbang-yao-1805b712a/", "or_profile": "~Chao_Li16;~Anbang_Yao1", "aff": "Intel;Intel", "aff_domain": "intel.com;intel.com", "position": "Researcher;Principal Researcher", "bibtex": "@inproceedings{\nli2024kernelwarehouse,\ntitle={KernelWarehouse: Rethinking the Design of Dynamic Convolution},\nauthor={Chao Li and Anbang Yao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0e8SEDSpNT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10020516, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7538616998466727124&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "intel.com;intel.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Intel", "aff_unique_dep": "Intel Corporation", "aff_unique_url": "https://www.intel.com", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Exploring the Low-Pass Filtering Behavior in Image Super-Resolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35191", "id": "0f4u3Wg9zT", "proceeding": "https://proceedings.mlr.press/v235/deng24e.html", "pdf": "https://openreview.net/pdf?id=0f4u3Wg9zT", "openreview": "https://openreview.net/forum?id=0f4u3Wg9zT", "author_site": "Haoyu Deng, Zijing Xu, Yule Duan, Xiao Wu, Wen-Jie Shu, Liang-Jian Deng", "tldr": "", "abstract": "Deep neural networks for image super-resolution (ISR) have shown significant advantages over traditional approaches like interpolation. However, they are often criticized as 'black boxes' compared to traditional approaches with solid mathematical foundations. In this paper, we attempt to interpret the behavior of deep neural networks in ISR using theories from the field of signal processing. First, we report an intriguing phenomenon, referred to as \u2018the sinc phenomenon\u2019. It occurs when an impulse input is fed to a neural network.
Then, building on this observation, we propose a method named Hybrid Response Analysis (HyRA) to analyze the behavior of neural networks in ISR tasks. Specifically, HyRA decomposes a neural network into a parallel connection of a linear system and a non-linear system and demonstrates that the linear system functions as a low-pass filter while the non-linear system injects high-frequency information. Finally, to quantify the injected high-frequency information, we introduce a metric for image-to-image tasks called Frequency Spectrum Distribution Similarity (FSDS). FSDS reflects the distribution similarity of different frequency components and can capture nuances that traditional metrics may overlook. Code, videos and raw experimental results for this paper can be found in: https://github.com/RisingEntropy/LPFInISR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyu Deng;Zijing Xu;Yule Duan;Xiao Wu;Wenjie Shu;Liang-Jian Deng", "authorids": "~Haoyu_Deng1;~Zijing_Xu1;~Yule_Duan1;~Xiao_Wu6;~Wenjie_Shu1;~Liang-Jian_Deng2", "gender": "M;F;M;M;M;M", "homepage": "https://risingentropy.top;;https://duanyll.com;https://xiaoxiao-woo.github.io/;;https://liangjiandeng.github.io/", "dblp": ";;205/1045;;;136/7368", "google_scholar": ";;;-aFhoQgAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0009-0005-9611-0718;0000-0003-4392-402X;0000-0002-1259-8674;0009-0005-4066-3372;", "linkedin": ";;;;;", "or_profile": "~Haoyu_Deng1;~Zijing_Xu1;~Yule_Duan1;~Xiao_Wu6;~Wenjie_Shu1;~Liang-Jian_Deng2", "aff": "University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China", "aff_domain": "uestc.edu.cn;uestc.edu.cn;uestc.edu.cn;uestc.edu;uestc.edu.cn;uestc.edu.cn", "position": "Undergrad student;Undergrad student;Undergrad student;PhD student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\ndeng2024exploring,\ntitle={Exploring the Low-Pass Filtering Behavior in Image Super-Resolution},\nauthor={Haoyu Deng and Zijing Xu and Yule Duan and Xiao Wu and Wenjie Shu and Liang-Jian Deng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0f4u3Wg9zT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5558826, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6440823287726175066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "uestc.edu.cn;uestc.edu.cn;uestc.edu.cn;uestc.edu;uestc.edu.cn;uestc.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Electronic Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "https://www.uestc.edu.cn", "aff_unique_abbr": "UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "DAG-Based Column Generation for Adversarial Team Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35190", "id": "0hbeZQm1Se", "proceeding": "https://proceedings.mlr.press/v235/zhang24b.html", "pdf": "https://openreview.net/pdf?id=0hbeZQm1Se", "openreview": 
"https://openreview.net/forum?id=0hbeZQm1Se", "author_site": "Youzhi Zhang, Bo An, Daniel Zeng", "tldr": "", "abstract": "Many works recently have focused on computing optimal solutions for the ex ante coordination of a team for solving sequential adversarial team games, where a team of players coordinate against an opponent (or a team of players) in a zero-sum extensive-form game. However, it is challenging to directly compute such an optimal solution because the team\u2019s coordinated strategy space is exponential in the size of the game tree due to the asymmetric information of team members. Column Generation (CG) algorithms have been proposed to overcome this challenge by iteratively expanding the team\u2019s coordinated strategy space via a Best Response Oracle (BRO). More recently, more compact representations (particularly, the Team Belief Directed Acyclic Graph (TB-DAG)) of the team\u2019s coordinated strategy space have been proposed, but the TB-DAG-based algorithms only outperform the CG-based algorithms in games with a small TB-DAG. Unfortunately, it is inefficient to directly apply CG to the TB-DAG because the size of the TB-DAG is still exponential in the size of the game tree and then makes the BRO unscalable. To this end, we develop our novel TB-DAG CG (DCG) algorithm framework by computing a coordinated best response in the original game first and then transforming this strategy into the TB-DAG form. To further improve the scalability, we propose a more suitable BRO for DCG to reduce the cost of the transformation at each iteration. We theoretically show that our algorithm converges exponentially faster than the state-of-the-art CG algorithms, and experimental results show that our algorithm is at least two orders of magnitude faster than the state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youzhi Zhang;Bo An;Daniel Dajun Zeng", "authorids": "~Youzhi_Zhang2;~Bo_An2;~Daniel_Dajun_Zeng1", "gender": ";M;M", "homepage": "https://youzhi333.github.io/index.html;https://personal.ntu.edu.sg/boan/;", "dblp": "131/9490-1;42/6178-1.html;z/DanielDajunZeng", "google_scholar": "i2j5DmwAAAAJ;PEEpuNwAAAAJ;d-tAMlYAAAAJ", "orcid": "0000-0002-2984-734X;0000-0002-7064-7438;0000-0002-9046-222X", "linkedin": ";;", "or_profile": "~Youzhi_Zhang2;~Bo_An2;~Daniel_Dajun_Zeng1", "aff": "Centre for Artificial Intelligence and Robotics, Hong Kong Institute of Science & Innovation, Chinese Academy of Sciences;Nanyang Technological University;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "cair-cas.org.hk;ntu.edu.sg;ia.ac.cn", "position": "Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024dagbased,\ntitle={{DAG}-Based Column Generation for Adversarial Team Games},\nauthor={Youzhi Zhang and Bo An and Daniel Dajun Zeng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0hbeZQm1Se}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 513820, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11291709803693736910&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "cair-cas.org.hk;ntu.edu.sg;ia.ac.cn", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Hong Kong Institute of Science & Innovation, Chinese Academy of Sciences;Nanyang Technological University;Chinese Academy of Sciences", 
"aff_unique_dep": "Centre for Artificial Intelligence and Robotics;;Institute of Automation", "aff_unique_url": ";https://www.ntu.edu.sg;http://www.ia.cas.cn", "aff_unique_abbr": ";NTU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Singapore" }, { "title": "Tripod: Three Complementary Inductive Biases for Disentangled Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35189", "id": "0iXp5P77ho", "proceeding": "https://proceedings.mlr.press/v235/hsu24a.html", "pdf": "https://openreview.net/pdf?id=0iXp5P77ho", "openreview": "https://openreview.net/forum?id=0iXp5P77ho", "author_site": "Kyle Hsu, Jubayer Ibn Hamid, Kaylee Burns, Chelsea Finn, Jiajun Wu", "tldr": "", "abstract": "Inductive biases are crucial in disentangled representation learning for narrowing down an underspecified solution set. In this work, we consider endowing a neural network autoencoder with three select inductive biases from the literature: data compression into a grid-like latent space via quantization, collective independence amongst latents, and minimal functional influence of any latent on how other latents determine data generation. In principle, these inductive biases are deeply complementary: they most directly specify properties of the latent space, encoder, and decoder, respectively. In practice, however, naively combining existing techniques instantiating these inductive biases fails to yield significant benefits. To address this, we propose adaptations to the three techniques that simplify the learning problem, equip key regularization terms with stabilizing invariances, and quash degenerate incentives. The resulting model, Tripod, achieves state-of-the-art results on a suite of four image disentanglement benchmarks. 
We also verify that Tripod significantly improves upon its naive incarnation and that all three of its \"legs\" are necessary for best performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyle Hsu;Jubayer Ibn Hamid;Kaylee Burns;Chelsea Finn;Jiajun Wu", "authorids": "~Kyle_Hsu1;~Jubayer_Ibn_Hamid1;~Kaylee_Burns2;~Chelsea_Finn1;~Jiajun_Wu1", "gender": "M;;F;F;M", "homepage": "https://www.kylehsu.org;;https://kayburns.github.io;https://ai.stanford.edu/~cbfinn/;https://jiajunwu.com", "dblp": "217/3841;;217/3002;131/1783;117/4768", "google_scholar": "KCdL5B0AAAAJ;;N_rVVG8AAAAJ;vfPE6hgAAAAJ;2efgcS0AAAAJ", "orcid": ";;;;0000-0002-4176-343X", "linkedin": ";;;;jiajunwu/", "or_profile": "~Kyle_Hsu1;~Jubayer_Ibn_Hamid1;~Kaylee_Burns2;~Chelsea_Finn1;~Jiajun_Wu1", "aff": "Stanford University;;Stanford University;Google;Stanford University", "aff_domain": "cs.stanford.edu;;stanford.edu;google.com;stanford.edu", "position": "PhD student;;PhD student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nhsu2024tripod,\ntitle={Tripod: Three Complementary Inductive Biases for Disentangled Representation Learning},\nauthor={Kyle Hsu and Jubayer Ibn Hamid and Kaylee Burns and Chelsea Finn and Jiajun Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0iXp5P77ho}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3417750, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9398484451388419105&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.stanford.edu;;stanford.edu;google.com;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Domain-wise Data Acquisition to Improve Performance under Distribution Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35188", "id": "0j28mmQ023", "proceeding": "https://proceedings.mlr.press/v235/he24f.html", "pdf": "https://openreview.net/pdf?id=0j28mmQ023", "openreview": "https://openreview.net/forum?id=0j28mmQ023", "author_site": "Yue He, Dongbai Li, Pengfei Tian, Han Yu, Jiashuo Liu, Hao Zou, Peng Cui", "tldr": "", "abstract": "Despite notable progress in enhancing the capability of machine learning against distribution shifts, training data quality remains a bottleneck for cross-distribution generalization. Recently, from a data-centric perspective, there have been considerable efforts to improve model performance through refining the preparation of training data. Inspired by realistic scenarios, this paper addresses a practical requirement of acquiring training samples from various domains on a limited budget to facilitate model generalization to target test domain with distribution shift. Our empirical evidence indicates that the advance in data acquisition can significantly benefit the model performance on shifted data. Additionally, by leveraging unlabeled test domain data, we introduce a Domain-wise Active Acquisition framework. 
This framework iteratively optimizes the data acquisition strategy as training samples are accumulated, theoretically ensuring the effective approximation of test distribution. Extensive real-world experiments demonstrate our proposal's advantages in machine learning applications. The code is available at https://github.com/dongbaili/DAA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue He;Dongbai Li;Pengfei Tian;Han Yu;Jiashuo Liu;Hao Zou;Peng Cui", "authorids": "~Yue_He2;~Dongbai_Li1;~Pengfei_Tian2;~Han_Yu5;~Jiashuo_Liu1;~Hao_Zou1;~Peng_Cui1", "gender": "M;M;M;M;M;M;M", "homepage": "https://heyuethu.github.io;https://dongbaili.github.io/;https://e9tian.github.io/;https://h-yu16.github.io;https://ljsthu.github.io;https://scholar.google.com/citations?user=f5cbI4cAAAAJ&hl=en;http://pengcui.thumedialab.com/", "dblp": "51/6071-1;;;362/3218-9;180/2823;13/4741-1;31/891-1", "google_scholar": ";;;1m6A31MAAAAJ;b7bpt5MAAAAJ;f5cbI4cAAAAJ;https://scholar.google.com.tw/citations?user=G8x97ZgAAAAJ", "orcid": "0009-0009-1536-1179;;;0009-0000-2828-4541;;0000-0002-6000-6936;0000-0003-2957-8511", "linkedin": ";;;%E5%90%AB-%E7%A6%B9-489254222/;jiashuo-liu-244a6b1a4;;", "or_profile": "~Yue_He2;~Dongbai_Li1;~Pengfei_Tian2;~Han_Yu5;~Jiashuo_Liu1;~Hao_Zou1;~Peng_Cui1", "aff": "Tsinghua University;University of Illinois, Urbana Champaign;Tsinghua University;Tsinghua University;University of Cambridge;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;illinois.edu;tsinghua.edu.cn;tsinghua.edu.cn;cam.ac.uk;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Postdoc;Intern;Undergrad student;PhD student;Researcher;PhD student;Associate Professor", "bibtex": "@inproceedings{\nhe2024domainwise,\ntitle={Domain-wise Data Acquisition to Improve Performance under Distribution Shift},\nauthor={Yue He and Dongbai Li and Pengfei Tian and Han Yu and Jiashuo Liu and Hao Zou and Peng Cui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0j28mmQ023}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 987078, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5859962535215505070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tsinghua.edu.cn;illinois.edu;tsinghua.edu.cn;tsinghua.edu.cn;cam.ac.uk;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;0;2;0;0", "aff_unique_norm": "Tsinghua University;University of Illinois Urbana-Champaign;University of Cambridge", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://illinois.edu;https://www.cam.ac.uk", "aff_unique_abbr": "THU;UIUC;Cambridge", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Urbana-Champaign;Cambridge", "aff_country_unique_index": "0;1;0;0;2;0;0", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "SqueezeLLM: Dense-and-Sparse Quantization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35187", "id": "0jpbpFia8m", "proceeding": "https://proceedings.mlr.press/v235/kim24f.html", "pdf": "https://openreview.net/pdf?id=0jpbpFia8m", "openreview": "https://openreview.net/forum?id=0jpbpFia8m", "author_site": "Sehoon Kim, Coleman Hooper, Amir Gholaminejad, Zhen Dong, Xiuyu Li, Sheng Shen, Michael Mahoney, EECS Kurt Keutzer", "tldr": "", "abstract": "Generative Large Language Models (LLMs) have demonstrated 
remarkable results for a wide range of tasks. However, deploying these models for inference has been a significant challenge due to their unprecedented resource requirements. This has forced existing deployment frameworks to use multi-GPU inference pipelines, which are often complex and costly, or to use smaller and less performant models. In this work, we demonstrate that the main bottleneck for generative inference with LLMs is memory bandwidth, rather than compute, specifically for single batch inference. While quantization has emerged as a promising solution by representing weights with reduced precision, previous efforts have often resulted in notable performance degradation. To address this, we introduce SqueezeLLM, a post-training quantization framework that not only enables lossless compression to ultra-low precisions of up to 3-bit, but also achieves higher quantization performance under the same memory constraint. Our framework incorporates two novel ideas: (i) sensitivity-based non-uniform quantization, which searches for the optimal bit precision assignment based on second-order information; and (ii) the Dense-and-Sparse decomposition that stores outliers and sensitive weight values in an efficient sparse format. When applied to the LLaMA models, our 3-bit quantization significantly reduces the perplexity gap from the FP16 baseline by up to 2.1x as compared to the state-of-the-art methods with the same memory requirement. Furthermore, when deployed on an A6000 GPU, our quantized models achieve up to 2.3x speedup compared to the baseline. Our code is available at https://github.com/SqueezeAILab/SqueezeLLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sehoon Kim;Coleman Richard Charles Hooper;Amir Gholami;Zhen Dong;Xiuyu Li;Sheng Shen;Michael W. Mahoney;Kurt Keutzer", "authorids": "~Sehoon_Kim1;~Coleman_Richard_Charles_Hooper1;~Amir_Gholami2;~Zhen_Dong3;~Xiuyu_Li1;~Sheng_Shen2;~Michael_W._Mahoney1;~Kurt_Keutzer1", "gender": "M;M;;M;Not Specified;M;;M", "homepage": "https://sehoonkim.org;https://www.linkedin.com/in/coleman-hooper-165061193/;;https://dong-zhen.com/;https://xiuyuli.com/;https://sincerass.github.io;;https://people.eecs.berkeley.edu/~keutzer/", "dblp": ";;;;279/5847;138/5764-1.html;;k/KurtKeutzer.html", "google_scholar": "zQABr7QAAAAJ;si-368wAAAAJ;;czxMUzcAAAAJ;https://scholar.google.com/citations?;https://scholar.google.com/citations?hl=en;;ID9QePIAAAAJ", "orcid": ";;;;;;;0000-0003-3868-8501", "linkedin": "sehoon-kim-13a1b51b1/;;;zhen-dong/;;sheng-s-ab198a174/;;kurtkeutzer/", "or_profile": "~Sehoon_Kim1;~Coleman_Richard_Charles_Hooper1;~Amir_Gholami2;~Zhen_Dong3;~Xiuyu_Li1;~Sheng_Shen2;~Michael_W._Mahoney1;~Kurt_Keutzer1", "aff": "University of California, Berkeley;University of California, Berkeley;;Nexusflow.ai Inc;University of California, Berkeley;University of California, Berkeley;;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;;nexusflow.ai;berkeley.edu;berkeley.edu;;berkeley.edu", "position": "PhD student;PhD student;;Principal Researcher;PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nkim2024squeezellm,\ntitle={Squeeze{LLM}: Dense-and-Sparse Quantization},\nauthor={Sehoon Kim and Coleman Richard Charles Hooper and Amir Gholami and Zhen Dong and Xiuyu Li and Sheng Shen and Michael W. 
Mahoney and Kurt Keutzer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0jpbpFia8m}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1125625, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9711254670478924456&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;;nexusflow.ai;berkeley.edu;berkeley.edu;;berkeley.edu", "author_num": 8, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "University of California, Berkeley;Nexusflow.ai", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.nexusflow.ai", "aff_unique_abbr": "UC Berkeley;Nexusflow.ai", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scaling Exponents Across Parameterizations and Optimizers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35186", "id": "0ksNeD1SJT", "proceeding": "https://proceedings.mlr.press/v235/everett24a.html", "pdf": "https://openreview.net/pdf?id=0ksNeD1SJT", "openreview": "https://openreview.net/forum?id=0ksNeD1SJT", "author_site": "Katie Everett, Lechao Xiao, Mitchell Wortsman, Alexander Alemi, Roman Novak, Peter Liu, Izzeddin Gur, Jascha Sohl-Dickstein, Leslie Kaelbling, Jaehoon Lee, Jeffrey Pennington", "tldr": "", "abstract": "Robust and effective scaling of models from small to large width typically requires the precise adjustment of many algorithmic and architectural details, such as parameterization and optimizer choices. In this work, we propose a new perspective on parameterization by investigating a key assumption in prior work about the alignment between parameters and data and derive new theoretical results under weaker assumptions and a broader set of optimizers. Our extensive empirical investigation includes *tens of thousands* of models trained with *all combinations of* three optimizers, four parameterizations, several alignment assumptions, more than a dozen learning rates, and fourteen model sizes up to 27B parameters. We find that the best learning rate scaling prescription would often have been excluded by the assumptions in prior work. Our results show that all parameterizations, not just maximal update parameterization (muP), can achieve hyperparameter transfer; moreover, our novel per-layer learning rate prescription for standard parameterization outperforms muP. 
Finally, we demonstrate that an overlooked aspect of parameterization, the epsilon parameter in Adam, must be scaled correctly to avoid gradient underflow and propose *Adam-atan2*, a new numerically stable, scale-invariant version of Adam that eliminates the epsilon hyperparameter entirely.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Katie E Everett;Lechao Xiao;Mitchell Wortsman;Alexander A Alemi;Roman Novak;Peter J Liu;Izzeddin Gur;Jascha Sohl-Dickstein;Leslie Pack Kaelbling;Jaehoon Lee;Jeffrey Pennington", "authorids": "~Katie_E_Everett1;~Lechao_Xiao2;~Mitchell_Wortsman1;~Alexander_A_Alemi1;~Roman_Novak2;~Peter_J_Liu1;~Izzeddin_Gur1;~Jascha_Sohl-Dickstein2;~Leslie_Pack_Kaelbling1;~Jaehoon_Lee2;~Jeffrey_Pennington1", "gender": ";M;M;M;M;;;F;;M;M", "homepage": ";https://sites.google.com/site/lechaoxiao/;https://mitchellnw.github.io/;https://alexalemi.com;https://github.com/romanngg;http://www.peterjliu.com;;http://people.csail.mit.edu/lpk/;https://jaehlee.github.io;;http://sohldickstein.com", "dblp": "270/9991;222/3238;232/2273;160/8158;https://dblp.org/pers/n/Novak:Roman.html;190/7667;188/9027;k/LesliePackKaelbling;95/386-1.html;https://dblp.org/pers/p/Pennington:Jeffrey.html;51/7117", "google_scholar": "ox_zSwYAAAAJ;fvwzUnIAAAAJ;fzRnjFgAAAAJ;68hTs9wAAAAJ;LWvgl-8AAAAJ;;qS_ugJAAAAAJ;IcasIiwAAAAJ;d3YhiooAAAAJ;cn_FoswAAAAJ;-3zYIjQAAAAJ", "orcid": ";;;;;;;0000-0001-6054-7145;;;", "linkedin": "katie-everett/;;;;romanovak;p3t3rliu;;;eejaehoon/;jpennin;", "or_profile": "~Katie_E_Everett1;~Lechao_Xiao2;~Mitchell_Wortsman1;~Alexander_A_Alemi1;~Roman_Novak2;~Peter_J_Liu1;~Izzeddin_Gur1;~Leslie_Pack_Kaelbling1;~Jaehoon_Lee2;~Jeffrey_Pennington1;~Jascha_Sohl-Dickstein1", "aff": "Massachusetts Institute of Technology;Google DeepMind;University of Washington, Seattle;Google;Google Deepmind;Google Brain;Google;Massachusetts Institute of Technology;Google DeepMind;Google;Google", "aff_domain": "mit.edu;google.com;uw.edu;google.com;google.com;google.com;google.com;mit.edu;google.com;google.com;google.com", "position": "PhD student;Researcher;PhD student;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Full Professor;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\neverett2024scaling,\ntitle={Scaling Exponents Across Parameterizations and Optimizers},\nauthor={Katie E Everett and Lechao Xiao and Mitchell Wortsman and Alexander A Alemi and Roman Novak and Peter J Liu and Izzeddin Gur and Jascha Sohl-Dickstein and Leslie Pack Kaelbling and Jaehoon Lee and Jeffrey Pennington},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0ksNeD1SJT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3549794, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10225360157920423189&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 7, "email": "mit.edu;google.com;uw.edu;google.com;google.com;google.com;google.com;mit.edu;google.com;google.com;google.com", "author_num": 11, "aff_unique_index": "0;1;2;1;3;1;1;0;1;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google;University of Washington;DeepMind", "aff_unique_dep": ";Google DeepMind;;DeepMind", "aff_unique_url": "https://web.mit.edu;https://deepmind.com;https://www.washington.edu;https://deepmind.com", "aff_unique_abbr": "MIT;DeepMind;UW;DeepMind", 
"aff_campus_unique_index": "1;2;2;2;2;2", "aff_campus_unique": ";Seattle;Mountain View", "aff_country_unique_index": "0;1;0;0;1;0;0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "CLIPZyme: Reaction-Conditioned Virtual Screening of Enzymes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35185", "id": "0mYAK6Yhhm", "proceeding": "https://proceedings.mlr.press/v235/mikhael24a.html", "pdf": "https://openreview.net/pdf?id=0mYAK6Yhhm", "openreview": "https://openreview.net/forum?id=0mYAK6Yhhm", "author_site": "Peter Mikhael, Itamar Chinn, Regina Barzilay", "tldr": "", "abstract": "Computational screening of naturally occurring proteins has the potential to identify efficient catalysts among the hundreds of millions of sequences that remain uncharacterized. Current experimental methods remain time, cost and labor intensive, limiting the number of enzymes they can reasonably screen. In this work, we propose a computational framework for in-silico enzyme screening. Through a contrastive objective, we train CLIPZyme to encode and align representations of enzyme structures and reaction pairs. With no standard computational baseline, we compare CLIPZyme to existing EC (enzyme commission) predictors applied to virtual enzyme screening and show improved performance in scenarios where limited information on the reaction is available (BEDROC$_{85}$ of 44.69%). Additionally, we evaluate combining EC predictors with CLIPZyme and show its generalization capacity on both unseen reactions and protein clusters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peter Mikhael;Itamar Chinn;Regina Barzilay", "authorids": "~Peter_Mikhael1;~Itamar_Chinn1;~Regina_Barzilay1", "gender": ";;female", "homepage": ";https://www.itamarchinn.com;https://www.regina.csail.mit.edu/", "dblp": ";384/4125;b/ReginaBarzilay", "google_scholar": ";;", "orcid": "0000-0002-6030-1636;0000-0001-5449-8818;", "linkedin": ";itamarchinn;", "or_profile": "~Peter_Mikhael1;~Itamar_Chinn1;~Regina_Barzilay1", "aff": "Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "csail.mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nmikhael2024clipzyme,\ntitle={{CLIPZ}yme: Reaction-Conditioned Virtual Screening of Enzymes},\nauthor={Peter Mikhael and Itamar Chinn and Regina Barzilay},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0mYAK6Yhhm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 974096, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6654325949267832942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "csail.mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "CSAIL", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Provably Scalable Black-Box Variational Inference with Structured Variational Families", "status": "Poster", "track": 
"main", "site": "https://icml.cc/virtual/2024/poster/35184", "id": "0miAQ1qHiw", "proceeding": "https://proceedings.mlr.press/v235/ko24d.html", "pdf": "https://openreview.net/pdf?id=0miAQ1qHiw", "openreview": "https://openreview.net/forum?id=0miAQ1qHiw", "author_site": "Joohwan Ko, Kyurae Kim, Woo Chang Kim, Jacob Gardner", "tldr": "", "abstract": "Variational families with full-rank covariance approximations are known not to work well in black-box variational inference (BBVI), both empirically and theoretically. In fact, recent computational complexity results for BBVI have established that full-rank variational families scale poorly with the dimensionality of the problem compared to *e.g.* mean-field families. This is particularly critical to hierarchical Bayesian models with local variables; their dimensionality increases with the size of the datasets. Consequently, one gets an iteration complexity with an explicit $\\mathcal{O}(N^2)$ dependence on the dataset size $N$. In this paper, we explore a theoretical middle ground *between* mean-field variational families and full-rank families: *structured* variational families. We rigorously prove that certain scale matrix structures can achieve a better iteration complexity of $\\mathcal{O}\\left(N\\right)$, implying better scaling with respect to $N$. We empirically verify our theoretical results on large-scale hierarchical models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joohwan Ko;Kyurae Kim;Woo Chang Kim;Jacob R. Gardner", "authorids": "~Joohwan_Ko2;~Kyurae_Kim1;~Woo_Chang_Kim1;~Jacob_R._Gardner1", "gender": "M;M;M;M", "homepage": "https://joohwanko.com/;http://felab.kaist.ac.kr/;;https://krkim.me", "dblp": "358/5976;128/5936;144/7773;322/4034", "google_scholar": ";7NmBs1kAAAAJ;0gkajvEAAAAJ;pKGsQ1cAAAAJ", "orcid": ";0000-0001-8385-9598;;0000-0003-2063-0889", "linkedin": ";;;red-portal/", "or_profile": "~Joohwan_Ko2;~Woo_Chang_Kim1;~Jacob_R_Gardner1;~Khurai_Kim1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;University of Pennsylvania;University of Pennsylvania", "aff_domain": "kaist.edu;kaist.ac.kr;upenn.edu;seas.upenn.edu", "position": "MS student;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nko2024provably,\ntitle={Provably Scalable Black-Box Variational Inference with Structured Variational Families},\nauthor={Joohwan Ko and Kyurae Kim and Woo Chang Kim and Jacob R. 
Gardner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0miAQ1qHiw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1709576, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TRsoXo541aYJ:scholar.google.com/&scioq=Provably+Scalable+Black-Box+Variational+Inference+with+Structured+Variational+Families&hl=en&as_sdt=0,33", "gs_version_total": 9, "email": "kaist.edu;kaist.ac.kr;upenn.edu;seas.upenn.edu", "author_num": 4, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.upenn.edu", "aff_unique_abbr": "KAIST;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Exact Soft Analytical Side-Channel Attacks using Tractable Circuits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35183", "id": "0mklK4h0rX", "proceeding": "https://proceedings.mlr.press/v235/wedenig24a.html", "pdf": "https://openreview.net/pdf?id=0mklK4h0rX", "openreview": "https://openreview.net/forum?id=0mklK4h0rX", "author_site": "Thomas Wedenig, Rishub Nagpal, Ga\u00ebtan Cassiers, Stefan Mangard, Robert Peharz", "tldr": "", "abstract": "Detecting weaknesses in cryptographic algorithms is of utmost importance for designing secure information systems. The state-of-the-art *soft analytical side-channel attack* (SASCA) uses physical leakage information to make probabilistic predictions about intermediate computations and combines these \"guesses\" with the known algorithmic logic to compute the posterior distribution over the key. This attack is commonly performed via loopy belief propagation, which, however, lacks guarantees in terms of convergence and inference quality. In this paper, we develop a fast and exact inference method for SASCA, denoted as ExSASCA, by leveraging knowledge compilation and tractable probabilistic circuits. When attacking the *Advanced Encryption Standard* (AES), the most widely used encryption algorithm to date, ExSASCA outperforms SASCA by more than 31% top-1 success rate absolute. By leveraging sparse belief messages, this performance is achieved with little more computational cost than SASCA, and about 3 orders of magnitude less than exact inference via exhaustive enumeration. 
Even with dense belief messages, ExSASCA still uses 6 times less computation than exhaustive inference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Wedenig;Rishub Nagpal;Ga\u00ebtan Cassiers;Stefan Mangard;Robert Peharz", "authorids": "~Thomas_Wedenig1;rishub.nagpal@iaik.tugraz.at;gaetan.cassiers@uclouvain.be;stefan.mangard@tugraz.at;~Robert_Peharz5", "gender": "M;;;;M", "homepage": ";;;;https://robert-peharz.github.io/", "dblp": ";;;;30/9232", "google_scholar": "I7QdoV0AAAAJ;;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;0000-0002-8644-9655", "linkedin": "thomas-wedenig/;;;;", "or_profile": "~Thomas_Wedenig1;rishub.nagpal@iaik.tugraz.at;gaetan.cassiers@uclouvain.be;stefan.mangard@tugraz.at;~Robert_Peharz5", "aff": "Technische Universit\u00e4t Graz;;;;Technische Universit\u00e4t Graz", "aff_domain": "tugraz.at;;;;tugraz.at", "position": "PhD student;;;;Assistant Professor", "bibtex": "@inproceedings{\nwedenig2024exact,\ntitle={Exact Soft Analytical Side-Channel Attacks using Tractable Circuits},\nauthor={Thomas Wedenig and Rishub Nagpal and Ga{\\\"e}tan Cassiers and Stefan Mangard and Robert Peharz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0mklK4h0rX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 750854, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14973843428235192026&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "tugraz.at;;;;tugraz.at", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Graz", "aff_unique_dep": "", "aff_unique_url": "https://www.tugraz.at", "aff_unique_abbr": "TU Graz", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Austria" }, { "title": "FedSC: Provable Federated Self-supervised Learning with Spectral Contrastive Objective over Non-i.i.d. Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35182", "id": "0nMzOmkBHC", "proceeding": "https://proceedings.mlr.press/v235/jing24b.html", "pdf": "https://openreview.net/pdf?id=0nMzOmkBHC", "openreview": "https://openreview.net/forum?id=0nMzOmkBHC", "author_site": "Shusen Jing, Anlan Yu, Shuai Zhang, Songyang Zhang", "tldr": "", "abstract": "Recent efforts have been made to integrate self-supervised learning (SSL) with the framework of federated learning (FL). One unique challenge of federated self-supervised learning (FedSSL) is that the global objective of FedSSL usually does not equal the weighted sum of local SSL objectives. Consequently, conventional approaches, such as federated averaging (FedAvg), fail to precisely minimize the FedSSL global objective, often resulting in suboptimal performance, especially when data is non-i.i.d. To fill this gap, we propose a provable FedSSL algorithm, named FedSC, based on the spectral contrastive objective. In FedSC, clients periodically share correlation matrices of data representations in addition to model weights, which enables inter-client contrast of data samples in addition to intra-client contrast and contraction, resulting in improved quality of data representations. Differential privacy (DP) protection is deployed to control the additional privacy leakage on local datasets when correlation matrices are shared. 
We provide theoretical analysis of convergence and the extra privacy leakage, and conduct numerical experiments to justify the effectiveness of our proposed algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shusen Jing;Anlan Yu;Shuai Zhang;Songyang Zhang", "authorids": "~Shusen_Jing1;~Anlan_Yu1;~Shuai_Zhang6;~Songyang_Zhang2", "gender": ";F;M;M", "homepage": ";;https://inchs708.github.io/shuaizhang.github.io/index.html;https://sites.google.com/view/sydzhang", "dblp": ";;71/208-15;", "google_scholar": "ujhCaecAAAAJ;hvdTK7wAAAAJ;https://scholar.google.com/citations?view_op=list_works;cZI11G4AAAAJ", "orcid": ";0000-0003-4899-5118;0000-0001-8280-6988;", "linkedin": ";anlan-yu-2a39ab139/;;", "or_profile": "~Shusen_Jing1;~Anlan_Yu1;~Shuai_Zhang6;~Songyang_Zhang2", "aff": "Lehigh University;Lehigh University;New Jersey Institute of Technology;University of Louisiana at Lafayette", "aff_domain": "lehigh.edu;lehigh.edu;njit.edu;louisiana.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njing2024fedsc,\ntitle={Fed{SC}: Provable Federated Self-supervised Learning with Spectral Contrastive Objective over Non-i.i.d. Data},\nauthor={Shusen Jing and Anlan Yu and Shuai Zhang and Songyang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0nMzOmkBHC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 687001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4863608195870509786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "lehigh.edu;lehigh.edu;njit.edu;louisiana.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Lehigh University;New Jersey Institute of Technology;University of Louisiana at Lafayette", "aff_unique_dep": ";;", "aff_unique_url": "https://www.lehigh.edu;https://www.njit.edu;https://www.louisiana.edu", "aff_unique_abbr": "Lehigh;NJIT;ULL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lafayette", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "ED-Copilot: Reduce Emergency Department Wait Time with Language Model Diagnostic Assistance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35181", "id": "0ntak1BGBd", "proceeding": "https://proceedings.mlr.press/v235/sun24a.html", "pdf": "https://openreview.net/pdf?id=0ntak1BGBd", "openreview": "https://openreview.net/forum?id=0ntak1BGBd", "author_site": "Liwen Sun, Abhineet Agarwal, Aaron Kornblith, Bin Yu, Chenyan Xiong", "tldr": "", "abstract": "In the emergency department (ED), patients undergo triage and multiple laboratory tests before diagnosis. This time-consuming process causes ED crowding, which impacts patient mortality, medical errors, staff burnout, etc. This work proposes (time) *cost-effective diagnostic assistance* that leverages artificial intelligence systems to help ED clinicians make efficient and accurate diagnoses. In collaboration with ED clinicians, we use public patient data to curate MIMIC-ED-Assist, a benchmark for AI systems to suggest laboratory tests that minimize wait time while accurately predicting critical outcomes such as death. With MIMIC-ED-Assist, we develop ED-Copilot, which sequentially suggests patient-specific laboratory tests and makes diagnostic predictions. 
ED-Copilot employs a pre-trained bio-medical language model to encode patient information and uses reinforcement learning to minimize ED wait time and maximize prediction accuracy. On MIMIC-ED-Assist, ED-Copilot improves prediction accuracy over baselines while halving average wait time from four hours to two hours. ED-Copilot can also effectively personalize treatment recommendations based on patient severity, further highlighting its potential as a diagnostic assistant. Since MIMIC-ED-Assist is a retrospective benchmark, ED-Copilot is restricted to recommend only observed tests. We show ED-Copilot achieves competitive performance without this restriction as the maximum allowed time increases. Our code is available at https://github.com/cxcscmu/ED-Copilot.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liwen Sun;Abhineet Agarwal;Aaron Kornblith;Bin Yu;Chenyan Xiong", "authorids": "~Liwen_Sun2;~Abhineet_Agarwal1;~Aaron_Kornblith1;~Bin_Yu5;~Chenyan_Xiong1", "gender": "M;M;;M;M", "homepage": ";;;https://binyu.stat.berkeley.edu;https://www.cs.cmu.edu/~cx/", "dblp": "31/7993;304/4687;;27/116;18/10886", "google_scholar": "SvhUG8wAAAAJ;;;https://scholar.google.com.hk/citations?user=z1iJa3UAAAAJ;E9BaEBYAAAAJ", "orcid": ";;0000-0002-1344-575X;0000-0003-3097-1433;", "linkedin": "dominic-liwen-sun/;abhineet-agarwal-126171185/;;bin-yu-b665063/;", "or_profile": "~Liwen_Sun2;~Abhineet_Agarwal1;~Aaron_Kornblith1;~Bin_Yu5;~Chenyan_Xiong1", "aff": "Carnegie Mellon University;University of California, Berkeley;University of California, San Francisco;University of California, Berkeley;School of Computer Science, Carnegie Mellon University", "aff_domain": "cs.cmu.edu;berkeley.edu;ucsf.edu;berkeley.edu;cs.cmu.edu", "position": "MS student;PhD student;Associate Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nsun2024edcopilot,\ntitle={{ED}-Copilot: Reduce Emergency Department Wait Time with Language Model Diagnostic Assistance},\nauthor={Liwen Sun and Abhineet Agarwal and Aaron Kornblith and Bin Yu and Chenyan Xiong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0ntak1BGBd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 577688, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4749708171357089019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cs.cmu.edu;berkeley.edu;ucsf.edu;berkeley.edu;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Carnegie Mellon University;University of California, Berkeley;University of California, San Francisco", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.berkeley.edu;https://www.ucsf.edu", "aff_unique_abbr": "CMU;UC Berkeley;UCSF", "aff_campus_unique_index": "1;2;1;3", "aff_campus_unique": ";Berkeley;San Francisco;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Levels of AGI for Operationalizing Progress on the Path to AGI", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35180", "id": "0ofzEysK2D", "proceeding": "https://proceedings.mlr.press/v235/morris24b.html", "pdf": "https://openreview.net/pdf?id=0ofzEysK2D", "openreview": "https://openreview.net/forum?id=0ofzEysK2D", "author_site": "Meredith Morris, Jascha Sohl-Dickstein, Noah Fiedel, Tris 
Warkentin, Allan Dafoe, Aleksandra Faust, Clement Farabet, Shane Legg", "tldr": "", "abstract": "We propose a framework for classifying the capabilities and behavior of Artificial General Intelligence (AGI) models and their precursors. This framework introduces levels of AGI performance, generality, and autonomy, providing a common language to compare models, assess risks, and measure progress along the path to AGI. To develop our framework, we analyze existing definitions of AGI, and distill six principles that a useful ontology for AGI should satisfy. With these principles in mind, we propose \u201cLevels of AGI\u201d based on depth (performance) and breadth (generality) of capabilities, and reflect on how current systems fit into this ontology. We discuss the challenging requirements for future benchmarks that quantify the behavior and capabilities of AGI models against these levels. Finally, we discuss how these levels of AGI interact with deployment considerations such as autonomy and risk, and emphasize the importance of carefully selecting Human-AI Interaction paradigms for responsible and safe deployment of highly capable AI systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meredith Ringel Morris;Jascha Sohl-Dickstein;Noah Fiedel;Tris Warkentin;Allan Dafoe;Aleksandra Faust;Clement Farabet;Shane Legg", "authorids": "~Meredith_Ringel_Morris1;~Jascha_Sohl-Dickstein2;~Noah_Fiedel1;~Tris_Warkentin1;~Allan_Dafoe1;~Aleksandra_Faust1;~Clement_Farabet2;~Shane_Legg1", "gender": "F;;M;M;F;M;M;M", "homepage": "http://merrie.info;;http://triswarkentin.com;http://www.allandafoe.com;http://www.afaust.info;http://clmt.ai;http://www.vetta.org;http://sohldickstein.com", "dblp": "m/MeredithRingelMorris;204/3399;;;135/8420;;36/5739;51/7117", "google_scholar": "eJsW6W8AAAAJ;;;7oCYmT8AAAAJ;RK72t68AAAAJ;u3u16tgAAAAJ;;-3zYIjQAAAAJ", "orcid": ";;;0000-0003-0377-205X;0000-0002-3268-8685;;;", "linkedin": ";;;allan-dafoe-a94787142;aleksandrafaust;clementfarabet/;;", "or_profile": "~Meredith_Ringel_Morris1;~Noah_Fiedel1;~Tris_Warkentin1;~Allan_Dafoe1;~Aleksandra_Faust1;~Clement_Farabet2;~Shane_Legg1;~Jascha_Sohl-Dickstein1", "aff": "Google DeepMind;Google;Google;Google DeepMind;Google Brain;Google;Google DeepMind;Google", "aff_domain": "google.com;google.com;google.com;deepmind.com;google.com;google.com;deepmind.com;google.com", "position": "Director;Director, Research & Engineering;PM;Researcher;Principal Researcher;Researcher;Chief Scientist;Research Scientist", "bibtex": "@inproceedings{\nmorris2024position,\ntitle={Position: Levels of {AGI} for Operationalizing Progress on the Path to {AGI}},\nauthor={Meredith Ringel Morris and Jascha Sohl-Dickstein and Noah Fiedel and Tris Warkentin and Allan Dafoe and Aleksandra Faust and Clement Farabet and Shane Legg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0ofzEysK2D}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 212889, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12778964541703770411&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;google.com;google.com;deepmind.com;google.com;google.com;deepmind.com;google.com", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": 
"DeepMind", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;1;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Improving Neural Additive Models with Bayesian Principles", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35179", "id": "0pSTzCnEmi", "proceeding": "https://proceedings.mlr.press/v235/bouchiat24a.html", "pdf": "https://openreview.net/pdf?id=0pSTzCnEmi", "openreview": "https://openreview.net/forum?id=0pSTzCnEmi", "author_site": "Kouroche Bouchiat, Alexander Immer, Hugo Y\u00e8che, Gunnar Ratsch, Vincent Fortuin", "tldr": "", "abstract": "Neural additive models (NAMs) enhance the transparency of deep neural networks by handling input features in separate additive sub-networks. However, they lack inherent mechanisms that provide calibrated uncertainties and enable selection of relevant features and interactions. Approaching NAMs from a Bayesian perspective, we augment them in three primary ways, namely by a) providing credible intervals for the individual additive sub-networks; b) estimating the marginal likelihood to perform an implicit selection of features via an empirical Bayes procedure; and c) facilitating the ranking of feature pairs as candidates for second-order interaction in fine-tuned models. In particular, we develop Laplace-approximated NAMs (LA-NAMs), which show improved empirical performance on tabular datasets and challenging real-world medical tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kouroche Bouchiat;Alexander Immer;Hugo Y\u00e8che;Gunnar Ratsch;Vincent Fortuin", "authorids": "~Kouroche_Bouchiat1;~Alexander_Immer1;~Hugo_Y\u00e8che1;~Gunnar_Ratsch1;~Vincent_Fortuin1", "gender": "M;;M;M;M", "homepage": ";;https://bmi.inf.ethz.ch/people/person/hugo-yeche;http://bmi.inf.ethz.ch;https://fortuin.github.io/", "dblp": "348/5641;;251/6034;https://dblp.uni-trier.de/pers/hd/r/R=auml=tsch:Gunnar;218/7489", "google_scholar": "WUnJTtYAAAAJ;;;https://scholar.google.ch/citations?user=tQuQ1FwAAAAJ;https://scholar.google.ch/citations?user=XBlrYTIAAAAJ", "orcid": ";;;0000-0001-5486-8532;0000-0002-0640-2671", "linkedin": "kouroche/;;;;vincent-fortuin-42426b134/", "or_profile": "~Kouroche_Bouchiat1;~Alexander_Immer1;~Hugo_Y\u00e8che1;~Gunnar_Ratsch1;~Vincent_Fortuin1", "aff": ";;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;Helmholtz AI", "aff_domain": ";;ethz.ch;ethz.ch;helmholtz.ai", "position": ";;PhD student;Professor;Principal Researcher", "bibtex": "@inproceedings{\nbouchiat2024improving,\ntitle={Improving Neural Additive Models with Bayesian Principles},\nauthor={Kouroche Bouchiat and Alexander Immer and Hugo Y{\\`e}che and Gunnar Ratsch and Vincent Fortuin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0pSTzCnEmi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4620178, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10507798791921498150&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": ";;ethz.ch;ethz.ch;helmholtz.ai", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Helmholtz Association of German Research Centres", "aff_unique_dep": ";Helmholtz AI", "aff_unique_url": 
"https://www.ethz.ch;https://www.helmholtz-ai.de", "aff_unique_abbr": "ETH Zurich;Helmholtz AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "Graph External Attention Enhanced Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35178", "id": "0rV7VIrcjX", "proceeding": "https://proceedings.mlr.press/v235/liang24a.html", "pdf": "https://openreview.net/pdf?id=0rV7VIrcjX", "openreview": "https://openreview.net/forum?id=0rV7VIrcjX", "author_site": "Jianqing Liang, Min Chen, Jiye Liang", "tldr": "", "abstract": "The Transformer architecture has recently gained considerable attention in the field of graph representation learning, as it naturally overcomes several limitations of Graph Neural Networks (GNNs) with customized attention mechanisms or positional and structural encodings. Despite making some progress, existing works tend to overlook external information of graphs, specifically the correlation between graphs. Intuitively, graphs with similar structures should have similar representations. Therefore, we propose Graph External Attention (GEA) --- a novel attention mechanism that leverages multiple external node/edge key-value units to capture inter-graph correlations implicitly. On this basis, we design an effective architecture called Graph External Attention Enhanced Transformer (GEAET), which integrates local structure and global interaction information for more comprehensive graph representations. Extensive experiments on benchmark datasets demonstrate that GEAET achieves state-of-the-art empirical performance. The source code is available for reproducibility at: https://github.com/icm1018/GEAET.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianqing Liang;Min Chen;Jiye Liang", "authorids": "~Jianqing_Liang1;202222409003@email.sxu.edu.cn;~Jiye_Liang1", "gender": "F;;M", "homepage": "http://cs.sxu.edu.cn/faculty/lecturer/4902/index.htm;;https://jiyeliang.github.io/index.html", "dblp": "128/2639.html;;80/6535", "google_scholar": ";;iGc61hUAAAAJ", "orcid": ";;0000-0001-5887-9327", "linkedin": ";;", "or_profile": "~Jianqing_Liang1;202222409003@email.sxu.edu.cn;~Jiye_Liang1", "aff": "Shanxi University;;Shanxi University", "aff_domain": "sxu.edu.cn;;sxu.edu.cn", "position": "Associate Professor;;Full Professor", "bibtex": "@inproceedings{\nliang2024graph,\ntitle={Graph External Attention Enhanced Transformer},\nauthor={Jianqing Liang and Min Chen and Jiye Liang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0rV7VIrcjX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 648139, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11241905935675401989&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "sxu.edu.cn;;sxu.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Shanxi University", "aff_unique_dep": "", "aff_unique_url": "http://www.sxu.edu.cn", "aff_unique_abbr": "SXU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Adversarial Attacks on Combinatorial Multi-Armed Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35177", "id": "0tPBk24xNj", "proceeding": 
"https://proceedings.mlr.press/v235/balasubramanian24a.html", "pdf": "https://openreview.net/pdf?id=0tPBk24xNj", "openreview": "https://openreview.net/forum?id=0tPBk24xNj", "author_site": "Rishab Balasubramanian, Jiawei Li, Tadepalli Prasad, Huazheng Wang, Qingyun Wu, Haoyu Zhao", "tldr": "", "abstract": "We study reward poisoning attacks on Combinatorial Multi-armed Bandits (CMAB). We first provide a sufficient and necessary condition for the attackability of CMAB, a notion to capture the vulnerability and robustness of CMAB. The attackability condition depends on the intrinsic properties of the corresponding CMAB instance such as the reward distributions of super arms and outcome distributions of base arms. Additionally, we devise an attack algorithm for attackable CMAB instances. Contrary to prior understanding of multi-armed bandits, our work reveals a surprising fact that the attackability of a specific CMAB instance also depends on whether the bandit instance is known or unknown to the adversary. This finding indicates that adversarial attacks on CMAB are difficult in practice and a general attack strategy for any CMAB instance does not exist since the environment is mostly unknown to the adversary. We validate our theoretical findings via extensive experiments on real-world CMAB applications including probabilistic maximum covering problem, online minimum spanning tree, cascading bandits for online ranking, and online shortest path.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rishab Balasubramanian;Jiawei Li;Prasad Tadepalli;Huazheng Wang;Qingyun Wu;Haoyu Zhao", "authorids": "~Rishab_Balasubramanian1;~Jiawei_Li10;~Prasad_Tadepalli3;~Huazheng_Wang1;~Qingyun_Wu2;~Haoyu_Zhao1", "gender": "M;;M;;M;F", "homepage": "https://rishabbala.github.io/;https://jiawei-li20.github.io;http://eecs.oregonstate.edu/~tadepall;https://huazhengwang.github.io/;http://hyzhao.me;https://qingyun-wu.github.io/", "dblp": "277/9164.html;;42/4375.html;163/2233;;183/0579", "google_scholar": "K5x0hwcAAAAJ;;CXAN0i0AAAAJ;w3PrbKwAAAAJ;1MjanHUAAAAJ;Y54J21sAAAAJ", "orcid": ";;0000-0003-2736-3912;;;", "linkedin": "rishab-bala-b01110142/;;prasad-tadepalli-85857b5/;;;", "or_profile": "~Rishab_Balasubramanian1;~Jiawei_Li10;~Prasad_Tadepalli3;~Huazheng_Wang1;~Haoyu_Zhao1;~Qingyun_Wu1", "aff": "Oregon State University;University of Illinois, Urbana Champaign;Oregon State University;Oregon State University;Princeton University;Pennsylvania State University", "aff_domain": "oregonstate.edu;uiuc.edu;oregonstate.edu;oregonstate.edu;princeton.edu;psu.edu", "position": "MS student;PhD student;Full Professor;Assistant Professor;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbalasubramanian2024adversarial,\ntitle={Adversarial Attacks on Combinatorial Multi-Armed Bandits},\nauthor={Rishab Balasubramanian and Jiawei Li and Prasad Tadepalli and Huazheng Wang and Qingyun Wu and Haoyu Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0tPBk24xNj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8023604, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7302297620536087407&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "oregonstate.edu;uiuc.edu;oregonstate.edu;oregonstate.edu;princeton.edu;psu.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;2;3", "aff_unique_norm": "Oregon State 
University;University of Illinois Urbana-Champaign;Princeton University;Pennsylvania State University", "aff_unique_dep": ";;;", "aff_unique_url": "https://oregonstate.edu;https://illinois.edu;https://www.princeton.edu;https://www.psu.edu", "aff_unique_abbr": "OSU;UIUC;Princeton;PSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Log Neural Controlled Differential Equations: The Lie Brackets Make A Difference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35176", "id": "0tYrMtQyPT", "proceeding": "https://proceedings.mlr.press/v235/walker24a.html", "pdf": "https://openreview.net/pdf?id=0tYrMtQyPT", "openreview": "https://openreview.net/forum?id=0tYrMtQyPT", "author_site": "Benjamin Walker, Andrew McLeod, Tiexin QIN, Yichuan Cheng, Haoliang Li, Terry Lyons", "tldr": "", "abstract": "The vector field of a controlled differential equation (CDE) describes the relationship between a *control* path and the evolution of a *solution* path. Neural CDEs (NCDEs) treat time series data as observations from a control path, parameterise a CDE's vector field using a neural network, and use the solution path as a continuously evolving hidden state. As their formulation makes them robust to irregular sampling rates, NCDEs are a powerful approach for modelling real-world data. Building on neural rough differential equations (NRDEs), we introduce Log-NCDEs, a novel, effective, and efficient method for training NCDEs. The core component of Log-NCDEs is the Log-ODE method, a tool from the study of rough paths for approximating a CDE's solution. Log-NCDEs are shown to outperform NCDEs, NRDEs, the linear recurrent unit, S5, and MAMBA on a range of multivariate time series datasets with up to $50{,}000$ observations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin Walker;Andrew Donald McLeod;Tiexin Qin;Yichuan Cheng;Haoliang Li;Terry Lyons", "authorids": "~Benjamin_Walker1;~Andrew_Donald_McLeod1;~Tiexin_Qin1;~Yichuan_Cheng1;~Haoliang_Li2;~Terry_Lyons2", "gender": ";M;M;M;;M", "homepage": ";https://www.maths.ox.ac.uk/people/andrew.mcleod;;;;https://DataSig.ac.uk/", "dblp": "344/6314;;;355/2578;;", "google_scholar": ";;86PL14gAAAAJ;;;", "orcid": "0000-0002-9574-973X;;0000-0003-2968-7320;0009-0006-3705-5437;;", "linkedin": "benjamin-walker-3880771b9/;;;;;", "or_profile": "~Benjamin_Walker1;~Andrew_Donald_McLeod1;~Tiexin_Qin1;~Yichuan_Cheng1;~Haoliang_Li2;~Terry_Lyons2", "aff": "Weierstrass Institute for Applied Analysis and Stochastics;University of Oxford;City University of Hong Kong;City University of Hong Kong;;University of Oxford", "aff_domain": "wias-berlin.de;ox.ac.uk;cityu.edu.hk;cityu.edu.hk;;ox.ac.uk", "position": "Intern;Postdoc;PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nwalker2024log,\ntitle={Log Neural Controlled Differential Equations: The Lie Brackets Make A Difference},\nauthor={Benjamin Walker and Andrew Donald McLeod and Tiexin Qin and Yichuan Cheng and Haoliang Li and Terry Lyons},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0tYrMtQyPT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4231494, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=6281669172364560578&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "wias-berlin.de;ox.ac.uk;cityu.edu.hk;cityu.edu.hk;;ox.ac.uk", "author_num": 6, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "Weierstrass Institute for Applied Analysis and Stochastics;University of Oxford;City University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wias-berlin.de/;https://www.ox.ac.uk;https://www.cityu.edu.hk", "aff_unique_abbr": "WIAS;Oxford;CityU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;2;1", "aff_country_unique": "Germany;United Kingdom;China" }, { "title": "Complexity Matters: Feature Learning in the Presence of Spurious Correlations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35175", "id": "0tuwdgBiSN", "proceeding": "https://proceedings.mlr.press/v235/qiu24e.html", "pdf": "https://openreview.net/pdf?id=0tuwdgBiSN", "openreview": "https://openreview.net/forum?id=0tuwdgBiSN", "author_site": "GuanWen Qiu, Da Kuang, Surbhi Goel", "tldr": "", "abstract": "Existing research often posits spurious features as **easier** to learn than core features in neural network optimization, but the impact of their relative simplicity remains under-explored. Moreover, studies mainly focus on end performance rather than the learning dynamics of feature learning. In this paper, we propose a theoretical framework and an associated synthetic dataset grounded in boolean function analysis. This setup allows for fine-grained control over the relative complexity (compared to core features) and correlation strength (with respect to the label) of spurious features to study the dynamics of feature learning under spurious correlations. Our findings uncover several interesting phenomena: (1) stronger spurious correlations or simpler spurious features slow down the learning rate of the core features, (2) two distinct subnetworks are formed to learn core and spurious features separately, (3) learning phases of spurious and core features are not always separable, (4) spurious features are not forgotten even after core features are fully learned. We demonstrate that our findings justify the success of retraining the last layer to remove spurious correlation and also identifies limitations of popular debiasing algorithms that exploit early learning of spurious features. 
We support our empirical findings with theoretical analyses for the case of learning XOR features with a one-hidden-layer ReLU network.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "GuanWen Qiu;Da Kuang;Surbhi Goel", "authorids": "~GuanWen_Qiu1;~Da_Kuang2;~Surbhi_Goel1", "gender": "M;M;F", "homepage": ";https://www.kuangda.tech/;https://www.surbhigoel.com", "dblp": ";;190/7815", "google_scholar": ";;https://scholar.google.co.in/citations?user=Zqz4CQoAAAAJ", "orcid": ";;", "linkedin": "guanwen-qiu-92b6651b0/;;", "or_profile": "~GuanWen_Qiu1;~Da_Kuang2;~Surbhi_Goel1", "aff": "School of Engineering and Science, University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "seas.upenn.edu;upenn.edu;upenn.edu", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nqiu2024complexity,\ntitle={Complexity Matters: Feature Learning in the Presence of Spurious Correlations},\nauthor={GuanWen Qiu and Da Kuang and Surbhi Goel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0tuwdgBiSN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4895204, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7266456410965936021&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 4, "email": "seas.upenn.edu;upenn.edu;upenn.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "School of Engineering and Science", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "DiJiang: Efficient Large Language Models through Compact Kernelization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35174", "id": "0uUHfhXdnH", "proceeding": "https://proceedings.mlr.press/v235/chen24ab.html", "pdf": "https://openreview.net/pdf?id=0uUHfhXdnH", "openreview": "https://openreview.net/forum?id=0uUHfhXdnH", "author_site": "Hanting Chen, Liuzhicheng Liuzhicheng, Xutao Wang, Yuchuan Tian, Yunhe Wang", "tldr": "", "abstract": "In an effort to reduce the computational load of Transformers, research on linear attention has gained significant momentum. However, the improvement strategies for attention mechanisms typically necessitate extensive retraining, which is impractical for large language models with a vast array of parameters. In this paper, we present DiJiang, a novel Frequency Domain Kernelization approach that enables the transformation of a pre-trained vanilla Transformer into a linear complexity model with little training costs. By employing a weighted Quasi-Monte Carlo method for sampling, the proposed approach theoretically offers superior approximation efficiency. To further reduce the training computational complexity, our kernelization is based on Discrete Cosine Transform (DCT) operations. Extensive experiments demonstrate that the proposed method achieves comparable performance to the original Transformer, but with significantly reduced training costs and much faster inference speeds. Our DiJiang-7B achieves comparable performance with LLaMA2-7B on various benchmark while requires only about 1/50 training cost. 
Code is available at https://github.com/YuchuanTian/DiJiang.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanting Chen;Liuzhicheng;Xutao Wang;Yuchuan Tian;Yunhe Wang", "authorids": "~Hanting_Chen1;~Liuzhicheng2;~Xutao_Wang1;~Yuchuan_Tian1;~Yunhe_Wang1", "gender": "M;;M;M;M", "homepage": ";https://github.com/Lzc06;;;https://www.wangyunhe.site/", "dblp": "232/2060;;;193/6675;63/8217-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.sg/citations?user=isizOkYAAAAJ", "orcid": ";;;;0000-0002-0142-509X", "linkedin": ";;;;", "or_profile": "~Hanting_Chen1;~Liuzhicheng2;~Xutao_Wang1;~Yuchuan_Tian1;~Yunhe_Wang1", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;;Peking University;Huawei Noah's Ark Lab", "aff_domain": "huawei.com;huawei.com;;pku.edu.cn;huawei.com", "position": "Researcher;Researcher;;PhD student;Principal Researcher", "bibtex": "@inproceedings{\nchen2024dijiang,\ntitle={DiJiang: Efficient Large Language Models through Compact Kernelization},\nauthor={Hanting Chen and Liuzhicheng and Xutao Wang and Yuchuan Tian and Yunhe Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0uUHfhXdnH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 599077, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9451344766463580985&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "email": "huawei.com;huawei.com;;pku.edu.cn;huawei.com", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Huawei;Peking University", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;http://www.pku.edu.cn", "aff_unique_abbr": "Huawei;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "PEARL: Zero-shot Cross-task Preference Alignment and Robust Reward Learning for Robotic Manipulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35173", "id": "0urN0PnNDj", "proceeding": "https://proceedings.mlr.press/v235/liu24o.html", "pdf": "https://openreview.net/pdf?id=0urN0PnNDj", "openreview": "https://openreview.net/forum?id=0urN0PnNDj", "author_site": "Runze Liu, Yali Du, Fengshuo Bai, Jiafei Lyu, Xiu Li", "tldr": "", "abstract": "In preference-based Reinforcement Learning (RL), obtaining a large number of preference labels are both time-consuming and costly. Furthermore, the queried human preferences cannot be utilized for the new tasks. In this paper, we propose Zero-shot Cross-task Preference Alignment and Robust Reward Learning (PEARL), which learns policies from cross-task preference transfer without any human labels of the target task. Our contributions include two novel components that facilitate the transfer and learning process. The first is Cross-task Preference Alignment (CPA), which transfers the preferences between tasks via optimal transport. The key idea of CPA is to use Gromov-Wasserstein distance to align the trajectories between tasks, and the solved optimal transport matrix serves as the correspondence between trajectories. The target task preferences are computed as the weighted sum of source task preference labels with the correspondence as weights. 
Moreover, to ensure robust learning from these transferred labels, we introduce Robust Reward Learning (RRL), which considers both reward mean and uncertainty by modeling rewards as Gaussian distributions. Empirical results on robotic manipulation tasks from Meta-World and Robomimic demonstrate that our method is capable of transferring preference labels across tasks accurately and then learns well-behaved policies. Notably, our approach significantly exceeds existing methods when there are few human preferences. The code and videos of our method are available at: https://sites.google.com/view/pearl-preference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Runze Liu;Yali Du;Fengshuo Bai;Jiafei Lyu;Xiu Li", "authorids": "~Runze_Liu2;~Yali_Du1;~Fengshuo_Bai1;~Jiafei_Lyu1;~Xiu_Li1", "gender": "M;;;M;F", "homepage": "https://ryanliu112.github.io/;;https://changwinde.github.io/;;https://thusigsiclab.github.io/thu.github.io/introduction.html", "dblp": "235/0682-2;;346/1114;278/1503;13/1206-1", "google_scholar": "LiIfGakAAAAJ;;https://scholar.google.com/citations?hl=en;bfgCMr8AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0009-0007-4784-5333;;;0000-0001-6616-417X;0000-0003-0403-1923", "linkedin": ";;;;", "or_profile": "~Runze_Liu2;~Yali_Du1;~Fengshuo_Bai1;~Jiafei_Lyu1;~Xiu_Li1", "aff": "Shanghai Artificial Intelligence Laboratory;;Shanghai Jiaotong University;Tsinghua University;Tsinghua University", "aff_domain": "pjlab.org.cn;;sjtu.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Intern;;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nliu2024pearl,\ntitle={{PEARL}: Zero-shot Cross-task Preference Alignment and Robust Reward Learning for Robotic Manipulation},\nauthor={Runze Liu and Yali Du and Fengshuo Bai and Jiafei Lyu and Xiu Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0urN0PnNDj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3816666, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8975735774279042168&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "pjlab.org.cn;;sjtu.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Shanghai Jiao Tong University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.shailab.org/;https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Shanghai AI Lab;SJTU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Efficient Contextual Bandits with Uninformed Feedback Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35172", "id": "0vozy8vstt", "proceeding": "https://proceedings.mlr.press/v235/zhang24ce.html", "pdf": "https://openreview.net/pdf?id=0vozy8vstt", "openreview": "https://openreview.net/forum?id=0vozy8vstt", "author_site": "Mengxiao Zhang, Yuheng Zhang, Haipeng Luo, Paul Mineiro", "tldr": "", "abstract": "Bandits with feedback graphs are powerful online learning models that interpolate between the full information and classic bandit problems, capturing many real-life applications. 
A recent work by [Zhang et al., 2023] studies the contextual version of this problem and proposes an efficient and optimal algorithm via a reduction to online regression. However, their algorithm crucially relies on seeing the feedback graph before making each decision, while in many applications, the feedback graph is *uninformed*, meaning that it is either only revealed after the learner makes her decision or even never fully revealed at all. This work develops the first contextual algorithms for such uninformed settings, via an efficient reduction to online regression over both the losses and the graphs. Importantly, we show that it is critical to learn the graphs using *log loss* instead of squared loss to obtain favorable regret guarantees. We also demonstrate the empirical effectiveness of our algorithm on a bidding application using both synthetic and real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengxiao Zhang;Yuheng Zhang;Haipeng Luo;Paul Mineiro", "authorids": "~Mengxiao_Zhang2;~Yuheng_Zhang1;~Haipeng_Luo1;~Paul_Mineiro1", "gender": ";M;M;", "homepage": ";;https://haipeng-luo.net/;", "dblp": ";;62/2576;35/5613", "google_scholar": ";IoEBLNYAAAAJ;ct2hw4UAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Mengxiao_Zhang2;~Yuheng_Zhang1;~Haipeng_Luo1;~Paul_Mineiro1", "aff": ";University of Illinois, Urbana Champaign;University of Southern California;", "aff_domain": ";cs.illinois.edu;usc.edu;", "position": ";PhD student;Associate Professor;", "bibtex": "@inproceedings{\nzhang2024efficient,\ntitle={Efficient Contextual Bandits with Uninformed Feedback Graphs},\nauthor={Mengxiao Zhang and Yuheng Zhang and Haipeng Luo and Paul Mineiro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0vozy8vstt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3523837, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1377020143143731659&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";cs.illinois.edu;usc.edu;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Southern California", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.usc.edu", "aff_unique_abbr": "UIUC;USC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Urbana-Champaign;Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neural-Kernel Conditional Mean Embeddings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35171", "id": "0wso32h0jc", "proceeding": "https://proceedings.mlr.press/v235/shimizu24a.html", "pdf": "https://openreview.net/pdf?id=0wso32h0jc", "openreview": "https://openreview.net/forum?id=0wso32h0jc", "author_site": "Eiki Shimizu, Kenji Fukumizu, Dino Sejdinovic", "tldr": "", "abstract": "Kernel conditional mean embeddings (CMEs) offer a powerful framework for representing conditional distributions, but they often face scalability and expressiveness challenges. In this work, we propose a new method that effectively combines the strengths of deep learning with CMEs in order to address these challenges. Specifically, our approach leverages the end-to-end neural network (NN) optimization framework using a kernel-based objective. 
This design circumvents the computationally expensive Gram matrix inversion required by current CME methods. To further enhance performance, we provide efficient strategies to optimize the remaining kernel hyperparameters. In conditional density estimation tasks, our NN-CME hybrid achieves competitive performance and often surpasses existing deep learning-based methods. Lastly, we showcase its remarkable versatility by seamlessly integrating it into reinforcement learning (RL) contexts. Building on Q-learning, our approach naturally leads to a new variant of distributional RL methods, which demonstrates consistent effectiveness across different environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eiki Shimizu;Kenji Fukumizu;Dino Sejdinovic", "authorids": "~Eiki_Shimizu1;~Kenji_Fukumizu1;~Dino_Sejdinovic1", "gender": ";M;M", "homepage": ";http://www.ism.ac.jp/~fukumizu/;https://sejdino.github.io/", "dblp": ";96/464;31/1783", "google_scholar": ";;v8Dg1lIAAAAJ", "orcid": "0009-0005-3592-8903;0000-0002-3488-2625;0000-0001-5547-9213", "linkedin": ";;https://linkedin.com/in/dinosejdinovic", "or_profile": "~Eiki_Shimizu1;~Kenji_Fukumizu1;~Dino_Sejdinovic1", "aff": "The Institute of Statistical Mathematics, Japan;The Institute of Statistical Mathematics, Japan, Tokyo Institute of Technology;University of Adelaide", "aff_domain": "ism.ac.jp;ism.ac.jp;adelaide.edu.au", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nshimizu2024neuralkernel,\ntitle={Neural-Kernel Conditional Mean Embeddings},\nauthor={Eiki Shimizu and Kenji Fukumizu and Dino Sejdinovic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0wso32h0jc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 619455, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17425210034453127733&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ism.ac.jp;ism.ac.jp;adelaide.edu.au", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Institute of Statistical Mathematics;University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "https://www.ism.ac.jp;https://www.adelaide.edu.au", "aff_unique_abbr": "ISM;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Japan;Australia" }, { "title": "Provable Risk-Sensitive Distributional Reinforcement Learning with General Function Approximation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35170", "id": "0xmfExPqFf", "proceeding": "https://proceedings.mlr.press/v235/chen24bf.html", "pdf": "https://openreview.net/pdf?id=0xmfExPqFf", "openreview": "https://openreview.net/forum?id=0xmfExPqFf", "author_site": "Yu Chen, XiangCheng Zhang, Siwei Wang, Longbo Huang", "tldr": "", "abstract": "In the realm of reinforcement learning (RL), accounting for risk is crucial for making decisions under uncertainty, particularly in applications where safety and reliability are paramount. In this paper, we introduce a general framework on Risk-Sensitive Distributional Reinforcement Learning (RS-DisRL), with static Lipschitz Risk Measures (LRM) and general function approximation. 
Our framework covers a broad class of risk-sensitive RL, and facilitates analysis of the impact of estimation functions on the effectiveness of RSRL strategies and evaluation of their sample complexity. We design two innovative meta-algorithms: RS-DisRL-M, a model-based strategy for model-based function approximation, and RS-DisRL-V, a model-free approach for general value function approximation. With our novel estimation techniques via Least Squares Regression (LSR) and Maximum Likelihood Estimation (MLE) in distributional RL with augmented Markov Decision Process (MDP), we derive the first $\\widetilde{\\mathcal{O}}(\\sqrt{K})$ dependency of the regret upper bound for RSRL with static LRM, marking a pioneering contribution towards statistically efficient algorithms in this domain.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Chen;XiangCheng Zhang;Siwei Wang;Longbo Huang", "authorids": "~Yu_Chen19;~XiangCheng_Zhang1;~Siwei_Wang2;~Longbo_Huang2", "gender": "M;M;M;M", "homepage": "https://custyhs.github.io/;;https://www.microsoft.com/en-us/research/people/siweiwang/publications/;http://people.iiis.tsinghua.edu.cn/~huang/", "dblp": "87/1254-74;;51/8279-2;79/7077", "google_scholar": "rJ6Ipa0AAAAJ;;;", "orcid": "0009-0006-9503-6613;;;", "linkedin": ";%E6%A9%A1%E6%88%90-%E7%AB%A0-019354242/;;", "or_profile": "~Yu_Chen19;~XiangCheng_Zhang1;~Siwei_Wang2;~Longbo_Huang2", "aff": "Tsinghua University; Tsinghua University, Tsinghua University;Microsoft;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mails.tsinghua.edu.cn;microsoft.com;tsinghua.edu.cn", "position": "PhD student;Undergrad student;Researcher;Full Professor", "bibtex": "@inproceedings{\nchen2024provable,\ntitle={Provable Risk-Sensitive Distributional Reinforcement Learning with General Function Approximation},\nauthor={Yu Chen and XiangCheng Zhang and Siwei Wang and Longbo Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0xmfExPqFf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 818576, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13217149982218938551&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;mails.tsinghua.edu.cn;microsoft.com;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tsinghua University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "THU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Robust Optimization in Protein Fitness Landscapes Using Reinforcement Learning in Latent Space", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35169", "id": "0zbxwvJqwf", "proceeding": "https://proceedings.mlr.press/v235/lee24x.html", "pdf": "https://openreview.net/pdf?id=0zbxwvJqwf", "openreview": "https://openreview.net/forum?id=0zbxwvJqwf", "author_site": "Minji Lee, Luiz Felipe Vecchietti, Hyunkyu Jung, Hyun Joo Ro, MEEYOUNG CHA, Ho Min Kim", "tldr": "", "abstract": "Proteins are complex molecules responsible for different functions in nature. Enhancing the functionality of proteins and cellular fitness can significantly impact various industries. 
However, protein optimization using computational methods remains challenging, especially when starting from low-fitness sequences. We propose LatProtRL, an optimization method to efficiently traverse a latent space learned by an encoder-decoder leveraging a large protein language model. To escape local optima, our optimization is modeled as a Markov decision process using reinforcement learning acting directly in latent space. We evaluate our approach on two important fitness optimization tasks, demonstrating its ability to achieve comparable or superior fitness over baseline methods. Our findings and in vitro evaluation show that the generated sequences can reach high-fitness regions, suggesting a substantial potential of LatProtRL in lab-in-the-loop scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minji Lee;Luiz Felipe Vecchietti;Hyunkyu Jung;Hyun Joo Ro;Meeyoung Cha;Ho Min Kim", "authorids": "~Minji_Lee1;~Luiz_Felipe_Vecchietti1;~Hyunkyu_Jung1;~Hyun_Joo_Ro1;~Meeyoung_Cha2;~Ho_Min_Kim1", "gender": "F;M;M;F;F;M", "homepage": "https://haewonc.github.io/;https://lfelipesv.github.io/;;https://openreview.net/login;https://www.mpi-sp.org/cha;", "dblp": ";248/4211;;https://dblp.uni-trier.de;57/4924;", "google_scholar": ";xPV0ONYAAAAJ;https://scholar.google.com/citations?hl=ko;https://scholar.google;iFlnVCoAAAAJ;https://scholar.google.co.kr/citations?user=AhBBZ-AAAAAJ", "orcid": ";0000-0003-2862-6200;;0000-0000-0000-0000;0000-0003-4085-9648;", "linkedin": ";;;linkedin;meeyoungcha/;", "or_profile": "~Minji_Lee1;~Luiz_Felipe_Vecchietti1;~Hyunkyu_Jung1;~Hyun_Joo_Ro1;~Meeyoung_Cha2;~Ho_Min_Kim1", "aff": "Korea Advanced Institute of Science & Technology;Institute for Basic Science;Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;ibs.re.kr;kaist.ac.kr;;kaist.ac.kr;", "position": "Undergrad student;Researcher;MS student;;Full Professor;", "bibtex": "@inproceedings{\nlee2024robust,\ntitle={Robust Optimization in Protein Fitness Landscapes Using Reinforcement Learning in Latent Space},\nauthor={Minji Lee and Luiz Felipe Vecchietti and Hyunkyu Jung and Hyun Joo Ro and Meeyoung Cha and Ho Min Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0zbxwvJqwf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4332532, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17345995142665920558&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "kaist.ac.kr;ibs.re.kr;kaist.ac.kr;;kaist.ac.kr;", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Institute for Basic Science", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.ibs.re.kr", "aff_unique_abbr": "KAIST;IBS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Sparse is Enough in Fine-tuning Pre-trained Large Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35168", "id": "10hu2D3hAg", "proceeding": "https://proceedings.mlr.press/v235/song24e.html", "pdf": "https://openreview.net/pdf?id=10hu2D3hAg", "openreview": "https://openreview.net/forum?id=10hu2D3hAg", "author_site": "Weixi Song, Zuchao Li, Lefei 
Zhang, hai zhao, Bo Du", "tldr": "", "abstract": "With the prevalence of pre-training-fine-tuning paradigm, how to efficiently adapt the pre-trained model to the downstream tasks has been an intriguing issue. $\\textbf{P}$arameter-$\\textbf{E}$fficient $\\textbf{F}$ine-$\\textbf{T}$uning(PEFT) methods have been proposed for low-cost adaptation. Although PEFT has demonstrated effectiveness and been widely applied, the underlying principles are still unclear. In this paper, we adopt the PAC-Bayesian generalization error bound, viewing pre-training as a shift of prior distribution which leads to a tighter bound for generalization error. We validate this shift from the perspectives of oscillations in the loss landscape and the quasi-sparsity in gradient distribution. Based on this, we propose a gradient-based sparse fine-tuning algorithm, named $\\textbf{S}$parse $\\textbf{I}$ncrement $\\textbf{F}$ine-$\\textbf{T}$uning(SIFT), and validate its effectiveness on a range of tasks including the GLUE Benchmark and Instruction-tuning. The code is accessible at https://github.com/song-wx/SIFT/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weixi Song;Zuchao Li;Lefei Zhang;hai zhao;Bo Du", "authorids": "~Weixi_Song2;~Zuchao_Li1;~Lefei_Zhang1;~hai_zhao1;~Bo_Du3", "gender": "M;M;M;M;M", "homepage": "https://song-wx.github.io;https://zcli-charlie.github.io/;;http://bcmi.sjtu.edu.cn/~zhaohai/;", "dblp": ";198/9339;28/10770;25/1145-1.html;70/6443-1.html", "google_scholar": "fvP8SGcAAAAJ;PyzBf5oAAAAJ;BLKHwNwAAAAJ;https://scholar.google.com.tw/citations?user=4dU5KS0AAAAJ;Shy1gnMAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Weixi_Song2;~Zuchao_Li1;~Lefei_Zhang1;~hai_zhao1;~Bo_Du1", "aff": "Wuhan University;Wuhan University;Wuhan University;Shanghai Jiaotong University;Wuhan University", "aff_domain": "whu.edu.cn;whu.edu.cn;whu.edu.cn;sjtu.edu.cn;whu.edu.cn", "position": "Undergrad student;Researcher;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nsong2024sparse,\ntitle={Sparse is Enough in Fine-tuning Pre-trained Large Language Models},\nauthor={Weixi Song and Zuchao Li and Lefei Zhang and hai zhao and Bo Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=10hu2D3hAg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1374066, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=881567673870743597&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 6, "email": "whu.edu.cn;whu.edu.cn;whu.edu.cn;sjtu.edu.cn;whu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Wuhan University;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "http://www.whu.edu.cn/;https://www.sjtu.edu.cn", "aff_unique_abbr": "WHU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Dual-module Framework for Counterfactual Estimation over Time", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35167", "id": "126SR50BEL", "proceeding": "https://proceedings.mlr.press/v235/wang24au.html", "pdf": "https://openreview.net/pdf?id=126SR50BEL", "openreview": "https://openreview.net/forum?id=126SR50BEL", "author_site": "Xin Wang, Shengfei Lyu, Lishan Yang, Yibing Zhan, Huanhuan Chen", "tldr": "", "abstract": "Efficiently 
and effectively estimating counterfactuals over time is crucial for optimizing treatment strategies. We present the Adversarial Counterfactual Temporal Inference Network (ACTIN), a novel framework with dual modules to enhance counterfactual estimation. The balancing module employs a distribution-based adversarial method to learn balanced representations, extending beyond the limitations of current classification-based methods to mitigate confounding bias across various treatment types. The integrating module adopts a novel Temporal Integration Predicting (TIP) strategy, which has a wider receptive field of treatments and balanced representations from the beginning to the current time for a more profound level of analysis. TIP goes beyond the established Direct Predicting (DP) strategy, which only relies on current treatments and representations, by empowering the integrating module to effectively capture long-range dependencies and temporal treatment interactions. ACTIN exceeds the confines of specific base models, and when implemented with simple base models, consistently delivers state-of-the-art performance and efficiency across both synthetic and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xin Wang;Shengfei Lyu;Lishan Yang;Yibing Zhan;Huanhuan Chen", "authorids": "~Xin_Wang46;~Shengfei_Lyu1;~Lishan_Yang2;~Yibing_Zhan2;~Huanhuan_Chen1", "gender": "M;M;M;;", "homepage": "https://github.com/wangxin0126;;;;", "dblp": ";268/5763.html;;;", "google_scholar": ";;;;", "orcid": ";;0009-0007-1917-6822;;", "linkedin": ";;;;", "or_profile": "~Xin_Wang46;~Shengfei_Lyu1;~Lishan_Yang2;~Yibing_Zhan2;~Huanhuan_Chen1", "aff": "University of Science and Technology of China;Nanyang Technological University;University of Science and Technology of China;;", "aff_domain": "ustc.edu.cn;ntu.edu.sg;ustc.edu.cn;;", "position": "PhD student;Researcher;PhD student;;", "bibtex": "@inproceedings{\nwang2024a,\ntitle={A Dual-module Framework for Counterfactual Estimation over Time},\nauthor={Xin Wang and Shengfei Lyu and Lishan Yang and Yibing Zhan and Huanhuan Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=126SR50BEL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 900011, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1555588359541519838&as_sdt=5,39&sciodt=0,39&hl=en", "gs_version_total": 4, "email": "ustc.edu.cn;ntu.edu.sg;ustc.edu.cn;;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Science and Technology of China;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "USTC;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Singapore" }, { "title": "Tuning-free Estimation and Inference of Cumulative Distribution Function under Local Differential Privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35166", "id": "15MpDbv3IQ", "proceeding": "https://proceedings.mlr.press/v235/liu24z.html", "pdf": "https://openreview.net/pdf?id=15MpDbv3IQ", "openreview": "https://openreview.net/forum?id=15MpDbv3IQ", "author_site": "Yi Liu, Qirui Hu, Linglong Kong", "tldr": "", "abstract": "We introduce a novel algorithm for estimating Cumulative 
Distribution Function (CDF) values under Local Differential Privacy (LDP) by exploiting an unexpected connection between LDP and the current status problem, a classical survival data problem in statistics. This connection leads to the development of tools for constrained isotonic estimation based on binary queries. Through mathematical proofs and extensive numerical testing, we demonstrate that our method achieves uniform and $L_2$ error bounds when estimating the entire CDF curve. By employing increasingly dense grids, the error bound can be improved, exhibiting an asymptotic normal distribution of the proposed estimator. Theoretically, we show that the error bound smoothly changes as the number of grids increases relative to the sample size $n$. Computationally, we demonstrate that our constrained isotonic estimator can be efficiently computed deterministically, eliminating the need for hyperparameters or random optimization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Liu;Qirui Hu;Linglong Kong", "authorids": "~Yi_Liu13;~Qirui_Hu1;~Linglong_Kong2", "gender": "M;M;M", "homepage": "https://apps.ualberta.ca/directory/person/yliu16;;https://www.ualberta.ca/~lkong", "dblp": "97/4626-62;;35/8525", "google_scholar": ";;https://scholar.google.ca/citations?hl=en", "orcid": ";0000-0002-4846-3886;0000-0003-3011-9216", "linkedin": ";;", "or_profile": "~Yi_Liu13;~Qirui_Hu1;~Linglong_Kong2", "aff": "University of Alberta;Tsinghua University;University of Alberta", "aff_domain": "ualberta.ca;tsinghua.edu.cn;ualberta.ca", "position": "Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024tuningfree,\ntitle={Tuning-free Estimation and Inference of Cumulative Distribution Function under Local Differential Privacy},\nauthor={Yi Liu and Qirui Hu and Linglong Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=15MpDbv3IQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 723655, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Y7nCn-1aBvUJ:scholar.google.com/&scioq=Tuning-free+Estimation+and+Inference+of+Cumulative+Distribution+Function+under+Local+Differential+Privacy&hl=en&as_sdt=0,10", "gs_version_total": 5, "email": "ualberta.ca;tsinghua.edu.cn;ualberta.ca", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Alberta;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UAlberta;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;China" }, { "title": "PlanDQ: Hierarchical Plan Orchestration via D-Conductor and Q-Performer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35165", "id": "17ZwoHl65h", "proceeding": "https://proceedings.mlr.press/v235/chen24a.html", "pdf": "https://openreview.net/pdf?id=17ZwoHl65h", "openreview": "https://openreview.net/forum?id=17ZwoHl65h", "author_site": "Chang Chen, Junyeob Baek, Fei Deng, Kenji Kawaguchi, Caglar Gulcehre, Sungjin Ahn", "tldr": "", "abstract": "Despite the recent advancements in offline RL, no unified algorithm could achieve superior performance across a broad range of tasks. 
Offline *value function learning*, in particular, struggles with sparse-reward, long-horizon tasks due to the difficulty of solving credit assignment and extrapolation errors that accumulate as the horizon of the task grows. On the other hand, models that can perform well in long-horizon tasks are designed specifically for goal-conditioned tasks, which commonly perform worse than value function learning methods on short-horizon, dense-reward scenarios. To bridge this gap, we propose a hierarchical planner designed for offline RL called PlanDQ. PlanDQ incorporates a diffusion-based planner at the high level, named D-Conductor, which guides the low-level policy through sub-goals. At the low level, we use a Q-learning based approach called the Q-Performer to accomplish these sub-goals. Our experimental results suggest that PlanDQ can achieve superior or competitive performance on D4RL continuous control benchmark tasks as well as AntMaze, Kitchen, and Calvin as long-horizon tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chang Chen;Junyeob Baek;Fei Deng;Kenji Kawaguchi;Caglar Gulcehre;Sungjin Ahn", "authorids": "~Chang_Chen1;~Junyeob_Baek1;~Fei_Deng1;~Kenji_Kawaguchi1;~Caglar_Gulcehre1;~Sungjin_Ahn1", "gender": ";;M;;M;", "homepage": "https://www.linkedin.com/in/chen-chang-bba27643/;https://cun-bjy.github.io/;;https://ml.comp.nus.edu.sg/#members;http://caglarg.com;", "dblp": ";379/6400;46/10037-1;;125/2132;", "google_scholar": ";https://scholar.google.co.kr/citations?user=Nr0st_gAAAAJ;https://scholar.google.com/citations?hl=en;aLl3rYoAAAAJ;https://scholar.google.ca/citations?user=7hwJ2ckAAAAJ;", "orcid": ";;;;;", "linkedin": ";junyeob-baek-640abb5b/;;;;", "or_profile": "~Chang_Chen1;~Junyeob_Baek1;~Fei_Deng1;~Kenji_Kawaguchi1;~Caglar_Gulcehre1;~Sungjin_Ahn1", "aff": "Rutgers University;Korea Advanced Institute of Science & Technology;Rutgers University;National University of Singapore;EPFL - EPF Lausanne;", "aff_domain": "rutgers.edu;kaist.ac.kr;rutgers.edu;nus.edu;epfl.ch;", "position": "Phd student;MS student;PhD student;Presidential Young Professor;EPFL;", "bibtex": "@inproceedings{\nchen2024plandq,\ntitle={Plan{DQ}: Hierarchical Plan Orchestration via D-Conductor and Q-Performer},\nauthor={Chang Chen and Junyeob Baek and Fei Deng and Kenji Kawaguchi and Caglar Gulcehre and Sungjin Ahn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=17ZwoHl65h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1412221, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11389306770917872180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "rutgers.edu;kaist.ac.kr;rutgers.edu;nus.edu;epfl.ch;", "author_num": 6, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Rutgers University;Korea Advanced Institute of Science and Technology;National University of Singapore;EPFL", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.rutgers.edu;https://www.kaist.ac.kr;https://www.nus.edu.sg;https://www.epfl.ch", "aff_unique_abbr": "Rutgers;KAIST;NUS;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;0;2;3", "aff_country_unique": "United States;South Korea;Singapore;Switzerland" }, { "title": "NeWRF: A Deep Learning Framework for Wireless Radiation Field Reconstruction and Channel Prediction", "status": "Poster", "track": 
"main", "site": "https://icml.cc/virtual/2024/poster/35164", "id": "181hXof7ho", "proceeding": "https://proceedings.mlr.press/v235/lu24j.html", "pdf": "https://openreview.net/pdf?id=181hXof7ho", "openreview": "https://openreview.net/forum?id=181hXof7ho", "author_site": "Haofan Lu, Christopher Vattheuer, Baharan Mirzasoleiman, Omid Abari", "tldr": "", "abstract": "We present NeWRF, a novel deep-learning-based framework for predicting wireless channels. Wireless channel prediction is a long-standing problem in the wireless community and is a key technology for improving the coverage of wireless network deployments. Today, a wireless deployment is evaluated by a site survey which is a cumbersome process requiring an experienced engineer to perform extensive channel measurements. To reduce the cost of site surveys, we develop NeWRF, which is based on recent advances in Neural Radiance Fields (NeRF). NeWRF trains a neural network model with a sparse set of channel measurements, and predicts the wireless channel accurately at any location in the site. We introduce a series of techniques that integrate wireless propagation properties into the NeRF framework to account for the fundamental differences between the behavior of light and wireless signals. We conduct extensive evaluations of our framework and show that our approach can accurately predict channels at unvisited locations with significantly lower measurement density than prior state-of-the-art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haofan Lu;Christopher Vattheuer;Baharan Mirzasoleiman;Omid Abari", "authorids": "~Haofan_Lu2;~Christopher_Vattheuer1;~Baharan_Mirzasoleiman1;omid@cs.ucla.edu", "gender": "M;M;F;", "homepage": "https://luhaofan.github.io/;;http://web.cs.ucla.edu/~baharan/;", "dblp": ";;52/10075;", "google_scholar": "6zppXJ8AAAAJ;PfSS4p8AAAAJ;x63j7HEAAAAJ;", "orcid": ";;;", "linkedin": ";christopher-vattheuer-22aab6b7/;;", "or_profile": "~Haofan_Lu2;~Christopher_Vattheuer1;~Baharan_Mirzasoleiman1;omid@cs.ucla.edu", "aff": "University of California, Los Angeles;UCLA Computer Science Department, University of California, Los Angeles;University of California, Los Angeles;", "aff_domain": "ucla.edu;cs.ucla.edu;ucla.edu;", "position": "PhD student;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nlu2024newrf,\ntitle={Ne{WRF}: A Deep Learning Framework for Wireless Radiation Field Reconstruction and Channel Prediction},\nauthor={Haofan Lu and Christopher Vattheuer and Baharan Mirzasoleiman and Omid Abari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=181hXof7ho}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5987253, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8303196056236593482&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "ucla.edu;cs.ucla.edu;ucla.edu;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On the Nonlinearity of Layer Normalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35163", "id": "18f6iPn0zq", 
"proceeding": "https://proceedings.mlr.press/v235/ni24b.html", "pdf": "https://openreview.net/pdf?id=18f6iPn0zq", "openreview": "https://openreview.net/forum?id=18f6iPn0zq", "author_site": "Yunhao Ni, Yuxin Guo, Junlong Jia, Lei Huang", "tldr": "", "abstract": "Layer normalization (LN) is a ubiquitous technique in deep learning but our theoretical understanding to it remains elusive. This paper investigates a new theoretical direction for LN, regarding to its nonlinearity and representation capacity. We investigate the representation capacity of a network with layerwise composition of linear and LN transformations, referred to as LN-Net. We theoretically show that, given $m$ samples with any label assignment, an LN-Net with only 3 neurons in each layer and $O(m)$ LN layers can correctly classify them. We further show the lower bound of the VC dimension of an LN-Net. The nonlinearity of LN can be amplified by group partition, which is also theoretically demonstrated with mild assumption and empirically supported by our experiments. Based on our analyses, we consider to design neural architecture by exploiting and amplifying the nonlinearity of LN, and the effectiveness is supported by our experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunhao Ni;Yuxin Guo;Junlong Jia;Lei Huang", "authorids": "~Yunhao_Ni1;~Yuxin_Guo5;~Junlong_Jia1;~Lei_Huang1", "gender": ";F;M;M", "homepage": "https://github.com/Musicath;https://github.com/Whsjrczr;https://github.com/jiajunlong;https://huangleibuaa.github.io/", "dblp": ";;371/2687;18/1763-15", "google_scholar": ";;c9vYGLgAAAAJ;https://scholar.google.com.hk/citations?user=yTshbKkAAAAJ", "orcid": "0009-0000-7637-1447;;;", "linkedin": ";;;", "or_profile": "~Yunhao_Ni1;~Yuxin_Guo5;~Junlong_Jia1;~Lei_Huang1", "aff": "Beihang University;Beihang University;Beihang University;Beihang University", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "position": "MS student;Undergrad student;MS student;Associate Professor", "bibtex": "@inproceedings{\nni2024on,\ntitle={On the Nonlinearity of Layer Normalization},\nauthor={Yunhao Ni and Yuxin Guo and Junlong Jia and Lei Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=18f6iPn0zq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1111450, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12067694821478319043&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Finite-Time Convergence and Sample Complexity of Actor-Critic Multi-Objective Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35162", "id": "18rzx2PXKm", "proceeding": "https://proceedings.mlr.press/v235/zhou24h.html", "pdf": "https://openreview.net/pdf?id=18rzx2PXKm", "openreview": "https://openreview.net/forum?id=18rzx2PXKm", "author_site": "Tianchen Zhou, Hairi, Haibo Yang, Jia (Kevin) Liu, Tian Tong, Fan Yang, Michinari Momma, Yan Gao", "tldr": "", "abstract": "Reinforcement 
learning with multiple, potentially conflicting objectives is pervasive in real-world applications, while this problem remains theoretically under-explored. This paper tackles the multi-objective reinforcement learning (MORL) problem and introduces an innovative actor-critic algorithm named MOAC which finds a policy by iteratively making trade-offs among conflicting reward signals. Notably, we provide the first analysis of finite-time Pareto-stationary convergence and corresponding sample complexity in both discounted and average reward settings. Our approach has two salient features: (a) MOAC mitigates the cumulative estimation bias resulting from finding an optimal common gradient descent direction out of stochastic samples. This enables provable convergence rate and sample complexity guarantees independent of the number of objectives; (b) With proper momentum coefficient, MOAC initializes the weights of individual policy gradients using samples from the environment, instead of manual initialization. This enhances the practicality and robustness of our algorithm. Finally, experiments conducted on a real-world dataset validate the effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianchen Zhou;FNU Hairi;Haibo Yang;Jia Liu;Tian Tong;Fan Yang;Michinari Momma;Yan Gao", "authorids": "~Tianchen_Zhou1;~FNU_Hairi1;~Haibo_Yang1;~Jia_Liu1;~Tian_Tong1;~Fan_Yang36;~Michinari_Momma2;~Yan_Gao8", "gender": "F;;M;M;Not Specified;M;;F", "homepage": ";;https://haibo-yang-osu.github.io/homepage/;https://kevinliu-osu.github.io/index.html;;;;", "dblp": "293/7470;;43/7829-1;;;29/3081-84.html;34/6761.html;46/3479-3.html", "google_scholar": "oGYO-a0AAAAJ;;eyy22VoAAAAJ;Ofx3dScAAAAJ;Qg7x7M8AAAAJ;2MYn3NIAAAAJ;UDoWGDUAAAAJ;", "orcid": ";0000-0001-7457-9893;0000-0002-3245-2728;;0009-0008-3816-8235;0000-0002-0940-4218;;0000-0002-8012-1392", "linkedin": "tianchen-zhou-6582b510b/;;;;;fan-yang-882b368b/;;yan-gao-16a477b/", "or_profile": "~Tianchen_Zhou1;~FNU_Hairi1;~Haibo_Yang1;~Jia_Liu1;~Tian_Tong1;~Fan_Yang36;~Michinari_Momma2;~Yan_Gao8", "aff": "Amazon;University of Wisconsin - Whitewater;Rochester Institute of Technology;The Ohio State University;Amazon;Amazon;Amazon;Amazon", "aff_domain": "amazon.com;uww.edu;rit.edu;osu.edu;amazon.com;amazon.com;amazon.com;amazon.com", "position": "Applied Scientist;Assistant Professor;Assistant Professor;Assistant Professor;Applied Scientist;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nzhou2024finitetime,\ntitle={Finite-Time Convergence and Sample Complexity of Actor-Critic Multi-Objective Reinforcement Learning},\nauthor={Tianchen Zhou and FNU Hairi and Haibo Yang and Jia Liu and Tian Tong and Fan Yang and Michinari Momma and Yan Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=18rzx2PXKm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 973814, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14859739668721573034&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "amazon.com;uww.edu;rit.edu;osu.edu;amazon.com;amazon.com;amazon.com;amazon.com", "author_num": 8, "aff_unique_index": "0;1;2;3;0;0;0;0", "aff_unique_norm": "Amazon;University of Wisconsin-Whitewater;Rochester Institute of Technology;Ohio State University", "aff_unique_dep": "Amazon.com, Inc.;;;", "aff_unique_url": 
"https://www.amazon.com;https://www.uww.edu;https://www.rit.edu;https://www.osu.edu", "aff_unique_abbr": "Amazon;UW-Whitewater;RIT;OSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Whitewater", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Gradual Divergence for Seamless Adaptation: A Novel Domain Incremental Learning Method", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35161", "id": "1AAlMSo7Js", "proceeding": "https://proceedings.mlr.press/v235/jeeveswaran24a.html", "pdf": "https://openreview.net/pdf?id=1AAlMSo7Js", "openreview": "https://openreview.net/forum?id=1AAlMSo7Js", "author_site": "Jeeveswaran Kishaan, Elahe Arani, Bahram Zonooz", "tldr": "", "abstract": "Domain incremental learning (DIL) poses a significant challenge in real-world scenarios, as models need to be sequentially trained on diverse domains over time, all the while avoiding catastrophic forgetting. Mitigating representation drift, which refers to the phenomenon of learned representations undergoing changes as the model adapts to new tasks, can help alleviate catastrophic forgetting. In this study, we propose a novel DIL method named *DARE*, featuring a three-stage training process: Divergence, Adaptation, and REfinement. This process gradually adapts the representations associated with new tasks into the feature space spanned by samples from previous tasks, simultaneously integrating task-specific decision boundaries. Additionally, we introduce a novel strategy for buffer sampling and demonstrate the effectiveness of our proposed method, combined with this sampling strategy, in reducing representation drift within the feature encoder. This contribution effectively alleviates catastrophic forgetting across multiple DIL benchmarks. 
Furthermore, our approach prevents sudden representation drift at task boundaries, resulting in a well-calibrated DIL model that maintains the performance on previous tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kishaan Jeeveswaran;Elahe Arani;Bahram Zonooz", "authorids": "~Kishaan_Jeeveswaran1;~Elahe_Arani1;~Bahram_Zonooz1", "gender": "M;F;M", "homepage": ";https://sites.google.com/view/elahe-arani;https://sites.google.com/view/bahramzonooz", "dblp": ";;250/9573", "google_scholar": "https://scholar.google.com/citations?hl=en;e_I_v6cAAAAJ;", "orcid": ";0000-0002-0952-7007;", "linkedin": ";elahe-arani-630870b2/;", "or_profile": "~Kishaan_Jeeveswaran1;~Elahe_Arani1;~Bahram_Zonooz1", "aff": ";Wayve Technologies Ltd;Eindhoven University of Technology", "aff_domain": ";wayve.ai;tue.nl", "position": ";Head of AI Research;Assistant Professor", "bibtex": "@inproceedings{\njeeveswaran2024gradual,\ntitle={Gradual Divergence for Seamless Adaptation: A Novel Domain Incremental Learning Method},\nauthor={Kishaan Jeeveswaran and Elahe Arani and Bahram Zonooz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1AAlMSo7Js}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1800283, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=130411859931369468&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": ";wayve.ai;tue.nl", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Wayve Technologies;Eindhoven University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.wayvetechnologies.com;https://www.tue.nl", "aff_unique_abbr": "Wayve;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Netherlands" }, { "title": "Decoupling Learning and Decision-Making: Breaking the $\\mathcal{O}(\\sqrt{T})$ Barrier in Online Resource Allocation with First-Order Methods", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35160", "id": "1DyruVvVaQ", "proceeding": "https://proceedings.mlr.press/v235/gao24n.html", "pdf": "https://openreview.net/pdf?id=1DyruVvVaQ", "openreview": "https://openreview.net/forum?id=1DyruVvVaQ", "author_site": "Wenzhi Gao, Chunlin Sun, Chenyu Xue, Yinyu Ye", "tldr": "", "abstract": "Online linear programming plays an important role in both revenue management and resource allocation, and recent research has focused on developing efficient first-order online learning algorithms. Despite the empirical success of first-order methods, they typically achieve regret no better than $\\mathcal{O}(\\sqrt{T})$, which is suboptimal compared to the $\\mathcal{O}(\\log T)$ result guaranteed by the state-of-the-art linear programming (LP)-based online algorithms. This paper establishes several important facts about online linear programming, which unveils the challenge for first-order online algorithms to achieve beyond $\\mathcal{O}(\\sqrt{T})$ regret. To address this challenge, we introduce a new algorithmic framework which decouples learning from decision-making. 
For the first time, we show that first-order methods can achieve regret $\\mathcal{O}(T^{1/3})$ with this new framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenzhi Gao;Chunlin Sun;Chenyu Xue;Yinyu Ye", "authorids": "~Wenzhi_Gao1;~Chunlin_Sun1;~Chenyu_Xue1;~Yinyu_Ye1", "gender": "M;M;M;M", "homepage": "https://github.com/Gwzwpxz;https://chunlinsun.github.io/;https://sites.google.com/view/chenyuxue/home;https://web.stanford.edu/~yyye/", "dblp": ";260/0567;240/6330;42/1372", "google_scholar": "4lDkX_YAAAAJ;2MMNRmoAAAAJ;;BgOXDogAAAAJ", "orcid": ";;;", "linkedin": ";chunlin-sun-ab8334139/;;", "or_profile": "~Wenzhi_Gao1;~Chunlin_Sun1;~Chenyu_Xue1;~Yinyu_Ye1", "aff": "Stanford University;Stanford University;National University of Singapore;", "aff_domain": "stanford.edu;stanford.edu;nus.edu;", "position": "PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\ngao2024decoupling,\ntitle={Decoupling Learning and Decision-Making: Breaking the \\${\\textbackslash}mathcal\\{O\\}({\\textbackslash}sqrt\\{T\\})\\$ Barrier in Online Resource Allocation with First-Order Methods},\nauthor={Wenzhi Gao and Chunlin Sun and Chenyu Xue and Yinyu Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1DyruVvVaQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3347693, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17153591104578452698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "email": "stanford.edu;stanford.edu;nus.edu;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Stanford University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.nus.edu.sg", "aff_unique_abbr": "Stanford;NUS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Singapore" }, { "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35159", "id": "1Fs1LvjYQW", "proceeding": "https://proceedings.mlr.press/v235/huang24y.html", "pdf": "https://openreview.net/pdf?id=1Fs1LvjYQW", "openreview": "https://openreview.net/forum?id=1Fs1LvjYQW", "author_site": "Qian Huang, Jian Vora, Percy Liang, Jure Leskovec", "tldr": "", "abstract": "A central aspect of machine learning research is experimentation, the process of designing and running experiments, analyzing the results, and iterating towards some positive outcome (e.g., improving accuracy). Could agents driven by powerful language models perform machine learning experimentation effectively? To answer this question, we introduce MLAgentBench, a suite of 13 tasks ranging from improving model performance on CIFAR-10 to recent research problems like BabyLM. For each task, an agent can perform actions like reading/writing files, executing code, and inspecting outputs. We then construct an agent that can perform ML experimentation based on the ReAct framework. We benchmark agents based on Claude v1.0, Claude v2.1, Claude v3 Opus, GPT-4, GPT-4-turbo, Gemini-Pro, and Mixtral, and find that a Claude v3 Opus agent is the best in terms of success rate. It can build compelling ML models over many tasks in MLAgentBench with a 37.5% average success rate. 
Our agents also display highly interpretable plans and actions. However, the success rates vary considerably; they span from 100% on well-established older datasets to as low as 0% on recent Kaggle challenges potentially created after the underlying LM was trained. Finally, we identify several key challenges for LM-based agents, such as long-term planning and reducing hallucination.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qian Huang;Jian Vora;Percy Liang;Jure Leskovec", "authorids": "~Qian_Huang2;~Jian_Vora1;~Percy_Liang1;~Jure_Leskovec1", "gender": "F;M;;", "homepage": "https://q-hwang.github.io/;https://jianvora.github.io;https://cs.stanford.edu/~pliang/;http://cs.stanford.edu/~jure/", "dblp": "07/4378.html;;04/1701;l/JureLeskovec", "google_scholar": "L3hkmG0AAAAJ;D1Jl19oAAAAJ;pouyVyUAAAAJ;Q_kKkIUAAAAJ", "orcid": ";;;0000-0002-5411-923X", "linkedin": "qian-huang-b20315149/;;;leskovec/", "or_profile": "~Qian_Huang2;~Jian_Vora1;~Percy_Liang1;~Jure_Leskovec1", "aff": "Stanford University;Computer Science Department, Stanford University;Stanford University;Kumo.AI", "aff_domain": "stanford.edu;cs.stanford.edu;stanford.edu;kumo.ai", "position": "PhD student;MS student;Associate Professor;Chief Scientist", "bibtex": "@inproceedings{\nhuang2024mlagentbench,\ntitle={{MLA}gentBench: Evaluating Language Agents on Machine Learning Experimentation},\nauthor={Qian Huang and Jian Vora and Percy Liang and Jure Leskovec},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1Fs1LvjYQW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 881889, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3339231665865796200&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": "stanford.edu;cs.stanford.edu;stanford.edu;kumo.ai", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Stanford University;Kumo.AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.kumo.ai", "aff_unique_abbr": "Stanford;Kumo.AI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Compression of Structured Data with Autoencoders: Provable Benefit of Nonlinearities and Depth", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35158", "id": "1HDrfUahXv", "proceeding": "https://proceedings.mlr.press/v235/kogler24a.html", "pdf": "https://openreview.net/pdf?id=1HDrfUahXv", "openreview": "https://openreview.net/forum?id=1HDrfUahXv", "author_site": "Kevin K\u00f6gler, Aleksandr Shevchenko, Hamed Hassani, Marco Mondelli", "tldr": "", "abstract": "Autoencoders are a prominent model in many empirical branches of machine learning and lossy data compression. However, basic theoretical questions remain unanswered even in a shallow two-layer setting. In particular, to what degree does a shallow autoencoder capture the structure of the underlying data distribution? For the prototypical case of the 1-bit compression of *sparse* Gaussian data, we prove that gradient descent converges to a solution that completely disregards the sparse structure of the input. Namely, the performance of the algorithm is the same as if it were compressing a Gaussian source -- with no sparsity. 
For general data distributions, we give evidence of a phase transition phenomenon in the shape of the gradient descent minimizer, as a function of the data sparsity: below the critical sparsity level, the minimizer is a rotation taken uniformly at random (just like in the compression of non-sparse data); above the critical sparsity, the minimizer is the identity (up to a permutation). Finally, by exploiting a connection with approximate message passing algorithms, we show how to improve upon Gaussian performance for the compression of sparse data: adding a denoising function to a shallow architecture already reduces the loss provably, and a suitable multi-layer decoder leads to a further improvement. We validate our findings on image datasets, such as CIFAR-10 and MNIST.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kevin K\u00f6gler;Aleksandr Shevchenko;Hamed Hassani;Marco Mondelli", "authorids": "~Kevin_K\u00f6gler1;~Aleksandr_Shevchenko1;~Hamed_Hassani2;~Marco_Mondelli1", "gender": "M;;M;M", "homepage": ";;https://www.seas.upenn.edu/~hassani/;http://marcomondelli.com", "dblp": "308/7022;;73/4984;120/7089", "google_scholar": "K8ooQfwAAAAJ;V7N_EpoAAAAJ;;BHdSb5AAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kevin_K\u00f6gler1;~Aleksandr_Shevchenko1;~Hamed_Hassani2;~Marco_Mondelli1", "aff": "Institute of Science and Technology Austria;Institute of Science and Technology;University of Pennsylvania;Institute of Science and Technology", "aff_domain": "ist.ac.at;ist.ac.at;upenn.edu;ist.ac.at", "position": "PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nk{\\\"o}gler2024compression,\ntitle={Compression of Structured Data with Autoencoders: Provable Benefit of Nonlinearities and Depth},\nauthor={Kevin K{\\\"o}gler and Aleksandr Shevchenko and Hamed Hassani and Marco Mondelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1HDrfUahXv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2861885, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=876787362861653997&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "ist.ac.at;ist.ac.at;upenn.edu;ist.ac.at", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Institute of Science and Technology Austria;Institute of Science and Technology;University of Pennsylvania", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ist.ac.at;;https://www.upenn.edu", "aff_unique_abbr": "IST Austria;;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2", "aff_country_unique": "Austria;;United States" }, { "title": "INViT: A Generalizable Routing Problem Solver with Invariant Nested View Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35157", "id": "1IZLOPxtfK", "proceeding": "https://proceedings.mlr.press/v235/fang24c.html", "pdf": "https://openreview.net/pdf?id=1IZLOPxtfK", "openreview": "https://openreview.net/forum?id=1IZLOPxtfK", "author_site": "Han Fang, Zhihao Song, Paul Weng, Yutong Ban", "tldr": "", "abstract": "Recently, deep reinforcement learning has shown promising results for learning fast heuristics to solve routing problems. Meanwhile, most of the solvers suffer from generalizing to an unseen distribution or distributions with different scales. 
To address this issue, we propose a novel architecture, called Invariant Nested View Transformer (INViT), which is designed to enforce a nested design together with invariant views inside the encoders to promote the generalizability of the learned solver. It applies a modified policy gradient algorithm enhanced with data augmentations. We demonstrate that the proposed INViT achieves a dominant generalization performance on both TSP and CVRP problems with various distributions and different problem scales. Our source code and datasets are available in supplementary materials.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Fang;Zhihao Song;Paul Weng;Yutong Ban", "authorids": "~Han_Fang7;~Zhihao_Song1;~Paul_Weng1;~Yutong_Ban1", "gender": "M;M;M;M", "homepage": "https://github.com/Kasumigaoka-Utaha;https://github.com/howord-texin;http://weng.fr;https://team.inria.fr/perception/alumni/yutong-ban/", "dblp": ";;http://dblp.uni-trier.de/pers/hd/w/Weng:Paul;188/7582", "google_scholar": ";;_Hd6AeQAAAAJ;4EXokwkAAAAJ", "orcid": ";;;", "linkedin": ";;paul-weng-69a15980/;", "or_profile": "~Han_Fang7;~Zhihao_Song1;~Paul_Weng1;~Yutong_Ban1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Duke Kunshan University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;dukekunshan.edu.cn;sjtu.edu", "position": "MS student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nfang2024invit,\ntitle={{INV}iT: A Generalizable Routing Problem Solver with Invariant Nested View Transformer},\nauthor={Han Fang and Zhihao Song and Paul Weng and Yutong Ban},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1IZLOPxtfK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1864516, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5199660742388753330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;dukekunshan.edu.cn;sjtu.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Duke Kunshan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.dukekunshan.edu.cn", "aff_unique_abbr": "SJTU;DKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Kunshan", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Inferring Change Points in High-Dimensional Linear Regression via Approximate Message Passing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35156", "id": "1JgCpZS17T", "proceeding": "https://proceedings.mlr.press/v235/arpino24a.html", "pdf": "https://openreview.net/pdf?id=1JgCpZS17T", "openreview": "https://openreview.net/forum?id=1JgCpZS17T", "author_site": "Gabriel Arpino, Xiaoqi Liu, Ramji Venkataramanan", "tldr": "", "abstract": "We consider the problem of localizing change points in high-dimensional linear regression. We propose an Approximate Message Passing (AMP) algorithm for estimating both the signals and the change point locations. Assuming Gaussian covariates, we give an exact asymptotic characterization of its estimation performance in the limit where the number of samples grows proportionally to the signal dimension. Our algorithm can be tailored to exploit any prior information on the signal, noise, and change points. 
It also enables uncertainty quantification in the form of an efficiently computable approximate posterior distribution, whose asymptotic form we characterize exactly. We validate our theory via numerical experiments, and demonstrate the favorable performance of our estimators on both synthetic data and images.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gabriel Arpino;Xiaoqi Liu;Ramji Venkataramanan", "authorids": "~Gabriel_Arpino1;~Xiaoqi_Liu1;rv285@cam.ac.uk", "gender": ";F;", "homepage": ";https://shirleyliuxq.github.io/;", "dblp": ";;", "google_scholar": ";OFtk8oAAAAAJ;", "orcid": ";0000-0001-9451-8684;", "linkedin": ";;", "or_profile": "~Gabriel_Arpino1;~Xiaoqi_Liu1;rv285@cam.ac.uk", "aff": ";University of Cambridge;", "aff_domain": ";cam.ac.uk;", "position": ";PhD student;", "bibtex": "@inproceedings{\narpino2024inferring,\ntitle={Inferring Change Points in High-Dimensional Linear Regression via Approximate Message Passing},\nauthor={Gabriel Arpino and Xiaoqi Liu and Ramji Venkataramanan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1JgCpZS17T}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1013015, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7841792540708806017&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 7, "email": ";cam.ac.uk;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "A Circuit Domain Generalization Framework for Efficient Logic Synthesis in Chip Design", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35155", "id": "1KemC8DNa0", "proceeding": "https://proceedings.mlr.press/v235/wang24g.html", "pdf": "https://openreview.net/pdf?id=1KemC8DNa0", "openreview": "https://openreview.net/forum?id=1KemC8DNa0", "author_site": "Zhihai Wang, Lei Chen, Jie Wang, \u767d \u5bc5\u5c90, Xing Li, Xijun Li, Mingxuan Yuan, Jianye Hao, Yongdong Zhang, Feng Wu", "tldr": "", "abstract": "Logic Synthesis (LS) plays a vital role in chip design. A key task in LS is to simplify circuits---modeled by directed acyclic graphs (DAGs)---with functionality-equivalent transformations. To tackle this task, many LS heuristics apply transformations to subgraphs---rooted at each node on an input DAG---sequentially. However, we found that a large number of transformations are ineffective, which makes applying these heuristics highly time-consuming. In particular, we notice that the runtime of the Resub and Mfs2 heuristics often dominates the overall runtime of LS optimization processes. To address this challenge, we propose a novel data-driven LS heuristic paradigm, namely PruneX, to reduce ineffective transformations. The major challenge of developing PruneX is to learn models that well generalize to unseen circuits, i.e., the out-of-distribution (OOD) generalization problem. Thus, the major technical contribution of PruneX is the novel circuit domain generalization framework, which learns domain-invariant representations based on the transformation-invariant domain-knowledge. 
To the best of our knowledge, PruneX is the first approach to tackle the OOD problem in LS heuristics. We integrate PruneX with the aforementioned Resub and Mfs2 heuristics. Experiments demonstrate that PruneX significantly improves their efficiency while keeping comparable optimization performance on industrial and very large-scale circuits, achieving up to $3.1\\times$ faster runtime.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhihai Wang;Lei Chen;Jie Wang;Yinqi Bai;Xing Li;Xijun Li;Mingxuan Yuan;Jianye HAO;Yongdong Zhang;Feng Wu", "authorids": "~Zhihai_Wang1;lc.leichen@huawei.com;~Jie_Wang1;byq000324@mail.ustc.edu.cn;li.xing2@huawei.com;~Xijun_Li1;~Mingxuan_Yuan1;~Jianye_HAO1;~Yongdong_Zhang2;~Feng_Wu1", "gender": "M;;M;;;M;M;M;M;M", "homepage": "https://miralab.ai/people/zhihai-wang/;;http://staff.ustc.edu.cn/~jwangx;;;https://xijunlee.github.io/;;http://www.icdai.org/jianye.html;https://imcc.ustc.edu.cn/_upload/tpl/0d/13/3347/template3347/zhangyongdong.html;", "dblp": "35/4357;;29/5259-5;;;203/0784;74/2356;21/7664.html;z/YongdongZhang;25/3972-1", "google_scholar": "EdLIBG8AAAAJ;;OugG4dUAAAAJ;;;QXU_QbMAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?user=hxGs4ukAAAAJ;5bInRDEAAAAJ", "orcid": ";;;;;0000-0002-9013-1180;0000-0002-2236-8784;0000-0002-0422-8235;0000-0003-0066-3448;", "linkedin": ";;;;;;;;;", "or_profile": "~Zhihai_Wang1;lc.leichen@huawei.com;~Jie_Wang1;byq000324@mail.ustc.edu.cn;li.xing2@huawei.com;~Xijun_Li1;~Mingxuan_Yuan1;~Jianye_HAO1;~Yongdong_Zhang2;~Feng_Wu1", "aff": "University of Science and Technology of China;;University of Science and Technology of China;;;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Tianjin University;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;;ustc.edu.cn;;;huawei.com;huawei.com;tju.edu.cn;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;;Full Professor;;;Researcher;Researcher;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024a,\ntitle={A Circuit Domain Generalization Framework for Efficient Logic Synthesis in Chip Design},\nauthor={Zhihai Wang and Lei Chen and Jie Wang and Yinqi Bai and Xing Li and Xijun Li and Mingxuan Yuan and Jianye HAO and Yongdong Zhang and Feng Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1KemC8DNa0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1687722, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6002505327144737532&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ustc.edu.cn;;ustc.edu.cn;;;huawei.com;huawei.com;tju.edu.cn;ustc.edu.cn;ustc.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;1;2;0;0", "aff_unique_norm": "University of Science and Technology of China;Huawei;Tianjin University", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "USTC;Huawei;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "PID: Prompt-Independent Data Protection Against Latent Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35154", "id": "1N7pjXKkx8", "proceeding": 
"https://proceedings.mlr.press/v235/li24ay.html", "pdf": "https://openreview.net/pdf?id=1N7pjXKkx8", "openreview": "https://openreview.net/forum?id=1N7pjXKkx8", "author_site": "Ang Li, Yichuan Mo, Mingjie Li, Yisen Wang", "tldr": "", "abstract": "The few-shot fine-tuning of Latent Diffusion Models (LDMs) has enabled them to grasp new concepts from a limited number of images. However, given the vast amount of personal images accessible online, this capability raises critical concerns about civil privacy. While several previous defense methods have been developed to prevent such misuse of LDMs, they typically assume that the textual prompts used by data protectors exactly match those employed by data exploiters. In this paper, we first empirically demonstrate that breaking this assumption, i.e., in cases where discrepancies exist between the textual conditions used by protectors and exploiters, could substantially reduces the effectiveness of these defenses. Furthermore, considering the visual encoder's independence from textual prompts, we delve into the visual encoder and thoroughly investigate how manipulating the visual encoder affects the few-shot fine-tuning process of LDMs. Drawing on these insights, we propose a simple yet effective method called Prompt-Independent Defense (PID) to safeguard privacy against LDMs. We show that PID can act as a strong privacy shield on its own while requiring significantly less computational power. We believe our studies, along with the comprehensive understanding and new defense method, provide a notable advance toward reliable data protection against LDMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ang Li;Yichuan Mo;Mingjie Li;Yisen Wang", "authorids": "~Ang_Li20;~Yichuan_Mo1;~Mingjie_Li1;~Yisen_Wang1", "gender": "M;M;M;M", "homepage": "https://github.com/Charles20021201;https://www.linkedin.com/in/%E6%98%93%E5%B7%9D-%E8%8E%AB-446841212/;https://mingjieli0111.github.io/;https://yisenwang.github.io/", "dblp": ";321/6790;;172/1346-1", "google_scholar": ";xvSYG1gAAAAJ;;uMWPDboAAAAJ", "orcid": ";;0000-0002-1588-2654;", "linkedin": ";;;", "or_profile": "~Ang_Li20;~Yichuan_Mo1;~Mingjie_Li1;~Yisen_Wang1", "aff": "Peking University;Peking University;CISPA Helmholtz Center for Information Security;Peking University", "aff_domain": "pku.edu.cn;stu.pku.edu.cn;cispa.de;pku.edu.cn", "position": "Undergrad student;PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nli2024pid,\ntitle={{PID}: Prompt-Independent Data Protection Against Latent Diffusion Models},\nauthor={Ang Li and Yichuan Mo and Mingjie Li and Yisen Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1N7pjXKkx8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13585677613216683671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;stu.pku.edu.cn;cispa.de;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Peking University;CISPA Helmholtz Center for Information Security", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.cispa.de/", "aff_unique_abbr": "Peking U;CISPA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;Germany" }, { "title": "EAGLE: 
Speculative Sampling Requires Rethinking Feature Uncertainty", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35153", "id": "1NdN7eXyb4", "proceeding": "https://proceedings.mlr.press/v235/li24bt.html", "pdf": "https://openreview.net/pdf?id=1NdN7eXyb4", "openreview": "https://openreview.net/forum?id=1NdN7eXyb4", "author_site": "Yuhui Li, Fangyun Wei, Chao Zhang, Hongyang Zhang", "tldr": "", "abstract": "Autoregressive decoding makes the inference of Large Language Models (LLMs) time-consuming. In this paper, we reconsider speculative sampling and derive two key observations. Firstly, autoregression at the feature (second-to-top-layer) level is more straightforward than at the token level. Secondly, the inherent uncertainty in feature (second-to-top-layer) level autoregression constrains its performance. Based on these insights, we introduce EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency), a simple yet highly efficient speculative sampling framework. By incorporating a token sequence advanced by one time step, EAGLE effectively resolves the uncertainty, enabling precise second-to-top-layer feature prediction with minimal overhead. We conducted comprehensive evaluations of EAGLE, including all models from the Vicuna and LLaMA2-Chat series, the MoE model Mixtral 8x7B Instruct, and tasks in dialogue, code generation, mathematical reasoning, and instruction following. For LLaMA2-Chat 70B, EAGLE achieved a latency speedup ratio of **2.7x-3.5x** and doubled throughput, while maintaining the distribution of the generated text.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhui Li;Fangyun Wei;Chao Zhang;Hongyang Zhang", "authorids": "~Yuhui_Li1;~Fangyun_Wei1;~Chao_Zhang10;~Hongyang_Zhang1", "gender": "M;M;M;M", "homepage": ";;http://www.cis.pku.edu.cn/faculty/vision/zhangchao/zhangchao.htm;https://hongyanz.github.io/", "dblp": ";161/2636;94/3019-1;23/10537-1", "google_scholar": ";-ncz2s8AAAAJ;NeCCx-kAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAAD4l0ZYBzPPIPhZpDeJ7Zfc_v082CPYOoWc;;;", "or_profile": "~Yuhui_Li1;~Fangyun_Wei1;~Chao_Zhang10;~Hongyang_Zhang1", "aff": "Peking University;Microsoft Research;Peking University;School of Computer Science, University of Waterloo", "aff_domain": "pku.edu.cn;microsoft.com;pku.edu.cn;uwaterloo.ca", "position": "MS student;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024eagle,\ntitle={{EAGLE}: Speculative Sampling Requires Rethinking Feature Uncertainty},\nauthor={Yuhui Li and Fangyun Wei and Chao Zhang and Hongyang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1NdN7eXyb4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1478479, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15027653250903855773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;microsoft.com;pku.edu.cn;uwaterloo.ca", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Peking University;Microsoft;University of Waterloo", "aff_unique_dep": ";Microsoft Research;School of Computer Science", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research;https://uwaterloo.ca", "aff_unique_abbr": "Peking U;MSR;UWaterloo", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Waterloo", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "China;United States;Canada" }, { "title": "Promoting External and Internal Equities Under Ex-Ante/Ex-Post Metrics in Online Resource Allocation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35152", "id": "1OsRSrkFWl", "proceeding": "https://proceedings.mlr.press/v235/sankararaman24a.html", "pdf": "https://openreview.net/pdf?id=1OsRSrkFWl", "openreview": "https://openreview.net/forum?id=1OsRSrkFWl", "author_site": "Karthik Abinav Sankararaman, Aravind Srinivasan, Pan Xu", "tldr": "", "abstract": "This paper proposes two different models for equitable resource allocation in online settings. The first one is called *external* equity promotion, where sequentially arriving agents are heterogeneous in their external attributes, namely how many resources they demand, which are drawn from a probability distribution (accessible to the algorithm). The focus is then to devise an allocation policy such that every requester can get a fair share of resources *proportional to their demands*, regardless of their arrival time. The second is called *internal* equity promotion, where arriving requesters can be treated homogeneously in external attributes (demands) but are heterogeneous in internal traits such as demographics. In particular, each requester can be identified as belonging to one or several groups, and an allocation of resources is regarded as equitable when every group of requesters can receive a fair share of resources proportional to the percentage of that group in the whole population. For both models above, we consider as the benchmark a clairvoyant optimal solution that has the privilege to access all random demand realizations in advance. We consider two equity metrics, namely *ex-post* and *ex-ante*, and discuss the challenges under the two metrics in detail. Specifically, we present two linear program (LP)-based policies for external equity promotion under ex-ante with independent demands, each achieving an *optimal* CR of $1/2$ with respect to the benchmark LP. 
For internal equity promotion, we present optimal policies under both ex-ante and ex-post metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Karthik Abinav Sankararaman;Aravind Srinivasan;Pan Xu", "authorids": "~Karthik_Abinav_Sankararaman1;~Aravind_Srinivasan1;~Pan_Xu2", "gender": "M;M;Not Specified", "homepage": "http://karthikabinavs.xyz;https://www.cs.umd.edu/~srin/;https://sites.google.com/site/panxupi/", "dblp": "154/4666;s/AravindSrinivasan;11/9718-1.html", "google_scholar": "uJ-Dhj4AAAAJ;sPzla6IAAAAJ;", "orcid": ";;", "linkedin": ";aravind-srinivasan-8a572811/;", "or_profile": "~Karthik_Abinav_Sankararaman1;~Aravind_Srinivasan1;~Pan_Xu2", "aff": "Meta Facebook;University of Maryland, College Park;New Jersey Institute of Technology", "aff_domain": "fb.com;umd.edu;cs.njit.edu", "position": "Research Scientist;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nsankararaman2024promoting,\ntitle={Promoting External and Internal Equities Under Ex-Ante/Ex-Post Metrics in Online Resource Allocation},\nauthor={Karthik Abinav Sankararaman and Aravind Srinivasan and Pan Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1OsRSrkFWl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 427955, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4372109618213148834&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "fb.com;umd.edu;cs.njit.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Meta;University of Maryland;New Jersey Institute of Technology", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.umd.edu;https://www.njit.edu", "aff_unique_abbr": "Meta;UMD;NJIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Nonparametric Teaching of Implicit Neural Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35151", "id": "1PMkV6oKw3", "proceeding": "https://proceedings.mlr.press/v235/zhang24ap.html", "pdf": "https://openreview.net/pdf?id=1PMkV6oKw3", "openreview": "https://openreview.net/forum?id=1PMkV6oKw3", "author_site": "Chen Zhang, Steven T. S. Luo, Jason Chun Lok Li, Yik-Chung WU, Ngai Wong", "tldr": "", "abstract": "We investigate the learning of implicit neural representation (INR) using an overparameterized multilayer perceptron (MLP) via a novel nonparametric teaching perspective. The latter offers an efficient example selection framework for teaching nonparametrically defined (viz. non-closed-form) target functions, such as image functions defined by 2D grids of pixels. To address the costly training of INRs, we propose a paradigm called Implicit Neural Teaching (INT) that treats INR learning as a nonparametric teaching problem, where the given signal being fitted serves as the target function. The teacher then selects signal fragments for iterative training of the MLP to achieve fast convergence. By establishing a connection between MLP evolution through parameter-based gradient descent and that of function evolution through functional gradient descent in nonparametric teaching, we show *for the first time* that teaching an overparameterized MLP is consistent with teaching a nonparametric learner. 
This new discovery readily permits a convenient drop-in of nonparametric teaching algorithms to broadly enhance INR training efficiency, demonstrating 30%+ training time savings across various input modalities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen Zhang;Steven Tin Sui Luo;Jason Chun Lok Li;Yik Chung WU;Ngai Wong", "authorids": "~Chen_Zhang13;~Steven_Tin_Sui_Luo1;~Jason_Chun_Lok_Li1;~Yik_Chung_WU1;~Ngai_Wong1", "gender": ";M;M;M;M", "homepage": ";https://stevolopolis.github.io/;;http://www.eee.hku.hk/~ycwu/;https://www.eee.hku.hk/~nwong/", "dblp": ";377/7023;;;88/3656", "google_scholar": ";https://scholar.google.ca/citations?user=2ASNx9AAAAAJ;Tcpdsh0AAAAJ;;PM_uMYIAAAAJ", "orcid": ";0009-0004-7907-7680;;;0000-0002-3026-0108", "linkedin": ";steven-luo-uoft/;jason-chun-lok-li-0590b3166;;", "or_profile": "~Chen_Zhang13;~Steven_Tin_Sui_Luo1;~Jason_Chun_Lok_Li1;~Yik_Chung_WU1;~Ngai_Wong1", "aff": ";University of Hong Kong;University of Hong Kong;University of Hong Kong;The University of Hong Kong", "aff_domain": ";eee.hku.hk;eee.hku.hk;hku.hk;hku.hk", "position": ";Intern;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2024nonparametric,\ntitle={Nonparametric Teaching of Implicit Neural Representations},\nauthor={Chen Zhang and Steven Tin Sui Luo and Jason Chun Lok Li and Yik Chung WU and Ngai Wong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1PMkV6oKw3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7598628, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8463314924097163505&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";eee.hku.hk;eee.hku.hk;hku.hk;hku.hk", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Privacy Preserving Adaptive Experiment Design", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35150", "id": "1QmFKwVwwI", "proceeding": "https://proceedings.mlr.press/v235/li24bg.html", "pdf": "https://openreview.net/pdf?id=1QmFKwVwwI", "openreview": "https://openreview.net/forum?id=1QmFKwVwwI", "author_site": "Jiachun Li, Kaining Shi, David Simchi-Levi", "tldr": "", "abstract": "Adaptive experiments are widely adopted to estimate the conditional average treatment effect (CATE) in clinical trials and many other scenarios. While the primary goal of the experiment is to maximize estimation accuracy, due to the imperative of social welfare, it's also crucial to provide treatment with superior outcomes to patients, which is measured by regret in the contextual bandit framework. Furthermore, privacy concerns arise in clinical scenarios containing sensitive data like patients' health records. Therefore, it's essential for the treatment allocation mechanism to incorporate robust privacy protection measures. In this paper, we investigate the tradeoff between loss of social welfare and statistical power of CATE estimation in a contextual bandit experiment. 
We propose matching upper and lower bounds for the multi-objective optimization problem, and then adopt the concept of Pareto optimality to mathematically characterize the optimality condition. Furthermore, we propose differentially private algorithms which still match the lower bound, showing that privacy is \"almost free\". Additionally, we derive the asymptotic normality of the estimator, which is essential in statistical inference and hypothesis testing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiachun Li;Kaining Shi;David Simchi-Levi", "authorids": "~Jiachun_Li1;~Kaining_Shi1;~David_Simchi-Levi2", "gender": "M;M;M", "homepage": ";https://www.linkedin.com/in/%E6%81%BA%E5%AE%81-%E7%9F%B3-2580412b2/?trk=public-profile-join-page;http://slevi1.mit.edu/", "dblp": ";;", "google_scholar": ";;https://scholar.google.co.uk/citations?hl=en", "orcid": ";;", "linkedin": "jiachun-li-87076320a/;;", "or_profile": "~Jiachun_Li1;~Kaining_Shi1;~David_Simchi-Levi2", "aff": "Massachusetts Institute of Technology;Tsinghua University;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mails.tsinghua.edu.cn;mit.edu", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nli2024privacy,\ntitle={Privacy Preserving Adaptive Experiment Design},\nauthor={Jiachun Li and Kaining Shi and David Simchi-Levi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1QmFKwVwwI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 542064, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11050000887895991816&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "mit.edu;mails.tsinghua.edu.cn;mit.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MIT;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "title": "Token-level Direct Preference Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35149", "id": "1RZKuvqYCR", "proceeding": "https://proceedings.mlr.press/v235/zeng24c.html", "pdf": "https://openreview.net/pdf?id=1RZKuvqYCR", "openreview": "https://openreview.net/forum?id=1RZKuvqYCR", "author_site": "Yongcheng Zeng, Guoqing Liu, Weiyu Ma, Ning Yang, Haifeng Zhang, Jun Wang", "tldr": "", "abstract": "Fine-tuning pre-trained Large Language Models (LLMs) is essential to align them with human values and intentions. This process often utilizes methods like pairwise comparisons and KL divergence against a reference LLM, focusing on the evaluation of full answers generated by the models. However, the generation of these responses occurs at the token level, following a sequential, auto-regressive fashion. In this paper, we introduce Token-level Direct Preference Optimization (TDPO), a novel approach to align LLMs with human preferences by optimizing policy at the token level. Unlike previous methods, which face challenges in divergence efficiency, TDPO integrates forward KL divergence constraints for each token, improving alignment and diversity. 
Utilizing the Bradley-Terry model for a token-based reward system, our method enhances the regulation of KL divergence, while preserving simplicity without the need for explicit reward modeling. Experimental results across various text tasks demonstrate TDPO\u2019s superior performance in balancing alignment with generation diversity. Notably, fine-tuning with TDPO strikes a better balance than DPO in the controlled sentiment generation and single-turn dialogue datasets, and significantly improves the quality of generated responses compared to both DPO and PPO-based RLHF methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongcheng Zeng;Guoqing Liu;Weiyu Ma;Ning Yang;Haifeng Zhang;Jun Wang", "authorids": "~Yongcheng_Zeng1;~Guoqing_Liu3;~Weiyu_Ma1;~Ning_Yang5;~Haifeng_Zhang3;~Jun_Wang2", "gender": "M;M;M;F;;M", "homepage": "http://marl.ia.ac.cn/people/zengyongcheng.html;https://www.microsoft.com/en-us/research/people/guoqingliu/;http://marl.ia.ac.cn/people/maweiyu.html;http://marl.ia.ac.cn/people/yangning.html;https://pkuzhf.github.io;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": ";;;67/1751-5;93/7133-2;w/JunWang12", "google_scholar": ";h-eHvyoAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Yongcheng_Zeng1;~Guoqing_Liu3;~Weiyu_Ma1;~Ning_Yang5;~Haifeng_Zhang3;~Jun_Wang2", "aff": "Institute of Automation, Chinese Academy of Sciences;Microsoft Research ;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;University College London", "aff_domain": "ia.ac.cn;microsoft.com;ia.ac.cn;ia.ac.cn;ia.ac.cn;ucl.ac.uk", "position": "PhD student;Researcher;MS student;Assistant Professor;Associate Professor;Professor", "bibtex": "@inproceedings{\nzeng2024tokenlevel,\ntitle={Token-level Direct Preference Optimization},\nauthor={Yongcheng Zeng and Guoqing Liu and Weiyu Ma and Ning Yang and Haifeng Zhang and Jun Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1RZKuvqYCR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 715396, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6994314563865245546&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "ia.ac.cn;microsoft.com;ia.ac.cn;ia.ac.cn;ia.ac.cn;ucl.ac.uk", "author_num": 6, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "Chinese Academy of Sciences;Microsoft;University College London", "aff_unique_dep": "Institute of Automation;Microsoft Research;", "aff_unique_url": "http://www.ia.cas.cn;https://www.microsoft.com/en-us/research;https://www.ucl.ac.uk", "aff_unique_abbr": "CAS;MSR;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;2", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "Defense against Backdoor Attack on Pre-trained Language Models via Head Pruning and Attention Normalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35148", "id": "1SiEfsCecd", "proceeding": "https://proceedings.mlr.press/v235/zhao24r.html", "pdf": "https://openreview.net/pdf?id=1SiEfsCecd", "openreview": "https://openreview.net/forum?id=1SiEfsCecd", "author_site": "Xingyi Zhao, Depeng Xu, 
Shuhan Yuan", "tldr": "", "abstract": "Pre-trained language models (PLMs) are commonly used for various downstream natural language processing tasks via fine-tuning. However, recent studies have demonstrated that PLMs are vulnerable to backdoor attacks, which can mislabel poisoned samples to target outputs even after a vanilla fine-tuning process. The key challenge for defending against the backdoored PLMs is that end users who adopt the PLMs for their downstream tasks usually do not have any knowledge about the attacking strategies, such as triggers. To tackle this challenge, in this work, we propose a backdoor mitigation approach, PURE, via head pruning and normalization of attention weights. The idea is to prune the attention heads that are potentially affected by poisoned texts with only clean texts on hand and then further normalize the weights of remaining attention heads to mitigate the backdoor impacts. We conduct experiments to defend against various backdoor attacks on the classification task. The experimental results show the effectiveness of PURE in lowering the attack success rate without sacrificing the performance on clean texts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingyi Zhao;Depeng Xu;Shuhan Yuan", "authorids": "~Xingyi_Zhao1;~Depeng_Xu2;~Shuhan_Yuan2", "gender": "M;M;", "homepage": ";https://webpages.charlotte.edu/dxu7/;https://yuan.shuhan.org/", "dblp": "333/0697.html;212/1161.html;172/2711.html+", "google_scholar": "yVq7_1sAAAAJ;GIVbIEgAAAAJ;14J_wuIAAAAJ", "orcid": ";0000-0002-0371-1815;", "linkedin": ";;", "or_profile": "~Xingyi_Zhao1;~Depeng_Xu2;~Shuhan_Yuan2", "aff": "Utah State University;University of North Carolina at Charlotte;Utah State University", "aff_domain": "usu.edu;uncc.edu;usu.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhao2024defense,\ntitle={Defense against Backdoor Attack on Pre-trained Language Models via Head Pruning and Attention Normalization},\nauthor={Xingyi Zhao and Depeng Xu and Shuhan Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1SiEfsCecd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2811795, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3979137353845160273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "usu.edu;uncc.edu;usu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Utah State University;University of North Carolina at Charlotte", "aff_unique_dep": ";", "aff_unique_url": "https://www.usu.edu;https://www.uncc.edu", "aff_unique_abbr": "USU;UNCC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlotte", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Physics and Lie symmetry informed Gaussian processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35147", "id": "1V50J0emll", "proceeding": "https://proceedings.mlr.press/v235/dalton24a.html", "pdf": "https://openreview.net/pdf?id=1V50J0emll", "openreview": "https://openreview.net/forum?id=1V50J0emll", "author_site": "David Dalton, Dirk Husmeier, Hao Gao", "tldr": "", "abstract": "Physics-informed machine learning (PIML) has established itself as a new scientific paradigm which enables the seamless integration of observational data with partial differential 
equation (PDE) based physics models. A powerful tool for the analysis, reduction and solution of PDEs is the Lie symmetry method. Nevertheless, only recently has the integration of such symmetries into PIML frameworks begun to be explored. The present work adds to this growing literature by introducing an approach for incorporating a Lie symmetry into a physics-informed Gaussian process (GP) model. The symmetry is introduced as a constraint on the GP: either in a soft manner via virtual observations of an induced PDE called the invariant surface condition, or explicitly through the design of the kernel. Experimental results demonstrate that the use of symmetry constraints improves the performance of the GP for both forward and inverse problems, and that our approach offers competitive performance with neural networks in the low-data environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Dalton;Dirk Husmeier;Hao Gao", "authorids": "~David_Dalton1;dirk.husmeier@glasgow.ac.uk;hao.gao@glasgow.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndalton2024physics,\ntitle={Physics and Lie symmetry informed Gaussian processes},\nauthor={David Dalton and Dirk Husmeier and Hao Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1V50J0emll}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 915444, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3453707534324644421&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";;", "author_num": 3 }, { "title": "Correcting Diffusion-Based Perceptual Image Compression with Privileged End-to-End Decoder", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35146", "id": "1WWpIEFdlk", "proceeding": "https://proceedings.mlr.press/v235/ma24s.html", "pdf": "https://openreview.net/pdf?id=1WWpIEFdlk", "openreview": "https://openreview.net/forum?id=1WWpIEFdlk", "author_site": "Yiyang Ma, Wenhan Yang, Jiaying Liu", "tldr": "", "abstract": "The images produced by diffusion models can attain excellent perceptual quality. However, it is challenging for diffusion models to guarantee distortion; hence, the integration of diffusion models and image compression models still needs more comprehensive exploration. This paper presents a diffusion-based image compression method that employs a privileged end-to-end decoder model as correction, which achieves better perceptual quality while guaranteeing the distortion to an extent. We build a diffusion model and design a novel paradigm that combines the diffusion model and an end-to-end decoder, and the latter is responsible for transmitting the privileged information extracted at the encoder side. Specifically, we theoretically analyze the reconstruction process of the diffusion models at the encoder side with the original images being visible. Based on the analysis, we introduce an end-to-end convolutional decoder to provide a better approximation of the score function $\\nabla_{\\mathbf{x}_t}\\log p(\\mathbf{x}_t)$ at the encoder side and effectively transmit the combination. 
Experiments demonstrate the superiority of our method in both distortion and perception compared with previous perceptual compression methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiyang Ma;Wenhan Yang;Jiaying Liu", "authorids": "~Yiyang_Ma1;~Wenhan_Yang1;~Jiaying_Liu1", "gender": "M;M;F", "homepage": "https://realpasu.github.io/;https://flyywh.github.io/;http://www.icst.pku.edu.cn/struct/people/liujiaying.html", "dblp": "324/2590;156/2359;32/197.html", "google_scholar": ";https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.tw/citations?user=-OcSne0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yiyang_Ma1;~Wenhan_Yang1;~Jiaying_Liu1", "aff": "DeepSeek AI;;Peking University", "aff_domain": "deepseek.com;;pku.edu.cn", "position": "Intern;;Associate Professor", "bibtex": "@inproceedings{\nma2024correcting,\ntitle={Correcting Diffusion-Based Perceptual Image Compression with Privileged End-to-End Decoder},\nauthor={Yiyang Ma and Wenhan Yang and Jiaying Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1WWpIEFdlk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3017464, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6715817532385783219&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "deepseek.com;;pku.edu.cn", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "DeepSeek AI;Peking University", "aff_unique_dep": ";", "aff_unique_url": ";http://www.pku.edu.cn", "aff_unique_abbr": "DeepSeek AI;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "OpenMoE: An Early Effort on Open Mixture-of-Experts Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35145", "id": "1YDeZU8Lt5", "proceeding": "https://proceedings.mlr.press/v235/xue24c.html", "pdf": "https://openreview.net/pdf?id=1YDeZU8Lt5", "openreview": "https://openreview.net/forum?id=1YDeZU8Lt5", "author_site": "Fuzhao Xue, Zian Zheng, Yao Fu, Jinjie Ni, Zangwei Zheng, Wangchunshu Zhou, Yang You", "tldr": "", "abstract": "To help the open-source community have a better understanding of Mixture-of-Experts (MoE) based large language models (LLMs), we train and release OpenMoE, a series of fully open-sourced and reproducible decoder-only MoE LLMs, ranging from 650M to 34B parameters and trained on up to over 1T tokens. Our investigation confirms that MoE-based LLMs can offer a more favorable cost-effectiveness trade-off than dense LLMs, highlighting the potential effectiveness for future LLM development. One more important contribution of this study is an in-depth analysis of the routing mechanisms within our OpenMoE models, leading to three significant findings: Context-Independent Specialization, Early Routing Learning, and Drop-towards-the-End. We discovered that routing decisions in MoE models are predominantly based on token IDs, with minimal context relevance. The token-to-expert assignments are determined early in the pre-training phase and remain largely unchanged. This imperfect routing can result in performance degradation, particularly in sequential tasks like multi-turn conversations, where tokens appearing later in a sequence are more likely to be dropped. 
Finally, we rethink our design based on the above-mentioned observations and analysis. To facilitate future MoE LLM development, we propose potential strategies for mitigating the issues we found and further improving off-the-shelf MoE LLM designs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fuzhao Xue;Zian Zheng;Yao Fu;Jinjie Ni;Zangwei Zheng;Wangchunshu Zhou;Yang You", "authorids": "~Fuzhao_Xue1;~Zian_Zheng1;~Yao_Fu3;~Jinjie_Ni1;~Zangwei_Zheng1;~Wangchunshu_Zhou1;~Yang_You1", "gender": "M;M;M;M;M;M;M", "homepage": "https://xuefuzhao.github.io/;http://zheng-zian-andy.com;https://franxyao.github.io/;;https://zhengzangw.github.io;https://michaelzhouwang.github.io;https://www.comp.nus.edu.sg/~youy/", "dblp": "248/1245;290/8897-1;;257/4822;289/0376;245/8640.html;33/8167-1.html", "google_scholar": "JMHsqIkAAAAJ;3j1uWicAAAAJ;liSP4cEAAAAJ;TXfiHo8AAAAJ;FTqutJEAAAAJ;UebIjuQAAAAJ;jF4dPZwAAAAJ", "orcid": ";;;;0000-0002-1505-1535;;", "linkedin": "fuzhao-xue-6410561a6/;zian-andy-zheng/;;;;;yang-you-0b92914b/", "or_profile": "~Fuzhao_Xue1;~Zian_Zheng1;~Yao_Fu3;~Jinjie_Ni1;~Zangwei_Zheng1;~Wangchunshu_Zhou1;~Yang_You1", "aff": "National University of Singapore;National University of Singapore;University of Edinburgh;National University of Singapore;National University of Singapore;AIWaves Inc.;National University of Singapore", "aff_domain": "nus.edu.sg;u.nus.edu;ed.ac.uk;nus.edu.sg;nus.edu.sg;aiwaves.cn;nus.edu.sg", "position": "PhD student;MS student;PhD student;Postdoc;PhD student;Researcher;Professor", "bibtex": "@inproceedings{\nxue2024openmoe,\ntitle={OpenMoE: An Early Effort on Open Mixture-of-Experts Language Models},\nauthor={Fuzhao Xue and Zian Zheng and Yao Fu and Jinjie Ni and Zangwei Zheng and Wangchunshu Zhou and Yang You},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1YDeZU8Lt5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 824069, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14984858455919371073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nus.edu.sg;u.nus.edu;ed.ac.uk;nus.edu.sg;nus.edu.sg;aiwaves.cn;nus.edu.sg", "author_num": 7, "aff_unique_index": "0;0;1;0;0;2;0", "aff_unique_norm": "National University of Singapore;University of Edinburgh;AIWaves Inc.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.ed.ac.uk;", "aff_unique_abbr": "NUS;Edinburgh;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;2;0", "aff_country_unique": "Singapore;United Kingdom;United States" }, { "title": "SPABA: A Single-Loop and Probabilistic Stochastic Bilevel Algorithm Achieving Optimal Sample Complexity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35144", "id": "1YMjzz2g81", "proceeding": "https://proceedings.mlr.press/v235/chu24a.html", "pdf": "https://openreview.net/pdf?id=1YMjzz2g81", "openreview": "https://openreview.net/forum?id=1YMjzz2g81", "author_site": "Tianshu Chu, Dachuan Xu, Wei Yao, Jin Zhang", "tldr": "", "abstract": "While stochastic bilevel optimization methods have been extensively studied for addressing large-scale nested optimization problems in machine learning, it remains an open question whether the optimal complexity bounds for solving bilevel optimization are the same as those in single-level optimization. 
Our main result resolves this question: SPABA, an adaptation of the PAGE method for nonconvex optimization in (Li et al., 2021) to the bilevel setting, can achieve optimal sample complexity in both the finite-sum and expectation settings. We show the optimality of SPABA by proving that there is no gap in complexity analysis between stochastic bilevel and single-level optimization when implementing PAGE. Notably, as indicated by the results of (Dagr\u00e9ou et al., 2022), there might exist a gap in complexity analysis when implementing other stochastic gradient estimators, like SGD and SAGA. In addition to SPABA, we propose several other single-loop stochastic bilevel algorithms that either match or improve the state-of-the-art sample complexity results, leveraging our convergence rate and complexity analysis. Numerical experiments demonstrate the superior practical performance of the proposed methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianshu Chu;Dachuan Xu;Wei Yao;Jin Zhang", "authorids": "~Tianshu_Chu4;~Dachuan_Xu1;~Wei_Yao3;~Jin_Zhang8", "gender": "F;M;;M", "homepage": ";;https://mathscinet.ams.org/mathscinet/search/author.html?mrauthid=910710;https://math.sustech.edu.cn/c/zhangjin?lang=en", "dblp": ";73/41.html;;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;https://scholar.google.ca/citations?hl=en", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Tianshu_Chu4;~Dachuan_Xu1;~Wei_Yao3;~Jin_Zhang8", "aff": "Beijing University of Technology;Beijing University of Technology;Southern University of Science and Technology;", "aff_domain": "bjut.edu.cn;bjut.edu.cn;sustech.edu.cn;", "position": "PhD student;Full Professor;Assistant Professor;", "bibtex": "@inproceedings{\nchu2024spaba,\ntitle={{SPABA}: A Single-Loop and Probabilistic Stochastic Bilevel Algorithm Achieving Optimal Sample Complexity},\nauthor={Tianshu Chu and Dachuan Xu and Wei Yao and Jin Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1YMjzz2g81}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3110379, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12523516892825154525&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "bjut.edu.cn;bjut.edu.cn;sustech.edu.cn;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Beijing University of Technology;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.bjut.edu.cn;https://www.sustech.edu.cn", "aff_unique_abbr": "BJUT;SUSTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Antibody Design Using a Score-based Diffusion Model Guided by Evolutionary, Physical and Geometric Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35143", "id": "1YsQI04KaN", "proceeding": "https://proceedings.mlr.press/v235/zhu24j.html", "pdf": "https://openreview.net/pdf?id=1YsQI04KaN", "openreview": "https://openreview.net/forum?id=1YsQI04KaN", "author_site": "Tian Zhu, Milong Ren, Haicang Zhang", "tldr": "", "abstract": "Antibodies are central proteins in adaptive immune responses, responsible for protecting against viruses and other pathogens.
Rational antibody design has proven effective in the diagnosis and treatment of various diseases like cancers and virus infections. While recent diffusion-based generative models show promise in designing antigen-specific antibodies, the primary challenge lies in the scarcity of labeled antibody-antigen complex data and binding affinity data. We present AbX, a new score-based diffusion generative model guided by evolutionary, physical, and geometric constraints for antibody design. These constraints serve to narrow the search space and provide priors for plausible antibody sequences and structures. Specifically, we leverage a pre-trained protein language model as priors for evolutionary plausible antibodies and introduce additional training objectives for geometric and physical constraints like van der Waals forces. Furthermore, as far as we know, AbX is the first score-based diffusion model with continuous timesteps for antibody design, jointly modeling the discrete sequence space and the $\\mathrm{SE}(3)$ structure space. Evaluated on two independent testing sets, we show that AbX outperforms other published methods, achieving higher accuracy in sequence and structure generation and enhanced antibody-antigen binding affinity. Ablation studies highlight the clear contributions of the introduced constraints to antibody design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tian Zhu;Milong Ren;Haicang Zhang", "authorids": "~Tian_Zhu1;~Milong_Ren2;~Haicang_Zhang1", "gender": "M;M;M", "homepage": "https://eurekazhu.github.io;https://github.com/rabbit-0001/renmilong;", "dblp": ";;138/0439", "google_scholar": "rLxdI10AAAAJ;;myzZFrYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tian_Zhu1;~Milong_Ren2;~Haicang_Zhang1", "aff": "Institute of Computing Technology, Chinese Academy of Sciences; Institute of Computing Technology;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ict.ac.cn;ict.ac.cn", "position": "MS student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nzhu2024antibody,\ntitle={Antibody Design Using a Score-based Diffusion Model Guided by Evolutionary, Physical and Geometric Constraints},\nauthor={Tian Zhu and Milong Ren and Haicang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1YsQI04KaN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5831068, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14621205415255986178&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "ict.ac.cn;ict.ac.cn;ict.ac.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;Institute of Computing Technology", "aff_unique_dep": "Institute of Computing Technology;", "aff_unique_url": "http://www.ict.ac.cn;http://www.ict.ac.cn", "aff_unique_abbr": "CAS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Towards Interpretable Deep Local Learning with Successive Gradient Reconciliation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35142", "id": "1ZJLNLZIpk", "proceeding": "https://proceedings.mlr.press/v235/yang24m.html", "pdf": "https://openreview.net/pdf?id=1ZJLNLZIpk", "openreview": "https://openreview.net/forum?id=1ZJLNLZIpk", "author_site": 
"Yibo Yang, Xiaojie Li, Motasem Alfarra, Hasan Hammoud, Adel Bibi, Phil Torr, Bernard Ghanem", "tldr": "", "abstract": "Relieving the reliance of neural network training on a global back-propagation (BP) has emerged as a notable research topic due to the biological implausibility and huge memory consumption caused by BP. Among the existing solutions, local learning optimizes gradient-isolated modules of a neural network with local errors and has been proved to be effective even on large-scale datasets. However, the reconciliation among local errors has never been investigated. In this paper, we first theoretically study non-greedy layer-wise training and show that the convergence cannot be assured when the local gradient in a module w.r.t. its input is not reconciled with the local gradient in the previous module w.r.t. its output. Inspired by the theoretical result, we further propose a local training strategy that successively regularizes the gradient reconciliation between neighboring modules without breaking gradient isolation or introducing any learnable parameters. Our method can be integrated into both local-BP and BP-free settings. In experiments, we achieve significant performance improvements compared to previous methods. Particularly, our method for CNN and Transformer architectures on ImageNet is able to attain a competitive performance with global BP, saving more than 40% memory consumption.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yibo Yang;Xiaojie Li;Motasem Alfarra;Hasan Abed Al Kader Hammoud;Adel Bibi;Philip Torr;Bernard Ghanem", "authorids": "~Yibo_Yang2;~Xiaojie_Li3;~Motasem_Alfarra1;~Hasan_Abed_Al_Kader_Hammoud1;~Adel_Bibi1;~Philip_Torr1;~Bernard_Ghanem1", "gender": "M;F;M;M;M;;M", "homepage": "https://iboing.github.io/;https://xiaojieli0903.github.io;https://motasemalfarra.netlify.app/;https://cemse.kaust.edu.sa/vcc/people/person/hasan-abed-al-kader-hammoud;http://adelbibi.com;http://www.robots.ox.ac.uk/~tvg/;https://ivul.kaust.edu.sa", "dblp": "28/7717/;;255/5192;259/0615;176/0964;;37/2516", "google_scholar": "DxXXnCcAAAAJ;0PcPQfQAAAAJ;https://scholar.google.com/citations?hl=en;Plf1JSIAAAAJ;Q4j2laYAAAAJ;;rVsGTeEAAAAJ", "orcid": ";0000-0001-6449-2727;;;0000-0002-6169-3918;;0000-0002-5534-587X", "linkedin": ";;;hasan-abed-al-kader-hammoud-56392a147/;adel-bibi-ba3671ab/;;bernardghanem/", "or_profile": "~Yibo_Yang2;~Xiaojie_Li3;~Motasem_Alfarra1;~Hasan_Abed_Al_Kader_Hammoud1;~Adel_Bibi1;~Philip_Torr1;~Bernard_Ghanem1", "aff": "King Abdullah University of Science and Technology;Harbin Institute of Technology;KAUST;KAUST;University of Oxford;University of Oxford;King Abdullah University of Science and Technology", "aff_domain": "kaust.edu.sa;hit.edu.cn;kaust.edu.sa;kaust.edu.sa;ox.ac.uk;ox.ac.uk;kaust.edu.sa", "position": "Research Scientist;PhD student;PhD student;PhD student;Senior Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024towards,\ntitle={Towards Interpretable Deep Local Learning with Successive Gradient Reconciliation},\nauthor={Yibo Yang and Xiaojie Li and Motasem Alfarra and Hasan Abed Al Kader Hammoud and Adel Bibi and Philip Torr and Bernard Ghanem},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1ZJLNLZIpk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 518174, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=5495139438286468285&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "kaust.edu.sa;hit.edu.cn;kaust.edu.sa;kaust.edu.sa;ox.ac.uk;ox.ac.uk;kaust.edu.sa", "author_num": 7, "aff_unique_index": "0;1;0;0;2;2;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Harbin Institute of Technology;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kast.kau.edu.sa;http://www.hit.edu.cn/;https://www.ox.ac.uk", "aff_unique_abbr": "KAUST;HIT;Oxford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;1;0;0;2;2;0", "aff_country_unique": "Saudi Arabia;China;United Kingdom" }, { "title": "Lie Neurons: Adjoint-Equivariant Neural Networks for Semisimple Lie Algebras", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35141", "id": "1bJLl4fY6i", "proceeding": "https://proceedings.mlr.press/v235/lin24aa.html", "pdf": "https://openreview.net/pdf?id=1bJLl4fY6i", "openreview": "https://openreview.net/forum?id=1bJLl4fY6i", "author_site": "Tzu-Yuan Lin, Minghan Zhu, Maani Ghaffari", "tldr": "", "abstract": "This paper proposes an equivariant neural network that takes data in any finite-dimensional semi-simple Lie algebra as input. The corresponding group acts on the Lie algebra as adjoint operations, making our proposed network adjoint-equivariant. Our framework generalizes the Vector Neurons, a simple $\\mathrm{SO}(3)$-equivariant network, from 3-D Euclidean space to Lie algebra spaces, building upon the invariance property of the Killing form. Furthermore, we propose novel Lie bracket layers and geometric channel mixing layers that extend the modeling capacity. Experiments are conducted for the $\\mathfrak{so}(3)$, $\\mathfrak{sl}(3)$, and $\\mathfrak{sp}(4)$ Lie algebras on various tasks, including fitting equivariant and invariant functions, learning system dynamics, point cloud registration, and homography-based shape classification. 
Our proposed equivariant network shows wide applicability and competitive performance in various domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tzu-Yuan Lin;Minghan Zhu;Maani Ghaffari", "authorids": "~Tzu-Yuan_Lin1;~Minghan_Zhu1;~Maani_Ghaffari1", "gender": "M;Not Specified;M", "homepage": "https://tzuyuan.github.io/;;https://curly.engin.umich.edu/", "dblp": "116/9958;255/5003;", "google_scholar": "1HY3TXcAAAAJ;70CbUXwAAAAJ;l2jdSb8AAAAJ", "orcid": ";0000-0002-0145-7542;0000-0002-4734-4295", "linkedin": ";;maani-ghaffari-19b017203/", "or_profile": "~Tzu-Yuan_Lin1;~Minghan_Zhu1;~Maani_Ghaffari1", "aff": "University of Michigan;University of Pennsylvania;University of Michigan", "aff_domain": "umich.edu;upenn.edu;umich.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nlin2024lie,\ntitle={Lie Neurons: Adjoint-Equivariant Neural Networks for Semisimple Lie Algebras},\nauthor={Tzu-Yuan Lin and Minghan Zhu and Maani Ghaffari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1bJLl4fY6i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1870776, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8227908635002508967&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "umich.edu;upenn.edu;umich.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Michigan;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.umich.edu;https://www.upenn.edu", "aff_unique_abbr": "UM;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding MLP-Mixer as a wide and sparse MLP", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35140", "id": "1dtYo5ywXZ", "proceeding": "https://proceedings.mlr.press/v235/hayase24a.html", "pdf": "https://openreview.net/pdf?id=1dtYo5ywXZ", "openreview": "https://openreview.net/forum?id=1dtYo5ywXZ", "author_site": "Tomohiro Hayase, Ryo Karakida", "tldr": "", "abstract": "Multi-layer perceptron (MLP) is a fundamental component of deep learning, and recent MLP-based architectures, especially the MLP-Mixer, have achieved significant empirical success. Nevertheless, our understanding of why and how the MLP-Mixer outperforms conventional MLPs remains largely unexplored. In this work, we reveal that sparseness is a key mechanism underlying the MLP-Mixers. First, the Mixers have an effective expression as a wider MLP with Kronecker-product weights, clarifying that the Mixers efficiently embody several sparseness properties explored in deep learning. In the case of linear layers, the effective expression elucidates an implicit sparse regularization caused by the model architecture and a hidden relation to Monarch matrices, which is also known as another form of sparse parameterization. Next, for general cases, we empirically demonstrate quantitative similarities between the Mixer and the unstructured sparse-weight MLPs. 
Following a guiding principle proposed by Golubeva, Neyshabur and Gur-Ari (2021), which fixes the number of connections and increases the width and sparsity, the Mixers can demonstrate improved performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tomohiro Hayase;Ryo Karakida", "authorids": "~Tomohiro_Hayase1;~Ryo_Karakida2", "gender": "M;M", "homepage": "https://thayafluss.notion.site/main;https://sites.google.com/view/ryokarakida/english", "dblp": "218/5945;", "google_scholar": "https://scholar.google.co.in/citations?user=_F82YY4AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Tomohiro_Hayase1;~Ryo_Karakida2", "aff": "Cluster, INC.;AIST, National Institute of Advanced Industrial Science and Technology", "aff_domain": "cluster.mu;aist.go.jp", "position": "Principal Researcher;Researcher", "bibtex": "@inproceedings{\nhayase2024understanding,\ntitle={Understanding {MLP}-Mixer as a wide and sparse {MLP}},\nauthor={Tomohiro Hayase and Ryo Karakida},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1dtYo5ywXZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9622380, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2719037636805174909&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cluster.mu;aist.go.jp", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Cluster, INC.;National Institute of Advanced Industrial Science and Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://www.aist.go.jp", "aff_unique_abbr": ";AIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Unifying Bayesian Flow Networks and Diffusion Models through Stochastic Differential Equations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35139", "id": "1jHiq640y1", "proceeding": "https://proceedings.mlr.press/v235/xue24d.html", "pdf": "https://openreview.net/pdf?id=1jHiq640y1", "openreview": "https://openreview.net/forum?id=1jHiq640y1", "author_site": "Kaiwen Xue, Yuhao Zhou, Shen Nie, Xu Min, Xiaolu Zhang, JUN ZHOU, Chongxuan Li", "tldr": "", "abstract": "Bayesian flow networks (BFNs) iteratively refine the parameters, instead of the samples in diffusion models (DMs), of distributions at various noise levels through Bayesian inference. Owing to its differentiable nature, BFNs are promising in modeling both continuous and discrete data, while simultaneously maintaining fast sampling capabilities. This paper aims to understand and enhance BFNs by connecting them with DMs through stochastic differential equations (SDEs). We identify the linear SDEs corresponding to the noise-addition processes in BFNs, demonstrate that BFN's regression losses are aligned with denoise score matching, and validate the sampler in BFN as a first-order solver for the respective reverse-time SDE. Based on these findings and existing recipes of fast sampling in DMs, we propose specialized solvers for BFNs that markedly surpass the original BFN sampler in terms of sample quality with a limited number of function evaluations (e.g., 10) on both image and text datasets.
Notably, our best sampler achieves an increase in speed of $5\\sim20$ times for free.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaiwen Xue;Yuhao Zhou;Shen Nie;Xu Min;Xiaolu Zhang;JUN ZHOU;Chongxuan Li", "authorids": "~Kaiwen_Xue1;~Yuhao_Zhou2;~Shen_Nie2;~Xu_Min1;~Xiaolu_Zhang2;~JUN_ZHOU6;~Chongxuan_Li1", "gender": "M;M;M;F;M;M;M", "homepage": "https://yuhaoz.com;https://github.com/NieShenRuc;https://minxueric.github.io/;https://scholar.google.com/citations?user=cAz9PToAAAAJ;https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en;http://ml.cs.tsinghua.edu.cn/~chongxuan;http://kaiwenxue.top/", "dblp": ";342/3413;08/2810;48/5176;99/3847-11;161/9965;", "google_scholar": "GKLRbxoAAAAJ;;xuYp0_sAAAAJ;;mCVvloEAAAAJ;UKMcQn4AAAAJ;", "orcid": ";;;0000-0001-8055-0245;0000-0001-6033-6102;0000-0002-0912-9076;", "linkedin": ";;;;;;", "or_profile": "~Yuhao_Zhou2;~Shen_Nie2;~Xu_Min1;~Xiaolu_Zhang2;~JUN_ZHOU6;~Chongxuan_Li1;~Kevin_Xue1", "aff": "Tsinghua University;Renmin University of China;;Ant Group;Ant Group;Renmin University of China;", "aff_domain": "tsinghua.edu.cn;ruc.edu.cn;;antfin.com;antgroup.com;ruc.edu.cn;", "position": "PhD student;PhD student;;Researcher;Researcher;Associate Professor;", "bibtex": "@inproceedings{\nxue2024unifying,\ntitle={Unifying Bayesian Flow Networks and Diffusion Models through Stochastic Differential Equations},\nauthor={Kaiwen Xue and Yuhao Zhou and Shen Nie and Xu Min and Xiaolu Zhang and JUN ZHOU and Chongxuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1jHiq640y1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3753525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4728908763675332235&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;ruc.edu.cn;;antfin.com;antgroup.com;ruc.edu.cn;", "author_num": 7, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "Tsinghua University;Renmin University of China;Ant Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ruc.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "THU;RUC;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On PI Controllers for Updating Lagrange Multipliers in Constrained Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35138", "id": "1khG2xf1yt", "proceeding": "https://proceedings.mlr.press/v235/sohrabi24a.html", "pdf": "https://openreview.net/pdf?id=1khG2xf1yt", "openreview": "https://openreview.net/forum?id=1khG2xf1yt", "author_site": "Motahareh Sohrabi, Juan Ramirez, Tianyue Zhang, Simon Lacoste-Julien, Jose Gallego-Posada", "tldr": "", "abstract": "Constrained optimization offers a powerful framework to prescribe desired behaviors in neural network models. Typically, constrained problems are solved via their min-max Lagrangian formulations, which exhibit unstable oscillatory dynamics when optimized using gradient descent-ascent. The adoption of constrained optimization techniques in the machine learning community is currently limited by the lack of reliable, general-purpose update schemes for the Lagrange multipliers. 
This paper proposes the \u03bdPI algorithm and contributes an optimization perspective on Lagrange multiplier updates based on PI controllers, extending the work of Stooke, Achiam and Abbeel (2020). We provide theoretical and empirical insights explaining the inability of momentum methods to address the shortcomings of gradient descent-ascent, and contrast this with the empirical success of our proposed \u03bdPI controller. Moreover, we prove that \u03bdPI generalizes popular momentum methods for single-objective minimization. Our experiments demonstrate that \u03bdPI reliably stabilizes the multiplier dynamics and its hyperparameters enjoy robust and predictable behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Motahareh Sohrabi;Juan Ramirez;Tianyue H. Zhang;Simon Lacoste-Julien;Jose Gallego-Posada", "authorids": "~Motahareh_Sohrabi1;~Juan_Ramirez2;~Tianyue_H._Zhang1;~Simon_Lacoste-Julien1;~Jose_Gallego-Posada1", "gender": "F;M;F;M;M", "homepage": "https://motahareh-sohrabi.github.io/;https://juan43ramirez.github.io;https://tianyuehz.github.io/;http://www.iro.umontreal.ca/~slacoste/;http://gallego-posada.github.io/", "dblp": ";;354/4223;94/446.html;211/7701", "google_scholar": ";yop0kRkAAAAJ;6dHuGwkAAAAJ;oejm5IUAAAAJ;tfKnkRQAAAAJ", "orcid": ";;;0000-0001-6485-6180;", "linkedin": "motahareh-sohrabi-aa507a149/;juan-camilo-ramirez-de-los-rios-11ab2b141/;helen-zhang-tianyue/;simon-lacoste-julien-355b9a3;", "or_profile": "~Motahareh_Sohrabi1;~Juan_Ramirez2;~Tianyue_H._Zhang1;~Simon_Lacoste-Julien1;~Jose_Gallego-Posada1", "aff": ";University of Montreal;Mila, Universit\u00e9 de Montr\u00e9al;Samsung - SAIT AI Lab, Montreal;University of Montreal", "aff_domain": ";umontreal.ca;mila.umontreal.ca;samsung.com;umontreal.ca", "position": ";PhD student;PhD student;VP Lab Director;PhD student", "bibtex": "@inproceedings{\nsohrabi2024on,\ntitle={On {PI} Controllers for Updating Lagrange Multipliers in Constrained Optimization},\nauthor={Motahareh Sohrabi and Juan Ramirez and Tianyue H. 
Zhang and Simon Lacoste-Julien and Jose Gallego-Posada},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1khG2xf1yt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1726190, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14042355114995164188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";umontreal.ca;mila.umontreal.ca;samsung.com;umontreal.ca", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Montreal;Universit\u00e9 de Montr\u00e9al;Samsung", "aff_unique_dep": ";Mila;SAIT AI Lab", "aff_unique_url": "https://www.umontreal.ca;https://umontreal.ca;https://www.samsung.com", "aff_unique_abbr": "UM;UdeM;Samsung", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Montr\u00e9al;Montreal", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "CATS: Enhancing Multivariate Time Series Forecasting by Constructing Auxiliary Time Series as Exogenous Variables", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35137", "id": "1lDAGDe0UR", "proceeding": "https://proceedings.mlr.press/v235/lu24d.html", "pdf": "https://openreview.net/pdf?id=1lDAGDe0UR", "openreview": "https://openreview.net/forum?id=1lDAGDe0UR", "author_site": "Jiecheng Lu, Xu Han, Yan Sun, Shihao Yang", "tldr": "", "abstract": "For Multivariate Time Series Forecasting (MTSF), recent deep learning applications show that univariate models frequently outperform multivariate ones. To address the deficiency in multivariate models, we introduce a method to Construct Auxiliary Time Series (CATS) that functions like a 2D temporal-contextual attention mechanism, which generates Auxiliary Time Series (ATS) from Original Time Series (OTS) to effectively represent and incorporate inter-series relationships for forecasting. Key principles of ATS\u2014continuity, sparsity, and variability\u2014are identified and implemented through different modules.
Even with a basic 2-layer MLP as the core predictor, CATS achieves state-of-the-art performance, significantly reducing complexity and parameters compared to previous multivariate models, marking it as an efficient and transferable MTSF solution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiecheng Lu;Xu Han;Yan Sun;Shihao Yang", "authorids": "~Jiecheng_Lu1;~Xu_Han14;~Yan_Sun5;~Shihao_Yang1", "gender": "M;;M;M", "homepage": ";;https://www.isye.gatech.edu/people/phd-students;https://www.isye.gatech.edu/users/shihao-yang", "dblp": "359/2326;;;", "google_scholar": "H_Bz5A0AAAAJ;;;Ig6LhL8AAAAJ", "orcid": "0009-0008-3453-8569;;;0000-0003-3910-4969", "linkedin": "jiecheng-l-b01a491a4/;;;", "or_profile": "~Jiecheng_Lu1;~Xu_Han14;~Yan_Sun5;~Shihao_Yang1", "aff": "Georgia Institute of Technology;;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;;gatech.edu;gatech.edu", "position": "PhD student;;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlu2024cats,\ntitle={{CATS}: Enhancing Multivariate Time Series Forecasting by Constructing Auxiliary Time Series as Exogenous Variables},\nauthor={Jiecheng Lu and Xu Han and Yan Sun and Shihao Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1lDAGDe0UR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3032261, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11949058600486577366&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "gatech.edu;;gatech.edu;gatech.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Towards Certified Unlearning for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35136", "id": "1mf1ISuyS3", "proceeding": "https://proceedings.mlr.press/v235/zhang24l.html", "pdf": "https://openreview.net/pdf?id=1mf1ISuyS3", "openreview": "https://openreview.net/forum?id=1mf1ISuyS3", "author_site": "Binchi Zhang, Yushun Dong, Tianhao Wang, Jundong Li", "tldr": "", "abstract": "In the field of machine unlearning, certified unlearning has been extensively studied in convex machine learning models due to its high efficiency and strong theoretical guarantees. However, its application to deep neural networks (DNNs), known for their highly nonconvex nature, still poses challenges. To bridge the gap between certified unlearning and DNNs, we propose several simple techniques to extend certified unlearning methods to nonconvex objectives. To reduce the time complexity, we develop an efficient computation method by inverse Hessian approximation without compromising certification guarantees. In addition, we extend our discussion of certification to nonconvergence training and sequential unlearning, considering that real-world users can send unlearning requests at different time points.
Extensive experiments on three real-world datasets demonstrate the efficacy of our method and the advantages of certified unlearning in DNNs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Binchi Zhang;Yushun Dong;Tianhao Wang;Jundong Li", "authorids": "~Binchi_Zhang1;~Yushun_Dong1;~Tianhao_Wang3;~Jundong_Li2", "gender": "M;M;M;M", "homepage": "https://zhangbinchi.github.io/;https://yushundong.github.io;https://tianhao.wang;https://jundongli.github.io/", "dblp": "304/7647;251/9559;https://dblp.uni-trier.de/pid/145/3288-1.html;144/7997.html", "google_scholar": "c8Z36PAAAAAJ;https://scholar.google.com/citations?hl=en;TkgyXGwAAAAJ;uY6ek7sAAAAJ", "orcid": "0000-0001-7321-3822;0000-0001-7504-6159;;", "linkedin": "binchi-zhang-274922221/;;;", "or_profile": "~Binchi_Zhang1;~Yushun_Dong1;~Tianhao_Wang3;~Jundong_Li2", "aff": "University of Virginia, Charlottesville;University of Virginia, Charlottesville;University of Virginia, Charlottesville;University of Virginia", "aff_domain": "virginia.edu;virginia.edu;virginia.edu;virginia.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024towards,\ntitle={Towards Certified Unlearning for Deep Neural Networks},\nauthor={Binchi Zhang and Yushun Dong and Tianhao Wang and Jundong Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1mf1ISuyS3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 735080, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2234021460618852385&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 8, "email": "virginia.edu;virginia.edu;virginia.edu;virginia.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Charlottesville;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Variational Linearized Laplace Approximation for Bayesian Deep Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35135", "id": "1n3aC5rvdE", "proceeding": "https://proceedings.mlr.press/v235/ortega24a.html", "pdf": "https://openreview.net/pdf?id=1n3aC5rvdE", "openreview": "https://openreview.net/forum?id=1n3aC5rvdE", "author_site": "Luis A. Ortega, Simon Rodriguez Santana, Daniel Hern\u00e1ndez-Lobato", "tldr": "", "abstract": "The Linearized Laplace Approximation (LLA) has been recently used to perform uncertainty estimation on the predictions of pre-trained deep neural networks (DNNs). However, its widespread application is hindered by significant computational costs, particularly in scenarios with a large number of training points or DNN parameters. Consequently, additional approximations of LLA, such as Kronecker-factored or diagonal approximate GGN matrices, are utilized, potentially compromising the model's performance. To address these challenges, we propose a new method for approximating LLA using a variational sparse Gaussian Process (GP). Our method is based on the dual RKHS formulation of GPs and retains as the predictive mean the output of the original DNN. 
Furthermore, it allows for efficient stochastic optimization, which results in sub-linear training time in the size of the training dataset. Specifically, its training cost is independent of the number of training points. We compare our proposed method against accelerated LLA (ELLA), which relies on the Nystr\u00f6m approximation, as well as other LLA variants employing the sample-then-optimize principle. Experimental results, both on regression and classification datasets, show that our method outperforms these already existing efficient variants of LLA, both in terms of the quality of the predictive distribution and in terms of total computational time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luis A. Ortega;Simon Rodriguez Santana;Daniel Hern\u00e1ndez-Lobato", "authorids": "~Luis_A._Ortega1;~Simon_Rodriguez_Santana1;~Daniel_Hern\u00e1ndez-Lobato1", "gender": "M;M;M", "homepage": ";http://dhnzl.org;", "dblp": "249/2890;95/166;304/8839", "google_scholar": "https://scholar.google.es/citations?user=9x_tXzwAAAAJ;https://scholar.google.es/citations?user=rL6cvTUAAAAJ;1Ly8qeoAAAAJ", "orcid": "0000-0003-3760-0520;;", "linkedin": ";;ludvins", "or_profile": "~Simon_Rodriguez_Santana1;~Daniel_Hern\u00e1ndez-Lobato1;~Luis_Antonio_Ortega_Andr\u00e9s1", "aff": "ICMAT-CSIC;Universidad Aut\u00f3noma de Madrid;Universidad Aut\u00f3noma de Madrid", "aff_domain": "icmat.es;uam.es;uam.es", "position": "Postdoc;Associate Professor;PhD student", "bibtex": "@inproceedings{\nortega2024variational,\ntitle={Variational Linearized Laplace Approximation for Bayesian Deep Learning},\nauthor={Luis A. Ortega and Simon Rodriguez Santana and Daniel Hern{\\'a}ndez-Lobato},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1n3aC5rvdE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1044417, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8558142634181930387&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "icmat.es;uam.es;uam.es", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Instituto de Ciencias Matem\u00e1ticas (ICMAT);Universidad Aut\u00f3noma de Madrid", "aff_unique_dep": ";", "aff_unique_url": "https://icmat.csic.es/;https://www.uam.es", "aff_unique_abbr": "ICMAT;UAM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Spain" }, { "title": "Proactive DP: A Multiple Target Optimization Framework for DP-SGD", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35134", "id": "1nT6uc3HdY", "proceeding": "https://proceedings.mlr.press/v235/van-dijk24a.html", "pdf": "https://openreview.net/pdf?id=1nT6uc3HdY", "openreview": "https://openreview.net/forum?id=1nT6uc3HdY", "author_site": "Marten van Dijk, Nhuong Nguyen, Toan N. Nguyen, Lam M. Nguyen, Phuong Ha Nguyen", "tldr": "", "abstract": "We introduce a multiple target optimization framework for DP-SGD referred to as pro-active DP. In contrast to traditional DP accountants, which are used to track the expenditure of privacy budgets, the pro-active DP scheme allows one to *a-priori* select parameters of DP-SGD based on a fixed privacy budget (in terms of $\\epsilon$ and $\\delta$) in such a way to optimize the anticipated utility (test accuracy) the most. 
To achieve this objective, we first propose significant improvements to the moments accountant method, presenting a closed-form $(\\epsilon,\\delta)$-DP guarantee that connects all parameters in the DP-SGD setup. Generally, DP-SGD is $(\\epsilon\\leq 1/2,\\delta=1/N)$-DP if $\\sigma=\\sqrt{2(\\epsilon +\\ln(1/\\delta))/\\epsilon}$ with $T$ at least $\\approx 2k^2/\\epsilon$ and $(2/e)^2k^2-1/2\\geq \\ln(N)$, where $T$ is the total number of rounds, and $K=kN$ is the total number of gradient computations where $k$ measures $K$ in number of epochs of size $N$ of the local data set. We prove that our expression is close to tight in that if $T$ is more than a constant factor $\\approx 4$ smaller than the lower bound $\\approx 2k^2/\\epsilon$, then the $(\\epsilon,\\delta)$-DP guarantee is violated. Our enhanced DP theory allows us to create a utility graph and DP calculator. These tools link privacy and utility objectives and search for optimal experiment setups, efficiently taking into account both accuracy and privacy objectives, as well as implementation goals. We furnish a comprehensive implementation flow of our proactive DP, with rigorous experiments to showcase the proof-of-concept.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marten van Dijk;Nhuong Van Nguyen;Toan N. Nguyen;Lam M. Nguyen;Phuong Ha Nguyen", "authorids": "~Marten_van_Dijk1;~Nhuong_Van_Nguyen1;~Toan_N._Nguyen1;~Lam_M._Nguyen1;~Phuong_Ha_Nguyen1", "gender": "M;M;M;M;M", "homepage": "https://www.cwi.nl/people/marten-van-dijk;;;https://scl.uconn.edu/people/ha/info.php;https://lamnguyen-mltd.github.io/", "dblp": "32/1399.html;;;26/8612;181/1428", "google_scholar": "byCWPiwAAAAJ;VPicH1QAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.in/citations?user=OfhlvXQAAAAJ;DeFL5Q8AAAAJ", "orcid": "0000-0001-9388-8050;;;;", "linkedin": "marten-van-dijk-51554012/?originalSubdomain=nl;;toan-nguyen-b9b2601a3;;lam-m-nguyen-71b54750/", "or_profile": "~Marten_van_Dijk1;~Nhuong_Van_Nguyen1;~Toan_N._Nguyen1;~Phuong_Ha_Nguyen1;~Lam_M_Nguyen1", "aff": "Centrum voor Wiskunde en Informatica;University of Connecticut;;eBay Inc.;IBM Research, Thomas J. Watson Research Center", "aff_domain": "cwi.nl;uconn.edu;;ebay.com;ibm.com", "position": "Full Professor;PhD student;;Researcher;Staff Research Scientist", "bibtex": "@inproceedings{\ndijk2024proactive,\ntitle={Proactive {DP}: A Multiple Target Optimization Framework for {DP}-{SGD}},\nauthor={Marten van Dijk and Nhuong Van Nguyen and Toan N. Nguyen and Lam M. 
Nguyen and Phuong Ha Nguyen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1nT6uc3HdY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2329881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0z1NhVlbyrIJ:scholar.google.com/&scioq=Proactive+DP:+A+Multiple+Target+Optimization+Framework+for+DP-SGD&hl=en&as_sdt=0,5", "gs_version_total": 10, "email": "cwi.nl;uconn.edu;;ebay.com;ibm.com", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Centrum voor Wiskunde en Informatica;University of Connecticut;eBay Inc.;IBM", "aff_unique_dep": ";;;IBM Research", "aff_unique_url": "https://www.cwi.nl/;https://www.uconn.edu;https://www.ebayinc.com;https://www.ibm.com/research", "aff_unique_abbr": "CWI;UConn;eBay;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Netherlands;United States" }, { "title": "A Provably Effective Method for Pruning Experts in Fine-tuned Sparse Mixture-of-Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35133", "id": "1oU4FKpVx5", "proceeding": "https://proceedings.mlr.press/v235/chowdhury24a.html", "pdf": "https://openreview.net/pdf?id=1oU4FKpVx5", "openreview": "https://openreview.net/forum?id=1oU4FKpVx5", "author_site": "Mohammed Nowaz Rabbani Chowdhury, Meng Wang, Kaoutar El Maghraoui, Naigang Wang, Pin-Yu Chen, Christopher Carothers", "tldr": "", "abstract": "The sparsely gated mixture of experts (MoE) architecture sends different inputs to different subnetworks (experts), through trainable routers. MoE reduces the training computation significantly for large models, but its deployment can be still memory/computation expensive for some downstream tasks. Model pruning is a popular approach to reduce inference computation, but its application in MoE architecture is largely unexplored. To the best of our knowledge, this paper provides the first provably efficient technique for pruning experts in fine-tuned MoE models. We theoretically prove that prioritizing the pruning of the experts with a smaller change of the router\u2019s $l_2$ norm from the pre-trained model guarantees the preservation of test accuracy, while significantly reducing the model size and the computational requirements. 
Although our theoretical analysis is centered on binary classification tasks on a simplified MoE architecture, our expert pruning method is verified on large vision MoE models such as V-MoE and $\\text{E}^3$-MoE fine-tuned on benchmark datasets such as CIFAR-10, CIFAR-100, and ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammed Nowaz Rabbani Chowdhury;Meng Wang;Kaoutar El Maghraoui;Naigang Wang;Pin-Yu Chen;Christopher Carothers", "authorids": "~Mohammed_Nowaz_Rabbani_Chowdhury1;~Meng_Wang4;~Kaoutar_El_Maghraoui1;~Naigang_Wang1;~Pin-Yu_Chen1;~Christopher_Carothers2", "gender": "M;F;F;M;M;M", "homepage": ";https://www.ecse.rpi.edu/~wang/index.html;https://researcher.watson.ibm.com/researcher/view.php?person=us-kelmaghr;;http://www.pinyuchen.com;https://www.cs.rpi.edu/~chrisc", "dblp": "348/9557;93/6765-3;19/2658.html;78/11176;39/8969;", "google_scholar": "-JaZmaYAAAAJ;;yDp6rbcAAAAJ;https://scholar.google.com/citations?hl=en;jxwlCUUAAAAJ;PQakxO4AAAAJ", "orcid": ";;0000-0002-1967-8749;;0000-0003-1039-8369;", "linkedin": "mohammed-nowaz-rabbani-chowdhury-b24446251/;;kaoutar-el-maghraoui/;;pin-yu-chen-940062a2;", "or_profile": "~Mohammed_Nowaz_Rabbani_Chowdhury1;~Meng_Wang4;~Kaoutar_El_Maghraoui1;~Naigang_Wang1;~Pin-Yu_Chen1;~Christopher_Carothers2", "aff": "University of Dhaka;Rensselaer Polytechnic Institute;International Business Machines;IBM, International Business Machines;International Business Machines;Rensselaer Polytechnic Institute", "aff_domain": "du.ac.bd;rpi.edu;ibm.com;us.ibm.com;ibm.com;rpi.edu", "position": "Lecturer;Associate Professor;Principal Research Staff Member;Researcher;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nchowdhury2024a,\ntitle={A Provably Effective Method for Pruning Experts in Fine-tuned Sparse Mixture-of-Experts},\nauthor={Mohammed Nowaz Rabbani Chowdhury and Meng Wang and Kaoutar El Maghraoui and Naigang Wang and Pin-Yu Chen and Christopher Carothers},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1oU4FKpVx5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7551009, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13834870043738489306&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "du.ac.bd;rpi.edu;ibm.com;us.ibm.com;ibm.com;rpi.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;2;1", "aff_unique_norm": "University of Dhaka;Rensselaer Polytechnic Institute;International Business Machines Corporation;International Business Machines", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.du.ac.bd;https://www.rpi.edu;https://www.ibm.com;https://www.ibm.com", "aff_unique_abbr": "DU;RPI;IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Bangladesh;United States" }, { "title": "Going beyond Compositions, DDPMs Can Produce Zero-Shot Interpolations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35132", "id": "1pj0Sk8GfP", "proceeding": "https://proceedings.mlr.press/v235/deschenaux24a.html", "pdf": "https://openreview.net/pdf?id=1pj0Sk8GfP", "openreview": "https://openreview.net/forum?id=1pj0Sk8GfP", "author_site": "Justin Deschenaux, Igor Krawczuk, Grigorios Chrysos, Volkan Cevher", "tldr": "", "abstract": "Denoising Diffusion Probabilistic Models (DDPMs) exhibit remarkable 
capabilities in image generation, with studies suggesting that they can generalize by composing latent factors learned from the training data. In this work, we go further and study DDPMs trained on strictly separate subsets of the data distribution with large gaps on the support of the latent factors. We show that such a model can effectively generate images in the unexplored, intermediate regions of the distribution. For instance, when trained on clearly smiling and non-smiling faces, we demonstrate a sampling procedure which can generate slightly smiling faces without reference images (zero-shot interpolation). We replicate these findings for other attributes as well as other datasets. [Our code is available on GitHub](https://github.com/jdeschena/ddpm-zero-shot-interpolation).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Justin Deschenaux;Igor Krawczuk;Grigorios Chrysos;Volkan Cevher", "authorids": "~Justin_Deschenaux1;~Igor_Krawczuk1;~Grigorios_Chrysos1;~Volkan_Cevher1", "gender": "M;Unspecified;M;M", "homepage": "https://github.com/jdeschena;https://krawczuk.eu;https://grigorisg9gr.github.io/;http://lions.epfl.ch", "dblp": ";244/7380.html;75/6117-2;70/5301", "google_scholar": ";https://scholar.google.ch/citations?user=rLQIkUsAAAAJ;1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";0000-0002-5281-8926;;", "linkedin": "justin-d-288b05211/;https://linkedin.com/in/igorkrawczuk;;", "or_profile": "~Justin_Deschenaux1;~Igor_Krawczuk1;~Grigorios_Chrysos1;~Volkan_Cevher1", "aff": "EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;University of Wisconsin - Madison;Amazon Development Center Germany", "aff_domain": "epfl.ch;epfl.ch;wisc.edu;amazon.de", "position": "PhD student;PhD student;Assistant Professor;Amazon Scholar", "bibtex": "@inproceedings{\ndeschenaux2024going,\ntitle={Going beyond Compositions, {DDPM}s Can Produce Zero-Shot Interpolations},\nauthor={Justin Deschenaux and Igor Krawczuk and Grigorios Chrysos and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1pj0Sk8GfP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8935738, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=267387450261826080&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "epfl.ch;epfl.ch;wisc.edu;amazon.de", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne;University of Wisconsin-Madison;Amazon", "aff_unique_dep": ";;;Development Center", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.wisc.edu;https://www.amazon.de", "aff_unique_abbr": "EPFL;EPFL;UW-Madison;Amazon", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Lausanne;Madison;", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "Switzerland;United States;Germany" }, { "title": "ACE: Off-Policy Actor-Critic with Causality-Aware Entropy Regularization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35131", "id": "1puvYh729M", "proceeding": "https://proceedings.mlr.press/v235/ji24b.html", "pdf": "https://openreview.net/pdf?id=1puvYh729M", "openreview": "https://openreview.net/forum?id=1puvYh729M", "author_site": "Tianying Ji, Yongyuan Liang, Yan Zeng, Yu Luo, Guowei Xu, Jiawei Guo, Ruijie Zheng, Furong Huang, Fuchun Sun, 
Huazhe Xu", "tldr": "", "abstract": "The varying significance of distinct primitive behaviors during the policy learning process has been overlooked by prior model-free RL algorithms. Leveraging this insight, we explore the causal relationship between different action dimensions and rewards to evaluate the significance of various primitive behaviors during training. We introduce a causality-aware entropy term that effectively identifies and prioritizes actions with high potential impacts for efficient exploration. Furthermore, to prevent excessive focus on specific primitive behaviors, we analyze the gradient dormancy phenomenon and introduce a dormancy-guided reset mechanism to further enhance the efficacy of our method. Our proposed algorithm, **ACE**: Off-policy **A**ctor-critic with **C**ausality-aware **E**ntropy regularization, demonstrates a substantial performance advantage across 29 diverse continuous control tasks spanning 7 domains compared to model-free RL baselines, which underscores the effectiveness, versatility, and efficient sample efficiency of our approach. Benchmark results and videos are available at https://ace-rl.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianying Ji;Yongyuan Liang;Yan Zeng;Yu Luo;Guowei Xu;Jiawei Guo;Ruijie Zheng;Furong Huang;Fuchun Sun;Huazhe Xu", "authorids": "~Tianying_Ji2;~Yongyuan_Liang1;~Yan_Zeng2;~Yu_Luo5;~Guowei_Xu2;~Jiawei_Guo4;~Ruijie_Zheng1;~Furong_Huang1;~Fuchun_Sun1;~Huazhe_Xu1", "gender": "F;F;;M;M;;F;M;M;", "homepage": ";https://cheryyunl.github.io/;https://scholar.google.com/citations?user=XyxLHCAAAAAJ&hl=zh-CN;;https://xugw-kevin.github.io/;http://www.ruijiezheng.com;https://furong-huang.com;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;http://hxu.rocks;https://github.com/GuoKaku", "dblp": "124/2199.html;238/4104;83/4665-2;;11/7718-1;294/8474;72/8513;;164/9006;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;GQToORIAAAAJ;XyxLHCAAAAAJ;https://scholar.google.com.hk/citations?user=KQjoQOMAAAAJ;7xKdIM4AAAAJ;;13yyuCcAAAAJ;;t9HPFawAAAAJ;", "orcid": ";;0000-0001-7721-2560;0000-0001-6229-4639;;;;;;", "linkedin": ";https://linkedin.com/in/yongyuan-l-31462a17a;;;;;;;;", "or_profile": "~Tianying_Ji2;~Yongyuan_Liang1;~Yan_Zeng2;~Yu_Luo5;~Guowei_Xu2;~Ruijie_Zheng1;~Furong_Huang1;~Fuchun_Sun1;~Huazhe_Xu1;~Guo_Jiawei1", "aff": "Tsinghua University;University of Maryland, College Park;Beijing Technology and Business University;Tsinghua University;Tsinghua University;University of Maryland, College Park;University of Maryland;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;umd.edu;btbu.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;cs.umd.edu;cs.umd.edu;cs.tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn", "position": "PhD student;PhD student;Lecturer;PhD student;Undergrad student;PhD student;Assistant Professor;Full Professor;Assistant Professor;Undergrad student", "bibtex": "@inproceedings{\nji2024ace,\ntitle={{ACE}: Off-Policy Actor-Critic with Causality-Aware Entropy Regularization},\nauthor={Tianying Ji and Yongyuan Liang and Yan Zeng and Yu Luo and Guowei Xu and Jiawei Guo and Ruijie Zheng and Furong Huang and Fuchun Sun and Huazhe Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1puvYh729M}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7540422, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 7, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=13046081266874631697&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;umd.edu;btbu.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;cs.umd.edu;cs.umd.edu;cs.tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn", "author_num": 10, "aff_unique_index": "0;1;2;0;0;1;1;0;0;0", "aff_unique_norm": "Tsinghua University;University of Maryland;Beijing Technology and Business University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www/umd.edu;http://www.btbu.edu.cn", "aff_unique_abbr": "THU;UMD;BTBU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;1;0;0;0;1;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Multi-Track Message Passing: Tackling Oversmoothing and Oversquashing in Graph Learning via Preventing Heterophily Mixing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35130", "id": "1sRuv4cnuZ", "proceeding": "https://proceedings.mlr.press/v235/pei24a.html", "pdf": "https://openreview.net/pdf?id=1sRuv4cnuZ", "openreview": "https://openreview.net/forum?id=1sRuv4cnuZ", "author_site": "Hongbin Pei, Yu Li, Huiqi Deng, Jingxin Hai, Pinghui Wang, Jie Ma, Jing Tao, Yuheng Xiong, Xiaohong Guan", "tldr": "", "abstract": "The advancement toward deeper graph neural networks is currently obscured by two inherent issues in message passing, *oversmoothing* and *oversquashing*. We identify the root cause of these issues as information loss due to *heterophily mixing* in aggregation, where messages of diverse category semantics are mixed. We propose a novel multi-track graph convolutional network to address oversmoothing and oversquashing effectively. Our basic idea is intuitive: if messages are separated and independently propagated according to their category semantics, heterophilic mixing can be prevented. Consequently, we present a novel multi-track message passing scheme capable of preventing heterophilic mixing, enhancing long-distance information flow, and improving separation condition. 
Empirical validations show that our model achieved state-of-the-art performance on several graph datasets and effectively tackled oversmoothing and oversquashing, setting a new benchmark of $86.4$% accuracy on Cora.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongbin Pei;Yu Li;Huiqi Deng;Jingxin Hai;Pinghui Wang;Jie Ma;Jing Tao;Yuheng Xiong;Xiaohong Guan", "authorids": "~Hongbin_Pei1;~Yu_Li30;~Huiqi_Deng1;haijingxin@stu.xjtu.edu.cn;~Pinghui_Wang1;~Jie_Ma1;~Jing_Tao2;~Yuheng_Xiong1;~Xiaohong_Guan2", "gender": ";M;F;;M;M;M;M;M", "homepage": ";https://github.com/liyu199809;;;;https://dr-majie.github.io/;https://gr.xjtu.edu.cn/en/web/jtao/9;https://github.com/xyh1999;", "dblp": ";;229/1317;;84/7882;62/5110-1;;;45/503.html", "google_scholar": ";;QEjqzXgAAAAJ;;;VsY24XkAAAAJ;;https://scholar.google.com.hk/citations?hl=zh-CN;", "orcid": ";;;;0000-0002-1434-837X;0000-0002-7432-3238;;;", "linkedin": ";;;;pinghui-wang-53b86818/?originalSubdomain=hk;;;;", "or_profile": "~Hongbin_Pei1;~Yu_Li30;~Huiqi_Deng1;haijingxin@stu.xjtu.edu.cn;~Pinghui_Wang1;~Jie_Ma1;~Jing_Tao2;~Yuheng_Xiong1;~Xiaohong_Guan2", "aff": ";Xi'an Jiaotong University;Xi'an jiaotong University;;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University", "aff_domain": ";xjtu.edu.cn;edu.cn;;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn", "position": ";MS student;Assistant Professor;;Full Professor;Assistant Professor;Researcher;MS student;Full Professor", "bibtex": "@inproceedings{\npei2024multitrack,\ntitle={Multi-Track Message Passing: Tackling Oversmoothing and Oversquashing in Graph Learning via Preventing Heterophily Mixing},\nauthor={Hongbin Pei and Yu Li and Huiqi Deng and Jingxin Hai and Pinghui Wang and Jie Ma and Jing Tao and Yuheng Xiong and Xiaohong Guan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1sRuv4cnuZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8030674, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2658133127792604663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";xjtu.edu.cn;edu.cn;;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "DecisionNCE: Embodied Multimodal Representations via Implicit Preference Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35129", "id": "1sesUtOIH5", "proceeding": "https://proceedings.mlr.press/v235/li24cr.html", "pdf": "https://openreview.net/pdf?id=1sesUtOIH5", "openreview": "https://openreview.net/forum?id=1sesUtOIH5", "author_site": "Jianxiong Li, Jinliang Zheng, Yinan Zheng, Liyuan Mao, Xiao Hu, Sijie Cheng, Haoyi Niu, Jihao Liu, Yu Liu, Jingjing Liu, Ya-Qin Zhang, Xianyuan Zhan", "tldr": "", "abstract": "Multimodal pretraining is an effective strategy for the trinity of goals of representation learning in autonomous robots: $1)$ extracting both local and global task progressions; $2)$ enforcing temporal consistency of visual representation; $3)$ capturing 
trajectory-level language grounding. Most existing methods approach these via separate objectives, which often reach sub-optimal solutions. In this paper, we propose a universal unified objective that can simultaneously extract meaningful task progression information from image sequences and seamlessly align it with language instructions. We discover that via implicit preferences, where a visual trajectory inherently aligns better with its corresponding language instruction than mismatched pairs, the popular Bradley-Terry model can be transformed into representation learning through proper reward reparameterizations. The resulting framework, DecisionNCE, mirrors an InfoNCE-style objective but is distinctively tailored for decision-making tasks, providing an embodied representation learning framework that elegantly extracts both local and global task progression features, with temporal consistency enforced through implicit time contrastive learning, while ensuring trajectory-level instruction grounding via multimodal joint encoding. Evaluation on both simulated and real robots demonstrates that DecisionNCE effectively facilitates diverse downstream policy learning tasks, offering a versatile solution for unified representation and reward learning. Project Page: https://2toinf.github.io/DecisionNCE/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianxiong Li;Jinliang Zheng;Yinan Zheng;Liyuan Mao;Xiao Hu;Sijie Cheng;Haoyi Niu;Jihao Liu;Yu Liu;Jingjing Liu;Ya-Qin Zhang;Xianyuan Zhan", "authorids": "~Jianxiong_Li1;~Jinliang_Zheng1;~Yinan_Zheng1;~Liyuan_Mao2;~Xiao_Hu7;~Sijie_Cheng1;~Haoyi_Niu1;~Jihao_Liu4;~Yu_Liu2;~Jingjing_Liu2;~Ya-Qin_Zhang1;~Xianyuan_Zhan1", "gender": "M;;M;F;M;M;M;;M;M;M;M", "homepage": "https://2toinf.github.io/;https://github.com/ZhengYinan-AIR;https://github.com/huxiao09;https://adacheng.github.io/;https://t6-thu.github.io;https://jihaonew.github.io/;http://liuyu.us;https://air.tsinghua.edu.cn/en/info/1046/1194.htm#:~:text=Jingjing%20Liu%20is%20Professor%2C%20Principal,CVPR%2C%20ACL%2C%20etc.);https://air.tsinghua.edu.cn/en/info/1046/1188.htm;http://zhanxianyuan.xyz/;https://facebear-ljx.github.io/;https://github.com/maoliyuan", "dblp": "156/3720.html;;19/1374;160/7320;;167/0509;97/2274-15;30/3008-1;09/2187;181/5081;43/1987;", "google_scholar": "3j5AHFsAAAAJ;;_9btJRYAAAAJ;pruwctkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;PP1HyToAAAAJ;;BzJ_GboAAAAJ;mDOMfxIAAAAJ;pDMnGloAAAAJ;TRLwpiUAAAAJ;", "orcid": "0009-0000-0605-2969;;;;0000-0002-7072-3787;;;;;0000-0002-3683-0554;;", "linkedin": ";;;;;;;jingjing-liu-65703431/;;;;", "or_profile": "~Jinliang_Zheng1;~Yinan_Zheng1;~Xiao_Hu7;~Sijie_Cheng1;~Haoyi_Niu1;~Jihao_Liu4;~Yu_Liu2;~Jingjing_Liu2;~Ya-Qin_Zhang1;~Xianyuan_Zhan1;~Li_Jianxiong1;~Liyuan_Richard_Mao1", "aff": "Sensetime Research;Tsinghua University;Tsinghua University;Tsinghua University;Institute for AI Industry Research, Tsinghua University;The Chinese University of Hong Kong;SenseTime;Tsinghua University;AIR, Tsinghua University;Tsinghua University;Tsinghua University;Shanghai Jiaotong University", "aff_domain": "sensetime.com;tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;cuhk.edu.hk;sensetime.com;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;sjtu.edu.cn", "position": "Intern;PhD student;PhD student;PhD student;Research Intern;PhD student;Principal Researcher;Full Professor;Full Professor;Associate Professor;PhD student;Undergrad student", "bibtex":
"@inproceedings{\nli2024decisionnce,\ntitle={Decision{NCE}: Embodied Multimodal Representations via Implicit Preference Learning},\nauthor={Jianxiong Li and Jinliang Zheng and Yinan Zheng and Liyuan Mao and Xiao Hu and Sijie Cheng and Haoyi Niu and Jihao Liu and Yu Liu and Jingjing Liu and Ya-Qin Zhang and Xianyuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1sesUtOIH5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10197653, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7023539500485316869&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "sensetime.com;tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;cuhk.edu.hk;sensetime.com;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;sjtu.edu.cn", "author_num": 12, "aff_unique_index": "0;1;1;1;1;2;0;1;1;1;1;3", "aff_unique_norm": "SenseTime;Tsinghua University;Chinese University of Hong Kong;Shanghai Jiao Tong University", "aff_unique_dep": "Research;;;", "aff_unique_url": "https://www.sensetime.com/;https://www.tsinghua.edu.cn;https://www.cuhk.edu.hk;https://www.sjtu.edu.cn", "aff_unique_abbr": "SenseTime;THU;CUHK;SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Pruner-Zero: Evolving Symbolic Pruning Metric From Scratch for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35128", "id": "1tRLxQzdep", "proceeding": "https://proceedings.mlr.press/v235/dong24b.html", "pdf": "https://openreview.net/pdf?id=1tRLxQzdep", "openreview": "https://openreview.net/forum?id=1tRLxQzdep", "author_site": "Peijie Dong, Lujun Li, Zhenheng Tang, Xiang Liu, Xinglin Pan, Qiang Wang, Xiaowen Chu", "tldr": "", "abstract": "Despite the remarkable capabilities, Large Language Models (LLMs) face deployment challenges due to their extensive size. Pruning methods drop a subset of weights to accelerate, but many of them require retraining, which is prohibitively expensive and computationally demanding. Recently, post-training pruning approaches introduced novel metrics, enabling the pruning of LLMs without retraining. However, these metrics require the involvement of human experts and tedious trial and error. To efficiently identify superior pruning metrics, we develop an automatic framework for searching symbolic pruning metrics using genetic programming. In particular, we devise an elaborate search space encompassing the existing pruning metrics to discover the potential symbolic pruning metric. We propose an opposing operation simplification strategy to increase the diversity of the population. In this way, Pruner-Zero allows auto-generation of symbolic pruning metrics. Based on the searched results, we explore the correlation between pruning metrics and performance after pruning and summarize some principles. Extensive experiments on LLaMA and LLaMA-2 on language modeling and zero-shot tasks demonstrate that our Pruner-Zero obtains superior performance than SOTA post-training pruning methods. 
Code at: https://github.com/pprp/Pruner-Zero.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peijie Dong;Lujun Li;Zhenheng Tang;Xiang Liu;Xinglin Pan;Qiang Wang;Xiaowen Chu", "authorids": "~Peijie_Dong1;~Lujun_Li1;~Zhenheng_Tang2;~Xiang_Liu10;~Xinglin_Pan1;~Qiang_Wang14;~Xiaowen_Chu2", "gender": "M;;;M;M;M;M", "homepage": "https://pprp.github.io;;;https://dominic789654.github.io/;http://smilelab.uestc.edu.cn/members/panxinglin/;http://faculty.hitsz.edu.cn/wangqiang;https://facultyprofiles.hkust-gz.edu.cn/faculty-personal-page/CHU-Xiaowen/xwchu", "dblp": "315/4734;;;31/5736-1;273/3352;64/5630-22;24/2536", "google_scholar": "TqS6s4gAAAAJ;;;VtK5lwUAAAAJ;e0oE3QIAAAAJ;6YzjcNgAAAAJ;https://scholar.google.com.hk/citations?user=v4rX24EAAAAJ", "orcid": "0000-0003-1952-4544;;;;;0000-0002-2986-967X;0000-0001-9745-4372", "linkedin": ";;;;;;", "or_profile": "~Peijie_Dong1;~Lujun_Li1;~Zhenheng_Tang2;~Xiang_Liu10;~Xinglin_Pan1;~Qiang_Wang14;~Xiaowen_Chu2", "aff": "The Hong Kong University of Science and Technology (Guangzhou);;;Hong Kong University of Science and Technology (Guangzhou);The Hong Kong University of Science and Technology (Guangzhou);Harbin Institute of Technology, Shenzhen;Hong Kong University of Science and Technology (Guangzhou)", "aff_domain": "connect.hkust-gz.edu.cn;;;hkust-gz.edu.cn;hkust-gz.edu.cn;hit.edu.cn;ust.hk", "position": "PhD student;;;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ndong2024prunerzero,\ntitle={Pruner-Zero: Evolving Symbolic Pruning Metric From Scratch for Large Language Models},\nauthor={Peijie Dong and Lujun Li and Zhenheng Tang and Xiang Liu and Xinglin Pan and Qiang Wang and Xiaowen Chu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1tRLxQzdep}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1695404, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4308691204959682147&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "connect.hkust-gz.edu.cn;;;hkust-gz.edu.cn;hkust-gz.edu.cn;hit.edu.cn;ust.hk", "author_num": 7, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Harbin Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;http://en.hit.edu.cn/", "aff_unique_abbr": "HKUST;HIT", "aff_campus_unique_index": "0;0;1;2;0", "aff_campus_unique": "Hong Kong SAR;Guangzhou;Shenzhen", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Not all distributional shifts are equal: Fine-grained robust conformal inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35127", "id": "1v1oFF3aw0", "proceeding": "https://proceedings.mlr.press/v235/ai24a.html", "pdf": "https://openreview.net/pdf?id=1v1oFF3aw0", "openreview": "https://openreview.net/forum?id=1v1oFF3aw0", "author_site": "Jiahao Ai, Zhimei Ren", "tldr": "", "abstract": "We introduce a fine-grained framework for uncertainty quantification of predictive models under distributional shifts. This framework distinguishes the shift in covariate distributions from that in the conditional relationship between the outcome ($Y$) and the covariates ($X$).
We propose to reweight the training samples to adjust for an identifiable shift in covariate distribution while protecting against the worst-case conditional distribution shift bounded in an $f$-divergence ball. Based on ideas from conformal inference and distributionally robust learning, we present an algorithm that outputs (approximately) valid and efficient prediction intervals in the presence of distributional shifts. As a use case, we apply the framework to sensitivity analysis of individual treatment effects with hidden confounding. The proposed methods are evaluated in simulations and four real data applications, demonstrating superior robustness and efficiency compared with existing benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiahao Ai;Zhimei Ren", "authorids": "2100010637@stu.pku.edu.cn;~Zhimei_Ren1", "gender": ";F", "homepage": ";https://zhimeir.github.io/", "dblp": ";", "google_scholar": ";X3gGi_0AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "2100010637@stu.pku.edu.cn;~Zhimei_Ren1", "aff": ";The Wharton School, University of Pennsylvania", "aff_domain": ";wharton.upenn.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nai2024not,\ntitle={Not all distributional shifts are equal: Fine-grained robust conformal inference},\nauthor={Jiahao Ai and Zhimei Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1v1oFF3aw0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1327486, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14367773599479938731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";wharton.upenn.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "The Wharton School", "aff_unique_url": "https://www.wharton.upenn.edu", "aff_unique_abbr": "UPenn Wharton", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "EquiPocket: an E(3)-Equivariant Geometric Graph Neural Network for Ligand Binding Site Prediction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35126", "id": "1vGN3CSxVs", "proceeding": "https://proceedings.mlr.press/v235/zhang24bp.html", "pdf": "https://openreview.net/pdf?id=1vGN3CSxVs", "openreview": "https://openreview.net/forum?id=1vGN3CSxVs", "author_site": "yang zhang, Zhewei Wei, Ye Yuan, Chongxuan Li, Wenbing Huang", "tldr": "", "abstract": "Predicting the binding sites of target proteins plays a fundamental role in drug discovery. Most existing deep-learning methods consider a protein as a 3D image by spatially clustering its atoms into voxels and then feed the voxelized protein into a 3D CNN for prediction. However, the CNN-based methods encounter several critical issues: 1) defective in representing irregular protein structures; 2) sensitive to rotations; 3) insufficient to characterize the protein surface; 4) unaware of protein size shift. 
To address the above issues, this work proposes EquiPocket, an E(3)-equivariant Graph Neural Network (GNN) for binding site prediction, which comprises three modules: the first one to extract local geometric information for each surface atom, the second one to model both the chemical and spatial structure of the protein, and the last one to capture the geometry of the surface via equivariant message passing over the surface atoms. We further propose a dense attention output layer to alleviate the effect incurred by variable protein size. Extensive experiments on several representative benchmarks demonstrate the superiority of our framework over the state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "yang zhang;Zhewei Wei;Ye Yuan;Chongxuan Li;Wenbing Huang", "authorids": "~yang_zhang28;~Zhewei_Wei1;~Ye_Yuan15;~Chongxuan_Li1;~Wenbing_Huang1", "gender": "M;M;;M;M", "homepage": "https://fengyuewuya.github.io/;http://weizhewei.com;;http://ml.cs.tsinghua.edu.cn/~chongxuan;https://gsai.ruc.edu.cn/english/wenbing_huang", "dblp": "06/6785-94;94/4260;;161/9965;155/3181-1.html", "google_scholar": "ObZB2jwAAAAJ;https://scholar.google.com.hk/citations?user=qZ7dj4gAAAAJ;;UKMcQn4AAAAJ;0yNkmO4AAAAJ", "orcid": ";0000-0003-3620-5086;;0000-0002-0912-9076;", "linkedin": ";;;;", "or_profile": "~yang_zhang28;~Zhewei_Wei1;~Ye_Yuan15;~Chongxuan_Li1;~Wenbing_Huang1", "aff": "Renmin University of China;Renmin University of China;;Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;;ruc.edu.cn;ruc.edu.cn", "position": "PhD student;Full Professor;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2024equipocket,\ntitle={EquiPocket: an E(3)-Equivariant Geometric Graph Neural Network for Ligand Binding Site Prediction},\nauthor={yang zhang and Zhewei Wei and Ye Yuan and Chongxuan Li and Wenbing Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1vGN3CSxVs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4171039, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14450080582761474773&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ruc.edu.cn;ruc.edu.cn;;ruc.edu.cn;ruc.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Aligned Objective for Soft-Pseudo-Label Generation in Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35125", "id": "1wzdf6NjHd", "proceeding": "https://proceedings.mlr.press/v235/xu24k.html", "pdf": "https://openreview.net/pdf?id=1wzdf6NjHd", "openreview": "https://openreview.net/forum?id=1wzdf6NjHd", "author_site": "Ning Xu, Yihao Hu, Congyu Qiao, Xin Geng", "tldr": "", "abstract": "Soft pseudo-labels, generated by the softmax predictions of the trained networks, offer a probabilistic rather than binary form, and have been shown to improve the performance of deep neural networks in supervised learning.
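For reference, a minimal sketch of how soft pseudo-labels are typically generated and consumed (a generic temperature-softmax formulation, with the temperature T an assumed knob; this is not the aligned objective proposed in the paper):

```python
import numpy as np

def softmax(z, T=1.0):
    z = z / T
    z = z - z.max(axis=1, keepdims=True)  # subtract row max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def soft_pseudo_label_loss(model_logits, generator_logits, T=2.0):
    """Cross-entropy of the predictive model against soft pseudo-labels,
    i.e. the (temperature-scaled) softmax outputs of a trained generator."""
    soft_targets = softmax(generator_logits, T)        # probabilistic targets
    log_probs = np.log(softmax(model_logits) + 1e-12)
    return -(soft_targets * log_probs).sum(axis=1).mean()
```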
Most previous methods adopt a classification loss to train a classifier as the soft-pseudo-label generator, which fails to fully exploit its potential: the classification objective is misaligned with the goal of soft-pseudo-label generation, namely capturing the knowledge in the data rather than making definitive classifications. However, manually designing an effective objective function for a soft-pseudo-label generator is challenging, primarily because datasets typically lack ground-truth soft labels, complicating the evaluation of soft pseudo-label accuracy. To deal with this problem, we propose a novel framework that alternately trains the predictive model and the soft-pseudo-label generator guided by a meta-network-parameterized objective function. The parameters of the objective function are optimized based on the feedback from both the performance of the predictive model and the soft-pseudo-label generator in the learning task. Additionally, the framework offers versatility across different learning tasks by allowing direct modifications to the task loss. Experiments on benchmark datasets validate the effectiveness of the proposed framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ning Xu;Yihao Hu;Congyu Qiao;Xin Geng", "authorids": "~Ning_Xu5;~Yihao_Hu2;~Congyu_Qiao3;~Xin_Geng1", "gender": "M;M;M;M", "homepage": "http://palm.seu.edu.cn/xuning/;;http://palm.seu.edu.cn/homepage/qiaocongyu/demo/index.html;http://palm.seu.edu.cn/xgeng/index.htm", "dblp": "04/5856-9;234/7986-4.html;277/9262;", "google_scholar": ";;;ZOCxkIcAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ning_Xu5;~Yihao_Hu2;~Congyu_Qiao3;~Xin_Geng1", "aff": "Southeast University;Southeast University;Southeast University;Southeast University, China", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "Associate Professor;MS student;PhD student;Professor", "bibtex": "@inproceedings{\nxu2024aligned,\ntitle={Aligned Objective for Soft-Pseudo-Label Generation in Supervised Learning},\nauthor={Ning Xu and Yihao Hu and Congyu Qiao and Xin Geng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1wzdf6NjHd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 684931, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16149780380437311047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Retrieval-Augmented Score Distillation for Text-to-3D Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35124", "id": "1xKgDANODx", "proceeding": "https://proceedings.mlr.press/v235/seo24a.html", "pdf": "https://openreview.net/pdf?id=1xKgDANODx", "openreview": "https://openreview.net/forum?id=1xKgDANODx", "author_site": "Junyoung Seo, Susung Hong, Wooseok Jang, In\u00e8s Hyeonsu Kim, Min-Seop Kwak, Doyup Lee, Seungryong Kim", "tldr": "", "abstract": "Text-to-3D generation has achieved significant success by incorporating powerful 2D diffusion models, but insufficient 3D prior knowledge
also leads to inconsistency in the 3D geometry. Recently, as large-scale multi-view datasets have been released, fine-tuning diffusion models on multi-view data has become the mainstream approach to the 3D inconsistency problem. However, this approach is confronted with fundamental difficulties regarding the limited quality and diversity of 3D data compared with 2D data. To sidestep these trade-offs, we explore a retrieval-augmented approach tailored for score distillation, dubbed ReDream. We postulate that both the expressiveness of 2D diffusion models and the geometric consistency of 3D assets can be fully leveraged by employing the semantically relevant assets directly within the optimization process. To this end, we introduce a novel framework for retrieval-based quality enhancement in text-to-3D generation. We leverage the retrieved asset to incorporate its geometric prior in the variational objective and adapt the diffusion model's 2D prior toward view consistency, achieving drastic improvements in both the geometry and fidelity of generated scenes. We conduct extensive experiments to demonstrate that ReDream exhibits superior quality with increased geometric consistency. Project page is available at https://ku-cvlab.github.io/ReDream/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junyoung Seo;Susung Hong;Wooseok Jang;In\u00e8s Hyeonsu Kim;Min-Seop Kwak;Doyup Lee;Seungryong Kim", "authorids": "~Junyoung_Seo1;~Susung_Hong1;~Wooseok_Jang2;~In\u00e8s_Hyeonsu_Kim1;~Min-Seop_Kwak1;~Doyup_Lee1;~Seungryong_Kim1", "gender": "M;M;;M;M;M;F", "homepage": "https://j0seo.github.io;https://susunghong.github.io/;https://github.com/woo1726;;;https://cvlab.korea.ac.kr/members/faculty;https://ines-hyeonsu-kim.github.io", "dblp": "209/9340;330/5127;30/4458;338/9125;205/2368;141/9955;367/9344", "google_scholar": "orJRvmEAAAAJ;HigIHvUAAAAJ;;;https://scholar.google.co.kr/citations?user=5rAj44kAAAAJ;cIK1hS8AAAAJ;tBcqfncAAAAJ", "orcid": ";;;;;;0009-0003-3695-0243", "linkedin": ";;;matthewmatics96;;;", "or_profile": "~Junyoung_Seo1;~Susung_Hong1;~Wooseok_Jang2;~Min-Seop_Kwak1;~Doyup_Lee1;~Seungryong_Kim1;~Hyeonsu_Kim2", "aff": "Korea University;Korea University;Korea University;Korea Advanced Institute of Science & Technology;Runway;Korea University;Korea University", "aff_domain": "korea.ac.kr;korea.ac.kr;korea.ac.kr;kaist.ac.kr;runwayml.com;korea.ac.kr;korea.ac.kr", "position": "PhD student;Undergrad student;MS student;PhD student;Researcher;Assistant Professor;Undergrad student", "bibtex": "@inproceedings{\nseo2024retrievalaugmented,\ntitle={Retrieval-Augmented Score Distillation for Text-to-3D Generation},\nauthor={Junyoung Seo and Susung Hong and Wooseok Jang and In{\\`e}s Hyeonsu Kim and Min-Seop Kwak and Doyup Lee and Seungryong Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1xKgDANODx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8635701, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9812421002074899785&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "korea.ac.kr;korea.ac.kr;korea.ac.kr;kaist.ac.kr;runwayml.com;korea.ac.kr;korea.ac.kr", "author_num": 7, "aff_unique_index": "0;0;0;1;2;0;0", "aff_unique_norm": "Korea University;Korea Advanced Institute of Science and Technology;Runway", "aff_unique_dep": ";;", "aff_unique_url":
"https://www.korea.ac.kr;https://www.kaist.ac.kr;https://www.runwayml.com", "aff_unique_abbr": "KU;KAIST;Runway", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Non-convex Stochastic Composite Optimization with Polyak Momentum", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35123", "id": "1ySQI9LE4w", "proceeding": "https://proceedings.mlr.press/v235/gao24l.html", "pdf": "https://openreview.net/pdf?id=1ySQI9LE4w", "openreview": "https://openreview.net/forum?id=1ySQI9LE4w", "author_site": "Yuan Gao, Anton Rodomanov, Sebastian Stich", "tldr": "", "abstract": "The stochastic proximal gradient method is a powerful generalization of the widely used stochastic gradient descent (SGD) method and has found numerous applications in Machine Learning. However, it is notoriously known that this method fails to converge in non-convex settings where the stochastic noise is significant (i.e. when only small or bounded batch sizes are used). In this paper, we focus on the stochastic proximal gradient method with Polyak momentum. We prove this method attains an optimal convergence rate for non-convex composite optimization problems, regardless of batch size. Additionally, we rigorously analyze the variance reduction effect of the Polyak momentum in the composite optimization setting and we show the method also converges when the proximal step can only be solved inexactly. Finally, we provide numerical experiments to validate our theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuan Gao;Anton Rodomanov;Sebastian U Stich", "authorids": "~Yuan_Gao23;~Anton_Rodomanov1;~Sebastian_U_Stich1", "gender": ";;M", "homepage": "https://cispa.de/en/people/c01yuga;;https://www.sstich.ch", "dblp": ";153/5453;04/10549", "google_scholar": ";u95GRZQAAAAJ;https://scholar.google.ch/citations?user=8l-mDfQAAAAJ", "orcid": "0009-0004-2339-1718;;", "linkedin": ";;", "or_profile": "~Yuan_Gao23;~Anton_Rodomanov1;~Sebastian_U_Stich1", "aff": "CISPA, saarland university, saarland informatics campus;CISPA;CISPA Helmholtz Center for Information Security", "aff_domain": "cispa.saarland;cispa.de;cispa.de", "position": "PhD student;Postdoc;Tenure Track Faculty", "bibtex": "@inproceedings{\ngao2024nonconvex,\ntitle={Non-convex Stochastic Composite Optimization with Polyak Momentum},\nauthor={Yuan Gao and Anton Rodomanov and Sebastian U Stich},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1ySQI9LE4w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1364999, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5498808995535781900&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cispa.saarland;cispa.de;cispa.de", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Saarland University;CISPA Helmholtz Center for Information Security", "aff_unique_dep": "CISPA;", "aff_unique_url": "https://www.uni-saarland.de;https://www.cispa.de/", "aff_unique_abbr": "Saarland U;CISPA", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation", "status": "Poster", 
"track": "main", "site": "https://icml.cc/virtual/2024/poster/35122", "id": "1zFkjbTgwC", "proceeding": "https://proceedings.mlr.press/v235/ma24a.html", "pdf": "https://openreview.net/pdf?id=1zFkjbTgwC", "openreview": "https://openreview.net/forum?id=1zFkjbTgwC", "author_site": "Xinyu Ma, Xu Chu, Zhibang Yang, Yang Lin, Xin Gao, Junfeng Zhao", "tldr": "", "abstract": "With the increasingly powerful performances and enormous scales of pretrained models, promoting parameter efficiency in fine-tuning has become a crucial need for effective and efficient adaptation to various downstream tasks. One representative line of fine-tuning methods is Orthogonal Fine-tuning (OFT), which rigorously preserves the angular distances within the parameter space to preserve the pretrained knowledge. Despite the empirical effectiveness, OFT still suffers low parameter efficiency at $\\mathcal{O}(d^2)$ and limited capability of downstream adaptation. Inspired by Givens rotation, in this paper, we proposed quasi-Givens Orthogonal Fine-Tuning (qGOFT) to address the problems. We first use $\\mathcal{O}(d)$ Givens rotations to accomplish arbitrary orthogonal transformation in $SO(d)$ with provable equivalence, reducing parameter complexity from $\\mathcal{O}(d^2)$ to $\\mathcal{O}(d)$. Then we introduce flexible norm and relative angular adjustments under soft orthogonality regularization to enhance the adaptation capability of downstream semantic deviations. Extensive experiments on various tasks and pretrained models validate the effectiveness of our methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyu Ma;Xu Chu;Zhibang Yang;Yang Lin;Xin Gao;Junfeng Zhao", "authorids": "~Xinyu_Ma3;~Xu_Chu1;~Zhibang_Yang1;~Yang_Lin2;~Xin_Gao4;~Junfeng_Zhao1", "gender": "M;;M;M;M;F", "homepage": ";;https://github.com/YZZBBB;;;https://cs.pku.edu.cn/info/1084/1224.htm", "dblp": "43/7894;;;59/5166;;72/3918-1", "google_scholar": "ygvzwbUAAAAJ;;;https://scholar.google.com.hk/citations?user=oAffgtgAAAAJ;;", "orcid": "0000-0003-4574-0830;;;;0000-0002-3479-8220;", "linkedin": ";;;;;", "or_profile": "~Xinyu_Ma3;~Xu_Chu1;~Zhibang_Yang1;~Yang_Lin2;~Xin_Gao4;~Junfeng_Zhao1", "aff": "Peking University;;South China University of Technology;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;;scut.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;;Undergrad student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nma2024parameter,\ntitle={Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation},\nauthor={Xinyu Ma and Xu Chu and Zhibang Yang and Yang Lin and Xin Gao and Junfeng Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1zFkjbTgwC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1654783, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5117467841067419865&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "pku.edu.cn;;scut.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Peking University;South China University of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.scut.edu.cn", "aff_unique_abbr": "Peking U;SCUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", 
"aff_country_unique": "China" }, { "title": "Offline Inverse RL: New Solution Concepts and Provably Efficient Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35121", "id": "23tMOWscus", "proceeding": "https://proceedings.mlr.press/v235/lazzati24a.html", "pdf": "https://openreview.net/pdf?id=23tMOWscus", "openreview": "https://openreview.net/forum?id=23tMOWscus", "author_site": "Filippo Lazzati, Mirco Mutti, Alberto Maria Metelli", "tldr": "", "abstract": "*Inverse reinforcement learning* (IRL) aims to recover the reward function of an *expert* agent from demonstrations of behavior. It is well-known that the IRL problem is fundamentally ill-posed, i.e., many reward functions can explain the demonstrations. For this reason, IRL has been recently reframed in terms of estimating the *feasible reward set* (Metelli et al., 2021), thus, postponing the selection of a single reward. However, so far, the available formulations and algorithmic solutions have been proposed and analyzed mainly for the *online* setting, where the learner can interact with the environment and query the expert at will. This is clearly unrealistic in most practical applications, where the availability of an *offline* dataset is a much more common scenario. In this paper, we introduce a novel notion of feasible reward set capturing the opportunities and limitations of the offline setting and we analyze the complexity of its estimation. This requires the introduction an original learning framework that copes with the intrinsic difficulty of the setting, for which data coverage is not under control. Then, we propose two computationally and statistically efficient algorithms, IRLO and PIRLO, for addressing the problem. In particular, the latter adopts a specific form of *pessimism* to enforce the novel, desirable property of *inclusion monotonicity* of the delivered feasible set. 
With this work, we aim to provide a panorama of the challenges of the offline IRL problem and how they can be fruitfully addressed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Filippo Lazzati;Mirco Mutti;Alberto Maria Metelli", "authorids": "~Filippo_Lazzati2;~Mirco_Mutti1;~Alberto_Maria_Metelli2", "gender": "M;;M", "homepage": "https://filippolazzati.github.io/;;https://albertometelli.github.io/", "dblp": "345/8703;222/2815;209/4941", "google_scholar": "lIf4g_IAAAAJ;GlLkJ9UAAAAJ;R31IsPwAAAAJ", "orcid": "0009-0004-2561-417X;;0000-0002-3424-5212", "linkedin": "filippo-lazzati/;;", "or_profile": "~Filippo_Lazzati2;~Mirco_Mutti1;~Alberto_Maria_Metelli2", "aff": "Politecnico di Milano;Technion - Israel Institute of Technology;Politecnico di Milano", "aff_domain": "polimi.it;technion.ac.il;polimi.it", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nlazzati2024offline,\ntitle={Offline Inverse {RL}: New Solution Concepts and Provably Efficient Algorithms},\nauthor={Filippo Lazzati and Mirco Mutti and Alberto Maria Metelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=23tMOWscus}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 791923, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7540066144713785313&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "polimi.it;technion.ac.il;polimi.it", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Politecnico di Milano;Technion - Israel Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it;https://www.technion.ac.il/en/", "aff_unique_abbr": "Polimi;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Italy;Israel" }, { "title": "Risk-Sensitive Policy Optimization via Predictive CVaR Policy Gradient", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35120", "id": "24zMewdzyJ", "proceeding": "https://proceedings.mlr.press/v235/kim24x.html", "pdf": "https://openreview.net/pdf?id=24zMewdzyJ", "openreview": "https://openreview.net/forum?id=24zMewdzyJ", "author_site": "Ju-Hyun Kim, Seungki Min", "tldr": "", "abstract": "This paper addresses a policy optimization task with the conditional value-at-risk (CVaR) objective. We introduce the *predictive CVaR policy gradient*, a novel approach that seamlessly integrates risk-neutral policy gradient algorithms with minimal modifications. Our method incorporates a reweighting strategy in gradient calculation -- individual cost terms are reweighted in proportion to their *predicted* contribution to the objective. These weights can be easily estimated through a separate learning procedure. 
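A minimal sketch of the reweighting makes this concrete (the exact CVaR gradient uses a hard tail indicator; `predicted_tail_prob`, a learned estimate that stands in for it, is an assumed placeholder):

```python
import numpy as np

def cvar_gradient_weights(costs, alpha=0.95):
    """Exact CVaR policy-gradient weights: only trajectories in the upper
    (1 - alpha) cost tail receive non-zero weight."""
    var = np.quantile(costs, alpha)                    # value-at-risk threshold
    return (costs >= var).astype(float) / (1.0 - alpha)

def predictive_weights(predicted_tail_prob, alpha=0.95):
    """Predictive variant (sketch): reweight each cost term in proportion to
    a learned estimate of its contribution to the CVaR objective."""
    return predicted_tail_prob / (1.0 - alpha)
```

The resulting weights simply multiply the individual cost terms inside an otherwise risk-neutral policy-gradient estimator, which is why the method integrates with risk-neutral algorithms with minimal modification.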
We provide theoretical and empirical analyses, demonstrating the validity and effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ju-Hyun Kim;Seungki Min", "authorids": "~Ju-Hyun_Kim1;~Seungki_Min2", "gender": "M;", "homepage": "http://do.kaist.ac.kr/;https://mskyt88.github.io/", "dblp": ";", "google_scholar": ";https://scholar.google.co.kr/citations?user=OTH0HWEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ju-Hyun_Kim1;~Seungki_Min2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkim2024risksensitive,\ntitle={Risk-Sensitive Policy Optimization via Predictive {CV}aR Policy Gradient},\nauthor={Ju-Hyun Kim and Seungki Min},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=24zMewdzyJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1331910, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zYPC8SJbW2YJ:scholar.google.com/&scioq=Risk-Sensitive+Policy+Optimization+via+Predictive+CVaR+Policy+Gradient&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "ESM All-Atom: Multi-Scale Protein Language Model for Unified Molecular Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35119", "id": "283cGgWfM2", "proceeding": "https://proceedings.mlr.press/v235/zheng24h.html", "pdf": "https://openreview.net/pdf?id=283cGgWfM2", "openreview": "https://openreview.net/forum?id=283cGgWfM2", "author_site": "Kangjie Zheng, Siyu Long, Tianyu Lu, Junwei Yang, Xinyu Dai, Ming Zhang, Zaiqing Nie, Wei-Ying Ma, Hao Zhou", "tldr": "", "abstract": "Protein language models have demonstrated significant potential in the field of protein engineering. However, current protein language models primarily operate at the residue scale, which limits their ability to provide information at the atom level. This limitation prevents us from fully exploiting the capabilities of protein language models for applications involving both proteins and small molecules. In this paper, we propose ESM-AA (ESM All-Atom), a novel approach that enables atom-scale and residue-scale unified molecular modeling. ESM-AA achieves this by pre-training on multi-scale code-switch protein sequences and utilizing a multi-scale position encoding to capture relationships among residues and atoms. Experimental results indicate that ESM-AA surpasses previous methods in protein-molecule tasks, demonstrating the full utilization of protein language models. 
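A code-switch sequence can be pictured with a toy sketch (the residue-to-atom table below is hypothetical and heavily abbreviated, and the real pre-training additionally attaches atom coordinates and multi-scale position encodings):

```python
import random

# Hypothetical, abbreviated residue -> atom decomposition, for illustration only.
RESIDUE_ATOMS = {"A": ["N", "CA", "C", "O", "CB"], "G": ["N", "CA", "C", "O"]}

def code_switch(residues, p_unzip=0.3, seed=0):
    """Mix residue-scale and atom-scale tokens in one sequence (sketch):
    each residue is randomly 'unzipped' into its constituent atoms."""
    rng = random.Random(seed)
    tokens = []
    for res in residues:
        if res in RESIDUE_ATOMS and rng.random() < p_unzip:
            tokens.extend(RESIDUE_ATOMS[res])  # atom-scale tokens
        else:
            tokens.append(res)                 # residue-scale token
    return tokens

print(code_switch("AGAG"))  # a single sequence mixing residue and atom tokens
```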
Further investigations reveal that through unified molecular modeling, ESM-AA not only gains molecular knowledge but also retains its understanding of proteins.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kangjie Zheng;Siyu Long;Tianyu Lu;Junwei Yang;Xinyu Dai;Ming Zhang;Zaiqing Nie;Wei-Ying Ma;Hao Zhou", "authorids": "~Kangjie_Zheng1;~Siyu_Long1;~Tianyu_Lu2;~Junwei_Yang2;~Xinyu_Dai1;~Ming_Zhang5;~Zaiqing_Nie2;~Wei-Ying_Ma2;~Hao_Zhou5", "gender": "M;M;M;M;F;M;M;M;M", "homepage": "https://scholar.google.com/citations?user=n8kbAwQAAAAJ&hl=en;https://longlongman.github.io;https://github.com/lutianyu21?tab=repositories;http://cs.nju.edu.cn/daixinyu;https://cs.pku.edu.cn/info/1080/1371.htm;https://air.tsinghua.edu.cn/en/info/1046/1192.htm;https://air.tsinghua.edu.cn/en/info/1046/1189.htm;https://zhouh.github.io/;https://github.com/yjwtheonly", "dblp": ";234/9275;;39/5815;73/1844-4;n/ZaiqingNie;m/WYMa.html;63/778-12;", "google_scholar": "n8kbAwQAAAAJ;aOfk1hsAAAAJ;;https://scholar.google.com/citations?hl=en;LbzoQBsAAAAJ;;SToCbu8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;kbGJGvsAAAAJ", "orcid": ";0000-0002-9944-4837;;;0000-0002-9809-3430;0000-0002-1134-2343;;;", "linkedin": ";siyulong;;;;;wei-ying-ma-16a0171/;;", "or_profile": "~Kangjie_Zheng1;~Siyu_Long1;~Tianyu_Lu2;~Xinyu_Dai1;~Ming_Zhang5;~Zaiqing_Nie2;~Wei-Ying_Ma2;~Hao_Zhou5;~junwei_yang1", "aff": "Tsinghua University;Nanjing University;Tsinghua University;Nanjing University;Peking University;Tsinghua University;Tsinghua University;Tsinghua University;Peking University", "aff_domain": "tsinghua.edu.cn;nju.edu.cn;tsinghua.edu.cn;nju.edu.cn;pku.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;pku.edu.cn", "position": "Intern;PhD student;Undergrad student;Full Professor;Full Professor;Full Professor;Full Professor;Associate Professor;PhD Student", "bibtex": "@inproceedings{\nzheng2024esm,\ntitle={{ESM} All-Atom: Multi-Scale Protein Language Model for Unified Molecular Modeling},\nauthor={Kangjie Zheng and Siyu Long and Tianyu Lu and Junwei Yang and Xinyu Dai and Ming Zhang and Zaiqing Nie and Wei-Ying Ma and Hao Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=283cGgWfM2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 755935, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17640750707427370757&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;nju.edu.cn;tsinghua.edu.cn;nju.edu.cn;pku.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;pku.edu.cn", "author_num": 9, "aff_unique_index": "0;1;0;1;2;0;0;0;2", "aff_unique_norm": "Tsinghua University;Nanjing University;Peking University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.nju.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "THU;Nanjing U;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Sliced-Wasserstein Estimation with Spherical Harmonics as Control Variates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35118", "id": "28SEr5iFyT", "proceeding": "https://proceedings.mlr.press/v235/leluc24a.html", "pdf": "https://openreview.net/pdf?id=28SEr5iFyT", "openreview": "https://openreview.net/forum?id=28SEr5iFyT", "author_site": 
"R\u00e9mi Leluc, Aymeric Dieuleveut, Fran\u00e7ois Portier, Johan Segers, Aigerim Zhuman", "tldr": "", "abstract": "The Sliced-Wasserstein (SW) distance between probability measures is defined as the average of the Wasserstein distances resulting for the associated one-dimensional projections. As a consequence, the SW distance can be written as an integral with respect to the uniform measure on the sphere and the Monte Carlo framework can be employed for calculating the SW distance. Spherical harmonics are polynomials on the sphere that form an orthonormal basis of the set of square-integrable functions on the sphere. Putting these two facts together, a new Monte Carlo method, hereby referred to as Spherical Harmonics Control Variates (SHCV), is proposed for approximating the SW distance using spherical harmonics as control variates. The resulting approach is shown to have good theoretical properties, e.g., a no-error property for Gaussian measures under a certain form of linear dependency between the variables. Moreover, an improved rate of convergence, compared to Monte Carlo, is established for general measures. The convergence analysis relies on the Lipschitz property associated to the SW integrand. Several numerical experiments demonstrate the superior performance of SHCV against state-of-the-art methods for SW distance computation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "R\u00e9mi Leluc;Aymeric Dieuleveut;Fran\u00e7ois Portier;Johan Segers;Aigerim Zhuman", "authorids": "~R\u00e9mi_Leluc1;~Aymeric_Dieuleveut1;~Fran\u00e7ois_Portier1;~Johan_Segers1;~Aigerim_Zhuman1", "gender": "M;M;M;;F", "homepage": "https://remileluc.github.io/;http://www.cmap.polytechnique.fr/~aymeric.dieuleveut/;https://sites.google.com/site/fportierwebpage/;https://perso.uclouvain.be/johan.segers/;https://uclouvain.be/en/directories/aigerim.zhuman", "dblp": "266/9621;176/5034;121/8308;;", "google_scholar": ";ge-OinUAAAAJ;https://scholar.google.fr/citations?user=8DPvcK4AAAAJ;https://scholar.google.be/citations?user=fA6rT1sAAAAJ;", "orcid": ";;;0000-0002-0444-689X;", "linkedin": ";;;;", "or_profile": "~R\u00e9mi_Leluc1;~Aymeric_Dieuleveut1;~Fran\u00e7ois_Portier1;~Johan_Segers1;~Aigerim_Zhuman1", "aff": "\u00c9cole Polytechnique (CMAP);\u00c9cole Polytechnique;Ecole Nationale de la Statistique et de l'Analyse de l'information;UCL;UCLouvain", "aff_domain": "polytechnique.fr;polytechnique.edu;ensai.fr;uclouvain.be;uclouvain.be", "position": "Postdoc;Full Professor;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nleluc2024slicedwasserstein,\ntitle={Sliced-Wasserstein Estimation with Spherical Harmonics as Control Variates},\nauthor={R{\\'e}mi Leluc and Aymeric Dieuleveut and Fran{\\c{c}}ois Portier and Johan Segers and Aigerim Zhuman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=28SEr5iFyT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2613243, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10478452476459504681&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 14, "email": "polytechnique.fr;polytechnique.edu;ensai.fr;uclouvain.be;uclouvain.be", "author_num": 5, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Ecole Polytechnique;Ecole Nationale de la Statistique et de l'Analyse de l'information;University College London;Universit\u00e9 catholique de 
Louvain", "aff_unique_dep": "CMAP;;;", "aff_unique_url": "https://www.ecp.fr;https://ensai.fr;https://www.ucl.ac.uk;https://www.uclouvain.be", "aff_unique_abbr": "\u00c9cole Polytechnique;ENSAI;UCL;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2", "aff_country_unique": "France;United Kingdom;Belgium" }, { "title": "HarmoDT: Harmony Multi-Task Decision Transformer for Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35117", "id": "2Asakozn3Z", "proceeding": "https://proceedings.mlr.press/v235/hu24d.html", "pdf": "https://openreview.net/pdf?id=2Asakozn3Z", "openreview": "https://openreview.net/forum?id=2Asakozn3Z", "author_site": "Shengchao Hu, Ziqing Fan, Li Shen, Ya Zhang, Yanfeng Wang, Dacheng Tao", "tldr": "", "abstract": "The purpose of offline multi-task reinforcement learning (MTRL) is to develop a unified policy applicable to diverse tasks without the need for online environmental interaction. Recent advancements approach this through sequence modeling, leveraging the Transformer architecture's scalability and the benefits of parameter sharing to exploit task similarities. However, variations in task content and complexity pose significant challenges in policy formulation, necessitating judicious parameter sharing and management of conflicting gradients for optimal policy performance. In this work, we introduce the Harmony Multi-Task Decision Transformer (HarmoDT), a novel solution designed to identify an optimal harmony subspace of parameters for each task. We approach this as a bi-level optimization problem, employing a meta-learning framework that leverages gradient-based techniques. The upper level of this framework is dedicated to learning a task-specific mask that delineates the harmony subspace, while the inner level focuses on updating parameters to enhance the overall performance of the unified policy. 
Empirical evaluations on a series of benchmarks demonstrate the superiority of HarmoDT, verifying the effectiveness of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengchao Hu;Ziqing Fan;Li Shen;Ya Zhang;Yanfeng Wang;Dacheng Tao", "authorids": "~Shengchao_Hu1;~Ziqing_Fan1;~Li_Shen1;~Ya_Zhang1;~Yanfeng_Wang1;~Dacheng_Tao1", "gender": ";;M;F;M;", "homepage": ";;https://sites.google.com/site/mathshenli/home;https://annzhanglion.github.io/;https://cmic.sjtu.edu.cn/wangyanfeng/;", "dblp": ";;91/3680-8;85/3714-2;55/5407-1.html;", "google_scholar": ";;yVhgENIAAAAJ;pbjw9sMAAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";;;0000-0002-5390-9053;0000-0002-3196-2347;", "linkedin": ";;;;;", "or_profile": "~Shengchao_Hu1;~Ziqing_Fan1;~Li_Shen1;~Ya_Zhang1;~Yanfeng_Wang1;~Dacheng_Tao1", "aff": ";;JD Explore Academy;Shanghai Jiaotong University;Shanghai Jiaotong University;", "aff_domain": ";;jd.com;sjtu.edu.cn;sjtu.edu.cn;", "position": ";;Researcher;Professor;Full Professor;", "bibtex": "@inproceedings{\nhu2024harmodt,\ntitle={Harmo{DT}: Harmony Multi-Task Decision Transformer for Offline Reinforcement Learning},\nauthor={Shengchao Hu and Ziqing Fan and Li Shen and Ya Zhang and Yanfeng Wang and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2Asakozn3Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 868543, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15550378485469863438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";;jd.com;sjtu.edu.cn;sjtu.edu.cn;", "author_num": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "JD;Shanghai Jiao Tong University", "aff_unique_dep": "JD Explore Academy;", "aff_unique_url": ";https://www.sjtu.edu.cn", "aff_unique_abbr": ";SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";China" }, { "title": "On the Duality Between Sharpness-Aware Minimization and Adversarial Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35116", "id": "2B2U5kkGUA", "proceeding": "https://proceedings.mlr.press/v235/zhang24w.html", "pdf": "https://openreview.net/pdf?id=2B2U5kkGUA", "openreview": "https://openreview.net/forum?id=2B2U5kkGUA", "author_site": "Yihao Zhang, Hangzhou He, Jingyu Zhu, Huanran Chen, Yifei Wang, Zeming Wei", "tldr": "", "abstract": "Adversarial Training (AT), which adversarially perturbs the input samples during training, has been acknowledged as one of the most effective defenses against adversarial attacks, yet suffers from an inevitable decrease in clean accuracy. Instead of perturbing the samples, Sharpness-Aware Minimization (SAM) perturbs the model weights during training to find a flatter loss landscape and improve generalization. However, as SAM is designed for better clean accuracy, its effectiveness in enhancing adversarial robustness remains unexplored. In this work, considering the duality between SAM and AT, we investigate the adversarial robustness derived from SAM. Intriguingly, we find that using SAM alone can improve adversarial robustness.
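For reference, a single SAM update looks as follows (a generic sketch of standard SAM, not of this paper's analysis; `loss_grad` is an assumed callable returning the gradient of the training loss with respect to the weights):

```python
import numpy as np

def sam_step(w, batch, loss_grad, lr=0.1, rho=0.05):
    """One Sharpness-Aware Minimization step (sketch). Where AT perturbs the
    *inputs*, SAM perturbs the *weights* toward the locally worst-case
    direction before taking the descent step."""
    g = loss_grad(w, batch)
    eps = rho * g / (np.linalg.norm(g) + 1e-12)  # worst-case weight perturbation
    return w - lr * loss_grad(w + eps, batch)    # descend from the perturbed point
```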
To understand this unexpected property of SAM, we first provide empirical and theoretical insights into how SAM can implicitly learn more robust features, and conduct comprehensive experiments to show that SAM can improve adversarial robustness notably without sacrificing any clean accuracy, shedding light on the potential of SAM to be a substitute for AT when clean accuracy is the higher priority. Code is available at https://github.com/weizeming/SAM_AT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihao Zhang;Hangzhou He;Jingyu Zhu;Huanran Chen;Yifei Wang;Zeming Wei", "authorids": "~Yihao_Zhang3;~Hangzhou_He1;~Jingyu_Zhu1;~Huanran_Chen1;~Yifei_Wang1;~Zeming_Wei1", "gender": "Non-Binary;M;M;M;M;M", "homepage": "https://zhang-yihao.github.io/;https://hangzhouhe.com;https://github.com/zhujingyu-zzz;https://huanranchen.github.io/;https://yifeiwang77.com;https://weizeming.github.io", "dblp": ";354/7180;;329/6558;00/555-1;276/6608", "google_scholar": ";IVvW2kMAAAAJ;BA0BaS4AAAAJ;https://scholar.google.co.jp/citations?user=QYsKXccAAAAJ;-CLy6YsAAAAJ;Kyn1zdQAAAAJ", "orcid": ";0009-0009-3050-8773;;;;", "linkedin": ";;;;;", "or_profile": "~Yihao_Zhang3;~Hangzhou_He1;~Jingyu_Zhu1;~Huanran_Chen1;~Yifei_Wang1;~Zeming_Wei1", "aff": "Peking University;Peking University;Peking University;;Massachusetts Institute of Technology;University of California, Berkeley", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;;mit.edu;berkeley.edu", "position": "Undergrad student;Undergrad student;Undergrad student;;Postdoc;Undergrad student", "bibtex": "@inproceedings{\nzhang2024on,\ntitle={On the Duality Between Sharpness-Aware Minimization and Adversarial Training},\nauthor={Yihao Zhang and Hangzhou He and Jingyu Zhu and Huanran Chen and Yifei Wang and Zeming Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2B2U5kkGUA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 444480, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7913009421384930072&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;;mit.edu;berkeley.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Peking University;Massachusetts Institute of Technology;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://web.mit.edu;https://www.berkeley.edu", "aff_unique_abbr": "Peking U;MIT;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Adaptive Horizon Actor-Critic for Policy Learning in Contact-Rich Differentiable Simulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35115", "id": "2FHWFG5ahw", "proceeding": "https://proceedings.mlr.press/v235/georgiev24a.html", "pdf": "https://openreview.net/pdf?id=2FHWFG5ahw", "openreview": "https://openreview.net/forum?id=2FHWFG5ahw", "author_site": "Ignat Georgiev, Krishnan Srinivasan, Jie Xu, Eric Heiden, Animesh Garg", "tldr": "", "abstract": "Model-Free Reinforcement Learning (MFRL), leveraging the policy gradient theorem, has demonstrated considerable success in continuous control tasks.
However, these approaches are plagued by high gradient variance due to zeroth-order gradient estimation, resulting in suboptimal policies. Conversely, First-Order Model-Based Reinforcement Learning (FO-MBRL) methods employing differentiable simulation provide gradients with reduced variance but are susceptible to sampling error in scenarios involving stiff dynamics, such as physical contact. This paper investigates the source of this error and introduces Adaptive Horizon Actor-Critic (AHAC), an FO-MBRL algorithm that reduces gradient error by adapting the model-based horizon to avoid stiff dynamics. Empirical findings reveal that AHAC outperforms MFRL baselines, attaining 40% more reward across a set of locomotion tasks and efficiently scaling to high-dimensional control environments with improved wall-clock-time efficiency. [adaptive-horizon-actor-critic.github.io](https://adaptive-horizon-actor-critic.github.io/)", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ignat Georgiev;Krishnan Srinivasan;Jie Xu;Eric Heiden;Animesh Garg", "authorids": "~Ignat_Georgiev1;~Krishnan_Srinivasan1;~Jie_Xu7;~Eric_Heiden1;~Animesh_Garg1", "gender": "M;M;M;;M", "homepage": "http://imgeorgiev.com/;http://github.com/krishpop;https://people.csail.mit.edu/jiex;https://eric-heiden.com/;http://animesh.garg.tech", "dblp": ";02/4773;37/5126-28;;123/5728", "google_scholar": "https://scholar.google.co.uk/citations?user=1Yu0vQkAAAAJ;;3Tj5lWEAAAAJ;iWmtv7gAAAAJ;zp8V7ZMAAAAJ", "orcid": ";;;;0000-0003-0482-4296", "linkedin": "imgeorgiev/;;;;animeshgarg/", "or_profile": "~Ignat_Georgiev1;~Krishnan_Srinivasan1;~Jie_Xu7;~Eric_Heiden1;~Animesh_Garg1", "aff": "Georgia Institute of Technology;Stanford University;NVIDIA;NVIDIA;NVIDIA", "aff_domain": "gatech.edu;stanford.edu;nvidia.com;nvidia.com;nvidia.com", "position": "PhD student;PhD student;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\ngeorgiev2024adaptive,\ntitle={Adaptive Horizon Actor-Critic for Policy Learning in Contact-Rich Differentiable Simulation},\nauthor={Ignat Georgiev and Krishnan Srinivasan and Jie Xu and Eric Heiden and Animesh Garg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2FHWFG5ahw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7975149, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13978332684088521869&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "gatech.edu;stanford.edu;nvidia.com;nvidia.com;nvidia.com", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Georgia Institute of Technology;Stanford University;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.stanford.edu;https://www.nvidia.com", "aff_unique_abbr": "Georgia Tech;Stanford;NVIDIA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Differentiable Partially Observable Generalized Linear Model with Forward-Backward Message Passing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35114", "id": "2FKzbEE24s", "proceeding": "https://proceedings.mlr.press/v235/li24aj.html", "pdf": "https://openreview.net/pdf?id=2FKzbEE24s", "openreview": "https://openreview.net/forum?id=2FKzbEE24s", "author_site": "Chengrui Li, Weihan Li, 
Yule Wang, Anqi Wu", "tldr": "", "abstract": "The partially observable generalized linear model (POGLM) is a powerful tool for understanding neural connectivities under the assumption of existing hidden neurons. With spike trains only recorded from visible neurons, existing works use variational inference to learn POGLM meanwhile presenting the difficulty of learning this latent variable model. There are two main issues: (1) the sampled Poisson hidden spike count hinders the use of the pathwise gradient estimator in VI; and (2) the existing design of the variational model is neither expressive nor time-efficient, which further affects the performance. For (1), we propose a new differentiable POGLM, which enables the pathwise gradient estimator, better than the score function gradient estimator used in existing works. For (2), we propose the forward-backward message-passing sampling scheme for the variational model. Comprehensive experiments show that our differentiable POGLMs with our forward-backward message passing produce a better performance on one synthetic and two real-world datasets. Furthermore, our new method yields more interpretable parameters, underscoring its significance in neuroscience.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengrui Li;Weihan Li;Yule Wang;Anqi Wu", "authorids": "~Chengrui_Li1;~Weihan_Li1;~Yule_Wang1;~Anqi_Wu3", "gender": "M;M;M;F", "homepage": "https://jerrysoybean.github.io/;https://weihanlikk.github.io/;https://yulewang97.github.io/;https://sites.google.com/view/brainml/home", "dblp": "174/4237;24/8923;;15/9453", "google_scholar": "https://scholar.google.com/citations?h;qW4_NR4AAAAJ;vqsl1YYAAAAJ;ptGYJiEAAAAJ", "orcid": "0000-0001-5947-2393;;;0000-0002-7866-9455", "linkedin": ";;yule-wang-a8002b195/;", "or_profile": "~Chengrui_Li1;~Weihan_Li1;~Yule_Wang1;~Anqi_Wu3", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nli2024a,\ntitle={A Differentiable Partially Observable Generalized Linear Model with Forward-Backward Message Passing},\nauthor={Chengrui Li and Weihan Li and Yule Wang and Anqi Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2FKzbEE24s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1922651, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17192093447935678584&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Stochastic Localization via Iterative Posterior Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35113", "id": "2Gr5wZR6uc", "proceeding": "https://proceedings.mlr.press/v235/grenioux24a.html", "pdf": "https://openreview.net/pdf?id=2Gr5wZR6uc", "openreview": "https://openreview.net/forum?id=2Gr5wZR6uc", 
"author_site": "Louis Grenioux, Maxence Noble, Marylou Gabri\u00e9, Alain Oliviero Durmus", "tldr": "", "abstract": "Building upon score-based learning, new interest in stochastic localization techniques has recently emerged. In these models, one seeks to noise a sample from the data distribution through a stochastic process, called observation process, and progressively learns a denoiser associated to this dynamics. Apart from specific applications, the use of stochastic localization for the problem of sampling from an unnormalized target density has not been explored extensively. This work contributes to fill this gap. We consider a general stochastic localization framework and introduce an explicit class of observation processes, associated with flexible denoising schedules. We provide a complete methodology, *Stochastic Localization via Iterative Posterior Sampling* (**SLIPS**), to obtain approximate samples of these dynamics, and as a by-product, samples from the target distribution. Our scheme is based on a Markov chain Monte Carlo estimation of the denoiser and comes with detailed practical guidelines. We illustrate the benefits and applicability of **SLIPS** on several benchmarks of multi-modal distributions, including Gaussian mixtures in increasing dimensions, Bayesian logistic regression and a high-dimensional field system from statistical-mechanics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Louis Grenioux;Maxence Noble;Marylou Gabri\u00e9;Alain Oliviero Durmus", "authorids": "~Louis_Grenioux1;~Maxence_Noble1;~Marylou_Gabri\u00e91;~Alain_Oliviero_Durmus1", "gender": "M;M;F;M", "homepage": "https://github.com/h2o64;https://maxencenoble.github.io/;https://marylou-gabrie.github.io/;", "dblp": "339/8821.html;306/7678;164/5772;01/11275", "google_scholar": ";4eGHx3gAAAAJ;5m1DvLwAAAAJ;", "orcid": ";;;", "linkedin": "https://linkedin.com/in/lgrenioux;maxence-noble-393588172/;;", "or_profile": "~Louis_Grenioux1;~Maxence_Noble1;~Marylou_Gabri\u00e91;~Alain_Durmus1", "aff": "\u00c9cole Polytechnique;\u00c9cole Polytechnique;\u00c9cole Polytechnique;\u00c9cole Polytechnique", "aff_domain": "polytechnique.edu;polytechnique.fr;polytechnique.edu;polytechnique.fr", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ngrenioux2024stochastic,\ntitle={Stochastic Localization via Iterative Posterior Sampling},\nauthor={Louis Grenioux and Maxence Noble and Marylou Gabri{\\'e} and Alain Oliviero Durmus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2Gr5wZR6uc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2087709, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17344046561291018738&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": "polytechnique.edu;polytechnique.fr;polytechnique.edu;polytechnique.fr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Ecole Polytechnique", "aff_unique_dep": "", "aff_unique_url": "https://www.polytechnique.edu", "aff_unique_abbr": "X", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Geometric Active Exploration in Markov Decision Processes: the Benefit of Abstraction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35112", "id": 
"2JYOxcGlRe", "proceeding": "https://proceedings.mlr.press/v235/de-santi24a.html", "pdf": "https://openreview.net/pdf?id=2JYOxcGlRe", "openreview": "https://openreview.net/forum?id=2JYOxcGlRe", "author_site": "Riccardo De Santi, Federico Arangath Joseph, Noah Liniger, Mirco Mutti, Andreas Krause", "tldr": "", "abstract": "How can a scientist use a Reinforcement Learning (RL) algorithm to design experiments over a dynamical system's state space? In the case of finite and Markovian systems, an area called *Active Exploration* (AE) relaxes the optimization problem of experiments design into Convex RL, a generalization of RL admitting a wider notion of reward. Unfortunately, this framework is currently not scalable and the potential of AE is hindered by the vastness of experiments spaces typical of scientific discovery applications. However, these spaces are often endowed with natural geometries, e.g., permutation invariance in molecular design, that an agent could leverage to improve the statistical and computational efficiency of AE. To achieve this, we bridge AE and MDP homomorphisms, which offer a way to exploit known geometric structures via abstraction. Towards this goal, we make two fundamental contributions: we extend MDP homomorphisms formalism to Convex RL, and we present, to the best of our knowledge, the first analysis that formally captures the benefit of abstraction via homomorphisms on sample efficiency. Ultimately, we propose the Geometric Active Exploration (GAE) algorithm, which we analyse theoretically and experimentally in environments motivated by problems in scientific discovery.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Riccardo De Santi;Federico Arangath Joseph;Noah Liniger;Mirco Mutti;Andreas Krause", "authorids": "~Riccardo_De_Santi1;~Federico_Arangath_Joseph1;~Noah_Liniger1;~Mirco_Mutti1;~Andreas_Krause1", "gender": "M;M;M;;M", "homepage": "http://www.riccardodesanti.com;;;;https://las.inf.ethz.ch/krausea", "dblp": "313/1635;383/2618;;222/2815;87/1831-1.html", "google_scholar": "K7qyOj0AAAAJ;https://scholar.google.com/citations?hl=it;;GlLkJ9UAAAAJ;https://scholar.google.ch/citations?user=eDHv58AAAAAJ", "orcid": ";;;;0000-0001-7260-9673", "linkedin": "riccardo-de-santi-426139135/;federico-arangath-joseph-a3096124a/;noah-liniger/;;krausea/", "or_profile": "~Riccardo_De_Santi1;~Federico_Arangath_Joseph1;~Noah_Liniger1;~Mirco_Mutti1;~Andreas_Krause1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETHZ - ETH Zurich;Technion - Israel Institute of Technology;ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;technion.ac.il;ethz.ch", "position": "PhD student;MS student;MS student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nsanti2024geometric,\ntitle={Geometric Active Exploration in Markov Decision Processes: the Benefit of Abstraction},\nauthor={Riccardo De Santi and Federico Arangath Joseph and Noah Liniger and Mirco Mutti and Andreas Krause},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2JYOxcGlRe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1035619, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17034408851358295215&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ethz.ch;ethz.ch;ethz.ch;technion.ac.il;ethz.ch", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "ETH Zurich;Technion - Israel 
Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.technion.ac.il/en/", "aff_unique_abbr": "ETHZ;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Switzerland;Israel" }, { "title": "Breaking through the learning plateaus of in-context learning in Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35111", "id": "2K87GFLYWz", "proceeding": "https://proceedings.mlr.press/v235/fu24h.html", "pdf": "https://openreview.net/pdf?id=2K87GFLYWz", "openreview": "https://openreview.net/forum?id=2K87GFLYWz", "author_site": "Jingwen Fu, Tao Yang, Yuwang Wang, Yan Lu, Nanning Zheng", "tldr": "", "abstract": "In-context learning, i.e., learning from context examples, is an impressive ability of Transformer. Training Transformers to possess this in-context learning skill is computationally intensive due to the occurrence of *learning plateaus*, which are periods within the training process where there is minimal or no enhancement in the model's in-context learning capability. To study the mechanism behind the learning plateaus, we conceptually separate a component within the model's internal representation that is exclusively affected by the model's weights. We call this the \u201cweights component\u201d, and the remainder is identified as the \u201ccontext component\u201d. By conducting meticulous and controlled experiments on synthetic tasks, we note that the persistence of learning plateaus correlates with compromised functionality of the weights component. Recognizing the impaired performance of the weights component as a fundamental behavior that drives learning plateaus, we have developed three strategies to expedite the learning of Transformers. The effectiveness of these strategies is further confirmed in natural language processing tasks. 
In conclusion, our research demonstrates the feasibility of cultivating a powerful in-context learning ability within AI systems in an eco-friendly manner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingwen Fu;Tao Yang;Yuwang Wang;Yan Lu;Nanning Zheng", "authorids": "~Jingwen_Fu1;~Tao_Yang9;~Yuwang_Wang3;~Yan_Lu7;~Nanning_Zheng1", "gender": "M;M;M;M;M", "homepage": "https://www.jw-fu.cn/;https://github.com/ThomasMrY;;https://www.microsoft.com/en-us/research/people/yanlu/;", "dblp": "247/5290;;161/2633;15/4830-1;07/256-1", "google_scholar": ";https://scholar.google.com.hk/citations?user=qT5psCEAAAAJ;;djk5l-4AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;0000-0001-5383-6424;", "linkedin": ";;;;", "or_profile": "~Jingwen_Fu1;~Tao_Yang9;~Yuwang_Wang3;~Yan_Lu7;~Nanning_Zheng1", "aff": "Microsoft;Xi'an Jiaotong University;Tsinghua University;Microsoft Research Asia;Xi'an Jiaotong University", "aff_domain": "microsoft.com;xjtu.edu.cn;tsinghua.edu.cn;microsoft.com;xjtu.edu.cn", "position": "Intern;PhD student;Researcher;Partner Research Manager;Full Professor", "bibtex": "@inproceedings{\nfu2024breaking,\ntitle={Breaking through the learning plateaus of in-context learning in Transformer},\nauthor={Jingwen Fu and Tao Yang and Yuwang Wang and Yan Lu and Nanning Zheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2K87GFLYWz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1550441, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2245772145989313546&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "microsoft.com;xjtu.edu.cn;tsinghua.edu.cn;microsoft.com;xjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "Microsoft;Xi'an Jiao Tong University;Tsinghua University", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.xjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Microsoft;XJTU;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;China" }, { "title": "Diffusion Models Demand Contrastive Guidance for Adversarial Purification to Advance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35110", "id": "2NUGeV64y2", "proceeding": "https://proceedings.mlr.press/v235/bai24b.html", "pdf": "https://openreview.net/pdf?id=2NUGeV64y2", "openreview": "https://openreview.net/forum?id=2NUGeV64y2", "author_site": "Mingyuan Bai, Wei Huang, Li Tenghui, Andong Wang, Junbin Gao, Cesar F Caiafa, Qibin Zhao", "tldr": "", "abstract": "In adversarial defense, adversarial purification can be viewed as a special generation task with the purpose of removing adversarial attacks, and diffusion models excel at adversarial purification owing to their strong generative power. For different predetermined generation requirements, various types of guidance have been proposed, but few of them focus on adversarial purification. In this work, we propose to guide diffusion models for adversarial purification using contrastive guidance. From a feature learning perspective, we theoretically derive the proper noise level to add in the forward process of diffusion models for adversarial purification.
For the reverse process, our analysis implies that the role of contrastive loss guidance is to facilitate the evolution towards the signal direction. Guided by these theoretical findings and implications, we design the forward process with the proper amount of Gaussian noise added, and the reverse process with the gradient of the contrastive loss as the guidance of diffusion models for adversarial purification. Empirically, extensive experiments on CIFAR-10, CIFAR-100, the German Traffic Sign Recognition Benchmark and ImageNet datasets with ResNet and WideResNet classifiers show that our method outperforms most current adversarial training and adversarial purification methods by a large margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingyuan Bai;Wei Huang;Tenghui Li;Andong Wang;Junbin Gao;Cesar F Caiafa;Qibin Zhao", "authorids": "~Mingyuan_Bai1;~Wei_Huang6;tenghui.li@riken.jp;~Andong_Wang1;~Junbin_Gao1;~Cesar_F_Caiafa1;~Qibin_Zhao1", "gender": "F;M;;M;;M;M", "homepage": ";https://weihuang05.github.io/;;https://www.patternrecognition.asia/wad/;https://www.sydney.edu.au/business/about/our-people/academic-staff/junbin-gao.html;http://web.fi.uba.ar/~ccaiafa/Cesar.html;https://qibinzhao.github.io", "dblp": "205/2305;81/6685-34;;190/5540;30/3983;97/2347;13/1193", "google_scholar": "https://scholar.google.co.jp/citations?user=lo0_2rMAAAAJ;RZfDh4MAAAAJ;;vuPyxGwAAAAJ;https://scholar.google.com.au/citations?user=3-KJN8IAAAAJ;https://scholar.google.ca/citations?user=yAUJHFYAAAAJ;https://scholar.google.co.jp/citations?hl=en", "orcid": "0000-0002-2454-4219;0000-0001-5674-7021;;;0000-0001-9803-0256;0000-0001-5437-6095;0000-0002-4442-3182", "linkedin": ";;;;;cesar-caiafa-phd-8b4605/;", "or_profile": "~Mingyuan_Bai1;~Wei_Huang6;tenghui.li@riken.jp;~Andong_Wang1;~Junbin_Gao1;~Cesar_F_Caiafa1;~Qibin_Zhao1", "aff": "RIKEN;RIKEN AIP;;RIKEN AIP;University of Sydney;CONICET;RIKEN", "aff_domain": "riken.jp;riken.jp;;riken.jp;sydney.edu.au;iar-conicet.gov.ar;riken.jp", "position": "Postdoc;Research Scientist;;Postdoc;Full Professor;Research Scientist;Team Leader", "bibtex": "@inproceedings{\nbai2024diffusion,\ntitle={Diffusion Models Demand Contrastive Guidance for Adversarial Purification to Advance},\nauthor={Mingyuan Bai and Wei Huang and Tenghui Li and Andong Wang and Junbin Gao and Cesar F Caiafa and Qibin Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2NUGeV64y2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 762517, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15650128776146334608&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 8, "email": "riken.jp;riken.jp;;riken.jp;sydney.edu.au;iar-conicet.gov.ar;riken.jp", "author_num": 7, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "RIKEN;University of Sydney;Consejo Nacional de Investigaciones Cient\u00edficas y T\u00e9cnicas", "aff_unique_dep": ";;", "aff_unique_url": "https://www.riken.jp;https://www.sydney.edu.au;https://www.conicet.gov.ar", "aff_unique_abbr": "RIKEN;USYD;CONICET", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2;0", "aff_country_unique": "Japan;Australia;Argentina" }, { "title": "UniCorn: A Unified Contrastive Learning Approach for Multi-view Molecular Representation Learning", "status": "Poster", "track": "main", "site":
"https://icml.cc/virtual/2024/poster/35109", "id": "2NfpFwJfKu", "proceeding": "https://proceedings.mlr.press/v235/feng24f.html", "pdf": "https://openreview.net/pdf?id=2NfpFwJfKu", "openreview": "https://openreview.net/forum?id=2NfpFwJfKu", "author_site": "Shikun Feng, Yuyan Ni, Li, Yanwen Huang, Zhiming Ma, Wei-Ying Ma, Yanyan Lan", "tldr": "", "abstract": "Recently, a noticeable trend has emerged in developing pre-trained foundation models in the domains of CV and NLP. However, for molecular pre-training, there lacks a universal model capable of effectively applying to various categories of molecular tasks, since existing prevalent pre-training methods exhibit effectiveness for specific types of downstream tasks. Furthermore, the lack of profound understanding of existing pre-training methods, including 2D graph masking, 2D-3D contrastive learning, and 3D denoising, hampers the advancement of molecular foundation models. In this work, we provide a unified comprehension of existing pre-training methods through the lens of contrastive learning. Thus their distinctions lie in clustering different views of molecules, which is shown beneficial to specific downstream tasks. To achieve a complete and general-purpose molecular representation, we propose a novel pre-training framework, named UniCorn, that inherits the merits of the three methods, depicting molecular views in three different levels. SOTA performance across quantum, physicochemical, and biological tasks, along with comprehensive ablation study, validate the universality and effectiveness of UniCorn.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikun Feng;Yuyan Ni;Minghao Li;Yanwen Huang;Zhi-Ming Ma;Wei-Ying Ma;Yanyan Lan", "authorids": "~Shikun_Feng3;~Yuyan_Ni1;~Minghao_Li8;~Yanwen_Huang2;~Zhi-Ming_Ma1;~Wei-Ying_Ma2;~Yanyan_Lan2", "gender": "M;;M;;;M;", "homepage": "https://fengshikun.github.io;https://nyyxxx.github.io/;https://github.com/limh1317;https://github.com/AnnaKhuan;http://homepage.amss.ac.cn/research/homePage/8eb59241e2e74d828fb84eec0efadba5/myHomePage.html;https://air.tsinghua.edu.cn/en/info/1046/1189.htm;", "dblp": ";117/6286;;53/6836;;m/WYMa.html;00/6040.html", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;kKz2vv9_pEoC;;SToCbu8AAAAJ;", "orcid": ";;;0009-0008-2338-4357;;;", "linkedin": ";;;;;wei-ying-ma-16a0171/;", "or_profile": "~Shikun_Feng3;~Yuyan_Ni1;~Minghao_Li8;~Yanwen_Huang2;~Zhi-Ming_Ma1;~Wei-Ying_Ma2;~Yanyan_Lan2", "aff": "Tsinghua University;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peking University;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;ucas.ac.cn;ucas.ac.cn;pku.edu.cn;amss.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;MS student;PhD student;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfeng2024unicorn,\ntitle={UniCorn: A Unified Contrastive Learning Approach for Multi-view Molecular Representation Learning},\nauthor={Shikun Feng and Yuyan Ni and Minghao Li and Yanwen Huang and Zhi-Ming Ma and Wei-Ying Ma and Yanyan Lan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2NfpFwJfKu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9280257, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 11, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=8604366396524631352&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tsinghua.edu.cn;ucas.ac.cn;ucas.ac.cn;pku.edu.cn;amss.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;2;3;0;0", "aff_unique_norm": "Tsinghua University;University of Chinese Academy of Sciences;Peking University;Chinese Academy of Sciences", "aff_unique_dep": ";;;Academy of Mathematics and Systems Science", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ucas.ac.cn;http://www.pku.edu.cn;http://www.cas.cn", "aff_unique_abbr": "THU;UCAS;Peking U;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Whispering Experts: Neural Interventions for Toxicity Mitigation in Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35108", "id": "2P6GVfSrfZ", "proceeding": "https://proceedings.mlr.press/v235/suau24a.html", "pdf": "https://openreview.net/pdf?id=2P6GVfSrfZ", "openreview": "https://openreview.net/forum?id=2P6GVfSrfZ", "author_site": "Xavi Suau, Pieter Delobelle, Katherine Metcalf, Armand Joulin, Nicholas Apostoloff, Luca Zappella, Pau Rodriguez", "tldr": "", "abstract": "An important issue with Large Language Models (LLMs) is their undesired ability to generate toxic language. In this work, we show that the neurons responsible for toxicity can be determined by their power to discriminate toxic sentences, and that toxic language can be mitigated by reducing their activation levels proportionally to this power. We propose AUROC adaptation (AurA), an intervention that can be applied to any pre-trained LLM to mitigate toxicity. As the intervention is proportional to the ability of each neuron to discriminate toxic content, it is free of any model-dependent hyperparameters. We show that AurA can achieve up to $2.2\\times$ reduction in toxicity with only a $0.72$ perplexity increase. We also show that AurA is effective with models of different scale (from 1.5B to 40B parameters), and its effectiveness in mitigating toxic language, while preserving common-sense zero-shot abilities, holds across all scales. AurA can be combined with pre-prompting strategies, boosting its average mitigation potential from $1.28\\times$ to $2.35\\times$. 
Moreover, AurA can counteract adversarial pre-prompts that maliciously elicit toxic content, making it an effective method for deploying safer and less toxic models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xavier Suau;Pieter Delobelle;Katherine Metcalf;Armand Joulin;Nicholas Apostoloff;Luca Zappella;Pau Rodriguez", "authorids": "~Xavier_Suau1;~Pieter_Delobelle1;~Katherine_Metcalf1;~Armand_Joulin2;~Nicholas_Apostoloff1;~Luca_Zappella1;~Pau_Rodriguez2", "gender": "M;;M;;M;;", "homepage": ";https://people.cs.kuleuven.be/~pieter.delobelle/;;;http://www.cis.jhu.edu/~luca/;;https://prlz77.github.io", "dblp": "21/8106;245/8739;;92/3793;38/2520;141/6401;190/7735", "google_scholar": ";https://scholar.google.be/citations?user=MVjJgxAAAAAJ;;p4w7a_kAAAAJ;bmh6mxAAAAAJ;V7baeTMAAAAJ;https://scholar.google.es/citations?user=IwBx73wAAAAJ", "orcid": ";0000-0001-5911-5310;;;;;0000-0002-1689-8084", "linkedin": ";;;apostoloff/;zappella?trk=people-guest_profile-result-card_result-card_full-click;;", "or_profile": "~Xavier_Suau1;~Pieter_Delobelle1;~Armand_Joulin2;~Nicholas_Apostoloff1;~Luca_Zappella1;~Rin_Metcalf1;~Pau_Rodriguez_Lopez1", "aff": "Apple;KU Leuven, KU Leuven;Meta Facebook;Apple;Apple;Apple;Apple", "aff_domain": "apple.com;cs.kuleuven.be;fb.com;apple.com;apple.com;apple.com;apple.com", "position": "Research scientist;Postdoc;Associate Professor;Principal Researcher;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nsuau2024whispering,\ntitle={Whispering Experts: Neural Interventions for Toxicity Mitigation in Language Models},\nauthor={Xavier Suau and Pieter Delobelle and Katherine Metcalf and Armand Joulin and Nicholas Apostoloff and Luca Zappella and Pau Rodriguez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2P6GVfSrfZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1139623, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1175978100035569770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "apple.com;cs.kuleuven.be;fb.com;apple.com;apple.com;apple.com;apple.com", "author_num": 7, "aff_unique_index": "0;1;2;0;0;0;0", "aff_unique_norm": "Apple;KU Leuven;Meta", "aff_unique_dep": "Apple Inc.;;Meta Platforms, Inc.", "aff_unique_url": "https://www.apple.com;https://www.kuleuven.be;https://meta.com", "aff_unique_abbr": "Apple;KU Leuven;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United States;Belgium" }, { "title": "Efficient PAC Learnability of Dynamical Systems Over Multilayer Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35107", "id": "2PVjIQdq7N", "proceeding": "https://proceedings.mlr.press/v235/qiu24a.html", "pdf": "https://openreview.net/pdf?id=2PVjIQdq7N", "openreview": "https://openreview.net/forum?id=2PVjIQdq7N", "author_site": "Zirou Qiu, Abhijin Adiga, Madhav Marathe, S. S. Ravi, Daniel Rosenkrantz, Richard Stearns, Anil Vullikanti", "tldr": "", "abstract": "Networked dynamical systems are widely used as formal models of real-world cascading phenomena, such as the spread of diseases and information. Prior research has addressed the problem of learning the behavior of an unknown dynamical system when the underlying network has a single layer. 
In this work, we study the learnability of dynamical systems over multilayer networks, which are more realistic and challenging. First, we present an efficient PAC learning algorithm with provable guarantees to show that the learner only requires a small number of training examples to infer an unknown system. We further provide a tight analysis of the Natarajan dimension, which measures the model complexity. Asymptotically, our bound on the Natarajan dimension is tight for almost all multilayer graphs. The techniques and insights from our work provide the theoretical foundations for future investigations of learning problems for multilayer dynamical systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zirou Qiu;Abhijin Adiga;Madhav Marathe;S. S. Ravi;Daniel Rosenkrantz;Richard Stearns;Anil Kumar Vullikanti", "authorids": "~Zirou_Qiu1;~Abhijin_Adiga1;~Madhav_V._Marathe1;~S._S._Ravi1;~Daniel_Rosenkrantz1;~Richard_Stearns1;~Anil_Vullikanti1", "gender": "M;M;M;M;M;M;M", "homepage": "https://zirouq.me;https://biocomplexity.virginia.edu/our-team/abhijin-adiga;https://biocomplexity.virginia.edu/person/madhav-marathe;;https://biocomplexity.virginia.edu/person/daniel-j-rosenkrantz;https://biocomplixity.Virginia.edu/person/Richard-e-stearns;https://engineering.virginia.edu/faculty/anil-vullikanti", "dblp": "254/1088.html;73/8044;52/991.html;94/2464;80/6105;;89/7912", "google_scholar": ";qaLuzFIAAAAJ;diIore8AAAAJ;XG2aNyYAAAAJ;RNHJqvcAAAAJ;;MNJ-E9UAAAAJ", "orcid": ";0000-0002-9770-034X;0000-0003-1653-0658;;;;0000-0002-8597-6197", "linkedin": ";abhijin-adiga-1b4b36128/;madhav-marathe-1426826/;;;;", "or_profile": "~Zirou_Qiu1;~Abhijin_Adiga1;~Madhav_V._Marathe1;~S._S._Ravi1;~Daniel_Rosenkrantz1;~Richard_Stearns1;~Anil_Vullikanti1", "aff": "University of Virginia, Charlottesville;University of Virginia, Charlottesville;University of Virginia, Charlottesville;University of Virginia;University of Virginia, Charlottesville;;University of Virginia", "aff_domain": "virginia.edu;virginia.edu;virginia.edu;virginia.edu;virginia.edu;;virginia.edu", "position": "PhD student;Associate Professor;Full Professor;Research Professor;Full Professor;;Professor", "bibtex": "@inproceedings{\nqiu2024efficient,\ntitle={Efficient {PAC} Learnability of Dynamical Systems Over Multilayer Networks},\nauthor={Zirou Qiu and Abhijin Adiga and Madhav Marathe and S. S.
Ravi and Daniel Rosenkrantz and Richard Stearns and Anil Kumar Vullikanti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2PVjIQdq7N}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1081282, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10778292781607336792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "virginia.edu;virginia.edu;virginia.edu;virginia.edu;virginia.edu;;virginia.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Charlottesville;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Human Alignment of Large Language Models through Online Preference Optimisation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35106", "id": "2RQqg2Y7Y6", "proceeding": "https://proceedings.mlr.press/v235/calandriello24a.html", "pdf": "https://openreview.net/pdf?id=2RQqg2Y7Y6", "openreview": "https://openreview.net/forum?id=2RQqg2Y7Y6", "author_site": "Daniele Calandriello, Zhaohan Guo, REMI MUNOS, Mark Rowland, Yunhao Tang, Bernardo Avila Pires, Pierre Richemond, Charline Le Lan, Michal Valko, Tianqi Liu, Rishabh Joshi, Zeyu Zheng, Bilal Piot", "tldr": "", "abstract": "Ensuring alignment of language models' outputs with human preferences is critical to guarantee a useful, safe, and pleasant user experience. Thus, human alignment has been extensively studied recently, and several methods such as Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimisation (DPO) and Sequence Likelihood Calibration (SLiC) have emerged. In this paper, our contribution is two-fold. First, we show the equivalence between two recent alignment methods, namely Identity Policy Optimisation (IPO) and Nash Mirror Descent (Nash-MD). Second, we introduce a generalisation of IPO, named IPO-MD, that leverages the regularised sampling approach proposed by Nash-MD. This equivalence may seem surprising at first sight, since IPO is an offline method whereas Nash-MD is an online method using a preference model. However, this equivalence can be proven when we consider the online version of IPO, that is, when both generations are sampled by the online policy and annotated by a trained preference model. Optimising the IPO loss with such a stream of data then becomes equivalent to finding the Nash equilibrium of the preference model through self-play. Building on this equivalence, we introduce the IPO-MD algorithm that generates data with a mixture policy (between the online and reference policy), similarly to the general Nash-MD algorithm.
We compare online-IPO and IPO-MD to different online versions of existing losses on preference data such as DPO and SLiC on a summarisation task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniele Calandriello;Zhaohan Daniel Guo;Remi Munos;Mark Rowland;Yunhao Tang;Bernardo Avila Pires;Pierre Harvey Richemond;Charline Le Lan;Michal Valko;Tianqi Liu;Rishabh Joshi;Zeyu Zheng;Bilal Piot", "authorids": "~Daniele_Calandriello1;~Zhaohan_Daniel_Guo1;~Remi_Munos1;~Mark_Rowland1;~Yunhao_Tang1;~Bernardo_Avila_Pires1;~Pierre_Harvey_Richemond1;~Charline_Le_Lan2;~Michal_Valko1;~Tianqi_Liu1;~Rishabh_Joshi1;~Zeyu_Zheng1;~Bilal_Piot1", "gender": "M;M;M;M;M;M;M;F;M;M;M;M;M", "homepage": ";;http://researchers.lille.inria.fr/~munos/;http://sites.google.com/view/markrowland;https://robintyh1.github.io;;;http://csml.stats.ox.ac.uk/people/lelan/;https://misovalko.github.io/research.html;;http://rishabhjoshi.github.io;http://www-personal.umich.edu/~zeyu/;", "dblp": "129/1542;160/9943;69/6815;86/4090;210/2229;124/8971;200/8842;234/9001;03/5455;134/5653-2;228/5645;48/7883;", "google_scholar": ";fxr_9oQAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=-0U84zMAAAAJ;;WpAH4iUAAAAJ;;3geG4OkAAAAJ;jrazNCQAAAAJ;pUKhiMIAAAAJ;https://scholar.google.co.in/citations?user=vu2pNVAAAAAJ;;https://scholar.google.fr/citations?user=fqxNUREAAAAJ", "orcid": ";;;;;;;;;;;;", "linkedin": ";;;;;;;;michalvalko/;;joshi-rishabh/;;", "or_profile": "~Daniele_Calandriello1;~Zhaohan_Daniel_Guo1;~Remi_Munos1;~Mark_Rowland1;~Yunhao_Tang1;~Bernardo_Avila_Pires1;~Pierre_Harvey_Richemond1;~Charline_Le_Lan2;~Michal_Valko1;~Tianqi_Liu1;~Rishabh_Joshi1;~Zeyu_Zheng1;~Bilal_Piot1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Imperial College London;;Meta;Google DeepMind;Google;Google DeepMind;University Lille", "aff_domain": "deepmind.com;deepmind.com;google.com;google.com;deepmind.com;google.com;imperial.ac.uk;;meta.com;google.com;google.com;deepmind.com;univ-lille1.fr", "position": "Researcher;Research Scientist;Research scientist;Research Scientist;Research Scientist;Research Scientist;Visiting Researcher;;Principal Researcher;Software Engineer;Researcher;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\ncalandriello2024human,\ntitle={Human Alignment of Large Language Models through Online Preference Optimisation},\nauthor={Daniele Calandriello and Zhaohan Daniel Guo and Remi Munos and Mark Rowland and Yunhao Tang and Bernardo Avila Pires and Pierre Harvey Richemond and Charline Le Lan and Michal Valko and Tianqi Liu and Rishabh Joshi and Zeyu Zheng and Bilal Piot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2RQqg2Y7Y6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 789606, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8146124506374065567&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "deepmind.com;deepmind.com;google.com;google.com;deepmind.com;google.com;imperial.ac.uk;;meta.com;google.com;google.com;deepmind.com;univ-lille1.fr", "author_num": 13, "aff_unique_index": "0;0;0;0;0;0;1;2;0;0;0;3", "aff_unique_norm": "Google;Imperial College London;Meta;University of Lille", "aff_unique_dep": "Google DeepMind;;Meta Platforms, Inc.;", "aff_unique_url": 
"https://deepmind.com;https://www.imperial.ac.uk;https://meta.com;https://www.univ-lille.fr", "aff_unique_abbr": "DeepMind;ICL;Meta;ULille", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;1;0;2", "aff_country_unique": "United Kingdom;United States;France" }, { "title": "A General Theory for Softmax Gating Multinomial Logistic Mixture of Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35105", "id": "2Sl0lPF6ka", "proceeding": "https://proceedings.mlr.press/v235/nguyen24b.html", "pdf": "https://openreview.net/pdf?id=2Sl0lPF6ka", "openreview": "https://openreview.net/forum?id=2Sl0lPF6ka", "author_site": "Huy Nguyen, Pedram Akbarian, TrungTin Nguyen, Nhat Ho", "tldr": "", "abstract": "Mixture-of-experts (MoE) model incorporates the power of multiple submodels via gating functions to achieve greater performance in numerous regression and classification applications. From a theoretical perspective, while there have been previous attempts to comprehend the behavior of that model under the regression settings through the convergence analysis of maximum likelihood estimation in the Gaussian MoE model, such analysis under the setting of a classification problem has remained missing in the literature. We close this gap by establishing the convergence rates of density estimation and parameter estimation in the softmax gating multinomial logistic MoE model. Notably, when part of the expert parameters vanish, these rates are shown to be slower than polynomial rates owing to an inherent interaction between the softmax gating and expert functions via partial differential equations. To address this issue, we propose using a novel class of modified softmax gating functions which transform the input before delivering them to the gating functions. 
As a result, the previous interaction disappears and the parameter estimation rates are significantly improved.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huy Nguyen;Pedram Akbarian;TrungTin Nguyen;Nhat Ho", "authorids": "~Huy_Nguyen5;~Pedram_Akbarian1;~TrungTin_Nguyen1;~Nhat_Ho1", "gender": "M;M;M;M", "homepage": "https://huynm99.github.io/;https://pedakb.github.io/;https://trung-tinnguyen.github.io/;https://nhatptnk8912.github.io/", "dblp": "48/6075;358/2800;275/3643;203/4479", "google_scholar": "_YYwzhQAAAAJ;eg68QWIAAAAJ;NhiJDJsAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;0000-0001-8433-5980;", "linkedin": "huy-nguyen-081199/;;trungtinnguyen0/;nhat-pham-minh-ho-267b8164/", "or_profile": "~Huy_Nguyen5;~Pedram_Akbarian1;~TrungTin_Nguyen1;~Nhat_Ho1", "aff": "Microsoft AI;University of Texas at Austin;The University of Queensland;University of Texas, Austin", "aff_domain": "microsoft.com;utexas.edu;uq.edu.au;utexas.edu", "position": "Intern;PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2024a,\ntitle={A General Theory for Softmax Gating Multinomial Logistic Mixture of Experts},\nauthor={Huy Nguyen and Pedram Akbarian and TrungTin Nguyen and Nhat Ho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2Sl0lPF6ka}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 733304, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3041725821980088575&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": "microsoft.com;utexas.edu;uq.edu.au;utexas.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Microsoft;University of Texas at Austin;University of Queensland", "aff_unique_dep": "Microsoft AI;;", "aff_unique_url": "https://www.microsoft.com;https://www.utexas.edu;https://www.uq.edu.au", "aff_unique_abbr": "Microsoft;UT Austin;UQ", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Australia" }, { "title": "Explaining Graph Neural Networks via Structure-aware Interaction Index", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35104", "id": "2T00oYk54P", "proceeding": "https://proceedings.mlr.press/v235/bui24b.html", "pdf": "https://openreview.net/pdf?id=2T00oYk54P", "openreview": "https://openreview.net/forum?id=2T00oYk54P", "author_site": "Ngoc Bui, Trung Hieu Nguyen, Viet Anh Nguyen, ZHITAO YING", "tldr": "", "abstract": "The Shapley value is a prominent tool for interpreting black-box machine learning models thanks to its strong theoretical foundation. However, for models with structured inputs, such as graph neural networks, existing Shapley-based explainability approaches either focus solely on node-wise importance or neglect the graph structure when perturbing the input instance. This paper introduces the Myerson-Taylor interaction index that internalizes the graph structure into attributing the node values and the interaction values among nodes. Unlike the Shapley-based methods, the Myerson-Taylor index decomposes coalitions into components satisfying a pre-chosen connectivity criterion. We prove that the Myerson-Taylor index is the unique one that satisfies a system of five natural axioms accounting for graph structure and high-order interaction among nodes. 
Leveraging these properties, we propose Myerson-Taylor Structure-Aware Graph Explainer (MAGE), a novel explainer that uses the second-order Myerson-Taylor index to identify the most important motifs influencing the model prediction, both positively and negatively. Extensive experiments on various graph datasets and models demonstrate that our method consistently provides superior subgraph explanations compared to state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ngoc Bui;Hieu Trung Nguyen;Viet Anh Nguyen;Rex Ying", "authorids": "~Ngoc_Bui1;~Hieu_Trung_Nguyen2;~Viet_Anh_Nguyen2;~Zhitao_Ying1", "gender": "M;M;M;M", "homepage": "http://ngocbh.github.io;;http://www.vietanhnguyen.net;https://www.cs.yale.edu/homes/ying-rex", "dblp": "312/6811;;;209/4936", "google_scholar": ";OlFCFKgAAAAJ;3iyf-EoAAAAJ;6fqNXooAAAAJ", "orcid": ";;;", "linkedin": ";hieu-nguyen-08774317a/;;rex-ying-92770148/", "or_profile": "~Ngoc_Bui1;~Hieu_Trung_Nguyen2;~Viet_Anh_Nguyen2;~Zhitao_Ying1", "aff": "Yale University;The Chinese University of Hong Kong;The Chinese University of Hong Kong;Yale University", "aff_domain": "yale.edu;cuhk.edu.hk;cuhk.edu.hk;yale.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nbui2024explaining,\ntitle={Explaining Graph Neural Networks via Structure-aware Interaction Index},\nauthor={Ngoc Bui and Hieu Trung Nguyen and Viet Anh Nguyen and Rex Ying},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2T00oYk54P}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6780997, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7778797873453224714&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "yale.edu;cuhk.edu.hk;cuhk.edu.hk;yale.edu", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Yale University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "Yale;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;China" }, { "title": "Implicit Representations via Operator Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35103", "id": "2W3KUAaZgO", "proceeding": "https://proceedings.mlr.press/v235/pal24a.html", "pdf": "https://openreview.net/pdf?id=2W3KUAaZgO", "openreview": "https://openreview.net/forum?id=2W3KUAaZgO", "author_site": "Sourav Pal, Harshavardhan Adepu, Clinton Wang, Polina Golland, Vikas Singh", "tldr": "", "abstract": "The idea of representing a signal as the weights of a neural network, called *Implicit Neural Representations* (INRs), has led to exciting implications for compression, view synthesis and 3D volumetric data understanding. One problem in this setting pertains to the use of INRs for downstream processing tasks. Despite some conceptual results, this remains challenging because the INR for a given image/signal often exists in isolation. What does the neighborhood around a given INR correspond to? Based on this question, we offer an operator theoretic reformulation of the INR model, which we call Operator INR (or O-INR). 
At a high level, instead of mapping positional encodings to a signal, O-INR maps one function space to another function space. A practical form of this general casting is obtained by appealing to Integral Transforms. The resultant model does not need multi-layer perceptrons (MLPs), used in most existing INR models -- we show that convolutions are sufficient and offer benefits including numerically stable behavior. We show that O-INR can easily handle most problem settings in the literature, and offers a similar performance profile as baselines. These benefits come with minimal, if any, compromise. Our code is available at https://github.com/vsingh-group/oinr.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sourav Pal;Harshavardhan Adepu;Clinton Wang;Polina Golland;Vikas Singh", "authorids": "~Sourav_Pal1;~Harshavardhan_Adepu1;~Clinton_Wang1;~Polina_Golland1;~Vikas_Singh1", "gender": "M;M;M;;M", "homepage": ";https://harshauwm163.github.io/;https://clintonjwang.github.io/;https://people.csail.mit.edu/polina;http://vsingh-www.cs.wisc.edu/", "dblp": "19/5611;372/1508;239/1852.html;g/PolinaGolland;", "google_scholar": "672qcz0AAAAJ;dOUqv1AAAAAJ;7ICTJmoAAAAJ;;d32BmwcAAAAJ", "orcid": ";;;;", "linkedin": "souravpalkgp/;adepu-harshavardhan-9a7006ba/;;;", "or_profile": "~Sourav_Pal1;~Harshavardhan_Adepu1;~Clinton_Wang1;~Polina_Golland1;~Vikas_Singh1", "aff": "University of Wisconsin, Madison;Google;Massachusetts Institute of Technology;Massachusetts Institute of Technology;University of Wisconsin, Madison", "aff_domain": "wisc.edu;google.com;mit.edu;mit.edu;wisc.edu", "position": "PhD student;Intern;PhD student;Full Professor;Professor", "bibtex": "@inproceedings{\npal2024implicit,\ntitle={Implicit Representations via Operator Learning},\nauthor={Sourav Pal and Harshavardhan Adepu and Clinton Wang and Polina Golland and Vikas Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2W3KUAaZgO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6842356, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1349359679764205752&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "wisc.edu;google.com;mit.edu;mit.edu;wisc.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Wisconsin;Google;Massachusetts Institute of Technology", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.wisc.edu;https://www.google.com;https://web.mit.edu", "aff_unique_abbr": "UW;Google;MIT", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Madison;Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online conformal prediction with decaying step sizes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35102", "id": "2XkRIijUKw", "proceeding": "https://proceedings.mlr.press/v235/angelopoulos24a.html", "pdf": "https://openreview.net/pdf?id=2XkRIijUKw", "openreview": "https://openreview.net/forum?id=2XkRIijUKw", "author_site": "Anastasios Angelopoulos, Rina Barber, Stephen Bates", "tldr": "", "abstract": "We introduce a method for online conformal prediction with decaying step sizes. Like previous methods, ours possesses a retrospective guarantee of coverage for arbitrary sequences. 
However, unlike previous methods, we can simultaneously estimate a population quantile when it exists. Our theory and experiments indicate substantially improved practical properties: in particular, when the distribution is stable, the coverage is close to the desired level *for every time point*, not just on average over the observed sequence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anastasios Nikolas Angelopoulos;Rina Barber;Stephen Bates", "authorids": "~Anastasios_Nikolas_Angelopoulos1;~Rina_Barber1;~Stephen_Bates1", "gender": "M;F;", "homepage": "http://angelopoulos.ai;http://www.stat.uchicago.edu/~rina;https://stephenbates19.github.io/", "dblp": ";;", "google_scholar": "nfX25MMAAAAJ;;", "orcid": ";;0000-0002-3273-8179", "linkedin": "anastasiosa/;;", "or_profile": "~Anastasios_Nikolas_Angelopoulos1;~Rina_Barber1;~Stephen_Bates1", "aff": "University of California, Berkeley;University of Chicago;Massachusetts Institute of Technology", "aff_domain": "berkeley.edu;uchicago.edu;mit.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nangelopoulos2024online,\ntitle={Online conformal prediction with decaying step sizes},\nauthor={Anastasios Nikolas Angelopoulos and Rina Barber and Stephen Bates},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2XkRIijUKw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 742150, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16126586795309519564&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "berkeley.edu;uchicago.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;University of Chicago;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.uchicago.edu;https://web.mit.edu", "aff_unique_abbr": "UC Berkeley;UChicago;MIT", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Revisiting the Power of Prompt for Visual Tuning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35101", "id": "2Y93PtAqCl", "proceeding": "https://proceedings.mlr.press/v235/wang24i.html", "pdf": "https://openreview.net/pdf?id=2Y93PtAqCl", "openreview": "https://openreview.net/forum?id=2Y93PtAqCl", "author_site": "Yuzhu Wang, Lechao Cheng, Chaowei Fang, Dingwen Zhang, Manni Duan, Meng Wang", "tldr": "", "abstract": "Visual prompt tuning (VPT) is a promising solution incorporating learnable prompt tokens to customize pre-trained models for downstream tasks. However, VPT and its variants often encounter challenges like prompt initialization, prompt length, and subpar performance in self-supervised pretraining, hindering successful contextual adaptation. This study commences by exploring how the correlation between prompt tokens and patch tokens evolves over the course of successful training. Inspired by the observation that the prompt tokens tend to share high mutual information with patch tokens, we propose initializing prompts with downstream token prototypes. The strategic initialization, a stand-in for the previous initialization, substantially improves performance.
To refine further, we optimize token construction with a streamlined pipeline that maintains excellent performance with almost no increase in computational expense compared to VPT. Exhaustive experiments show that our proposed approach outperforms existing methods by a remarkable margin. For instance, after MAE pre-training, our method improves accuracy by up to 10%$\\sim$30% compared to VPT, and outperforms full fine-tuning in 19 out of 24 cases while using less than 0.4% of learnable parameters. Moreover, the experimental results demonstrate that the proposed SPT is robust to prompt lengths and scales well with model capacity and training data size. We finally provide an insightful exploration into the amount of target data that facilitates the adaptation of pre-trained models to downstream tasks. The code is available at https://github.com/WangYZ1608/Self-Prompt-Tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuzhu Wang;Lechao Cheng;Chaowei Fang;Dingwen Zhang;Manni Duan;Meng Wang", "authorids": "~Yuzhu_Wang1;~Lechao_Cheng2;~Chaowei_Fang3;~Dingwen_Zhang1;~Manni_Duan2;~Meng_Wang2", "gender": ";M;M;;F;", "homepage": ";https://faculty.hfut.edu.cn/ChengLechao;https://chaoweifang.github.io/;;;", "dblp": ";165/9781;159/1655;;03/3126;", "google_scholar": "ApZFks8AAAAJ;PKFAv-cAAAAJ;eNtYEmcAAAAJ;;lIlNBQoAAAAJ;", "orcid": "0009-0005-8827-3779;0000-0002-7546-9052;;;;", "linkedin": ";;;;;", "or_profile": "~Yuzhu_Wang1;~Lechao_Cheng2;~Chaowei_Fang3;~Dingwen_Zhang1;~Manni_Duan2;~Meng_Wang2", "aff": "Zhejiang lab;Hefei University of Technology;Xidian University;;Zhejiang Lab;", "aff_domain": "zhejianglab.com;hfut.edu.cn;xidian.edu.cn;;zhejianglab.com;", "position": "Algorithm Engineer;Associate Professor;Associate Professor;;Researcher;", "bibtex": "@inproceedings{\nwang2024revisiting,\ntitle={Revisiting the Power of Prompt for Visual Tuning},\nauthor={Yuzhu Wang and Lechao Cheng and Chaowei Fang and Dingwen Zhang and Manni Duan and Meng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2Y93PtAqCl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1346344, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8928355307599028453&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "zhejianglab.com;hfut.edu.cn;xidian.edu.cn;;zhejianglab.com;", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Zhejiang University;Hefei University of Technology;Xidian University;Zhejiang Lab", "aff_unique_dep": "Zhejiang Lab;;;", "aff_unique_url": "http://www.zju.edu.cn;http://www.hfut.edu.cn/;http://www.xidian.edu.cn/;http://www.zhejianglab.com", "aff_unique_abbr": "ZJU;HUT;Xidian;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Prompt Sketching for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35100", "id": "2Yu5FWdzde", "proceeding": "https://proceedings.mlr.press/v235/beurer-kellner24b.html", "pdf": "https://openreview.net/pdf?id=2Yu5FWdzde", "openreview": "https://openreview.net/forum?id=2Yu5FWdzde", "author_site": "Luca Beurer-Kellner, Mark M\u00fcller, Marc Fischer, Martin Vechev", "tldr": "", "abstract": "Many recent prompting strategies for large language models (LLMs) query the model multiple times sequentially -- first to
produce intermediate results and then the final answer. However, using these methods, both decoder and model are unaware of potential follow-up prompts, leading to disconnected and undesirably wordy intermediate responses. In this work, we address this issue by proposing prompt sketching, a new prompting paradigm in which an LLM does not only respond by completing a prompt, but by predicting values for multiple variables in a template. This way, sketching grants users more control over the generation process, e.g., by providing a reasoning framework via intermediate instructions, leading to better overall results. The key idea enabling sketching with existing, autoregressive models is to adapt the decoding procedure to also score follow-up instructions during text generation, thus optimizing overall template likelihood in inference. Our experiments show that in a zero-shot setting, prompt sketching outperforms existing, sequential prompting schemes such as direct asking or chain-of-thought on 7 out of 8 LLM benchmarking tasks, including state tracking, arithmetic reasoning, and general question answering. To facilitate future use, we release a number of generic, yet effective sketches applicable to many tasks, and an open source library called dclib, powering our sketch-aware decoders as part of https://github.com/eth-sri/lmql.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Beurer-Kellner;Mark Niklas Mueller;Marc Fischer;Martin Vechev", "authorids": "~Luca_Beurer-Kellner1;~Mark_Niklas_Mueller2;~Marc_Fischer1;~Martin_Vechev1", "gender": "M;M;M;M", "homepage": ";https://www.sri.inf.ethz.ch/people/mark;;https://www.sri.inf.ethz.ch/people/martin", "dblp": "314/2627;287/4254;37/9373-2;93/2189.html", "google_scholar": "https://scholar.google.com/citations?hl=de;RBpmcCAAAAAJ;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";0000-0002-2496-6542;;", "linkedin": ";mark-m%C3%BCller-8bb4b1140/;;", "or_profile": "~Luca_Beurer-Kellner1;~Mark_Niklas_Mueller2;~Marc_Fischer1;~Martin_Vechev1", "aff": "ETHZ - ETH Zurich;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nbeurer-kellner2024prompt,\ntitle={Prompt Sketching for Large Language Models},\nauthor={Luca Beurer-Kellner and Mark Niklas Mueller and Marc Fischer and Martin Vechev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2Yu5FWdzde}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1558699, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14562663346167291414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ethz.ch;ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Towards Efficient Training and Evaluation of Robust Models against $l_0$ Bounded Adversarial Perturbations", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/35099", "id": "2bUFIsg2f5", "proceeding": "https://proceedings.mlr.press/v235/zhong24c.html", "pdf": "https://openreview.net/pdf?id=2bUFIsg2f5", "openreview": "https://openreview.net/forum?id=2bUFIsg2f5", "author_site": "Xuyang Zhong, Yixiao HUANG, Chen Liu", "tldr": "", "abstract": "This work studies sparse adversarial perturbations bounded by $l_0$ norm. We propose a white-box PGD-like attack method named sparse-PGD to effectively and efficiently generate such perturbations. Furthermore, we combine sparse-PGD with a black-box attack to comprehensively and more reliably evaluate the models' robustness against $l_0$ bounded adversarial perturbations. Moreover, the efficiency of sparse-PGD enables us to conduct adversarial training to build robust models against sparse perturbations. Extensive experiments demonstrate that our proposed attack algorithm exhibits strong performance in different scenarios. More importantly, compared with other robust models, our adversarially trained model demonstrates state-of-the-art robustness against various sparse attacks. Codes are available at https://github.com/CityU-MLO/sPGD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuyang Zhong;Yixiao Huang;Chen Liu", "authorids": "~Xuyang_Zhong1;~Yixiao_Huang3;~Chen_Liu1", "gender": "M;M;M", "homepage": ";https://yixiao-huang.github.io/;http://liuchen1993.cn/HomePage/index.html", "dblp": ";130/6820-4;10/2639-27", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;iTEcewwAAAAJ;48PsswEAAAAJ", "orcid": ";;", "linkedin": "xuyang-zhong-62a937226;;", "or_profile": "~Xuyang_Zhong1;~Yixiao_Huang3;~Chen_Liu1", "aff": "City University of Hong Kong;University of California, Berkeley;City University of Hong Kong", "aff_domain": "cityu.edu.hk;berkeley.edu;cityu.edu.hk", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhong2024towards,\ntitle={Towards Efficient Training and Evaluation of Robust Models against \\$l\\_0\\$ Bounded Adversarial Perturbations},\nauthor={Xuyang Zhong and Yixiao Huang and Chen Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2bUFIsg2f5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 875620, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3899132998636311717&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cityu.edu.hk;berkeley.edu;cityu.edu.hk", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "City University of Hong Kong;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.cityu.edu.hk;https://www.berkeley.edu", "aff_unique_abbr": "CityU;UC Berkeley", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong SAR;Berkeley", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "Editing Partially Observable Networks via Graph Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35098", "id": "2cEhQ4vtTf", "proceeding": "https://proceedings.mlr.press/v235/trivedi24a.html", "pdf": "https://openreview.net/pdf?id=2cEhQ4vtTf", "openreview": "https://openreview.net/forum?id=2cEhQ4vtTf", "author_site": "Puja Trivedi, Ryan A Rossi, David Arbour, Tong Yu, Franck Dernoncourt, Sungchul Kim, Nedim Lipka, Namyong Park, Nesreen Ahmed, 
Danai Koutra", "tldr": "", "abstract": "Most real-world networks are noisy and incomplete samples from an unknown target distribution. Refining them by correcting corruptions or inferring unobserved regions typically improves downstream performance. Inspired by the impressive generative capabilities that have been used to correct corruptions in images, and the similarities between \"in-painting\" and filling in missing nodes and edges conditioned on the observed graph, we propose a novel graph generative framework, SGDM, which is based on subgraph diffusion. Our framework not only improves the scalability and fidelity of graph diffusion models, but also leverages the reverse process to perform novel, conditional generation tasks. In particular, through extensive empirical analysis and a set of novel metrics, we demonstrate that our proposed model effectively supports the following refinement tasks for partially observable networks: (T1) denoising extraneous subgraphs, (T2) expanding existing subgraphs and (T3) performing ``style\" transfer by regenerating a particular subgraph to match the characteristics of a different node or subgraph.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Puja Trivedi;Ryan A. Rossi;David Arbour;Tong Yu;Franck Dernoncourt;Sungchul Kim;Nedim Lipka;Namyong Park;Nesreen K. Ahmed;Danai Koutra", "authorids": "~Puja_Trivedi1;~Ryan_A._Rossi2;~David_Arbour1;~Tong_Yu3;~Franck_Dernoncourt1;~Sungchul_Kim1;~Nedim_Lipka1;~Namyong_Park1;~Nesreen_K._Ahmed2;~Danai_Koutra1", "gender": "F;;;;;M;;;;F", "homepage": "https://pujacomputes.github.io/;;http://darbour.github.io;https://www.linkedin.com/in/tong-yu-42790744;http://francky.me;https://sites.google.com/site/subright;;https://namyongpark.github.io/;;http://web.eecs.umich.edu/~dkoutra/", "dblp": "274/2080;;87/7578;32/1593-1;132/4043;61/1573;;116/9404;;91/9987", "google_scholar": "1y9cR50AAAAJ;;prj0heYAAAAJ;https://scholar.google.com/citations?hl=en;kz2aIc8AAAAJ;v8ISLgIAAAAJ;;YBTXGb8AAAAJ;;https://scholar.google.com.tw/citations?user=bDrA1-8AAAAJ", "orcid": "0000-0003-1874-8992;;;0000-0002-5991-2050;0000-0002-1119-1346;0000-0003-3580-5290;;;;0000-0002-3206-8179", "linkedin": ";;david-arbour/;tong-yu-42790744;franckdernoncourt;;;;;", "or_profile": "~Puja_Trivedi1;~Ryan_A._Rossi2;~David_Arbour1;~Tong_Yu3;~Franck_Dernoncourt1;~Sungchul_Kim1;~Nedim_Lipka1;~Namyong_Park1;~Nesreen_K._Ahmed2;~Danai_Koutra1", "aff": "University of Michigan;;Adobe Systems;Adobe Research;Adobe Systems;Adobe Systems;;Meta AI;;Amazon", "aff_domain": "umich.edu;;adobe.com;adobe.com;adobe.com;adobe.com;;meta.com;;amazon.com", "position": "PhD student;;Research Scientist;Senior Research Scientist;Researcher;Researcher;;Researcher;;Scholar", "bibtex": "@inproceedings{\ntrivedi2024editing,\ntitle={Editing Partially Observable Networks via Graph Diffusion Models},\nauthor={Puja Trivedi and Ryan A. Rossi and David Arbour and Tong Yu and Franck Dernoncourt and Sungchul Kim and Nedim Lipka and Namyong Park and Nesreen K. 
Ahmed and Danai Koutra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2cEhQ4vtTf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7330834, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12791179398745624748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "umich.edu;;adobe.com;adobe.com;adobe.com;adobe.com;;meta.com;;amazon.com", "author_num": 10, "aff_unique_index": "0;1;1;1;1;2;3", "aff_unique_norm": "University of Michigan;Adobe;Meta;Amazon", "aff_unique_dep": ";Adobe Systems Incorporated;Meta AI;Amazon.com, Inc.", "aff_unique_url": "https://www.umich.edu;https://www.adobe.com;https://meta.com;https://www.amazon.com", "aff_unique_abbr": "UM;Adobe;Meta;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "PDHG-Unrolled Learning-to-Optimize Method for Large-Scale Linear Programming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35097", "id": "2cXzNDe614", "proceeding": "https://proceedings.mlr.press/v235/li24ce.html", "pdf": "https://openreview.net/pdf?id=2cXzNDe614", "openreview": "https://openreview.net/forum?id=2cXzNDe614", "author_site": "Bingheng Li, Linxin Yang, Yupeng Chen, Senmiao Wang, Haitao Mao, Qian Chen, Yao Ma, Akang Wang, Tian Ding, Jiliang Tang, Ruoyu Sun", "tldr": "", "abstract": "Solving large-scale linear programming (LP) problems is an important task in various areas such as communication networks, power systems, finance and logistics. Recently, two distinct approaches have emerged to expedite LP solving: (i) First-order methods (FOMs); (ii) Learning to optimize (L2O). In this work, we propose an FOM-unrolled neural network (NN) called PDHG-Net, and propose a two-stage L2O method to solve large-scale LP problems. The new architecture PDHG-Net is designed by unrolling the recently emerged PDHG method into a neural network, combined with channel-expansion techniques borrowed from graph neural networks. We prove that the proposed PDHG-Net can recover PDHG algorithm, thus can approximate optimal solutions of LP instances with a polynomial number of neurons. We propose a two-stage inference approach: first use PDHG-Net to generate an approximate solution, and then apply PDHG algorithm to further improve the solution. 
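The two-stage inference just described (network prediction, then first-order polishing) is easy to make concrete. Below is a hedged sketch of the second stage: plain PDHG iterations warm-started at a predicted primal-dual pair, for an LP assumed to be in standard form; the step-size rule is a generic choice satisfying the usual convergence condition, not the paper's tuned setting.

```python
import numpy as np

def pdhg_refine(c, A, b, x0, y0, iters=1000):
    """Stage-2 sketch: polish a predicted pair (x0, y0) with plain PDHG on
    min c^T x  s.t.  Ax = b, x >= 0  (standard form assumed)."""
    # Step sizes with tau * sigma * ||A||_2^2 < 1, a standard convergence condition.
    op_norm = np.linalg.norm(A, 2)
    tau = sigma = 0.9 / op_norm
    x, y = x0.copy(), y0.copy()
    for _ in range(iters):
        x_new = np.maximum(0.0, x - tau * (c - A.T @ y))  # primal step + projection onto x >= 0
        y = y + sigma * (b - A @ (2 * x_new - x))         # dual step with extrapolation
        x = x_new
    return x, y
```

A good warm start mainly buys fewer iterations here; the update itself is unchanged, which is what makes the two-stage split cheap to adopt.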
Experiments show that our approach can significantly accelerate LP solving, achieving up to a 3$\\times$ speedup compared to FOMs for large-scale LP problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bingheng Li;Linxin Yang;Yupeng Chen;Senmiao Wang;Haitao Mao;Qian Chen;Yao Ma;Akang Wang;Tian Ding;Jiliang Tang;Ruoyu Sun", "authorids": "~Bingheng_Li1;~Linxin_Yang1;~Yupeng_Chen3;~Senmiao_Wang1;~Haitao_Mao1;~Qian_Chen10;~Yao_Ma3;~Akang_Wang1;~Tian_Ding1;~Jiliang_Tang1;~Ruoyu_Sun1", "gender": "M;M;M;;;;M;Not Specified;M;M;", "homepage": "https://github.com/uestclbh;;;;;;https://yaoma24.github.io/;https://akangw.github.io/;;https://www.cse.msu.edu/~tangjili/;https://ruoyus.github.io/", "dblp": ";;;;;;212/7871.html;222/3290;;64/10812;30/9879-1", "google_scholar": ";https://scholar.google.com/citations?hl=en;;;;O74Oj08AAAAJ;wf9TTOIAAAAJ;TyYzzmoAAAAJ;https://scholar.google.com.hk/citations?user=lVkDF-YAAAAJ;WtzKMWAAAAAJ;PsfzbCMAAAAJ", "orcid": "0009-0000-0950-9012;;0000-0003-1802-7220;;;;;0000-0002-3325-8441;0000-0002-9383-8405;0000-0001-7125-3898;", "linkedin": ";;;;;;;wangakang/;;;", "or_profile": "~Bingheng_Li1;~Linxin_Yang1;~Yupeng_Chen3;~Senmiao_Wang1;~Haitao_Mao1;~Qian_Chen10;~Yao_Ma3;~Akang_Wang1;~Tian_Ding1;~Jiliang_Tang1;~Ruoyu_Sun1", "aff": "University of Electronic Science and Technology of China;The Chinese University of Hong Kong, Shenzhen;Sichuan University;;;The Chinese University of Hong Kong, Shenzhen;Rensselaer Polytechnic Institute;Shenzhen Research Institute of Big Data;Shenzhen Research Institute of Big Data;Michigan State University;The Chinese University of Hong Kong", "aff_domain": "uestc.edu.cn;link.cuhk.edu.cn;scu.edu.cn;;;cuhk.edu.cn;rpi.edu;sribd.cn;sribd.cn;msu.edu;cuhk.edu.cn", "position": "Undergrad student;PhD student;MS student;;;PhD student;Assistant Professor;Researcher;Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024pdhgunrolled,\ntitle={{PDHG}-Unrolled Learning-to-Optimize Method for Large-Scale Linear Programming},\nauthor={Bingheng Li and Linxin Yang and Yupeng Chen and Senmiao Wang and Haitao Mao and Qian Chen and Yao Ma and Akang Wang and Tian Ding and Jiliang Tang and Ruoyu Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2cXzNDe614}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 527732, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15244543940443284360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uestc.edu.cn;link.cuhk.edu.cn;scu.edu.cn;;;cuhk.edu.cn;rpi.edu;sribd.cn;sribd.cn;msu.edu;cuhk.edu.cn", "author_num": 11, "aff_unique_index": "0;1;2;1;3;4;4;5;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Chinese University of Hong Kong;Sichuan University;Rensselaer Polytechnic Institute;Shenzhen Research Institute of Big Data;Michigan State University", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.cuhk.edu.cn;https://www.scu.edu.cn;https://www.rpi.edu;http://www.sribd.cn;https://www.msu.edu", "aff_unique_abbr": "UESTC;CUHK;SCU;RPI;;MSU", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;1;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Do Topological Characteristics Help in Knowledge Distillation?", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35096", "id": "2dEH0u8w0b", "proceeding": "https://proceedings.mlr.press/v235/kim24aj.html", "pdf": "https://openreview.net/pdf?id=2dEH0u8w0b", "openreview": "https://openreview.net/forum?id=2dEH0u8w0b", "author_site": "Jungeun Kim, Junwon You, Dongjin Lee, Ha Young Kim, Jae-Hun Jung", "tldr": "", "abstract": "Knowledge distillation (KD) aims to transfer knowledge from larger (teacher) to smaller (student) networks. Previous studies focus on point-to-point or pairwise relationships in embedding features as knowledge and struggle to efficiently transfer relationships of complex latent spaces. To tackle this issue, we propose a novel KD method called TopKD, which considers the global topology of the latent spaces. We define *global topology knowledge* using the persistence diagram (PD) that captures comprehensive geometric structures such as shape of distribution, multiscale structure and connectivity, and the *topology distillation loss* for teaching this knowledge. To make the PD transferable within reasonable computational time, we employ approximated persistence images of PDs. Through experiments, we support the benefits of using global topology as knowledge and demonstrate the potential of TopKD. Code is available at https://github.com/jekim5418/TopKD", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jungeun Kim;Junwon You;Dongjin Lee;Ha Young Kim;Jae-Hun Jung", "authorids": "~Jungeun_Kim2;~Junwon_You1;~Dongjin_Lee3;~Ha_Young_Kim2;~Jae-Hun_Jung1", "gender": "F;M;M;F;", "homepage": ";;;https://sites.google.com/view/mlcf/home;https://sites.google.com/view/jaehunjung/home", "dblp": "86/1254.html;;;191/2588;", "google_scholar": "ipQHR3wAAAAJ;https://scholar.google.com/citations?hl=ko;kfPaTigAAAAJ;https://scholar.google.com/citations?hl=ko;", "orcid": "0000-0002-5341-726X;0009-0000-6220-7275;0009-0003-0777-6717;0000-0001-5115-9984;", "linkedin": "jungeun-kim-383bb218a/;;hiddenbeginner/;;", "or_profile": "~Jungeun_Kim2;~Junwon_You1;~Dongjin_Lee3;~Ha_Young_Kim2;~Jae-Hun_Jung1", "aff": "Yonsei University;Pohang University of Science and Technology;POSTECH;Yonsei University;Pohang University of Science and Technology", "aff_domain": "yonsei.ac.kr;postech.ac.kr;postech.ac.kr;yonsei.ac.kr;postech.ac.kr", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nkim2024do,\ntitle={Do Topological Characteristics Help in Knowledge Distillation?},\nauthor={Jungeun Kim and Junwon You and Dongjin Lee and Ha Young Kim and Jae-Hun Jung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2dEH0u8w0b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9294200, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8650966065494032807&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "yonsei.ac.kr;postech.ac.kr;postech.ac.kr;yonsei.ac.kr;postech.ac.kr", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Yonsei University;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.postech.ac.kr", "aff_unique_abbr": "Yonsei;POSTECH", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { 
"title": "Kernel-Based Evaluation of Conditional Biological Sequence Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35095", "id": "2dlmcTXfcY", "proceeding": "https://proceedings.mlr.press/v235/glaser24a.html", "pdf": "https://openreview.net/pdf?id=2dlmcTXfcY", "openreview": "https://openreview.net/forum?id=2dlmcTXfcY", "author_site": "Pierre Glaser, Steffan Paul, Alissa M. Hummer, Charlotte Deane, Debora Marks, Alan Amin", "tldr": "", "abstract": "We propose a set of kernel-based tools to evaluate the designs and tune the hyperparameters of conditional sequence models, with a focus on problems in computational biology. The backbone of our tools is a new measure of discrepancy between the true conditional distribution and the model's estimate, called the Augmented Conditional Maximum Mean Discrepancy (ACMMD). Provided that the model can be sampled from, the ACMMD can be estimated unbiasedly from data to quantify absolute model fit, integrated within hypothesis tests, and used to evaluate model reliability. We demonstrate the utility of our approach by analyzing a popular protein design model, ProteinMPNN. We are able to reject the hypothesis that ProteinMPNN fits its data for various protein families, and tune the model's temperature hyperparameter to achieve a better fit.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pierre Glaser;Steffanie Paul;Alissa M Hummer;Charlotte Deane;Debora Susan Marks;Alan Nawzad Amin", "authorids": "~Pierre_Glaser1;~Steffanie_Paul1;~Alissa_M_Hummer1;~Charlotte_Deane1;~Debora_Susan_Marks1;~Alan_Nawzad_Amin1", "gender": "M;;F;F;M;F", "homepage": "https://pierreglaser.github.io;https://amhummer.github.io;https://www.stats.ox.ac.uk/~deane/;https://www.deboramarkslab.com/;;https://dbmi.hms.harvard.edu/people/steffan-paul", "dblp": ";;;;319/5032.html;", "google_scholar": ";M-xaCnEAAAAJ;https://scholar.google.co.uk/citations?user=QAdcBnQAAAAJ;qFmoeNkAAAAJ;;", "orcid": ";0000-0002-3023-2588;0000-0003-1388-2252;0000-0001-9388-2281;0000-0002-2656-8273;0000-0001-7306-4863", "linkedin": ";alissa-hummer/;charlotte-deane-27918614/;debora-marks-3932a97/;;steffanpaul359/", "or_profile": "~Pierre_Glaser1;~Alissa_M_Hummer1;~Charlotte_Deane1;~Debora_Susan_Marks1;~Alan_Nawzad_Amin1;~Steffan_Paul1", "aff": "University College London;University of Oxford;University of Oxford;Harvard Medical School, Harvard University;New York University;Harvard University", "aff_domain": "ucl.ac.uk;ox.ac.uk;ox.ac.uk;hms.harvard.edu;nyu.edu;harvard.edu", "position": "PhD student;PhD student;Full Professor;Full Professor;Postdoc;PhD student", "bibtex": "@inproceedings{\nglaser2024kernelbased,\ntitle={Kernel-Based Evaluation of Conditional Biological Sequence Models},\nauthor={Pierre Glaser and Steffanie Paul and Alissa M Hummer and Charlotte Deane and Debora Susan Marks and Alan Nawzad Amin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2dlmcTXfcY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 747176, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17193463040112596130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ucl.ac.uk;ox.ac.uk;ox.ac.uk;hms.harvard.edu;nyu.edu;harvard.edu", "author_num": 6, "aff_unique_index": "0;1;1;2;3;2", "aff_unique_norm": "University College London;University of Oxford;Harvard 
University;New York University", "aff_unique_dep": ";;Harvard Medical School;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.ox.ac.uk;https://www.harvard.edu;https://www.nyu.edu", "aff_unique_abbr": "UCL;Oxford;Harvard;NYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "New Sample Complexity Bounds for Sample Average Approximation in Heavy-Tailed Stochastic Programming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35094", "id": "2hWd4CVhXz", "proceeding": "https://proceedings.mlr.press/v235/liu24bf.html", "pdf": "https://openreview.net/pdf?id=2hWd4CVhXz", "openreview": "https://openreview.net/forum?id=2hWd4CVhXz", "author_site": "Hongcheng Liu, Jindong Tong", "tldr": "", "abstract": "This paper studies sample average approximation (SAA) and its simple regularized variation in solving convex or strongly convex stochastic programming problems. Under heavy-tailed assumptions and comparable regularity conditions as in the typical SAA literature, we show --- perhaps for the first time --- that the sample complexity can be completely free from any complexity measure (e.g., logarithm of the covering number) of the feasible region. As a result, our new bounds can be more advantageous than the state-of-the-art in terms of the dependence on the problem dimensionality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongcheng Liu;Jindong Tong", "authorids": "~Hongcheng_Liu1;~Jindong_Tong1", "gender": "M;M", "homepage": "https://www.ise.ufl.edu/liu/;", "dblp": ";", "google_scholar": "Ava2qF4AAAAJ;", "orcid": ";0009-0001-6807-153X", "linkedin": ";", "or_profile": "~Hongcheng_Liu1;~Jindong_Tong1", "aff": "University of Florida;University of Florida", "aff_domain": "ufl.edu;ufl.edu", "position": "Associate Professor;PhD student", "bibtex": "@inproceedings{\nliu2024new,\ntitle={New Sample Complexity Bounds for Sample Average Approximation in Heavy-Tailed Stochastic Programming},\nauthor={Hongcheng Liu and Jindong Tong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2hWd4CVhXz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 378440, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3351529374824770078&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ufl.edu;ufl.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Florida", "aff_unique_dep": "", "aff_unique_url": "https://www.ufl.edu", "aff_unique_abbr": "UF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Stochastic Bandits with ReLU Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35093", "id": "2hidpjUPvV", "proceeding": "https://proceedings.mlr.press/v235/xu24c.html", "pdf": "https://openreview.net/pdf?id=2hidpjUPvV", "openreview": "https://openreview.net/forum?id=2hidpjUPvV", "author_site": "Kan Xu, Hamsa Bastani, Surbhi Goel, Osbert Bastani", "tldr": "", "abstract": "We study the stochastic bandit problem with ReLU neural network structure. 
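Stepping back to the heavy-tailed stochastic programming entry just above: the regularized SAA it analyzes is simple to picture. A toy sketch on synthetic data follows; the quadratic loss, step size, and regularizer weight `mu` are all our own illustrative choices, not the paper's.

```python
import numpy as np

def regularized_saa(samples, loss_grad, mu=0.1, lr=0.05, steps=500):
    """Toy regularized SAA: minimize (1/n) sum_i f(x; xi_i) + (mu/2)||x||^2
    by gradient descent. `loss_grad(x, xi)` returns the gradient of f in x."""
    x = np.zeros(samples.shape[1])
    n = len(samples)
    for _ in range(steps):
        g = sum(loss_grad(x, xi) for xi in samples) / n + mu * x
        x -= lr * g
    return x

# Example: f(x; xi) = 0.5 * ||x - xi||^2; the regularizer shrinks the sample mean.
rng = np.random.default_rng(0)
xi = rng.standard_normal((200, 3)) + 1.0
x_hat = regularized_saa(xi, lambda x, s: x - s)
```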
We show that a $\\tilde{O}(\\sqrt{T})$ regret guarantee is achievable by considering bandits with one-layer ReLU neural networks; to the best of our knowledge, our work is the first to achieve such a guarantee. In this specific setting, we propose an OFU-ReLU algorithm that can achieve this upper bound. The algorithm first explores randomly until it reaches a *linear* regime, and then implements a UCB-type linear bandit algorithm to balance exploration and exploitation. Our key insight is that we can exploit the piecewise linear structure of ReLU activations and convert the problem into a linear bandit in a transformed feature space, once we learn the parameters of ReLU relatively accurately during the exploration stage. To remove dependence on model parameters, we design an OFU-ReLU+ algorithm based on a batching strategy, which can provide the same theoretical guarantee.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kan Xu;Hamsa Bastani;Surbhi Goel;Osbert Bastani", "authorids": "~Kan_Xu2;~Hamsa_Bastani1;~Surbhi_Goel1;~Osbert_Bastani1", "gender": "M;F;M;F", "homepage": "https://kanxu526.github.io/;https://www.surbhigoel.com;http://obastani.github.io;https://hamsabastani.github.io/", "dblp": ";190/7815;21/11275;199/1777", "google_scholar": "SaEf5CUAAAAJ;https://scholar.google.co.in/citations?user=Zqz4CQoAAAAJ;cxYepGkAAAAJ;ZbUfUMoAAAAJ", "orcid": "0000-0001-8738-6564;;;", "linkedin": "kan-xu-8170b953/;;;", "or_profile": "~Kan_Xu2;~Surbhi_Goel1;~Osbert_Bastani1;~Hamsa_Sridhar_Bastani1", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;The Wharton School, University of Pennsylvania", "aff_domain": "upenn.edu;upenn.edu;upenn.edu;wharton.upenn.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024stochastic,\ntitle={Stochastic Bandits with Re{LU} Neural Networks},\nauthor={Kan Xu and Hamsa Bastani and Surbhi Goel and Osbert Bastani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2hidpjUPvV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 850314, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xs5-jIhuTSAJ:scholar.google.com/&scioq=Stochastic+Bandits+with+ReLU+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "upenn.edu;upenn.edu;upenn.edu;wharton.upenn.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Surprising Effectiveness of Skip-Tuning in Diffusion Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35092", "id": "2pYTCy4GUV", "proceeding": "https://proceedings.mlr.press/v235/ma24r.html", "pdf": "https://openreview.net/pdf?id=2pYTCy4GUV", "openreview": "https://openreview.net/forum?id=2pYTCy4GUV", "author_site": "Jiajun Ma, Shuchen Xue, Tianyang Hu, Wenjia Wang, Zhaoqiang Liu, Zhenguo Li, Zhiming Ma, Kenji Kawaguchi", "tldr": "", "abstract": "With the incorporation of the UNet architecture, diffusion probabilistic models have become a dominant force in image generation tasks. 
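The two-phase OFU-ReLU idea above (random exploration to learn the ReLU parameters, then a linear bandit in the induced feature space) can be rendered loosely as follows. This is our simplification, not the paper's exact algorithm: the SGD fit, step sizes, and the frozen-activation feature map are illustrative assumptions.

```python
import numpy as np

def ofu_relu_sketch(arms, pull, m=8, T0=500, T=5000, lam=1.0, beta=1.0, seed=0):
    """Loose two-phase sketch: fit f(x) = sum_j v_j * relu(w_j . x) from random
    pulls, then run LinUCB on piecewise-linear features with patterns frozen."""
    rng = np.random.default_rng(seed)
    K, d = arms.shape
    W, v = 0.1 * rng.standard_normal((m, d)), 0.1 * rng.standard_normal(m)
    for _ in range(T0):                              # phase 1: uniform exploration + SGD
        x = arms[rng.integers(K)]
        z = W @ x
        err = v @ np.maximum(z, 0.0) - pull(x)
        v -= 0.05 * err * np.maximum(z, 0.0)
        W -= 0.05 * err * np.outer(v * (z > 0), x)
    # Phase 2: with activation patterns frozen, the reward is linear in
    # phi(x) = [(w_j . x > 0) * x]_j, so a standard LinUCB applies.
    feats = np.stack([((W @ x > 0)[:, None] * x).ravel() for x in arms])
    V, s = lam * np.eye(m * d), np.zeros(m * d)
    for _ in range(T - T0):
        Vinv = np.linalg.inv(V)
        ucb = feats @ (Vinv @ s) + beta * np.sqrt(np.einsum("ki,ij,kj->k", feats, Vinv, feats))
        a = int(np.argmax(ucb))
        r = pull(arms[a])
        V += np.outer(feats[a], feats[a])
        s += r * feats[a]
    return np.linalg.inv(V) @ s                      # final linear parameter estimate
```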
One key design in UNet is the skip connections between the encoder and decoder blocks. Although skip connections have been shown to improve training stability and model performance, we point out that such shortcuts can be a limiting factor for the complexity of the transformation. As the sampling steps decrease, the generation process and the role of the UNet get closer to the push-forward transformations from Gaussian distribution to the target, posing a challenge for the network's complexity. To address this challenge, we propose Skip-Tuning, a simple yet surprisingly effective training-free tuning method on the skip connections. For instance, our method can achieve 100% FID improvement for pretrained EDM on ImageNet 64 with only 19 NFEs (1.75), breaking the limit of ODE samplers regardless of sampling steps. Surprisingly, the improvement persists when we increase the number of sampling steps and can even surpass the best result from EDM-2 (1.58) with only 39 NFEs (1.57). Comprehensive exploratory experiments are conducted to shed light on the surprising effectiveness of our Skip-Tuning. We observe that while Skip-Tuning increases the score-matching losses in the pixel space, the losses in the feature space are reduced, particularly at intermediate noise levels, which coincide with the most effective range accounting for image quality improvement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiajun Ma;Shuchen Xue;Tianyang Hu;Wenjia Wang;Zhaoqiang Liu;Zhenguo Li;Zhi-Ming Ma;Kenji Kawaguchi", "authorids": "~Jiajun_Ma1;~Shuchen_Xue1;~Tianyang_Hu1;~Wenjia_Wang2;~Zhaoqiang_Liu1;~Zhenguo_Li1;~Zhi-Ming_Ma1;~Kenji_Kawaguchi1", "gender": "M;M;M;M;M;M;;", "homepage": ";;https://hu-tianyang.github.io/;https://www.wenjia-w.com/;;http://www.ee.columbia.edu/~zgli/;http://homepage.amss.ac.cn/research/homePage/8eb59241e2e74d828fb84eec0efadba5/myHomePage.html;https://ml.comp.nus.edu.sg/#members", "dblp": ";356/7258;170/2551;;198/1405;23/6479;;", "google_scholar": ";aA70TOwAAAAJ;mlA_3r0AAAAJ;EKS1sO0AAAAJ;EmGrPbIAAAAJ;XboZC1AAAAAJ;;aLl3rYoAAAAJ", "orcid": ";;;;;;;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAABNx8OQBL99vmEOUUrE18c5XwhVpsxhEGu0;https://linkedin.com/in/shuchen-xue/;;;;;;", "or_profile": "~Jiajun_Ma1;~Shuchen_Xue1;~Tianyang_Hu1;~Wenjia_Wang2;~Zhaoqiang_Liu1;~Zhenguo_Li1;~Zhi-Ming_Ma1;~Kenji_Kawaguchi1", "aff": "Hong Kong University of Science and Technology;Academy of Mathematics and Systems Science, Chinese Academy of Sciences;Huawei Noah's Ark Lab;HKUST (GZ);University of Electronic Science and Technology of China;Huawei Noah's Ark Lab;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;National University of Singapore", "aff_domain": "ust.hk;amss.ac.cn;huawei.com;hkust-gz.edu.cn;uestc.edu.cn;huawei.com;amss.ac.cn;nus.edu", "position": "PhD student;PhD student;Researcher;Assistant Professor;Full Professor;Principal Researcher;Full Professor;Presidential Young Professor", "bibtex": "@inproceedings{\nma2024the,\ntitle={The Surprising Effectiveness of Skip-Tuning in Diffusion Sampling},\nauthor={Jiajun Ma and Shuchen Xue and Tianyang Hu and Wenjia Wang and Zhaoqiang Liu and Zhenguo Li and Zhi-Ming Ma and Kenji Kawaguchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2pYTCy4GUV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6882426, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, 
"gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12015354927067399569&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ust.hk;amss.ac.cn;huawei.com;hkust-gz.edu.cn;uestc.edu.cn;huawei.com;amss.ac.cn;nus.edu", "author_num": 8, "aff_unique_index": "0;1;2;0;3;2;1;4", "aff_unique_norm": "Hong Kong University of Science and Technology;Chinese Academy of Sciences;Huawei;University of Electronic Science and Technology of China;National University of Singapore", "aff_unique_dep": ";Academy of Mathematics and Systems Science;Noah's Ark Lab;;", "aff_unique_url": "https://www.ust.hk;http://www.amss.cas.cn;https://www.huawei.com;https://www.uestc.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "HKUST;AMSS;Huawei;UESTC;NUS", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Hong Kong SAR;;Guangzhou", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Look Ahead or Look Around? A Theoretical Comparison Between Autoregressive and Masked Pretraining", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35091", "id": "2rPoTgEmjV", "proceeding": "https://proceedings.mlr.press/v235/zhang24m.html", "pdf": "https://openreview.net/pdf?id=2rPoTgEmjV", "openreview": "https://openreview.net/forum?id=2rPoTgEmjV", "author_site": "Qi Zhang, Tianqi Du, Haotian Huang, Yifei Wang, Yisen Wang", "tldr": "", "abstract": "In recent years, the rise of generative self-supervised learning (SSL) paradigms has exhibited impressive performance across visual, language, and multi-modal domains. While the varied designs of generative SSL objectives lead to distinct properties in downstream tasks, a theoretical understanding of these differences remains largely unexplored. In this paper, we establish the first theoretical comparisons between two leading generative SSL paradigms: autoregressive SSL and masked SSL. Through establishing theoretical frameworks, we elucidate the strengths and limitations of autoregressive and masked SSL within the primary evaluation tasks of classification and content generation. Our findings demonstrate that in classification tasks, the flexibility of targeted tokens in masked SSL fosters more inter-sample connections compared to the fixed position of target tokens in autoregressive SSL, which yields superior clustering performance. In content generation tasks, the misalignment between the flexible lengths of test samples and the fixed length of unmasked texts in masked SSL (vs. flexible lengths of conditional texts in autoregressive SSL) hinders its generation performance. To leverage each other's strengths and mitigate weaknesses, we propose diversity-enhanced autoregressive and variable-length masked objectives, which substantially improve the classification performance of autoregressive SSL and the generation performance of masked SSL. 
Code is available at https://github.com/PKU-ML/LookAheadLookAround.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qi Zhang;Tianqi Du;Haotian Huang;Yifei Wang;Yisen Wang", "authorids": "~Qi_Zhang28;~Tianqi_Du1;~Haotian_Huang1;~Yifei_Wang1;~Yisen_Wang1", "gender": "M;M;M;M;M", "homepage": "https://omnscent.github.io/;https://yifeiwang77.com;https://yisenwang.github.io/;https://github.com/zhangq327;https://github.com/rexdu2003/rexdu.github.io", "dblp": ";00/555-1;172/1346-1;;341/5548", "google_scholar": ";-CLy6YsAAAAJ;uMWPDboAAAAJ;;nQjREpoAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Haotian_Huang1;~Yifei_Wang1;~Yisen_Wang1;~zhang_qi2;~Du_Tianqi1", "aff": "Sun Yat-Sen University;Massachusetts Institute of Technology;Peking University;Peking University;Peking University", "aff_domain": "mail2.sysu.edu.cn;mit.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Postdoc;Assistant Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nzhang2024look,\ntitle={Look Ahead or Look Around? A Theoretical Comparison Between Autoregressive and Masked Pretraining},\nauthor={Qi Zhang and Tianqi Du and Haotian Huang and Yifei Wang and Yisen Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2rPoTgEmjV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1625443, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2872567291653114698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mail2.sysu.edu.cn;mit.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Sun Yat-sen University;Massachusetts Institute of Technology;Peking University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sysu.edu.cn/;https://web.mit.edu;http://www.pku.edu.cn", "aff_unique_abbr": "SYSU;MIT;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Switched Flow Matching: Eliminating Singularities via Switching ODEs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35090", "id": "2ulUrcOZ64", "proceeding": "https://proceedings.mlr.press/v235/zhu24g.html", "pdf": "https://openreview.net/pdf?id=2ulUrcOZ64", "openreview": "https://openreview.net/forum?id=2ulUrcOZ64", "author_site": "Qunxi Zhu, Wei Lin", "tldr": "", "abstract": "Continuous-time generative models, such as Flow Matching (FM), construct probability paths to transport between one distribution and another through the simulation-free learning of the neural ordinary differential equations (ODEs). During inference, however, the learned model often requires multiple neural network evaluations to accurately integrate the flow, resulting in a slow sampling speed. We attribute the reason to the inherent (joint) heterogeneity of source and/or target distributions, namely the singularity problem, which poses challenges for training the neural ODEs effectively. To address this issue, we propose a more general framework, termed Switched FM (SFM), that eliminates singularities via switching ODEs, as opposed to using a uniform ODE in FM. 
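For orientation, the base Flow Matching objective that SFM extends looks like the sketch below: regress a velocity field along straight-line paths between source and target samples. SFM would additionally condition on a discrete switching signal to avoid the singularity problem; that part is omitted here, so this is only the vanilla objective.

```python
import torch

def flow_matching_loss(v_theta, x0, x1):
    """Plain conditional FM loss on (B, d) batches: x0 ~ source, x1 ~ target."""
    t = torch.rand(x0.shape[0], 1)                  # one time per sample, broadcast over d
    xt = (1 - t) * x0 + t * x1                      # linear interpolation path
    target = x1 - x0                                # constant velocity along the path
    return ((v_theta(xt, t) - target) ** 2).mean()
```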
Importantly, we theoretically show that FM cannot transport between two simple distributions due to the existence and uniqueness of initial value problems of ODEs, while these limitations can be well tackled by SFM. From an orthogonal perspective, our framework can seamlessly integrate with the existing advanced techniques, such as minibatch optimal transport, to further enhance the straightness of the flow, yielding a more efficient sampling process with reduced costs. We demonstrate the effectiveness of the newly proposed SFM through several numerical examples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qunxi Zhu;Wei Lin", "authorids": "~Qunxi_Zhu1;~Wei_Lin1", "gender": "M;M", "homepage": "https://www.researchgate.net/profile/Qunxi_Zhu;https://faculty.fudan.edu.cn/wlin/zh_CN/", "dblp": "219/7742;99/2649", "google_scholar": "https://scholar.google.co.jp/citations?user=45oFQD4AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-7281-5274;0000-0002-1863-4306", "linkedin": ";", "or_profile": "~Qunxi_Zhu1;~Wei_Lin1", "aff": "Fudan University;Fudan University", "aff_domain": "fudan.edu.cn;fudan.edu.cn", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nzhu2024switched,\ntitle={Switched Flow Matching: Eliminating Singularities via Switching {ODE}s},\nauthor={Qunxi Zhu and Wei Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2ulUrcOZ64}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9942337, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17318179362784775542&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "fudan.edu.cn;fudan.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "The Pitfalls and Promise of Conformal Inference Under Adversarial Attacks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35089", "id": "2xLyc5TkFl", "proceeding": "https://proceedings.mlr.press/v235/liu24m.html", "pdf": "https://openreview.net/pdf?id=2xLyc5TkFl", "openreview": "https://openreview.net/forum?id=2xLyc5TkFl", "author_site": "Ziquan Liu, Yufei Cui, Yan Yan, Yi Xu, Xiangyang Ji, Xue Liu, Antoni Chan", "tldr": "", "abstract": "In safety-critical applications such as medical imaging and autonomous driving, where decisions have profound implications for patient health and road safety, it is imperative to maintain both high adversarial robustness to protect against potential adversarial attacks and reliable uncertainty quantification in decision-making. With extensive research focused on enhancing adversarial robustness through various forms of adversarial training (AT), a notable knowledge gap remains concerning the uncertainty inherent in adversarially trained models. To address this gap, this study investigates the uncertainty of deep learning models by examining the performance of conformal prediction (CP) in the context of standard adversarial attacks within the adversarial defense community. 
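Since the entry above evaluates conformal prediction under attack, it helps to recall what the clean CP pipeline computes. A standard split-CP sketch with the (1 - softmax-probability) score; nothing here is adversarial-specific, and `probs_*` are assumed model class probabilities.

```python
import numpy as np

def split_conformal_set(probs_cal, y_cal, probs_test, alpha=0.1):
    """Split CP: calibrate a score threshold, then return one label set per
    test point. Assumes n is large enough that the quantile level is <= 1."""
    n = len(y_cal)
    scores = 1.0 - probs_cal[np.arange(n), y_cal]            # nonconformity on calibration data
    q = np.quantile(scores, np.ceil((n + 1) * (1 - alpha)) / n, method="higher")
    return [np.where(1.0 - p <= q)[0] for p in probs_test]  # prediction set per test point
```

The abstract's prediction set size (PSS) is then just the average length of these returned sets, which is what the proposed Beta-weighting loss targets.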
It is first unveiled that existing CP methods do not produce informative prediction sets under the commonly used $l_{\\infty}$-norm bounded attack if the model is not adversarially trained, which underpins the importance of adversarial training for CP. Our paper next demonstrates that the prediction set size (PSS) of CP using adversarially trained models with AT variants is often worse than using standard AT, inspiring us to research into CP-efficient AT for improved PSS. We propose to optimize a Beta-weighting loss with an entropy minimization regularizer during AT to improve CP-efficiency, where the Beta-weighting loss is shown to be an upper bound of PSS at the population level by our theoretical analysis. Moreover, our empirical study on four image classification datasets across three popular AT baselines validates the effectiveness of the proposed Uncertainty-Reducing AT (AT-UR).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziquan Liu;Yufei CUI;Yan Yan;Yi Xu;Xiangyang Ji;Xue Liu;Antoni B. Chan", "authorids": "~Ziquan_Liu1;~Yufei_CUI2;~Yan_Yan3;~Yi_Xu8;~Xiangyang_Ji1;~Xue_Liu1;~Antoni_B._Chan1", "gender": "M;M;M;;;M;M", "homepage": "https://sites.google.com/view/ziquanliu;;http://iemppu.github.io/;;;http://www.cs.mcgill.ca/~xueliu/;http://www.cs.cityu.edu.hk/~abchan/", "dblp": "207/9035;188/0049;13/3953-6;;;l/XueLiu;55/5814", "google_scholar": "https://scholar.google.com.hk/citations?user=x28OqBkAAAAJ;;A6co_BAAAAAJ;;;https://scholar.google.com.tw/citations?user=rfLIRakAAAAJ;j4vFSn8AAAAJ", "orcid": ";;0000-0001-9108-6767;;;;0000-0002-2886-2513", "linkedin": ";;;;;;", "or_profile": "~Ziquan_Liu1;~Yufei_CUI2;~Yan_Yan3;~Yi_Xu8;~Xiangyang_Ji1;~Xue_Liu1;~Antoni_B._Chan1", "aff": "Queen Mary, University of London;McGill University;Washington State University, Pullman;;;McGill University;City University of Hong Kong", "aff_domain": "qmul.ac.uk;cs.mcgill.ca;wsu.edu;;;mcgill.ca;cityu.edu.hk", "position": "Lecturer;Postdoc;Assistant Professor;;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nliu2024the,\ntitle={The Pitfalls and Promise of Conformal Inference Under Adversarial Attacks},\nauthor={Ziquan Liu and Yufei CUI and Yan Yan and Yi Xu and Xiangyang Ji and Xue Liu and Antoni B. 
Chan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2xLyc5TkFl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1960683, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9621002763644992885&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "qmul.ac.uk;cs.mcgill.ca;wsu.edu;;;mcgill.ca;cityu.edu.hk", "author_num": 7, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "Queen Mary, University of London;McGill University;Washington State University;City University of Hong Kong", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.qmul.ac.uk;https://www.mcgill.ca;https://wsu.edu;https://www.cityu.edu.hk", "aff_unique_abbr": "QMUL;McGill;WSU;CityU", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "London;;Pullman;Hong Kong SAR", "aff_country_unique_index": "0;1;2;1;3", "aff_country_unique": "United Kingdom;Canada;United States;China" }, { "title": "Offline Training of Language Model Agents with Functions as Learnable Weights", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35088", "id": "2xbkWiEuR1", "proceeding": "https://proceedings.mlr.press/v235/zhang24cd.html", "pdf": "https://openreview.net/pdf?id=2xbkWiEuR1", "openreview": "https://openreview.net/forum?id=2xbkWiEuR1", "author_site": "Shaokun Zhang, Jieyu Zhang, Jiale Liu, Linxin Song, Chi Wang, Ranjay Krishna, Qingyun Wu", "tldr": "", "abstract": "Researchers and practitioners have recently reframed powerful Large Language Models (LLMs) as *agents*, enabling them to automate complex tasks largely via the use of specialized functions. To facilitate the development of LLM agents, we present a novel paradigm of training LLM agents without modifying the LLM weights, which is particularly useful when the LLMs are difficult or inaccessible for modifications. Inspired by how humans continuously forge tools to adapt to real-world tasks, rather than change our biological structure to fit a static set of tools, we propose to progressively forge agent's functions to better solve the downstream tasks instead of modifying the LLM weights. By treating the functions as learnable `agent parameters' and leveraging the fundamental idea of model training in artificial intelligence, we develop AgentOptimizer that employs the LLM to update agents' functions and devise an *agent training* algorithm with two strategies, roll-back, and early-stop, to streamline the training process. With extensive experiments, we showcase that the agent training paradigm could significantly improve the performance of representative LLM agents in various downstream tasks. 
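The agent-training loop with roll-back and early-stop described above has a simple skeleton. In this hedged sketch, `propose(functions, tasks)` stands in for the AgentOptimizer's LLM call and `evaluate` for a task success rate; both signatures are our own, not the paper's API.

```python
def train_agent_functions(agent, propose, evaluate, tasks, epochs=10, patience=3):
    """Sketch: treat the agent's function set as learnable 'weights', let an
    LLM propose updates, keep only improvements, and stop on stagnation."""
    best_fns, best = agent.functions, evaluate(agent, tasks)
    stall = 0
    for _ in range(epochs):
        agent.functions = propose(agent.functions, tasks)   # LLM edits the function set
        score = evaluate(agent, tasks)
        if score > best:
            best_fns, best, stall = agent.functions, score, 0
        else:
            agent.functions = best_fns                      # roll-back: discard regressions
            stall += 1
            if stall >= patience:                           # early-stop: no recent progress
                break
    return best_fns
```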
We also study the behavior of the agent training regarding aspects like the learning curve and domain transferability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaokun Zhang;Jieyu Zhang;Jiale Liu;Linxin Song;Chi Wang;Ranjay Krishna;Qingyun Wu", "authorids": "~Shaokun_Zhang2;~Jieyu_Zhang1;~Jiale_Liu2;~Linxin_Song1;~Chi_Wang3;~Ranjay_Krishna1;~Qingyun_Wu2", "gender": ";M;;M;M;M;", "homepage": ";https://jieyuz2.github.io/;;https://linxins97.github.io/;http://chiwang.cc;http://ranjaykrishna.com;", "dblp": ";;;330/3920.html;09/404-1;167/3785;", "google_scholar": ";T_INUHUAAAAJ;;https://scholar.google.com.hk/citations?user=IjqXzSwAAAAJ;https://scholar.google.com/citations?hl=en;IcqahyAAAAAJ;", "orcid": ";0000-0002-1846-2436;;0009-0009-7349-8990;;0000-0001-8784-2531;", "linkedin": ";jieyu-zhang-3baaa8154/;;;chi-wang-autogen/;ranjay-krishna-1a344444/;", "or_profile": "~Shaokun_Zhang2;~Jieyu_Zhang1;~Jiale_Liu2;~Linxin_Song1;~Chi_Wang3;~Ranjay_Krishna1;~Qingyun_Wu2", "aff": ";University of Washington;;University of Southern California;Microsoft Research;University of Washington;", "aff_domain": ";cs.washington.edu;;usc.edu;microsoft.com;cs.washington.edu;", "position": ";PhD student;;PhD student;Principal Researcher;Assistant Professor;", "bibtex": "@inproceedings{\nzhang2024offline,\ntitle={Offline Training of Language Model Agents with Functions as Learnable Weights},\nauthor={Shaokun Zhang and Jieyu Zhang and Jiale Liu and Linxin Song and Chi Wang and Ranjay Krishna and Qingyun Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2xbkWiEuR1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1204111, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12139538866756361070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";cs.washington.edu;;usc.edu;microsoft.com;cs.washington.edu;", "author_num": 7, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Washington;University of Southern California;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.washington.edu;https://www.usc.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UW;USC;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Hybrid Inverse Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35087", "id": "2zI2scD2Iz", "proceeding": "https://proceedings.mlr.press/v235/ren24c.html", "pdf": "https://openreview.net/pdf?id=2zI2scD2Iz", "openreview": "https://openreview.net/forum?id=2zI2scD2Iz", "author_site": "Juntao Ren, Gokul Swamy, Steven Wu, J. Bagnell, Sanjiban Choudhury", "tldr": "", "abstract": "The inverse reinforcement learning approach to imitation learning is a double-edged sword. On the one hand, it can enable learning from a smaller number of expert demonstrations with more robustness to error compounding than behavioral cloning approaches. On the other hand, it requires that the learner repeatedly solve a computationally expensive reinforcement learning (RL) problem. Often, much of this computation is wasted searching over policies very dissimilar to the expert's. 
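The hybrid remedy this abstract introduces next (training the inner policy on a mixture of online and expert data) reduces, at its simplest, to how update batches are sampled. A minimal rendering under our own assumptions: buffers are lists of transitions with enough elements, and the 50/50 split is a placeholder, not the paper's setting.

```python
import random

def hybrid_batch(online_buffer, expert_buffer, batch_size, expert_frac=0.5):
    """Sketch: mix expert and online transitions in every policy-update batch,
    pulling the inner RL loop toward expert-visited states."""
    k = min(int(batch_size * expert_frac), len(expert_buffer))
    batch = random.sample(expert_buffer, k)
    batch += random.sample(online_buffer, batch_size - k)
    random.shuffle(batch)
    return batch   # feed to any off-policy update (e.g., a SAC-style critic step)
```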
In this work, we propose using *hybrid RL* -- training on a mixture of online and expert data -- to curtail unnecessary exploration. Intuitively, the expert data focuses the learner on good states during training, which reduces the amount of exploration required to compute a strong policy. Notably, such an approach doesn't need the ability to reset the learner to arbitrary states in the environment, a requirement of prior work in efficient inverse RL. More formally, we derive a reduction from inverse RL to *expert-competitive RL* (rather than globally optimal RL) that allows us to dramatically reduce interaction during the inner policy search loop while maintaining the benefits of the IRL approach. This allows us to derive both model-free and model-based hybrid inverse RL algorithms with strong policy performance guarantees. Empirically, we find that our approaches are significantly more sample efficient than standard inverse RL and several other baselines on a suite of continuous control tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Juntao Ren;Gokul Swamy;Steven Wu;Drew Bagnell;Sanjiban Choudhury", "authorids": "~Juntao_Ren1;~Gokul_Swamy1;~Steven_Wu1;~Drew_Bagnell2;~Sanjiban_Choudhury3", "gender": "M;;;M;M", "homepage": "https://jren03.github.io/;https://gokul.dev/;https://robotwhisperer.org/;https://www.sanjibanchoudhury.com/;https://zstevenwu.com/", "dblp": "340/8425;31/11509;;135/8207;137/8350", "google_scholar": "https://scholar.google.com/citations?hl=en;Sbpra_AAAAAJ;7t4jbPQAAAAJ;https://scholar.google.com/citations?hl=en;MbF6rTEAAAAJ", "orcid": ";;;;", "linkedin": "juntaoren/;;;;zstevenwu/", "or_profile": "~Juntao_Ren1;~Gokul_Swamy1;~Drew_Bagnell2;~Sanjiban_Choudhury3;~Zhiwei_Steven_Wu1", "aff": "Department of Computer Science, Cornell University;Carnegie Mellon University;Carnegie Mellon University;Cornell University;Carnegie Mellon University", "aff_domain": "cs.cornell.edu;cmu.edu;cmu.edu;cornell.edu;cmu.edu", "position": "Undergrad student;PhD student;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nren2024hybrid,\ntitle={Hybrid Inverse Reinforcement Learning},\nauthor={Juntao Ren and Gokul Swamy and Steven Wu and Drew Bagnell and Sanjiban Choudhury},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2zI2scD2Iz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2430198, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12557682072017168776&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 9, "email": "cs.cornell.edu;cmu.edu;cmu.edu;cornell.edu;cmu.edu", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Cornell University;Carnegie Mellon University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.cornell.edu;https://www.cmu.edu", "aff_unique_abbr": "Cornell;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Beyond the Federation: Topology-aware Federated Learning for Generalization to Unseen Clients", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35086", "id": "2zLt2Odckx", "proceeding": "https://proceedings.mlr.press/v235/ma24e.html", "pdf": "https://openreview.net/pdf?id=2zLt2Odckx", "openreview": 
"https://openreview.net/forum?id=2zLt2Odckx", "author_site": "Mengmeng Ma, Tang Li, Xi Peng", "tldr": "", "abstract": "Federated Learning is widely employed to tackle distributed sensitive data. Existing methods primarily focus on addressing in-federation data heterogeneity. However, we observed that they suffer from significant performance degradation when applied to unseen clients for out-of-federation (OOF) generalization. The recent attempts to address generalization to unseen clients generally struggle to scale up to large-scale distributed settings due to high communication or computation costs. Moreover, methods that scale well often demonstrate poor generalization capability. To achieve OOF-resiliency in a scalable manner, we propose Topology-aware Federated Learning (TFL) that leverages client topology - a graph representing client relationships - to effectively train robust models against OOF data. We formulate a novel optimization problem for TFL, consisting of two key modules: Client Topology Learning, which infers the client relationships in a privacy-preserving manner, and Learning on Client Topology, which leverages the learned topology to identify influential clients and harness this information into the FL optimization process to efficiently build robust models. Empirical evaluation on a variety of real-world datasets verifies TFL's superior OOF robustness and scalability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengmeng Ma;Tang Li;Xi Peng", "authorids": "~Mengmeng_Ma1;~Tang_Li1;~Xi_Peng1", "gender": "M;M;Not Specified", "homepage": "https://mengmenm.top/;https://tangli0305.github.io/;https://deep-real.github.io/dr_xipeng.html", "dblp": "150/6565-2;01/1190-5;149/7762-5", "google_scholar": "ycXTxwoAAAAJ;mQFL3DYAAAAJ;DWw4v0kAAAAJ", "orcid": "0000-0002-2804-2718;0000-0002-3134-4151;0000-0002-7772-001X", "linkedin": ";tang-li-613132180/;xi-peng-74b540b6/", "or_profile": "~Mengmeng_Ma1;~Tang_Li1;~Xi_Peng1", "aff": "University of Delaware;University of Delaware;University of Delaware", "aff_domain": "udel.edu;udel.edu;udel.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nma2024beyond,\ntitle={Beyond the Federation: Topology-aware Federated Learning for Generalization to Unseen Clients},\nauthor={Mengmeng Ma and Tang Li and Xi Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2zLt2Odckx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5339733, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11553916373948922544&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "udel.edu;udel.edu;udel.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Delaware", "aff_unique_dep": "", "aff_unique_url": "https://www.udel.edu", "aff_unique_abbr": "UD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Transformers Get Stable: An End-to-End Signal Propagation Theory for Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35085", "id": "30waYPIZUA", "proceeding": "https://proceedings.mlr.press/v235/kedia24a.html", "pdf": "https://openreview.net/pdf?id=30waYPIZUA", "openreview": "https://openreview.net/forum?id=30waYPIZUA", "author_site": 
"Akhil Kedia, Mohd Abbas Zaidi, Sushil Khyalia, JungHo Jung, Harshith Goka, Haejun Lee", "tldr": "", "abstract": "In spite of their huge success, transformer models remain difficult to scale in depth. In this work, we develop a unified signal propagation theory and provide formulae that govern the moments of the forward and backward signal through the transformer model. Our framework can be used to understand and mitigate vanishing/exploding gradients, rank collapse, and instability associated with high attention scores. We also propose DeepScaleLM, an initialization and scaling scheme that conserves unit output/gradient moments throughout the model, enabling the training of very deep models with 1000 layers. We find that transformer models could be much deeper - our deep models with fewer parameters outperform shallow models in Language Modeling, Speech Translation, and Image Classification, across encoder-only, decoder-only and encoder-decoder variants, for both Pre-LN and Post-LN transformers, for multiple datasets and model sizes. These improvements also translate into improved performance on downstream Question Answering tasks and improved robustness for Image Classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Akhil Kedia;Mohd Abbas Zaidi;Sushil Khyalia;JungHo Jung;Harshith Goka;Haejun Lee", "authorids": "~Akhil_Kedia1;~Mohd_Abbas_Zaidi2;~Sushil_Khyalia1;~JungHo_Jung1;~Harshith_Goka1;~Haejun_Lee2", "gender": ";M;M;M;M;M", "homepage": ";https://sushil-khyalia.github.io;https://github.com/diffunity;;;https://mzaidi59.github.io/", "dblp": "264/2699;284/0708;;284/9148;;266/4018.html", "google_scholar": "VvLIqCcAAAAJ;S2akEm4AAAAJ;;5a1FXeEAAAAJ;https://scholar.google.co.kr/citations?hl=ko;i_cSaKgAAAAJ", "orcid": ";;;;;0000-0003-2961-4404", "linkedin": ";sushil-khyalia-41b829160/;https://linkedin.com/in/diffunity;;;mazaidi/", "or_profile": "~Akhil_Kedia1;~Sushil_Khyalia1;~JungHo_Jung1;~Harshith_Goka1;~Haejun_Lee2;~Mohd_Abbas_Zaidi1", "aff": "Samsung;Carnegie Mellon University;Samsung;Samsung Research;;Georgia Institute of Technology", "aff_domain": "samsung.com;andrew.cmu.edu;samsung.com;samsung.com;;gatech.edu", "position": "Researcher;MS student;Researcher;Researcher;;MS student", "bibtex": "@inproceedings{\nkedia2024transformers,\ntitle={Transformers Get Stable: An End-to-End Signal Propagation Theory for Language Models},\nauthor={Akhil Kedia and Mohd Abbas Zaidi and Sushil Khyalia and JungHo Jung and Harshith Goka and Haejun Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=30waYPIZUA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1026465, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10707758445061663632&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "samsung.com;andrew.cmu.edu;samsung.com;samsung.com;;gatech.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Samsung;Carnegie Mellon University;Georgia Institute of Technology", "aff_unique_dep": "Samsung;;", "aff_unique_url": "https://www.samsung.com;https://www.cmu.edu;https://www.gatech.edu", "aff_unique_abbr": "Samsung;CMU;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "REMEDI: Corrective Transformations for Improved Neural 
Entropy Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35084", "id": "321GwKMtxO", "proceeding": "https://proceedings.mlr.press/v235/nilsson24a.html", "pdf": "https://openreview.net/pdf?id=321GwKMtxO", "openreview": "https://openreview.net/forum?id=321GwKMtxO", "author_site": "Viktor Nilsson, Anirban Samaddar, Sandeep Madireddy, Pierre Nyquist", "tldr": "", "abstract": "Information theoretic quantities play a central role in machine learning. The recent surge in the complexity of data and models has increased the demand for accurate estimation of these quantities. However, as the dimension grows the estimation presents significant challenges, with existing methods struggling already in relatively low dimensions. To address this issue, in this work, we introduce REMEDI for efficient and accurate estimation of differential entropy, a fundamental information theoretic quantity. The approach combines the minimization of the cross-entropy for simple, adaptive base models and the estimation of their deviation, in terms of the relative entropy, from the data density. Our approach demonstrates improvement across a broad spectrum of estimation tasks, encompassing entropy estimation on both synthetic and natural data. Further, we extend important theoretical consistency results to a more generalized setting required by our approach. We illustrate how the framework can be naturally extended to information theoretic supervised learning models, with a specific focus on the Information Bottleneck approach. It is demonstrated that the method delivers better accuracy compared to the existing methods in Information Bottleneck. In addition, we explore a natural connection between REMEDI and generative modeling using rejection sampling and Langevin dynamics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Viktor Nilsson;Anirban Samaddar;Sandeep Madireddy;Pierre Nyquist", "authorids": "~Viktor_Nilsson1;~Anirban_Samaddar1;~Sandeep_Madireddy1;~Pierre_Nyquist1", "gender": "M;M;M;M", "homepage": ";https://www.researchgate.net/profile/Anirban-Samaddar;https://www.anl.gov/profile/sandeep-r-madireddy;https://people.kth.se/~pierren/index.html", "dblp": "323/6240;315/4737;205/7527;", "google_scholar": "nNKRP00AAAAJ;https://scholar.google.com/citations?view_op=list_works;jSa5jTAAAAAJ;", "orcid": "0000-0001-5740-5103;0000-0001-9520-8012;0000-0002-0437-8655;0000-0001-8702-2293", "linkedin": "v01/;anirban-samaddar-885267b3/;;", "or_profile": "~Viktor_Nilsson1;~Anirban_Samaddar1;~Sandeep_Madireddy1;~Pierre_Nyquist1", "aff": "Argonne National Laboratory;Argonne National Laboratory;Argonne National Laboratory;KTH Royal Institute of Technology", "aff_domain": "anl.gov;anl.gov;anl.gov;kth.se", "position": "Intern;Postdoc;Computer Scientist;Associate Professor", "bibtex": "@inproceedings{\nnilsson2024remedi,\ntitle={{REMEDI}: Corrective Transformations for Improved Neural Entropy Estimation},\nauthor={Viktor Nilsson and Anirban Samaddar and Sandeep Madireddy and Pierre Nyquist},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=321GwKMtxO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7376889, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12467736128834182089&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "anl.gov;anl.gov;anl.gov;kth.se", 
"author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Argonne National Laboratory;KTH Royal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.anl.gov;https://www.kth.se", "aff_unique_abbr": "ANL;KTH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Sweden" }, { "title": "Learning a Diffusion Model Policy from Rewards via Q-Score Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35083", "id": "35ahHydjXo", "proceeding": "https://proceedings.mlr.press/v235/psenka24a.html", "pdf": "https://openreview.net/pdf?id=35ahHydjXo", "openreview": "https://openreview.net/forum?id=35ahHydjXo", "author_site": "Michael Psenka, Alejandro Escontrela, Pieter Abbeel, Yi Ma", "tldr": "", "abstract": "Diffusion models have become a popular choice for representing actor policies in behavior cloning and offline reinforcement learning. This is due to their natural ability to optimize an expressive class of distributions over a continuous space. However, previous works fail to exploit the score-based structure of diffusion models, and instead utilize a simple behavior cloning term to train the actor, limiting their ability in the actor-critic setting. In this paper, we present a theoretical framework linking the structure of diffusion model policies to a learned Q-function, by linking the structure between the score of the policy to the action gradient of the Q-function. We focus on off-policy reinforcement learning and propose a new policy update method from this theory, which we denote Q-score matching. Notably, this algorithm only needs to differentiate through the denoising model rather than the entire diffusion model evaluation, and converged policies through Q-score matching are implicitly multi-modal and explorative in continuous domains. We conduct experiments in simulated environments to demonstrate the viability of our proposed method and compare to popular baselines. 
Source code is available from the project website: https://michaelpsenka.io/qsm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Psenka;Alejandro Escontrela;Pieter Abbeel;Yi Ma", "authorids": "~Michael_Psenka1;~Alejandro_Escontrela1;~Pieter_Abbeel2;~Yi_Ma4", "gender": ";M;M;M", "homepage": "https://www.michaelpsenka.io;https://www.escontrela.me;https://people.eecs.berkeley.edu/~pabbeel/;http://people.eecs.berkeley.edu/~yima/", "dblp": "306/1500;;;", "google_scholar": "vqYq3egAAAAJ;53OxjmYAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ", "orcid": ";;;", "linkedin": "michael-psenka/;alejandro-escontrela/;;", "or_profile": "~Michael_Psenka1;~Alejandro_Escontrela1;~Pieter_Abbeel2;~Yi_Ma4", "aff": "University of California, Berkeley;University of California, Berkeley;Covariant;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;covariant.ai;berkeley.edu", "position": "PhD student;PhD student;Founder;Full Professor", "bibtex": "@inproceedings{\npsenka2024learning,\ntitle={Learning a Diffusion Model Policy from Rewards via Q-Score Matching},\nauthor={Michael Psenka and Alejandro Escontrela and Pieter Abbeel and Yi Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=35ahHydjXo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 742613, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18392490740254682730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;covariant.ai;berkeley.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, Berkeley;Covariant", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Projection-Free Variance Reduction Methods for Stochastic Constrained Multi-Level Compositional Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35082", "id": "36jWuAmGRC", "proceeding": "https://proceedings.mlr.press/v235/jiang24g.html", "pdf": "https://openreview.net/pdf?id=36jWuAmGRC", "openreview": "https://openreview.net/forum?id=36jWuAmGRC", "author_site": "Wei Jiang, Sifan Yang, Wenhao Yang, Yibo Wang, Yuanyu Wan, Lijun Zhang", "tldr": "", "abstract": "This paper investigates projection-free algorithms for stochastic constrained multi-level optimization. In this context, the objective function is a nested composition of several smooth functions, and the decision set is closed and convex. Existing projection-free algorithms for solving this problem suffer from two limitations: 1) they solely focus on the gradient mapping criterion and fail to match the optimal sample complexities in unconstrained settings; 2) their analysis is exclusively applicable to non-convex functions, without considering convex and strongly convex objectives. To address these issues, we introduce novel projection-free variance reduction algorithms and analyze their complexities under different criteria. For gradient mapping, our complexities improve existing results and match the optimal rates for unconstrained problems. 
For the widely-used Frank-Wolfe gap criterion, we provide theoretical guarantees that align with those for single-level problems. Additionally, by using a stage-wise adaptation, we further obtain complexities for convex and strongly convex functions. Finally, numerical experiments on different tasks demonstrate the effectiveness of our methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Jiang;Sifan Yang;Wenhao Yang;Yibo Wang;Yuanyu Wan;Lijun Zhang", "authorids": "~Wei_Jiang8;~Sifan_Yang2;~Wenhao_Yang3;~Yibo_Wang2;~Yuanyu_Wan1;~Lijun_Zhang1", "gender": "M;M;M;;M;", "homepage": "http://www.lamda.nju.edu.cn/jiangw/?AspxAutoDetectCookieSupport=1;https://www.lamda.nju.edu.cn/yangsf/;http://www.lamda.nju.edu.cn/yangwh/;;https://yuanyuwan.github.io/;", "dblp": ";251/2905;233/4699;;221/3499;", "google_scholar": ";qTISlvMAAAAJ;ycccau7cWYIC;;CEymMc8AAAAJ;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Wei_Jiang8;~Sifan_Yang2;~Wenhao_Yang3;~Yibo_Wang2;~Yuanyu_Wan1;~Lijun_Zhang1", "aff": "Nanjing University;Nanjing University;Nanjing University;;Zhejiang University;", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;;zju.edu.cn;", "position": "PhD student;MS student;PhD student;;Researcher;", "bibtex": "@inproceedings{\njiang2024projectionfree,\ntitle={Projection-Free Variance Reduction Methods for Stochastic Constrained Multi-Level Compositional Optimization},\nauthor={Wei Jiang and Sifan Yang and Wenhao Yang and Yibo Wang and Yuanyu Wan and Lijun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=36jWuAmGRC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1114328, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10272750850830224668&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;;zju.edu.cn;", "author_num": 6, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Nanjing University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "Nanjing U;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "A Nearly Optimal Single Loop Algorithm for Stochastic Bilevel Optimization under Unbounded Smoothness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35081", "id": "36rWa8zVkh", "proceeding": "https://proceedings.mlr.press/v235/gong24d.html", "pdf": "https://openreview.net/pdf?id=36rWa8zVkh", "openreview": "https://openreview.net/forum?id=36rWa8zVkh", "author_site": "Xiaochuan Gong, Jie Hao, Mingrui Liu", "tldr": "", "abstract": "This paper studies the problem of stochastic bilevel optimization where the upper-level function is nonconvex with potentially unbounded smoothness and the lower-level function is strongly convex. This problem is motivated by meta-learning applied to sequential data, such as text classification using recurrent neural networks, where the smoothness constant of the upper-level loss function scales linearly with the gradient norm and can be potentially unbounded. Existing algorithm crucially relies on the nested loop design, which requires significant tuning efforts and is not practical. In this paper, we address this issue by proposing a Single Loop bIlevel oPtimizer (SLIP). 
The proposed algorithm first updates the lower-level variable by a few steps of stochastic gradient descent, and then simultaneously updates the upper-level variable by normalized stochastic gradient descent with momentum and the lower-level variable by stochastic gradient descent. Under standard assumptions, we show that our algorithm finds an $\\epsilon$-stationary point within $\\widetilde{O}(1/\\epsilon^4)$ (here $\\widetilde{O}(\\cdot)$ compresses logarithmic factors of $1/\\epsilon$ and $1/\\delta$, where $\\delta\\in(0,1)$ denotes the failure probability) oracle calls of stochastic gradient or Hessian-vector product, both in expectation and with high probability. This complexity result is nearly optimal up to logarithmic factors without mean-square smoothness of the stochastic gradient oracle. Our proof relies on (i) a refined characterization and control of the lower-level variable and (ii) establishing a novel connection between bilevel optimization and stochastic optimization under distributional drift. Our experiments on various tasks show that our algorithm significantly outperforms strong baselines in bilevel optimization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaochuan Gong;Jie Hao;Mingrui Liu", "authorids": "~Xiaochuan_Gong1;~Jie_Hao3;~Mingrui_Liu2", "gender": "M;M;", "homepage": ";https://jhao6.github.io/JieHao.github.io/;https://mingrliu.github.io", "dblp": ";;", "google_scholar": "byUF8hgAAAAJ;S8ZTkikAAAAJ;KFoEnFQAAAAJ", "orcid": ";;", "linkedin": ";;mingrui-liu-447a2aab/", "or_profile": "~Xiaochuan_Gong1;~Jie_Hao3;~Mingrui_Liu2", "aff": "George Mason University;George Mason University;George Mason University", "aff_domain": "gmu.edu;gmu.edu;gmu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ngong2024a,\ntitle={A Nearly Optimal Single Loop Algorithm for Stochastic Bilevel Optimization under Unbounded Smoothness},\nauthor={Xiaochuan Gong and Jie Hao and Mingrui Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=36rWa8zVkh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6134139, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3474266752131008642&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "gmu.edu;gmu.edu;gmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "George Mason University", "aff_unique_dep": "", "aff_unique_url": "https://www.gmu.edu", "aff_unique_abbr": "GMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Explaining Probabilistic Models with Distributional Values", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35080", "id": "37xFIeYgE0", "proceeding": "https://proceedings.mlr.press/v235/franceschi24a.html", "pdf": "https://openreview.net/pdf?id=37xFIeYgE0", "openreview": "https://openreview.net/forum?id=37xFIeYgE0", "author_site": "Luca Franceschi, Michele Donini, Cedric Archambeau, Matthias Seeger", "tldr": "", "abstract": "A large branch of explainable machine learning is grounded in cooperative game theory. However, research indicates that game-theoretic explanations may mislead or be hard to interpret. We argue that often there is a critical mismatch between what one wishes to explain (e.g.
the output of a classifier) and what current methods such as SHAP explain (e.g. the scalar probability of a class). This paper addresses this gap for probabilistic models by generalising cooperative games and value operators. We introduce the *distributional values*, random variables that track changes in the model output (e.g. flipping of the predicted class) and derive their analytic expressions for games with Gaussian, Bernoulli and Categorical payoffs. We further establish several characterising properties, and show that our framework provides fine-grained and insightful explanations with case studies on vision and language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Franceschi;Michele Donini;Cedric Archambeau;Matthias Seeger", "authorids": "~Luca_Franceschi1;~Michele_Donini1;~Cedric_Archambeau1;~Matthias_Seeger2", "gender": "M;M;M;M", "homepage": ";https://sites.google.com/view/mdonini/;http://www0.cs.ucl.ac.uk/staff/c.archambeau/;https://mseeger.github.io/", "dblp": "203/4457;149/0239;59/1878;43/5832", "google_scholar": "https://scholar.google.co.uk/citations?user=NCls8VMAAAAJ;u3ogi00AAAAJ;pPx5WWIAAAAJ;V-lc8A8AAAAJ", "orcid": ";;;", "linkedin": ";michele-donini-2484734a/;carchambeau/;matthias-seeger-3010b765/?locale=de_DE", "or_profile": "~Luca_Franceschi1;~Michele_Donini1;~Cedric_Archambeau1;~Matthias_Seeger2", "aff": "Amazon Development Center Germany;Amazon;Helsing;Amazon Development Center Germany", "aff_domain": "amazon.de;amazon.com;helsing.ai;amazon.de", "position": "Researcher;Scientist;Principal Researcher;Principal Applied Scientist", "bibtex": "@inproceedings{\nfranceschi2024explaining,\ntitle={Explaining Probabilistic Models with Distributional Values},\nauthor={Luca Franceschi and Michele Donini and Cedric Archambeau and Matthias Seeger},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=37xFIeYgE0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5659563, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3845046583445544182&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "amazon.de;amazon.com;helsing.ai;amazon.de", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Amazon;Helsing", "aff_unique_dep": "Development Center;", "aff_unique_url": "https://www.amazon.de;", "aff_unique_abbr": "Amazon;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United States;" }, { "title": "When Do Skills Help Reinforcement Learning? A Theoretical Analysis of Temporal Abstractions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35079", "id": "39UqOkTjFn", "proceeding": "https://proceedings.mlr.press/v235/li24be.html", "pdf": "https://openreview.net/pdf?id=39UqOkTjFn", "openreview": "https://openreview.net/forum?id=39UqOkTjFn", "author_site": "Zhening Li, Gabriel Poesia, Armando Solar-Lezama", "tldr": "", "abstract": "Skills are temporal abstractions that are intended to improve reinforcement learning (RL) performance through hierarchical RL. Despite our intuition about the properties of an environment that make skills useful, a precise characterization has been absent.
We provide the first such characterization, focusing on the utility of deterministic skills in deterministic sparse-reward environments with finite action spaces. We show theoretically and empirically that RL performance gain from skills is worse in environments where solutions to states are less compressible. Additional theoretical results suggest that skills benefit exploration more than they benefit learning from existing experience, and that using unexpressive skills such as macroactions may worsen RL performance. We hope our findings can guide research on automatic skill discovery and help RL practitioners better decide when and how to use skills.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhening Li;Gabriel Poesia;Armando Solar-Lezama", "authorids": "~Zhening_Li1;~Gabriel_Poesia1;~Armando_Solar-Lezama1", "gender": ";M;M", "homepage": "https://people.csail.mit.edu/zli11010/;https://gpoesia.com;https://people.csail.mit.edu/asolar/", "dblp": ";150/2695.html;95/6919", "google_scholar": "3pEDdyoAAAAJ;as5iYn4AAAAJ;https://scholar.google.com.tw/citations?user=8BX3BokAAAAJ", "orcid": "0000-0002-6059-4985;;", "linkedin": ";;", "or_profile": "~Zhening_Li1;~Gabriel_Poesia1;~Armando_Solar-Lezama1", "aff": "Massachusetts Institute of Technology;Stanford University;Massachusetts Institute of Technology", "aff_domain": "mit.edu;stanford.edu;mit.edu", "position": "Undergrad student;PhD student;Full Professor", "bibtex": "@inproceedings{\nli2024when,\ntitle={When Do Skills Help Reinforcement Learning? A Theoretical Analysis of Temporal Abstractions},\nauthor={Zhening Li and Gabriel Poesia and Armando Solar-Lezama},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=39UqOkTjFn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 521228, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lYHLeUJy2AUJ:scholar.google.com/&scioq=When+Do+Skills+Help+Reinforcement+Learning%3F+A+Theoretical+Analysis+of+Temporal+Abstractions&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "mit.edu;stanford.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "MIT;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Offline Multi-Objective Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35078", "id": "3AuoStfUIH", "proceeding": "https://proceedings.mlr.press/v235/xue24b.html", "pdf": "https://openreview.net/pdf?id=3AuoStfUIH", "openreview": "https://openreview.net/forum?id=3AuoStfUIH", "author_site": "Ke Xue, Rong-Xi Tan, Xiaobin Huang, Chao Qian", "tldr": "", "abstract": "Offline optimization aims to maximize a black-box objective function with a static dataset and has wide applications. In addition to the objective function being black-box and expensive to evaluate, numerous complex real-world problems entail optimizing multiple conflicting objectives, i.e., multi-objective optimization (MOO). Nevertheless, offline MOO has not progressed as much as offline single-objective optimization (SOO), mainly due to the lack of benchmarks like Design-Bench for SOO. 
To bridge this gap, we propose a first benchmark for offline MOO, covering a range of problems from synthetic to real-world tasks. This benchmark provides tasks, datasets, and open-source examples, which can serve as a foundation for method comparisons and advancements in offline MOO. Furthermore, we analyze how the current related methods can be adapted to offline MOO from four fundamental perspectives, including data, model architecture, learning algorithm, and search algorithm. Empirical results show improvements over the best value of the training set, demonstrating the effectiveness of offline MOO methods. As no particular method stands out significantly, there is still an open challenge in further enhancing the effectiveness of offline MOO. We finally discuss future challenges for offline MOO, with the hope of shedding some light on this emerging field. Our code is available at https://github.com/lamda-bbo/offline-moo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ke Xue;Rongxi Tan;Xiaobin Huang;Chao Qian", "authorids": "~Ke_Xue1;~Rongxi_Tan1;~Xiaobin_Huang2;~Chao_Qian1", "gender": "M;;M;M", "homepage": "http://www.lamda.nju.edu.cn/xuek/;https://trxcc.github.io/;;http://www.lamda.nju.edu.cn/qianc/", "dblp": "93/2469-1;379/5944.html;;84/8508-1", "google_scholar": "78bZVOwAAAAJ;m82W6XUAAAAJ;;", "orcid": "0000-0001-6789-2670;;;", "linkedin": ";;https://www.linkedin.cn/incareer/in/%E6%99%93%E6%96%8C-%E9%BB%84-1b3787221;", "or_profile": "~Ke_Xue1;~Rongxi_Tan1;~Xiaobin_Huang2;~Chao_Qian1", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing university", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu", "position": "PhD student;Undergrad student;MS student;Full Professor", "bibtex": "@inproceedings{\nxue2024offline,\ntitle={Offline Multi-Objective Optimization},\nauthor={Ke Xue and Rongxi Tan and Xiaobin Huang and Chao Qian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3AuoStfUIH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2933590, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15808367635685190186&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Single-Trajectory Distributionally Robust Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35077", "id": "3B6vmW2L80", "proceeding": "https://proceedings.mlr.press/v235/liang24d.html", "pdf": "https://openreview.net/pdf?id=3B6vmW2L80", "openreview": "https://openreview.net/forum?id=3B6vmW2L80", "author_site": "Zhipeng Liang, Xiaoteng Ma, Jose Blanchet, Jun Yang, Jiheng Zhang, Zhengyuan Zhou", "tldr": "", "abstract": "To mitigate the limitation that the classical reinforcement learning (RL) framework heavily relies on identical training and test environments, Distributionally Robust RL (DRRL) has been proposed to enhance performance across a range of environments, possibly including unknown test environments. 
As a price for robustness gain, DRRL involves optimizing over a set of distributions, which is inherently more challenging than optimizing over a fixed distribution in the non-robust case. Existing DRRL algorithms are either model-based or fail to learn from a single sample trajectory. In this paper, we design a first fully model-free DRRL algorithm, called distributionally robust Q-learning with single trajectory (DRQ). We delicately design a multi-timescale framework to fully utilize each incrementally arriving sample and directly learn the optimal distributionally robust policy without modeling the environment, thus the algorithm can be trained along a single trajectory in a model-free fashion. Despite the algorithm's complexity, we provide asymptotic convergence guarantees by generalizing classical stochastic approximation tools. Comprehensive experimental results demonstrate the superior robustness and sample complexity of our proposed algorithm, compared to non-robust methods and other robust RL algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhipeng Liang;Xiaoteng Ma;Jose Blanchet;Jun Yang;Jiheng Zhang;Zhengyuan Zhou", "authorids": "~Zhipeng_Liang1;~Xiaoteng_Ma1;~Jose_Blanchet1;~Jun_Yang6;~Jiheng_Zhang1;~Zhengyuan_Zhou2", "gender": ";M;M;M;;M", "homepage": ";https://xtma.github.io/;https://web.stanford.edu/~jblanche/;;https://reijz.github.io;https://scholar.google.com/citations?user=hiGI9v0AAAAJ&hl=en", "dblp": "225/7683.html;238/3249;75/5093.html;;13/7602;125/5270", "google_scholar": ";CeDFnNMAAAAJ;https://scholar.google.co.in/citations?user=O24CcQQAAAAJ;ZrgN9ssAAAAJ;;", "orcid": "0000-0002-3101-5673;0000-0002-7250-6268;;;;", "linkedin": "zhipeng-liang-958977204/;;jose-blanchet;;;", "or_profile": "~Zhipeng_Liang1;~Xiaoteng_Ma1;~Jose_Blanchet1;~Jun_Yang6;~Jiheng_Zhang1;~Zhengyuan_Zhou2", "aff": "Hong Kong University of Science and Technology;;Stanford University;Tsinghua University;Hong Kong University of Science and Technology;New York University", "aff_domain": "hkust.edu;;stanford.edu;tsinghua.edu.cn;ust.hk;nyu.edu", "position": "PhD student;;Professor;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nliang2024singletrajectory,\ntitle={Single-Trajectory Distributionally Robust Reinforcement Learning},\nauthor={Zhipeng Liang and Xiaoteng Ma and Jose Blanchet and Jun Yang and Jiheng Zhang and Zhengyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3B6vmW2L80}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5245130, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15318649337541386877&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "hkust.edu;;stanford.edu;tsinghua.edu.cn;ust.hk;nyu.edu", "author_num": 6, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Hong Kong University of Science and Technology;Stanford University;Tsinghua University;New York University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ust.hk;https://www.stanford.edu;https://www.tsinghua.edu.cn;https://www.nyu.edu", "aff_unique_abbr": "HKUST;Stanford;THU;NYU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong SAR;Stanford;", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Rethinking Generative Large Language Model Evaluation for Semantic
Comprehension", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35076", "id": "3Cp042s1Nc", "proceeding": "https://proceedings.mlr.press/v235/wei24c.html", "pdf": "https://openreview.net/pdf?id=3Cp042s1Nc", "openreview": "https://openreview.net/forum?id=3Cp042s1Nc", "author_site": "Fangyun Wei, Xi Chen, Lin Luo", "tldr": "", "abstract": "Despite their sophisticated capabilities, large language models (LLMs) encounter a major hurdle in effective assessment. This paper first revisits the prevalent evaluation method\u2014multiple choice question answering (MCQA), which allows for straightforward accuracy measurement. Through a comprehensive evaluation of 24 models across 11 benchmarks, we highlight several potential drawbacks of MCQA, for instance, the inconsistency between the MCQA evaluation and the generation of open-ended responses in practical scenarios. In response, we introduce an RWQ-Elo rating system, engaging 24 LLMs such as GPT-4, GPT-3.5, Google-Gemini-Pro and LLaMA-1/-2, in a two-player competitive format, with GPT-4 serving as the judge. Each LLM receives an Elo rating thereafter. This system is designed to mirror real-world usage, and for this purpose, we have compiled a new benchmark called ``Real-world questions'' (RWQ), comprising 20,772 authentic user inquiries. Additionally, we thoroughly analyze the characteristics of our system and compare it with prior leaderboards like Alpaca Eval and MT-Bench. Our analysis reveals the stability of our RWQ-Elo system, the feasibility of registering new models, and its potential to reshape LLM leaderboards.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fangyun Wei;Xi Chen;Lin Luo", "authorids": "~Fangyun_Wei1;~Xi_Chen55;~Lin_Luo7", "gender": "M;F;F", "homepage": ";;", "dblp": "161/2636;;", "google_scholar": "-ncz2s8AAAAJ;qkXqHgcAAAAJ;", "orcid": ";;", "linkedin": ";;%E7%90%B3-%E7%BD%97-6661412b2/", "or_profile": "~Fangyun_Wei1;~Xi_Chen55;~Lin_Luo7", "aff": "Microsoft Research;Microsoft;Microsoft", "aff_domain": "microsoft.com;microsoft.com;microsoft.com", "position": "Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nwei2024rethinking,\ntitle={Rethinking Generative Large Language Model Evaluation for Semantic Comprehension},\nauthor={Fangyun Wei and Xi Chen and Lin Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3Cp042s1Nc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5321761, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15749010224246216380&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "microsoft.com;microsoft.com;microsoft.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Logistic Variational Bayes Revisited", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35075", "id": "3FBO41d4T2", "proceeding": "https://proceedings.mlr.press/v235/komodromos24a.html", "pdf": "https://openreview.net/pdf?id=3FBO41d4T2", "openreview": "https://openreview.net/forum?id=3FBO41d4T2", "author_site": "Michael Komodromos, Marina 
Evangelou, Sarah Filippi", "tldr": "", "abstract": "Variational logistic regression is a popular method for approximate Bayesian inference seeing wide-spread use in many areas of machine learning including: Bayesian optimization, reinforcement learning and multi-instance learning to name a few. However, due to the intractability of the Evidence Lower Bound, authors have turned to the use of Monte Carlo, quadrature or bounds to perform inference, methods which are costly or give poor approximations to the true posterior. In this paper we introduce a new bound for the expectation of softplus function and subsequently show how this can be applied to variational logistic regression and Gaussian process classification. Unlike other bounds, our proposal does not rely on extending the variational family, or introducing additional parameters to ensure the bound is tight. In fact, we show that this bound is tighter than the state-of-the-art, and that the resulting variational posterior achieves state-of-the-art performance, whilst being significantly faster to compute than Monte-Carlo methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Komodromos;Marina Evangelou;Sarah Lucie Filippi", "authorids": "~Michael_Komodromos1;~Marina_Evangelou1;~Sarah_Lucie_Filippi1", "gender": "M;F;F", "homepage": "http://mkomod.github.io;https://www.imperial.ac.uk/people/m.evangelou;", "dblp": ";;", "google_scholar": ";;HhMJevQAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Michael_Komodromos1;~Marina_Evangelou1;~Sarah_Lucie_Filippi1", "aff": "Imperial College London;Imperial College London;Imperial College London", "aff_domain": "ic.ac.uk;imperial.ac.uk;imperial.ac.uk", "position": "PhD student;Senior Lecturer;Associate Professor", "bibtex": "@inproceedings{\nkomodromos2024logistic,\ntitle={Logistic Variational Bayes Revisited},\nauthor={Michael Komodromos and Marina Evangelou and Sarah Lucie Filippi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3FBO41d4T2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2312183, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:99ZTG0ki-38J:scholar.google.com/&scioq=Logistic+Variational+Bayes+Revisited&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "ic.ac.uk;imperial.ac.uk;imperial.ac.uk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Neuroexplicit Diffusion Models for Inpainting of Optical Flow Fields", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35074", "id": "3FKEtlX4aM", "proceeding": "https://proceedings.mlr.press/v235/fischer24b.html", "pdf": "https://openreview.net/pdf?id=3FKEtlX4aM", "openreview": "https://openreview.net/forum?id=3FKEtlX4aM", "author_site": "Tom Fischer, Pascal Peter, Joachim Weickert, Eddy Ilg", "tldr": "", "abstract": "Deep learning has revolutionized the field of computer vision by introducing large scale neural networks with millions of parameters. Training these networks requires massive datasets and leads to intransparent models that can fail to generalize. 
At the other extreme, models designed from partial differential equations (PDEs) embed specialized domain knowledge into mathematical equations and usually rely on few manually chosen hyperparameters. This makes them transparent by construction and if designed and calibrated carefully, they can generalize well to unseen scenarios. In this paper, we show how to bring model- and data-driven approaches together by combining the explicit PDE-based approaches with convolutional neural networks to obtain the best of both worlds. We illustrate a joint architecture for the task of inpainting optical flow fields and show that the combination of model- and data-driven modeling leads to an effective architecture. Our model outperforms both fully explicit and fully data-driven baselines in terms of reconstruction quality, robustness and amount of required training data. Averaging the endpoint error across different mask densities, our method outperforms the explicit baselines by 11-27%, the GAN baseline by 47% and the Probabilistic Diffusion baseline by 42%. With that, our method sets a new state of the art for inpainting of optical flow fields from random masks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tom Fischer;Pascal Peter;Joachim Weickert;Eddy Ilg", "authorids": "~Tom_Fischer1;~Pascal_Peter1;~Joachim_Weickert1;~Eddy_Ilg3", "gender": "M;M;M;M", "homepage": "https://cvmp.cs.uni-saarland.de/people/#tom-fischer;https://www.mia.uni-saarland.de/peter/index.shtml;https://www.mia.uni-saarland.de/weickert/index.shtml;https://www.utn.de/departments/department-engineering/cvmp-lab/", "dblp": ";134/3090;w/JoachimWeickert.html;151/9307", "google_scholar": "idj3nF4AAAAJ;y4TcrHUAAAAJ;IWwCuGAAAAAJ;MYvSvGsAAAAJ", "orcid": "0009-0009-6776-2767;;;", "linkedin": "https://linkedin.com/in/tom-fischer-6209a2239;pascal-peter/;;eddy-ilg/", "or_profile": "~Tom_Fischer1;~Pascal_Peter1;~Joachim_Weickert1;~Eddy_Ilg3", "aff": "Universit\u00e4t des Saarlandes;Universit\u00e4t des Saarlandes;Universit\u00e4t des Saarlandes;Universit\u00e4t des Saarlandes", "aff_domain": "uni-saarland.de;uni-saarland.de;uni-saarland.de;uni-saarland.de", "position": "PhD student;Lecturer;Professor;Associate Professor", "bibtex": "@inproceedings{\nfischer2024neuroexplicit,\ntitle={Neuroexplicit Diffusion Models for Inpainting of Optical Flow Fields},\nauthor={Tom Fischer and Pascal Peter and Joachim Weickert and Eddy Ilg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3FKEtlX4aM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7555401, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UscZFzNJEpgJ:scholar.google.com/&scioq=Neuroexplicit+Diffusion+Models+for+Inpainting+of+Optical+Flow+Fields&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "uni-saarland.de;uni-saarland.de;uni-saarland.de;uni-saarland.de", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Universit\u00e4t des Saarlandes", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-saarland.de", "aff_unique_abbr": "UDS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Temporal Spiking Neural Networks with Synaptic Delay for Graph Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35073", "id":
"3FeYlKIPr3", "proceeding": "https://proceedings.mlr.press/v235/xiao24f.html", "pdf": "https://openreview.net/pdf?id=3FeYlKIPr3", "openreview": "https://openreview.net/forum?id=3FeYlKIPr3", "author_site": "Mingqing Xiao, Yixin Zhu, Di He, Zhouchen Lin", "tldr": "", "abstract": "Spiking neural networks (SNNs) are investigated as biologically inspired models of neural computation, distinguished by their computational capability and energy efficiency due to precise spiking times and sparse spikes with event-driven computation. A significant question is how SNNs can emulate human-like graph-based reasoning of concepts and relations, especially leveraging the temporal domain optimally. This paper reveals that SNNs, when amalgamated with synaptic delay and temporal coding, are proficient in executing (knowledge) graph reasoning. It is elucidated that spiking time can function as an additional dimension to encode relation properties via a neural-generalized path formulation. Empirical results highlight the efficacy of temporal delay in relation processing and showcase exemplary performance in diverse graph reasoning tasks. The spiking model is theoretically estimated to achieve $20\\times$ energy savings compared to non-spiking counterparts, deepening insights into the capabilities and potential of biologically inspired SNNs for efficient reasoning. The code is available at https://github.com/pkuxmq/GRSNN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingqing Xiao;Yixin Zhu;Di He;Zhouchen Lin", "authorids": "~Mingqing_Xiao1;~Yixin_Zhu1;~Di_He1;~Zhouchen_Lin1", "gender": "M;M;M;M", "homepage": "https://pkuxmq.github.io/;https://yzhu.io/;https://dihe-pku.github.io/;https://zhouchenlin.github.io", "dblp": "19/2900-2;91/1103-1.html;74/184;l/ZhouchenLin", "google_scholar": "Hvj-WrwAAAAJ;qG9l6JEAAAAJ;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ", "orcid": "0000-0001-6191-7726;0000-0001-7024-1545;;0000-0003-1493-7569", "linkedin": ";;;", "or_profile": "~Mingqing_Xiao1;~Yixin_Zhu1;~Di_He1;~Zhouchen_Lin1", "aff": "Peking University;Peking University;Microsoft;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;microsoft.com;pku.edu.cn", "position": "PhD student;Assistant Professor;Senior Researcher;Professor", "bibtex": "@inproceedings{\nxiao2024temporal,\ntitle={Temporal Spiking Neural Networks with Synaptic Delay for Graph Reasoning},\nauthor={Mingqing Xiao and Yixin Zhu and Di He and Zhouchen Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3FeYlKIPr3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2732721, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=80739753939667416&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;pku.edu.cn;microsoft.com;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Peking University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "Peking U;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Learning to Reach Goals via Diffusion", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/35072", "id": "3JhmHCVPa8", "proceeding": "https://proceedings.mlr.press/v235/jain24b.html", "pdf": "https://openreview.net/pdf?id=3JhmHCVPa8", "openreview": "https://openreview.net/forum?id=3JhmHCVPa8", "author_site": "Vineet Jain, Siamak Ravanbakhsh", "tldr": "", "abstract": "We present a novel perspective on goal-conditioned reinforcement learning by framing it within the context of denoising diffusion models. Analogous to the diffusion process, where Gaussian noise is used to create random trajectories that walk away from the data manifold, we construct trajectories that move away from potential goal states. We then learn a goal-conditioned policy to reverse these deviations, analogous to the score function. This approach, which we call Merlin, can reach specified goals from arbitrary initial states without learning a separate value function. In contrast to recent works utilizing diffusion models in offline RL, Merlin stands out as the first method to perform diffusion in the state space, requiring only one \"denoising\" iteration per environment step. We experimentally validate our approach in various offline goal-reaching tasks, demonstrating substantial performance enhancements compared to state-of-the-art methods while improving computational efficiency over other diffusion-based RL methods by an order of magnitude. Our results suggest that this perspective on diffusion for RL is a simple and scalable approach for sequential decision making.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vineet Jain;Siamak Ravanbakhsh", "authorids": "~Vineet_Jain1;~Siamak_Ravanbakhsh1", "gender": ";", "homepage": ";", "dblp": "92/3653;", "google_scholar": "https://scholar.google.com/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Vineet_Jain1;~Siamak_Ravanbakhsh1", "aff": "McGill University;", "aff_domain": "mcgill.ca;", "position": "PhD student;", "bibtex": "@inproceedings{\njain2024learning,\ntitle={Learning to Reach Goals via Diffusion},\nauthor={Vineet Jain and Siamak Ravanbakhsh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3JhmHCVPa8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9974933, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17073276869658529521&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 6, "email": "mcgill.ca;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "McGill University", "aff_unique_dep": "", "aff_unique_url": "https://www.mcgill.ca", "aff_unique_abbr": "McGill", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "title": "Local Causal Structure Learning in the Presence of Latent Variables", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35071", "id": "3KMMPxrAk5", "proceeding": "https://proceedings.mlr.press/v235/xie24f.html", "pdf": "https://openreview.net/pdf?id=3KMMPxrAk5", "openreview": "https://openreview.net/forum?id=3KMMPxrAk5", "author_site": "Feng Xie, Zheng Li, Peng Wu, Yan Zeng, Chunchen LIU, zhi geng", "tldr": "", "abstract": "Discovering causal relationships from observational data, particularly in the presence of latent variables, poses a challenging problem. 
While current local structure learning methods have proven effective and efficient when the focus lies solely on the local relationships of a target variable, they operate under the assumption of causal sufficiency. This assumption implies that all the common causes of the measured variables are observed, leaving no room for latent variables. Such a premise can be easily violated in various real-world applications, resulting in inaccurate structures that may adversely impact downstream tasks. In light of this, our paper delves into the primary investigation of locally identifying potential parents and children of a target from observational data that may include latent variables. Specifically, we harness the causal information from m-separation and V-structures to derive theoretical consistency results, effectively bridging the gap between global and local structure learning. Together with the newly developed stop rules, we present a principled method for determining whether a variable is a direct cause or effect of a target. Further, we theoretically demonstrate the correctness of our approach under the standard causal Markov and faithfulness conditions, with infinite samples. Experimental results on both synthetic and real-world data validate the effectiveness and efficiency of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feng Xie;Zheng Li;Peng Wu;Yan Zeng;Chunchen LIU;Zhi Geng", "authorids": "~Feng_Xie1;~Zheng_Li27;~Peng_Wu5;~Yan_Zeng2;~Chunchen_LIU2;~Zhi_Geng1", "gender": "M;;M;;F;M", "homepage": "https://fengxie.site/;;https://pengwu.site/;https://scholar.google.com/citations?user=XyxLHCAAAAAJ&hl=zh-CN;;https://stxy.btbu.edu.cn/szdw/bssds/34339356074b408c8650309f05f24558.htm", "dblp": "11/4605-2;;15/6146-12;83/4665-2;;", "google_scholar": "stLFCtQAAAAJ;;https://scholar.google.com/citations?view_op=list_works;XyxLHCAAAAAJ;IkbNsd4AAAAJ;", "orcid": "0000-0001-7229-3955;;0000-0001-7154-8880;0000-0001-7721-2560;;", "linkedin": ";;;;chunchen-liu-76915766/;", "or_profile": "~Feng_Xie1;~Zheng_Li27;~Peng_Wu5;~Yan_Zeng2;~Chunchen_LIU2;~Zhi_Geng1", "aff": "Beijing Technology and Business University;;Beijing Technology and Business University;Beijing Technology and Business University;Alibaba Group;School of mathematical Science, Peking University, Peking University", "aff_domain": "btbu.edu.cn;;btbu.edu.cn;btbu.edu.cn;alibaba-inc.com;math.pku.edu.cn", "position": "Associate Professor;;Associate Professor;Lecturer;Researcher;Full Professor", "bibtex": "@inproceedings{\nxie2024local,\ntitle={Local Causal Structure Learning in the Presence of Latent Variables},\nauthor={Feng Xie and Zheng Li and Peng Wu and Yan Zeng and Chunchen LIU and Zhi Geng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3KMMPxrAk5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 477406, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3591942603865640611&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "btbu.edu.cn;;btbu.edu.cn;btbu.edu.cn;alibaba-inc.com;math.pku.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Beijing Technology and Business University;Alibaba Group;Peking University", "aff_unique_dep": ";;School of Mathematical Sciences", "aff_unique_url": "http://www.btbu.edu.cn;https://www.alibaba.com;http://www.pku.edu.cn", "aff_unique_abbr": 
"BTBU;Alibaba;PKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Peking", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Navigating Scaling Laws: Compute Optimality in Adaptive Model Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35070", "id": "3KxPo62PYn", "proceeding": "https://proceedings.mlr.press/v235/anagnostidis24a.html", "pdf": "https://openreview.net/pdf?id=3KxPo62PYn", "openreview": "https://openreview.net/forum?id=3KxPo62PYn", "author_site": "Sotiris Anagnostidis, Gregor Bachmann, Imanol Schlag, Thomas Hofmann", "tldr": "", "abstract": "In recent years, the state-of-the-art in deep learning has been dominated by very large models that have been pre-trained on vast amounts of data. The paradigm is very simple: investing more computational resources (optimally) leads to better performance, and even predictably so; neural scaling laws have been derived that accurately forecast the performance of a network for a desired level of compute. This leads to the notion of a 'compute-optimal' model, i.e. a model that allocates a given level of compute during training optimally to maximize performance. In this work, we extend the concept of optimality by allowing for an 'adaptive' model, i.e. a model that can change its shape during training. By doing so, we can design adaptive models that optimally traverse between the underlying scaling laws and outpace their `static' counterparts, leading to a significant reduction in the required compute to reach a given target performance. We show that our approach generalizes across modalities and different shape parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sotiris Anagnostidis;Gregor Bachmann;Imanol Schlag;Thomas Hofmann", "authorids": "~Sotiris_Anagnostidis1;~Gregor_Bachmann1;~Imanol_Schlag3;~Thomas_Hofmann1", "gender": "M;M;M;M", "homepage": ";http://www.da.inf.ethz.ch/people/GregorBachmann;;http://www.da.inf.ethz.ch/", "dblp": "286/1763;;213/4144;h/ThHofmann", "google_scholar": "qjzTKWUAAAAJ;bbGqqloAAAAJ;https://scholar.google.ch/citations?user=nFQJEskAAAAJ;T3hAyLkAAAAJ", "orcid": ";;;", "linkedin": "sotiris-anagnostidis-b064a5129/;;;thomas-hofmann-1ab2402/", "or_profile": "~Sotiris_Anagnostidis1;~Gregor_Bachmann1;~Imanol_Schlag3;~Thomas_Hofmann1", "aff": "ETH Zurich;Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "inf.ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nanagnostidis2024navigating,\ntitle={Navigating Scaling Laws: Compute Optimality in Adaptive Model Training},\nauthor={Sotiris Anagnostidis and Gregor Bachmann and Imanol Schlag and Thomas Hofmann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3KxPo62PYn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3324322, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3612406235909448672&as_sdt=5,39&sciodt=0,39&hl=en", "gs_version_total": 7, "email": "inf.ethz.ch;ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Expand-and-Cluster: Parameter Recovery of Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35069", "id": "3MIuPRJYwf", "proceeding": "https://proceedings.mlr.press/v235/martinelli24a.html", "pdf": "https://openreview.net/pdf?id=3MIuPRJYwf", "openreview": "https://openreview.net/forum?id=3MIuPRJYwf", "author_site": "Flavio Martinelli, Berfin Simsek, Wulfram Gerstner, Johanni Brea", "tldr": "", "abstract": "Can we identify the weights of a neural network by probing its input-output mapping? At first glance, this problem seems to have many solutions because of permutation, overparameterisation and activation function symmetries. Yet, we show that the incoming weight vector of each neuron is identifiable up to sign or scaling, depending on the activation function. Our novel method 'Expand-and-Cluster\u2019 can identify layer sizes and weights of a target network for all commonly used activation functions. Expand-and-Cluster consists of two phases: (i) to relax the non-convex optimisation problem, we train multiple overparameterised student networks to best imitate the target function; (ii) to reverse engineer the target network's weights, we employ an ad-hoc clustering procedure that reveals the learnt weight vectors shared between students -- these correspond to the target weight vectors. We demonstrate successful weights and size recovery of trained shallow and deep networks with less than 10% overhead in the layer size and describe an 'ease-of-identifiability' axis by analysing 150 synthetic problems of variable difficulty.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Flavio Martinelli;Berfin Simsek;Wulfram Gerstner;Johanni Brea", "authorids": "~Flavio_Martinelli1;~Berfin_Simsek1;~Wulfram_Gerstner1;~Johanni_Brea1", "gender": "M;F;;", "homepage": ";https://www.bsimsek.com/;https://lcnwww.epfl.ch/gerstner/;", "dblp": "251/5678;244/2455;g/WGerstner;", "google_scholar": "DabSKBgAAAAJ;Ysi38KIAAAAJ;https://scholar.google.ch/citations?user=vSd2RnEAAAAJ;", "orcid": ";;0000-0002-4344-2189;", "linkedin": ";;;", "or_profile": "~Flavio_Martinelli1;~Berfin_Simsek1;~Wulfram_Gerstner1;~Johanni_Brea1", "aff": "EPFL - EPF Lausanne;New York University;EPFL - EPF Lausanne;", "aff_domain": "epfl.ch;nyu.edu;epfl.ch;", "position": "PhD student;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\nmartinelli2024expandandcluster,\ntitle={Expand-and-Cluster: Parameter Recovery of Neural Networks},\nauthor={Flavio Martinelli and Berfin Simsek and Wulfram Gerstner and Johanni Brea},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3MIuPRJYwf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7422794, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15811448903509550402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "epfl.ch;nyu.edu;epfl.ch;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.nyu.edu", "aff_unique_abbr": "EPFL;NYU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0", "aff_country_unique": 
"Switzerland;United States" }, { "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35068", "id": "3MW8GKNyzI", "proceeding": "https://proceedings.mlr.press/v235/chiang24b.html", "pdf": "https://openreview.net/pdf?id=3MW8GKNyzI", "openreview": "https://openreview.net/forum?id=3MW8GKNyzI", "author_site": "Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Angelopoulos, Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael Jordan, Joseph E Gonzalez, Ion Stoica", "tldr": "", "abstract": "Large Language Models (LLMs) have unlocked new capabilities and applications; however, evaluating the alignment with human preferences still poses significant challenges. To address this issue, we introduce Chatbot Arena, an open platform for evaluating LLMs based on human preferences. Our methodology employs a pairwise comparison approach and leverages input from a diverse user base through crowdsourcing. The platform has been operational for several months, amassing over 240K votes. This paper describes the platform, analyzes the data we have collected so far, and explains the tried-and-true statistical methods we are using for efficient and accurate evaluation and ranking of models. We confirm that the crowdsourced questions are sufficiently diverse and discriminating and that the crowd-sourced human votes are in good agreement with those of expert raters. These analyses collectively establish a robust foundation for the credibility of Chatbot Arena. Because of its unique value and openness, Chatbot Arena has emerged as one of the most referenced LLM leaderboards, widely cited by leading LLM developers and companies. The platform is publicly available at https://chat.lmsys.org.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei-Lin Chiang;Lianmin Zheng;Ying Sheng;Anastasios Nikolas Angelopoulos;Tianle Li;Dacheng Li;Banghua Zhu;Hao Zhang;Michael Jordan;Joseph E. 
Gonzalez;Ion Stoica", "authorids": "~Wei-Lin_Chiang1;~Lianmin_Zheng2;~Ying_Sheng1;~Anastasios_Nikolas_Angelopoulos1;~Tianle_Li2;~Dacheng_Li1;~Banghua_Zhu1;~Hao_Zhang2;~Michael_Jordan1;~Joseph_E._Gonzalez1;~Ion_Stoica1", "gender": ";M;F;M;M;;M;M;M;M;M", "homepage": "https://infwinston.github.io/;http://lmzheng.net/;https://sites.google.com/view/yingsheng;http://angelopoulos.ai;;;https://people.eecs.berkeley.edu/~banghua/;https://cseweb.ucsd.edu/~haozhang/;http://www.cs.berkeley.edu/~jordan/;http://eecs.berkeley.edu/~jegonzal;http://people.eecs.berkeley.edu/~istoica/", "dblp": "174/2148;211/7027;262/6232.html;;;;204/5394;55/2270-25;j/MichaelIJordan;61/8262;s/IonStoica", "google_scholar": "https://scholar.google.com/citations?hl=en;_7Q8uIYAAAAJ;xMhGYpgAAAAJ;nfX25MMAAAAJ;1M79iLwAAAAJ;;https://scholar.google.com/citations?hl=en;H1d4BS8AAAAJ;https://scholar.google.com.tw/citations?user=yxUduqMAAAAJ;https://scholar.google.com.tw/citations?user=gM2WW9UAAAAJ;vN-is70AAAAJ", "orcid": ";;0000-0002-1883-2126;;;;;;0000-0001-8935-817X;0000-0003-2921-956X;", "linkedin": ";;;anastasiosa/;tianleli/;;;;;;ionstoica", "or_profile": "~Wei-Lin_Chiang1;~Lianmin_Zheng2;~Ying_Sheng1;~Anastasios_Nikolas_Angelopoulos1;~Tianle_Li2;~Dacheng_Li1;~Banghua_Zhu1;~Hao_Zhang2;~Michael_Jordan1;~Joseph_E._Gonzalez1;~Ion_Stoica1", "aff": "University of California, Berkeley;University of California, Berkeley;Stanford University;University of California, Berkeley;University of California, Berkeley;;University of California, Berkeley;Carnegie Mellon University;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;stanford.edu;berkeley.edu;berkeley.edu;;berkeley.edu;cmu.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;PhD student;PhD student;PhD student;Undergrad student;;PhD student;PhD student;Full Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nchiang2024chatbot,\ntitle={Chatbot Arena: An Open Platform for Evaluating {LLM}s by Human Preference},\nauthor={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Banghua Zhu and Hao Zhang and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3MW8GKNyzI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 958126, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 554, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3554688913259716790&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;stanford.edu;berkeley.edu;berkeley.edu;;berkeley.edu;cmu.edu;berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 11, "aff_unique_index": "0;0;1;0;0;0;2;0;0;0", "aff_unique_norm": "University of California, Berkeley;Stanford University;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu;https://www.cmu.edu", "aff_unique_abbr": "UC Berkeley;Stanford;CMU", "aff_campus_unique_index": "0;0;1;0;0;0;0;0;0", "aff_campus_unique": "Berkeley;Stanford;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "How Flawed Is ECE? 
An Analysis via Logit Smoothing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35067", "id": "3McL91pE6x", "proceeding": "https://proceedings.mlr.press/v235/chidambaram24a.html", "pdf": "https://openreview.net/pdf?id=3McL91pE6x", "openreview": "https://openreview.net/forum?id=3McL91pE6x", "author_site": "Muthu Chidambaram, Holden Lee, Colin McSwiggen, Semon Rezchikov", "tldr": "", "abstract": "Informally, a model is calibrated if its predictions are correct with a probability that matches the confidence of the prediction. By far the most common method in the literature for measuring calibration is the expected calibration error (ECE). Recent work, however, has pointed out drawbacks of ECE, such as the fact that it is discontinuous in the space of predictors. In this work, we ask: how fundamental are these issues, and what are their impacts on existing results? Towards this end, we completely characterize the discontinuities of ECE with respect to general probability measures on Polish spaces. We then use the nature of these discontinuities to motivate a novel *continuous, easily estimated* miscalibration metric, which we term *Logit-Smoothed ECE (LS-ECE)*. By comparing the ECE and LS-ECE of pre-trained image classification models, we show in initial experiments that binned ECE closely tracks LS-ECE, indicating that the theoretical pathologies of ECE may be avoidable in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Muthu Chidambaram;Holden Lee;Colin McSwiggen;Semon Rezchikov", "authorids": "~Muthu_Chidambaram1;~Holden_Lee1;cmcswiggen@gmail.com;~Semon_Rezchikov1", "gender": "M;M;;M", "homepage": "https://2014mchidamb.github.io/;http://holdenlee.github.io;;https://www.rezchikov.me/", "dblp": "304/3319;150/3407;;294/5525", "google_scholar": "R43EbqAAAAAJ;hR9rFHgAAAAJ;;https://scholar.google.com/citations?view_op=list_works", "orcid": ";;;", "linkedin": "muthu-chidambaram-b8803919a/;;;", "or_profile": "~Muthu_Chidambaram1;~Holden_Lee1;cmcswiggen@gmail.com;~Semon_Rezchikov1", "aff": "Duke University;Johns Hopkins University;;Princeton University", "aff_domain": "duke.edu;jh.edu;;princeton.edu", "position": "PhD student;Assistant Professor;;Postdoc", "bibtex": "@inproceedings{\nchidambaram2024how,\ntitle={How Flawed Is {ECE}? 
An Analysis via Logit Smoothing},\nauthor={Muthu Chidambaram and Holden Lee and Colin McSwiggen and Semon Rezchikov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3McL91pE6x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 922438, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13384494355260863630&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "duke.edu;jh.edu;;princeton.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Duke University;Johns Hopkins University;Princeton University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.duke.edu;https://www.jhu.edu;https://www.princeton.edu", "aff_unique_abbr": "Duke;JHU;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Multimodal Prototyping for cancer survival prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35066", "id": "3MfvxH3Gia", "proceeding": "https://proceedings.mlr.press/v235/song24b.html", "pdf": "https://openreview.net/pdf?id=3MfvxH3Gia", "openreview": "https://openreview.net/forum?id=3MfvxH3Gia", "author_site": "Andrew Song, Richard Chen, Guillaume Jaume, Anurag Vaidya, Alexander Baras, Faisal Mahmood", "tldr": "", "abstract": "Multimodal survival methods combining gigapixel histology whole-slide images (WSIs) and transcriptomic profiles are particularly promising for patient prognostication and stratification. Current approaches involve tokenizing the WSIs into smaller patches ($>10^4$ patches) and transcriptomics into gene groups, which are then integrated using a Transformer for predicting outcomes. However, this process generates many tokens, which leads to high memory requirements for computing attention and complicates post-hoc interpretability analyses. Instead, we hypothesize that we can: (1) effectively summarize the morphological content of a WSI by condensing its constituting tokens using morphological prototypes, achieving more than $300\\times$ compression; and (2) accurately characterize cellular functions by encoding the transcriptomic profile with biological pathway prototypes, all in an unsupervised fashion. The resulting multimodal tokens are then processed by a fusion network, either with a Transformer or an optimal transport cross-alignment, which now operates with a small and fixed number of tokens without approximations. Extensive evaluation on six cancer types shows that our framework outperforms state-of-the-art methods with much less computation while unlocking new interpretability analyses. The code is available at https://github.com/mahmoodlab/MMP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew H. Song;Richard J. 
Chen;Guillaume Jaume;Anurag Jayant Vaidya;Alexander Baras;Faisal Mahmood", "authorids": "~Andrew_H._Song1;~Richard_J._Chen1;~Guillaume_Jaume2;~Anurag_Jayant_Vaidya1;~Alexander_Baras1;~Faisal_Mahmood1", "gender": "M;M;M;M;M;M", "homepage": "http://richarizardd.me;https://guillaumejaume.github.io/;https://ajv012.github.io;;http://www.mahmoodlab.org;https://andrewhsong.com", "dblp": "244/1941;;;177/8463;;229/0474", "google_scholar": "yhGqdMgAAAAJ;am5XqsQAAAAJ;4Z2Qu_YAAAAJ;https://scholar.google.com/citations?hl=en;9MsdbKoAAAAJ;1UNlyTcAAAAJ", "orcid": "0000-0003-0389-1331;;;0000-0003-2397-3342;0000-0001-7587-1562;", "linkedin": "richardchen95;;;;;", "or_profile": "~Richard_J._Chen1;~Guillaume_Jaume2;~Anurag_Jayant_Vaidya1;~Alexander_Baras1;~Faisal_Mahmood1;~Andrew_Song1", "aff": "Harvard University;Harvard University;Massachusetts Institute of Technology;Johns Hopkins University;Harvard University;Brigham and Women's hospital", "aff_domain": "harvard.edu;harvard.edu;mit.edu;jh.edu;harvard.edu;bwh.harvard.edu", "position": "PhD student;Postdoc;PhD student;Associate Professor;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nsong2024multimodal,\ntitle={Multimodal Prototyping for cancer survival prediction},\nauthor={Andrew H. Song and Richard J. Chen and Guillaume Jaume and Anurag Jayant Vaidya and Alexander Baras and Faisal Mahmood},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3MfvxH3Gia}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5709396, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10474269334534463697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "harvard.edu;harvard.edu;mit.edu;jh.edu;harvard.edu;bwh.harvard.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;0;3", "aff_unique_norm": "Harvard University;Massachusetts Institute of Technology;Johns Hopkins University;Brigham and Women's Hospital", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.harvard.edu;https://web.mit.edu;https://www.jhu.edu;https://www.brighamandwomens.org", "aff_unique_abbr": "Harvard;MIT;JHU;BWH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Differentiable Combinatorial Scheduling at Scale", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35065", "id": "3Pq6uI1MTE", "proceeding": "https://proceedings.mlr.press/v235/liu24al.html", "pdf": "https://openreview.net/pdf?id=3Pq6uI1MTE", "openreview": "https://openreview.net/forum?id=3Pq6uI1MTE", "author_site": "Mingju Liu, Yingjie Li, Jiaqi Yin, Zhiru Zhang, CUNXI YU", "tldr": "", "abstract": "This paper addresses the complex issue of resource-constrained scheduling, an NP-hard problem that spans critical areas including chip design and high-performance computing. Traditional scheduling methods often stumble over scalability and applicability challenges. We propose a novel approach using a differentiable combinatorial scheduling framework, utilizing the Gumbel-Softmax differentiable sampling technique. This new technique allows for a fully differentiable formulation of linear programming (LP) based scheduling, extending its application to a broader range of LP formulations.
To encode inequality constraints for scheduling tasks, we introduce the *constrained Gumbel Trick*, which adeptly encodes arbitrary inequality constraints. Consequently, our method facilitates efficient and scalable scheduling via gradient descent without the need for training data. Comparative evaluations on both synthetic and real-world benchmarks highlight our capability to significantly improve the optimization efficiency of scheduling, surpassing state-of-the-art solutions offered by commercial and open-source solvers such as CPLEX, Gurobi, and CP-SAT in the majority of the designs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingju Liu;Yingjie Li;Jiaqi Yin;Zhiru Zhang;CUNXI YU", "authorids": "~Mingju_Liu1;~Yingjie_Li1;jyin629@umd.edu;~Zhiru_Zhang2;~CUNXI_YU1", "gender": "M;F;;M;", "homepage": "https://mingj-liu.github.io;https://www.ece.utah.edu;;https://www.csl.cornell.edu/~zhiruz;", "dblp": "46/4616.html;;;81/4227;", "google_scholar": ";;;https://scholar.google.com.tw/citations?user=x05pUHsAAAAJ;", "orcid": ";;;;", "linkedin": "https://linkedin.com/in/mingju-liu-3929b513a;;;;", "or_profile": "~Mingju_Liu1;~Yingjie_Li1;jyin629@umd.edu;~Zhiru_Zhang2;~CUNXI_YU1", "aff": "University of Maryland, College Park;University of Utah;;Cornell University;", "aff_domain": "umd.edu;utah.edu;;cornell.edu;", "position": "PhD student;PhD student;;Associate Professor;", "bibtex": "@inproceedings{\nliu2024differentiable,\ntitle={Differentiable Combinatorial Scheduling at Scale},\nauthor={Mingju Liu and Yingjie Li and Jiaqi Yin and Zhiru Zhang and CUNXI YU},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3Pq6uI1MTE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 803321, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gSSCV5Qy288J:scholar.google.com/&scioq=Differentiable+Combinatorial+Scheduling+at+Scale&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "umd.edu;utah.edu;;cornell.edu;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Maryland;University of Utah;Cornell University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umd.edu;https://www.utah.edu;https://www.cornell.edu", "aff_unique_abbr": "UMD;Utah;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "MALIBO: Meta-learning for Likelihood-free Bayesian Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35064", "id": "3QM5SWfeov", "proceeding": "https://proceedings.mlr.press/v235/pan24b.html", "pdf": "https://openreview.net/pdf?id=3QM5SWfeov", "openreview": "https://openreview.net/forum?id=3QM5SWfeov", "author_site": "Jiarong Pan, Stefan Falkner, Felix Berkenkamp, Joaquin Vanschoren", "tldr": "", "abstract": "Bayesian optimization (BO) is a popular method to optimize costly black-box functions, and meta-learning has emerged as a way to leverage knowledge from related tasks to optimize new tasks faster. However, existing meta-learning methods for BO rely on surrogate models that are not scalable or are sensitive to varying input scales and noise types across tasks.
Moreover, they often overlook the uncertainty associated with task similarity, leading to unreliable task adaptation when a new task differs significantly or has not been sufficiently explored yet. We propose a novel meta-learning BO approach that bypasses the surrogate model and directly learns the utility of queries across tasks. It explicitly models task uncertainty and includes an auxiliary model to enable robust adaptation to new tasks. Extensive experiments show that our method achieves strong performance and outperforms multiple meta-learning BO methods across various benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiarong Pan;Stefan Falkner;Felix Berkenkamp;Joaquin Vanschoren", "authorids": "~Jiarong_Pan1;~Stefan_Falkner1;~Felix_Berkenkamp1;~Joaquin_Vanschoren1", "gender": "M;M;M;M", "homepage": ";;https://berkenkamp.me;http://www.win.tue.nl/~jvanscho/", "dblp": "296/3036;168/1232;168/8558;85/5045", "google_scholar": "3KjbaRUAAAAJ;https://scholar.google.de/citations?user=r7FWJEkAAAAJ;https://scholar.google.ch/citations?user=N_tCEl8AAAAJ;HhDsD9UAAAAJ", "orcid": ";;;0000-0001-7044-9805", "linkedin": "gary-pan/;stefan-falkner-b4142771;berkenkamp/;", "or_profile": "~Jiarong_Pan1;~Stefan_Falkner1;~Felix_Berkenkamp1;~Joaquin_Vanschoren1", "aff": "Eindhoven University of Technology;Robert Bosch GmbH;Bosch;Eindhoven University of Technology", "aff_domain": "tue.nl;de.bosch.de;bosch.com;tue.nl", "position": "PhD student;Research Scientist;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\npan2024malibo,\ntitle={{MALIBO}: Meta-learning for Likelihood-free Bayesian Optimization},\nauthor={Jiarong Pan and Stefan Falkner and Felix Berkenkamp and Joaquin Vanschoren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3QM5SWfeov}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5710024, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17421855921906059459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "tue.nl;de.bosch.de;bosch.com;tue.nl", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Eindhoven University of Technology;Robert Bosch GmbH", "aff_unique_dep": ";", "aff_unique_url": "https://www.tue.nl;https://www.bosch.com", "aff_unique_abbr": "TU/e;Bosch", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Netherlands;Germany" }, { "title": "Position: Embracing Negative Results in Machine Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35063", "id": "3RXAiU7sss", "proceeding": "https://proceedings.mlr.press/v235/karl24a.html", "pdf": "https://openreview.net/pdf?id=3RXAiU7sss", "openreview": "https://openreview.net/forum?id=3RXAiU7sss", "author_site": "Florian Karl, Malte Kemeter, Gabriel Dax, Paulina Sierak", "tldr": "", "abstract": "Publications proposing novel machine learning methods are often primarily rated by exhibited predictive performance on selected problems. In this position paper we argue that predictive performance alone is not a good indicator for the worth of a publication. Using it as such even fosters problems such as inefficiency in the machine learning research community as a whole and sets the wrong incentives for researchers.
We therefore put out a call for the publication of ``negative'' results, which can help alleviate some of these problems and improve the scientific output of the machine learning research community. To substantiate our position, we present the advantages of publishing negative results and provide concrete measures for the community to move towards a paradigm where their publication is normalized.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Florian Karl;Malte Kemeter;Gabriel Dax;Paulina Sierak", "authorids": "~Florian_Karl1;malte.kemeter@iis.fraunhofer.de;~Gabriel_Dax1;paulina.sierak@iis.fraunhofer.de", "gender": "M;;M;", "homepage": "https://www.slds.stat.uni-muenchen.de/people/karl/;;https://g4br1el.github.io/;", "dblp": ";;;", "google_scholar": "k-ZyY8EAAAAJ;;;", "orcid": "0000-0003-0163-2272;;;", "linkedin": ";;;", "or_profile": "~Florian_Karl1;malte.kemeter@iis.fraunhofer.de;~Gabriel_Dax1;paulina.sierak@iis.fraunhofer.de", "aff": "University of Munich, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;;Fraunhofer IIS;", "aff_domain": "campus.lmu.de;;iis.fraunhofer.de;", "position": "PhD student;;Postdoc;", "bibtex": "@inproceedings{\nkarl2024position,\ntitle={Position: Embracing Negative Results in Machine Learning},\nauthor={Florian Karl and Malte Kemeter and Gabriel Dax and Paulina Sierak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3RXAiU7sss}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 180271, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8619047077735724241&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "campus.lmu.de;;iis.fraunhofer.de;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Fraunhofer Institute for Integrated Circuits", "aff_unique_dep": ";", "aff_unique_url": "https://www.lmu.de;https://www.iis.fraunhofer.de/", "aff_unique_abbr": "LMU;Fraunhofer IIS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Borda Regret Minimization for Generalized Linear Dueling Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35062", "id": "3Tzdpjc59k", "proceeding": "https://proceedings.mlr.press/v235/wu24m.html", "pdf": "https://openreview.net/pdf?id=3Tzdpjc59k", "openreview": "https://openreview.net/forum?id=3Tzdpjc59k", "author_site": "Yue Wu, Tao Jin, Qiwei Di, Hao Lou, Farzad Farnoud, Quanquan Gu", "tldr": "", "abstract": "Dueling bandits are widely used to model preferential feedback prevalent in many applications such as recommendation systems and ranking. In this paper, we study the Borda regret minimization problem for dueling bandits, which aims to identify the item with the highest Borda score while minimizing the cumulative regret. We propose a rich class of generalized linear dueling bandit models, which cover many existing models. We first prove a regret lower bound of order $\\Omega(d^{2/3} T^{2/3})$ for the Borda regret minimization problem, where $d$ is the dimension of contextual vectors and $T$ is the time horizon. To attain this lower bound, we propose an explore-then-commit type algorithm for the stochastic setting, which has a nearly matching regret upper bound $\\tilde{O}(d^{2/3} T^{2/3})$. 
We also propose an EXP3-type algorithm for the adversarial linear setting, where the underlying model parameter can change in each round. Our algorithm achieves an $\\tilde{O}(d^{2/3} T^{2/3})$ regret, which is also optimal. Empirical evaluations on both synthetic data and a simulated real-world environment are conducted to corroborate our theoretical analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Wu;Tao Jin;Qiwei Di;Hao Lou;Farzad Farnoud;Quanquan Gu", "authorids": "~Yue_Wu12;~Tao_Jin3;~Qiwei_Di1;~Hao_Lou1;~Farzad_Farnoud1;~Quanquan_Gu1", "gender": "M;M;M;M;;M", "homepage": "https://yuewu.us/;https://tao-j.me;https://qiwei-di1234.github.io/;http://ips.lab.virginia.edu/;http://www.ece.virginia.edu/~ffh8x;http://web.cs.ucla.edu/~qgu/", "dblp": "41/5979-11;88/4850-2;354/3878;44/6250;88/7890.html;50/4597", "google_scholar": "kSQ1mLYAAAAJ;0kCyQGsAAAAJ;SewL0pkAAAAJ;;https://scholar.google.com/citations?hl=en;GU9HgNAAAAAJ", "orcid": ";;;;0000-0002-8684-4487;", "linkedin": ";;qiwei-di-00776a253/;;farzad-farnoud-b7993315/;", "or_profile": "~Yue_Wu12;~Tao_Jin3;~Qiwei_Di1;~Hao_Lou1;~Farzad_Farnoud1;~Quanquan_Gu1", "aff": "University of California, Los Angeles;;University of California, Los Angeles;;University of Virginia;University of California, Los Angeles", "aff_domain": "ucla.edu;;ucla.edu;;virginia.edu;cs.ucla.edu", "position": "PhD student;;PhD student;;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwu2024borda,\ntitle={Borda Regret Minimization for Generalized Linear Dueling Bandits},\nauthor={Yue Wu and Tao Jin and Qiwei Di and Hao Lou and Farzad Farnoud and Quanquan Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3Tzdpjc59k}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1041165, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14660110423878220029&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ucla.edu;;ucla.edu;;virginia.edu;cs.ucla.edu", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, Los Angeles;University of Virginia", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.virginia.edu", "aff_unique_abbr": "UCLA;UVA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provably Better Explanations with Optimized Aggregation of Feature Attributions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35061", "id": "3VnSgdget6", "proceeding": "https://proceedings.mlr.press/v235/decker24a.html", "pdf": "https://openreview.net/pdf?id=3VnSgdget6", "openreview": "https://openreview.net/forum?id=3VnSgdget6", "author_site": "Thomas Decker, Ananta Bhattarai, Jindong Gu, Volker Tresp, Florian Buettner", "tldr": "", "abstract": "Using feature attributions for post-hoc explanations is a common practice to understand and verify the predictions of opaque machine learning models. Despite the numerous techniques available, individual methods often produce inconsistent and unstable results, putting their overall reliability into question. In this work, we aim to systematically improve the quality of feature attributions by combining multiple explanations across distinct methods or their variations. 
For this purpose, we propose a novel approach to derive optimal convex combinations of feature attributions that yield provable improvements of desired quality criteria such as robustness or faithfulness to the model behavior. Through extensive experiments involving various model architectures and popular feature attribution techniques, we demonstrate that our combination strategy consistently outperforms individual methods and existing baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Decker;Ananta R. Bhattarai;Jindong Gu;Volker Tresp;Florian Buettner", "authorids": "~Thomas_Decker1;~Ananta_R._Bhattarai1;~Jindong_Gu1;~Volker_Tresp1;~Florian_Buettner1", "gender": ";;;M;", "homepage": ";;;https://www.dbs.ifi.lmu.de/~tresp/;", "dblp": "351/4532;;;t/VolkerTresp;245/4220", "google_scholar": "Et8pIioAAAAJ;;;xIJHTUwAAAAJ;AaPKbPAAAAAJ", "orcid": ";;;0000-0001-9428-3686;0000-0001-5587-6761", "linkedin": ";;;volker-tresp-8110a118/;", "or_profile": "~Thomas_Decker1;~Ananta_R._Bhattarai1;~Jindong_Gu1;~Volker_Tresp1;~Florian_Buettner1", "aff": "Siemens AG;;;Siemens Corporate Research;Deutsches Krebsforschungszentrum", "aff_domain": "siemens.com;;;siemens.com;dkfz.de", "position": "PhD student;;;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\ndecker2024provably,\ntitle={Provably Better Explanations with Optimized Aggregation of Feature Attributions},\nauthor={Thomas Decker and Ananta R. Bhattarai and Jindong Gu and Volker Tresp and Florian Buettner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3VnSgdget6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3028924, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12793374427133001342&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "siemens.com;;;siemens.com;dkfz.de", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Siemens AG;Deutsches Krebsforschungszentrum", "aff_unique_dep": ";", "aff_unique_url": "https://www.siemens.com;https://www.dkfz.de", "aff_unique_abbr": "Siemens;DKFZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "PrE-Text: Training Language Models on Private Federated Data in the Age of LLMs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35060", "id": "3WCvnkHnxV", "proceeding": "https://proceedings.mlr.press/v235/hou24c.html", "pdf": "https://openreview.net/pdf?id=3WCvnkHnxV", "openreview": "https://openreview.net/forum?id=3WCvnkHnxV", "author_site": "Charlie Hou, Akshat Shrivastava, Hongyuan Zhan, Rylan Conway, Trang Le, Adithya Sagar, Giulia Fanti, Daniel Lazar", "tldr": "", "abstract": "On-device training is currently the most common approach for training machine learning (ML) models on private, distributed user data. Despite this, on-device training has several drawbacks: (1) most user devices are too small to train large models on-device, (2) on-device training is communication- and computation-intensive, and (3) on-device training can be difficult to debug and deploy. To address these problems, we propose Private Evolution-Text (PrE-Text), a method for generating differentially private (DP) synthetic textual data. 
First, we show that across multiple datasets, training small models (models that fit on user devices) with PrE-Text synthetic data outperforms small models trained on-device under practical privacy regimes ($\\epsilon=1.29$, $\\epsilon=7.58$). We achieve these results while using 9$\\times$ fewer rounds, 6$\\times$ less client computation per round, and 100$\\times$ less communication per round. Second, finetuning large models on PrE-Text's DP synthetic data improves large language model (LLM) performance on private data across the same range of privacy budgets. Altogether, these results suggest that training on DP synthetic data can be a better option than training a model on-device on private distributed data. Code is available at https://github.com/houcharlie/PrE-Text.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Charlie Hou;Akshat Shrivastava;Hongyuan Zhan;Rylan Conway;Trang Le;Adithya Sagar;Giulia Fanti;Daniel Lazar", "authorids": "~Charlie_Hou1;~Akshat_Shrivastava1;~Hongyuan_Zhan2;~Rylan_Conway1;~Trang_Le1;adithyasagar@meta.com;~Giulia_Fanti1;~Daniel_Lazar1", "gender": ";M;M;M;F;;;M", "homepage": "https://www.andrew.cmu.edu/user/charlieh/;https://akshatsh.github.io/;https://sites.google.com/view/hongyuanzhan/research;;;;https://www.andrew.cmu.edu/user/gfanti/;", "dblp": ";;;;;;141/9910;", "google_scholar": "92wmC6gAAAAJ;ecQt6m4AAAAJ;oQ6VZmQAAAAJ;Bz7ww9sAAAAJ;;;Rn_BmTYAAAAJ;q9g_OlwAAAAJ", "orcid": ";;;;;;0000-0002-7671-2624;", "linkedin": "charlie-hou-027a19113/;akshatsh/;;rylan-conway-baa7318a/;;;;", "or_profile": "~Charlie_Hou1;~Akshat_Shrivastava1;~Hongyuan_Zhan2;~Rylan_Conway1;~Trang_Le1;adithyasagar@meta.com;~Giulia_Fanti1;~Daniel_Lazar1", "aff": "Carnegie Mellon University;Meta Facebook;Meta;Meta Facebook;;;Carnegie Mellon University;", "aff_domain": "andrew.cmu.edu;facebook.com;fb.com;meta.com;;;andrew.cmu.edu;", "position": "PhD student;Researcher;Researcher;Researcher;;;Assistant Professor;", "bibtex": "@inproceedings{\nhou2024pretext,\ntitle={PrE-Text: Training Language Models on Private Federated Data in the Age of {LLM}s},\nauthor={Charlie Hou and Akshat Shrivastava and Hongyuan Zhan and Rylan Conway and Trang Le and Adithya Sagar and Giulia Fanti and Daniel Lazar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3WCvnkHnxV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 662118, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=430214116577883102&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "andrew.cmu.edu;facebook.com;fb.com;meta.com;;;andrew.cmu.edu;", "author_num": 8, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "CMU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "FreeBind: Free Lunch in Unified Multimodal Space via Knowledge Fusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35059", "id": "3XG69ZmfsB", "proceeding": "https://proceedings.mlr.press/v235/wang24co.html", "pdf": "https://openreview.net/pdf?id=3XG69ZmfsB", "openreview": "https://openreview.net/forum?id=3XG69ZmfsB", "author_site": "Zehan Wang, Ziang Zhang, xize 
cheng, Rongjie Huang, Luping Liu, Zhenhui Ye, Haifeng Huang, Yang Zhao, Tao Jin, Peng Gao, Zhou Zhao", "tldr": "", "abstract": "Unified multimodal representation spaces are the foundation of multimodal understanding and generation. However, the billions of model parameters and catastrophic forgetting problems make it challenging to further enhance pre-trained unified spaces. In this work, we propose FreeBind, an idea that treats multimodal representation spaces as basic units and freely augments a pre-trained unified space by integrating knowledge from extra expert spaces via ``space bonds''. Specifically, we introduce two kinds of basic space bonds: 1) Space Displacement Bond and 2) Space Combination Bond. Based on these basic bonds, we design Complex Sequential & Parallel Bonds to effectively integrate multiple spaces simultaneously. Benefiting from the modularization concept, we further propose a coarse-to-fine customized inference strategy to flexibly adjust the enhanced unified space for different purposes. Experimentally, we bind ImageBind with extra image-text and audio-text expert spaces, resulting in three main variants: ImageBind++, InternVL_IB, and InternVL_IB++. These resulting spaces outperform ImageBind on 5 audio-image-text downstream tasks across 9 datasets. Moreover, via customized inference, they even surpass the advanced audio-text and image-text expert spaces. Our code and checkpoints are released at https://github.com/zehanwang01/FreeBind", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zehan Wang;Ziang Zhang;Xize Cheng;Rongjie Huang;Luping Liu;Zhenhui Ye;Haifeng Huang;Yang Zhao;Tao Jin;Peng Gao;Zhou Zhao", "authorids": "~Zehan_Wang2;~Ziang_Zhang1;~Xize_Cheng1;~Rongjie_Huang1;~Luping_Liu2;~Zhenhui_Ye1;~Haifeng_Huang3;~Yang_Zhao14;~Tao_Jin2;~Peng_Gao3;~Zhou_Zhao3", "gender": "M;M;M;M;;M;M;M;M;;", "homepage": "https://github.com/12zehan17;;https://exgc.github.io/;;;https://yerfor.github.io;https://zzzzchs.github.io/;;https://hugddygff.github.io/;;", "dblp": "126/7826-1;;334/2167;212/8936-1;;265/6375;;50/2082-22;88/4850-4.html;;", "google_scholar": "euXK0lkAAAAJ;DptGMnYAAAAJ;https://scholar.google.com/citations?hl=zh-CN;iRHBUsgAAAAJ;;;oUm2gZUAAAAJ;;;;", "orcid": "0009-0007-7509-7563;;0000-0001-9708-3225;;;;;;0000-0003-3564-1628;;", "linkedin": ";;;;;;haifeng-huang-784b2b249/;;;;", "or_profile": "~Zehan_Wang2;~Ziang_Zhang1;~Xize_Cheng1;~Rongjie_Huang1;~Luping_Liu2;~Zhenhui_Ye1;~Haifeng_Huang3;~Yang_Zhao14;~Tao_Jin2;~Peng_Gao3;~Zhou_Zhao3", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;;Zhejiang University;Zhejiang University;ByteDance Inc.;Zhejiang University;;", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;;zju.edu.cn;zju.edu.cn;bytedance.com;zju.edu.cn;;", "position": "PhD student;Undergrad student;PhD student;MS student;;PhD student;MS student;Researcher;Assistant Professor;;", "bibtex": "@inproceedings{\nwang2024freebind,\ntitle={FreeBind: Free Lunch in Unified Multimodal Space via Knowledge Fusion},\nauthor={Zehan Wang and Ziang Zhang and Xize Cheng and Rongjie Huang and Luping Liu and Zhenhui Ye and Haifeng Huang and Yang Zhao and Tao Jin and Peng Gao and Zhou Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3XG69ZmfsB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4828119, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 12, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=16449970934698418261&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;;zju.edu.cn;zju.edu.cn;bytedance.com;zju.edu.cn;;", "author_num": 11, "aff_unique_index": "0;0;0;0;0;0;1;0", "aff_unique_norm": "Zhejiang University;ByteDance", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.bytedance.com", "aff_unique_abbr": "ZJU;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Dynamic Correlation Clustering in Sublinear Update Time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35058", "id": "3YG55Lbcnr", "proceeding": "https://proceedings.mlr.press/v235/cohen-addad24d.html", "pdf": "https://openreview.net/pdf?id=3YG55Lbcnr", "openreview": "https://openreview.net/forum?id=3YG55Lbcnr", "author_site": "Vincent Cohen-Addad, Silvio Lattanzi, Andreas Maggiori, Nikos Parotsidis", "tldr": "", "abstract": "We study the classic problem of correlation clustering in dynamic vertex streams. In this setting, vertices are either added or randomly deleted over time, and each vertex pair is connected by a positive or negative edge. The objective is to continuously find a partition which minimizes the sum of positive edges crossing clusters and negative edges within clusters. We present an algorithm that maintains an $O(1)$-approximation with $O(\\text{polylog} n)$ amortized update time. Prior to our work Behnezhad et al. in SODA 2023 achieved a $5$-approximation with $O(1)$ expected update time in edge streams which translates in vertex streams to an $O(D)$-update time where $D$ is the maximum possible degree. 
Finally, we complement our theoretical analysis with experiments on real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Cohen-Addad;Silvio Lattanzi;Andreas Maggiori;Nikos Parotsidis", "authorids": "~Vincent_Cohen-Addad1;~Silvio_Lattanzi1;~Andreas_Maggiori1;~Nikos_Parotsidis1", "gender": ";M;M;M", "homepage": ";https://sites.google.com/site/silviolattanzi/;;https://sites.google.com/view/nikosparotsidis", "dblp": "136/5814;46/6611;239/5932;129/9110", "google_scholar": ";vxUZ4AUAAAAJ;2QzQRW4AAAAJ;https://scholar.google.gr/citations?user=Txeb6wsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Vincent_Cohen-Addad1;~Silvio_Lattanzi1;~Andreas_Maggiori1;~Nikos_Parotsidis1", "aff": "Google;Google;Columbia University;Google", "aff_domain": "google.com;google.com;columbia.edu;google.com", "position": "Researcher;Researcher;Postdoc;Researcher", "bibtex": "@inproceedings{\ncohen-addad2024dynamic,\ntitle={Dynamic Correlation Clustering in Sublinear Update Time},\nauthor={Vincent Cohen-Addad and Silvio Lattanzi and Andreas Maggiori and Nikos Parotsidis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3YG55Lbcnr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 634424, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7705777016670487606&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;google.com;columbia.edu;google.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;Columbia University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.columbia.edu", "aff_unique_abbr": "Google;Columbia", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "In-Context Language Learning: Architectures and Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35057", "id": "3Z9CRr5srL", "proceeding": "https://proceedings.mlr.press/v235/akyurek24a.html", "pdf": "https://openreview.net/pdf?id=3Z9CRr5srL", "openreview": "https://openreview.net/forum?id=3Z9CRr5srL", "author_site": "Ekin Aky\u00fcrek, Bailin Wang, Yoon Kim, Jacob Andreas", "tldr": "", "abstract": "Some neural language models (LMs) exhibit a remarkable capacity for in-context learning (ICL): they can fit predictors to datasets provided as input. While the mechanisms underlying ICL are well-studied in the context of synthetic problems like in-context linear regression, there is still some divergence between these model problems and the \u201creal\u201d ICL exhibited by LMs trained on large text corpora. In this paper, we study ICL through the lens of a new family of model problems we term in-context language learning (ICLL). In ICLL, LMs are presented with a set of strings from a formal language, and must generate additional strings from the same language. We focus on in-context learning of regular languages generated by random finite automata. We evaluate a diverse set of neural sequence models on regular ICLL tasks. We first show that Transformers significantly outperform neural sequence models with recurrent or convolutional representations on ICLL tasks. Next, we provide evidence that they do so by computing in-context n-gram statistics using specialized attention heads.
Finally, we show that hard-wiring these heads into neural models improves performance not just on synthetic ICLL but also on natural language modeling, reducing the perplexity of 340M-parameter Transformers by up to 1.14 points (6.7%) on the SlimPajama dataset. Our results highlight the usefulness of in-context formal language learning as a tool for understanding ICL in models of natural text.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ekin Aky\u00fcrek;Bailin Wang;Yoon Kim;Jacob Andreas", "authorids": "~Ekin_Aky\u00fcrek1;~Bailin_Wang3;~Yoon_Kim1;~Jacob_Andreas1", "gender": ";M;M;M", "homepage": "https://people.csail.mit.edu/yoonkim/;http://web.mit.edu/jda/www;https://berlino.github.io/;https://www.ekinakyurek.me/", "dblp": ";97/8154;218/7334;216/3446", "google_scholar": "n_ts4eYAAAAJ;dnZ8udEAAAAJ;;FQHeASwAAAAJ", "orcid": ";;;0000-0002-5166-4689", "linkedin": ";;;", "or_profile": "~Yoon_Kim1;~Jacob_Andreas1;~bailin_wang1;~EKIN_AKYUREK1", "aff": "Massachusetts Institute of Technology;Microsoft;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;microsoft.com;mit.edu;mit.edu", "position": "Assistant Professor;Researcher;Postdoc;PhD student", "bibtex": "@inproceedings{\naky{\\"u}rek2024incontext,\ntitle={In-Context Language Learning: Architectures and Algorithms},\nauthor={Ekin Aky{\\"u}rek and Bailin Wang and Yoon Kim and Jacob Andreas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3Z9CRr5srL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3795306, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18283479549202134225&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mit.edu;microsoft.com;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com", "aff_unique_abbr": "MIT;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Auto-Linear Phenomenon in Subsurface Imaging", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35056", "id": "3ZM8MXGFRA", "proceeding": "https://proceedings.mlr.press/v235/feng24a.html", "pdf": "https://openreview.net/pdf?id=3ZM8MXGFRA", "openreview": "https://openreview.net/forum?id=3ZM8MXGFRA", "author_site": "Yinan Feng, Yinpeng Chen, Peng Jin, Shihang Feng, Youzuo Lin", "tldr": "", "abstract": "Subsurface imaging involves solving full waveform inversion (FWI) to predict geophysical properties from measurements. This problem can be reframed as an image-to-image translation, with the usual approach being to train an encoder-decoder network using paired data from two domains: geophysical property and measurement. A recent seminal work (InvLINT) demonstrates there is only a linear mapping between the latent spaces of the two domains, and the decoder requires paired data for training. This paper extends this direction by demonstrating that only the linear mapping necessitates paired data, while both the encoder and decoder can be learned from their respective domains through self-supervised learning.
This unveils an intriguing phenomenon (named Auto-Linear) where the self-learned features of two separate domains are automatically linearly correlated. Compared with existing methods, our Auto-Linear has four advantages: (a) solving both forward and inverse modeling simultaneously, (b) reducing model size, (c) enhanced performance, especially when the paired data is limited, and (d) strong generalization ability of the trained encoder and decoder.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yinan Feng;Yinpeng Chen;Peng Jin;Shihang Feng;Youzuo Lin", "authorids": "~Yinan_Feng1;~Yinpeng_Chen1;~Peng_Jin6;~Shihang_Feng1;~Youzuo_Lin1", "gender": "M;M;;M;M", "homepage": ";https://scholar.google.com/citations?user=V_VpLksAAAAJ&hl=en;https://ist.psu.edu/directory/pqj5125;;https://sites.google.com/site/youzuolin044/", "dblp": "154/0112;45/6977;;;", "google_scholar": "LySxJYUAAAAJ;;;m304bMcAAAAJ;CMXuHYgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yinan_Feng1;~Yinpeng_Chen1;~Peng_Jin6;~Shihang_Feng1;~Youzuo_Lin1", "aff": "Los Alamos National Laboratory;Google DeepMind;Pennsylvania State University;Los Alamos National Laboratory;Los Alamos National Laboratory", "aff_domain": "lanl.gov;google.com;psu.edu;lanl.gov;lanl.gov", "position": "PostMaster;Research Scientist;PhD student;Postdoc;Researcher", "bibtex": "@inproceedings{\nfeng2024autolinear,\ntitle={Auto-Linear Phenomenon in Subsurface Imaging},\nauthor={Yinan Feng and Yinpeng Chen and Peng Jin and Shihang Feng and Youzuo Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3ZM8MXGFRA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3592500, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4719164107499686113&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "lanl.gov;google.com;psu.edu;lanl.gov;lanl.gov", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Los Alamos National Laboratory;Google;Pennsylvania State University", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": "https://www.lanl.gov;https://deepmind.com;https://www.psu.edu", "aff_unique_abbr": "LANL;DeepMind;PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "DySLIM: Dynamics Stable Learning by Invariant Measure for Chaotic Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35055", "id": "3abgRKnK1W", "proceeding": "https://proceedings.mlr.press/v235/schiff24b.html", "pdf": "https://openreview.net/pdf?id=3abgRKnK1W", "openreview": "https://openreview.net/forum?id=3abgRKnK1W", "author_site": "Yair Schiff, Zhong Yi Wan, Jeffrey Parker, Stephan Hoyer, Volodymyr Kuleshov, Fei Sha, Leonardo Zepeda-Nunez", "tldr": "", "abstract": "Learning dynamics from dissipative chaotic systems is notoriously difficult due to their inherent instability, as formalized by their positive Lyapunov exponents, which exponentially amplify errors in the learned dynamics. 
However, many of these systems exhibit ergodicity and an attractor: a compact and highly complex manifold, to which trajectories converge in finite time, that supports an invariant measure, i.e., a probability distribution that is invariant under the action of the dynamics, which dictates the long-term statistical behavior of the system. In this work, we leverage this structure to propose a new framework that targets learning the invariant measure as well as the dynamics, in contrast with typical methods that only target the misfit between trajectories, which often leads to divergence as the trajectories' length increases. We use our framework to propose a tractable and sample-efficient objective that can be used with any existing learning objective. Our **Dy**namics **S**table **L**earning by **I**nvariant **M**easure (DySLIM) objective enables model training that achieves better point-wise tracking and long-term statistical accuracy relative to other learning objectives. By targeting the distribution with a scalable regularization term, we hope that this approach can be extended to more complex systems exhibiting slowly varying distributions, such as weather and climate models. Code to reproduce our experiments is available here: https://github.com/google-research/swirl-dynamics/tree/main/swirl_dynamics/projects/ergodic.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yair Schiff;Zhong Yi Wan;Jeffrey B. Parker;Stephan Hoyer;Volodymyr Kuleshov;Fei Sha;Leonardo Zepeda-N\u00fa\u00f1ez", "authorids": "~Yair_Schiff1;~Zhong_Yi_Wan1;~Jeffrey_B._Parker1;~Stephan_Hoyer1;~Volodymyr_Kuleshov1;~Fei_Sha3;~Leonardo_Zepeda-N\u00fa\u00f1ez1", "gender": "M;M;M;M;;M;M", "homepage": "https://github.com/yair-schiff;;;http://stephanhoyer.com;https://www.cs.cornell.edu/~kuleshov/;http://feisha.org;https://www.math.wisc.edu/~lzepeda/", "dblp": ";338/6288;;;81/8612;13/3601;", "google_scholar": "GhFrOdQAAAAJ;T1FxBHsAAAAJ;_w6i1bEAAAAJ;bWTG5FgAAAAJ;RY_t8XAAAAAJ;HDHOS0QAAAAJ;qbMVyzQAAAAJ", "orcid": ";;0000-0002-9079-9930;0000-0002-5207-0380;;;", "linkedin": "yair-schiff;zhong1wan/;jeffparker2;;;;", "or_profile": "~Yair_Schiff1;~Zhong_Yi_Wan1;~Jeffrey_B._Parker1;~Stephan_Hoyer1;~Volodymyr_Kuleshov1;~Fei_Sha2;~Leonardo_Zepeda-Nunez1", "aff": "Department of Computer Science, Cornell University;Google;Research, Google;Google;Cornell University;Google;University of Wisconsin, Madison", "aff_domain": "cs.cornell.edu;google.com;research.google.com;google.com;cornell.edu;google.com;wisc.edu", "position": "PhD student;Researcher;Researcher;Researcher;Assistant Professor;research scientist;Assistant Professor", "bibtex": "@inproceedings{\nschiff2024dyslim,\ntitle={Dy{SLIM}: Dynamics Stable Learning by Invariant Measure for Chaotic Systems},\nauthor={Yair Schiff and Zhong Yi Wan and Jeffrey B.
Parker and Stephan Hoyer and Volodymyr Kuleshov and Fei Sha and Leonardo Zepeda-N{\\'u}{\\~n}ez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3abgRKnK1W}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9391105, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6321188634311273023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cs.cornell.edu;google.com;research.google.com;google.com;cornell.edu;google.com;wisc.edu", "author_num": 7, "aff_unique_index": "0;1;1;1;0;1;2", "aff_unique_norm": "Cornell University;Google;University of Wisconsin", "aff_unique_dep": "Department of Computer Science;Google;", "aff_unique_url": "https://www.cornell.edu;https://www.google.com;https://www.wisc.edu", "aff_unique_abbr": "Cornell;Google;UW", "aff_campus_unique_index": "1;1;1;1;2", "aff_campus_unique": ";Mountain View;Madison", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Making Old Things New: A Unified Algorithm for Differentially Private Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35054", "id": "3ajK5xplDL", "proceeding": "https://proceedings.mlr.press/v235/dupre-la-tour24a.html", "pdf": "https://openreview.net/pdf?id=3ajK5xplDL", "openreview": "https://openreview.net/forum?id=3ajK5xplDL", "author_site": "Max Dupre la Tour, Monika Henzinger, David Saulpic", "tldr": "", "abstract": "As a staple of data analysis and unsupervised learning, the problem of private clustering has been widely studied, under various privacy models. Centralized differential privacy is the first of them, and the problem has also been studied for the local and the shuffle variations. In each case, the goal is to design an algorithm that privately computes a clustering, with the smallest possible error. The study of each variation gave rise to new algorithms: the landscape of private clustering algorithms is therefore quite intricate. In this paper, we show that a 20-year-old algorithm can be slightly modified to work for any of those models.
This provides a unified picture: while matching almost all previously known results, it allows us to improve some of them and to extend them to a new privacy model, the continual observation setting, where the input changes over time and the algorithm must output a new solution at each time step.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Max Dupre la Tour;Monika Henzinger;David Saulpic", "authorids": "~Max_Dupre_la_Tour1;~Monika_Henzinger1;~David_Saulpic1", "gender": "M;;", "homepage": ";;http://www.normalesup.org/~saulpic/", "dblp": "221/2881.html;;https://dblp.uni-trier.de/pers/hd/s/Saulpic:David", "google_scholar": ";NXbggxYAAAAJ;", "orcid": ";;0000-0003-4208-8541", "linkedin": ";;", "or_profile": "~Max_Dupre_la_Tour1;~Monika_Henzinger1;~David_Saulpic1", "aff": "McGill University;Institute of Science and Technology;Institute of Science and Technology", "aff_domain": "mcgill.ca;ist.ac.at;ist.ac.at", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@inproceedings{\ntour2024making,\ntitle={Making Old Things New: A Unified Algorithm for Differentially Private Clustering},\nauthor={Max Dupre la Tour and Monika Henzinger and David Saulpic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3ajK5xplDL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 679637, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8973940771088358128&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mcgill.ca;ist.ac.at;ist.ac.at", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "McGill University;Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.mcgill.ca;", "aff_unique_abbr": "McGill;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", "aff_country_unique": "Canada;" }, { "title": "Statistically Optimal Generative Modeling with Maximum Deviation from the Empirical Distribution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35053", "id": "3ash2ksk1r", "proceeding": "https://proceedings.mlr.press/v235/vardanyan24a.html", "pdf": "https://openreview.net/pdf?id=3ash2ksk1r", "openreview": "https://openreview.net/forum?id=3ash2ksk1r", "author_site": "Elen Vardanyan, Sona Hunanyan, Tigran Galstyan, Arshak Minasyan, Arnak Dalalyan", "tldr": "", "abstract": "This paper explores the problem of generative modeling, aiming to simulate diverse examples from an unknown distribution based on observed examples. While recent studies have focused on quantifying the statistical precision of popular algorithms, there is a lack of mathematical evaluation regarding the non-replication of observed examples and the creativity of the generative model. We present theoretical insights into this aspect, demonstrating that the Wasserstein GAN, constrained to left-invertible push-forward maps, generates distributions that not only avoid replication but also significantly deviate from the empirical distribution. Importantly, we show that left-invertibility achieves this without compromising the statistical optimality of the resulting generator. Our most important contribution provides a finite-sample lower bound on the Wasserstein-1 distance between the generative distribution and the empirical one.
We also establish a finite-sample upper bound on the distance between the generative distribution and the true data-generating one. Both bounds are explicit and show the impact of key parameters such as sample size, dimensions of the ambient and latent spaces, noise level, and smoothness measured by the Lipschitz constant.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elen Vardanyan;Sona Hunanyan;Tigran Galstyan;Arshak Minasyan;Arnak S. Dalalyan", "authorids": "~Elen_Vardanyan1;~Sona_Hunanyan1;~Tigran_Galstyan1;~Arshak_Minasyan1;~Arnak_S._Dalalyan2", "gender": "F;F;;;M", "homepage": ";;;https://pointguard0.github.io/;https://adalalyan.github.io/", "dblp": "324/1283;;246/0174;294/9437.html;87/1594", "google_scholar": "_YxoA0wAAAAJ;;RqqMcAEAAAAJ;https://scholar.google.fr/citations?user=-BcHcowAAAAJ;https://scholar.google.fr/citations?user=avlybF8AAAAJ", "orcid": ";;;;", "linkedin": "elenvardanyan/;sona-hunanyan/;;;", "or_profile": "~Elen_Vardanyan1;~Sona_Hunanyan1;~Tigran_Galstyan1;~Arshak_Minasyan1;~Arnak_Dalalyan1", "aff": "Yerevan State University;Yerevan State University;;;", "aff_domain": "ysu.am;ysu.am;;;", "position": "Adjunct Lecturer;Researcher;;;", "bibtex": "@inproceedings{\nvardanyan2024statistically,\ntitle={Statistically Optimal Generative Modeling with Maximum Deviation from the Empirical Distribution},\nauthor={Elen Vardanyan and Sona Hunanyan and Tigran Galstyan and Arshak Minasyan and Arnak S. Dalalyan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3ash2ksk1r}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3815576, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3616631285088576075&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ysu.am;ysu.am;;;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Yerevan State University", "aff_unique_dep": "", "aff_unique_url": "https://www.yerevanstateuniversity.am", "aff_unique_abbr": "YSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Armenia" }, { "title": "DoRA: Weight-Decomposed Low-Rank Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35052", "id": "3d5CIRG1n2", "proceeding": "https://proceedings.mlr.press/v235/liu24bn.html", "pdf": "https://openreview.net/pdf?id=3d5CIRG1n2", "openreview": "https://openreview.net/forum?id=3d5CIRG1n2", "author_site": "Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Wang, Kwang-Ting Cheng, Min-Hung Chen", "tldr": "", "abstract": "Among the widely used parameter-efficient fine-tuning (PEFT) methods, LoRA and its variants have gained considerable popularity because they avoid additional inference costs. However, there still often exists an accuracy gap between these methods and full fine-tuning (FT). In this work, we first introduce a novel weight decomposition analysis to investigate the inherent differences between FT and LoRA. Aiming to match the learning capacity of FT based on these findings, we propose Weight-Decomposed Low-Rank Adaptation (DoRA). DoRA decomposes the pre-trained weight into two components, magnitude and direction, for fine-tuning, specifically employing LoRA for directional updates to efficiently minimize the number of trainable parameters.
By employing DoRA, we enhance both the learning capacity and training stability of LoRA while avoiding any additional inference overhead. DoRA consistently outperforms LoRA on fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as commonsense reasoning, visual instruction tuning, and image/video-text understanding. The code is available at https://github.com/NVlabs/DoRA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shih-yang Liu;Chien-Yi Wang;Hongxu Yin;Pavlo Molchanov;Yu-Chiang Frank Wang;Kwang-Ting Cheng;Min-Hung Chen", "authorids": "~Shih-yang_Liu1;~Chien-Yi_Wang1;~Hongxu_Yin2;~Pavlo_Molchanov1;~Yu-Chiang_Frank_Wang2;~Kwang-Ting_Cheng1;~Min-Hung_Chen2", "gender": "M;M;;M;M;;M", "homepage": "https://vsdl.hkust.edu.hk/people.html;https://chienyiwang.github.io/;;;http://vllab.ee.ntu.edu.tw/ycwang.html;;https://minhungchen.netlify.app/", "dblp": ";12/6741;;165/8169.html;30/1690;;04/6305", "google_scholar": "eBXRoDgAAAAJ;05LW2DcAAAAJ;;J9PoyoIAAAAJ;HSGvdtoAAAAJ;;ovzuxi8AAAAJ", "orcid": "0000-0003-1997-0843;;;;0000-0002-2333-157X;;0000-0002-4046-3937", "linkedin": ";chienyiwang/;;;;;chensteven/", "or_profile": "~Shih-yang_Liu1;~Chien-Yi_Wang1;~Hongxu_Yin2;~Pavlo_Molchanov1;~Yu-Chiang_Frank_Wang2;~Kwang-Ting_Cheng1;~Min-Hung_Chen2", "aff": "NVIDIA;NVIDIA;;NVIDIA Research;National Taiwan University;;NVIDIA", "aff_domain": "nvidia.com;nvidia.com;;nvidia.com;ntu.edu.tw;;nvidia.com", "position": "Intern;Researcher;;Research Scientist;Full Professor;;Research Scientist", "bibtex": "@inproceedings{\nliu2024dora,\ntitle={Do{RA}: Weight-Decomposed Low-Rank Adaptation},\nauthor={Shih-yang Liu and Chien-Yi Wang and Hongxu Yin and Pavlo Molchanov and Yu-Chiang Frank Wang and Kwang-Ting Cheng and Min-Hung Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3d5CIRG1n2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5177308, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 396, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2411396716934347860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nvidia.com;nvidia.com;;nvidia.com;ntu.edu.tw;;nvidia.com", "author_num": 7, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "NVIDIA;National Taiwan University", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.ntu.edu.tw", "aff_unique_abbr": "NVIDIA;NTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "How Uniform Random Weights Induce Non-uniform Bias: Typical Interpolating Neural Networks Generalize with Narrow Teachers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35051", "id": "3eHNvPHL9Z", "proceeding": "https://proceedings.mlr.press/v235/buzaglo24a.html", "pdf": "https://openreview.net/pdf?id=3eHNvPHL9Z", "openreview": "https://openreview.net/forum?id=3eHNvPHL9Z", "author_site": "Gon Buzaglo, Itamar Harel, Mor Shpigel Nacson, Alon Brutzkus, Nati Srebro, Daniel Soudry", "tldr": "", "abstract": "A main theoretical puzzle is why over-parameterized Neural Networks (NNs) generalize well when trained to zero loss (i.e., so they interpolate the data). Usually, the NN is trained with Stochastic Gradient Descent (SGD) or one of its variants. 
However, recent empirical work examined the generalization of a random NN that interpolates the data: the NN was sampled from a seemingly uniform prior over the parameters, conditioned on the NN perfectly classifying the training set. Interestingly, such a NN sample typically generalized as well as SGD-trained NNs. We prove that such a random NN interpolator typically generalizes well if there exists an underlying narrow ``teacher NN'' that agrees with the labels. Specifically, we show that such a `flat' prior over the NN parametrization induces a rich prior over the NN functions, due to the redundancy in the NN structure. In particular, this creates a bias towards simpler functions, which require fewer relevant parameters to represent --- enabling learning with a sample complexity approximately proportional to the complexity of the teacher (roughly, the number of non-redundant parameters), rather than the student's.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gon Buzaglo;Itamar Harel;Mor Shpigel Nacson;Alon Brutzkus;Nathan Srebro;Daniel Soudry", "authorids": "~Gon_Buzaglo1;~Itamar_Harel1;~Mor_Shpigel_Nacson1;~Alon_Brutzkus1;~Nathan_Srebro1;~Daniel_Soudry1", "gender": ";M;;M;M;M", "homepage": "https://www.buzaglo.me/;;;;http://ttic.uchicago.edu/~nati/;https://soudry.github.io/", "dblp": "346/4869.html;368/6206;217/3134;161/7411;50/3633;126/1779", "google_scholar": "YZHL8N0AAAAJ;BdWvuiIAAAAJ;wrozdTYAAAAJ;m1wmXdgAAAAJ;https://scholar.google.com.tw/citations?user=ZnT-QpMAAAAJ;https://scholar.google.co.il/citations?user=AEBWEm8AAAAJ", "orcid": ";0009-0007-2900-4653;;;;0000-0001-9368-6352", "linkedin": "gonbuzaglo;itamar-harel-3245a82b2;;;;daniel-soudry-2aa3a88/", "or_profile": "~Gon_Buzaglo1;~Itamar_Harel1;~Mor_Shpigel_Nacson1;~Alon_Brutzkus1;~Nathan_Srebro1;~Daniel_Soudry1", "aff": "Technion - Israel Institute of Technology, Technion;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion, Technion;GSK plc;University of Chicago;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;campus.technion.ac.il;technion.ac.il;gsk.com;uchicago.edu;technion.ac.il", "position": "Undergrad student;MS student;PhD student;Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nbuzaglo2024how,\ntitle={How Uniform Random Weights Induce Non-uniform Bias: Typical Interpolating Neural Networks Generalize with Narrow Teachers},\nauthor={Gon Buzaglo and Itamar Harel and Mor Shpigel Nacson and Alon Brutzkus and Nathan Srebro and Daniel Soudry},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3eHNvPHL9Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 729731, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1546800956395412861&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "email": "technion.ac.il;campus.technion.ac.il;technion.ac.il;gsk.com;uchicago.edu;technion.ac.il", "author_num": 6, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Technion - Israel Institute of Technology;GlaxoSmithKline plc;University of Chicago", "aff_unique_dep": ";;", "aff_unique_url": "https://www.technion.ac.il;https://www.gsk.com;https://www.uchicago.edu", "aff_unique_abbr": "Technion;GSK;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2;0",
"aff_country_unique": "Israel;United Kingdom;United States" }, { "title": "Position: Data Authenticity, Consent, & Provenance for AI are all broken: what will it take to fix them?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35050", "id": "3hSTecKy1b", "proceeding": "https://proceedings.mlr.press/v235/longpre24b.html", "pdf": "https://openreview.net/pdf?id=3hSTecKy1b", "openreview": "https://openreview.net/forum?id=3hSTecKy1b", "author_site": "Shayne Longpre, Robert Mahari, Naana Obeng-Marnu, William Brannon, Tobin South, Katy Gero, Alex Pentland, Jad Kabbara", "tldr": "", "abstract": "New capabilities in foundation models are owed in large part to massive, widely-sourced, and under-documented training data collections. Existing practices in data collection have led to challenges in tracing authenticity, verifying consent, preserving privacy, addressing representation and bias, respecting copyright, and overall developing ethical and trustworthy foundation models. In response, regulation is emphasizing the need for training data transparency to understand foundation models\u2019 limitations. Based on a large-scale analysis of the foundation model training data landscape and existing solutions, we identify the missing infrastructure to facilitate responsible foundation model development practices. We examine the current shortcomings of common tools for tracing data authenticity, consent, and documentation, and outline how policymakers, developers, and data creators can facilitate responsible foundation model development by adopting universal data provenance standards.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shayne Longpre;Robert Mahari;Naana Obeng-Marnu;William Brannon;Tobin South;Katy Ilonka Gero;Alex Pentland;Jad Kabbara", "authorids": "~Shayne_Longpre1;~Robert_Mahari1;~Naana_Obeng-Marnu1;~William_Brannon1;~Tobin_South1;~Katy_Ilonka_Gero1;~Alex_Pentland1;~Jad_Kabbara1", "gender": "M;M;F;M;Not Specified;F;M;M", "homepage": "https://www.shaynelongpre.com;https://robertmahari.com/;;https://willbrannon.com/;https://tobin.page;http://www.katygero.com/;https://www.media.mit.edu/people/sandy/overview/;http://www.mit.edu/~jkabbara/", "dblp": "190/7024;;;245/2682;;;p/AlexPentland;148/9943", "google_scholar": "ADd_YfkAAAAJ;3qM8lPsAAAAJ;Xk25h8YAAAAJ;0Dd6lAEAAAAJ;r5pPBFMAAAAJ;AlDePoAAAAAJ;P4nfoKYAAAAJ;", "orcid": ";0000-0003-2372-2746;0000-0002-1501-4558;0000-0002-1435-8535;0000-0003-2740-9829;;;", "linkedin": "shayne-redford-longpre/;robert-mahari-874310157/;naana-obengmarnu/;wwbrannon;;;;", "or_profile": "~Shayne_Longpre1;~Robert_Mahari1;~Naana_Obeng-Marnu1;~William_Brannon1;~Tobin_South1;~Katy_Ilonka_Gero1;~Alex_Pentland1;~Jad_Kabbara1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;School of Engineering and Applied Sciences, Harvard University;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu;seas.harvard.edu;mit.edu;mit.edu", "position": "PhD student;PhD;MS student;PhD student;PhD student;Postdoc;Full Professor;Postdoc", "bibtex": "@inproceedings{\nlongpre2024position,\ntitle={Position: Data Authenticity, Consent, \\& Provenance for {AI} are all broken: what will it take to fix them?},\nauthor={Shayne Longpre and Robert Mahari and Naana Obeng-Marnu and William Brannon and Tobin South 
and Katy Ilonka Gero and Alex Pentland and Jad Kabbara},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3hSTecKy1b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 237161, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14808243614379286754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu;seas.harvard.edu;mit.edu;mit.edu", "author_num": 8, "aff_unique_index": "0;0;0;0;0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University", "aff_unique_dep": ";School of Engineering and Applied Sciences", "aff_unique_url": "https://web.mit.edu;https://www.harvard.edu", "aff_unique_abbr": "MIT;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Prompting a Pretrained Transformer Can Be a Universal Approximator", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35049", "id": "3mQ6ZKTSQl", "proceeding": "https://proceedings.mlr.press/v235/petrov24a.html", "pdf": "https://openreview.net/pdf?id=3mQ6ZKTSQl", "openreview": "https://openreview.net/forum?id=3mQ6ZKTSQl", "author_site": "Aleksandar Petrov, Phil Torr, Adel Bibi", "tldr": "", "abstract": "Despite the widespread adoption of prompting, prompt tuning and prefix-tuning of transformer models, our theoretical understanding of these fine-tuning methods remains limited. A key question is whether one can arbitrarily modify the behavior of a pretrained model by prompting or prefix-tuning it. Formally, the question is whether prompting and prefix-tuning a pretrained model can universally approximate sequence-to-sequence functions. This paper answers in the affirmative and demonstrates that much smaller pretrained models than previously thought can be universal approximators when prefixed. In fact, prefix-tuning a single attention head is sufficient to approximate any continuous function, making the attention mechanism uniquely suited for universal approximation. Moreover, any sequence-to-sequence function can be approximated by prefixing a transformer with depth linear in the sequence length.
Beyond these density-type results, we also offer Jackson-type bounds on the length of the prefix needed to approximate a function to a desired precision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aleksandar Petrov;Philip Torr;Adel Bibi", "authorids": "~Aleksandar_Petrov1;~Philip_Torr1;~Adel_Bibi1", "gender": "M;;M", "homepage": "https://p-petrov.com/;http://www.robots.ox.ac.uk/~tvg/;http://adelbibi.com", "dblp": "49/8105;;176/0964", "google_scholar": "em54BT4AAAAJ;;Q4j2laYAAAAJ", "orcid": ";;0000-0002-6169-3918", "linkedin": "aleksandar-petrov/;;adel-bibi-ba3671ab/", "or_profile": "~Aleksandar_Petrov1;~Philip_Torr1;~Adel_Bibi1", "aff": "Adobe Systems;University of Oxford;University of Oxford", "aff_domain": "adobe.com;ox.ac.uk;ox.ac.uk", "position": "Intern;Full Professor;Senior Researcher", "bibtex": "@inproceedings{\npetrov2024prompting,\ntitle={Prompting a Pretrained Transformer Can Be a Universal Approximator},\nauthor={Aleksandar Petrov and Philip Torr and Adel Bibi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3mQ6ZKTSQl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1832083, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17234455813671611257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "adobe.com;ox.ac.uk;ox.ac.uk", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Adobe;University of Oxford", "aff_unique_dep": "Adobe Systems Incorporated;", "aff_unique_url": "https://www.adobe.com;https://www.ox.ac.uk", "aff_unique_abbr": "Adobe;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Regression Learning with Limited Observations of Multivariate Outcomes and Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35048", "id": "3nlBesNxcm", "proceeding": "https://proceedings.mlr.press/v235/sun24k.html", "pdf": "https://openreview.net/pdf?id=3nlBesNxcm", "openreview": "https://openreview.net/forum?id=3nlBesNxcm", "author_site": "Yifan Sun, Grace Yi", "tldr": "", "abstract": "Multivariate linear regression models are broadly used to characterize relationships between outcomes and features. However, their effectiveness is compromised by the presence of missing observations, a ubiquitous challenge in real-world applications. Considering a scenario where learners access only limited components for both outcomes and features, we develop efficient algorithms tailored for the least squares ($L_2$) and least absolute ($L_1$) loss functions, coupled with a ridge-like penalty and a Lasso-type penalty, respectively. Moreover, we establish rigorous error bounds for all proposed algorithms. Notably, our $L_2$ loss function algorithms are probably approximately correct (PAC), distinguishing them from their $L_1$ counterparts. Extensive numerical experiments show that our approach outperforms methods that apply existing algorithms for univariate outcomes individually to each coordinate of multivariate outcomes in a naive manner. Further, utilizing the $L_1$ loss function or introducing a Lasso-type penalty can enhance predictions in the presence of outliers or high-dimensional features.
This research contributes valuable insights into addressing the challenges posed by incomplete data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Sun;Grace Yi", "authorids": "~Yifan_Sun11;~Grace_Yi1", "gender": "M;F", "homepage": "https://www.uwo.ca/stats/people/postdoctoral.html;http://fisher.stats.uwo.ca/faculty/yyi/", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Yifan_Sun11;~Grace_Yi1", "aff": ";University of Western Ontario", "aff_domain": ";uwo.ca", "position": ";Full Professor", "bibtex": "@inproceedings{\nsun2024regression,\ntitle={Regression Learning with Limited Observations of Multivariate Outcomes and Features},\nauthor={Yifan Sun and Grace Yi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3nlBesNxcm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 477181, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FD-FA75d7ZwJ:scholar.google.com/&scioq=Regression+Learning+with+Limited+Observations+of+Multivariate+Outcomes+and+Features&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": ";uwo.ca", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Western Ontario", "aff_unique_dep": "", "aff_unique_url": "https://www.uwo.ca", "aff_unique_abbr": "UWO", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "title": "Improved Generalization of Weight Space Networks via Augmentations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35047", "id": "3o7G6tIo4X", "proceeding": "https://proceedings.mlr.press/v235/shamsian24a.html", "pdf": "https://openreview.net/pdf?id=3o7G6tIo4X", "openreview": "https://openreview.net/forum?id=3o7G6tIo4X", "author_site": "Aviv Shamsian, Aviv Navon, David Zhang, Yan Zhang, Ethan Fetaya, Gal Chechik, Haggai Maron", "tldr": "", "abstract": "Learning in deep weight spaces (DWS), where neural networks process the weights of other neural networks, is an emerging research direction, with applications to 2D and 3D neural fields (INRs, NeRFs), as well as making inferences about other types of neural networks. Unfortunately, weight space models tend to suffer from substantial overfitting. We empirically analyze the reasons for this overfitting and find that a key reason is the lack of diversity in DWS datasets. While a given object can be represented by many different weight configurations, typical INR training sets fail to capture variability across INRs that represent the same object. To address this, we explore strategies for data augmentation in weight spaces and propose a MixUp method adapted for weight spaces. We demonstrate the effectiveness of these methods in two setups. In classification, they improve performance similarly to having up to 10 times more data. In self-supervised contrastive learning, they yield substantial 5-10% gains in downstream classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aviv Shamsian;Aviv Navon;David W. 
Zhang;Yan Zhang;Ethan Fetaya;Gal Chechik;Haggai Maron", "authorids": "~Aviv_Shamsian1;~Aviv_Navon1;~David_W._Zhang1;~Yan_Zhang1;~Ethan_Fetaya1;~Gal_Chechik1;~Haggai_Maron1", "gender": "M;M;M;M;;M;M", "homepage": ";https://avivnavon.github.io/;https://www.cyanogenoid.com;http://www.cs.toronto.edu/~ethanf/;https://chechiklab.biu.ac.il/~gal/;https://haggaim.github.io/;https://davzha.netlify.app/", "dblp": "261/9492;269/9785;04/3348-67;01/10046;c/GalChechik;181/6629;119/0960", "google_scholar": ";https://scholar.google.co.il/citations?user=N-sME4wAAAAJ;https://scholar.google.co.uk/citations?user=XtCqbfEAAAAJ;zLuqh-0AAAAJ;Wk2gAZUAAAAJ;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ;https://scholar.google.nl/citations?user=MG3oLzUAAAAJ", "orcid": ";;0000-0003-3470-3663;0000-0003-3125-1665;0000-0001-9164-5303;;0000-0002-2137-1738", "linkedin": "aviv-shamsian/;;;;;;david-zhang-1b86b314a", "or_profile": "~Aviv_Shamsian1;~Aviv_Navon1;~Yan_Zhang1;~Ethan_Fetaya1;~Gal_Chechik1;~Haggai_Maron1;~David_W_Zhang1", "aff": "Bar-Ilan University;Bar Ilan University, Israel;Mila - Quebec Artificial Intelligence Institute;Bar Ilan University;NVIDIA;NVIDIA;Qualcomm Inc, QualComm", "aff_domain": "biu.ac.il;biu.ac.il;mila.quebec;biu.ac.il;nvidia.com;nvidia.com;qti.qualcomm.com", "position": "PhD student;PhD student;Industrial Partner;Assistant Professor;Principal Researcher;Research Scientist;Researcher", "bibtex": "@inproceedings{\nshamsian2024improved,\ntitle={Improved Generalization of Weight Space Networks via Augmentations},\nauthor={Aviv Shamsian and Aviv Navon and David W. Zhang and Yan Zhang and Ethan Fetaya and Gal Chechik and Haggai Maron},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3o7G6tIo4X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4211157, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3978133348308241299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "biu.ac.il;biu.ac.il;mila.quebec;biu.ac.il;nvidia.com;nvidia.com;qti.qualcomm.com", "author_num": 7, "aff_unique_index": "0;0;1;0;2;2;3", "aff_unique_norm": "Bar-Ilan University;Quebec Artificial Intelligence Institute;NVIDIA;Qualcomm Incorporated", "aff_unique_dep": ";Artificial Intelligence;NVIDIA Corporation;", "aff_unique_url": "https://www.biu.ac.il;https://mila.quebec;https://www.nvidia.com;https://www.qualcomm.com", "aff_unique_abbr": "BIU;Mila;NVIDIA;Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;2;2", "aff_country_unique": "Israel;Canada;United States" }, { "title": "Biharmonic Distance of Graphs and its Higher-Order Variants: Theoretical Properties with Applications to Centrality and Clustering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35046", "id": "3pxMIjB9QK", "proceeding": "https://proceedings.mlr.press/v235/black24a.html", "pdf": "https://openreview.net/pdf?id=3pxMIjB9QK", "openreview": "https://openreview.net/forum?id=3pxMIjB9QK", "author_site": "Mitchell Black, Lucy Lin, Weng-Keen Wong, Amir Nayyeri", "tldr": "", "abstract": "Effective resistance is a distance between vertices of a graph that is both theoretically interesting and useful in applications. We study a variant of effective resistance called the biharmonic distance. 
While the effective resistance measures how well-connected two vertices are, we prove several theoretical results supporting the idea that the biharmonic distance measures how important an edge is to the global topology of the graph. Our theoretical results connect the biharmonic distance to well-known measures of connectivity of a graph like its total resistance and sparsity. Based on these results, we introduce two clustering algorithms using the biharmonic distance. Finally, we introduce a further generalization of the biharmonic distance that we call the $k$-harmonic distance. We empirically study the utility of biharmonic and $k$-harmonic distance for edge centrality and graph clustering.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mitchell Black;Lucy Lin;Weng-Keen Wong;Amir Nayyeri", "authorids": "~Mitchell_Black1;linlu@oregonstate.edu;~Weng-Keen_Wong1;~Amir_Nayyeri1", "gender": "M;;M;", "homepage": "https://mitchell.black;;http://www.eecs.oregonstate.edu/~wong;", "dblp": "262/3347-2;;19/1015;", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;", "orcid": ";;0000-0002-6673-343X;", "linkedin": ";;;", "or_profile": "~Mitchell_Black1;linlu@oregonstate.edu;~Weng-Keen_Wong1;~Amir_Nayyeri1", "aff": "Oregon State University;;Oregon State University;", "aff_domain": "oregonstate.edu;;oregonstate.edu;", "position": "PhD student;;Full Professor;", "bibtex": "@inproceedings{\nblack2024biharmonic,\ntitle={Biharmonic Distance of Graphs and its Higher-Order Variants: Theoretical Properties with Applications to Centrality and Clustering},\nauthor={Mitchell Black and Lucy Lin and Weng-Keen Wong and Amir Nayyeri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3pxMIjB9QK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1303916, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3291057022581854252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "oregonstate.edu;;oregonstate.edu;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Oregon State University", "aff_unique_dep": "", "aff_unique_url": "https://oregonstate.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CHEMREASONER: Heuristic Search over a Large Language Model\u2019s Knowledge Space using Quantum-Chemical Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35045", "id": "3tJDnEszco", "proceeding": "https://proceedings.mlr.press/v235/sprueill24a.html", "pdf": "https://openreview.net/pdf?id=3tJDnEszco", "openreview": "https://openreview.net/forum?id=3tJDnEszco", "author_site": "Henry W. Sprueill, Carl Edwards, Khushbu Agarwal, Mariefel Olarte, Udishnu Sanyal, Conrad Johnston, Hongbin Liu, Heng Ji, Sutanay Choudhury", "tldr": "", "abstract": "The discovery of new catalysts is essential for the design of new and more efficient chemical processes in order to transition to a sustainable future. We introduce an AI-guided computational screening framework unifying linguistic reasoning with quantum-chemistry based feedback from 3D atomistic representations. 
Our approach formulates catalyst discovery as an uncertain environment where an agent actively searches for highly effective catalysts via the iterative combination of large language model (LLM)-derived hypotheses and atomistic graph neural network (GNN)-derived feedback. Catalysts identified in intermediate search steps undergo structural evaluation based on spatial orientation, reaction pathways, and stability. Scoring functions based on adsorption energies and reaction energy barriers steer the exploration in the LLM's knowledge space toward energetically favorable, high-efficiency catalysts. We introduce planning methods that automatically guide the exploration without human input, providing competitive performance against expert-enumerated chemical descriptor-based implementations. By integrating language-guided reasoning with computational chemistry feedback, our work pioneers AI-accelerated, trustworthy catalyst discovery.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Henry W. Sprueill;Carl Edwards;Khushbu Agarwal;Mariefel V Olarte;Udishnu Sanyal;Conrad Johnston;Hongbin Liu;Heng Ji;Sutanay Choudhury", "authorids": "~Henry_W._Sprueill1;~Carl_Edwards1;~Khushbu_Agarwal2;~Mariefel_V_Olarte1;~Udishnu_Sanyal1;~Conrad_Johnston1;hongbin.liu@microsoft.com;~Heng_Ji3;~Sutanay_Choudhury2", "gender": ";M;F;;;;;F;", "homepage": ";https://cnedwards.com/;;;;;;http://blender.cs.illinois.edu/hengji.html;", "dblp": ";300/1001;72/8323.html;;;;;;57/7437", "google_scholar": ";https://scholar.google.com/citations?hl=en;F6639o4AAAAJ;;;;;z7GCqT4AAAAJ;oouJk7YAAAAJ", "orcid": ";;;0000-0003-2989-1110;0000-0002-7935-8691;;;;", "linkedin": ";carl-edwards-70a90592;khushbu-agarwal-8a603210/;;;;;;", "or_profile": "~Henry_W._Sprueill1;~Carl_Edwards1;~Khushbu_Agarwal2;~Mariefel_V_Olarte1;~Udishnu_Sanyal1;~Conrad_Johnston1;hongbin.liu@microsoft.com;~Heng_Ji3;~Sutanay_Choudhury2", "aff": ";Genentech;;Pacific Northwest National Laboratory;Pacific Northwest National Lab;;;University of Illinois, Urbana-Champaign;Pacific Northwest National Lab", "aff_domain": ";gene.com;;pnnl.gov;pnl.gov;;;uiuc.edu;pnl.gov", "position": ";Intern;;Research Engineer IV;Researcher;;;Full Professor;Scientist", "bibtex": "@inproceedings{\nsprueill2024chemreasoner,\ntitle={{CHEMREASONER}: Heuristic Search over a Large Language Model{\\textquoteright}s Knowledge Space using Quantum-Chemical Feedback},\nauthor={Henry W.
Sprueill and Carl Edwards and Khushbu Agarwal and Mariefel V Olarte and Udishnu Sanyal and Conrad Johnston and Hongbin Liu and Heng Ji and Sutanay Choudhury},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3tJDnEszco}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2963506, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6195886637373183148&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": ";gene.com;;pnnl.gov;pnl.gov;;;uiuc.edu;pnl.gov", "author_num": 9, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Genentech;Pacific Northwest National Laboratory;University of Illinois", "aff_unique_dep": ";;", "aff_unique_url": "https://www.genentech.com;https://www.pnnl.gov;https://illinois.edu", "aff_unique_abbr": "Genentech;PNNL;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Cross-Domain Policy Adaptation by Capturing Representation Mismatch", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35044", "id": "3uPSQmjXzd", "proceeding": "https://proceedings.mlr.press/v235/lyu24a.html", "pdf": "https://openreview.net/pdf?id=3uPSQmjXzd", "openreview": "https://openreview.net/forum?id=3uPSQmjXzd", "author_site": "Jiafei Lyu, Chenjia Bai, Jing-Wen Yang, Zongqing Lu, Xiu Li", "tldr": "", "abstract": "It is vital to learn effective policies that can be transferred to different domains with dynamics discrepancies in reinforcement learning (RL). In this paper, we consider dynamics adaptation settings where there exists dynamics mismatch between the source domain and the target domain, and one can get access to sufficient source domain data, while having only limited interactions with the target domain. Existing methods address this problem by learning domain classifiers, performing data filtering from a value discrepancy perspective, etc. Instead, we tackle this challenge from a decoupled representation learning perspective. We perform representation learning only in the target domain and measure the representation deviations on the transitions from the source domain, which we show can be a signal of dynamics mismatch. We also show that representation deviation upper bounds the performance difference of a given policy in the source domain and target domain, which motivates us to adopt representation deviation as a reward penalty. The produced representations are not involved in either policy or value function, but only serve as a reward penalizer. We conduct extensive experiments on environments with kinematic and morphology mismatch, and the results show that our method exhibits strong performance on many tasks.
Our code is publicly available at https://github.com/dmksjfl/PAR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiafei Lyu;Chenjia Bai;Jing-Wen Yang;Zongqing Lu;Xiu Li", "authorids": "~Jiafei_Lyu1;~Chenjia_Bai2;~Jing-Wen_Yang3;~Zongqing_Lu2;~Xiu_Li1", "gender": "M;M;M;;F", "homepage": ";https://baichenjia.github.io/;https://www.lamda.nju.edu.cn/yangjw/;;https://thusigsiclab.github.io/thu.github.io/introduction.html", "dblp": "278/1503;247/1943;204/2956.html;;13/1206-1", "google_scholar": "bfgCMr8AAAAJ;Rm_1y2kAAAAJ;;;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-6616-417X;;;;0000-0003-0403-1923", "linkedin": ";;;;", "or_profile": "~Jiafei_Lyu1;~Chenjia_Bai2;~Jing-Wen_Yang3;~Zongqing_Lu2;~Xiu_Li1", "aff": "Tsinghua University;Shanghai AI Laboratory;Game AI Center;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;pjlab.org.cn;tencent.com;;tsinghua.edu.cn", "position": "PhD student;Researcher;Principal Researcher;;Professor", "bibtex": "@inproceedings{\nlyu2024crossdomain,\ntitle={Cross-Domain Policy Adaptation by Capturing Representation Mismatch},\nauthor={Jiafei Lyu and Chenjia Bai and Jing-Wen Yang and Zongqing Lu and Xiu Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3uPSQmjXzd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1164312, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15230273693769155804&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;pjlab.org.cn;tencent.com;;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Tsinghua University;Shanghai AI Laboratory;Game AI Center", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.shanghai-ai-lab.com;", "aff_unique_abbr": "THU;SAIL;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China;" }, { "title": "ReLU to the Rescue: Improve Your On-Policy Actor-Critic with Positive Advantages", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35043", "id": "3umNqxjFad", "proceeding": "https://proceedings.mlr.press/v235/jesson24a.html", "pdf": "https://openreview.net/pdf?id=3umNqxjFad", "openreview": "https://openreview.net/forum?id=3umNqxjFad", "author_site": "Andrew Jesson, Christopher Lu, Gunshi Gupta, Nicolas Beltran-Velez, Angelos Filos, Jakob Foerster, Yarin Gal", "tldr": "", "abstract": "This paper proposes a step toward approximate Bayesian inference in on-policy actor-critic deep reinforcement learning. It is implemented through three changes to the Asynchronous Advantage Actor-Critic (A3C) algorithm: (1) applying a ReLU function to advantage estimates, (2) spectral normalization of actor-critic weights, and (3) incorporating *dropout as a Bayesian approximation*. We prove under standard assumptions that restricting policy updates to positive advantages optimizes for value by maximizing a lower bound on the value function plus an additive term. We show that the additive term is bounded proportional to the Lipschitz constant of the value function, which offers theoretical grounding for spectral normalization of critic weights. 
Finally, our application of dropout corresponds to approximate Bayesian inference over both the actor and critic parameters, which enables *adaptive state-aware* exploration around the modes of the actor via Thompson sampling. We demonstrate significant improvements for median and interquartile mean metrics over A3C, PPO, SAC, and TD3 on the MuJoCo continuous control benchmark and improvement over PPO in the challenging ProcGen generalization benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Jesson;Chris Lu;Gunshi Gupta;Nicolas Beltran-Velez;Angelos Filos;Jakob Nicolaus Foerster;Yarin Gal", "authorids": "~Andrew_Jesson1;~Chris_Lu1;~Gunshi_Gupta1;~Nicolas_Beltran-Velez1;~Angelos_Filos1;~Jakob_Nicolaus_Foerster1;~Yarin_Gal1", "gender": "M;;F;M;M;M;", "homepage": "https://oatml.cs.ox.ac.uk/members/andrew_jesson/;;;;;https://www.jakobfoerster.com;http://www.cs.ox.ac.uk/people/yarin.gal/website//", "dblp": ";77/9579;218/5542;;https://dblp.uni-trier.de/pers/hd/f/Filos:Angelos;176/5095;67/9076", "google_scholar": "ElJ_fC4AAAAJ;4WLoIRsAAAAJ;w4UK_9kAAAAJ;;SGjYdrEAAAAJ;6z4lQzMAAAAJ;https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;nicolas-beltran-velez-1950141a9;;;", "or_profile": "~Andrew_Jesson1;~Chris_Lu1;~Gunshi_Gupta1;~Nicolas_Beltran-Velez1;~Angelos_Filos1;~Jakob_Nicolaus_Foerster1;~Yarin_Gal1", "aff": "Columbia University;University of Oxford;University of Oxford;Columbia University;Google DeepMind;University of Oxford, University of Oxford;University of Oxford", "aff_domain": "columbia.edu;ox.ac.uk;ox.ac.uk;columbia.edu;deepmind.com;eng.ox.ac.uk;ox.ac.uk", "position": "Postdoc;PhD student;PhD student;PhD student;Researcher;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\njesson2024relu,\ntitle={Re{LU} to the Rescue: Improve Your On-Policy Actor-Critic with Positive Advantages},\nauthor={Andrew Jesson and Chris Lu and Gunshi Gupta and Nicolas Beltran-Velez and Angelos Filos and Jakob Nicolaus Foerster and Yarin Gal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3umNqxjFad}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6714509, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1652347692233005003&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "columbia.edu;ox.ac.uk;ox.ac.uk;columbia.edu;deepmind.com;eng.ox.ac.uk;ox.ac.uk", "author_num": 7, "aff_unique_index": "0;1;1;0;2;1;1", "aff_unique_norm": "Columbia University;University of Oxford;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://www.columbia.edu;https://www.ox.ac.uk;https://deepmind.com", "aff_unique_abbr": "Columbia;Oxford;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Lyapunov-stable Neural Control for State and Output Feedback: A Novel Formulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35042", "id": "3xPMW9JURD", "proceeding": "https://proceedings.mlr.press/v235/yang24f.html", "pdf": "https://openreview.net/pdf?id=3xPMW9JURD", "openreview": "https://openreview.net/forum?id=3xPMW9JURD", "author_site": "Lujie Yang, Hongkai Dai, Zhouxing Shi, Cho-Jui Hsieh, Russ Tedrake, Huan Zhang", "tldr": "", "abstract": 
"Learning-based neural-network (NN) control policies have shown impressive empirical performance in a wide range of tasks in robotics and control. However, formal (Lyapunov) stability guarantees over the region-of-attraction (ROA) for NN controllers with nonlinear dynamical systems are challenging to obtain, and most existing approaches rely on expensive solvers for sums-of-squares (SOS), mixed-integer programming (MIP), or satisfiability modulo theories (SMT). In this paper, we demonstrate a new framework for learning NN controllers together with Lyapunov certificates using fast empirical falsification and strategic regularizations. We propose a novel formulation that defines a larger verifiable region-of-attraction (ROA) than shown in the literature, and refines the conventional restrictive constraints on Lyapunov derivatives to focus only on certifiable ROAs. The Lyapunov condition is rigorously verified post-hoc using branch-and-bound with scalable linear bound propagation-based NN verification techniques. The approach is efficient and flexible, and the full training and verification procedure is accelerated on GPUs without relying on expensive solvers for SOS, MIP, nor SMT. The flexibility and efficiency of our framework allow us to demonstrate Lyapunov-stable output feedback control with synthesized NN-based controllers and NN-based observers with formal stability guarantees, for the first time in literature.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lujie Yang;Hongkai Dai;Zhouxing Shi;Cho-Jui Hsieh;Russ Tedrake;Huan Zhang", "authorids": "~Lujie_Yang1;~Hongkai_Dai1;~Zhouxing_Shi1;~Cho-Jui_Hsieh1;~Russ_Tedrake1;~Huan_Zhang1", "gender": ";;;M;M;M", "homepage": ";;https://shizhouxing.github.io;http://web.cs.ucla.edu/~chohsieh/index.html;http://people.csail.mit.edu/russt;http://huan-zhang.com", "dblp": ";;232/2169;14/2770;73/1296;23/1797-1.html", "google_scholar": "oU5haR0AAAAJ;ZZsEXLAAAAAJ;YFIr4PwAAAAJ;Wy89g4IAAAAJ;nxNkEiYAAAAJ;LTa3GzEAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Lujie_Yang1;~Hongkai_Dai1;~Zhouxing_Shi1;~Cho-Jui_Hsieh1;~Russ_Tedrake1;~Huan_Zhang1", "aff": "Massachusetts Institute of Technology;Toyota Research Institute;University of California, Los Angeles;University of California, Los Angeles;Massachusetts Institute of Technology;University of Illinois, Urbana Champaign", "aff_domain": "mit.edu;tri.global;ucla.edu;ucla.edu;mit.edu;uiuc.edu", "position": "PhD student;Researcher;PhD student;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024lyapunovstable,\ntitle={Lyapunov-stable Neural Control for State and Output Feedback: A Novel Formulation},\nauthor={Lujie Yang and Hongkai Dai and Zhouxing Shi and Cho-Jui Hsieh and Russ Tedrake and Huan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3xPMW9JURD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5018869, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18415234823238027436&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;tri.global;ucla.edu;ucla.edu;mit.edu;uiuc.edu", "author_num": 6, "aff_unique_index": "0;1;2;2;0;3", "aff_unique_norm": "Massachusetts Institute of Technology;Toyota Research Institute;University of California, Los Angeles;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;;", 
"aff_unique_url": "https://web.mit.edu;https://www.tri.global;https://www.ucla.edu;https://illinois.edu", "aff_unique_abbr": "MIT;TRI;UCLA;UIUC", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Los Angeles;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generalization Analysis of Deep Non-linear Matrix Completion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35041", "id": "40foON48am", "proceeding": "https://proceedings.mlr.press/v235/ledent24a.html", "pdf": "https://openreview.net/pdf?id=40foON48am", "openreview": "https://openreview.net/forum?id=40foON48am", "author_site": "Antoine Ledent, Rodrigo Alves", "tldr": "", "abstract": "We provide generalization bounds for matrix completion with Schatten $p$ quasi-norm constraints, which is equivalent to deep matrix factorization with Frobenius constraints. In the uniform sampling regime, the sample complexity scales like $\\widetilde{O}\\left( rn\\right)$ where $n$ is the size of the matrix and $r$ is a constraint of the same order as the ground truth rank in the isotropic case. In the distribution-free setting, the bounds scale as $\\widetilde{O}\\left(r^{1-\\frac{p}{2}}n^{1+\\frac{p}{2}}\\right)$, which reduces to the familiar $\\sqrt{r}n^{\\frac{3}{2}}$ for $p=1$. Furthermore, we provide an analogue of the weighted trace norm for this setting which brings the sample complexity down to $\\widetilde{O}(nr)$ in all cases. We then present a non-linear model, Functionally Rescaled Matrix Completion (FRMC) which applies a single trainable function from $\\mathbb{R}\\rightarrow \\mathbb{R}$ to each entry of a latent matrix, and prove that this adds only negligible terms of the overall sample complexity, whilst experiments demonstrate that this simple model improvement already leads to significant gains on real data. 
We also extend our results to various neural architectures, thereby providing the first comprehensive uniform-convergence PAC analysis of neural network matrix completion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antoine Ledent;Rodrigo Alves", "authorids": "~Antoine_Ledent1;~Rodrigo_Alves1", "gender": "M;M", "homepage": "https://pages.fit.cvut.cz/dasilrod/;https://sites.google.com/view/antoine-ledent/home", "dblp": "143/7395.html;241/9500", "google_scholar": "86t4YwYAAAAJ;https://scholar.google.de/citations?user=xxbYi-MAAAAJ", "orcid": "0000-0001-7458-5281;0000-0001-8440-2784", "linkedin": ";antoine-ledent-005a46160/?originalSubdomain=de", "or_profile": "~Rodrigo_Alves1;~Antoine_Patrick_Isabelle_Eric_Ledent1", "aff": "Czech Technical University in Prague;Singapore Management University", "aff_domain": "cvut.cz;smu.edu.sg", "position": "Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nledent2024generalization,\ntitle={Generalization Analysis of Deep Non-linear Matrix Completion},\nauthor={Antoine Ledent and Rodrigo Alves},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=40foON48am}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1151812, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4290811240824891604&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cvut.cz;smu.edu.sg", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Czech Technical University;Singapore Management University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ctu.cz;https://www.smu.edu.sg", "aff_unique_abbr": "CTU;SMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Prague;", "aff_country_unique_index": "0;1", "aff_country_unique": "Czech Republic;Singapore" }, { "title": "InfoNet: Neural Estimation of Mutual Information without Test-Time Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35040", "id": "40hCy8n5XH", "proceeding": "https://proceedings.mlr.press/v235/hu24h.html", "pdf": "https://openreview.net/pdf?id=40hCy8n5XH", "openreview": "https://openreview.net/forum?id=40hCy8n5XH", "author_site": "Zhengyang Hu, Song Kang, Qunsong Zeng, Kaibin Huang, Yanchao Yang", "tldr": "", "abstract": "Estimating mutual correlations between random variables or data streams is essential for intelligent behavior and decision-making. As a fundamental quantity for measuring statistical relationships, mutual information has been extensively studied and utilized for its generality and equitability. However, existing methods often lack the efficiency needed for real-time applications, as with estimators requiring test-time optimization of a neural network, or the differentiability required for end-to-end learning, as with histograms. We introduce a neural network called InfoNet, which directly outputs mutual information estimations of data streams by leveraging the attention mechanism and the computational efficiency of deep learning infrastructures. By maximizing a dual formulation of mutual information through large-scale simulated training, our approach circumvents time-consuming test-time optimization and offers generalization ability. We evaluate the effectiveness and generalization of our proposed mutual information estimation scheme on various families of distributions and applications.
Our results demonstrate that InfoNet and its training process provide a graceful efficiency-accuracy trade-off and order-preserving properties. We will make the code and models available as a comprehensive toolbox to facilitate studies in different fields requiring real-time mutual information estimation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengyang Hu;Song Kang;Qunsong Zeng;Kaibin Huang;Yanchao Yang", "authorids": "~Zhengyang_Hu1;~Song_Kang1;~Qunsong_Zeng1;~Kaibin_Huang1;~Yanchao_Yang1", "gender": "M;M;;M;M", "homepage": ";;;https://www.eee.hku.hk/~huangkb/;https://yanchaoyang.github.io/", "dblp": "219/6927-2;;;;84/8637-1", "google_scholar": "qe-fgwYAAAAJ;https://scholar.google.com/citations?hl=en;;HfuA3uIAAAAJ;r2tKnV4AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Zhengyang_Hu1;~Song_Kang1;~Qunsong_Zeng1;~Kaibin_Huang1;~Yanchao_Yang1", "aff": "Hong Kong University;University of Science and Technology of China;;;University of Hong Kong", "aff_domain": "connect.hku.hk;ustc.edu.cn;;;hku.hk", "position": "PhD student;Undergrad student;;;Assistant Professor", "bibtex": "@inproceedings{\nhu2024infonet,\ntitle={InfoNet: Neural Estimation of Mutual Information without Test-Time Optimization},\nauthor={Zhengyang Hu and Song Kang and Qunsong Zeng and Kaibin Huang and Yanchao Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=40hCy8n5XH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8608186, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10083906016032550618&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "connect.hku.hk;ustc.edu.cn;;;hku.hk", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Hong Kong University;University of Science and Technology of China;University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;http://www.ustc.edu.cn;https://www.hku.hk", "aff_unique_abbr": "HKU;USTC;HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Diffusion Tempering Improves Parameter Estimation with Probabilistic Integrators for Ordinary Differential Equations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35039", "id": "43HZG9zwaj", "proceeding": "https://proceedings.mlr.press/v235/beck24a.html", "pdf": "https://openreview.net/pdf?id=43HZG9zwaj", "openreview": "https://openreview.net/forum?id=43HZG9zwaj", "author_site": "Jonas Beck, Nathanael Bosch, Michael Deistler, Kyra Kadhim, Jakob Macke, Philipp Hennig, Philipp Berens", "tldr": "", "abstract": "Ordinary differential equations (ODEs) are widely used to describe dynamical systems in science, but identifying parameters that explain experimental measurements is challenging. In particular, although ODEs are differentiable and would allow for gradient-based parameter optimization, the nonlinear dynamics of ODEs often lead to many local minima and extreme sensitivity to initial conditions. We therefore propose diffusion tempering, a novel regularization technique for probabilistic numerical methods which improves convergence of gradient-based parameter optimization in ODEs. 
By iteratively reducing a noise parameter of the probabilistic integrator, the proposed method converges more reliably to the true parameters. We demonstrate that our method is effective for dynamical systems of different complexity and show that it obtains reliable parameter estimates for a Hodgkin--Huxley model with a practically relevant number of parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonas Beck;Nathanael Bosch;Michael Deistler;Kyra L. Kadhim;Jakob H. Macke;Philipp Hennig;Philipp Berens", "authorids": "~Jonas_Beck1;~Nathanael_Bosch1;~Michael_Deistler1;~Kyra_L._Kadhim1;~Jakob_H._Macke1;~Philipp_Hennig1;~Philipp_Berens1", "gender": ";M;M;F;M;M;M", "homepage": "https://github.com/jnsbck;https://nathanaelbosch.github.io;https://michaeldeistler.github.io/;https://hertie.ai/data-science/team/members/kyra-kadhim;http://mml.inf.uni-tuebingen.de;http://www.berenslab.org;http://www.mackelab.org", "dblp": "331/5676;264/9948;243/5747;;08/9077;78/3560;97/11106", "google_scholar": "jndTQCgAAAAJ;2vejDygAAAAJ;Q24H-zYAAAAJ;IMwZHisAAAAJ;https://scholar.google.de/citations?user=UeG5w08AAAAJ;https://scholar.google.de/citations?user=lPQLk3QAAAAJ;FKOqtF8AAAAJ", "orcid": "0009-0000-0338-2559;0000-0003-0139-4622;0000-0002-3573-0404;;0000-0001-7293-6092;;0000-0001-5154-8912", "linkedin": ";;;;;;", "or_profile": "~Jonas_Beck1;~Nathanael_Bosch1;~Michael_Deistler1;~Kyra_L._Kadhim1;~Philipp_Hennig1;~Philipp_Berens1;~Jakob_H_Macke1", "aff": "University of Tuebingen;University of Tuebingen;University of Tuebingen;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of T\u00fcbingen;University of Tuebingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nbeck2024diffusion,\ntitle={Diffusion Tempering Improves Parameter Estimation with Probabilistic Integrators for Ordinary Differential Equations},\nauthor={Jonas Beck and Nathanael Bosch and Michael Deistler and Kyra L. Kadhim and Jakob H. 
Macke and Philipp Hennig and Philipp Berens},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=43HZG9zwaj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3106144, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3828127099916742151&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "author_num": 7, "aff_unique_index": "0;0;0;1;2;0;0", "aff_unique_norm": "University of Tuebingen;Eberhard Karls University of T\u00fcbingen;University of T\u00fcbingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Position: Scarce Resource Allocations That Rely On Machine Learning Should Be Randomized", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35038", "id": "44qxX6Ty6F", "proceeding": "https://proceedings.mlr.press/v235/jain24a.html", "pdf": "https://openreview.net/pdf?id=44qxX6Ty6F", "openreview": "https://openreview.net/forum?id=44qxX6Ty6F", "author_site": "Shomik Jain, Kathleen A. Creel, Ashia Wilson", "tldr": "", "abstract": "Contrary to traditional deterministic notions of algorithmic fairness, this paper argues that fairly allocating scarce resources using machine learning often requires randomness. 
We address why, when, and how to randomize by offering a set of stochastic procedures that more adequately account for all of the claims individuals have to allocations of social goods or opportunities and effectively balance their interests.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shomik Jain;Kathleen Creel;Ashia Camage Wilson", "authorids": "~Shomik_Jain1;~Kathleen_Creel1;~Ashia_Camage_Wilson1", "gender": ";F;F", "homepage": ";https://kathleenacreel.com;https://www.ashiawilson.com", "dblp": ";249/6794;", "google_scholar": "KXHdjLoAAAAJ;qMIT0dcAAAAJ;", "orcid": ";0000-0001-7371-2680;", "linkedin": ";;", "or_profile": "~Shomik_Jain1;~Kathleen_Creel1;~Ashia_C._Wilson1", "aff": "Massachusetts Institute of Technology;Northeastern University;Massachusetts Institute of Technology", "aff_domain": "mit.edu;northeastern.edu;mit.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njain2024position,\ntitle={Position: Scarce Resource Allocations That Rely On Machine Learning Should Be Randomized},\nauthor={Shomik Jain and Kathleen Creel and Ashia Camage Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=44qxX6Ty6F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9515388, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4999428906047336984&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "email": "mit.edu;northeastern.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.northeastern.edu", "aff_unique_abbr": "MIT;NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Perturb-and-Project: Differentially Private Similarities and Marginals", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35037", "id": "45HNimd4YI", "proceeding": "https://proceedings.mlr.press/v235/cohen-addad24a.html", "pdf": "https://openreview.net/pdf?id=45HNimd4YI", "openreview": "https://openreview.net/forum?id=45HNimd4YI", "author_site": "Vincent Cohen-Addad, Tommaso d'Orsi, Alessandro Epasto, Vahab Mirrokni, Peilin Zhong", "tldr": "", "abstract": "We revisit the objective perturbations framework for differential privacy where noise is added to the input $A\\in \\mathcal{S}$ and the result is then projected back to the space of admissible datasets $\\mathcal{S}$. Through this framework, we first design novel efficient algorithms to privately release pair-wise cosine similarities. Second, we derive a novel algorithm to compute $k$-way marginal queries over $n$ features. Prior work could achieve comparable guarantees only for $k$ even. Furthermore, we extend our results to $t$-sparse datasets, where our efficient algorithms yield novel, stronger guarantees whenever $t\\le n^{5/6}/\\log n$. Finally, we provide a theoretical perspective on why *fast* input perturbation algorithms work well in practice. 
The key technical ingredients behind our results are tight sum-of-squares certificates upper bounding the Gaussian complexity of sets of solutions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Cohen-Addad;Tommaso d'Orsi;Alessandro Epasto;Vahab Mirrokni;Peilin Zhong", "authorids": "~Vincent_Cohen-Addad1;~Tommaso_d'Orsi1;~Alessandro_Epasto3;~Vahab_Mirrokni2;~Peilin_Zhong1", "gender": ";;M;M;M", "homepage": ";https://tommasodorsi.github.io;https://epasto.org;https://people.csail.mit.edu/mirrokni/Welcome.html;http://www.cs.columbia.edu/~peilin/", "dblp": "136/5814;275/8135;58/7802;m/VahabSMirrokni;148/9632", "google_scholar": ";;https://scholar.google.com/citations?hl=en;opbZfw0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-0456-3217;;", "linkedin": ";;https://www.linkedin.com/pub/alessandro-epasto/85/649/733/;;", "or_profile": "~Vincent_Cohen-Addad1;~Tommaso_d'Orsi1;~Alessandro_Epasto3;~Vahab_Mirrokni2;~Peilin_Zhong1", "aff": "Google;Bocconi University;Google;Google Research;Google", "aff_domain": "google.com;unibocconi.it;google.com;google.com;google.com", "position": "Researcher;Assistant Professor;Research Scientist;VP, Google Fellow;Researcher", "bibtex": "@inproceedings{\ncohen-addad2024perturbandproject,\ntitle={Perturb-and-Project: Differentially Private Similarities and Marginals},\nauthor={Vincent Cohen-Addad and Tommaso d'Orsi and Alessandro Epasto and Vahab Mirrokni and Peilin Zhong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=45HNimd4YI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 435859, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XBS9VN6g9HkJ:scholar.google.com/&scioq=Perturb-and-Project:+Differentially+Private+Similarities+and+Marginals&hl=en&as_sdt=0,47", "gs_version_total": 6, "email": "google.com;unibocconi.it;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Google;Bocconi University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.bocconi.edu", "aff_unique_abbr": "Google;Bocconi", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Italy" }, { "title": "Accelerating Look-ahead in Bayesian Optimization: Multilevel Monte Carlo is All you Need", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35036", "id": "46vXhZn7lN", "proceeding": "https://proceedings.mlr.press/v235/yang24aj.html", "pdf": "https://openreview.net/pdf?id=46vXhZn7lN", "openreview": "https://openreview.net/forum?id=46vXhZn7lN", "author_site": "Shangda Yang, Vitaly Zankin, Maximilian Balandat, Stefan Scherer, Kevin Carlberg, Neil Walton, Kody Law", "tldr": "", "abstract": "We leverage multilevel Monte Carlo (MLMC) to improve the performance of multi-step look-ahead Bayesian optimization (BO) methods that involve nested expectations and maximizations. Often these expectations must be computed by Monte Carlo (MC). The complexity rate of naive MC degrades for nested operations, whereas MLMC is capable of achieving the canonical MC convergence rate for this type of problem, independently of dimension and without any smoothness assumptions. 
Our theoretical study focuses on the approximation improvements for two- and three-step look-ahead acquisition functions, but, as we discuss, the approach is generalizable in various ways, including beyond the context of BO. Our findings are verified numerically and the benefits of MLMC for BO are illustrated on several benchmark examples. Code is available at https://github.com/Shangda-Yang/MLMCBO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shangda Yang;Vitaly Zankin;Maximilian Balandat;Stefan Scherer;Kevin Thomas Carlberg;Neil Walton;Kody J. H. Law", "authorids": "~Shangda_Yang1;~Vitaly_Zankin1;~Maximilian_Balandat1;~Stefan_Scherer1;~Kevin_Thomas_Carlberg1;neil.walton@durham.ac.uk;~Kody_J._H._Law1", "gender": "M;;;M;;;", "homepage": ";;https://research.facebook.com/people/balandat-max/;;;;", "dblp": ";;41/9185;60/5336;;;", "google_scholar": "UkrtutgAAAAJ;;N0iLicUAAAAJ;rbGxNYwAAAAJ;;;", "orcid": ";;0000-0002-8214-8935;0000-0002-0280-5393;;;", "linkedin": ";;maximilian-balandat-b5843946/;;;;", "or_profile": "~Shangda_Yang1;~Vitaly_Zankin1;~Maximilian_Balandat1;~Stefan_Scherer1;~Kevin_Thomas_Carlberg1;neil.walton@durham.ac.uk;~Kody_J._H._Law1", "aff": "University of Manchester;;Meta;University of Southern California;;;", "aff_domain": "manchester.ac.uk;;meta.com;usc.edu;;;", "position": "PhD student;;Research Scientist Manager;Assistant Professor;;;", "bibtex": "@inproceedings{\nyang2024accelerating,\ntitle={Accelerating Look-ahead in Bayesian Optimization: Multilevel Monte Carlo is All you Need},\nauthor={Shangda Yang and Vitaly Zankin and Maximilian Balandat and Stefan Scherer and Kevin Thomas Carlberg and Neil Walton and Kody J. H. Law},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=46vXhZn7lN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1163463, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9704269002997318469&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "manchester.ac.uk;;meta.com;usc.edu;;;", "author_num": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Manchester;Meta;University of Southern California", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://www.manchester.ac.uk;https://meta.com;https://www.usc.edu", "aff_unique_abbr": "UoM;Meta;USC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Universality of Linear Recurrences Followed by Non-linear Projections: Finite-Width Guarantees and Benefits of Complex Eigenvalues", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35035", "id": "47ahBl70xb", "proceeding": "https://proceedings.mlr.press/v235/orvieto24a.html", "pdf": "https://openreview.net/pdf?id=47ahBl70xb", "openreview": "https://openreview.net/forum?id=47ahBl70xb", "author_site": "Antonio Orvieto, Soham De, Caglar Gulcehre, Razvan Pascanu, Samuel Smith", "tldr": "", "abstract": "Deep neural networks based on linear RNNs interleaved with position-wise MLPs are gaining traction as competitive approaches for sequence modeling. 
Examples of such architectures include state-space models (SSMs) like S4, LRU, and Mamba: recently proposed models that achieve promising performance on text, genetics, and other data that require long-range reasoning. Despite experimental evidence highlighting these architectures' effectiveness and computational efficiency, their expressive power remains relatively unexplored, especially in connection to specific choices crucial in practice - e.g., carefully designed initialization distribution and potential use of complex numbers. In this paper, we show that combining MLPs with either real or complex linear diagonal recurrences leads to arbitrarily precise approximation of regular causal sequence-to-sequence maps. At the heart of our proof, we rely on a separation of concerns: the linear RNN provides a lossless encoding of the input sequence, and the MLP performs non-linear processing on this encoding. While we show that real diagonal linear recurrences are enough to achieve universality in this architecture, we prove that employing complex eigenvalues near the unit disk - i.e., empirically the most successful strategy in S4 - greatly helps the RNN in storing information. We connect this finding with the vanishing gradient issue and provide experiments supporting our claims.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antonio Orvieto;Soham De;Caglar Gulcehre;Razvan Pascanu;Samuel L Smith", "authorids": "~Antonio_Orvieto3;~Soham_De2;caglar.gulcehre@epfl.ch;~Razvan_Pascanu1;~Samuel_L_Smith1", "gender": "M;M;;M;M", "homepage": "http://orvi.altervista.org/;https://sohamde.github.io;;https://razp.info;https://www.samtalksml.net/", "dblp": ";124/9197;;65/8368.html;", "google_scholar": "xkuLyHoAAAAJ;lHf55pF3KVQC;;https://scholar.google.ca/citations?user=eSPY8LwAAAAJ;https://scholar.google.co.uk/citations?user=fyEqU5oAAAAJ", "orcid": ";;;;", "linkedin": "antonio-orvieto-947ab0130/;;;;", "or_profile": "~Antonio_Orvieto3;~Soham_De2;caglar.gulcehre@epfl.ch;~Razvan_Pascanu1;~Samuel_L_Smith1", "aff": "ELLIS Institute T\u00fcbingen, Max Planck Institute for Intelligent Systems, T\u00fcbingen AI Center, T\u00fcbingen, Germany;Google DeepMind;;Google DeepMind;babylon health", "aff_domain": "tue.ellis.eu;google.com;;google.com;babylonhealth.com", "position": "Principal Researcher;Research Scientist;;Research Scientist;Data scientist", "bibtex": "@inproceedings{\norvieto2024universality,\ntitle={Universality of Linear Recurrences Followed by Non-linear Projections: Finite-Width Guarantees and Benefits of Complex Eigenvalues},\nauthor={Antonio Orvieto and Soham De and Caglar Gulcehre and Razvan Pascanu and Samuel L Smith},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=47ahBl70xb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4441755, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14201374155981570047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tue.ellis.eu;google.com;;google.com;babylonhealth.com", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "ELLIS Institute T\u00fcbingen;Google;Babylon Health", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": ";https://deepmind.com;https://www.babylonhealth.com", "aff_unique_abbr": ";DeepMind;Babylon", "aff_campus_unique_index": "0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": 
"0;1;1;1", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Eluder-based Regret for Stochastic Contextual MDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35034", "id": "47jMS97wJX", "proceeding": "https://proceedings.mlr.press/v235/levy24a.html", "pdf": "https://openreview.net/pdf?id=47jMS97wJX", "openreview": "https://openreview.net/forum?id=47jMS97wJX", "author_site": "Orin Levy, Asaf Cassel, Alon Cohen, Yishay Mansour", "tldr": "", "abstract": "We present the E-UC$^3$RL algorithm for regret minimization in Stochastic Contextual Markov Decision Processes (CMDPs). The algorithm operates under the minimal assumptions of realizable function class and access to *offline* least squares and log loss regression oracles. Our algorithm is efficient (assuming efficient offline regression oracles) and enjoys a regret guarantee of $ \\widetilde{O}(H^3 \\sqrt{T |S| |A|d_{\\mathrm{E}}(\\mathcal{P}) \\log (|\\mathcal{F}| |\\mathcal{P}|/ \\delta) )}) $ , with $T$ being the number of episodes, $S$ the state space, $A$ the action space, $H$ the horizon, $\\mathcal{P}$ and $\\mathcal{F}$ are finite function classes used to approximate the context-dependent dynamics and rewards, respectively, and $d_{\\mathrm{E}}(\\mathcal{P})$ is the Eluder dimension of $\\mathcal{P}$ w.r.t the Hellinger distance. To the best of our knowledge, our algorithm is the first efficient and rate-optimal regret minimization algorithm for CMDPs that operates under the general offline function approximation setting. In addition, we extend the Eluder dimension to general bounded metrics which may be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Orin Levy;Asaf Cassel;Alon Cohen;Yishay Mansour", "authorids": "~Orin_Levy1;~Asaf_Cassel1;~Alon_Cohen1;~Yishay_Mansour2", "gender": "F;M;M;M", "homepage": "https://sites.google.com/view/orinlevy/home;https://sites.google.com/site/aloncohentechnion/;;https://www.cs.tau.ac.il/~mansour/", "dblp": "315/0411;133/2021;222/3222;m/YishayMansour", "google_scholar": "X-7G2gQAAAAJ;shoYR_AAAAAJ;vhIydFkAAAAJ;OEJUgwkAAAAJ", "orcid": ";;;0000-0001-6891-2645", "linkedin": "orin-l-5997b0136/;;;", "or_profile": "~Orin_Levy1;~Alon_Cohen1;~Asaf_Benjamin_Cassel1;~Yishay_Mansour1", "aff": "Amazon;Google;Tel Aviv University;School of Computer Science, Tel Aviv University", "aff_domain": "amazon.com;google.com;tau.ac.il;cs.tau.ac.il", "position": "Intern;Researcher;PhD student;Full Professor", "bibtex": "@inproceedings{\nlevy2024eluderbased,\ntitle={Eluder-based Regret for Stochastic Contextual {MDP}s},\nauthor={Orin Levy and Asaf Cassel and Alon Cohen and Yishay Mansour},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=47jMS97wJX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 410805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8238350234221560047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "amazon.com;google.com;tau.ac.il;cs.tau.ac.il", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Amazon;Google;Tel Aviv University", "aff_unique_dep": "Amazon.com, Inc.;Google;", "aff_unique_url": "https://www.amazon.com;https://www.google.com;https://www.tau.ac.il", "aff_unique_abbr": "Amazon;Google;TAU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Tel Aviv", 
"aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Israel" }, { "title": "A Probabilistic Approach to Learning the Degree of Equivariance in Steerable CNNs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35033", "id": "49vHLSxjzy", "proceeding": "https://proceedings.mlr.press/v235/veefkind24a.html", "pdf": "https://openreview.net/pdf?id=49vHLSxjzy", "openreview": "https://openreview.net/forum?id=49vHLSxjzy", "author_site": "Lars Veefkind, Gabriele Cesa", "tldr": "", "abstract": "Steerable convolutional neural networks (SCNNs) enhance task performance by modelling geometric symmetries through equivariance constraints on weights. Yet, unknown or varying symmetries can lead to overconstrained weights and decreased performance. To address this, this paper introduces a probabilistic method to learn the degree of equivariance in SCNNs. We parameterise the degree of equivariance as a likelihood distribution over the transformation group using Fourier coefficients, offering the option to model layer-wise and shared equivariance. These likelihood distributions are regularised to ensure an interpretable degree of equivariance across the network. Advantages include the applicability to many types of equivariant networks through the flexible framework of SCNNs and the ability to learn equivariance with respect to any subgroup of any compact group without requiring additional layers. Our experiments reveal competitive performance on datasets with mixed symmetries, with learnt likelihood distributions that are representative of the underlying degree of equivariance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lars Veefkind;Gabriele Cesa", "authorids": "~Lars_Veefkind1;~Gabriele_Cesa1", "gender": "M;M", "homepage": "https://www.linkedin.com/in/lars-veefkind/;https://github.com/Gabri95", "dblp": ";254/1536", "google_scholar": ";hTplhaMAAAAJ", "orcid": ";", "linkedin": "lars-veefkind/;", "or_profile": "~Lars_Veefkind1;~Gabriele_Cesa1", "aff": ";Qualcomm Inc, QualComm", "aff_domain": ";qti.qualcomm.com", "position": ";Researcher", "bibtex": "@inproceedings{\nveefkind2024a,\ntitle={A Probabilistic Approach to Learning the Degree of Equivariance in Steerable {CNN}s},\nauthor={Lars Veefkind and Gabriele Cesa},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=49vHLSxjzy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6853238, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14351535335845609831&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";qti.qualcomm.com", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Qualcomm Incorporated", "aff_unique_dep": "", "aff_unique_url": "https://www.qualcomm.com", "aff_unique_abbr": "Qualcomm", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Remembering to Be Fair: Non-Markovian Fairness in Sequential Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35032", "id": "4BIOZSz7zU", "proceeding": "https://proceedings.mlr.press/v235/alamdari24a.html", "pdf": "https://openreview.net/pdf?id=4BIOZSz7zU", "openreview": "https://openreview.net/forum?id=4BIOZSz7zU", "author_site": "Parand A. Alamdari, Toryn Q. 
Klassen, Elliot Creager, Sheila McIlraith", "tldr": "", "abstract": "Fair decision making has largely been studied with respect to a single decision. Here we investigate the notion of fairness in the context of sequential decision making where multiple stakeholders can be affected by the outcomes of decisions. We observe that fairness often depends on the history of the sequential decision-making process, and in this sense it is inherently non-Markovian. We further observe that fairness often needs to be assessed at time points *within* the process, not just at the end of the process. To advance our understanding of this class of fairness problems, we explore the notion of non-Markovian fairness in the context of sequential decision making. We identify properties of non-Markovian fairness, including notions of long-term, anytime, periodic, and bounded fairness. We explore the interplay between non-Markovian fairness and memory and how memory can support construction of fair policies. Finally, we introduce the FairQCM algorithm, which can automatically augment its training data to improve sample efficiency in the synthesis of fair policies via reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Parand A. Alamdari;Toryn Q. Klassen;Elliot Creager;Sheila A. McIlraith", "authorids": "~Parand_A._Alamdari1;~Toryn_Q._Klassen1;~Elliot_Creager1;~Sheila_A._McIlraith1", "gender": ";M;F;F", "homepage": "http://tqk.ca;https://ecreager.github.io/;http://www.cs.toronto.edu/~sheila/;http://praal.github.io", "dblp": "213/4964;182/2055;66/3221;266/1421", "google_scholar": "https://scholar.google.ca/citations?user=uNl1QHMAAAAJ;boebIUcAAAAJ;https://scholar.google.com.tw/citations?user=ny2zuvMAAAAJ;WE3XiuoAAAAJ", "orcid": ";0009-0004-7122-3866;0000-0003-4953-0945;", "linkedin": ";;sheila-mcilraith-a76aa513/?originalSubdomain=ca;", "or_profile": "~Toryn_Q._Klassen1;~Elliot_Creager1;~Sheila_A._McIlraith1;~Parand_Alizadeh_Alamdari1", "aff": "University of Toronto;University of Waterloo;Department of Computer Science, University of Toronto;University of Toronto", "aff_domain": "toronto.edu;uwaterloo.ca;cs.toronto.edu;cs.toronto.edu", "position": "Postdoc;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nalamdari2024remembering,\ntitle={Remembering to Be Fair: Non-Markovian Fairness in Sequential Decision Making},\nauthor={Parand A. Alamdari and Toryn Q. Klassen and Elliot Creager and Sheila A. 
McIlraith},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4BIOZSz7zU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 562020, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4625567665247682092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "toronto.edu;uwaterloo.ca;cs.toronto.edu;cs.toronto.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Toronto;University of Waterloo", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://uwaterloo.ca", "aff_unique_abbr": "U of T;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "PerceptAnon: Exploring the Human Perception of Image Anonymization Beyond Pseudonymization for GDPR", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35031", "id": "4BWCecFEcQ", "proceeding": "https://proceedings.mlr.press/v235/patwari24a.html", "pdf": "https://openreview.net/pdf?id=4BWCecFEcQ", "openreview": "https://openreview.net/forum?id=4BWCecFEcQ", "author_site": "Kartik Patwari, Chen-Nee Chuah, Lingjuan Lyu, Vivek Sharma", "tldr": "", "abstract": "Current image anonymization techniques, which largely focus on localized pseudonymization, typically modify identifiable features like faces or full bodies and evaluate anonymity through metrics such as detection and re-identification rates. However, this approach often overlooks information present in the entire image post-anonymization that can compromise privacy, such as specific locations, objects/items, or unique attributes. Acknowledging the pivotal role of human judgment in anonymity, our study conducts a thorough analysis of perceptual anonymization, exploring its spectral nature and its critical implications for image privacy assessment, particularly in light of regulations such as the General Data Protection Regulation (GDPR). To facilitate this, we curated a dataset specifically tailored for assessing anonymized images. We introduce a learning-based metric, PerceptAnon, which is tuned to align with the human Perception of Anonymity. PerceptAnon evaluates both original-anonymized image pairs and solely anonymized images. Trained using human annotations, our metric encompasses both anonymized subjects and their contextual backgrounds, thus providing a comprehensive evaluation of privacy vulnerabilities. We envision this work as a milestone for understanding and assessing image anonymization, and establishing a foundation for future research. 
The code and dataset are available at https://github.com/SonyResearch/gdpr_perceptanon.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kartik Patwari;Chen-Nee Chuah;Lingjuan Lyu;Vivek Sharma", "authorids": "~Kartik_Patwari1;~Chen-Nee_Chuah1;~Lingjuan_Lyu1;~Vivek_Sharma1", "gender": "M;F;F;M", "homepage": ";https://www.ece.ucdavis.edu/~chuah/;https://sites.google.com/view/lingjuan-lyu;https://vivoutlaw.github.io/", "dblp": ";;178/9876;", "google_scholar": "RZp_kd0AAAAJ;bZNRLNAAAAAJ;;fNbVXwQAAAAJ", "orcid": ";0000-0002-2772-387X;;", "linkedin": ";chen-nee-chuah-2451511/;;vivoutlaw/", "or_profile": "~Kartik_Patwari1;~Chen-Nee_Chuah1;~Lingjuan_Lyu1;~Vivek_Sharma1", "aff": "University of California, Davis;University of California, Davis;Sony;Sony Research", "aff_domain": "ucdavis.edu;ucdavis.edu;sony.com;sony.com", "position": "PhD student;Professor;scientist;Senior Research Scientist", "bibtex": "@inproceedings{\npatwari2024perceptanon,\ntitle={PerceptAnon: Exploring the Human Perception of Image Anonymization Beyond Pseudonymization for {GDPR}},\nauthor={Kartik Patwari and Chen-Nee Chuah and Lingjuan Lyu and Vivek Sharma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4BWCecFEcQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7054803, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12630186499674824660&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "email": "ucdavis.edu;ucdavis.edu;sony.com;sony.com", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of California, Davis;Sony Corporation;Sony", "aff_unique_dep": ";;Research", "aff_unique_url": "https://www.ucdavis.edu;https://www.sony.com;https://www.sony.com", "aff_unique_abbr": "UC Davis;Sony;Sony", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis;", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Japan" }, { "title": "Conformal Prediction Sets Improve Human Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35030", "id": "4CO45y7Mlv", "proceeding": "https://proceedings.mlr.press/v235/cresswell24a.html", "pdf": "https://openreview.net/pdf?id=4CO45y7Mlv", "openreview": "https://openreview.net/forum?id=4CO45y7Mlv", "author_site": "Jesse Cresswell, yi sui, Bhargava Kumar, No\u00ebl Vouitsis", "tldr": "", "abstract": "In response to everyday queries, humans explicitly signal uncertainty and offer alternative answers when they are unsure. Machine learning models that output calibrated prediction sets through conformal prediction mimic this human behaviour; larger sets signal greater uncertainty while providing alternatives. In this work, we study the usefulness of conformal prediction sets as an aid for human decision making by conducting a pre-registered randomized controlled trial with conformal prediction sets provided to human subjects. With statistical significance, we find that when humans are given conformal prediction sets their accuracy on tasks improves compared to fixed-size prediction sets with the same coverage guarantee. The results show that quantifying model uncertainty with conformal prediction is helpful for human-in-the-loop decision making and human-AI teams.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jesse C. 
Cresswell;Yi Sui;Bhargava Kumar;No\u00ebl Vouitsis", "authorids": "~Jesse_C._Cresswell1;~Yi_Sui1;bhargava1409@gmail.com;~No\u00ebl_Vouitsis1", "gender": ";F;;", "homepage": ";https://www.linkedin.com/in/yi-sui-90513699/;;", "dblp": ";;;", "google_scholar": ";fLo2o54AAAAJ;;", "orcid": ";0009-0009-9207-7403;;", "linkedin": ";;;", "or_profile": "~Jesse_C._Cresswell1;~Yi_Sui1;bhargava1409@gmail.com;~No\u00ebl_Vouitsis1", "aff": ";Layer6 AI;;", "aff_domain": ";layer6.ai;;", "position": ";Machine Learning Scientist;;", "bibtex": "@inproceedings{\ncresswell2024conformal,\ntitle={Conformal Prediction Sets Improve Human Decision Making},\nauthor={Jesse C. Cresswell and Yi Sui and Bhargava Kumar and No{\\\"e}l Vouitsis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4CO45y7Mlv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2641023, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7429376989627944162&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";layer6.ai;;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "Layer6 AI", "aff_unique_dep": "", "aff_unique_url": "https://layer6.ai", "aff_unique_abbr": "Layer6", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "title": "From Geometry to Causality- Ricci Curvature and the Reliability of Causal Inference on Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35029", "id": "4DAl3IsvlU", "proceeding": "https://proceedings.mlr.press/v235/farzam24a.html", "pdf": "https://openreview.net/pdf?id=4DAl3IsvlU", "openreview": "https://openreview.net/forum?id=4DAl3IsvlU", "author_site": "Amirhossein Farzam, Allen Tannenbaum, Guillermo Sapiro", "tldr": "", "abstract": "Causal inference on networks faces challenges posed in part by violations of standard identification assumptions due to dependencies between treatment units. Although graph geometry fundamentally influences such dependencies, the potential of geometric tools for causal inference on networked treatment units is yet to be unlocked. Moreover, despite significant progress utilizing graph neural networks (GNNs) for causal inference on networks, methods for evaluating their achievable reliability without ground truth are lacking. In this work we establish for the first time a theoretical link between network geometry, the graph Ricci curvature in particular, and causal inference, formalizing the intrinsic challenges that negative curvature poses to estimating causal parameters. The Ricci curvature can then be used to assess the reliability of causal estimates in structured data, as we empirically demonstrate. Informed by this finding, we propose a method using the geometric Ricci flow to reduce causal effect estimation error in networked data, showcasing how this newfound connection between graph geometry and causal inference could improve GNN-based causal inference. 
Bridging graph geometry and causal inference, this paper opens the door to geometric techniques for improving causal estimation on networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amirhossein Farzam;Allen Tannenbaum;Guillermo Sapiro", "authorids": "~Amirhossein_Farzam1;~Allen_Tannenbaum2;~Guillermo_Sapiro1", "gender": ";M;", "homepage": ";https://www.cs.stonybrook.edu/people/faculty/AllenTannenbaum;", "dblp": ";;82/5175", "google_scholar": ";https://scholar.google.com.tw/citations?user=w0Vl_lsAAAAJ;https://scholar.google.co.il/citations?user=ISRNX3gAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Amirhossein_Farzam1;~Allen_Tannenbaum2;~Guillermo_Sapiro1", "aff": ";State University of New York at Stony Brook;Duke University", "aff_domain": ";stonybrook.edu;duke.edu", "position": ";Full Professor;Full Professor", "bibtex": "@inproceedings{\nfarzam2024from,\ntitle={From Geometry to Causality- Ricci Curvature and the Reliability of Causal Inference on Networks},\nauthor={Amirhossein Farzam and Allen Tannenbaum and Guillermo Sapiro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4DAl3IsvlU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4244431, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11304834916446310857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";stonybrook.edu;duke.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "State University of New York at Stony Brook;Duke University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stonybrook.edu;https://www.duke.edu", "aff_unique_abbr": "SUNY Stony Brook;Duke", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Aligning Transformers with Weisfeiler-Leman", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35028", "id": "4FJJfYjUQR", "proceeding": "https://proceedings.mlr.press/v235/muller24c.html", "pdf": "https://openreview.net/pdf?id=4FJJfYjUQR", "openreview": "https://openreview.net/forum?id=4FJJfYjUQR", "author_site": "Luis M\u00fcller, Christopher Morris", "tldr": "", "abstract": "Graph neural network architectures aligned with the $k$-dimensional Weisfeiler--Leman ($k$-WL) hierarchy offer theoretically well-understood expressive power. However, these architectures often fail to deliver state-of-the-art predictive performance on real-world graphs, limiting their practical utility. While recent works aligning graph transformer architectures with the $k$-WL hierarchy have shown promising empirical results, employing transformers for higher orders of $k$ remains challenging due to a prohibitive runtime and memory complexity of self-attention as well as impractical architectural assumptions, such as an infeasible number of attention heads. Here, we advance the alignment of transformers with the $k$-WL hierarchy, showing stronger expressivity results for each $k$, making them more feasible in practice. In addition, we develop a theoretical framework that allows the study of established positional encodings such as Laplacian PEs and SPE. 
We evaluate our transformers on the large-scale PCQM4Mv2 dataset, showing competitive predictive performance with the state-of-the-art and demonstrating strong downstream performance when fine-tuning them on small-scale molecular datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luis M\u00fcller;Christopher Morris", "authorids": "~Luis_M\u00fcller1;~Christopher_Morris1", "gender": ";M", "homepage": "https://luis-mueller.github.io/;http://christophermorris.info", "dblp": "264/5619;156/7303", "google_scholar": "iPxfRnEAAAAJ;", "orcid": ";", "linkedin": "luis-m%C3%BCller-58a5ba236/;", "or_profile": "~Luis_M\u00fcller1;~Christopher_Morris1", "aff": "RWTH Aachen University, Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Rheinisch Westf\u00e4lische Technische Hochschule Aachen", "aff_domain": "cs.rwth-aachen.de;rwth-aachen.de", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nm{\\"u}ller2024aligning,\ntitle={Aligning Transformers with Weisfeiler-Leman},\nauthor={Luis M{\\"u}ller and Christopher Morris},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4FJJfYjUQR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 612971, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12498276195507925991&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.rwth-aachen.de;rwth-aachen.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "RWTH Aachen University", "aff_unique_dep": "", "aff_unique_url": "https://www.rwth-aachen.de", "aff_unique_abbr": "RWTH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Aachen", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "VinT-6D: A Large-Scale Object-in-hand Dataset from Vision, Touch and Proprioception", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35027", "id": "4G5Dcjcm1s", "proceeding": "https://proceedings.mlr.press/v235/wan24d.html", "pdf": "https://openreview.net/pdf?id=4G5Dcjcm1s", "openreview": "https://openreview.net/forum?id=4G5Dcjcm1s", "author_site": "Zhaoliang Wan, Yonggen Ling, Senlin Yi, Lu Qi, Wang Lee, Minglei Lu, Sicheng Yang, Xiao Teng, Peng Lu, Xu Yang, Ming-Hsuan Yang, Hui Cheng", "tldr": "", "abstract": "This paper addresses the scarcity of large-scale datasets for accurate object-in-hand pose estimation, which is crucial for robotic in-hand manipulation within the \"Perception-Planning-Control\" paradigm. Specifically, we introduce VinT-6D, the first extensive multi-modal dataset integrating vision, touch, and proprioception, to enhance robotic manipulation. VinT-6D comprises 2 million VinT-Sim and 0.1 million VinT-Real entries, collected via simulations in Mujoco and Blender and a custom-designed real-world platform. This dataset is tailored for robotic hands, offering models with whole-hand tactile perception and high-quality, well-aligned data. To the best of our knowledge, VinT-Real is the largest such dataset, given the difficulty of collection in real-world environments, and it can therefore bridge the simulation-to-real gap better than previous works. Built upon VinT-6D, we present a benchmark method that shows significant improvements in performance by fusing multi-modal information. 
The project is available at https://VinT-6D.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhaoliang Wan;Yonggen Ling;Senlin Yi;Lu Qi;Wang Wei Lee;Minglei Lu;Sicheng Yang;Xiao Teng;Peng Lu;Xu Yang;Ming-Hsuan Yang;Hui Cheng", "authorids": "~Zhaoliang_Wan1;~Yonggen_Ling1;~Senlin_Yi1;~Lu_Qi1;~Wang_Wei_Lee1;~Minglei_Lu1;~Sicheng_Yang5;~Xiao_Teng4;~Peng_Lu9;~Xu_Yang1;~Ming-Hsuan_Yang1;~Hui_Cheng5", "gender": "M;M;M;M;;;M;M;M;M;M;", "homepage": "https://jeffreyzhaoliang.github.io/zhaoliang-wan.github.io/;;https://github.com/rpo130;https://www.luqi.info;;;https://scholar.google.com/citations?user=eZeK7jAAAAAJ&hl=zh-CN;https://www.linkedin.com/in/xiao-teng.30837a46;;http://people.ucas.ac.cn/~XuYang;https://faculty.ucmerced.edu/mhyang/;", "dblp": "237/8933.html;139/7117;;;;;;;;63/1534-4.html;79/3711.html;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;https://scholar.google.com.hk/citations?user=SSI90d4AAAAJ;;jq2fvmIAAAAJ;;;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?hl=zh-CN;p9-ohHsAAAAJ;", "orcid": ";;;;;;;;;0000-0003-0553-4581;0000-0003-4848-2304;", "linkedin": ";;;;lee-wang-wei/;;;;;;minghsuanyang/;", "or_profile": "~Zhaoliang_Wan1;~Yonggen_Ling1;~Senlin_Yi1;~Lu_Qi1;~Wang_Wei_Lee1;~Minglei_Lu1;~Sicheng_Yang5;~Xiao_Teng4;~Peng_Lu9;~Xu_Yang1;~Ming-Hsuan_Yang1;~Hui_Cheng5", "aff": "SUN YAT-SEN UNIVERSITY;Tencent Robotics X;SUN YAT-SEN UNIVERSITY;University of California, Merced;Tencent Robotics X;Tencent;;;;Institute of Automation of Chinese academy of science;University of California at Merced;", "aff_domain": "sysu.edu;tencent.com;sysu.edu.cn;ucmerced.edu;tencent.com;tencent.com;;;;ia.ac.cn;umcerced.edu;", "position": "PhD student;Researcher;MS student;Postdoc;Researcher;Researcher;;;;Associate Professor;Professor;", "bibtex": "@inproceedings{\nwan2024vintd,\ntitle={VinT-6D: A Large-Scale Object-in-hand Dataset from Vision, Touch and Proprioception},\nauthor={Zhaoliang Wan and Yonggen Ling and Senlin Yi and Lu Qi and Wang Wei Lee and Minglei Lu and Sicheng Yang and Xiao Teng and Peng Lu and Xu Yang and Ming-Hsuan Yang and Hui Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4G5Dcjcm1s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5266904, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4899615073261222577&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "sysu.edu;tencent.com;sysu.edu.cn;ucmerced.edu;tencent.com;tencent.com;;;;ia.ac.cn;umcerced.edu;", "author_num": 12, "aff_unique_index": "0;1;0;2;1;1;3;2", "aff_unique_norm": "Sun Yat-sen University;Tencent;University of California, Merced;Chinese Academy of Sciences", "aff_unique_dep": ";Tencent Robotics X;;Institute of Automation", "aff_unique_url": "http://www.sysu.edu.cn;https://www.tencent.com;https://www.ucmerced.edu;http://www.ia.cas.cn", "aff_unique_abbr": "SYSU;Tencent Robotics X;UC Merced;CAS", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0;1;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Size-invariance Matters: Rethinking Metrics and Losses for Imbalanced Multi-object Salient Object Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35026", "id": "4HCi7JGCZk", "proceeding": 
"https://proceedings.mlr.press/v235/li24bx.html", "pdf": "https://openreview.net/pdf?id=4HCi7JGCZk", "openreview": "https://openreview.net/forum?id=4HCi7JGCZk", "author_site": "Feiran Li, Qianqian Xu, Shilong Bao, Zhiyong Yang, Runmin Cong, Xiaochun Cao, Qingming Huang", "tldr": "", "abstract": "This paper explores the size-invariance of evaluation metrics in Salient Object Detection (SOD), especially when multiple targets of diverse sizes co-exist in the same image. We observe that current metrics are size-sensitive, where larger objects are focused, and smaller ones tend to be ignored. We argue that the evaluation should be size-invariant because bias based on size is unjustified without additional semantic information. In pursuit of this, we propose a generic approach that evaluates each salient object separately and then combines the results, effectively alleviating the imbalance. We further develop an optimization framework tailored to this goal, achieving considerable improvements in detecting objects of different sizes. Theoretically, we provide evidence supporting the validity of our new metrics and present the generalization analysis of SOD. Extensive experiments demonstrate the effectiveness of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feiran Li;Qianqian Xu;Shilong Bao;Zhiyong Yang;Runmin Cong;Xiaochun Cao;Qingming Huang", "authorids": "~Feiran_Li3;~Qianqian_Xu2;~Shilong_Bao1;~Zhiyong_Yang1;~Runmin_Cong1;~Xiaochun_Cao3;~Qingming_Huang1", "gender": "M;F;M;M;M;M;", "homepage": "https://ferry-li.github.io;http://vipl.ict.ac.cn/people/~qianqianxu;https://statusrank.github.io/;https://joshuaas.github.io/;https://rmcong.github.io/;https://scst.sysu.edu.cn/members/caoxiaochun.htm;https://qmhuang-ucas.github.io/", "dblp": ";07/7627;143/0246;01/452-1.html;180/7852;39/3695;68/4388", "google_scholar": "vPStt7cAAAAJ;https://scholar.google.com.hk/citations?user=MjifS2MAAAAJ;https://scholar.google.com.hk/citations?user=5ZCgkQkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.co.uk/citations?hl=en;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=J1vMnRgAAAAJ", "orcid": "0000-0001-6443-4842;;;0000-0002-4409-4999;0000-0003-0972-4008;0000-0001-7141-708X;", "linkedin": ";;;;;;", "or_profile": "~Feiran_Li3;~Qianqian_Xu2;~Shilong_Bao1;~Zhiyong_Yang1;~Runmin_Cong1;~Xiaochun_Cao3;~Qingming_Huang2", "aff": "Institute of Information Engineering, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academic of Sciences;Shandong University;SUN YAT-SEN UNIVERSITY;University of Chinese Academy of Sciences", "aff_domain": "iie.ac.cn;ict.ac.cn;ucas.ac.cn;ucas.ac.cb;sdu.edu.cn;sysu.edu.cn;ucas.ac.cn", "position": "PhD student;Full Professor;PhD student;Associate Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024sizeinvariance,\ntitle={Size-invariance Matters: Rethinking Metrics and Losses for Imbalanced Multi-object Salient Object Detection},\nauthor={Feiran Li and Qianqian Xu and Shilong Bao and Zhiyong Yang and Runmin Cong and Xiaochun Cao and Qingming Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4HCi7JGCZk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3571677, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=5564886173950741778&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "iie.ac.cn;ict.ac.cn;ucas.ac.cn;ucas.ac.cb;sdu.edu.cn;sysu.edu.cn;ucas.ac.cn", "author_num": 7, "aff_unique_index": "0;0;1;1;2;3;1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Shandong University;Sun Yat-sen University", "aff_unique_dep": "Institute of Information Engineering;;;", "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn;http://www.sdu.edu.cn;http://www.sysu.edu.cn", "aff_unique_abbr": "CAS;UCAS;SDU;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "To the Max: Reinventing Reward in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35025", "id": "4KQ0VwqPg8", "proceeding": "https://proceedings.mlr.press/v235/veviurko24a.html", "pdf": "https://openreview.net/pdf?id=4KQ0VwqPg8", "openreview": "https://openreview.net/forum?id=4KQ0VwqPg8", "author_site": "Grigorii Veviurko, Wendelin Boehmer, Mathijs de Weerdt", "tldr": "", "abstract": "In reinforcement learning (RL), different reward functions can define the same optimal policy but result in drastically different learning performance. For some, the agent gets stuck with a suboptimal behavior, and for others, it solves the task efficiently. Choosing a good reward function is hence an extremely important yet challenging problem. In this paper, we explore an alternative approach for using rewards for learning. We introduce *max-reward RL*, where an agent optimizes the maximum rather than the cumulative reward. Unlike earlier works, our approach works for deterministic and stochastic environments and can be easily combined with state-of-the-art RL algorithms. In the experiments, we study the performance of max-reward RL algorithms in two goal-reaching environments from Gymnasium-Robotics and demonstrate its benefits over standard RL. 
The code is available at https://github.com/veviurko/To-the-Max.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Grigorii Veviurko;Wendelin Boehmer;Mathijs de Weerdt", "authorids": "~Grigorii_Veviurko1;~Wendelin_Boehmer1;~Mathijs_de_Weerdt1", "gender": "M;M;M", "homepage": ";https://reinforceAI.net;http://www.alg.ewi.tudelft.nl/weerdt/", "dblp": ";08/9988;91/3015", "google_scholar": "2jVnBAIAAAAJ;https://scholar.google.de/citations?user=wI5MV8IAAAAJ;https://scholar.google.com.tw/citations?user=9GJ8AvgAAAAJ", "orcid": ";0000-0002-4398-6792;0000-0002-0470-6241", "linkedin": ";wendelin-boehmer;mdeweerdt/", "or_profile": "~Grigorii_Veviurko1;~Wendelin_Boehmer1;~Mathijs_Weerdt1", "aff": "Delft University of Technology;Delft University of Technology;Delft University of Technology", "aff_domain": "tudelft.nl;tudelft.nl;tudelft.nl", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nveviurko2024to,\ntitle={To the Max: Reinventing Reward in Reinforcement Learning},\nauthor={Grigorii Veviurko and Wendelin Boehmer and Mathijs de Weerdt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4KQ0VwqPg8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 594348, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10335037194100178821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "email": "tudelft.nl;tudelft.nl;tudelft.nl", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "SpikeLM: Towards General Spike-Driven Language Modeling via Elastic Bi-Spiking Mechanisms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35024", "id": "4PB1RMsUy4", "proceeding": "https://proceedings.mlr.press/v235/xing24d.html", "pdf": "https://openreview.net/pdf?id=4PB1RMsUy4", "openreview": "https://openreview.net/forum?id=4PB1RMsUy4", "author_site": "Xingrun Xing, Zheng Zhang, Ziyi Ni, Shitao Xiao, Yiming Ju, Siqi Fan, Yequan Wang, Jiajun Zhang, Guoqi Li", "tldr": "", "abstract": "Towards energy-efficient artificial intelligence similar to the human brain, bio-inspired spiking neural networks (SNNs) have the advantages of biological plausibility, event-driven sparsity, and binary activation. Recently, large-scale language models exhibit promising generalization capability, making it a valuable problem to explore more general spike-driven models. However, the binary spikes in existing SNNs fail to encode adequate semantic information, posing technological challenges for generalization. This work proposes the first fully spiking mechanism for general language tasks, including both discriminative and generative ones. Different from previous spikes with 0,1 levels, we propose a more general spike formulation with bi-directional, elastic amplitude, and elastic frequency encoding, while still maintaining the addition nature of SNNs. In a single time step, the spike is enhanced by direction and amplitude information; in spike frequency, a strategy to control the spike firing rate is carefully designed. 
We plug this elastic bi-spiking mechanism into language modeling, naming the resulting model SpikeLM. This is the first time that general language tasks have been handled with fully spike-driven models, which achieve much higher accuracy than previously possible. SpikeLM also substantially narrows the performance gap between SNNs and ANNs in language modeling. Our code is available at https://github.com/Xingrun-Xing/SpikeLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingrun Xing;Zheng Zhang;Ziyi Ni;Shitao Xiao;Yiming Ju;Siqi Fan;Yequan Wang;Jiajun Zhang;Guoqi Li", "authorids": "~Xingrun_Xing1;~Zheng_Zhang12;~Ziyi_Ni1;~Shitao_Xiao1;~Yiming_Ju1;~Siqi_Fan4;~Yequan_Wang1;~Jiajun_Zhang1;~Guoqi_Li1", "gender": "M;M;F;M;M;;M;M;M", "homepage": "https://scholar.google.com.hk/citations?user=sEdxu1UAAAAJ&hl=zh-CN;;https://www.researchgate.net/profile/Ziyi-Ni-2;;https://github.com/juyiming2;;http://www.wangyequan.com;http://www.nlpr.ia.ac.cn/cip/jjzhang.htm;https://scholar.google.com/citations?hl=en&user=qCfE--MAAAAJ", "dblp": "245/7952;181/2621-20.html;;286/1495;;149/1267-1;188/9082;71/6950-1.html;", "google_scholar": "https://scholar.google.com.hk/citations?user=sEdxu1UAAAAJ;S2bil1cAAAAJ;;https://scholar.google.com.hk/citations?hl=zh-CN;;pybmbCYAAAAJ;7Gqp6FsAAAAJ;93zngeYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Xingrun_Xing1;~Zheng_Zhang12;~Ziyi_Ni1;~Shitao_Xiao1;~Yiming_Ju1;~Siqi_Fan4;~Yequan_Wang1;~Jiajun_Zhang1;~Guoqi_Li1", "aff": "Chinese Academy of Sciences;Beijing Academy of Artificial Intelligence;Institute of Automation, Chinese Academy of Sciences;Beijing Academy of Artificial Intelligence;BAAI;University of Electronic Science and Technology of China;Beijing Academy of Artificial Intelligence;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;baai.ac.cn;ia.ac.cn;baai.ac.cn;baai.ac.cn;uestc.edu.cn;baai.ac.cn;ia.ac.cn;ia.ac.cn", "position": "PhD student;Principal Researcher;PhD student;Researcher;Researcher;PhD student;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nxing2024spikelm,\ntitle={Spike{LM}: Towards General Spike-Driven Language Modeling via Elastic Bi-Spiking Mechanisms},\nauthor={Xingrun Xing and Zheng Zhang and Ziyi Ni and Shitao Xiao and Yiming Ju and Siqi Fan and Yequan Wang and Jiajun Zhang and Guoqi Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4PB1RMsUy4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2339557, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=61281320262539949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ia.ac.cn;baai.ac.cn;ia.ac.cn;baai.ac.cn;baai.ac.cn;uestc.edu.cn;baai.ac.cn;ia.ac.cn;ia.ac.cn", "author_num": 9, "aff_unique_index": "0;1;0;1;1;2;1;0;0", "aff_unique_norm": "Chinese Academy of Sciences;Beijing Academy of Artificial Intelligence;University of Electronic Science and Technology of China", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cas.cn;https://www.baaic.cn;https://www.uestc.edu.cn", "aff_unique_abbr": "CAS;BAAI;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Deep Fusion: Efficient Network Training via Pre-trained Initializations", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35023", "id": "4PuM6iGPPi", "proceeding": "https://proceedings.mlr.press/v235/mazzawi24a.html", "pdf": "https://openreview.net/pdf?id=4PuM6iGPPi", "openreview": "https://openreview.net/forum?id=4PuM6iGPPi", "author_site": "Hanna Mazzawi, Xavi Gonzalvo, Michael Wunder, Sammy Jerome, Benoit Dherin", "tldr": "", "abstract": "Training deep neural networks for large language models (LLMs) remains computationally very expensive. To mitigate this, network growing algorithms offer potential cost savings, but their underlying mechanisms are poorly understood. In this paper, we propose a theoretical framework using backward error analysis to illuminate the dynamics of mid-training network growth. Furthermore, we introduce Deep Fusion, an efficient network training approach that leverages pre-trained initializations of smaller networks, facilitating network growth from diverse sources. Our experiments validate the power of our theoretical framework in guiding the optimal use of Deep Fusion. With carefully optimized training dynamics, Deep Fusion demonstrates significant reductions in both training time and resource consumption. Importantly, these gains are achieved without sacrificing performance. We demonstrate reduced computational requirements, and improved generalization performance on a variety of NLP tasks and T5 model sizes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanna Mazzawi;Javier Gonzalvo;Michael Wunder;Sammy Jerome;Benoit Dherin", "authorids": "~Hanna_Mazzawi1;~Javier_Gonzalvo1;~Michael_Wunder2;~Sammy_Jerome1;~Benoit_Dherin1", "gender": "M;;;;", "homepage": ";http://www.xavigonzalvo.com;;;", "dblp": "84/5481;;98/8411;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;https://www.linkedin.com/samuel-jerome-6a190284;", "or_profile": "~Hanna_Mazzawi1;~Javier_Gonzalvo1;~Michael_Wunder2;~Sammy_Jerome1;~Benoit_Dherin1", "aff": ";Google;Google;Google;", "aff_domain": ";google.com;google.com;google.com;", "position": ";Researcher;Researcher;Researcher;", "bibtex": "@inproceedings{\nmazzawi2024deep,\ntitle={Deep Fusion: Efficient Network Training via Pre-trained Initializations},\nauthor={Hanna Mazzawi and Javier Gonzalvo and Michael Wunder and Sammy Jerome and Benoit Dherin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4PuM6iGPPi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1416843, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5462932396193355891&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";google.com;google.com;google.com;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Self-Attention through Kernel-Eigen Pair Sparse Variational Gaussian Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35022", "id": "4RqG4K5UwL", "proceeding": "https://proceedings.mlr.press/v235/chen24am.html", "pdf": "https://openreview.net/pdf?id=4RqG4K5UwL", "openreview": "https://openreview.net/forum?id=4RqG4K5UwL", "author_site": "Yingyi Chen, Qinghua 
Tao, Francesco Tonin, Johan Suykens", "tldr": "", "abstract": "While the great capability of Transformers significantly boosts prediction accuracy, it can also yield overconfident predictions that require calibrated uncertainty estimation, a problem commonly tackled with Gaussian processes (GPs). Existing works apply GPs with symmetric kernels under variational inference to the attention kernel; however, they omit the fact that attention kernels are in essence asymmetric. Moreover, the complexity of deriving the GP posteriors remains high for large-scale data. In this work, we propose Kernel-Eigen Pair Sparse Variational Gaussian Processes (KEP-SVGP) for building uncertainty-aware self-attention where the asymmetry of attention kernels is tackled by Kernel SVD (KSVD) and a reduced complexity is acquired. Through KEP-SVGP, i) the SVGP pair induced by the two sets of singular vectors from KSVD w.r.t. the attention kernel fully characterizes the asymmetry; ii) using only a small set of adjoint eigenfunctions from KSVD, the derivation of SVGP posteriors can be based on the inversion of a diagonal matrix containing singular values, contributing to a reduction in time complexity; iii) an evidence lower bound is derived so that variational parameters and network weights can be optimized with it. Experiments verify the excellent performance and efficiency of our method on in-distribution, distribution-shift, and out-of-distribution benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yingyi Chen;Qinghua Tao;Francesco Tonin;Johan Suykens", "authorids": "~Yingyi_Chen3;~Qinghua_Tao1;~Francesco_Tonin1;~Johan_Suykens1", "gender": "F;F;;M", "homepage": ";https://qinghua-tao.github.io/;https://taralloc.github.io/;https://www.kuleuven.be/wieiswie/nl/person/00015385", "dblp": "09/9441;182/9643.html;279/6777;61/3224", "google_scholar": "5b2jAVUAAAAJ;_dZHZD8AAAAJ;;https://scholar.google.be/citations?user=WtBmh0UAAAAJ", "orcid": "0000-0002-5571-9050;0000-0001-9705-7748;0000-0002-5644-0086;0000-0002-8846-6352", "linkedin": ";;;", "or_profile": "~Yingyi_Chen3;~Qinghua_Tao1;~Francesco_Tonin1;~Johan_Suykens1", "aff": "Department of Electrical Engineering, KU Leuven, Belgium;(ESAT) Department of Electrical Engineering, KU Leuven, Belgium, KU Leuven;EPFL - EPF Lausanne;KU Leuven", "aff_domain": "esat.kuleuven.be;esat.kuleuven.be;epfl.ch;kuleuven.be", "position": "PhD student;Postdoc;Postdoc;Full Professor", "bibtex": "@inproceedings{\nchen2024selfattention,\ntitle={Self-Attention through Kernel-Eigen Pair Sparse Variational Gaussian Processes},\nauthor={Yingyi Chen and Qinghua Tao and Francesco Tonin and Johan Suykens},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4RqG4K5UwL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 777714, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12312789018844180045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "esat.kuleuven.be;esat.kuleuven.be;epfl.ch;kuleuven.be", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "KU Leuven;EPFL;Katholieke Universiteit Leuven", "aff_unique_dep": "Department of Electrical Engineering;;", "aff_unique_url": "https://www.kuleuven.be;https://www.epfl.ch;https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven;EPFL;KU Leuven", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", 
"aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Belgium;Switzerland" }, { "title": "Coresets for Multiple $\\ell_p$ Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35021", "id": "4UWjqrMmFp", "proceeding": "https://proceedings.mlr.press/v235/woodruff24a.html", "pdf": "https://openreview.net/pdf?id=4UWjqrMmFp", "openreview": "https://openreview.net/forum?id=4UWjqrMmFp", "author_site": "David Woodruff, Taisuke Yasuda", "tldr": "", "abstract": "A *coreset* of a dataset with $n$ examples and $d$ features is a weighted subset of examples that is sufficient for solving downstream data analytic tasks. Nearly optimal constructions of coresets for least squares and $\\ell_p$ linear regression with a single response are known in prior work. However, for multiple $\\ell_p$ regression where there can be $m$ responses, there are no known constructions with size sublinear in $m$. In this work, we construct coresets of size $\\tilde O(\\varepsilon^{-2}d)$ for $p<2$ and $\\tilde O(\\varepsilon^{-p}d^{p/2})$ for $p>2$ independently of $m$ (i.e., dimension-free) that approximate the multiple $\\ell_p$ regression objective at every point in the domain up to $(1\\pm\\varepsilon)$ relative error. If we only need to preserve the minimizer subject to a subspace constraint, we improve these bounds by an $\\varepsilon$ factor for all $p>1$. All of our bounds are nearly tight. We give two application of our results. First, we settle the number of uniform samples needed to approximate $\\ell_p$ Euclidean power means up to a $(1+\\varepsilon)$ factor, showing that $\\tilde\\Theta(\\varepsilon^{-2})$ samples for $p = 1$, $\\tilde\\Theta(\\varepsilon^{-1})$ samples for $1 < p < 2$, and $\\tilde\\Theta(\\varepsilon^{1-p})$ samples for $p>2$ is tight, answering a question of Cohen-Addad, Saulpic, and Schwiegelshohn. 
Second, we show that for $1.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinlu Zhang;Yiyi Zhou;Qiancheng Zheng;Xiaoxiong Du;Gen Luo;Jun Peng;Xiaoshuai Sun;Rongrong Ji", "authorids": "~Jinlu_Zhang2;~Yiyi_Zhou1;~Qiancheng_Zheng1;~Xiaoxiong_Du1;~Gen_Luo1;~Jun_Peng2;~Xiaoshuai_Sun3;~Rongrong_Ji5", "gender": "F;M;M;M;;;M;M", "homepage": ";;;https://mac.xmu.edu.cn/index.htm;;;https://sites.google.com/view/xssun;http://mac.xmu.edu.cn/rrji-en.html", "dblp": "130/5411-2.html;174/0086;372/1656;331/1412;;;26/5787.html;86/5681", "google_scholar": ";w3_2ep0AAAAJ;;;;;KPMK3B4AAAAJ;", "orcid": "0000-0002-8731-7099;;0009-0007-9389-3555;0009-0005-7743-7301;;;0000-0003-3912-9306;", "linkedin": ";;;;;;;", "or_profile": "~Jinlu_Zhang2;~Yiyi_Zhou1;~Qiancheng_Zheng1;~Xiaoxiong_Du1;~Gen_Luo1;~Jun_Peng2;~Xiaoshuai_Sun3;~Rongrong_Ji5", "aff": "Xiamen University;Xiamen University;Xiamen University;Xiamen University;;;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;;;xmu.edu.cn;xmu.edu.cn", "position": "PhD student;Associate Professor;MS student;MS student;;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024fast,\ntitle={Fast Text-to-3D-Aware Face Generation and Manipulation via Direct Cross-modal Mapping and Geometric Regularization},\nauthor={Jinlu Zhang and Yiyi Zhou and Qiancheng Zheng and Xiaoxiong Du and Gen Luo and Jun Peng and Xiaoshuai Sun and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4boDu42RtE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9317666, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3313947624026527494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;;;xmu.edu.cn;xmu.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Xiamen University", "aff_unique_dep": "", "aff_unique_url": "https://www.xmu.edu.cn", "aff_unique_abbr": "XMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Preventing Model Collapse in Gaussian Process Latent Variable Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35013", "id": "4byOXWrJay", "proceeding": "https://proceedings.mlr.press/v235/li24as.html", "pdf": "https://openreview.net/pdf?id=4byOXWrJay", "openreview": "https://openreview.net/forum?id=4byOXWrJay", "author_site": "Ying Li, Zhidi Lin, Feng Yin, Michael Minyi Zhang", "tldr": "", "abstract": "Gaussian process latent variable models (GPLVMs) are a versatile family of unsupervised learning models commonly used for dimensionality reduction. However, common challenges in modeling data with GPLVMs include inadequate kernel flexibility and improper selection of the projection noise, leading to a type of model collapse characterized by vague latent representations that do not reflect the underlying data structure. This paper addresses these issues by, first, theoretically examining the impact of projection variance on model collapse through the lens of a linear GPLVM. 
Second, we tackle model collapse due to inadequate kernel flexibility by integrating the spectral mixture (SM) kernel and a differentiable random Fourier feature (RFF) kernel approximation, which ensures computational scalability and efficiency through off-the-shelf automatic differentiation tools for learning the kernel hyperparameters, projection variance, and latent representations within the variational inference framework. The proposed GPLVM, named *advised*RFLVM, is evaluated across diverse datasets and consistently outperforms various salient competing models, including state-of-the-art variational autoencoders (VAEs) and other GPLVM variants, in terms of informative latent representations and missing data imputation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ying Li;Zhidi Lin;Feng Yin;Michael Minyi Zhang", "authorids": "~Ying_Li21;~Zhidi_Lin1;~Feng_Yin1;~Michael_Minyi_Zhang1", "gender": "F;Not Specified;M;", "homepage": "https://scholar.google.com/citations?view_op=list_works&hl=en&user=mQQxodAAAAAJ&gmla=AJsN-F6CGw8lAhRFck_3cZ4GNr08C_nKG0Qp0mxfe37NavqPJMwCtth1RAM_vRz-yQIvz-FQesAlnhrYtmxiwRSvZyV346VtWka5O74C4isxgacQL0pCvhe4obe1gka5vo5ZIWhGQX_a;https://zhidi-lin.github.io/;https://sse.cuhk.edu.cn/en/faculty/yinfeng;https://michaelzhang01.github.io/", "dblp": ";236/7105;59/6917;223/7432", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;8BmRXqMAAAAJ;4mW1N5oAAAAJ;JFLkLhoAAAAJ", "orcid": "0009-0007-8100-6568;;;", "linkedin": ";zhidi-lin/;;", "or_profile": "~Ying_Li21;~Zhidi_Lin1;~Feng_Yin1;~Michael_Minyi_Zhang1", "aff": "University of Hong Kong;The Chinese University of Hong Kong, Shenzhen;;The University of Hong Kong", "aff_domain": "hku.hk;cuhk.edu.cn;;hku.hk", "position": "PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nli2024preventing,\ntitle={Preventing Model Collapse in Gaussian Process Latent Variable Models},\nauthor={Ying Li and Zhidi Lin and Feng Yin and Michael Minyi Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4byOXWrJay}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3523396, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=869871562252265272&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "hku.hk;cuhk.edu.cn;;hku.hk", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Hong Kong;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.cuhk.edu.cn", "aff_unique_abbr": "HKU;CUHK", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "SAPG: Split and Aggregate Policy Gradients", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35012", "id": "4dOJAfXhNV", "proceeding": "https://proceedings.mlr.press/v235/singla24a.html", "pdf": "https://openreview.net/pdf?id=4dOJAfXhNV", "openreview": "https://openreview.net/forum?id=4dOJAfXhNV", "author_site": "Jayesh Singla, Ananye Agarwal, Deepak Pathak", "tldr": "", "abstract": "Despite extreme sample inefficiency, on-policy reinforcement learning, aka policy gradients, has become a fundamental tool in decision-making problems. 
With the recent advances in GPU-driven simulation, the ability to collect large amounts of data for RL training has scaled exponentially. However, we show that current RL methods, e.g. PPO, fail to ingest the benefit of parallelized environments beyond a certain point and their performance saturates. To address this, we propose a new on-policy RL algorithm that can effectively leverage large-scale environments by splitting them into chunks and fusing them back together via importance sampling. Our algorithm, termed SAPG, shows significantly higher performance across a variety of challenging environments where vanilla PPO and other strong baselines fail to achieve high performance. Webpage at https://sapg-rl.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jayesh Singla;Ananye Agarwal;Deepak Pathak", "authorids": "~Jayesh_Singla1;~Ananye_Agarwal1;~Deepak_Pathak1", "gender": "M;M;M", "homepage": ";https://anag.me/;https://www.cs.cmu.edu/~dpathak/", "dblp": ";294/4812;155/9860", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;", "linkedin": "jayeshsingla/;;pathak22/", "or_profile": "~Jayesh_Singla1;~Ananye_Agarwal1;~Deepak_Pathak1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsingla2024sapg,\ntitle={{SAPG}: Split and Aggregate Policy Gradients},\nauthor={Jayesh Singla and Ananye Agarwal and Deepak Pathak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4dOJAfXhNV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7575963, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1731581948429011106&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cmu.edu;cmu.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Unveiling Privacy, Memorization, and Input Curvature Links", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35011", "id": "4dxR7awO5n", "proceeding": "https://proceedings.mlr.press/v235/ravikumar24a.html", "pdf": "https://openreview.net/pdf?id=4dxR7awO5n", "openreview": "https://openreview.net/forum?id=4dxR7awO5n", "author_site": "Deepak Ravikumar, Efstathia Soufleri, Abolfazl Hashemi, Kaushik Roy", "tldr": "", "abstract": "Deep Neural Nets (DNNs) have become a pervasive tool for solving many emerging problems. However, they tend to overfit to and memorize the training set. Memorization is of keen interest since it is closely related to several concepts such as generalization, noisy learning, and privacy. To study memorization, Feldman (2019) proposed a formal score; however, its computational requirements limit its practical use. Recent research has shown empirical evidence linking input loss curvature (measured by the trace of the loss Hessian w.r.t inputs) and memorization. It was shown to be $\\sim3$ orders of magnitude more efficient than calculating the memorization score. 
However, there is a lack of theoretical understanding linking memorization with input loss curvature. In this paper, we not only investigate this connection but also extend our analysis to establish theoretical links between differential privacy, memorization, and input loss curvature. First, we derive an upper bound on memorization characterized by both differential privacy and input loss curvature. Second, we present a novel insight showing that input loss curvature is upper-bounded by the differential privacy parameter. Our theoretical findings are further validated using deep models on CIFAR and ImageNet datasets, showing a strong correlation between our theoretical predictions and results observed in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deepak Ravikumar;Efstathia Soufleri;Abolfazl Hashemi;Kaushik Roy", "authorids": "~Deepak_Ravikumar1;~Efstathia_Soufleri1;~Abolfazl_Hashemi1;~Kaushik_Roy1", "gender": ";F;M;M", "homepage": ";https://www.linkedin.com/in/efstathia-soufleri/;https://abolfazlh.github.io/;https://engineering.purdue.edu/NRL/Group", "dblp": ";241/5004;176/5595;r/KaushikRoy", "google_scholar": ";RXLWGNcAAAAJ;Se7mocgAAAAJ;to4P8KgAAAAJ", "orcid": ";0000-0001-8699-9940;0000-0002-8421-4270;", "linkedin": ";efstathia-soufleri/;abolfazlh;", "or_profile": "~Deepak_Ravikumar1;~Efstathia_Soufleri1;~Abolfazl_Hashemi1;~Kaushik_Roy1", "aff": ";Purdue University;Purdue University;Purdue University", "aff_domain": ";purdue.edu;purdue.edu;purdue.edu", "position": ";PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nravikumar2024unveiling,\ntitle={Unveiling Privacy, Memorization, and Input Curvature Links},\nauthor={Deepak Ravikumar and Efstathia Soufleri and Abolfazl Hashemi and Kaushik Roy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4dxR7awO5n}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2341330, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11225863693777422265&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";purdue.edu;purdue.edu;purdue.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Generalized Smooth Variational Inequalities: Methods with Adaptive Stepsizes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35010", "id": "4iBJyJeBX5", "proceeding": "https://proceedings.mlr.press/v235/vankov24a.html", "pdf": "https://openreview.net/pdf?id=4iBJyJeBX5", "openreview": "https://openreview.net/forum?id=4iBJyJeBX5", "author_site": "Daniil Vankov, Angelia Nedich, Lalitha Sankar", "tldr": "", "abstract": "Variational Inequality (VI) problems have attracted great interest in the machine learning (ML) community due to their application in adversarial and multi-agent training. Despite their relevance in ML, the oft-used strong-monotonicity and Lipschitz continuity assumptions on VI problems are restrictive and do not hold in many machine learning problems. To address this, we relax smoothness and monotonicity assumptions and study structured non-monotone generalized smoothness. 
The key idea behind our results is the use of adaptive stepsizes. We prove the first-known convergence results for solving generalized smooth VIs for three popular methods, namely the projection, Korpelevich, and Popov methods. Our convergence rate results for generalized smooth VIs match or improve existing results on smooth VIs. We present numerical experiments that support our theoretical guarantees and highlight the efficiency of the proposed adaptive stepsizes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniil Vankov;Angelia Nedich;Lalitha Sankar", "authorids": "~Daniil_Vankov1;~Angelia_Nedich1;~Lalitha_Sankar2", "gender": ";F;F", "homepage": ";https://angelia.engineering.asu.edu/;https://sankar.engineering.asu.edu/", "dblp": ";;https://dblp.uni-trier.de/pers/s/Sankar:Lalitha.html", "google_scholar": "Za_nqnIAAAAJ;86PxxsoAAAAJ;VQq0aIwAAAAJ", "orcid": ";;", "linkedin": ";;lalitha-sankar-045b3a7/", "or_profile": "~Daniil_Vankov1;~Angelia_Nedich1;~Lalitha_Sankar2", "aff": "Arizona State University;Arizona State University;Arizona State University", "aff_domain": "asu.edu;asu.edu;asu.edu", "position": "PhD student;Full Professor;Professor", "bibtex": "@inproceedings{\nvankov2024generalized,\ntitle={Generalized Smooth Variational Inequalities: Methods with Adaptive Stepsizes},\nauthor={Daniil Vankov and Angelia Nedich and Lalitha Sankar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4iBJyJeBX5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 733966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13615479278009793892&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 5, "email": "asu.edu;asu.edu;asu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Equivariant Frames and the Impossibility of Continuous Canonicalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35009", "id": "4iy0q0carb", "proceeding": "https://proceedings.mlr.press/v235/dym24a.html", "pdf": "https://openreview.net/pdf?id=4iy0q0carb", "openreview": "https://openreview.net/forum?id=4iy0q0carb", "author_site": "Nadav Dym, Hannah Lawrence, Jonathan Siegel", "tldr": "", "abstract": "Canonicalization provides an architecture-agnostic method for enforcing equivariance, with generalizations such as frame-averaging recently gaining prominence as a lightweight and flexible alternative to equivariant architectures. Recent works have found an empirical benefit to using probabilistic frames instead, which learn weighted distributions over group elements. In this work, we provide strong theoretical justification for this phenomenon: for commonly-used groups, there is no efficiently computable choice of frame that preserves continuity of the function being averaged. In other words, unweighted frame-averaging can turn a smooth, non-symmetric function into a discontinuous, symmetric function. 
To address this fundamental robustness problem, we formally define and construct *weighted* frames, which provably preserve continuity, and demonstrate their utility by constructing efficient and continuous weighted frames for the actions of $SO(d)$, $O(d)$, and $S_n$ on point clouds.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nadav Dym;Hannah Lawrence;Jonathan W. Siegel", "authorids": "~Nadav_Dym1;~Hannah_Lawrence1;~Jonathan_W._Siegel1", "gender": "F;M;M", "homepage": "https://hannahlawrence.github.io/;https://jwsiegel2510.github.io;https://nadavdym.github.io./", "dblp": "251/5474;239/6028;167/1176", "google_scholar": ";oI42qIIAAAAJ;https://scholar.google.co.il/citations?user=qOyXmMYAAAAJ", "orcid": ";;", "linkedin": "hannah-lawrence-417b5a130/;;", "or_profile": "~Hannah_Lawrence1;~Jonathan_W._Siegel1;~Nadav_E_Dym1", "aff": "Massachusetts Institute of Technology;Texas A&M University - College Station;Technion - Israel Institute of Technology, Technion", "aff_domain": "mit.edu;tamu.edu;technion.ac.il", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ndym2024equivariant,\ntitle={Equivariant Frames and the Impossibility of Continuous Canonicalization},\nauthor={Nadav Dym and Hannah Lawrence and Jonathan W. Siegel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4iy0q0carb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1450239, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8567147802086046825&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mit.edu;tamu.edu;technion.ac.il", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Texas A&M University;Technion - Israel Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.tamu.edu;https://www.technion.ac.il", "aff_unique_abbr": "MIT;TAMU;Technion", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Israel" }, { "title": "Automated Evaluation of Retrieval-Augmented Language Models with Task-Specific Exam Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35008", "id": "4jqOV6NlUz", "proceeding": "https://proceedings.mlr.press/v235/guinet24a.html", "pdf": "https://openreview.net/pdf?id=4jqOV6NlUz", "openreview": "https://openreview.net/forum?id=4jqOV6NlUz", "author_site": "Gauthier Guinet, Behrooz Tehrani, Anoop Deoras, Laurent Callot", "tldr": "", "abstract": "We propose a new method to measure the task-specific accuracy of Retrieval-Augmented Large Language Models (RAG). Evaluation is performed by scoring the RAG on an automatically-generated synthetic exam composed of multiple choice questions based on the corpus of documents associated with the task. Our method is an automated, cost-efficient, interpretable, and robust strategy to select the optimal components for a RAG system. We leverage Item Response Theory (IRT) to estimate the quality of an exam and its informativeness on task-specific accuracy. IRT also provides a natural way to iteratively improve the exam by eliminating the exam questions that are not sufficiently informative about a model's ability. 
We demonstrate our approach on four new open-ended Question-Answering tasks based on Arxiv abstracts, StackExchange questions, AWS DevOps troubleshooting guides, and SEC filings. In addition, our experiments reveal more general insights into factors impacting RAG performance like size, retrieval mechanism, prompting and fine-tuning. Most notably, our findings show that choosing the right retrieval algorithms often leads to bigger performance gains than simply using a larger language model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gauthier Guinet;Behrooz Omidvar-Tehrani;Anoop Deoras;Laurent Callot", "authorids": "~Gauthier_Guinet1;omidvart@amazon.com;~Anoop_Deoras1;~Laurent_Callot1", "gender": "M;;M;", "homepage": "https://gguinet.github.io;;;https://lcallot.github.io/", "dblp": ";;55/8761;", "google_scholar": "https://scholar.google.fr/citations?hl=fr;;QF_rhCIAAAAJ;bkrcSq0AAAAJ", "orcid": ";;;", "linkedin": "gauthier-guinet;;anoopdeoras/;", "or_profile": "~Gauthier_Guinet1;omidvart@amazon.com;~Anoop_Deoras1;~Laurent_Callot1", "aff": "Amazon;;Amazon;Amazon", "aff_domain": "amazon.com;;amazon.com;amazon.com", "position": "Researcher;;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nguinet2024automated,\ntitle={Automated Evaluation of Retrieval-Augmented Language Models with Task-Specific Exam Generation},\nauthor={Gauthier Guinet and Behrooz Omidvar-Tehrani and Anoop Deoras and Laurent Callot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4jqOV6NlUz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7062987, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12997032662710507918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "amazon.com;;amazon.com;amazon.com", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon.com, Inc.", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "High-dimensional Linear Bandits with Knapsacks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35007", "id": "4lghifYrSU", "proceeding": "https://proceedings.mlr.press/v235/ma24p.html", "pdf": "https://openreview.net/pdf?id=4lghifYrSU", "openreview": "https://openreview.net/forum?id=4lghifYrSU", "author_site": "Wanteng Ma, Dong Xia, Jiashuo Jiang", "tldr": "", "abstract": "We study the contextual bandits with knapsack (CBwK) problem under the high-dimensional setting where the dimension of the feature is large. We investigate how to exploit the sparsity structure to achieve improved regret for the CBwK problem. To this end, we first develop an online variant of the hard thresholding algorithm that performs the optimal sparse estimation. We further combine our online estimator with a primal-dual framework, where we assign a dual variable to each knapsack constraint and utilize an online learning algorithm to update the dual variable, thereby controlling the consumption of the knapsack capacity. We show that this integrated approach allows us to achieve a sublinear regret that depends logarithmically on the feature dimension, thus improving the polynomial dependency established in the previous literature. 
We also apply our framework to the high-dimensional contextual bandit problem without the knapsack constraint and achieve optimal regret in both the data-poor regime and the data-rich regime.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanteng Ma;Dong Xia;Jiashuo Jiang", "authorids": "wmaah@connect.ust.hk;~Dong_Xia1;~Jiashuo_Jiang1", "gender": ";M;", "homepage": ";https://www.math.hkust.edu.hk/~madxia/index.html;https://jiashuo3.github.io/", "dblp": ";;281/6676", "google_scholar": ";https://scholar.google.com.hk/citations?user=btelFt8AAAAJ;", "orcid": ";;0000-0001-5230-4231", "linkedin": ";;", "or_profile": "wmaah@connect.ust.hk;~Dong_Xia1;~Jiashuo_Jiang1", "aff": ";Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": ";ust.hk;ust.hk", "position": ";Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nma2024highdimensional,\ntitle={High-dimensional Linear Bandits with Knapsacks},\nauthor={Wanteng Ma and Dong Xia and Jiashuo Jiang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4lghifYrSU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 695838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6575690718294977532&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 8, "email": ";ust.hk;ust.hk", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "GroupCover: A Secure, Efficient and Scalable Inference Framework for On-device Model Protection based on TEEs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35006", "id": "4mU6LNMaIu", "proceeding": "https://proceedings.mlr.press/v235/zhang24bn.html", "pdf": "https://openreview.net/pdf?id=4mU6LNMaIu", "openreview": "https://openreview.net/forum?id=4mU6LNMaIu", "author_site": "Zheng Zhang, Na Wang, Ziqi Zhang, Yao Zhang, Tianyi Zhang, Jianwei Liu, Ye Wu", "tldr": "", "abstract": "Due to the high cost of training DNN models, how to protect the intellectual property of DNN models, especially when the models are deployed to users' devices, is becoming an important topic. One practical solution is to use Trusted Execution Environments (TEEs), and researchers have proposed various model obfuscation solutions to make full use of the high-security guarantee of TEEs and the high performance of collocated GPUs. In this paper, we first identify a common vulnerability, namely the fragility of randomness, that is shared by existing TEE-based model obfuscation solutions. This vulnerability benefits model-stealing attacks and allows the adversary to recover about 97% of the secret model. To improve the security of TEE-shielded DNN models, we further propose a new model obfuscation approach, GroupCover, which uses sufficient randomization and mutual covering obfuscation to protect model weights. Experimental results demonstrate that GroupCover can achieve a security level comparable to the upper bound (black-box protection), remarkably over 3x higher than that of existing solutions. 
Besides, GroupCover introduces 19% overhead and negligible accuracy loss compared to the unprotected scheme.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zheng Zhang;Na Wang;Ziqi Zhang;Yao Zhang;Tianyi Zhang;Jianwei Liu;Ye Wu", "authorids": "~Zheng_Zhang33;nawang@buaa.edu.cn;~Ziqi_Zhang6;zhangyao.crypto@bytedance.com;zhangtianyi1@buaa.edu.cn;~Jianwei_Liu2;~Ye_Wu8", "gender": ";;M;;;M;", "homepage": ";;;;;http://cst.buaa.edu.cn/info/1206/2649.htm;", "dblp": ";;;;;;", "google_scholar": ";;TR8bkyYAAAAJ;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Zheng_Zhang33;nawang@buaa.edu.cn;~Ziqi_Zhang6;zhangyao.crypto@bytedance.com;zhangtianyi1@buaa.edu.cn;~Jianwei_Liu2;~Ye_Wu8", "aff": ";;Peking University;;;School of Cyber Science and Technology, Beihang University;", "aff_domain": ";;pku.edu.cn;;;cst.buaa.edu.cn;", "position": ";;Postdoc;;;Full Professor;", "bibtex": "@inproceedings{\nzhang2024groupcover,\ntitle={GroupCover: A Secure, Efficient and Scalable Inference Framework for On-device Model Protection based on {TEE}s},\nauthor={Zheng Zhang and Na Wang and Ziqi Zhang and Yao Zhang and Tianyi Zhang and Jianwei Liu and Ye Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4mU6LNMaIu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 730310, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1821389264381020871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";;pku.edu.cn;;;cst.buaa.edu.cn;", "author_num": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Peking University;Beihang University", "aff_unique_dep": ";School of Cyber Science and Technology", "aff_unique_url": "http://www.pku.edu.cn;http://www.buaa.edu.cn", "aff_unique_abbr": "Peking U;Beihang", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "$\\bf{\\Phi}_\\textrm{Flow}$: Differentiable Simulations for PyTorch, TensorFlow and Jax", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35005", "id": "4oD0tRrUOX", "proceeding": "https://proceedings.mlr.press/v235/holl24a.html", "pdf": "https://openreview.net/pdf?id=4oD0tRrUOX", "openreview": "https://openreview.net/forum?id=4oD0tRrUOX", "author_site": "Philipp Holl, Nils Thuerey", "tldr": "", "abstract": "Differentiable processes have proven an invaluable tool for machine learning (ML) in scientific and engineering settings, but most ML libraries are not primarily designed for such applications. We present $\\Phi_\\textrm{Flow}$, a Python toolkit that seamlessly integrates with PyTorch, TensorFlow, Jax and NumPy, simplifying the process of writing differentiable simulation code at every step. $\\Phi_\\textrm{Flow}$ provides many essential features that go beyond the capabilities of the base libraries, such as differential operators, boundary conditions, the ability to write dimensionality-agnostic code, floating-point precision management, fully differentiable preconditioned (sparse) linear solves, automatic matrix generation via function tracing, integration of SciPy optimizers, simulation vectorization, and visualization tools. At the same time, $\\Phi_\\textrm{Flow}$ inherits all important traits of the base ML libraries, such as GPU / TPU support, just-in-time compilation, and automatic differentiation. 
Put together, these features drastically simplify scientific code like PDE or ODE solvers on grids or unstructured meshes, and $\\Phi_\\textrm{Flow}$ even includes out-of-the-box support for fluid simulations. $\\Phi_\\textrm{Flow}$ has been used in various publications and as a ground-truth solver in multiple scientific data sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Philipp Holl;Nils Thuerey", "authorids": "~Philipp_Holl1;~Nils_Thuerey1", "gender": "M;M", "homepage": ";https://ge.in.tum.de", "dblp": "256/9374;42/478", "google_scholar": "LilimmEAAAAJ;https://scholar.google.com.tw/citations?user=GEehwv8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Philipp_Holl1;~Nils_Thuerey1", "aff": "Technical University Munich;Technical University Munich", "aff_domain": "tum.de;tum.de", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nholl2024bfphitextrmflow,\ntitle={\\${\\textbackslash}bf\\{{\\textbackslash}Phi\\}\\_{\\textbackslash}textrm\\{Flow\\}\\$: Differentiable Simulations for PyTorch, TensorFlow and Jax},\nauthor={Philipp Holl and Nils Thuerey},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4oD0tRrUOX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3121455, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "tum.de;tum.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Online Learning with Bounded Recall", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35004", "id": "4pFgOzKF76", "proceeding": "https://proceedings.mlr.press/v235/schneider24b.html", "pdf": "https://openreview.net/pdf?id=4pFgOzKF76", "openreview": "https://openreview.net/forum?id=4pFgOzKF76", "author_site": "Jon Schneider, Kiran Vodrahalli", "tldr": "", "abstract": "We study the problem of full-information online learning in the ``bounded recall'' setting popular in the study of repeated games. An online learning algorithm $\\mathcal{A}$ is $M$-*bounded-recall* if its output at time $t$ can be written as a function of the $M$ previous rewards (and not e.g. any other internal state of $\\mathcal{A}$). We first demonstrate that a natural approach to constructing bounded-recall algorithms from mean-based no-regret learning algorithms (e.g., running Hedge over the last $M$ rounds) fails, and that any such algorithm incurs constant regret per round. We then construct a stationary bounded-recall algorithm that achieves a per-round regret of $\\Theta(1/\\sqrt{M})$, which we complement with a tight lower bound. 
Finally, we show that, unlike in the perfect-recall setting, any low-regret bounded-recall algorithm must be aware of the ordering of the past $M$ losses -- any bounded-recall algorithm which plays a symmetric function of the past $M$ losses must incur constant regret per round.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jon Schneider;Kiran Vodrahalli", "authorids": "~Jon_Schneider1;~Kiran_Vodrahalli1", "gender": "M;M", "homepage": "https://jschnei.github.io;https://kiranvodrahalli.github.io", "dblp": "146/0503;188/5863", "google_scholar": "Jc97EyAAAAAJ;7oBE9-oAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jon_Schneider1;~Kiran_Vodrahalli1", "aff": "Google;Google", "aff_domain": "google.com;google.com", "position": "Researcher;Researcher", "bibtex": "@inproceedings{\nschneider2024online,\ntitle={Online Learning with Bounded Recall},\nauthor={Jon Schneider and Kiran Vodrahalli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4pFgOzKF76}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 407680, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10059485042466197497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;google.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Mean-field Underdamped Langevin Dynamics and its Spacetime Discretization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35003", "id": "4qsduFJDEB", "proceeding": "https://proceedings.mlr.press/v235/fu24g.html", "pdf": "https://openreview.net/pdf?id=4qsduFJDEB", "openreview": "https://openreview.net/forum?id=4qsduFJDEB", "author_site": "Qiang Fu, Ashia Wilson", "tldr": "", "abstract": "We propose a new method called the N-particle underdamped Langevin algorithm for optimizing a special class of non-linear functionals defined over the space of probability measures. Examples of problems with this formulation include training mean-field neural networks, maximum mean discrepancy minimization and kernel Stein discrepancy minimization. Our algorithm is based on a novel spacetime discretization of the mean-field underdamped Langevin dynamics, for which we provide a new, fast mixing guarantee. 
In addition, we demonstrate that our algorithm converges globally in total variation distance, bridging the theoretical gap between the dynamics and its practical implementation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiang Fu;Ashia Camage Wilson", "authorids": "~Qiang_Fu11;~Ashia_Camage_Wilson1", "gender": "M;F", "homepage": "https://sites.google.com/view/qiangfu;https://www.ashiawilson.com", "dblp": ";", "google_scholar": "QiTtvZkAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Qiang_Fu11;~Ashia_C._Wilson1", "aff": "Sun Yat-sen University;Massachusetts Institute of Technology", "aff_domain": "sysu.edu.cn;mit.edu", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\nfu2024meanfield,\ntitle={Mean-field Underdamped Langevin Dynamics and its Spacetime Discretization},\nauthor={Qiang Fu and Ashia Camage Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4qsduFJDEB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1103524, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6407492465767185372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sysu.edu.cn;mit.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Sun Yat-sen University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn/;https://web.mit.edu", "aff_unique_abbr": "SYSU;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Sample-specific Masks for Visual Reprogramming-based Prompting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35002", "id": "4sikyurTLX", "proceeding": "https://proceedings.mlr.press/v235/cai24i.html", "pdf": "https://openreview.net/pdf?id=4sikyurTLX", "openreview": "https://openreview.net/forum?id=4sikyurTLX", "author_site": "Chengyi Cai, Zesheng Ye, Lei Feng, Jianzhong Qi, Feng Liu", "tldr": "", "abstract": "*Visual reprogramming* (VR) is a prompting technique that aims to re-purpose a pre-trained model (e.g., a classifier on ImageNet) to target tasks (e.g., medical data prediction) by learning a *small-scale pattern* added into input images instead of tuning considerable parameters within the model. The location of the pattern within input samples is usually determined by a pre-defined mask *shared across all samples*. In this paper, we show that the shared mask potentially limits VR's generalization and increases its approximation error due to the lack of sample-level adaptation. Motivated by this finding, we design a new framework for VR called *sample-specific multi-channel masks* (SMM). Specifically, SMM employs a lightweight ConvNet and patch-wise interpolation to generate sample-specific three-channel masks instead of a shared and pre-defined mask. Since we generate different masks for individual samples, SMM is theoretically shown to reduce approximation error for the target tasks compared with existing state-of-the-art VR methods. We also empirically demonstrate its performance gain on both ResNet and ViT. The success of SMM further highlights the broader applicability of VR in leveraging the latent knowledge of pre-trained models for various target tasks. 
Our code is available at https://github.com/tmlr-group/SMM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengyi Cai;Zesheng Ye;Lei Feng;Jianzhong Qi;Feng Liu", "authorids": "~Chengyi_Cai2;~Zesheng_Ye1;~Lei_Feng1;~Jianzhong_Qi1;~Feng_Liu2", "gender": "F;;M;M;M", "homepage": "https://caichengyi.github.io/;;https://lfeng1995.github.io/;https://people.eng.unimelb.edu.au/jianzhongq/;https://fengliu90.github.io/index.html", "dblp": ";;76/847-6;41/1074-1;77/1318-3", "google_scholar": ";;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;https://scholar.google.com.au/citations?user=mxS6eHYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0003-2839-5799;0000-0001-6501-9050;0000-0002-5005-9129", "linkedin": ";;;;alexfengliu", "or_profile": "~Chengyi_Cai2;~Zesheng_Ye1;~Lei_Feng1;~Jianzhong_Qi1;~Feng_Liu2", "aff": "University of Melbourne;;Singapore University of Technology and Design;University of Melbourne;University of Melbourne", "aff_domain": "unimelb.edu.au;;sutd.edu.sg;unimelb.edu.au;unimelb.edu.au", "position": "PhD student;;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\ncai2024samplespecific,\ntitle={Sample-specific Masks for Visual Reprogramming-based Prompting},\nauthor={Chengyi Cai and Zesheng Ye and Lei Feng and Jianzhong Qi and Feng Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4sikyurTLX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7492867, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13794895493406759582&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "unimelb.edu.au;;sutd.edu.sg;unimelb.edu.au;unimelb.edu.au", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Melbourne;Singapore University of Technology and Design", "aff_unique_dep": ";", "aff_unique_url": "https://www.unimelb.edu.au;https://www.sutd.edu.sg", "aff_unique_abbr": "UniMelb;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Australia;Singapore" }, { "title": "An Efficient Self-Learning Framework For Interactive Spoken Dialog Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35001", "id": "4uTJfGYA2t", "proceeding": "https://proceedings.mlr.press/v235/tulsiani24a.html", "pdf": "https://openreview.net/pdf?id=4uTJfGYA2t", "openreview": "https://openreview.net/forum?id=4uTJfGYA2t", "author_site": "Hitesh Tulsiani, David Chan, Shalini Ghosh, Garima Lalwani, Prabhat Pandey, Ankish Bansal, Sri Garimella, Ariya Rastrow, Bj\u00f6rn Hoffmeister", "tldr": "", "abstract": "Dialog systems, such as voice assistants, are expected to engage with users in complex, evolving conversations. Unfortunately, traditional automatic speech recognition (ASR) systems deployed in such applications are usually trained to recognize each turn independently and lack the ability to adapt to the conversational context or incorporate user feedback. In this work, we introduce a general framework for ASR in dialog systems that can go beyond learning from single-turn utterances and learn over time how to adapt to both explicit supervision and implicit user feedback present in multi-turn conversations. 
We accomplish that by leveraging advances in student-teacher learning and context-aware dialog processing, and designing contrastive self-supervision approaches with Ohm, a new online hard-negative mining approach. We show that leveraging our new framework compared to traditional training leads to relative WER reductions of close to 10% in real-world dialog systems, and up to 26% on public synthetic data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hitesh Tulsiani;David Chan;Shalini Ghosh;Garima Lalwani;Prabhat Pandey;Ankish Bansal;Sri Garimella;Ariya Rastrow;Bj\u00f6rn Hoffmeister", "authorids": "hittul@amazon.com;~David_Chan3;~Shalini_Ghosh3;glalwani@amazon.com;panprabh@amazon.com;bankish@amazon.com;srigar@amazon.com;~Ariya_Rastrow2;bjornh@amazon.com", "gender": ";M;F;;;;;M;", "homepage": ";https://people.eecs.berkeley.edu/~davidchan/;http://shalinighosh.com;;;;;;", "dblp": ";80/9659;45/4320;;;;;;", "google_scholar": ";qa4M89wAAAAJ;kC9Pmn8AAAAJ;;;;;78YEqxgAAAAJ;", "orcid": ";;;;;;;;", "linkedin": ";;shalini-ghosh-99bb3719;;;;;ariya-rastrow-4ab149b/;", "or_profile": "hittul@amazon.com;~David_Chan3;~Shalini_Ghosh3;glalwani@amazon.com;panprabh@amazon.com;bankish@amazon.com;srigar@amazon.com;~Ariya_Rastrow2;bjornh@amazon.com", "aff": ";University of California, Berkeley;Amazon;;;;;;", "aff_domain": ";berkeley.edu;amazon.com;;;;;;", "position": ";PhD student;Principal Researcher;;;;;;", "bibtex": "@inproceedings{\ntulsiani2024an,\ntitle={An Efficient Self-Learning Framework For Interactive Spoken Dialog Systems},\nauthor={Hitesh Tulsiani and David Chan and Shalini Ghosh and Garima Lalwani and Prabhat Pandey and Ankish Bansal and Sri Garimella and Ariya Rastrow and Bj{\\\"o}rn Hoffmeister},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4uTJfGYA2t}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 982630, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hXODXV7xCwcJ:scholar.google.com/&scioq=An+Efficient+Self-Learning+Framework+For+Interactive+Spoken+Dialog+Systems&hl=en&as_sdt=0,14", "gs_version_total": 9, "email": ";berkeley.edu;amazon.com;;;;;;", "author_num": 9, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Berkeley;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://www.amazon.com", "aff_unique_abbr": "UC Berkeley;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Model-Based RL for Mean-Field Games is not Statistically Harder than Single-Agent RL", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35000", "id": "4ye2I5OelI", "proceeding": "https://proceedings.mlr.press/v235/huang24i.html", "pdf": "https://openreview.net/pdf?id=4ye2I5OelI", "openreview": "https://openreview.net/forum?id=4ye2I5OelI", "author_site": "Jiawei Huang, Niao He, Andreas Krause", "tldr": "", "abstract": "We study the sample complexity of reinforcement learning (RL) in Mean-Field Games (MFGs) with model-based function approximation that requires strategic exploration to find a Nash Equilibrium policy. We introduce the Partial Model-Based Eluder Dimension (P-MBED), a more effective notion to characterize the model class complexity. 
Notably, P-MBED measures the complexity of the single-agent model class converted from the given mean-field model class, and potentially, can be exponentially lower than the MBED proposed by Huang et al. (2024). We contribute a model elimination algorithm featuring a novel exploration strategy and establish sample complexity results polynomial w.r.t. P-MBED. Crucially, our results reveal that, under the basic realizability and Lipschitz continuity assumptions, *learning Nash Equilibrium in MFGs is no more statistically challenging than solving a logarithmic number of single-agent RL problems*. We further extend our results to Multi-Type MFGs, generalizing from conventional MFGs and involving multiple types of agents. This extension implies statistical tractability of a broader class of Markov Games through the efficacy of mean-field approximation. Finally, inspired by our theoretical algorithm, we present a heuristic approach with improved computational efficiency and empirically demonstrate its effectiveness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiawei Huang;Niao He;Andreas Krause", "authorids": "~Jiawei_Huang3;~Niao_He3;~Andreas_Krause1", "gender": ";M;", "homepage": "https://jiaweihhuang.github.io;https://las.inf.ethz.ch/krausea;http://people.inf.ethz.ch/niaohe", "dblp": "13/4208;87/1831-1.html;https://dblp.uni-trier.de/pers/h/He:Niao.html", "google_scholar": "6IcfJiIAAAAJ;https://scholar.google.ch/citations?user=eDHv58AAAAAJ;iNcA81MAAAAJ", "orcid": ";0000-0001-7260-9673;", "linkedin": ";krausea/;", "or_profile": "~Jiawei_Huang3;~Andreas_Krause1;~Niao_He1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "inf.ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024modelbased,\ntitle={Model-Based {RL} for Mean-Field Games is not Statistically Harder than Single-Agent {RL}},\nauthor={Jiawei Huang and Niao He and Andreas Krause},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4ye2I5OelI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 787614, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9087503395722135109&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "inf.ethz.ch;ethz.ch;ethz.ch", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Premise Order Matters in Reasoning with Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34999", "id": "4zAHgkiCQg", "proceeding": "https://proceedings.mlr.press/v235/chen24i.html", "pdf": "https://openreview.net/pdf?id=4zAHgkiCQg", "openreview": "https://openreview.net/forum?id=4zAHgkiCQg", "author_site": "Xinyun Chen, Ryan Chi, Xuezhi Wang, Denny Zhou", "tldr": "", "abstract": "Large language models (LLMs) have accomplished remarkable reasoning performance in various domains. 
However, in the domain of reasoning tasks, we discover a frailty: LLMs are surprisingly brittle to the ordering of the premises, despite the fact that such ordering does not alter the underlying task. In particular, we observe that LLMs achieve the best performance when the premise order aligns with the context required in intermediate reasoning steps. For example, in deductive reasoning tasks, presenting the premises in the same order as the ground truth proof in the prompt (as opposed to random ordering) drastically increases the model's accuracy. We first examine the effect of premise ordering on deductive reasoning on a variety of LLMs, and our evaluation shows that even if the model performance is decent on the optimal order, permuting the premise order can cause a performance drop of over 30%. In addition, we release the benchmark R-GSM, based on GSM8K, to examine the ordering effect for mathematical problem-solving, and we again observe a significant drop in accuracy, relative to the original GSM8K benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyun Chen;Ryan Andrew Chi;Xuezhi Wang;Denny Zhou", "authorids": "~Xinyun_Chen1;~Ryan_Andrew_Chi1;~Xuezhi_Wang3;~Denny_Zhou1", "gender": "M;;;F", "homepage": "http://ryanachi.com;https://research.google/people/105995/;https://dennyzhou.github.io/;https://jungyhuk.github.io/", "dblp": ";70/4090-2;178/3277;", "google_scholar": ";ScLUQ-YAAAAJ;UwLsYw8AAAAJ;d4W1UT0AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ryan_Andrew_Chi1;~Xuezhi_Wang3;~Dengyong_Zhou2;~Xinyun_Chen2", "aff": "Stanford University;Google DeepMind;Google DeepMind;Google", "aff_domain": "stanford.edu;google.com;google.com;google.com", "position": "Undergrad student;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\nchen2024premise,\ntitle={Premise Order Matters in Reasoning with Large Language Models},\nauthor={Xinyun Chen and Ryan Andrew Chi and Xuezhi Wang and Denny Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4zAHgkiCQg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6261725, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16672494262602610426&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "stanford.edu;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.stanford.edu;https://deepmind.com", "aff_unique_abbr": "Stanford;DeepMind", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Privacy-Preserving Data Release Leveraging Optimal Transport and Particle Gradient Descent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34998", "id": "4zN9tvZfns", "proceeding": "https://proceedings.mlr.press/v235/donhauser24a.html", "pdf": "https://openreview.net/pdf?id=4zN9tvZfns", "openreview": "https://openreview.net/forum?id=4zN9tvZfns", "author_site": "Konstantin Donhauser, Javier Abad, Neha Hulkund, Fanny Yang", "tldr": "", "abstract": "We present a novel approach for differentially private data synthesis of protected tabular datasets, a relevant task in highly sensitive domains 
such as healthcare and government. Current state-of-the-art methods predominantly use marginal-based approaches, where a dataset is generated from private estimates of the marginals. In this paper, we introduce PrivPGD, a new generation method for marginal-based private data synthesis, leveraging tools from optimal transport and particle gradient descent. Our algorithm outperforms existing methods on a large range of datasets while being highly scalable and offering the flexibility to incorporate additional domain-specific constraints.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Konstantin Donhauser;Javier Abad;Neha Hulkund;Fanny Yang", "authorids": "~Konstantin_Donhauser1;~Javier_Abad1;~Neha_Hulkund1;~Fanny_Yang1", "gender": "M;M;F;", "homepage": ";;https://hulkund.github.io;http://www.fanny-yang.de", "dblp": "238/0076;;297/5263;126/4852", "google_scholar": ";gGHkUhkAAAAJ;;BfDKicQAAAAJ", "orcid": ";;;", "linkedin": "konstantin-donhauser-5a5704192/;javiabadm/;;", "or_profile": "~Konstantin_Donhauser1;~Javier_Abad1;~Neha_Hulkund1;~Fanny_Yang1", "aff": "Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Massachusetts Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;mit.edu;ethz.ch", "position": "PhD student;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\ndonhauser2024privacypreserving,\ntitle={Privacy-Preserving Data Release Leveraging Optimal Transport and Particle Gradient Descent},\nauthor={Konstantin Donhauser and Javier Abad and Neha Hulkund and Fanny Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4zN9tvZfns}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 588337, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14802883606878414686&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ethz.ch;ethz.ch;mit.edu;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch;https://web.mit.edu", "aff_unique_abbr": "ETH Zurich;ETHZ;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "Probabilistic Modeling of Interpersonal Coordination Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34997", "id": "4zOZ0yKhm6", "proceeding": "https://proceedings.mlr.press/v235/soares24a.html", "pdf": "https://openreview.net/pdf?id=4zOZ0yKhm6", "openreview": "https://openreview.net/forum?id=4zOZ0yKhm6", "author_site": "Paulo Soares, Adarsh Pyarelal, Meghavarshini Krishnaswamy, Emily Butler, Kobus Barnard", "tldr": "", "abstract": "We develop a novel probabilistic model for interpersonal coordination as a latent phenomenon explaining statistical temporal influence between multiple components in a system. For example, the state of one person can influence that of another at a later time, as indicated by their observed behaviors. We characterize coordination as the degree to which the distributions for such states at one time point are merged for the next salient time point. 
We evaluate our model in the context of three-person teams executing a virtual search and rescue (SAR) mission. We first use synthetic data to confirm that our technical definition of coordination is consistent with expectations and that we can recover generated coordination despite noise. We then show that captured coordination can be predictive of team performance on real data. Here we use speech vocalics and semantics to infer coordination for 36 teams carrying out two successive SAR missions. In two different datasets, we find that coordination is generally predictive of team score for the second mission, but not for the first, where teams are largely learning to play the game. In addition, we found that including a semantic modality improves prediction in some scenarios. This shows that our intuitive technical definition can capture useful explanatory aspects of team behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paulo Soares;Adarsh Pyarelal;Meghavarshini Krishnaswamy;Emily Butler;Kobus Barnard", "authorids": "~Paulo_Soares1;~Adarsh_Pyarelal1;~Meghavarshini_Krishnaswamy1;~Emily_Butler3;~Kobus_Barnard1", "gender": "M;M;F;M;F", "homepage": ";https://adarsh.cc;https://linguistics.arizona.edu/people/meghavarshini-krishnaswamy;http://kobus.ca;", "dblp": "118/5058.html;242/7424;290/5863;53/2666;", "google_scholar": "Fpwf6FYAAAAJ;https://scholar.google.com/citations?hl=en;9HzdoS0AAAAJ;https://scholar.google.co.uk/citations?user=fKESO6sAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-0646-037X;0000-0002-1602-0386;0000-0002-0205-9298;0000-0002-8568-9518;", "linkedin": "paulosoaresua/;adarshpyarelal/;meghavarshini-krishnaswamy/;;", "or_profile": "~Paulo_Soares1;~Adarsh_Pyarelal1;~Meghavarshini_Krishnaswamy1;~Kobus_Barnard1;~Emily_Annette_Butler1", "aff": "University of Arizona;University of Arizona;University of Arizona;University of Arizona;", "aff_domain": "cs.arizona.edu;arizona.edu;arizona.edu;arizona.edu;", "position": "PhD student;Assistant Professor;PhD student;Professor;", "bibtex": "@inproceedings{\nsoares2024probabilistic,\ntitle={Probabilistic Modeling of Interpersonal Coordination Processes},\nauthor={Paulo Soares and Adarsh Pyarelal and Meghavarshini Krishnaswamy and Emily Butler and Kobus Barnard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4zOZ0yKhm6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 814368, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=482743812086415414&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "cs.arizona.edu;arizona.edu;arizona.edu;arizona.edu;", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Arizona", "aff_unique_dep": "", "aff_unique_url": "https://www.arizona.edu", "aff_unique_abbr": "UA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Quantum Implicit Neural Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34996", "id": "50vc4HBuKU", "proceeding": "https://proceedings.mlr.press/v235/zhao24l.html", "pdf": "https://openreview.net/pdf?id=50vc4HBuKU", "openreview": "https://openreview.net/forum?id=50vc4HBuKU", "author_site": "Jiaming Zhao, Wenbo Qiao, Peng Zhang, Hui Gao", "tldr": "", "abstract": 
"Implicit neural representations have emerged as a powerful paradigm to represent signals such as images and sounds. This approach aims to utilize neural networks to parameterize the implicit function of the signal. However, when representing implicit functions, traditional neural networks such as ReLU-based multilayer perceptrons face challenges in accurately modeling high-frequency components of signals. Recent research has begun to explore the use of Fourier Neural Networks (FNNs) to overcome this limitation. In this paper, we propose Quantum Implicit Representation Network (QIREN), a novel quantum generalization of FNNs. Furthermore, through theoretical analysis, we demonstrate that QIREN possesses a quantum advantage over classical FNNs. Lastly, we conducted experiments in signal representation, image superresolution, and image generation tasks to show the superior performance of QIREN compared to state-of-the-art (SOTA) models. Our work not only incorporates quantum advantages into implicit neural representations but also uncovers a promising application direction for Quantum Neural Networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaming Zhao;Wenbo Qiao;Peng Zhang;Hui Gao", "authorids": "~Jiaming_Zhao2;~Wenbo_Qiao1;~Peng_Zhang17;~Hui_Gao4", "gender": ";;M;F", "homepage": ";;http://cic.tju.edu.cn/faculty/zhangpeng/index.html;https://github.com/TJUIRLAB/SIGIR20_QINM", "dblp": ";;21/1048-2%20;", "google_scholar": ";;tvDb5_cAAAAJ;", "orcid": ";;0000-0003-0228-9330;", "linkedin": ";;;", "or_profile": "~Jiaming_Zhao2;~Wenbo_Qiao1;~Peng_Zhang17;~Hui_Gao4", "aff": ";;Tianjin University;Tianjin University", "aff_domain": ";;tju.edu.cn;tju.edu.cn", "position": ";;Full Professor;PhD student", "bibtex": "@inproceedings{\nzhao2024quantum,\ntitle={Quantum Implicit Neural Representations},\nauthor={Jiaming Zhao and Wenbo Qiao and Peng Zhang and Hui Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=50vc4HBuKU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9129890, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8561601883087271092&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": ";;tju.edu.cn;tju.edu.cn", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Pricing with Contextual Elasticity and Heteroscedastic Valuation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34995", "id": "51gXk4BISH", "proceeding": "https://proceedings.mlr.press/v235/xu24x.html", "pdf": "https://openreview.net/pdf?id=51gXk4BISH", "openreview": "https://openreview.net/forum?id=51gXk4BISH", "author_site": "Jianyu Xu, Yu-Xiang Wang", "tldr": "", "abstract": "We study an online contextual dynamic pricing problem, where customers decide whether to purchase a product based on its features and price. We introduce a novel approach to modeling a customer's expected demand by incorporating feature-based price elasticity, which can be equivalently represented as a valuation with heteroscedastic noise. 
To solve the problem, we propose a computationally efficient algorithm called \"Pricing with Perturbation (PwP)\", which enjoys an $O(\\sqrt{dT\\log T})$ regret while allowing arbitrary adversarial input context sequences. We also prove a matching lower bound at $\\Omega(\\sqrt{dT})$ to show the optimality regarding $d$ and $T$ (up to $\\log T$ factors). Our results shed light on the relationship between contextual elasticity and heteroscedastic valuation, providing insights for effective and practical pricing strategies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianyu Xu;Yu-Xiang Wang", "authorids": "~Jianyu_Xu1;~Yu-Xiang_Wang1", "gender": "M;", "homepage": "https://xu-jy.github.io/;http://www.cs.ucsb.edu/~yuxiangw/publications.html", "dblp": ";62/1637-3.html", "google_scholar": "3ubVhAMAAAAJ;HGNZ1fkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jianyu_Xu1;~Yu-Xiang_Wang1", "aff": "UC Santa Barbara;UC Santa Barbara", "aff_domain": "ucsb.edu;ucsb.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxu2024pricing,\ntitle={Pricing with Contextual Elasticity and Heteroscedastic Valuation},\nauthor={Jianyu Xu and Yu-Xiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=51gXk4BISH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7228360, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1906783157447010543&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ucsb.edu;ucsb.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34994", "id": "51iwkioZpn", "proceeding": "https://proceedings.mlr.press/v235/xu24t.html", "pdf": "https://openreview.net/pdf?id=51iwkioZpn", "openreview": "https://openreview.net/forum?id=51iwkioZpn", "author_site": "Haoran Xu, Amr Sharaf, Yunmo Chen, Weiting Tan, Lingfeng Shen, Benjamin Van Durme, Kenton Murray, Young Jin Kim", "tldr": "", "abstract": "Moderate-sized large language models (LLMs) -- those with 7B or 13B parameters -- exhibit promising machine translation (MT) performance. However, they do not match the performance of state-of-the-art conventional encoder-decoder translation models or larger-scale LLMs such as GPT-4. In this study, we bridge this performance gap. We first assess the shortcomings of supervised fine-tuning for LLMs in the MT task, emphasizing the quality issues present in the reference data, despite being human-generated. Then, in contrast to supervised fine-tuning which mimics reference translations, we introduce Contrastive Preference Optimization (CPO), a novel approach that trains models to avoid generating adequate but not perfect translations. Applying CPO to ALMA models with only 22K parallel sentences and 0.1% parameters yields significant improvements. 
The resulting model, called ALMA-R, can match or exceed the performance of the WMT competition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoran Xu;Amr Sharaf;Yunmo Chen;Weiting Tan;Lingfeng Shen;Benjamin Van Durme;Kenton Murray;Young Jin Kim", "authorids": "~Haoran_Xu3;~Amr_Sharaf1;~Yunmo_Chen1;~Weiting_Tan1;~Lingfeng_Shen1;~Benjamin_Van_Durme2;~Kenton_Murray1;~Young_Jin_Kim1", "gender": "M;M;M;M;M;;;M", "homepage": "https://www.fe1ixxu.com/;http://cs.umd.edu/~amr;https://omnuy.me;https://steventan0110.github.io/;;;http://www.kentonmurray.com;https://www.microsoft.com/en-us/research/people/youki/", "dblp": ";159/1156;252/7831;208/0745;240/5490.html;;143/9465;00/8110-1.html", "google_scholar": "rhcrGQ0AAAAJ;It3Gm1EAAAAJ;V-g2Tx8AAAAJ;hD8E4gYAAAAJ;PoSTdLAAAAAJ;;;", "orcid": ";;;;;;0000-0002-5628-1003;", "linkedin": "haoran-xu-0842b3194/;amrsharaf/;yunmochen;weiting-steven-tan-30bb4a175/;;;kentonmurray/;ykim362/", "or_profile": "~Haoran_Xu3;~Amr_Sharaf1;~Yunmo_Chen1;~Weiting_Tan1;~Lingfeng_Shen1;~Benjamin_Van_Durme2;~Kenton_Murray1;~Young_Jin_Kim1", "aff": "Johns Hopkins University;Microsoft;Johns Hopkins University;Johns Hopkins University;Johns Hopkins University;;Johns Hopkins University;Microsoft", "aff_domain": "jhu.edu;microsoft.com;jhu.edu;jhu.edu;jh.edu;;jhu.edu;microsoft.com", "position": "PhD student;Researcher;PhD student;PhD student;MS student;;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nxu2024contrastive,\ntitle={Contrastive Preference Optimization: Pushing the Boundaries of {LLM} Performance in Machine Translation},\nauthor={Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=51iwkioZpn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1242870, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 185, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6361588517228067393&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "jhu.edu;microsoft.com;jhu.edu;jhu.edu;jh.edu;;jhu.edu;microsoft.com", "author_num": 8, "aff_unique_index": "0;1;0;0;0;0;1", "aff_unique_norm": "Johns Hopkins University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.jhu.edu;https://www.microsoft.com", "aff_unique_abbr": "JHU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "PAGER: Accurate Failure Characterization in Deep Regression Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34993", "id": "5353dJE9Ek", "proceeding": "https://proceedings.mlr.press/v235/j-thiagarajan24a.html", "pdf": "https://openreview.net/pdf?id=5353dJE9Ek", "openreview": "https://openreview.net/forum?id=5353dJE9Ek", "author_site": "Jayaraman J. Thiagarajan, Vivek Narayanaswamy, Puja Trivedi, Rushil Anirudh", "tldr": "", "abstract": "Safe deployment of AI models requires proactive detection of failures to prevent costly errors. To this end, we study the important problem of detecting failures in deep regression models. Existing approaches rely on epistemic uncertainty estimates or inconsistency w.r.t the training data to identify failure. 
Interestingly, we find that while uncertainties are necessary, they are insufficient to accurately characterize failure in practice. Hence, we introduce PAGER (Principled Analysis of Generalization Errors in Regressors), a framework to systematically detect and characterize failures in deep regressors. Built upon the principle of anchored training in deep models, PAGER unifies both epistemic uncertainty and complementary manifold non-conformity scores to accurately organize samples into different risk regimes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jayaraman J. Thiagarajan;Vivek Narayanaswamy;Puja Trivedi;Rushil Anirudh", "authorids": "~Jayaraman_J._Thiagarajan3;~Vivek_Narayanaswamy1;~Puja_Trivedi1;~Rushil_Anirudh1", "gender": "M;F;M;M", "homepage": ";https://pujacomputes.github.io/;https://rushila.com/;https://jjthiagarajan.com", "dblp": "230/4531;274/2080;136/5391;16/7803", "google_scholar": "7h2Ui6YAAAAJ;1y9cR50AAAAJ;WkoIlpQAAAAJ;cMz65_oAAAAJ", "orcid": ";0000-0003-1874-8992;0000-0002-4186-3502;", "linkedin": ";;rushilanirudh/;", "or_profile": "~Vivek_Narayanaswamy1;~Puja_Trivedi1;~Rushil_Anirudh1;~Jayaraman_J._Thiagarajan2", "aff": "Lawrence Livermore National Labs;University of Michigan;Amazon;Lawrence Livermore National Labs", "aff_domain": "llnl.gov;umich.edu;amazon.com;llnl.gov", "position": "Researcher;PhD student;Applied Scientist;Computer Scientist", "bibtex": "@inproceedings{\nthiagarajan2024pager,\ntitle={{PAGER}: Accurate Failure Characterization in Deep Regression Models},\nauthor={Jayaraman J. Thiagarajan and Vivek Narayanaswamy and Puja Trivedi and Rushil Anirudh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5353dJE9Ek}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4780527, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2195777955766171323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "llnl.gov;umich.edu;amazon.com;llnl.gov", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Lawrence Livermore National Laboratory;University of Michigan;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.llnl.gov;https://www.umich.edu;https://www.amazon.com", "aff_unique_abbr": "LLNL;UM;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fine-tuning Reinforcement Learning Models is Secretly a Forgetting Mitigation Problem", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34992", "id": "53iSXb1m8w", "proceeding": "https://proceedings.mlr.press/v235/wolczyk24a.html", "pdf": "https://openreview.net/pdf?id=53iSXb1m8w", "openreview": "https://openreview.net/forum?id=53iSXb1m8w", "author_site": "Maciej Wo\u0142czyk, Bart\u0142omiej Cupia\u0142, Mateusz Ostaszewski, Micha\u0142 Bortkiewicz, Micha\u0142 Zaj\u0105c, Razvan Pascanu, Lukasz Kucinski, Piotr Milos", "tldr": "", "abstract": "Fine-tuning is a widespread technique that allows practitioners to transfer pre-trained capabilities, as recently showcased by the successful applications of foundation models. However, fine-tuning reinforcement learning (RL) models remains a challenge.
This work conceptualizes one specific cause of poor transfer, accentuated in the RL setting by the interplay between actions and observations: *forgetting of pre-trained capabilities*. Namely, a model deteriorates on the state subspace of the downstream task not visited in the initial phase of fine-tuning, on which the model behaved well due to pre-training. This way, we lose the anticipated transfer benefits. We identify conditions when this problem occurs, showing that it is common and, in many cases, catastrophic. Through a detailed empirical analysis of the challenging NetHack and Montezuma's Revenge environments, we show that standard knowledge retention techniques mitigate the problem and thus allow us to take full advantage of the pre-trained capabilities. In particular, in NetHack, we achieve a new state-of-the-art for neural models, improving the previous best score from $5$K to over $10$K points in the Human Monk scenario.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maciej Wolczyk;Bart\u0142omiej Cupia\u0142;Mateusz Ostaszewski;Micha\u0142 Bortkiewicz;Micha\u0142 Zaj\u0105c;Razvan Pascanu;\u0141ukasz Kuci\u0144ski;Piotr Mi\u0142o\u015b", "authorids": "~Maciej_Wolczyk1;~Bart\u0142omiej_Cupia\u01421;~Mateusz_Ostaszewski1;~Micha\u0142_Bortkiewicz1;~Micha\u0142_Zaj\u0105c1;~Razvan_Pascanu1;~\u0141ukasz_Kuci\u0144ski1;~Piotr_Mi\u0142o\u015b1", "gender": "M;M;;;M;M;M;", "homepage": ";https://bartekcupial.github.io/;;;;https://razp.info;https://sites.google.com/view/lukaszkucinski;", "dblp": "236/5956;;;;02/6977-5.html;65/8368.html;250/9699;208/0989.html", "google_scholar": ";2I0GV_UAAAAJ;;;https://scholar.google.pl/citations?user=5HHtXzwAAAAJ;https://scholar.google.ca/citations?user=eSPY8LwAAAAJ;l6dK-VUAAAAJ;Se68XecAAAAJ", "orcid": ";;;;;;0000-0002-5617-8129;", "linkedin": ";bart%C5%82omiej-cupia%C5%82-7b37b0160;;;;;https://linkedin.com/in/lukasz-kucinski;piotr-milos-4b02151/", "or_profile": "~Maciej_Wolczyk1;~Bart\u0142omiej_Cupia\u01421;~Mateusz_Ostaszewski1;~Micha\u0142_Bortkiewicz1;~Micha\u0142_Zaj\u0105c1;~Razvan_Pascanu1;~\u0141ukasz_Kuci\u0144ski1;~Piotr_Mi\u0142o\u015b1", "aff": "IDEAS NCBR Sp.;University of Warsaw;;;FAR AI;Google DeepMind;Institute of Mathematics Polish Academy of Sciences;IDEAS NCBR", "aff_domain": "ideas-ncbr.pl;uw.edu.pl;;;far.ai;google.com;impan.pl;ideas-ncbr.pl", "position": "Postdoc;PhD student;;;Research Engineer;Research Scientist;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nwolczyk2024finetuning,\ntitle={Fine-tuning Reinforcement Learning Models is Secretly a Forgetting Mitigation Problem},\nauthor={Maciej Wolczyk and Bart{\\l}omiej Cupia{\\l} and Mateusz Ostaszewski and Micha{\\l} Bortkiewicz and Micha{\\l} Zaj{\\k{a}}c and Razvan Pascanu and {\\L}ukasz Kuci{\\'n}ski and Piotr Mi{\\l}o{\\'s}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=53iSXb1m8w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5035773, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9250153917008115615&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "ideas-ncbr.pl;uw.edu.pl;;;far.ai;google.com;impan.pl;ideas-ncbr.pl", "author_num": 8, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "IDEAS NCBR;University of Warsaw;FAR AI;Google;Polish Academy of Sciences;Institute for Development, Economic Analysis, and Simulation 
(IDEAS)", "aff_unique_dep": ";;;Google DeepMind;Institute of Mathematics;", "aff_unique_url": ";https://www.uw.edu.pl;https://www.far.ai;https://deepmind.com;https://www.impan.pl/;https://www.ideas-ncbr.gov.pl", "aff_unique_abbr": ";UW;FAR AI;DeepMind;PAS;IDEAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0;0", "aff_country_unique": "Poland;United States;United Kingdom" }, { "title": "SparseTSF: Modeling Long-term Time Series Forecasting with *1k* Parameters", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34991", "id": "54NSHO0lFe", "proceeding": "https://proceedings.mlr.press/v235/lin24n.html", "pdf": "https://openreview.net/pdf?id=54NSHO0lFe", "openreview": "https://openreview.net/forum?id=54NSHO0lFe", "author_site": "Shengsheng Lin, Weiwei Lin, Wentai Wu, Haojun Chen, Junjie Yang", "tldr": "", "abstract": "This paper introduces SparseTSF, a novel, extremely lightweight model for Long-term Time Series Forecasting (LTSF), designed to address the challenges of modeling complex temporal dependencies over extended horizons with minimal computational resources. At the heart of SparseTSF lies the Cross-Period Sparse Forecasting technique, which simplifies the forecasting task by decoupling the periodicity and trend in time series data. This technique involves downsampling the original sequences to focus on cross-period trend prediction, effectively extracting periodic features while minimizing the model's complexity and parameter count. Based on this technique, the SparseTSF model uses fewer than *1k* parameters to achieve competitive or superior performance compared to state-of-the-art models. Furthermore, SparseTSF showcases remarkable generalization capabilities, making it well-suited for scenarios with limited computational resources, small samples, or low-quality data. 
The code is publicly available at this repository: https://github.com/lss-1138/SparseTSF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengsheng Lin;Weiwei Lin;Wentai Wu;Haojun Chen;Junjie Yang", "authorids": "~Shengsheng_Lin1;~Weiwei_Lin1;~Wentai_Wu1;~Haojun_Chen3;~Junjie_Yang7", "gender": ";M;M;M;M", "homepage": ";https://www.scholat.com/linweiwei;https://wingter562.github.io/wentai_homepage/;;https://orcid.org/0009-0005-3988-1268", "dblp": ";53/282-1;;;", "google_scholar": ";IWsha94AAAAJ;hyTiOb0AAAAJ;;", "orcid": "0000-0001-5445-5148;0000-0001-6876-1795;;0009-0006-0616-3865;0009-0005-3988-1268", "linkedin": ";;;;", "or_profile": "~Shengsheng_Lin1;~Weiwei_Lin1;~Wentai_Wu1;~Haojun_Chen3;~Junjie_Yang7", "aff": "South China University of Technology;South China University of Technology;Jinan University;South China University of Technology;South China University of Technology", "aff_domain": "scut.edu.cn;scut.edu.cn;jnu.edu.cn;mail.scut.edu.cn;mail.scut.edu", "position": "PhD student;Full Professor;Associate Professor;Undergrad student;Undergrad student", "bibtex": "@inproceedings{\nlin2024sparsetsf,\ntitle={Sparse{TSF}: Modeling Long-term Time Series Forecasting with *1k* Parameters},\nauthor={Shengsheng Lin and Weiwei Lin and Wentai Wu and Haojun Chen and Junjie Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=54NSHO0lFe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 697339, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9916472971791394475&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "scut.edu.cn;scut.edu.cn;jnu.edu.cn;mail.scut.edu.cn;mail.scut.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "South China University of Technology;Jinan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.scut.edu.cn;https://www.jnu.edu.cn", "aff_unique_abbr": "SCUT;JNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning Optimal Projection for Forecast Reconciliation of Hierarchical Time Series", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34990", "id": "55HfvJ6lDB", "proceeding": "https://proceedings.mlr.press/v235/tsiourvas24b.html", "pdf": "https://openreview.net/pdf?id=55HfvJ6lDB", "openreview": "https://openreview.net/forum?id=55HfvJ6lDB", "author_site": "Asterios Tsiourvas, Wei Sun, Georgia Perakis, Pin-Yu Chen, Yada Zhu", "tldr": "", "abstract": "Hierarchical time series forecasting requires not only prediction accuracy but also coherency, i.e., forecasts add up appropriately across the hierarchy. Recent literature has shown that reconciliation via projection outperforms prior methods such as top-down or bottom-up approaches. Unlike existing work that pre-specifies a projection matrix (e.g., orthogonal), we study the problem of learning the optimal oblique projection from data for coherent forecasting of hierarchical time series. In addition to the unbiasedness-preserving property, oblique projection implicitly accounts for the hierarchy structure and assigns different weights to individual time series, providing significant adaptability over orthogonal projection which treats base forecast errors equally. 
We examine two broad classes of projections, namely Euclidean projection and general oblique projections. We propose to model the reconciliation step as a learnable, structured, projection layer in the neural forecaster architecture. The proposed approach allows for the efficient learning of the optimal projection in an end-to-end framework where both the neural forecaster and the projection layer are learned simultaneously. An empirical evaluation of real-world hierarchical time series datasets demonstrates the superior performance of the proposed method over existing state-of-the-art approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Asterios Tsiourvas;Wei Sun;Georgia Perakis;Pin-Yu Chen;Yada Zhu", "authorids": "~Asterios_Tsiourvas1;~Wei_Sun7;~Georgia_Perakis1;~Pin-Yu_Chen1;~Yada_Zhu1", "gender": "M;F;F;M;", "homepage": "https://www.linkedin.com/in/asterios-tsiourvas/;https://research.ibm.com/people/wei-sun;https://mitmgmtfaculty.mit.edu/gperakis/;http://www.pinyuchen.com;https://researcher.watson.ibm.com/researcher/view.php?person=us-yzhu", "dblp": ";09/5042-31;;39/8969;56/8808", "google_scholar": ";IEt8VeoAAAAJ;SUwM5jUAAAAJ;jxwlCUUAAAAJ;AJb408gAAAAJ", "orcid": ";;;0000-0003-1039-8369;0000-0002-3338-6371", "linkedin": ";;;pin-yu-chen-940062a2;yadazhu/", "or_profile": "~Asterios_Tsiourvas1;~Wei_Sun7;~Georgia_Perakis1;~Pin-Yu_Chen1;~Yada_Zhu1", "aff": "Massachusetts Institute of Technology;;Massachusetts Institute of Technology;International Business Machines;IBM Research", "aff_domain": "mit.edu;;mit.edu;ibm.com;us.ibm.com", "position": "PhD student;;Full Professor;Principal Researcher;Principal Research Scientist", "bibtex": "@inproceedings{\ntsiourvas2024learning,\ntitle={Learning Optimal Projection for Forecast Reconciliation of Hierarchical Time Series},\nauthor={Asterios Tsiourvas and Wei Sun and Georgia Perakis and Pin-Yu Chen and Yada Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=55HfvJ6lDB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1688874, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7yqXLkOv3DwJ:scholar.google.com/&scioq=Learning+Optimal+Projection+for+Forecast+Reconciliation+of+Hierarchical+Time+Series&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "mit.edu;;mit.edu;ibm.com;us.ibm.com", "author_num": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;International Business Machines Corporation;IBM", "aff_unique_dep": ";;IBM Research", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "MIT;IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Breadth-First Exploration on Adaptive Grid for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34989", "id": "59MYoLghyk", "proceeding": "https://proceedings.mlr.press/v235/yoon24d.html", "pdf": "https://openreview.net/pdf?id=59MYoLghyk", "openreview": "https://openreview.net/forum?id=59MYoLghyk", "author_site": "Youngsik Yoon, Gangbok Lee, Sungsoo Ahn, Jungseul Ok", "tldr": "", "abstract": "Graph-based planners have gained significant attention for goal-conditioned reinforcement learning (RL), where they construct a graph 
consisting of confident transitions between *subgoals* as edges and run shortest path algorithms to exploit the confident edges. Meanwhile, identifying and avoiding unattainable transitions are also crucial yet overlooked by previous graph-based planners, leading to an excessive number of wasted attempts at unattainable subgoals. To address this oversight, we propose a graph construction method that efficiently manages all the achieved and unattained subgoals on a grid graph adaptively discretizing the goal space. This enables a breadth-first exploration strategy, grounded in the local adaptive grid refinement, that prioritizes broad probing of subgoals on a coarse grid over a meticulous one on a dense grid. We conducted a theoretical analysis and demonstrated the effectiveness of our approach through empirical evidence, showing that only BEAG succeeds in complex environments under the proposed fixed-goal setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youngsik Yoon;Gangbok Lee;Sungsoo Ahn;Jungseul Ok", "authorids": "~Youngsik_Yoon1;~Gangbok_Lee1;~Sungsoo_Ahn1;~Jungseul_Ok2", "gender": "M;;M;M", "homepage": "http://ml.postech.ac.kr/;http://ml.postech.ac.kr;https://sungsooahn.super.site/;https://sites.google.com/view/jungseulok", "dblp": "273/1365;;90/5164;117/3448", "google_scholar": ";;XTenHs0AAAAJ;KWG3UUMAAAAJ", "orcid": ";;;0000-0003-4742-2473", "linkedin": ";;;", "or_profile": "~Youngsik_Yoon1;~Gangbok_Lee1;~Sungsoo_Ahn1;~Jungseul_Ok2", "aff": "POSTECH;POSTECH;Pohang University of Science and Technology;POSTECH", "aff_domain": "postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.ac.kr", "position": "PhD student;MS student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nyoon2024breadthfirst,\ntitle={Breadth-First Exploration on Adaptive Grid for Reinforcement Learning},\nauthor={Youngsik Yoon and Gangbok Lee and Sungsoo Ahn and Jungseul Ok},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=59MYoLghyk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2181575, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16355095761765023030&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Discovering Symmetry Breaking in Physical Systems with Relaxed Group Convolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34988", "id": "59oXyDTLJv", "proceeding": "https://proceedings.mlr.press/v235/wang24y.html", "pdf": "https://openreview.net/pdf?id=59oXyDTLJv", "openreview": "https://openreview.net/forum?id=59oXyDTLJv", "author_site": "Rui Wang, Elyssa Hofgard, Han Gao, Robin Walters, Tess Smidt", "tldr": "", "abstract": "Modeling symmetry breaking is essential for understanding the fundamental changes in the behaviors and properties of physical systems, from microscopic particle interactions to macroscopic phenomena like fluid dynamics and cosmic structures.
Thus, identifying sources of asymmetry is an important tool for understanding physical systems. In this paper, we focus on learning asymmetries of data using relaxed group convolutions. We provide both theoretical and empirical evidence that this flexible convolution technique allows the model to maintain the highest level of equivariance that is consistent with data and discover the subtle symmetry-breaking factors in various physical systems. We employ various relaxed group convolution architectures to uncover various symmetry-breaking factors that are interpretable and physically meaningful in different physical systems, including the phase transition of crystal structure, the isotropy and homogeneity breaking in turbulent flow, and the time-reversal symmetry breaking in pendulum systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rui Wang;Elyssa Hofgard;Han Gao;Robin Walters;Tess Smidt", "authorids": "~Rui_Wang11;ehofgard@mit.edu;~Han_Gao3;~Robin_Walters1;~Tess_Smidt1", "gender": "M;;M;M;F", "homepage": "https://rui1521.github.io/online-cv/;;https://gaohan1234.github.io/;http://www.robinwalters.com;https://blondegeek.github.io/", "dblp": "06/2293-86;;;258/3416;215/4978.html", "google_scholar": "lEmjtfIAAAAJ;;ozQz4CQAAAAJ;fnprJmUAAAAJ;", "orcid": ";;0000-0002-7733-8996;;0000-0001-5581-5344", "linkedin": "rui-ray-wang-41a398149/;;%E6%B6%B5-han-%E9%AB%98-gao-87038a143/;;", "or_profile": "~Rui_Wang11;ehofgard@mit.edu;~Han_Gao3;~Robin_Walters1;~Tess_Smidt1", "aff": "Massachusetts Institute of Technology;;School of Engineering and Applied Sciences, Harvard University;Northeastern University ;Massachusetts Institute of Technology", "aff_domain": "mit.edu;;seas.harvard.edu;northeastern.edu;mit.edu", "position": "Postdoc;;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024discovering,\ntitle={Discovering Symmetry Breaking in Physical Systems with Relaxed Group Convolution},\nauthor={Rui Wang and Elyssa Hofgard and Han Gao and Robin Walters and Tess Smidt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=59oXyDTLJv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7133583, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8494411049000681216&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "mit.edu;;seas.harvard.edu;northeastern.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University;Northeastern University", "aff_unique_dep": ";School of Engineering and Applied Sciences;", "aff_unique_url": "https://web.mit.edu;https://www.harvard.edu;https://www.northeastern.edu", "aff_unique_abbr": "MIT;Harvard;NEU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Vision Transformers as Probabilistic Expansion from Learngene", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34987", "id": "5ExWEazod5", "proceeding": "https://proceedings.mlr.press/v235/wang24cf.html", "pdf": "https://openreview.net/pdf?id=5ExWEazod5", "openreview": "https://openreview.net/forum?id=5ExWEazod5", "author_site": "Qiufeng Wang, Xu Yang, Haokun Chen, Xin Geng", "tldr": "", "abstract": "Deep learning has advanced through the combination of 
large datasets and computational power, leading to the development of extensive pre-trained models like Vision Transformers (ViTs). However, these models often assume a one-size-fits-all utility, lacking the ability to initialize models with elastic scales tailored to the resource constraints of specific downstream tasks. To address these issues, we propose Probabilistic Expansion from LearnGene (PEG) for mixture sampling and elastic initialization of Vision Transformers. Specifically, PEG utilizes a probabilistic mixture approach to sample Multi-Head Self-Attention layers and Feed-Forward Networks from a large ancestry model into a more compact part termed the learngene. Theoretically, we demonstrate that this learngene can approximate the parameter distribution of the original ancestry model, thereby preserving its significant knowledge. Next, PEG expands the sampled learngene through non-linear mapping, enabling the initialization of descendant models with elastic scales to suit various resource constraints. Our extensive experiments demonstrate the effectiveness of PEG, which outperforms traditional initialization strategies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiufeng Wang;Xu Yang;Haokun Chen;Xin Geng", "authorids": "~Qiufeng_Wang3;~Xu_Yang5;~Haokun_Chen4;~Xin_Geng1", "gender": "M;M;M;M", "homepage": "http://palm.seu.edu.cn/homepage/wangqiufeng/demo/index.html;;https://haokunchen0.github.io/;http://palm.seu.edu.cn/xgeng/index.htm", "dblp": ";63/1534-21.html;218/6928;", "google_scholar": "HQYQkTwAAAAJ;SqdxMH0AAAAJ;;ZOCxkIcAAAAJ", "orcid": "0000-0001-7680-6607;0000-0002-8276-2679;;", "linkedin": ";;;", "or_profile": "~Qiufeng_Wang3;~Xu_Yang5;~Haokun_Chen4;~Xin_Geng1", "aff": "Southeast University;Southeast University;Southeast University;Southeast University, China", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "PhD student;Associate Professor;MS student;Professor", "bibtex": "@inproceedings{\nwang2024vision,\ntitle={Vision Transformers as Probabilistic Expansion from Learngene},\nauthor={Qiufeng Wang and Xu Yang and Haokun Chen and Xin Geng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5ExWEazod5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1381833, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16810362996625957802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Kepler codebook", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34986", "id": "5ILo43JIzg", "proceeding": "https://proceedings.mlr.press/v235/lian24a.html", "pdf": "https://openreview.net/pdf?id=5ILo43JIzg", "openreview": "https://openreview.net/forum?id=5ILo43JIzg", "author_site": "Junrong Lian, Ziyue Dong, Pengxu Wei, Wei Ke, Chang Liu, Qixiang Ye, Xiangyang Ji, Liang Lin", "tldr": "", "abstract": "A codebook designed for learning discrete distributions in latent space has demonstrated state-of-the-art results on generation tasks.
This inspires us to explore which codebook distribution is better. Following the spirit of Kepler's Conjecture, we cast codebook training as solving the sphere packing problem and derive a Kepler codebook with a compact and structured distribution to obtain a codebook for image representations. Furthermore, we implement the Kepler codebook training by simply employing this derived distribution as regularization and using the codebook partition method. We conduct extensive experiments to evaluate our trained codebook for image reconstruction and generation on natural and human face datasets, respectively, achieving significant performance improvement. Moreover, our Kepler codebook demonstrates superior performance when evaluated across datasets and even when reconstructing images at different resolutions. Our trained models and source codes will be publicly released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junrong Lian;Ziyue Dong;Pengxu Wei;Wei Ke;Chang Liu;Qixiang Ye;Xiangyang Ji;Liang Lin", "authorids": "~Junrong_Lian1;~Ziyue_Dong1;~Pengxu_Wei1;~Wei_Ke1;~Chang_Liu9;~Qixiang_Ye1;~Xiangyang_Ji1;~Liang_Lin1", "gender": "M;F;;M;M;M;;M", "homepage": "https://github.com/banianrong?tab=repositories;https://github.com/DZY-irene;;https://gr.xjtu.edu.cn/web/wei.ke/homepage;https://www.au.tsinghua.edu.cn/en/info/1096/3484.htm;http://people.ucas.ac.cn/~qxye?language=en;;http://www.linliang.net", "dblp": "384/4156.html;;;52/7566-3;52/5716-42;06/4335;;", "google_scholar": ";;;BENt-uEAAAAJ;vsh1WP4AAAAJ;https://scholar.google.com.hk/citations?user=tjEfgsEAAAAJ;;https://scholar.google.com.hk/citations?user=Nav8m8gAAAAJ", "orcid": ";;;;0000-0001-6747-0646;;;", "linkedin": ";;;;;;;", "or_profile": "~Junrong_Lian1;~Ziyue_Dong1;~Pengxu_Wei1;~Wei_Ke1;~Chang_Liu9;~Qixiang_Ye1;~Xiangyang_Ji1;~Liang_Lin1", "aff": "SUN YAT-SEN UNIVERSITY;Xi'an Jiaotong University;;Xi'an Jiaotong University;Tsinghua University;University of Chinese Academy of Sciences;;SUN YAT-SEN UNIVERSITY", "aff_domain": "sysu.edu.cn;xjtu.edu.cn;;xjtu.edu.cn;tsinghua.edu.cn;ucas.ac.cn;;sysu.edu.cn", "position": "Undergrad student;MS student;;Associate Professor;Postdoc;Full Professor;;Full Professor", "bibtex": "@inproceedings{\nlian2024kepler,\ntitle={Kepler codebook},\nauthor={Junrong Lian and Ziyue Dong and Pengxu Wei and Wei Ke and Chang Liu and Qixiang Ye and Xiangyang Ji and Liang Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5ILo43JIzg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7344070, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2325016765166324453&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 4, "email": "sysu.edu.cn;xjtu.edu.cn;;xjtu.edu.cn;tsinghua.edu.cn;ucas.ac.cn;;sysu.edu.cn", "author_num": 8, "aff_unique_index": "0;1;1;2;3;0", "aff_unique_norm": "Sun Yat-sen University;Xi'an Jiao Tong University;Tsinghua University;University of Chinese Academy of Sciences", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.sysu.edu.cn;https://www.xjtu.edu.cn;https://www.tsinghua.edu.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "SYSU;XJTU;THU;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "The Privacy Power of Correlated Noise in Decentralized Learning", "status": "Poster",
"track": "main", "site": "https://icml.cc/virtual/2024/poster/34985", "id": "5JrlywYHRi", "proceeding": "https://proceedings.mlr.press/v235/allouah24b.html", "pdf": "https://openreview.net/pdf?id=5JrlywYHRi", "openreview": "https://openreview.net/forum?id=5JrlywYHRi", "author_site": "Youssef Allouah, Anastasiia Koloskova, Aymane Firdoussi, Martin Jaggi, Rachid Guerraoui", "tldr": "", "abstract": "Decentralized learning is appealing as it enables the scalable usage of large amounts of distributed data and resources without resorting to any central entity, while promoting privacy since every user minimizes the direct exposure of their data. Yet, without additional precautions, curious users can still leverage models obtained from their peers to violate privacy. In this paper, we propose Decor, a variant of decentralized SGD with differential privacy (DP) guarantees. Essentially, in Decor, users securely exchange randomness seeds in one communication round to generate pairwise-canceling correlated Gaussian noises, which are injected to protect local models at every communication round. We theoretically and empirically show that, for arbitrary connected graphs, Decor matches the central DP optimal privacy-utility trade-off. We do so under SecLDP, our new relaxation of local DP, which protects all user communications against an external eavesdropper and curious users, assuming that every pair of connected users shares a secret, i.e., an information hidden to all others. The main theoretical challenge is to control the accumulation of non-canceling correlated noise due to network sparsity. We also propose a companion SecLDP privacy accountant for public use.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youssef Allouah;Anastasia Koloskova;Aymane El Firdoussi;Martin Jaggi;Rachid Guerraoui", "authorids": "~Youssef_Allouah1;~Anastasia_Koloskova2;~Aymane_El_Firdoussi1;~Martin_Jaggi1;~Rachid_Guerraoui1", "gender": "M;M;M;M;F", "homepage": "https://youssefallouah.com/;;https://mlo.epfl.ch;https://lpdwww.epfl.ch/rachid/;", "dblp": "312/3936;;17/4402;g/RachidGuerraoui;228/9222", "google_scholar": "kVZu88cAAAAJ;;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ;;ldJpvE8AAAAJ", "orcid": "0000-0003-1048-7548;;0000-0003-1579-5558;;", "linkedin": ";aymane-el-firdoussi-4a7544225;;;", "or_profile": "~Youssef_Allouah1;~Aymane_El_Firdoussi1;~Martin_Jaggi1;~Rachid_Guerraoui1;~Anastasiia_Koloskova1", "aff": "Stanford University;T\u00e9l\u00e9com ParisTech;EPFL;;Swiss Federal Institute of Technology Lausanne", "aff_domain": "stanford.edu;telecom-paristech.fr;epfl.ch;;epfl.ch", "position": "Visiting student researcher;MS student;Associate Professor;;PhD student", "bibtex": "@inproceedings{\nallouah2024the,\ntitle={The Privacy Power of Correlated Noise in Decentralized Learning},\nauthor={Youssef Allouah and Anastasia Koloskova and Aymane El Firdoussi and Martin Jaggi and Rachid Guerraoui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5JrlywYHRi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 569108, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=896680821377123126&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 9, "email": "stanford.edu;telecom-paristech.fr;epfl.ch;;epfl.ch", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Stanford University;T\u00e9l\u00e9com 
ParisTech;EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://www.telecom-paristech.fr;https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "Stanford;TP;EPFL;EPFL", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Lausanne", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "United States;France;Switzerland" }, { "title": "Scalable and Flexible Causal Discovery with an Efficient Test for Adjacency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34984", "id": "5M4Qa9AqY7", "proceeding": "https://proceedings.mlr.press/v235/amin24a.html", "pdf": "https://openreview.net/pdf?id=5M4Qa9AqY7", "openreview": "https://openreview.net/forum?id=5M4Qa9AqY7", "author_site": "Alan Amin, Andrew Wilson", "tldr": "", "abstract": "To make accurate predictions, understand mechanisms, and design interventions in systems of many variables, we wish to learn causal graphs from large-scale data. Unfortunately, the space of all possible causal graphs is enormous, so scalably and accurately searching for the best fit to the data is a challenge. In principle, we could substantially decrease the search space, or learn the graph entirely, by testing the conditional independence of variables. However, deciding if two variables are adjacent in a causal graph may require an exponential number of tests. Here we build a scalable and flexible method to evaluate if two variables are adjacent in a causal graph, the Differentiable Adjacency Test (DAT). DAT replaces an exponential number of tests with a provably equivalent relaxed problem. It then solves this problem by training two neural networks. We build a graph learning method based on DAT, DAT-Graph, that can also learn from data with interventions. DAT-Graph can learn graphs of 1000 variables with state-of-the-art accuracy. 
Using the graph learned by DAT-Graph, we also build models that make much more accurate predictions of the effects of interventions on large scale RNA sequencing data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alan Nawzad Amin;Andrew Gordon Wilson", "authorids": "~Alan_Nawzad_Amin1;~Andrew_Gordon_Wilson1", "gender": "M;Not Specified", "homepage": ";https://cims.nyu.edu/~andrewgw", "dblp": "319/5032.html;65/10453", "google_scholar": ";https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ", "orcid": "0000-0002-2656-8273;", "linkedin": ";", "or_profile": "~Alan_Nawzad_Amin1;~Andrew_Gordon_Wilson1", "aff": "New York University;New York University", "aff_domain": "nyu.edu;nyu.edu", "position": "Postdoc;Associate Professor", "bibtex": "@inproceedings{\namin2024scalable,\ntitle={Scalable and Flexible Causal Discovery with an Efficient Test for Adjacency},\nauthor={Alan Nawzad Amin and Andrew Gordon Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5M4Qa9AqY7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1543494, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h2TNk4REYY8J:scholar.google.com/&scioq=Scalable+and+Flexible+Causal+Discovery+with+an+Efficient+Test+for+Adjacency&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "nyu.edu;nyu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Regression with Multi-Expert Deferral", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34983", "id": "5NTTCCO74S", "proceeding": "https://proceedings.mlr.press/v235/mao24d.html", "pdf": "https://openreview.net/pdf?id=5NTTCCO74S", "openreview": "https://openreview.net/forum?id=5NTTCCO74S", "author_site": "Anqi Mao, Mehryar Mohri, Yutao Zhong", "tldr": "", "abstract": "Learning to defer with multiple experts is a framework where the learner can choose to defer the prediction to several experts. While this problem has received significant attention in classification contexts, it presents unique challenges in regression due to the infinite and continuous nature of the label space. In this work, we introduce a novel framework of *regression with deferral*, which involves deferring the prediction to multiple experts. We present a comprehensive analysis for both the single-stage scenario, where there is simultaneous learning of predictor and deferral functions, and the two-stage scenario, which involves a pre-trained predictor with a learned deferral function. We introduce new surrogate loss functions for both scenarios and prove that they are supported by $H$-consistency bounds. These bounds provide consistency guarantees that are stronger than Bayes consistency, as they are non-asymptotic and hypothesis set-specific. Our framework is versatile, applying to multiple experts, accommodating any bounded regression losses, addressing both instance-dependent and label-dependent costs, and supporting both single-stage and two-stage methods. 
Our single-stage formulation subsumes as a special case the recent *regression with abstention* (Cheng et al., 2023) framework, where only a single expert is considered, specifically for the squared loss and a label-independent cost. Minimizing our proposed loss functions directly leads to novel algorithms for regression with deferral. We report the results of extensive experiments showing the effectiveness of our proposed algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anqi Mao;Mehryar Mohri;Yutao Zhong", "authorids": "~Anqi_Mao1;~Mehryar_Mohri2;~Yutao_Zhong1", "gender": "F;M;", "homepage": "https://anqi-mao.github.io;https://cs.nyu.edu/~mohri/;", "dblp": "241/6864;03/5448;51/3178-2", "google_scholar": "nkjIZ-oAAAAJ;ktwwLjsAAAAJ;", "orcid": ";;", "linkedin": ";mehryar-mohri-3737b981/;", "or_profile": "~Anqi_Mao1;~Mehryar_Mohri2;~Yutao_Zhong1", "aff": "Courant Institute of Mathematical Sciences, NYU;Google Research;Google", "aff_domain": "cims.nyu.edu;google.com;google.com", "position": "PhD student;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nmao2024regression,\ntitle={Regression with Multi-Expert Deferral},\nauthor={Anqi Mao and Mehryar Mohri and Yutao Zhong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5NTTCCO74S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 449485, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12222841928940668056&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "cims.nyu.edu;google.com;google.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "New York University;Google", "aff_unique_dep": "Courant Institute of Mathematical Sciences;Google Research", "aff_unique_url": "https://www.courant.nyu.edu;https://research.google", "aff_unique_abbr": "NYU;Google Research", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "New York;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Detecting and Identifying Selection Structure in Sequential Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34982", "id": "5PQhu8flSO", "proceeding": "https://proceedings.mlr.press/v235/zheng24k.html", "pdf": "https://openreview.net/pdf?id=5PQhu8flSO", "openreview": "https://openreview.net/forum?id=5PQhu8flSO", "author_site": "Yujia Zheng, Zeyu Tang, Yiwen Qiu, Bernhard Sch\u00f6lkopf, Kun Zhang", "tldr": "", "abstract": "We argue that the selective inclusion of data points based on latent objectives is common in practical situations, such as music sequences. Since this selection process often distorts statistical analysis, previous work primarily views it as a bias to be corrected and proposes various methods to mitigate its effect. However, while controlling this bias is crucial, selection also offers an opportunity to provide a deeper insight into the hidden generation process, as it is a fundamental mechanism underlying what we observe. In particular, overlooking selection in sequential data can lead to an incomplete or overcomplicated inductive bias in modeling, such as assuming a universal autoregressive structure for all dependencies. Therefore, rather than merely viewing it as a bias, we explore the causal structure of selection in sequential data to delve deeper into the complete causal process. 
Specifically, we show that selection structure is identifiable without any parametric assumptions or interventional experiments. Moreover, even in cases where selection variables coexist with latent confounders, we still establish the nonparametric identifiability under appropriate structural conditions. Meanwhile, we also propose a provably correct algorithm to detect and identify selection structures as well as other types of dependencies. The framework has been validated empirically on both synthetic data and real-world music.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujia Zheng;Zeyu Tang;Yiwen Qiu;Bernhard Sch\u00f6lkopf;Kun Zhang", "authorids": "~Yujia_Zheng1;~Zeyu_Tang1;~Yiwen_Qiu1;~Bernhard_Sch\u00f6lkopf1;~Kun_Zhang1", "gender": "M;;F;;M", "homepage": "https://yjzheng.com;https://zeyu.one;https://evieq01.github.io/evieqiu.github.io/;;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "245/6109-1.html;296/1601-2;159/9832;;96/3115-1", "google_scholar": "https://scholar.google.co.uk/citations?user=ioiW248AAAAJ;https://scholar.google.com/citations?hl=en;tumZYG0AAAAJ;;RGoypN4AAAAJ", "orcid": "0009-0003-5225-6366;0000-0002-4423-4728;;;", "linkedin": ";;;;", "or_profile": "~Yujia_Zheng1;~Zeyu_Tang1;~Yiwen_Qiu1;~Bernhard_Sch\u00f6lkopf1;~Kun_Zhang1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;andrew.cmu.edu;;cmu.edu", "position": "PhD student;PhD student;MS student;;Associate Professor", "bibtex": "@inproceedings{\nzheng2024detecting,\ntitle={Detecting and Identifying Selection Structure in Sequential Data},\nauthor={Yujia Zheng and Zeyu Tang and Yiwen Qiu and Bernhard Sch{\\\"o}lkopf and Kun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5PQhu8flSO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4690805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13368189580321092523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cmu.edu;cmu.edu;andrew.cmu.edu;;cmu.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Leveraging VLM-Based Pipelines to Annotate 3D Objects", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34981", "id": "5Pcl5qOOfL", "proceeding": "https://proceedings.mlr.press/v235/kabra24a.html", "pdf": "https://openreview.net/pdf?id=5Pcl5qOOfL", "openreview": "https://openreview.net/forum?id=5Pcl5qOOfL", "author_site": "Rishabh Kabra, Loic Matthey, Alexander Lerchner, Niloy Mitra", "tldr": "", "abstract": "Pretrained vision language models (VLMs) present an opportunity to caption unlabeled 3D objects at scale. The leading approach to summarize VLM descriptions from different views of an object (Luo et al., 2023) relies on a language model (GPT4) to produce the final output. This text-based aggregation is susceptible to hallucinations as it merges potentially contradictory descriptions. We propose an alternative algorithm to marginalize over factors such as the viewpoint that affect the VLM's response. 
Instead of merging text-only responses, we utilize the VLM's joint image-text likelihoods. We show our probabilistic aggregation is not only more reliable and efficient, but sets the SoTA on inferring object types with respect to human-verified labels. The aggregated annotations are also useful for conditional inference; they improve downstream predictions (e.g., of object material) when the object\u2019s type is specified as an auxiliary text-based input. Such auxiliary inputs allow ablating the contribution of visual reasoning over visionless reasoning in an unsupervised setting. With these supervised and unsupervised evaluations, we show how a VLM-based pipeline can be leveraged to produce reliable annotations for 764K objects from the Objaverse dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rishabh Kabra;Loic Matthey;Alexander Lerchner;Niloy Mitra", "authorids": "~Rishabh_Kabra1;~Loic_Matthey1;~Alexander_Lerchner1;~Niloy_Mitra1", "gender": "M;M;M;M", "homepage": ";https://matthey.me/;;http://www0.cs.ucl.ac.uk/staff/n.mitra/", "dblp": "234/8010;34/3990;21/3421;45/1575", "google_scholar": ";https://scholar.google.co.uk/citations?user=f520HmwAAAAJ;;https://scholar.google.com.tw/citations?user=dPrZJWMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Rishabh_Kabra1;~Loic_Matthey1;~Alexander_Lerchner1;~Niloy_Mitra1", "aff": "University College London, University of London;Google DeepMind;Google DeepMind;Adobe Systems", "aff_domain": "ucl.ac.uk;google.com;deepmind.com;adobe.com", "position": "PhD student;Staff Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\nkabra2024leveraging,\ntitle={Leveraging {VLM}-Based Pipelines to Annotate 3D Objects},\nauthor={Rishabh Kabra and Loic Matthey and Alexander Lerchner and Niloy Mitra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5Pcl5qOOfL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8318010, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7216865921409345370&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "ucl.ac.uk;google.com;deepmind.com;adobe.com", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University College London;Google;Adobe", "aff_unique_dep": ";Google DeepMind;Adobe Systems Incorporated", "aff_unique_url": "https://www.ucl.ac.uk;https://deepmind.com;https://www.adobe.com", "aff_unique_abbr": "UCL;DeepMind;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Tilt your Head: Activating the Hidden Spatial-Invariance of Classifiers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34980", "id": "5PqzKxmfag", "proceeding": "https://proceedings.mlr.press/v235/schmidt24a.html", "pdf": "https://openreview.net/pdf?id=5PqzKxmfag", "openreview": "https://openreview.net/forum?id=5PqzKxmfag", "author_site": "Johann Schmidt, Sebastian Stober", "tldr": "", "abstract": "Deep neural networks are applied in more and more areas of everyday life. However, they still lack essential abilities, such as robustly dealing with spatially transformed input signals. 
Approaches to mitigate this severe robustness issue are limited to two pathways: Either models are implicitly regularised by increased sample variability (data augmentation) or explicitly constrained by hard-coded inductive biases. The limiting factor of the former is the size of the data space, which renders sufficient sample coverage intractable. The latter is limited by the engineering effort required to develop such inductive biases for every possible scenario. Instead, we take inspiration from human behaviour, where percepts are modified by mental or physical actions during inference. We propose a novel technique to emulate such an inference process for neural nets. This is achieved by traversing a sparsified inverse transformation tree during inference using parallel energy-based evaluations. Our proposed inference algorithm, called Inverse Transformation Search (ITS), is model-agnostic and equips the model with zero-shot pseudo-invariance to spatially transformed inputs. We evaluated our method on several benchmark datasets, including a synthesised ImageNet test set. ITS outperforms the utilised baselines on all zero-shot test scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johann Schmidt;Sebastian Stober", "authorids": "~Johann_Schmidt1;~Sebastian_Stober1", "gender": "M;M", "homepage": "https://johann-schmidt.com/;https://ai.ovgu.de", "dblp": ";73/650", "google_scholar": ";https://scholar.google.de/citations?user=OPyztE0AAAAJ", "orcid": "0000-0002-9700-3069;", "linkedin": ";", "or_profile": "~Johann_Schmidt1;~Sebastian_Stober1", "aff": "Otto-von-Guericke-Universit\u00e4t Magdeburg;Otto-von-Guericke-University Magdeburg", "aff_domain": "dke.ovgu.de;ovgu.de", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nschmidt2024tilt,\ntitle={Tilt your Head: Activating the Hidden Spatial-Invariance of Classifiers},\nauthor={Johann Schmidt and Sebastian Stober},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5PqzKxmfag}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7724611, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:l909EWpW4dsJ:scholar.google.com/&scioq=Tilt+your+Head:+Activating+the+Hidden+Spatial-Invariance+of+Classifiers&hl=en&as_sdt=0,21", "gs_version_total": 6, "email": "dke.ovgu.de;ovgu.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Otto-von-Guericke-Universit\u00e4t;Otto-von-Guericke-University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ovgu.de;https://www.ovgu.de", "aff_unique_abbr": "OVGU;OVGU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Magdeburg", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Diversified Batch Selection for Training Acceleration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34979", "id": "5QWKec0eDF", "proceeding": "https://proceedings.mlr.press/v235/hong24c.html", "pdf": "https://openreview.net/pdf?id=5QWKec0eDF", "openreview": "https://openreview.net/forum?id=5QWKec0eDF", "author_site": "Feng Hong, Yueming LYU, Jiangchao Yao, Ya Zhang, Ivor Tsang, Yanfeng Wang", "tldr": "", "abstract": "The remarkable success of modern machine learning models on large datasets often demands extensive training time and resource consumption. 
To save cost, a prevalent research line, known as online batch selection, explores selecting informative subsets during the training process. Although recent efforts achieve advancements by measuring the impact of each sample on generalization, their reliance on additional reference models inherently limits their practical application when no such ideal models are available. On the other hand, vanilla reference-model-free methods involve independently scoring and selecting data in a sample-wise manner, which sacrifices diversity and induces redundancy. To tackle this dilemma, we propose Diversified Batch Selection (DivBS), which is reference-model-free and can efficiently select diverse and representative samples. Specifically, we define a novel selection objective that measures the group-wise orthogonalized representativeness to combat the redundancy issue of previous sample-wise criteria, and provide a principled selection-efficient realization. Extensive experiments across various tasks demonstrate the significant superiority of DivBS in the performance-speedup trade-off. The code is publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feng Hong;Yueming Lyu;Jiangchao Yao;Ya Zhang;Ivor Tsang;Yanfeng Wang", "authorids": "~Feng_Hong1;~Yueming_Lyu1;~Jiangchao_Yao1;~Ya_Zhang1;~Ivor_Tsang1;~Yanfeng_Wang1", "gender": "M;M;M;F;M;M", "homepage": ";https://yueminglyu.github.io/;https://sunarker.github.io/;https://annzhanglion.github.io/;https://cmic.sjtu.edu.cn/wangyanfeng/;https://www.a-star.edu.sg/cfar/about-cfar/management/prof-ivor-tsang", "dblp": "68/1260-4;;166/5900;85/3714-2;55/5407-1.html;35/5873", "google_scholar": "DCTAaNQAAAAJ;uQXB6-oAAAAJ;w8oDh9QAAAAJ;pbjw9sMAAAAJ;https://scholar.google.com/citations?hl=zh-CN;rJMOlVsAAAAJ", "orcid": ";;;0000-0002-5390-9053;0000-0002-3196-2347;", "linkedin": ";;;;;", "or_profile": "~Feng_Hong1;~Yueming_Lyu1;~Jiangchao_Yao1;~Ya_Zhang1;~Yanfeng_Wang1;~Ivor_W_Tsang1", "aff": "Shanghai Jiaotong University;Agency for Science, Technology and Research (A*STAR);Shanghai Artificial Intelligence Laboratory;Shanghai Jiaotong University;Shanghai Jiaotong University;A*STAR", "aff_domain": "sjtu.edu.cn;astar.edu.sg;pjlab.org.cn;sjtu.edu.cn;sjtu.edu.cn;cfar.a-star.edu.sg", "position": "PhD student;Researcher;Researcher;Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nhong2024diversified,\ntitle={Diversified Batch Selection for Training Acceleration},\nauthor={Feng Hong and Yueming Lyu and Jiangchao Yao and Ya Zhang and Ivor Tsang and Yanfeng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5QWKec0eDF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 818006, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=806316857526918325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sjtu.edu.cn;astar.edu.sg;pjlab.org.cn;sjtu.edu.cn;sjtu.edu.cn;cfar.a-star.edu.sg", "author_num": 6, "aff_unique_index": "0;1;2;0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Agency for Science, Technology and Research;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.a-star.edu.sg;http://www.shailab.org/", "aff_unique_abbr": "SJTU;A*STAR;Shanghai AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;0;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Provably Efficient Partially Observable Risk-sensitive Reinforcement Learning with Hindsight Observation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34978", "id": "5S8ukkEQr2", "proceeding": "https://proceedings.mlr.press/v235/zhang24g.html", "pdf": "https://openreview.net/pdf?id=5S8ukkEQr2", "openreview": "https://openreview.net/forum?id=5S8ukkEQr2", "author_site": "Tonghe Zhang, Yu Chen, Longbo Huang", "tldr": "", "abstract": "This work pioneers regret analysis of risk-sensitive reinforcement learning in partially observable environments with hindsight observation, addressing a gap in theoretical exploration. We introduce a novel formulation that integrates hindsight observations into a Partially Observable Markov Decision Process (POMDP) framework, where the goal is to optimize accumulated reward under the entropic risk measure. We develop the first provably efficient RL algorithm tailored for this setting. We also prove by rigorous analysis that our algorithm achieves polynomial regret $\\tilde{O}\\left(\\frac{e^{|{\\gamma}|H}-1}{|{\\gamma}|H}H^2\\sqrt{KHS^2OA}\\right)$, which outperforms or matches existing upper bounds when the model degenerates to risk-neutral or fully observable settings. We adopt the method of change-of-measure and develop a novel analytical tool of beta vectors to streamline mathematical derivations. These techniques are of particular interest to the theoretical study of reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tonghe Zhang;Yu Chen;Longbo Huang", "authorids": "~Tonghe_Zhang1;~Yu_Chen19;~Longbo_Huang2", "gender": "M;M;M", "homepage": "https://tonghe-zhang.github.io/;https://custyhs.github.io/;http://people.iiis.tsinghua.edu.cn/~huang/", "dblp": ";87/1254-74;79/7077", "google_scholar": "Tk747_gAAAAJ;rJ6Ipa0AAAAJ;", "orcid": ";0009-0006-9503-6613;", "linkedin": ";;", "or_profile": "~Tonghe_Zhang1;~Yu_Chen19;~Longbo_Huang2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Undergrad student;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2024provably,\ntitle={Provably Efficient Partially Observable Risk-sensitive Reinforcement Learning with Hindsight Observation},\nauthor={Tonghe Zhang and Yu Chen and Longbo Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5S8ukkEQr2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 879579, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10592898070058203245&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Position: Data-driven Discovery with Large Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34977", "id": "5SpjhZNXtt", "proceeding": "https://proceedings.mlr.press/v235/majumder24a.html", "pdf": 
"https://openreview.net/pdf?id=5SpjhZNXtt", "openreview": "https://openreview.net/forum?id=5SpjhZNXtt", "author_site": "Bodhisattwa Prasad Majumder, Harshit Surana, Dhruv Agarwal, Sanchaita Hazra, Ashish Sabharwal, Peter Clark", "tldr": "", "abstract": "With the accumulation of data at an unprecedented rate, its potential to fuel scientific discovery is growing exponentially. This position paper urges the Machine Learning (ML) community to exploit the capabilities of large generative models (LGMs) to develop automated systems for end-to-end data-driven discovery\u2014a paradigm encompassing the search and verification of hypotheses purely from a set of provided datasets, without the need for additional data collection or physical experiments. We first outline several desiderata for an ideal data-driven discovery system. Then, through DataVoyager, a proof-of-concept utilizing GPT-4, we demonstrate how LGMs fulfill several of these desiderata\u2014a feat previously unattainable\u2014while also highlighting important limitations in the current system that open up opportunities for novel ML research. We contend that achieving accurate, reliable, and robust end-to-end discovery systems solely through the current capabilities of LGMs is challenging. We instead advocate for fail-proof tool integration, along with active user moderation through feedback mechanisms, to foster data-driven scientific discoveries with efficiency and reproducibility.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bodhisattwa Prasad Majumder;Harshit Surana;Dhruv Agarwal;Sanchaita Hazra;Ashish Sabharwal;Peter Clark", "authorids": "~Bodhisattwa_Prasad_Majumder1;~Harshit_Surana1;~Dhruv_Agarwal2;~Sanchaita_Hazra1;~Ashish_Sabharwal1;~Peter_Clark1", "gender": ";M;M;F;M;M", "homepage": "https://www.majumderb.com/;https://harshitsurana.com/;https://people.cs.umass.edu/~dagarwal/;https://sanchaitahazra.com/;;https://allenai.org/team/peterc", "dblp": "138/6177;29/8163.html;301/7894;;13/154;34/1184", "google_scholar": "cEM1a5gAAAAJ;xl24xGEAAAAJ;7-AxhB4AAAAJ;;7VspfeAAAAAJ;o-5vyEsAAAAJ", "orcid": ";;0000-0001-7258-5130;;;", "linkedin": ";surana/;dhdhagar/;;ashish-sabharwal-82a2b661;peter-clark-a8b556/", "or_profile": "~Bodhisattwa_Prasad_Majumder1;~Harshit_Surana1;~Dhruv_Agarwal2;~Sanchaita_Hazra1;~Ashish_Sabharwal1;~Peter_Clark1", "aff": "Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence;Amazon;University of Utah;Allen Institute for AI;Allen Institute for Artificial Intelligence", "aff_domain": "allenai.org;allenai.org;amazon.com;utah.edu;allenai.org;allenai.org", "position": "Researcher;Researcher;Intern;PhD student;Principal Researcher;Senior Research Manager", "bibtex": "@inproceedings{\nmajumder2024position,\ntitle={Position: Data-driven Discovery with Large Generative Models},\nauthor={Bodhisattwa Prasad Majumder and Harshit Surana and Dhruv Agarwal and Sanchaita Hazra and Ashish Sabharwal and Peter Clark},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5SpjhZNXtt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3998342, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7543414478050680263&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 5, "email": "allenai.org;allenai.org;amazon.com;utah.edu;allenai.org;allenai.org", "author_num": 6, "aff_unique_index": "0;0;1;2;3;0", 
"aff_unique_norm": "Allen Institute for Artificial Intelligence;Amazon;University of Utah;Allen Institute for AI", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://allenai.org;https://www.amazon.com;https://www.utah.edu;https://allenai.org", "aff_unique_abbr": "AI2;Amazon;Utah;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Iterative Search Attribution for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34976", "id": "5ToHnqYxjB", "proceeding": "https://proceedings.mlr.press/v235/zhu24a.html", "pdf": "https://openreview.net/pdf?id=5ToHnqYxjB", "openreview": "https://openreview.net/forum?id=5ToHnqYxjB", "author_site": "Zhiyu Zhu, Huaming Chen, Xinyi Wang, Jiayu Zhang, Zhibo Jin, Minhui Xue, Jun Shen", "tldr": "", "abstract": "Deep neural networks (DNNs) have achieved state-of-the-art performance across various applications. However, ensuring the reliability and trustworthiness of DNNs requires enhanced interpretability of model inputs and outputs. As an effective means of Explainable Artificial Intelligence (XAI) research, the interpretability of existing attribution algorithms varies depending on the choice of reference point, the quality of adversarial samples, or the applicability of gradient constraints in specific tasks. To thoroughly explore the attribution integration paths, in this paper, inspired by the iterative generation of high-quality samples in the diffusion model, we propose an Iterative Search Attribution (ISA) method. To enhance attribution accuracy, ISA distinguishes the importance of samples during gradient ascent and descent, while clipping the relatively unimportant features in the model. Specifically, we introduce a scale parameter during the iterative process to ensure the features in next iteration are always more significant than those in current iteration. Comprehensive experimental results show that our method has superior interpretability in image recognition tasks compared with state-of-the-art baselines. 
Our code is available at: https://github.com/LMBTough/ISA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyu Zhu;Huaming Chen;Xinyi Wang;Jiayu Zhang;Zhibo Jin;Jason Xue;Jun Shen", "authorids": "~Zhiyu_Zhu2;~Huaming_Chen1;~Xinyi_Wang9;~Jiayu_Zhang1;~Zhibo_Jin1;~Jason_Xue1;~Jun_Shen3", "gender": ";;M;M;M;;M", "homepage": ";;https://github.com/noctisluna;https://github.com/KxPlaug;https://github.com/Davidjinzb;;https://scholars.uow.edu.au/display/jun_shen", "dblp": ";;;;;;48/5700-1", "google_scholar": ";;;;;;Bf6gvGkAAAAJ", "orcid": ";;;;;;0000-0002-9403-7140", "linkedin": ";;;;;;jun-shen-him-his-37b95337/", "or_profile": "~Zhiyu_Zhu2;~Huaming_Chen1;~Xinyi_Wang9;~Jiayu_Zhang1;~Zhibo_Jin1;~Jason_Xue1;~Jun_Shen3", "aff": ";;Universiti Malaya;Suzhou Yierqi;University of Sydney;;University of Wollongong", "aff_domain": ";;um.edu.my;szyierqi.com;usyd.edu.au;;uow.edu.au", "position": ";;MS student;Researcher;MS student;;Full Professor", "bibtex": "@inproceedings{\nzhu2024iterative,\ntitle={Iterative Search Attribution for Deep Neural Networks},\nauthor={Zhiyu Zhu and Huaming Chen and Xinyi Wang and Jiayu Zhang and Zhibo Jin and Jason Xue and Jun Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5ToHnqYxjB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3650552, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16203848881917102408&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5, "email": ";;um.edu.my;szyierqi.com;usyd.edu.au;;uow.edu.au", "author_num": 7, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Universiti Malaya;Suzhou Yierqi;University of Sydney;University of Wollongong", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.malaya.edu.my;;https://www.sydney.edu.au;https://www.uow.edu.au", "aff_unique_abbr": "UM;;USYD;UOW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Malaysia;China;Australia" }, { "title": "IW-GAE: Importance weighted group accuracy estimation for improved calibration and model selection in unsupervised domain adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34975", "id": "5WEIVj98Ju", "proceeding": "https://proceedings.mlr.press/v235/joo24a.html", "pdf": "https://openreview.net/pdf?id=5WEIVj98Ju", "openreview": "https://openreview.net/forum?id=5WEIVj98Ju", "author_site": "Taejong Joo, Diego Klabjan", "tldr": "", "abstract": "Distribution shifts pose significant challenges for model calibration and model selection tasks in the unsupervised domain adaptation problem---a scenario where the goal is to perform well in a distribution shifted domain without labels. In this work, we tackle difficulties coming from distribution shifts by developing a novel importance weighted group accuracy estimator. Specifically, we present a new perspective of addressing the model calibration and model selection tasks by estimating the group accuracy. Then, we formulate an optimization problem for finding an importance weight that leads to an accurate group accuracy estimation with theoretical analyses. 
Our extensive experiments show that our approach improves state-of-the-art performance by 22% in the model calibration task and 14% in the model selection task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taejong Joo;Diego Klabjan", "authorids": "~Taejong_Joo1;~Diego_Klabjan1", "gender": "M;M", "homepage": "https://tjoo512.github.io/;http://dynresmanagement.com/index.html", "dblp": "237/3935;17/105", "google_scholar": "ESo1UqMAAAAJ;TaQZ_VUAAAAJ", "orcid": ";0000-0003-4213-9281", "linkedin": ";diegoklabjan", "or_profile": "~Taejong_Joo1;~Diego_Klabjan1", "aff": "Northwestern University;Northwestern University", "aff_domain": "u.northwestern.edu;u.northwestern.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\njoo2024iwgae,\ntitle={{IW}-{GAE}: Importance weighted group accuracy estimation for improved calibration and model selection in unsupervised domain adaptation},\nauthor={Taejong Joo and Diego Klabjan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5WEIVj98Ju}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1540217, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=818625207292464182&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "u.northwestern.edu;u.northwestern.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Robust and Conjugate Gaussian Process Regression", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34974", "id": "5WnKLIAX4q", "proceeding": "https://proceedings.mlr.press/v235/altamirano24a.html", "pdf": "https://openreview.net/pdf?id=5WnKLIAX4q", "openreview": "https://openreview.net/forum?id=5WnKLIAX4q", "author_site": "Matias Altamirano, Francois-Xavier Briol, Jeremias Knoblauch", "tldr": "", "abstract": "To enable closed-form conditioning, a common assumption in Gaussian process (GP) regression is independent and identically distributed Gaussian observation noise. This strong and simplistic assumption is often violated in practice, which leads to unreliable inferences and uncertainty quantification. Unfortunately, existing methods for robustifying GPs break closed-form conditioning, which makes them less attractive to practitioners and significantly more computationally expensive. In this paper, we demonstrate how to perform provably robust and conjugate Gaussian process (RCGP) regression at virtually no additional cost using generalised Bayesian inference. RCGP is particularly versatile as it enables exact conjugate closed-form updates in all settings where standard GPs admit them. 
To demonstrate its strong empirical performance, we deploy RCGP for problems ranging from Bayesian optimisation to sparse variational Gaussian processes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matias Altamirano;Francois-Xavier Briol;Jeremias Knoblauch", "authorids": "~Matias_Altamirano2;~Francois-Xavier_Briol1;~Jeremias_Knoblauch1", "gender": "M;M;M", "homepage": "https://maltamiranomontero.github.io/;https://fxbriol.github.io;https://jeremiasknoblauch.github.io/", "dblp": ";https://dblp.uni-trier.de/pid/173/4982;220/5462", "google_scholar": "qxVZ-mIAAAAJ;https://scholar.google.co.uk/citations?user=yLBYtAwAAAAJ;https://scholar.google.co.uk/citations?user=4TPsxlsAAAAJ", "orcid": ";0000-0002-0181-2559;", "linkedin": ";;", "or_profile": "~Matias_Altamirano2;~Francois-Xavier_Briol1;~Jeremias_Knoblauch1", "aff": "University College London, University of London;University College London, University of London;", "aff_domain": "ucl.ac.uk;ucl.ac.uk;", "position": "PhD student;Associate Professor;", "bibtex": "@inproceedings{\naltamirano2024robust,\ntitle={Robust and Conjugate Gaussian Process Regression},\nauthor={Matias Altamirano and Francois-Xavier Briol and Jeremias Knoblauch},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5WnKLIAX4q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1085085, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11216480856698165278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ucl.ac.uk;ucl.ac.uk;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Explorations of Self-Repair in Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34973", "id": "5ZwEifshyo", "proceeding": "https://proceedings.mlr.press/v235/rushing24a.html", "pdf": "https://openreview.net/pdf?id=5ZwEifshyo", "openreview": "https://openreview.net/forum?id=5ZwEifshyo", "author_site": "Cody Rushing, Neel Nanda", "tldr": "", "abstract": "Prior interpretability research studying narrow distributions has preliminarily identified self-repair, a phenomenon where, if components in large language models are ablated, later components will change their behavior to compensate. Our work builds on this past literature, demonstrating that self-repair exists across a variety of model families and sizes when ablating individual attention heads on the full training distribution. We further show that on the full training distribution self-repair is imperfect, as the original direct effect of the head is not fully restored, and noisy, since the degree of self-repair varies significantly across different prompts (sometimes overcorrecting beyond the original effect). We highlight two different mechanisms that contribute to self-repair, including changes in the final LayerNorm scaling factor and sparse sets of neurons implementing Anti-Erasure. 
We additionally discuss the implications of these results for interpretability practitioners and close with a more speculative discussion on the mystery of why self-repair occurs in these models at all, highlighting evidence for the Iterative Inference hypothesis in language models, a framework that predicts self-repair.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cody Rushing;Neel Nanda", "authorids": "~Cody_Rushing1;~Neel_Nanda1", "gender": "M;M", "homepage": "https://starship006.github.io/;https://neelnanda.io", "dblp": ";285/6389", "google_scholar": "t5fY7ysAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": "codyrushing/;https://linkedin.com/in/neel-nanda-993580151", "or_profile": "~Cody_Rushing1;~Neel_Nanda1", "aff": "University of Texas at Austin;Google DeepMind", "aff_domain": "utexas.edu;deepmind.com", "position": "Undergrad student;Researcher", "bibtex": "@inproceedings{\nrushing2024explorations,\ntitle={Explorations of Self-Repair in Language Models},\nauthor={Cody Rushing and Neel Nanda},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5ZwEifshyo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2852547, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15291384838409907850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "utexas.edu;deepmind.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Texas at Austin;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.utexas.edu;https://deepmind.com", "aff_unique_abbr": "UT Austin;DeepMind", "aff_campus_unique_index": "0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Partial Multi-View Multi-Label Classification via Semantic Invariance Learning and Prototype Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34972", "id": "5ap1MmUqO6", "proceeding": "https://proceedings.mlr.press/v235/liu24bv.html", "pdf": "https://openreview.net/pdf?id=5ap1MmUqO6", "openreview": "https://openreview.net/forum?id=5ap1MmUqO6", "author_site": "Chengliang Liu, Gehui Xu, Jie Wen, Yabo Liu, Chao Huang, Yong Xu", "tldr": "", "abstract": "The difficulty of partial multi-view multi-label learning lies in coupling the consensus of multi-view data with the task relevance of multi-label classification, under the condition that some views and labels are unavailable. In this paper, we seek to compress cross-view representation to maximize the proportion of shared information so as to better predict semantic tags. To achieve this, we establish a model consistent with the information bottleneck theory for learning cross-view shared representation, minimizing non-shared information while maintaining feature validity to help increase the purity of task-relevant information. Furthermore, we model multi-label prototype instances in the latent space and learn label correlations in a data-driven manner. Our method outperforms existing state-of-the-art methods on multiple public datasets while exhibiting good compatibility with both partial and complete data. 
Finally, we experimentally reveal the importance of condensing shared information under the premise of information balancing in the process of multi-view information encoding and compression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengliang Liu;Gehui Xu;Jie Wen;Yabo Liu;Chao Huang;Yong Xu", "authorids": "~Chengliang_Liu1;~Gehui_Xu2;~Jie_Wen1;~Yabo_Liu1;~Chao_Huang16;~Yong_Xu9", "gender": ";M;;M;M;M", "homepage": ";https://github.com/tkkxgh/tkkxgh.github.io.git;;;;https://www.yongxu.org", "dblp": ";328/7509;;08/7626;;", "google_scholar": ";;;8AyAPDEAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=zOVgYQYAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Chengliang_Liu1;~Gehui_Xu2;~Jie_Wen1;~Yabo_Liu1;~Chao_Huang16;~Yong_Xu9", "aff": ";Harbin Institute of Technology;;Harbin Institute of Technology;University of Macau;Harbin Institute of Technology", "aff_domain": ";hit.edu.cn;;hit.edu.cn;um.edu.mo;hit.edu.cn", "position": ";MS student;;PhD student;Intern;Full Professor", "bibtex": "@inproceedings{\nliu2024partial,\ntitle={Partial Multi-View Multi-Label Classification via Semantic Invariance Learning and Prototype Modeling},\nauthor={Chengliang Liu and Gehui Xu and Jie Wen and Yabo Liu and Chao Huang and Yong Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5ap1MmUqO6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1051189, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16633965298714994062&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": ";hit.edu.cn;;hit.edu.cn;um.edu.mo;hit.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Harbin Institute of Technology;University of Macau", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;https://www.um.edu.mo", "aff_unique_abbr": "HIT;UM", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Harbin;Macau SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "On the Maximal Local Disparity of Fairness-Aware Classifiers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34971", "id": "5cm2jGct2W", "proceeding": "https://proceedings.mlr.press/v235/jin24c.html", "pdf": "https://openreview.net/pdf?id=5cm2jGct2W", "openreview": "https://openreview.net/forum?id=5cm2jGct2W", "author_site": "Jinqiu Jin, Haoxuan Li, Fuli Feng", "tldr": "", "abstract": "Fairness has become a crucial aspect in the development of trustworthy machine learning algorithms. Current fairness metrics to measure the violation of demographic parity have the following drawbacks: (i) the *average difference* of model predictions on two groups cannot reflect their *distribution disparity*, and (ii) the *overall* calculation along all possible predictions conceals the *extreme local disparity* at or around certain predictions. In this work, we propose a novel fairness metric called **M**aximal **C**umulative ratio **D**isparity along varying **P**redictions' neighborhood (MCDP), for measuring the maximal local disparity of fairness-aware classifiers. To accurately and efficiently calculate the MCDP, we develop a provably exact and an approximate calculation algorithm that greatly reduces the computational complexity with low estimation error. 
We further propose a bi-level optimization algorithm using a differentiable approximation of the MCDP for improving the algorithmic fairness. Extensive experiments on both tabular and image datasets validate that our fair training algorithm can achieve superior fairness-accuracy trade-offs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinqiu Jin;Haoxuan Li;Fuli Feng", "authorids": "~Jinqiu_Jin1;~Haoxuan_Li6;~Fuli_Feng1", "gender": "M;M;M", "homepage": "https://github.com/mitao-cat;https://haoxuanli-pku.github.io/;https://fulifeng.github.io/", "dblp": ";145/4965-1.html;183/9198", "google_scholar": ";gtDqiucAAAAJ;https://scholar.google.com.sg/citations?user=QePM4u8AAAAJ", "orcid": ";0000-0003-3620-3769;0000-0002-5828-9842", "linkedin": ";;", "or_profile": "~Jinqiu_Jin1;~Haoxuan_Li6;~Fuli_Feng1", "aff": "University of Science and Technology of China;Peking University;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;pku.edu.cn;ustc.edu.cn", "position": "MS student;PhD student;Full Professor", "bibtex": "@inproceedings{\njin2024on,\ntitle={On the Maximal Local Disparity of Fairness-Aware Classifiers},\nauthor={Jinqiu Jin and Haoxuan Li and Fuli Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5cm2jGct2W}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2140818, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10179609291806263446&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;pku.edu.cn;ustc.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Science and Technology of China;Peking University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "USTC;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Unveiling the Dynamics of Information Interplay in Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34970", "id": "5hfvLBgnNE", "proceeding": "https://proceedings.mlr.press/v235/song24g.html", "pdf": "https://openreview.net/pdf?id=5hfvLBgnNE", "openreview": "https://openreview.net/forum?id=5hfvLBgnNE", "author_site": "Kun Song, Zhiquan Tan, Bochao Zou, Huimin Ma, Weiran Huang", "tldr": "", "abstract": "In this paper, we use matrix information theory as an analytical tool to analyze the dynamics of the information interplay between data representations and classification head vectors in the supervised learning process. Specifically, inspired by the theory of Neural Collapse, we introduce matrix mutual information ratio (MIR) and matrix entropy difference ratio (HDR) to assess the interactions of data representation and class classification heads in supervised learning, and we determine the theoretical optimal values for MIR and HDR when Neural Collapse happens. Our experiments show that MIR and HDR can effectively explain many phenomena occurring in neural networks, for example, the standard supervised training dynamics, linear mode connectivity, and the performance of label smoothing and pruning. 
Additionally, we use MIR and HDR to gain insights into the dynamics of grokking, which is an intriguing phenomenon observed in supervised training, where the model demonstrates generalization capabilities long after it has learned to fit the training data. Furthermore, we introduce MIR and HDR as loss terms in supervised and semi-supervised learning to optimize the information interactions among samples and classification heads. The empirical results provide evidence of the method's effectiveness, demonstrating that the utilization of MIR and HDR not only aids in comprehending the dynamics throughout the training process but can also enhance the training procedure itself.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kun Song;Zhiquan Tan;Bochao Zou;Huimin Ma;Weiran Huang", "authorids": "~Kun_Song3;~Zhiquan_Tan1;~Bochao_Zou1;~Huimin_Ma1;~Weiran_Huang1", "gender": "M;M;M;F;M", "homepage": "https://www.sanker.plus;;;http://server.3dimagelab.cn:5000;https://www.weiranhuang.com", "dblp": "96/855-4;326/0177;197/9774;69/7694-1;170/0073-1", "google_scholar": "mUDDC_4AAAAJ;;Cb29A3cAAAAJ;32hwVLEAAAAJ;AjJ2rf8AAAAJ", "orcid": "0009-0003-0004-3780;;;;", "linkedin": ";https://www.linkedin.cn/incareer/in/ACoAAC1A8_QBFX8OlchWmVI_pNXN4zm_t6vPKCs;;;", "or_profile": "~Kun_Song3;~Zhiquan_Tan1;~Bochao_Zou1;~Huimin_Ma1;~Weiran_Huang1", "aff": "University of Science and Technology Beijing;Tsinghua University;University of Science and Technology Beijing;University of Science and Technology Beijing;Shanghai AI Laboratory", "aff_domain": "ustb.edu.cn;tsinghua.edu.cn;ustb.edu.cn;ustb.edu.cn;pjlab.org.cn", "position": "PhD student;PhD student;Lecturer;Full Professor;Consultant", "bibtex": "@inproceedings{\nsong2024unveiling,\ntitle={Unveiling the Dynamics of Information Interplay in Supervised Learning},\nauthor={Kun Song and Zhiquan Tan and Bochao Zou and Huimin Ma and Weiran Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5hfvLBgnNE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 513106, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5923607887621411370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ustb.edu.cn;tsinghua.edu.cn;ustb.edu.cn;ustb.edu.cn;pjlab.org.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "University of Science and Technology Beijing;Tsinghua University;Shanghai AI Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustb.edu.cn;https://www.tsinghua.edu.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "USTB;THU;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Distributed Bilevel Optimization with Communication Compression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34969", "id": "5j7Lq2ASiU", "proceeding": "https://proceedings.mlr.press/v235/he24d.html", "pdf": "https://openreview.net/pdf?id=5j7Lq2ASiU", "openreview": "https://openreview.net/forum?id=5j7Lq2ASiU", "author_site": "Yutong He, Jie Hu, Xinmeng Huang, Songtao Lu, Bin Wang, Kun Yuan", "tldr": "", "abstract": "Stochastic bilevel optimization tackles challenges involving nested optimization structures. Its fast-growing scale necessitates efficient distributed algorithms. 
In conventional distributed bilevel methods, each worker must transmit full-dimensional stochastic gradients to the server every iteration, leading to significant communication overhead and thus hindering efficiency and scalability. To resolve this issue, we introduce the **first** family of distributed bilevel algorithms with communication compression. The primary challenge in algorithmic development is mitigating bias in hypergradient estimation caused by the nested structure. We first propose C-SOBA, a simple yet effective approach with unbiased compression and provable linear speedup convergence. However, it relies on strong assumptions on bounded gradients. To address this limitation, we explore the use of moving average, error feedback, and multi-step compression in bilevel optimization, resulting in a series of advanced algorithms with relaxed assumptions and improved convergence properties. Numerical experiments show that our compressed bilevel algorithms can achieve $10\\times$ reduction in communication overhead without severe performance degradation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yutong He;Jie Hu;Xinmeng Huang;Songtao Lu;Bin Wang;Kun Yuan", "authorids": "~Yutong_He2;~Jie_Hu12;~Xinmeng_Huang1;~Songtao_Lu1;~Bin_Wang35;~Kun_Yuan4", "gender": "M;F;M;M;;", "homepage": "https://www.researchgate.net/profile/Yutong-He-16;https://jiehu-jessica.github.io/;;https://songtaogithub.github.io/;;", "dblp": ";;256/1617;05/2887;;", "google_scholar": "XsXjeysAAAAJ;;vM2nHxEAAAAJ;LRsjX7kAAAAJ;;", "orcid": "0009-0002-5078-6454;;;;;", "linkedin": ";;xinmeng-huang-8032221b3/;;;", "or_profile": "~Yutong_He2;~Jie_Hu12;~Xinmeng_Huang1;~Songtao_Lu1;~Bin_Wang35;~Kun_Yuan4", "aff": "Alibaba Group;Peking University;University of Pennsylvania;IBM Thomas J. 
Watson Research Center;;", "aff_domain": "alibaba-inc.com;stu.pku.edu.cn;upenn.edu;ibm.com;;", "position": "Intern;PhD student;PhD student;Researcher;;", "bibtex": "@inproceedings{\nhe2024distributed,\ntitle={Distributed Bilevel Optimization with Communication Compression},\nauthor={Yutong He and Jie Hu and Xinmeng Huang and Songtao Lu and Bin Wang and Kun Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5j7Lq2ASiU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1034028, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6301009907277508834&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "alibaba-inc.com;stu.pku.edu.cn;upenn.edu;ibm.com;;", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Alibaba Group;Peking University;University of Pennsylvania;IBM", "aff_unique_dep": ";;;Research", "aff_unique_url": "https://www.alibaba.com;http://www.pku.edu.cn;https://www.upenn.edu;https://www.ibm.com/research", "aff_unique_abbr": "Alibaba;Peking U;UPenn;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Recurrent Distance Filtering for Graph Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34968", "id": "5kGfm3Pa41", "proceeding": "https://proceedings.mlr.press/v235/ding24d.html", "pdf": "https://openreview.net/pdf?id=5kGfm3Pa41", "openreview": "https://openreview.net/forum?id=5kGfm3Pa41", "author_site": "Yuhui Ding, Antonio Orvieto, Bobby He, Thomas Hofmann", "tldr": "", "abstract": "Graph neural networks based on iterative one-hop message passing have been shown to struggle in harnessing the information from distant nodes effectively. Conversely, graph transformers allow each node to attend to all other nodes directly, but lack graph inductive bias and have to rely on ad-hoc positional encoding. In this paper, we propose a new architecture to reconcile these challenges. Our approach stems from the recent breakthroughs in long-range modeling provided by deep state-space models: for a given target node, our model aggregates other nodes by their shortest distances to the target and uses a linear RNN to encode the sequence of hop representations. The linear RNN is parameterized in a particular diagonal form for stable long-range signal propagation and is theoretically expressive enough to encode the neighborhood hierarchy. With no need for positional encoding, we empirically show that the performance of our model is comparable to or better than that of state-of-the-art graph transformers on various benchmarks, with a significantly reduced computational cost. 
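A minimal sketch of the hop-aggregation idea in the "Recurrent Distance Filtering" entry above: group nodes by shortest-path distance to a target node, mean-pool features within each hop, and scan the hop sequence with a diagonal linear RNN. Here `lam` and `B` stand in for learned recurrence parameters; this is not the released GRED code.

```python
import networkx as nx
import numpy as np

def hop_sequence(G, node_feats, target, k_max):
    # Group nodes by shortest-path distance to `target` (nodes are assumed to
    # be integer indices into the (n, d) feature array) and mean-pool each hop.
    dist = nx.single_source_shortest_path_length(G, target, cutoff=k_max)
    hops = [[] for _ in range(k_max + 1)]
    for v, d in dist.items():
        hops[d].append(node_feats[v])
    d_feat = node_feats.shape[1]
    return [np.mean(h, axis=0) if h else np.zeros(d_feat) for h in hops]

def linear_rnn_readout(seq, lam, B):
    # Diagonal linear recurrence h <- lam * h + B @ x, scanned from the
    # farthest hop inward, so nearer hops are attenuated least.
    h = np.zeros(B.shape[0])
    for x in reversed(seq):
        h = lam * h + B @ x
    return h
```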
Our code is open-source at https://github.com/skeletondyh/GRED.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhui Ding;Antonio Orvieto;Bobby He;Thomas Hofmann", "authorids": "~Yuhui_Ding1;~Antonio_Orvieto3;~Bobby_He1;~Thomas_Hofmann1", "gender": "M;M;;M", "homepage": "https://skeletondyh.github.io;http://orvi.altervista.org/;http://csml.stats.ox.ac.uk/people/he/;http://www.da.inf.ethz.ch/", "dblp": "230/3524;;270/3685;h/ThHofmann", "google_scholar": "r7KsfaAAAAAJ;xkuLyHoAAAAJ;;T3hAyLkAAAAJ", "orcid": ";;;", "linkedin": ";antonio-orvieto-947ab0130/;;thomas-hofmann-1ab2402/", "or_profile": "~Yuhui_Ding1;~Antonio_Orvieto3;~Bobby_He1;~Thomas_Hofmann1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;ELLIS Institute T\u00fcbingen, Max Planck Institute for Intelligent Systems, T\u00fcbingen AI Center, T\u00fcbingen, Germany;Department of Computer Science, ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "inf.ethz.ch;tue.ellis.eu;inf.ethz.ch;ethz.ch", "position": "PhD student;Principal Researcher;Postdoc;Full Professor", "bibtex": "@inproceedings{\nding2024recurrent,\ntitle={Recurrent Distance Filtering for Graph Representation Learning},\nauthor={Yuhui Ding and Antonio Orvieto and Bobby He and Thomas Hofmann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5kGfm3Pa41}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2218185, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6302882728347743255&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "inf.ethz.ch;tue.ellis.eu;inf.ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "ETH Zurich;ELLIS Institute T\u00fcbingen;Swiss Federal Institute of Technology", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.ethz.ch;;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;;ETH Zurich", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Zurich;T\u00fcbingen;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "A Minimaximalist Approach to Reinforcement Learning from Human Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34967", "id": "5kVgd2MwMY", "proceeding": "https://proceedings.mlr.press/v235/swamy24a.html", "pdf": "https://openreview.net/pdf?id=5kVgd2MwMY", "openreview": "https://openreview.net/forum?id=5kVgd2MwMY", "author_site": "Gokul Swamy, Christoph Dann, Rahul Kidambi, Steven Wu, Alekh Agarwal", "tldr": "", "abstract": "We present *Self-Play Preference Optimization* (SPO), an algorithm for reinforcement learning from human feedback. Our approach is *minimalist* in that it does not require training a reward model nor unstable adversarial training and is therefore rather simple to implement. Our approach is *maximalist* in that it provably handles non-Markovian, intransitive, and stochastic preferences while being robust to the compounding errors that plague offline approaches to sequential prediction. To achieve the preceding qualities, we build upon the concept of a *Minimax Winner* (MW), a notion of preference aggregation from the social choice theory literature that frames learning from preferences as a zero-sum game between two policies. 
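The Minimax Winner just defined in the SPO abstract is the equilibrium of a symmetric zero-sum game built from pairwise preferences. A small self-contained sketch (exponentiated-gradient self-play, one generic solver among many, not the paper's algorithm) that recovers it from a preference matrix:

```python
import numpy as np

def minimax_winner(P, iters=5000, eta=0.05):
    # P[i, j] = probability that option i beats option j. The game payoff is
    # the skew-symmetric matrix A = P - P.T; the Minimax Winner is a mixed
    # strategy at its equilibrium. Under self-play with multiplicative
    # weights, the average iterate approaches that equilibrium.
    A = P - P.T
    p = np.ones(len(A)) / len(A)
    avg = np.zeros_like(p)
    for _ in range(iters):
        p = p * np.exp(eta * (A @ p))  # respond to your own current strategy
        p /= p.sum()
        avg += p
    return avg / iters

# Intransitive rock-paper-scissors preferences yield the uniform winner:
# minimax_winner(np.array([[.5, 1, 0], [0, .5, 1], [1, 0, .5]])) ~ [1/3, 1/3, 1/3]
```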
By leveraging the symmetry of this game, we prove that rather than using the traditional technique of dueling two policies to compute the MW, we can simply have a *single* agent play against itself while maintaining strong convergence guarantees. Practically, this corresponds to sampling multiple trajectories from a policy, asking a *preference* or teacher model to compare them, and then using the proportion of wins as the reward for a particular trajectory. We demonstrate that on a suite of continuous control tasks, we are able to learn significantly more efficiently than reward-model based approaches while maintaining robustness to the intransitive and stochastic preferences that frequently occur in practice when aggregating human judgments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gokul Swamy;Christoph Dann;Rahul Kidambi;Steven Wu;Alekh Agarwal", "authorids": "~Gokul_Swamy1;~Christoph_Dann1;~Rahul_Kidambi1;~Steven_Wu1;~Alekh_Agarwal2", "gender": ";M;;;M", "homepage": "https://gokul.dev/;http://cdann.net;;;https://alekhagarwal.net", "dblp": "31/11509;117/5869;;;", "google_scholar": "Sbpra_AAAAAJ;FuGllAwAAAAJ;;;9nnDvooAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Gokul_Swamy1;~Christoph_Dann1;~Rahul_Kidambi1;~Steven_Wu1;~Alekh_Agarwal2", "aff": "Carnegie Mellon University;Google;;;Google", "aff_domain": "cmu.edu;google.com;;;google.com", "position": "PhD student;Research Scientist;;;Researcher", "bibtex": "@inproceedings{\nswamy2024a,\ntitle={A Minimaximalist Approach to Reinforcement Learning from Human Feedback},\nauthor={Gokul Swamy and Christoph Dann and Rahul Kidambi and Steven Wu and Alekh Agarwal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5kVgd2MwMY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5018671, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18273842745928489276&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cmu.edu;google.com;;;google.com", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A New Linear Scaling Rule for Private Adaptive Hyperparameter Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34966", "id": "5kXNMDpUVF", "proceeding": "https://proceedings.mlr.press/v235/panda24a.html", "pdf": "https://openreview.net/pdf?id=5kXNMDpUVF", "openreview": "https://openreview.net/forum?id=5kXNMDpUVF", "author_site": "Ashwinee Panda, Xinyu Tang, Saeed Mahloujifar, Vikash Sehwag, Prateek Mittal", "tldr": "", "abstract": "An open problem in differentially private deep learning is hyperparameter optimization (HPO). DP-SGD introduces new hyperparameters and complicates existing ones, forcing researchers to painstakingly tune hyperparameters with hundreds of trials, which in turn makes it impossible to account for the privacy cost of HPO without destroying the utility. We propose an adaptive HPO method that uses cheap trials (in terms of privacy cost and runtime) to estimate optimal hyperparameters and scales them up. 
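Returning to the SPO entry above: its practical recipe is to sample several trajectories, have a preference model compare each pair, and use each trajectory's win fraction as its scalar reward. A minimal sketch, where the `prefer` oracle stands in for whatever preference or teacher model is available:

```python
import numpy as np

def win_rate_rewards(trajectories, prefer):
    # prefer(a, b) -> True if trajectory a is preferred to trajectory b.
    # Each trajectory's reward is its proportion of pairwise wins, which can
    # then be fed to any standard policy-gradient update.
    n = len(trajectories)
    wins = np.zeros(n)
    for i in range(n):
        for j in range(n):
            if i != j and prefer(trajectories[i], trajectories[j]):
                wins[i] += 1.0
    return wins / (n - 1)
```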
We obtain state-of-the-art performance on 22 benchmark tasks, across computer vision and natural language processing, across pretraining and finetuning, across architectures and a wide range of $\\varepsilon \\in [0.01,8.0]$, all while accounting for the privacy cost of HPO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ashwinee Panda;Xinyu Tang;Saeed Mahloujifar;Vikash Sehwag;Prateek Mittal", "authorids": "~Ashwinee_Panda1;~Xinyu_Tang1;~Saeed_Mahloujifar1;~Vikash_Sehwag1;~Prateek_Mittal1", "gender": "M;;M;M;", "homepage": "https://kiddyboots216.github.io/;;https://www.cs.virginia.edu/~sm5fd/;https://vsehwag.github.io/;http://www.princeton.edu/~pmittal/", "dblp": "270/1582.html;65/5518;208/0825;187/5613;", "google_scholar": "FM7JCgQAAAAJ;uwcdL7gAAAAJ;kW-hl3YAAAAJ;JAkeEG8AAAAJ;https://scholar.google.com.tw/citations?user=xTKD8J4AAAAJ", "orcid": ";;;;0000-0002-4057-0118", "linkedin": "https://linkedin.com/in/ashwineepanda;;;;", "or_profile": "~Ashwinee_Panda1;~Xinyu_Tang1;~Saeed_Mahloujifar1;~Vikash_Sehwag1;~Prateek_Mittal1", "aff": "Princeton University;Princeton University;Meta;Sony AI;Princeton University", "aff_domain": "princeton.edu;princeton.edu;meta.com;sony.com;princeton.edu", "position": "PhD student;PhD student;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\npanda2024a,\ntitle={A New Linear Scaling Rule for Private Adaptive Hyperparameter Optimization},\nauthor={Ashwinee Panda and Xinyu Tang and Saeed Mahloujifar and Vikash Sehwag and Prateek Mittal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5kXNMDpUVF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1671123, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16343514701464964878&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "princeton.edu;princeton.edu;meta.com;sony.com;princeton.edu", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Princeton University;Meta;Sony", "aff_unique_dep": ";Meta Platforms, Inc.;Sony AI", "aff_unique_url": "https://www.princeton.edu;https://meta.com;https://www.sony.com", "aff_unique_abbr": "Princeton;Meta;Sony AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Japan" }, { "title": "Doubly Robust Causal Effect Estimation under Networked Interference via Targeted Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34965", "id": "5lI9wm4dws", "proceeding": "https://proceedings.mlr.press/v235/chen24c.html", "pdf": "https://openreview.net/pdf?id=5lI9wm4dws", "openreview": "https://openreview.net/forum?id=5lI9wm4dws", "author_site": "Weilin Chen, Ruichu Cai, Zeqin Yang, Jie Qiao, Yuguang Yan, Zijian Li, Zhifeng Hao", "tldr": "", "abstract": "Causal effect estimation under networked interference is an important but challenging problem. Available parametric methods are limited in their model space, while previous semiparametric methods, e.g., leveraging neural networks to fit only one single nuisance function, may still encounter misspecification problems under networked interference without appropriate assumptions on the data generation process. 
To mitigate bias stemming from misspecification, we propose a novel doubly robust causal effect estimator under networked interference, by adapting the targeted learning technique to the training of neural networks. Specifically, we generalize the targeted learning technique into the networked interference setting and establish the condition under which an estimator achieves double robustness. Based on the condition, we devise an end-to-end causal effect estimator by transforming the identified theoretical condition into a targeted loss. Moreover, we provide a theoretical analysis of our designed estimator, revealing a faster convergence rate compared to a single nuisance model. Extensive experimental results on two real-world networks with semisynthetic data demonstrate the effectiveness of our proposed estimators.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weilin Chen;Ruichu Cai;Zeqin Yang;Jie Qiao;Yuguang Yan;Zijian Li;Zhifeng Hao", "authorids": "~Weilin_Chen1;~Ruichu_Cai1;~Zeqin_Yang1;~Jie_Qiao1;~Yuguang_Yan1;~Zijian_Li1;~Zhifeng_Hao5", "gender": "M;M;M;M;M;M;M", "homepage": ";https://ruichucai.github.io/;;;;;https://www.scholat.com/zfhao", "dblp": ";09/6889;;00/7723;154/0064;27/10487;", "google_scholar": "KVvl1vgAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;aCEp7f4AAAAJ;;j3ilESoAAAAJ;", "orcid": ";;0009-0004-3343-4195;0000-0002-4581-9656;;;", "linkedin": ";;;;;;", "or_profile": "~Weilin_Chen1;~Ruichu_Cai1;~Zeqin_Yang1;~Jie_Qiao1;~Yuguang_Yan1;~Zijian_Li1;~Zhifeng_Hao5", "aff": "University of Cambridge;Guangdong University of Technology;Guangdong University of Technology;Guangdong University of Technology;Guangdong University of Technology;Mohamed bin Zayed University of Artificial Intelligence;", "aff_domain": "cam.ac.uk;gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;mbzuai.ac.ae;", "position": "Visiting Student;Full Professor;MS student;Postdoc;Associate Professor;Postdoc;", "bibtex": "@inproceedings{\nchen2024doubly,\ntitle={Doubly Robust Causal Effect Estimation under Networked Interference via Targeted Learning},\nauthor={Weilin Chen and Ruichu Cai and Zeqin Yang and Jie Qiao and Yuguang Yan and Zijian Li and Zhifeng Hao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5lI9wm4dws}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1475489, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12869915932439024771&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": "cam.ac.uk;gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;mbzuai.ac.ae;", "author_num": 7, "aff_unique_index": "0;1;1;1;1;2", "aff_unique_norm": "University of Cambridge;Guangdong University of Technology;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;http://www.gdut.edu.cn;https://mbzuai.ac.ae", "aff_unique_abbr": "Cambridge;GDUT;MBZUAI", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;1;1;1;2", "aff_country_unique": "United Kingdom;China;United Arab Emirates" }, { "title": "Extreme Compression of Large Language Models via Additive Quantization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34964", "id": "5mCaITRTmO", "proceeding": 
"https://proceedings.mlr.press/v235/egiazarian24a.html", "pdf": "https://openreview.net/pdf?id=5mCaITRTmO", "openreview": "https://openreview.net/forum?id=5mCaITRTmO", "author_site": "Vage Egiazarian, Andrei Panferov, Denis Kuznedelev, Elias Frantar, Artem Babenko, Dan Alistarh", "tldr": "", "abstract": "The emergence of accurate open large language models (LLMs) has led to a race towards performant quantization techniques which can enable their execution on end-user devices. In this paper, we revisit the problem of ``extreme'' LLM compression---defined as targeting extremely low bit counts, such as 2 to 3 bits per parameter---from the point of view of classic methods in Multi-Codebook Quantization (MCQ). Our algorithm, called AQLM, generalizes the classic *Additive Quantization (AQ)* approach for information retrieval to advance the state-of-the-art in LLM compression, via two innovations: 1) learned additive quantization of weight matrices in input-adaptive fashion, and 2) joint optimization of codebook parameters across each transformer blocks. Broadly, AQLM is the first scheme that is Pareto optimal in terms of accuracy-vs-model-size when compressing to less than 3 bits per parameter, and significantly improves upon all known schemes in the extreme compression (2bit) regime. In addition, AQLM is practical: we provide fast GPU and CPU implementations of AQLM for token generation, which enable us to match or outperform optimized FP16 implementations for speed, while executing in a much smaller memory footprint.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vage Egiazarian;Andrei Panferov;Denis Kuznedelev;Elias Frantar;Artem Babenko;Dan Alistarh", "authorids": "~Vage_Egiazarian1;~Andrei_Panferov1;~Denis_Kuznedelev1;~Elias_Frantar1;~Artem_Babenko1;~Dan_Alistarh7", "gender": "M;M;M;M;M;M", "homepage": ";;https://github.com/Godofnothing;;;http://people.csail.mit.edu/alistarh/", "dblp": "232/3274;366/7174;322/8616;259/2210;117/4834;36/3251.html", "google_scholar": "Bktg6JEAAAAJ;;;hjdlwz8AAAAJ;q885d1wAAAAJ;https://scholar.google.com.tw/citations?user=75q-6ZQAAAAJ", "orcid": "0000-0003-4444-9769;;0009-0005-2420-9620;;0000-0002-1830-8252;", "linkedin": ";blacksamorez;;elias-frantar-5b43181a4;;", "or_profile": "~Vage_Egiazarian1;~Andrei_Panferov1;~Denis_Kuznedelev1;~Elias_Frantar1;~Artem_Babenko1;~Dan_Alistarh1", "aff": "Yandex;Moscow Institute of Physics and Technology;Yandex;Institute of Science and Technology Austria;Yandex;Institute of Science and Technology", "aff_domain": "yandex-team.ru;mipt.ru;yandex-team.ru;ist.ac.at;yandex-team.ru;ist.ac.at", "position": "Researcher;Undergrad student;Researcher;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\negiazarian2024extreme,\ntitle={Extreme Compression of Large Language Models via Additive Quantization},\nauthor={Vage Egiazarian and Andrei Panferov and Denis Kuznedelev and Elias Frantar and Artem Babenko and Dan Alistarh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5mCaITRTmO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1767397, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13589435069515692336&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 8, "email": "yandex-team.ru;mipt.ru;yandex-team.ru;ist.ac.at;yandex-team.ru;ist.ac.at", "author_num": 6, "aff_unique_index": "0;1;0;2;0;3", 
"aff_unique_norm": "Yandex;Moscow Institute of Physics and Technology;Institute of Science and Technology Austria;Institute of Science and Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://yandex.com;https://www.mipt.ru/en;https://www.ist.ac.at;", "aff_unique_abbr": "Yandex;MIPT;IST Austria;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Russian Federation;Austria;" }, { "title": "Unveiling the Potential of AI for Nanomaterial Morphology Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34963", "id": "5nuW5iBAJS", "proceeding": "https://proceedings.mlr.press/v235/dubrovsky24a.html", "pdf": "https://openreview.net/pdf?id=5nuW5iBAJS", "openreview": "https://openreview.net/forum?id=5nuW5iBAJS", "author_site": "Ivan Dubrovsky, Andrei Dmitrenko, Aleksey Dmitrenko, Nikita Serov, Vladimir Vinogradov", "tldr": "", "abstract": "Creation of nanomaterials with specific morphology remains a complex experimental process, even though there is a growing demand for these materials in various industry sectors. This study explores the potential of AI to predict the morphology of nanoparticles within the data availability constraints. For that, we first generated a new multi-modal dataset that is double the size of analogous studies. Then, we systematically evaluated performance of classical machine learning and large language models in prediction of nanomaterial shapes and sizes. Finally, we prototyped a text-to-image system, discussed the obtained empirical results, as well as the limitations and promises of existing approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ivan Dubrovsky;Andrei Dmitrenko;Aleksei Dmitrenko;Nikita Serov;Vladimir Vinogradov", "authorids": "~Ivan_Dubrovsky1;~Andrei_Dmitrenko1;~Aleksei_Dmitrenko1;serov@scamt-itmo.ru;vinogradov@scamt-itmo.ru", "gender": "M;M;M;;", "homepage": ";;https://github.com/essenceD;;", "dblp": "379/6307;;379/6083;;", "google_scholar": "SEEzV84AAAAJ;;;;", "orcid": "0000-0003-4266-6532;;;;", "linkedin": ";andrei-dmitrenko/;;;", "or_profile": "~Ivan_Dubrovsky1;~Andrei_Dmitrenko1;~Aleksei_Dmitrenko1;serov@scamt-itmo.ru;vinogradov@scamt-itmo.ru", "aff": "ITMO University;ITMO University;ITMO University;;", "aff_domain": "itmo.ru;itmo.ru;itmo.ru;;", "position": "MS student;Researcher;Researcher;;", "bibtex": "@inproceedings{\ndubrovsky2024unveiling,\ntitle={Unveiling the Potential of {AI} for Nanomaterial Morphology Prediction},\nauthor={Ivan Dubrovsky and Andrei Dmitrenko and Aleksei Dmitrenko and Nikita Serov and Vladimir Vinogradov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5nuW5iBAJS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2544071, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17529803250487211272&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "itmo.ru;itmo.ru;itmo.ru;;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "ITMO University", "aff_unique_dep": "", "aff_unique_url": "https://www.itmo.ru", "aff_unique_abbr": "ITMO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Russian Federation" }, { "title": "Improving fine-grained understanding in image-text pre-training", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34962", "id": "5nxIRQ8GNa", "proceeding": "https://proceedings.mlr.press/v235/bica24a.html", "pdf": "https://openreview.net/pdf?id=5nxIRQ8GNa", "openreview": "https://openreview.net/forum?id=5nxIRQ8GNa", "author_site": "Ioana Bica, Anastasija Ilic, Matthias Bauer, Goker Erdogan, Matko Bo\u0161njak, Christos Kaplanis, Alexey Gritsenko, Matthias Minderer, Charles Blundell, Razvan Pascanu, Jovana Mitrovic", "tldr": "", "abstract": "We introduce SPARse fine-grained Contrastive alignment (SPARC), a simple method for pretraining more fine-grained multimodal representations from image-text pairs. Given that multiple image patches often correspond to single words, we propose to learn a grouping of image patches for every token in the caption. To achieve this, we use a sparse similarity metric between image patches and language tokens and compute for each token a language-grouped vision embedding as the weighted average of patches. The token and language-grouped vision embeddings are then contrasted through a fine-grained sequence-wise loss that only depends on individual samples and does not require other batch samples as negatives, i.e., more detailed information is encoded in a computationally inexpensive way. SPARC combines this fine-grained loss with a contrastive loss between global image and text embeddings to learn representations that simultaneously encode global and local information. We thoroughly evaluate SPARC and show improved performance over competing approaches both on image-level tasks relying on coarse-grained information, e.g. classification, as well as region-level tasks relying on fine-grained information, e.g., retrieval, object detection, segmentation while also improving model faithfulness and captioning in foundational vision-language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ioana Bica;Anastasija Ilic;Matthias Bauer;Goker Erdogan;Matko Bo\u0161njak;Christos Kaplanis;Alexey A. 
Gritsenko;Matthias Minderer;Charles Blundell;Razvan Pascanu;Jovana Mitrovic", "authorids": "~Ioana_Bica1;~Anastasija_Ilic1;~Matthias_Bauer1;~Goker_Erdogan1;~Matko_Bo\u0161njak2;~Christos_Kaplanis2;~Alexey_A._Gritsenko1;~Matthias_Minderer1;~Charles_Blundell1;~Razvan_Pascanu1;~Jovana_Mitrovic1", "gender": "F;F;;M;;;;M;;M;", "homepage": "https://ioanabica.github.io/;https://www.linkedin.com/in/anastasija-ilic/;;https://gokererdogan.github.io/;;;;https://mjlm.github.io/;http://www.gatsby.ucl.ac.uk/~ucgtcbl/;https://razp.info;http://jovana-mitrovic.github.io", "dblp": ";;;151/6462;;;;243/3155;35/8396;65/8368.html;176/5114", "google_scholar": ";;;DD1MSdMAAAAJ;;;;57BFBY0AAAAJ;https://scholar.google.co.uk/citations?user=f31mvPsAAAAJ;https://scholar.google.ca/citations?user=eSPY8LwAAAAJ;", "orcid": ";;;;;;;0000-0002-6428-8256;;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Ioana_Bica1;~Anastasija_Ilic1;~Matthias_Bauer1;~Goker_Erdogan1;~Matko_Bo\u0161njak2;~Christos_Kaplanis2;~Alexey_A._Gritsenko1;~Matthias_Minderer1;~Charles_Blundell1;~Razvan_Pascanu1;~Jovana_Mitrovic1", "aff": "Google DeepMind;Google DeepMind;;Google DeepMind;;;;Google;Google DeepMind;Google DeepMind;Google DeepMind", "aff_domain": "deepmind.com;deepmind.google;;google.com;;;;google.com;google.com;google.com;google.com", "position": "Researcher;Researcher;;Researcher;;;;Researcher;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nbica2024improving,\ntitle={Improving fine-grained understanding in image-text pre-training},\nauthor={Ioana Bica and Anastasija Ilic and Matthias Bauer and Goker Erdogan and Matko Bo{\\v{s}}njak and Christos Kaplanis and Alexey A. Gritsenko and Matthias Minderer and Charles Blundell and Razvan Pascanu and Jovana Mitrovic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5nxIRQ8GNa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9917206, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2460694431195968993&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "deepmind.com;deepmind.google;;google.com;;;;google.com;google.com;google.com;google.com", "author_num": 11, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Creative Text-to-Audio Generation via Synthesizer Programming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34961", "id": "5pg9YJBaiG", "proceeding": "https://proceedings.mlr.press/v235/cherep24a.html", "pdf": "https://openreview.net/pdf?id=5pg9YJBaiG", "openreview": "https://openreview.net/forum?id=5pg9YJBaiG", "author_site": "Manuel Cherep, Nikhil Singh, Jessica Shand", "tldr": "", "abstract": "Neural audio synthesis methods now allow specifying ideas in natural language. However, these methods produce results that cannot be easily tweaked, as they are based on large latent spaces and up to billions of uninterpretable parameters. We propose a text-to-audio generation method that leverages a virtual modular sound synthesizer with only 78 parameters. 
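A sketch of the central SPARC step from the "Improving fine-grained understanding in image-text pre-training" entry above: sparsify patch-token similarities, then build a language-grouped vision embedding per caption token as a weighted average of its matched patches. The thresholding rule here is an assumption, not the paper's exact sparsification:

```python
import torch

def language_grouped_vision(patch_emb, token_emb, threshold=0.1):
    # patch_emb: (n_patches, d); token_emb: (n_tokens, d).
    sim = token_emb @ patch_emb.T                               # (n_tokens, n_patches)
    sim = sim / sim.amax(dim=-1, keepdim=True).clamp(min=1e-6)  # per-token rescale
    sim = torch.where(sim > threshold, sim, torch.zeros_like(sim))  # sparse alignment
    w = sim / sim.sum(dim=-1, keepdim=True).clamp(min=1e-6)     # normalized weights
    return w @ patch_emb  # (n_tokens, d): weighted average of matched patches
```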
Synthesizers have long been used by skilled sound designers for media like music and film due to their flexibility and intuitive controls. Our method, CTAG, iteratively updates a synthesizer's parameters to produce high-quality audio renderings of text prompts that can be easily inspected and tweaked. Sounds produced this way are also more abstract, capturing essential conceptual features over fine-grained acoustic details, akin to how simple sketches can vividly convey visual concepts. Our results show how CTAG produces sounds that are distinctive, perceived as artistic, and yet similarly identifiable to recent neural audio synthesis models, positioning it as a valuable and complementary tool.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manuel Cherep;Nikhil Singh;Jessica Shand", "authorids": "~Manuel_Cherep1;~Nikhil_Singh2;shand@mit.edu", "gender": "M;M;", "homepage": "https://mcherep.github.io/;https://web.media.mit.edu/~nsingh1/;", "dblp": "329/1440;12/5407-3;", "google_scholar": "bFWJ7YgAAAAJ;h3YzYXwAAAAJ;", "orcid": ";0000-0003-4465-6469;", "linkedin": ";https://linkedin.com/in/nikhilsinghmusic;", "or_profile": "~Manuel_Cherep1;~Nikhil_Singh2;shand@mit.edu", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;mit.edu;", "position": "MS student;PhD student;", "bibtex": "@inproceedings{\ncherep2024creative,\ntitle={Creative Text-to-Audio Generation via Synthesizer Programming},\nauthor={Manuel Cherep and Nikhil Singh and Jessica Shand},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5pg9YJBaiG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1853525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10143992223301836974&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mit.edu;mit.edu;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Compress Clean Signal from Noisy Raw Image: A Self-Supervised Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34960", "id": "5sgkNtexs2", "proceeding": "https://proceedings.mlr.press/v235/li24bl.html", "pdf": "https://openreview.net/pdf?id=5sgkNtexs2", "openreview": "https://openreview.net/forum?id=5sgkNtexs2", "author_site": "Zhihao Li, Yufei Wang, Alex Kot, Bihan Wen", "tldr": "", "abstract": "Raw images offer unique advantages in many low-level visual tasks due to their unprocessed nature. However, this unprocessed state accentuates noise, making raw images challenging to compress effectively. Current compression methods often overlook the ubiquitous noise in raw space, leading to increased bitrates and reduced quality. In this paper, we propose a novel raw image compression scheme that selectively compresses the noise-free component of the input, while discarding its real noise using a self-supervised approach. By excluding noise from the bitstream, both the coding efficiency and reconstruction quality are significantly enhanced. 
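The CTAG entry above iteratively updates the synthesizer's 78 parameters so the rendered audio better matches a text prompt. A gradient-free caricature of such a loop, where `render(params) -> audio` and `score(audio) -> float` are hypothetical stand-ins for the virtual modular synthesizer and the text-audio similarity model:

```python
import numpy as np

def optimize_synth_params(render, score, n_params=78, iters=500, sigma=0.05, seed=0):
    # Hill-climbing over the synthesizer's parameter vector: keep a random
    # perturbation whenever it raises the prompt-similarity score.
    rng = np.random.default_rng(seed)
    params = rng.uniform(0.0, 1.0, n_params)
    best = score(render(params))
    for _ in range(iters):
        cand = np.clip(params + sigma * rng.standard_normal(n_params), 0.0, 1.0)
        s = score(render(cand))
        if s > best:
            params, best = cand, s
    return params, best
```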
We curate a full-day dataset of raw images with calibrated noise parameters and reference images to evaluate the performance of models under a wide range of input signal-to-noise ratios. Experimental results demonstrate that our method surpasses existing compression techniques, achieving a more advantageous rate-distortion balance with improvements ranging from +2 to +10 dB and yielding bit savings of 2 to 50 times. The code will be released upon paper acceptance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhihao Li;Yufei Wang;Alex Kot;Bihan Wen", "authorids": "~Zhihao_Li14;~Yufei_Wang5;~Alex_Kot1;~Bihan_Wen2", "gender": "M;M;;M", "homepage": "https://lizhihao6.github.io;https://github.com/wyf0912/;https://www.ntu.edu.sg/home/eackot/;https://personal.ntu.edu.sg/bihan.wen/", "dblp": ";;;158/9840", "google_scholar": "gWlYsj0AAAAJ;jLd1l_sAAAAJ;;ypkClpwAAAAJ", "orcid": "0000-0002-2066-8775;;;0000-0002-6874-6453", "linkedin": ";;;", "or_profile": "~Zhihao_Li14;~Yufei_Wang5;~Alex_Kot1;~Bihan_Wen2", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "Researcher;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024compress,\ntitle={Compress Clean Signal from Noisy Raw Image: A Self-Supervised Approach},\nauthor={Zhihao Li and Yufei Wang and Alex Kot and Bihan Wen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5sgkNtexs2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7552577, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:l_AHttUaqCIJ:scholar.google.com/&scioq=Compress+Clean+Signal+from+Noisy+Raw+Image:+A+Self-Supervised+Approach&hl=en&as_sdt=0,5", "gs_version_total": 3, "email": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Efficient Error Certification for Physics-Informed Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34959", "id": "5t4V7Q6lmz", "proceeding": "https://proceedings.mlr.press/v235/eiras24a.html", "pdf": "https://openreview.net/pdf?id=5t4V7Q6lmz", "openreview": "https://openreview.net/forum?id=5t4V7Q6lmz", "author_site": "Francisco Eiras, Adel Bibi, Rudy Bunel, Krishnamurthy Dvijotham, Phil Torr, M. Pawan Kumar", "tldr": "", "abstract": "Recent work provides promising evidence that Physics-Informed Neural Networks (PINN) can efficiently solve partial differential equations (PDE). However, previous works have failed to provide guarantees on the *worst-case* residual error of a PINN across the spatio-temporal domain - a measure akin to the tolerance of numerical solvers - focusing instead on point-wise comparisons between their solution and the ones obtained by a solver on a set of inputs. In real-world applications, one cannot consider tests on a finite set of points to be sufficient grounds for deployment, as the performance could be substantially worse on a different set.
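The residual error this PINN-certification entry discusses is concrete: for Burgers' equation it is r(x, t) = u_t + u u_x - nu u_xx, computable point-wise with autograd as below. Point-wise evaluation on sampled inputs is exactly what the abstract argues is insufficient; the paper bounds the worst case instead. The default `nu` is a common benchmark viscosity, an assumption here:

```python
import torch

def burgers_residual(u, x, t, nu=0.01 / torch.pi):
    # u is a network mapping (x, t) -> u(x, t); x and t must require gradients.
    out = u(x, t)
    u_x, u_t = torch.autograd.grad(out.sum(), (x, t), create_graph=True)
    u_xx = torch.autograd.grad(u_x.sum(), x, create_graph=True)[0]
    return u_t + out * u_x - nu * u_xx  # zero everywhere for an exact solution
```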
To alleviate this issue, we establish guaranteed error-based conditions for PINNs over their *continuous* applicability domain. To verify the extent to which they hold, we introduce $\\partial$-CROWN: a general, efficient and scalable post-training framework to bound PINN residual errors. We demonstrate its effectiveness in obtaining tight certificates by applying it to two classically studied PINNs -- Burgers' and Schr\u00f6dinger's equations -- and two more challenging ones with real-world applications -- the Allen-Cahn and Diffusion-Sorption equations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francisco Eiras;Adel Bibi;Rudy R Bunel;Krishnamurthy Dj Dvijotham;Philip Torr;M. Pawan Kumar", "authorids": "~Francisco_Eiras1;~Adel_Bibi1;~Rudy_R_Bunel1;~Krishnamurthy_Dj_Dvijotham1;~Philip_Torr1;~M._Pawan_Kumar1", "gender": "M;M;M;;;M", "homepage": "https://fgirbal.github.io;http://adelbibi.com;http://www.robots.ox.ac.uk/~rudy/;http://www.robots.ox.ac.uk/~tvg/;;http://dvij.github.io", "dblp": "218/5843;176/0964;180/5419;;45/2527;16/8758", "google_scholar": "O_iJTgYAAAAJ;Q4j2laYAAAAJ;https://scholar.google.fr/citations?user=7cqQFSoAAAAJ;;https://scholar.google.com/citations?hl=en;BUtloecAAAAJ", "orcid": ";0000-0002-6169-3918;;;;", "linkedin": "franciscogirbaleiras/;adel-bibi-ba3671ab/;;;;", "or_profile": "~Francisco_Eiras1;~Adel_Bibi1;~Rudy_R_Bunel1;~Philip_Torr1;~M._Pawan_Kumar1;~Krishnamurthy_Dvijotham2", "aff": "University of Oxford;University of Oxford;;University of Oxford;Google DeepMind;Google DeepMind", "aff_domain": "ox.ac.uk;ox.ac.uk;;ox.ac.uk;deepmind.com;google.com", "position": "PhD student;Senior Researcher;;Full Professor;Researcher;Researcher", "bibtex": "@inproceedings{\neiras2024efficient,\ntitle={Efficient Error Certification for Physics-Informed Neural Networks},\nauthor={Francisco Eiras and Adel Bibi and Rudy R Bunel and Krishnamurthy Dj Dvijotham and Philip Torr and M. Pawan Kumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5t4V7Q6lmz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1208242, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16440297912984899426&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ox.ac.uk;ox.ac.uk;;ox.ac.uk;deepmind.com;google.com", "author_num": 6, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "University of Oxford;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ox.ac.uk;https://deepmind.com", "aff_unique_abbr": "Oxford;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Full-Atom Peptide Design based on Multi-modal Flow Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34958", "id": "5tPB5VXo87", "proceeding": "https://proceedings.mlr.press/v235/li24o.html", "pdf": "https://openreview.net/pdf?id=5tPB5VXo87", "openreview": "https://openreview.net/forum?id=5tPB5VXo87", "author_site": "Jiahan Li, Chaoran Cheng, Zuofan Wu, Ruihan Guo, Shitong Luo, Zhizhou Ren, Jian Peng, Jianzhu Ma", "tldr": "", "abstract": "Peptides, short chains of amino acid residues, play a vital role in numerous biological processes by interacting with other target molecules, offering substantial potential in drug discovery.
In this work, we present *PepFlow*, the first multi-modal deep generative model grounded in the flow-matching framework for the design of full-atom peptides that target specific protein receptors. Drawing inspiration from the crucial roles of residue backbone orientations and side-chain dynamics in protein-peptide interactions, we characterize the peptide structure using rigid backbone frames within the $\\mathrm{SE}(3)$ manifold and side-chain angles on high-dimensional tori. Furthermore, we represent discrete residue types in the peptide sequence as categorical distributions on the probability simplex. By learning the joint distributions of each modality using derived flows and vector fields on corresponding manifolds, our method excels in the fine-grained design of full-atom peptides. Harnessing the multi-modal paradigm, our approach adeptly tackles various tasks such as fix-backbone sequence design and side-chain packing through partial sampling. Through meticulously crafted experiments, we demonstrate that *PepFlow* exhibits superior performance in comprehensive benchmarks, highlighting its significant potential in computational peptide design and analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiahan Li;Chaoran Cheng;Zuofan Wu;Ruihan Guo;Shitong Luo;Zhizhou Ren;Jian Peng;Jianzhu Ma", "authorids": "~Jiahan_Li2;~Chaoran_Cheng2;~Zuofan_Wu1;~Ruihan_Guo1;~Shitong_Luo1;~Zhizhou_Ren1;~Jian_Peng1;~Jianzhu_Ma2", "gender": ";M;M;M;;M;M;M", "homepage": ";https://ccr-cheng.github.io/;;https://github.com/guoruihan;https://luost.me;;http://jianpeng.web.engr.illinois.edu/;https://majianzhu.com/", "dblp": ";;;;271/0339;https://dblp.uni-trier.de/pid/239/5714.html;29/4181-1;24/9080.html", "google_scholar": ";SrGZZ1wAAAAJ;;;z1BrjyIAAAAJ;xgpMeDgAAAAJ;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ;", "orcid": ";;;;;;;", "linkedin": ";chaoran-cheng-a70638214/;zuofan-wu-b08398213/;;;;;", "or_profile": "~Jiahan_Li2;~Chaoran_Cheng2;~Zuofan_Wu1;~Ruihan_Guo1;~Shitong_Luo1;~Zhizhou_Ren1;~Jian_Peng1;~Jianzhu_Ma2", "aff": ";University of Illinois, Urbana Champaign;Helixon Research;;Massachusetts Institute of Technology;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;Tsinghua University", "aff_domain": ";illinois.edu;helixon.com;;mit.edu;illinois.edu;illinois.edu;tsinghua.edu.cn", "position": ";PhD student;Researcher;;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024fullatom,\ntitle={Full-Atom Peptide Design based on Multi-modal Flow Matching},\nauthor={Jiahan Li and Chaoran Cheng and Zuofan Wu and Ruihan Guo and Shitong Luo and Zhizhou Ren and Jian Peng and Jianzhu Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5tPB5VXo87}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9125518, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1615789375347310641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";illinois.edu;helixon.com;;mit.edu;illinois.edu;illinois.edu;tsinghua.edu.cn", "author_num": 8, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "University of Illinois Urbana-Champaign;Helixon Research;Massachusetts Institute of Technology;Tsinghua University", "aff_unique_dep": ";;;", "aff_unique_url": "https://illinois.edu;;https://web.mit.edu;https://www.tsinghua.edu.cn", 
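One ingredient of the PepFlow entry above, flow matching for side-chain angles on a torus, reduces to a simple regression target once interpolation follows the shortest angular arc. A generic single-batch sketch (not the authors' code; `model(xt, t)` is any network predicting a vector field over angles):

```python
import torch

def torus_flow_matching_loss(x0, x1, model):
    # x0: prior angles, x1: data angles, both (batch, n_angles) in radians.
    t = torch.rand(x0.shape[0], 1)
    delta = torch.atan2(torch.sin(x1 - x0), torch.cos(x1 - x0))  # shortest arc
    xt = x0 + t * delta          # geodesic interpolant on the flat torus
    v_target = delta             # constant velocity d(xt)/dt along the path
    return ((model(xt, t) - v_target) ** 2).mean()
```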
"aff_unique_abbr": "UIUC;;MIT;THU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Patchscopes: A Unifying Framework for Inspecting Hidden Representations of Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34957", "id": "5uwBzcn885", "proceeding": "https://proceedings.mlr.press/v235/ghandeharioun24a.html", "pdf": "https://openreview.net/pdf?id=5uwBzcn885", "openreview": "https://openreview.net/forum?id=5uwBzcn885", "author_site": "Asma Ghandeharioun, \u202aAvi Caciularu\u202c\u200f, Adam Pearce, Lucas Dixon, Mor Geva", "tldr": "", "abstract": "Understanding the internal representations of large language models (LLMs) can help explain models' behavior and verify their alignment with human values. Given the capabilities of LLMs in generating human-understandable text, we propose leveraging the model itself to explain its internal representations in natural language. We introduce a framework called Patchscopes and show how it can be used to answer a wide range of questions about an LLM's computation. We show that many prior interpretability methods based on projecting representations into the vocabulary space and intervening on the LLM computation can be viewed as instances of this framework. Moreover, several of their shortcomings such as failure in inspecting early layers or lack of expressivity can be mitigated by Patchscopes. Beyond unifying prior inspection techniques, Patchscopes also opens up *new* possibilities such as using a more capable model to explain the representations of a smaller model, and multihop reasoning error correction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Asma Ghandeharioun;Avi Caciularu;Adam Pearce;Lucas Dixon;Mor Geva", "authorids": "~Asma_Ghandeharioun1;~Avi_Caciularu1;~Adam_Pearce1;~Lucas_Dixon1;~Mor_Geva1", "gender": ";M;M;Not Specified;F", "homepage": "https://alum.mit.edu/www/asma_gh;http://aviclu.github.io/;https://roadtolarissa.com/;https://research.google/people/lucas-dixon/;https://mega002.github.io/", "dblp": "124/3110;https://dblp.uni-trier.de/pid/207/8509;;39/6853;203/9159", "google_scholar": "CkfQy2gAAAAJ;https://scholar.google.co.il/citations?user=fPG_0aQAAAAJ;;nDs3-TMAAAAJ;https://scholar.google.co.il/citations?user=GxpQbSkAAAAJ", "orcid": ";;;0000-0003-1094-1675;", "linkedin": ";avicaciularu/;;lucas-dixon-94070354/;morgeva/", "or_profile": "~Asma_Ghandeharioun1;~Avi_Caciularu1;~Adam_Pearce1;~Lucas_Dixon1;~Mor_Geva1", "aff": "Google;Google;Google;Research, Google;Google Research", "aff_domain": "google.com;google.com;google.com;research.google.com;google.com", "position": "Research Scientist;Researcher;Data Visualization;Researcher;Researcher", "bibtex": "@inproceedings{\nghandeharioun2024patchscopes,\ntitle={Patchscopes: A Unifying Framework for Inspecting Hidden Representations of Language Models},\nauthor={Asma Ghandeharioun and Avi Caciularu and Adam Pearce and Lucas Dixon and Mor Geva},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5uwBzcn885}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1595397, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10112662895643774874&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": 
"google.com;google.com;google.com;research.google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Overestimation, Overfitting, and Plasticity in Actor-Critic: the Bitter Lesson of Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34956", "id": "5vZzmCeTYu", "proceeding": "https://proceedings.mlr.press/v235/nauman24a.html", "pdf": "https://openreview.net/pdf?id=5vZzmCeTYu", "openreview": "https://openreview.net/forum?id=5vZzmCeTYu", "author_site": "Michal Nauman, Micha\u0142 Bortkiewicz, Piotr Milos, Tomasz Trzcinski, Mateusz Ostaszewski, Marek Cygan", "tldr": "", "abstract": "Recent advancements in off-policy Reinforcement Learning (RL) have significantly improved sample efficiency, primarily due to the incorporation of various forms of regularization that enable more gradient update steps than traditional agents. However, many of these techniques have been tested in limited settings, often on tasks from single simulation benchmarks and against well-known algorithms rather than a range of regularization approaches. This limits our understanding of the specific mechanisms driving RL improvements. To address this, we implemented over 60 different off-policy agents, each integrating established regularization techniques from recent state-of-the-art algorithms. We tested these agents across 14 diverse tasks from 2 simulation benchmarks, measuring training metrics related to overestimation, overfitting, and plasticity loss \u2014 issues that motivate the examined regularization techniques. Our findings reveal that while the effectiveness of a specific regularization setup varies with the task, certain combinations consistently demonstrate robust and superior performance. 
Notably, a simple Soft Actor-Critic agent, appropriately regularized, reliably finds a better-performing policy within the training regime, which previously was achieved mainly through model-based approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michal Nauman;Micha\u0142 Bortkiewicz;Piotr Mi\u0142o\u015b;Tomasz Trzcinski;Mateusz Ostaszewski;Marek Cygan", "authorids": "~Michal_Nauman1;~Micha\u0142_Bortkiewicz1;~Piotr_Mi\u0142o\u015b1;~Tomasz_Trzcinski2;~Mateusz_Ostaszewski1;~Marek_Cygan1", "gender": ";;;M;;", "homepage": ";;;https://cvlab.ii.pw.edu.pl/ttrzcins/;;", "dblp": ";;208/0989.html;05/11408;;76/819", "google_scholar": ";;Se68XecAAAAJ;https://scholar.google.pl/citations?user=bJMRBFoAAAAJ;;df8TSy4AAAAJ", "orcid": ";;;;;", "linkedin": "michal-nauman/;;piotr-milos-4b02151/;;;marek-cygan-b9a316140/", "or_profile": "~Michal_Nauman1;~Micha\u0142_Bortkiewicz1;~Piotr_Mi\u0142o\u015b1;~Tomasz_Trzcinski2;~Mateusz_Ostaszewski1;~Marek_Cygan1", "aff": "University of Warsaw;;IDEAS NCBR;Warsaw University of Technology;;Nomagic", "aff_domain": "mimuw.edu.pl;;ideas-ncbr.pl;pw.edu.pl;;nomagic.ai", "position": "PhD student;;Researcher;Full Professor;;Founder / CTO", "bibtex": "@inproceedings{\nnauman2024overestimation,\ntitle={Overestimation, Overfitting, and Plasticity in Actor-Critic: the Bitter Lesson of Reinforcement Learning},\nauthor={Michal Nauman and Micha{\\l} Bortkiewicz and Piotr Mi{\\l}o{\\'s} and Tomasz Trzcinski and Mateusz Ostaszewski and Marek Cygan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5vZzmCeTYu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9021386, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15045756786446784009&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "mimuw.edu.pl;;ideas-ncbr.pl;pw.edu.pl;;nomagic.ai", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Warsaw;Institute for Development, Economic Analysis, and Simulation (IDEAS);Warsaw University of Technology;Nomagic", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uw.edu.pl;https://www.ideas-ncbr.gov.pl;https://www.pw.edu.pl;", "aff_unique_abbr": "UW;IDEAS;WUT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Poland;" }, { "title": "Physics of Language Models: Part 3.1, Knowledge Storage and Extraction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34955", "id": "5x788rqbcj", "proceeding": "https://proceedings.mlr.press/v235/allen-zhu24a.html", "pdf": "https://openreview.net/pdf?id=5x788rqbcj", "openreview": "https://openreview.net/forum?id=5x788rqbcj", "author_site": "Zeyuan Allen-Zhu, Yuanzhi Li", "tldr": "", "abstract": "Large language models (LLMs) can store a vast amount of world knowledge, often extractable via question-answering (e.g., \"What is Abraham Lincoln's birthday?''). However, do they answer such questions based on exposure to similar questions during training (i.e., cheating), or by genuinely learning to extract knowledge from sources like Wikipedia? In this paper, we investigate this issue using a controlled biography dataset. We find a strong correlation between the model's ability to extract knowledge and various _diversity measures_ of the training data. 
**Essentially**, for knowledge to be reliably extracted, it must be sufficiently augmented (e.g., through paraphrasing, sentence shuffling) _during pretraining_. Without such augmentation, knowledge may be memorized but not extractable, leading to 0% accuracy, regardless of subsequent instruction fine-tuning. To understand why this occurs, we employ (nearly) linear probing to demonstrate a strong connection between the observed correlation and _how the model internally encodes knowledge_ --- whether it is linearly encoded in the hidden embeddings of entity names or distributed across other token embeddings in the training text. **This paper provides several key recommendations for LLM pretraining in the industry:** (1) rewrite the pretraining data --- using small, auxiliary models --- to provide knowledge augmentation, and (2) incorporate more instruction-finetuning data into the pretraining stage before it becomes too late.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zeyuan Allen-Zhu;Yuanzhi Li", "authorids": "~Zeyuan_Allen-Zhu1;~Yuanzhi_Li1", "gender": ";M", "homepage": ";", "dblp": ";73/3628", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Zeyuan_Allen-Zhu1;~Yuanzhi_Li1", "aff": ";Carnegie Mellon University", "aff_domain": ";andrew.cmu.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nallen-zhu2024physics,\ntitle={Physics of Language Models: Part 3.1, Knowledge Storage and Extraction},\nauthor={Zeyuan Allen-Zhu and Yuanzhi Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5x788rqbcj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1965159, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=617581497034228402&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 6, "email": ";andrew.cmu.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "BECoTTA: Input-dependent Online Blending of Experts for Continual Test-time Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34954", "id": "5zXTwX92qv", "proceeding": "https://proceedings.mlr.press/v235/lee24ab.html", "pdf": "https://openreview.net/pdf?id=5zXTwX92qv", "openreview": "https://openreview.net/forum?id=5zXTwX92qv", "author_site": "Daeun Lee, Jaehong Yoon, Sung Ju Hwang", "tldr": "", "abstract": "Continual Test-Time Adaptation (CTTA) is designed to optimize the model during deployment under changing conditions. CTTA is an important problem as it enables models to remain effective and reliable in dynamic and evolving environments. However, tackling the CTTA problem is nontrivial. The model needs to be computationally and memory-efficient to rapidly update its parameters for ever-changing environments in real-time. Also, the model should generalize well to new unseen domains while maintaining its capability on previously encountered ones, as old domains can be revisited in future adaptation phases. To tackle these challenges, this paper proposes BECoTTA, a parameter/memory-efficient yet powerful framework for CTTA. 
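The "(nearly) linear probing" used in the "Physics of Language Models" entry above has a standard minimal form: fit a linear classifier on hidden states and check whether an attribute is recoverable. A sketch with scikit-learn; the even/odd split and the choice of probing position are illustrative assumptions:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def linear_probe_accuracy(hidden, labels):
    # hidden: (n, d) hidden states, e.g. at an entity name's last token;
    # labels: the attribute to extract (e.g., a birth-year bucket). High
    # held-out accuracy suggests the knowledge is linearly encoded there.
    idx = np.arange(len(labels))
    tr = idx % 2 == 0  # naive even/odd train/test split
    clf = LogisticRegression(max_iter=1000).fit(hidden[tr], labels[tr])
    return clf.score(hidden[~tr], labels[~tr])
```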
We introduce Mixture-of-Domain Low-rank Experts (MoDE) that contains two core components: i) Domain-Adaptive Routing, which can aid in selectively capturing the domain-adaptive knowledge, and ii) Domain-Expert Synergy Loss to maximize the dependency between each domain and expert. We validate our proposed method over multiple CTTA benchmarks, achieving a 5.81% performance gain while only requiring 0.001x trainable parameters. We also provide analyses of our BECoTTA, including expert assignment and target domain relation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daeun Lee;Jaehong Yoon;Sung Ju Hwang", "authorids": "~Daeun_Lee2;~Jaehong_Yoon1;~Sung_Ju_Hwang1", "gender": "F;M;", "homepage": "https://daeunni.github.io/;https://jaehong31.github.io/;", "dblp": "70/6922;203/4449;", "google_scholar": "https://scholar.google.co.kr/citations?hl=ko;-5comoUAAAAJ;", "orcid": ";;", "linkedin": "dangni/;jaehongyoon/;", "or_profile": "~Daeun_Lee2;~Jaehong_Yoon1;~Sung_Ju_Hwang1", "aff": "Korea University;University of North Carolina at Chapel Hill;", "aff_domain": "korea.ac.kr;unc.edu;", "position": "Undergrad student;Postdoc;", "bibtex": "@inproceedings{\nlee2024becotta,\ntitle={{BEC}o{TTA}: Input-dependent Online Blending of Experts for Continual Test-time Adaptation},\nauthor={Daeun Lee and Jaehong Yoon and Sung Ju Hwang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5zXTwX92qv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2748067, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17441138606596295398&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "korea.ac.kr;unc.edu;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Korea University;University of North Carolina", "aff_unique_dep": ";", "aff_unique_url": "https://www.korea.ac.kr;https://www.unc.edu", "aff_unique_abbr": "KU;UNC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Learning Causal Relations from Subsampled Time Series with Two Time-Slices", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34953", "id": "60F0fVbknK", "proceeding": "https://proceedings.mlr.press/v235/wu24p.html", "pdf": "https://openreview.net/pdf?id=60F0fVbknK", "openreview": "https://openreview.net/forum?id=60F0fVbknK", "author_site": "Anpeng Wu, Haoxuan Li, Kun Kuang, zhang keli, Fei Wu", "tldr": "", "abstract": "This paper studies the causal relations from subsampled time series, in which measurements are sparse and sampled at a coarser timescale than the causal timescale of the underlying system. In such data, because there are numerous missing time-slices (i.e., cross-sections at each time point) between two consecutive measurements, conventional causal discovery methods designed for standard time series data would produce significant errors. To learn causal relations from subsampled time series, a typical solution is to conduct different interventions and then make a comparison. However, full interventions are often expensive, unethical, or even infeasible, particularly in fields such as health and social science.
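The MoDE module described in the BECoTTA entry above pairs input-dependent routing with low-rank experts. A generic mixture-of-low-rank-experts layer in that spirit (dimensions, initialization, and router are illustrative, not the paper's exact design):

```python
import torch
import torch.nn as nn

class MixtureOfLowRankExperts(nn.Module):
    def __init__(self, d, rank=4, n_experts=4):
        super().__init__()
        self.down = nn.Parameter(0.02 * torch.randn(n_experts, d, rank))
        self.up = nn.Parameter(torch.zeros(n_experts, rank, d))  # zero-init: layer starts as identity
        self.router = nn.Linear(d, n_experts)

    def forward(self, x):  # x: (batch, d)
        w = self.router(x).softmax(dim=-1)  # domain-adaptive routing weights
        delta = torch.einsum('bd,edr,erk->bek', x, self.down, self.up)  # per-expert low-rank update
        return x + torch.einsum('be,bek->bk', w, delta)  # blend experts per input
```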
In this paper, we first explore how readily available two-time-slices data can replace intervention data to improve causal ordering, and propose a novel Descendant Hierarchical Topology algorithm with Conditional Independence Test (DHT-CIT) to learn causal relations from subsampled time series using only two time-slices. Specifically, we develop a conditional independence criterion that can be applied iteratively to test each node from time series and identify all of its descendant nodes. Empirical results on both synthetic and real-world datasets demonstrate the superiority of our DHT-CIT algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anpeng Wu;Haoxuan Li;Kun Kuang;Zhang Keli;Fei Wu", "authorids": "~Anpeng_Wu1;~Haoxuan_Li6;~Kun_Kuang1;~Zhang_Keli1;~Fei_Wu1", "gender": "M;M;M;M;M", "homepage": "https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ&hl=zh-CN&oi=sra;https://haoxuanli-pku.github.io/;http://kunkuang.github.io;;https://person.zju.edu.cn/wufei", "dblp": "267/5637;145/4965-1.html;194/4245;92/573;84/3254-1", "google_scholar": "https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ;gtDqiucAAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ;;XJLn4MYAAAAJ", "orcid": "0000-0003-3898-7122;0000-0003-3620-3769;0009-0000-7528-8131;0000-0002-7883-0552;", "linkedin": ";;;;", "or_profile": "~Anpeng_Wu1;~Haoxuan_Li6;~Kun_Kuang1;~Zhang_Keli1;~Fei_Wu1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;Peking University;Zhejiang University;Huawei Technologies Ltd.;Zhejiang University", "aff_domain": "mbzuai.ac.ae;pku.edu.cn;zju.edu.cn;huawei.com;zju.edu.cn", "position": "Researcher;PhD student;Associate Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nwu2024learning,\ntitle={Learning Causal Relations from Subsampled Time Series with Two Time-Slices},\nauthor={Anpeng Wu and Haoxuan Li and Kun Kuang and Zhang Keli and Fei Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=60F0fVbknK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 998162, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aY4Jdpmy6YIJ:scholar.google.com/&scioq=Learning+Causal+Relations+from+Subsampled+Time+Series+with+Two+Time-Slices&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "mbzuai.ac.ae;pku.edu.cn;zju.edu.cn;huawei.com;zju.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Peking University;Zhejiang University;Huawei", "aff_unique_dep": ";;;Huawei Technologies", "aff_unique_url": "https://mbzuai.ac.ae;http://www.pku.edu.cn;https://www.zju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "MBZUAI;Peking U;ZJU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Arab Emirates;China" }, { "title": "Generative Modeling on Manifolds Through Mixture of Riemannian Diffusion Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34952", "id": "60HydCpCMZ", "proceeding": "https://proceedings.mlr.press/v235/jo24a.html", "pdf": "https://openreview.net/pdf?id=60HydCpCMZ", "openreview": "https://openreview.net/forum?id=60HydCpCMZ", "author_site": "Jaehyeong Jo, Sung Ju Hwang", "tldr": "", "abstract": "Learning the distribution of data on Riemannian 
manifolds is crucial for modeling data from non-Euclidean space, which is required by many applications in diverse scientific fields. Yet, existing generative models on manifolds suffer from expensive divergence computation or rely on approximations of the heat kernel. These limitations restrict their applicability to simple geometries and hinder scalability to high dimensions. In this work, we introduce the Riemannian Diffusion Mixture, a principled framework for building a generative diffusion process on manifolds. Instead of following the denoising approach of previous diffusion models, we construct a diffusion process using a mixture of bridge processes derived on general manifolds without requiring heat kernel estimations. We develop a geometric understanding of the mixture process, deriving the drift as a weighted mean of tangent directions to the data points that guides the process toward the data distribution. We further propose a scalable training objective for learning the mixture process that readily applies to general manifolds. Our method achieves superior performance on diverse manifolds with a dramatically reduced number of in-training simulation steps for general manifolds.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaehyeong Jo;Sung Ju Hwang", "authorids": "~Jaehyeong_Jo1;~Sung_Ju_Hwang1", "gender": "M;", "homepage": "https://github.com/harryjo97;", "dblp": "296/2037;", "google_scholar": "https://scholar.google.com/citations?hl=ko;", "orcid": ";", "linkedin": ";", "or_profile": "~Jaehyeong_Jo1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;", "position": "MS student;", "bibtex": "@inproceedings{\njo2024generative,\ntitle={Generative Modeling on Manifolds Through Mixture of Riemannian Diffusion Processes},\nauthor={Jaehyeong Jo and Sung Ju Hwang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=60HydCpCMZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4696330, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15176113037263379386&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "When Will Gradient Regularization Be Harmful?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34951", "id": "60vC1FY0dZ", "proceeding": "https://proceedings.mlr.press/v235/zhao24t.html", "pdf": "https://openreview.net/pdf?id=60vC1FY0dZ", "openreview": "https://openreview.net/forum?id=60vC1FY0dZ", "author_site": "Yang Zhao, Hao Zhang, Xiuyuan Hu", "tldr": "", "abstract": "Gradient regularization (GR), which aims to penalize the gradient norm atop the loss function, has shown promising results in training modern over-parameterized deep neural networks. However, can we trust this powerful technique? This paper reveals that GR can cause performance degeneration in adaptive optimization scenarios, particularly with learning rate warmup.
Our empirical and theoretical analyses suggest this is due to GR inducing instability and divergence in gradient statistics of adaptive optimizers at the initial training stage. Inspired by the warmup heuristic, we propose three GR warmup strategies, each relaxing the regularization effect to a certain extent during the warmup course to ensure the accurate and stable accumulation of gradients. With experiments on the Vision Transformer family, we confirm that the three GR warmup strategies can effectively circumvent these issues, thereby largely improving the model performance. Meanwhile, we note that scalable models tend to rely more on the GR warmup, where the performance can be improved by up to 3% on Cifar10 compared to baseline GR. Code is available at https://github.com/zhaoyang-0204/gnp.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Zhao;Hao Zhang;Xiuyuan Hu", "authorids": "~Yang_Zhao11;~Hao_Zhang37;~Xiuyuan_Hu1", "gender": "M;M;M", "homepage": ";http://ee.tsinghua.edu.cn;https://hxyfighter.github.io/", "dblp": "50/2082-16;;180/4559", "google_scholar": "KF9ag1sAAAAJ;;8cRupWIAAAAJ", "orcid": "0000-0001-5883-2799;;0009-0003-5543-0972", "linkedin": ";;xiuyuan-hu-30b7a8201/", "or_profile": "~Yang_Zhao11;~Hao_Zhang37;~Xiuyuan_Hu1", "aff": "Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn", "position": "Postdoc;;PhD student", "bibtex": "@inproceedings{\nzhao2024when,\ntitle={When Will Gradient Regularization Be Harmful?},\nauthor={Yang Zhao and Hao Zhang and Xiuyuan Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=60vC1FY0dZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 824817, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:x0r5ILKHBMIJ:scholar.google.com/&scioq=When+Will+Gradient+Regularization+Be+Harmful%3F&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "tsinghua.edu.cn;;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "No Wrong Turns: The Simple Geometry Of Neural Networks Optimization Paths", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34950", "id": "60vx5AfM3C", "proceeding": "https://proceedings.mlr.press/v235/guille-escuret24a.html", "pdf": "https://openreview.net/pdf?id=60vx5AfM3C", "openreview": "https://openreview.net/forum?id=60vx5AfM3C", "author_site": "Charles Guille-Escuret, Hiroki Naganuma, Kilian Fatras, Ioannis Mitliagkas", "tldr": "", "abstract": "Understanding the optimization dynamics of neural networks is necessary for closing the gap between theory and practice. Stochastic first-order optimization algorithms are known to efficiently locate favorable minima in deep neural networks. This efficiency, however, contrasts with the non-convex and seemingly complex structure of neural loss landscapes. In this study, we delve into the fundamental geometric properties of sampled gradients along optimization paths. We focus on two key quantities, the restricted secant inequality and error bound, as well as their ratio \u03b3, which hold high significance for first-order optimization.
Our analysis reveals that these quantities exhibit predictable, consistent behavior throughout training, despite the stochasticity induced by sampling minibatches. Our findings suggest that not only do optimization trajectories never encounter significant obstacles, but they also maintain stable dynamics during the majority of training. These observed properties are sufficiently expressive to theoretically guarantee linear convergence and prescribe learning rate schedules mirroring empirical practices. We conduct our experiments on image classification, semantic segmentation and language modeling across different batch sizes, network architectures, datasets, optimizers, and initialization seeds. We discuss the impact of each factor. Our work provides novel insights into the properties of neural network loss functions, and opens the door to theoretical frameworks more relevant to prevalent practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Charles Guille-Escuret;Hiroki Naganuma;Kilian FATRAS;Ioannis Mitliagkas", "authorids": "~Charles_Guille-Escuret1;~Hiroki_Naganuma1;~Kilian_FATRAS1;~Ioannis_Mitliagkas1", "gender": "M;M;M;M", "homepage": ";https://hiroki11x.github.io/;http://kilianfatras.github.io;http://mitliagkas.github.io/", "dblp": "243/7039;206/0082;;83/8757", "google_scholar": "VNgVRmgAAAAJ;https://scholar.google.co.jp/citations?user=xx7O2voAAAAJ;https://scholar.google.ca/citations?user=DHMjyDgAAAAJ;K757SxgAAAAJ", "orcid": ";0000-0002-4595-8381;;", "linkedin": ";hiroki11x/;;", "or_profile": "~Charles_Guille-Escuret1;~Hiroki_Naganuma1;~Kilian_FATRAS1;~Ioannis_Mitliagkas1", "aff": "ServiceNow;Microsoft Research;McGill University;Mila - Quebec AI Institute", "aff_domain": "servicenow.com;microsoft.com;mcgill.ca;mila.quebec", "position": "Intern;Intern;Postdoc;Principal Researcher", "bibtex": "@inproceedings{\nguille-escuret2024no,\ntitle={No Wrong Turns: The Simple Geometry Of Neural Networks Optimization Paths},\nauthor={Charles Guille-Escuret and Hiroki Naganuma and Kilian FATRAS and Ioannis Mitliagkas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=60vx5AfM3C}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8528686, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7255617281957896084&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": "servicenow.com;microsoft.com;mcgill.ca;mila.quebec", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "ServiceNow;Microsoft;McGill University;Quebec AI Institute", "aff_unique_dep": ";Microsoft Research;;AI Institute", "aff_unique_url": "https://www.servicenow.com;https://www.microsoft.com/en-us/research;https://www.mcgill.ca;https://mila.quebec", "aff_unique_abbr": "ServiceNow;MSR;McGill;Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Tilt and Average : Geometric Adjustment of the Last Layer for Recalibration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34949", "id": "61A1bsVjRg", "proceeding": "https://proceedings.mlr.press/v235/cho24g.html", "pdf": "https://openreview.net/pdf?id=61A1bsVjRg", "openreview": "https://openreview.net/forum?id=61A1bsVjRg", "author_site": "Gyusang Cho, Chan-Hyun Youn", "tldr": "", "abstract": "After the revelation 
that neural networks tend to produce overconfident predictions, the problem of calibration, which aims to align confidence with accuracy to enhance the reliability of predictions, has gained significant importance. Several solutions based on calibration maps have been proposed to address the problem of recalibrating a trained classifier using additional datasets. In this paper, we offer an algorithm that transforms the weights of the last layer of the classifier, distinct from the calibration-map-based approach. We concentrate on the geometry of the final linear layer, specifically its angular aspect, and adjust the weights of the corresponding layer. We name the method Tilt and Average, and validate the calibration effect empirically and theoretically. Through this, we demonstrate that our approach, in addition to the existing calibration-map-based techniques, can yield improved calibration performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gyusang Cho;Chan-Hyun Youn", "authorids": "~Gyusang_Cho1;~Chan-Hyun_Youn1", "gender": "M;M", "homepage": "http://ncl.kaist.ac.kr;http://ncl.kaist.ac.kr", "dblp": "15/4738;31/5293", "google_scholar": "Fjd05KwAAAAJ;https://scholar.google.co.kr/scholar?q=chan-hyun+youn", "orcid": ";0000-0002-3970-7308", "linkedin": ";", "or_profile": "~Gyusang_Cho1;~Chan-Hyun_Youn1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\ncho2024tilt,\ntitle={Tilt and Average : Geometric Adjustment of the Last Layer for Recalibration},\nauthor={Gyusang Cho and Chan-Hyun Youn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=61A1bsVjRg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2651351, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mGoFraSg8XQJ:scholar.google.com/&scioq=Tilt+and+Average+:+Geometric+Adjustment+of+the+Last+Layer+for+Recalibration&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "kaist.ac.kr;kaist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Neural Tangent Kernels Motivate Cross-Covariance Graphs in Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34948", "id": "61JD8wp4Id", "proceeding": "https://proceedings.mlr.press/v235/khalafi24a.html", "pdf": "https://openreview.net/pdf?id=61JD8wp4Id", "openreview": "https://openreview.net/forum?id=61JD8wp4Id", "author_site": "Shervin Khalafi, Saurabh Sihag, Alejandro Ribeiro", "tldr": "", "abstract": "Neural tangent kernels (NTKs) provide a theoretical regime to analyze the learning and generalization behavior of over-parametrized neural networks. For a supervised learning task, the association between the eigenvectors of the NTK and given data (a concept referred to as alignment in this paper) can govern the rate of convergence of gradient descent, as well as generalization to unseen data. 
Building upon this concept and leveraging the structure of NTKs for graph neural networks (GNNs), we theoretically investigate NTKs and alignment, where our analysis reveals that optimizing the alignment translates to optimizing the graph representation or the graph shift operator (GSO) in a GNN. Our results further establish theoretical guarantees on the optimality of the alignment for a two-layer GNN and these guarantees are characterized by the graph shift operator being a function of the cross-covariance between the input and the output data. The theoretical insights drawn from the analysis of NTKs are validated by our experiments focused on a multi-variate time series prediction task for a publicly available dataset. Specifically, they demonstrate that GNN-based learning models that operate on the cross-covariance matrix indeed outperform those that operate on the covariance matrix estimated from only the input data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shervin Khalafi;Saurabh Sihag;Alejandro Ribeiro", "authorids": "~Shervin_Khalafi1;~Saurabh_Sihag1;~Alejandro_Ribeiro1", "gender": "M;M;M", "homepage": "https://shervinkhalafi.github.io/;https://sihags.github.io/;https://alelab.seas.upenn.edu", "dblp": ";172/0928;32/15", "google_scholar": "rdfxlq8AAAAJ;T8D94-QAAAAJ;7mrPM4kAAAAJ", "orcid": ";;0000-0003-4230-9906", "linkedin": "shervin-khalafi-316b03221/;;", "or_profile": "~Shervin_Khalafi1;~Saurabh_Sihag1;~Alejandro_Ribeiro1", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;upenn.edu;upenn.edu", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nkhalafi2024neural,\ntitle={Neural Tangent Kernels Motivate Cross-Covariance Graphs in Neural Networks},\nauthor={Shervin Khalafi and Saurabh Sihag and Alejandro Ribeiro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=61JD8wp4Id}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2018388, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Tv5skuOnvmoJ:scholar.google.com/&scioq=Neural+Tangent+Kernels+Motivate+Cross-Covariance+Graphs+in+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "upenn.edu;upenn.edu;upenn.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "MaSS: Multi-attribute Selective Suppression for Utility-preserving Data Transformation from an Information-theoretic Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34947", "id": "61RlaY9EIn", "proceeding": "https://proceedings.mlr.press/v235/chen24f.html", "pdf": "https://openreview.net/pdf?id=61RlaY9EIn", "openreview": "https://openreview.net/forum?id=61RlaY9EIn", "author_site": "Yizhuo Chen, Chun-Fu (Richard) Chen, Hsiang Hsu, Shaohan Hu, Marco Pistoia, Tarek Abdelzaher", "tldr": "", "abstract": "The growing richness of large-scale datasets has been crucial in driving the rapid advancement and wide adoption of machine learning technologies. 
The massive collection and usage of data, however, pose an increasing risk for people's private and sensitive information due to either inadvertent mishandling or malicious exploitation. Besides legislative solutions, many technical approaches have been proposed towards data privacy protection. However, they bear various limitations such as leading to degraded data availability and utility, or relying on heuristics and lacking solid theoretical bases. To overcome these limitations, we propose a formal information-theoretic definition for this utility-preserving privacy protection problem, and design a data-driven learnable data transformation framework that is capable of selectively suppressing sensitive attributes from target datasets while preserving the other useful attributes, regardless of whether or not they are known in advance or explicitly annotated for preservation. We provide rigorous theoretical analyses on the operational bounds for our framework, and carry out comprehensive experimental evaluations using datasets of a variety of modalities, including facial images, voice audio clips, and human activity motion sensor signals. Results demonstrate the effectiveness and generalizability of our method under various configurations on a multitude of tasks. Our source code is available at this [URL](https://arxiv.org/abs/2405.14981).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yizhuo Chen;Chun-Fu Chen;Hsiang Hsu;Shaohan Hu;Marco Pistoia;Tarek F. Abdelzaher", "authorids": "~Yizhuo_Chen2;~Chun-Fu_Chen1;~Hsiang_Hsu1;~Shaohan_Hu2;~Marco_Pistoia2;~Tarek_F._Abdelzaher1", "gender": ";M;M;;M;M", "homepage": "https://yizhuochen99.github.io/;;https://hsianghsu.github.io;https://hushaohan.github.io/;https://sites.google.com/view/marcopistoia/;http://abdelzaher.cs.illinois.edu/", "dblp": ";48/915;;;p/MarcoPistoia;a/TarekFAbdelzaher", "google_scholar": ";9gqd5cYAAAAJ;https://scholar.google.com.tw/citations?user=JRl3iYIAAAAJ;m9dh1GoAAAAJ;tj2A8SUAAAAJ;https://scholar.google.com.tw/citations?user=cA28Zs0AAAAJ", "orcid": ";;0000-0001-8084-3929;;0000-0001-9002-1128;0000-0003-3883-7220", "linkedin": ";;;;pistoia/;tarek-abdelzaher-0216071/", "or_profile": "~Yizhuo_Chen2;~Chun-Fu_Chen1;~Hsiang_Hsu1;~Shaohan_Hu2;~Marco_Pistoia2;~Tarek_Abdelzaher1", "aff": "J.P. Morgan Chase;JPMorganChase, GTAR;JP Morgan & Chase Bank;J.P. Morgan Chase;J.P. Morgan Chase;University of Illinois, Urbana Champaign", "aff_domain": "jpmchase.com;jpmchase.com;jpmchase.com;jpmchase.com;jpmorgan.com;illinois.edu", "position": "Intern;Executive Director;Researcher;Executive Director;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nchen2024mass,\ntitle={Ma{SS}: Multi-attribute Selective Suppression for Utility-preserving Data Transformation from an Information-theoretic Perspective},\nauthor={Yizhuo Chen and Chun-Fu Chen and Hsiang Hsu and Shaohan Hu and Marco Pistoia and Tarek F. 
Abdelzaher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=61RlaY9EIn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 682575, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PPLSJu5WWbMJ:scholar.google.com/&scioq=MaSS:+Multi-attribute+Selective+Suppression+for+Utility-preserving+Data+Transformation+from+an+Information-theoretic+Perspective&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "jpmchase.com;jpmchase.com;jpmchase.com;jpmchase.com;jpmorgan.com;illinois.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "JPMorgan Chase & Co.;JPMorgan Chase;University of Illinois Urbana-Champaign", "aff_unique_dep": ";Global Technology, Analytics, and Research (GTAR);", "aff_unique_url": "https://www.jpmorganchase.com;https://www.jpmorganchase.com;https://illinois.edu", "aff_unique_abbr": "JPM;JPM;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online bipartite matching with imperfect advice", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34946", "id": "61WtHsVKWF", "proceeding": "https://proceedings.mlr.press/v235/choo24a.html", "pdf": "https://openreview.net/pdf?id=61WtHsVKWF", "openreview": "https://openreview.net/forum?id=61WtHsVKWF", "author_site": "Davin Choo, Themis Gouleakis, Chun Kai Ling, Arnab Bhattacharyya", "tldr": "", "abstract": "We study the problem of online unweighted bipartite matching with $n$ offline vertices and $n$ online vertices where one wishes to be competitive against the optimal offline algorithm. While the classic RANKING algorithm of (Karp et al., 1990) provably attains competitive ratio of $1-1/e > 1/2$, we show that no learning-augmented method can be both 1-consistent and strictly better than 1/2-robust under the adversarial arrival model. 
Meanwhile, under the random arrival model, we show how one can utilize methods from distribution testing to design an algorithm that takes in external advice about the online vertices and provably achieves a competitive ratio interpolating between any ratio attainable by advice-free methods and the optimal ratio of 1, depending on the advice quality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Davin Choo;Themistoklis Gouleakis;Chun Kai Ling;Arnab Bhattacharyya", "authorids": "~Davin_Choo1;~Themistoklis_Gouleakis2;~Chun_Kai_Ling2;~Arnab_Bhattacharyya1", "gender": ";M;M;M", "homepage": "http://davinchoo.com/;https://www.mit.edu/~tgoule/;https://lingchunkai.github.io/;https://warwick.ac.uk/fac/sci/dcs/people/arnab_bhattacharyya/", "dblp": "230/4363.html;122/9933;172/1134;64/574.html", "google_scholar": "cPtzhPsAAAAJ;;foMuvWEAAAAJ;eECXWqUAAAAJ", "orcid": "0000-0002-4545-7341;;;", "linkedin": ";;;", "or_profile": "~Davin_Choo1;~Themistoklis_Gouleakis2;~Chun_Kai_Ling2;~Arnab_Bhattacharyya1", "aff": "National University of Singapore;National University of Singapore;Columbia University;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu.sg;columbia.edu;nus.edu.sg", "position": "PhD student;Postdoc;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nchoo2024online,\ntitle={Online bipartite matching with imperfect advice},\nauthor={Davin Choo and Themistoklis Gouleakis and Chun Kai Ling and Arnab Bhattacharyya},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=61WtHsVKWF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 520039, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3809971196348375243&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": "u.nus.edu;nus.edu.sg;columbia.edu;nus.edu.sg", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "National University of Singapore;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.columbia.edu", "aff_unique_abbr": "NUS;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Singapore;United States" }, { "title": "Quality-Diversity with Limited Resources", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34945", "id": "64I29YeQdt", "proceeding": "https://proceedings.mlr.press/v235/wang24cd.html", "pdf": "https://openreview.net/pdf?id=64I29YeQdt", "openreview": "https://openreview.net/forum?id=64I29YeQdt", "author_site": "Ren-Jian Wang, Ke Xue, Cong Guan, Chao Qian", "tldr": "", "abstract": "Quality-Diversity (QD) algorithms have emerged as a powerful optimization paradigm with the aim of generating a set of high-quality and diverse solutions. To achieve such a challenging goal, QD algorithms require maintaining a large archive and a large population in each iteration, which brings two main issues: sample and resource efficiency. Most advanced QD algorithms focus on improving sample efficiency, while resource efficiency is overlooked to some extent. Particularly, the resource overhead during the training process has not been addressed yet, hindering the wider application of QD algorithms.
In this paper, we highlight this important research question, i.e., how to efficiently train QD algorithms with limited resources, and propose a novel and effective method called RefQD to address it. RefQD decomposes a neural network into representation and decision parts, and shares the representation part with all decision parts in the archive to reduce the resource overhead. It also employs a series of strategies to address the mismatch issue between the old decision parts and the newly updated representation part. Experiments on different types of tasks, from small to large resource consumption, demonstrate the excellent performance of RefQD: it not only uses significantly fewer resources (e.g., 16% of the GPU memory on QDax and 3.7% on Atari) but also achieves performance comparable to or better than that of sample-efficient QD algorithms. Our code is available at [https://github.com/lamda-bbo/RefQD](https://github.com/lamda-bbo/RefQD).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ren-Jian Wang;Ke Xue;Cong Guan;Chao Qian", "authorids": "~Ren-Jian_Wang1;~Ke_Xue1;~Cong_Guan1;~Chao_Qian1", "gender": "Not Specified;M;M;M", "homepage": "https://www.lamda.nju.edu.cn/wangrj;http://www.lamda.nju.edu.cn/xuek/;http://www.lamda.nju.edu.cn/guanc/;http://www.lamda.nju.edu.cn/qianc/", "dblp": "354/0651;93/2469-1;191/7206;84/8508-1", "google_scholar": "FQNe_R0AAAAJ;78bZVOwAAAAJ;;", "orcid": ";0000-0001-6789-2670;;", "linkedin": ";;;", "or_profile": "~Ren-Jian_Wang1;~Ke_Xue1;~Cong_Guan1;~Chao_Qian1", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing university", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nwang2024qualitydiversity,\ntitle={Quality-Diversity with Limited Resources},\nauthor={Ren-Jian Wang and Ke Xue and Cong Guan and Chao Qian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=64I29YeQdt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1726424, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5984219800099046772&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Deep Stochastic Mechanics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34944", "id": "64MQCia06B", "proceeding": "https://proceedings.mlr.press/v235/orlova24a.html", "pdf": "https://openreview.net/pdf?id=64MQCia06B", "openreview": "https://openreview.net/forum?id=64MQCia06B", "author_site": "Elena Orlova, Aleksei Ustimenko, Ruoxi Jiang, Peter Y. Lu, Rebecca Willett", "tldr": "", "abstract": "This paper introduces a novel deep-learning-based approach for numerical simulation of a time-evolving Schr\u00f6dinger equation inspired by stochastic mechanics and generative diffusion models.
Unlike existing approaches, which exhibit computational complexity that scales exponentially in the problem dimension, our method allows us to adapt to the latent low-dimensional structure of the wave function by sampling from the Markovian diffusion. Depending on the latent dimension, our method may have far lower computational complexity in higher dimensions. Moreover, we propose novel equations for stochastic quantum mechanics, resulting in quadratic computational complexity with respect to the number of dimensions. Numerical simulations verify our theoretical findings and show a significant advantage of our method compared to other deep-learning-based approaches used for quantum mechanics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elena Orlova;Aleksei Ustimenko;Ruoxi Jiang;Peter Y. Lu;Rebecca Willett", "authorids": "~Elena_Orlova1;~Aleksei_Ustimenko1;~Ruoxi_Jiang1;~Peter_Y._Lu1;~Rebecca_Willett1", "gender": ";M;F;;F", "homepage": ";;https://roxie62.github.io/;;https://willett.psd.uchicago.edu/", "dblp": ";242/3873;296/0221;;w/RebeccaWillett", "google_scholar": ";OES5pK4AAAAJ;fbVZpI4AAAAJ;;bGRVPl8AAAAJ", "orcid": ";;;;0000-0002-8109-7582", "linkedin": ";;ruoxi-jiang/;;rebecca-willett-90b95973/", "or_profile": "~Elena_Orlova1;~Aleksei_Ustimenko1;~Ruoxi_Jiang1;~Peter_Y._Lu1;~Rebecca_Willett1", "aff": ";ShareChat;University of Chicago;;University of Chicago", "aff_domain": ";sharechat.co;uchicago.edu;;uchicago.edu", "position": ";Researcher;PhD student;;Full Professor", "bibtex": "@inproceedings{\norlova2024deep,\ntitle={Deep Stochastic Mechanics},\nauthor={Elena Orlova and Aleksei Ustimenko and Ruoxi Jiang and Peter Y. Lu and Rebecca Willett},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=64MQCia06B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5468649, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4334904437321309573&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": ";sharechat.co;uchicago.edu;;uchicago.edu", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "ShareChat;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.sharechat.com;https://www.uchicago.edu", "aff_unique_abbr": ";UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "India;United States" }, { "title": "Learning to Stabilize Online Reinforcement Learning in Unbounded State Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34943", "id": "64fdhmogiD", "proceeding": "https://proceedings.mlr.press/v235/pavse24a.html", "pdf": "https://openreview.net/pdf?id=64fdhmogiD", "openreview": "https://openreview.net/forum?id=64fdhmogiD", "author_site": "Brahma Pavse, Matthew Zurek, Yudong Chen, Qiaomin Xie, Josiah Hanna", "tldr": "", "abstract": "In many reinforcement learning (RL) applications, we want policies that reach desired states and then keep the controlled system within an acceptable region around the desired states over an indefinite period of time. This latter objective is called *stability* and is especially important when the state space is unbounded, such that the states can be arbitrarily far from each other and the agent can drift far away from the desired states. 
For example, in stochastic queuing networks, where queues of waiting jobs can grow without bound, the desired state is all-zero queue lengths. Here, a stable policy ensures queue lengths are finite while an optimal policy minimizes queue lengths. Since an optimal policy is also stable, one would expect that RL algorithms would implicitly give us stable policies. However, in this work, we find that deep RL algorithms that directly minimize the distance to the desired state during online training often result in unstable policies, i.e., policies that drift far away from the desired state. We attribute this instability to poor credit-assignment for destabilizing actions. We then introduce an approach based on two ideas: 1) a Lyapunov-based cost-shaping technique and 2) state transformations to the unbounded state space. We conduct an empirical study on various queueing networks and traffic signal control problems and find that our approach performs competitively against strong baselines with knowledge of the transition dynamics. Our code is available here: https://github.com/Badger-RL/STOP", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Brahma S Pavse;Matthew Zurek;Yudong Chen;Qiaomin Xie;Josiah P. Hanna", "authorids": "~Brahma_S_Pavse1;~Matthew_Zurek1;~Yudong_Chen1;~Qiaomin_Xie1;~Josiah_P._Hanna1", "gender": "M;;M;F;", "homepage": "https://brahmasp.github.io/;;https://pages.cs.wisc.edu/~yudongchen/;https://qiaominxie.github.io/;", "dblp": "243/3510;;15/1975-1;37/10269;", "google_scholar": "2Dc_GnUAAAAJ;;ze5rCdwAAAAJ;RVNcy4EAAAAJ;", "orcid": ";;0000-0002-6416-5635;;", "linkedin": ";;;;", "or_profile": "~Brahma_S_Pavse1;~Matthew_Zurek1;~Yudong_Chen1;~Qiaomin_Xie1;~Josiah_P._Hanna1", "aff": "University of Wisconsin - Madison;;Department of Computer Sciences, University of Wisconsin - Madison;University of Wisconsin - Madison;", "aff_domain": "wisc.edu;;cs.wisc.edu;wisc.edu;", "position": "PhD student;;Associate Professor;Assistant Professor;", "bibtex": "@inproceedings{\npavse2024learning,\ntitle={Learning to Stabilize Online Reinforcement Learning in Unbounded State Spaces},\nauthor={Brahma S Pavse and Matthew Zurek and Yudong Chen and Qiaomin Xie and Josiah P. 
Hanna},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=64fdhmogiD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3455728, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15670335490965013875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "wisc.edu;;cs.wisc.edu;wisc.edu;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "MindEye2: Shared-Subject Models Enable fMRI-To-Image With 1 Hour of Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34942", "id": "65XKBGH5PO", "proceeding": "https://proceedings.mlr.press/v235/scotti24a.html", "pdf": "https://openreview.net/pdf?id=65XKBGH5PO", "openreview": "https://openreview.net/forum?id=65XKBGH5PO", "author_site": "Paul Scotti, Mihir Tripathy, Cesar Kadir Torrico Villanueva, Reese Kneeland, Tong Chen, Ashutosh Narang, Charan Santhirasegaran, Jonathan Xu, Thomas Naselaris, Kenneth Norman, Tanishq Abraham", "tldr": "", "abstract": "Reconstructions of visual perception from brain activity have improved tremendously, but the practical utility of such methods has been limited. This is because such models are trained independently per subject where each subject requires dozens of hours of expensive fMRI training data to attain high-quality results. The present work showcases high-quality reconstructions using only 1 hour of fMRI training data. We pretrain our model across 7 subjects and then fine-tune on minimal data from a new subject. Our novel functional alignment procedure linearly maps all brain data to a shared-subject latent space, followed by a shared non-linear mapping to CLIP image space. We then map from CLIP space to pixel space by fine-tuning Stable Diffusion XL to accept CLIP latents as inputs instead of text. This approach improves out-of-subject generalization with limited training data and also attains state-of-the-art image retrieval and reconstruction metrics compared to single-subject approaches. MindEye2 demonstrates how accurate reconstructions of perception are possible from a single visit to the MRI facility. All code is available on Github: https://github.com/MedARC-AI/MindEyeV2", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paul Steven Scotti;Mihir Tripathy;Cesar Torrico;Reese Kneeland;Tong Chen;Ashutosh Narang;Charan Santhirasegaran;Jonathan Xu;Thomas Naselaris;Kenneth A. 
Norman;Tanishq Mathew Abraham", "authorids": "~Paul_Steven_Scotti1;~Mihir_Tripathy1;~Cesar_Torrico1;~Reese_Kneeland1;~Tong_Chen14;~Ashutosh_Narang1;~Charan_Santhirasegaran1;~Jonathan_Xu1;~Thomas_Naselaris3;~Kenneth_A._Norman2;~Tanishq_Mathew_Abraham1", "gender": "M;M;M;M;M;M;M;;;;M", "homepage": "https://paulscotti.github.io/;https://mihirneal.com;;https://www.alljoined.com/;https://davismeee.github.io/;https://conscioustahoe.github.io/;;https://jonathanxu.com;;;https://tanishq.ai", "dblp": "323/1744;;;346/0193;;;;201/7567;;;", "google_scholar": "AE-k4ukAAAAJ;;;https://scholar.google.com/citations?hl=en;A-IPJPkAAAAJ;;;8oFMHzwAAAAJ;;;hIyhkfQAAAAJ", "orcid": "0000-0003-4912-8809;;;0009-0005-7330-4499;0000-0003-4312-7151;;;0009-0006-9236-307X;;;", "linkedin": "paulscotti/;;ckadirt;reese-kneeland/;;;charan-segaran/;jonathanxu01/;;;", "or_profile": "~Paul_Steven_Scotti1;~Mihir_Tripathy1;~Cesar_Torrico1;~Reese_Kneeland1;~Tong_Chen14;~Ashutosh_Narang1;~Charan_Santhirasegaran1;~Jonathan_Xu1;~Thomas_Naselaris3;~Kenneth_A._Norman2;~Tanishq_Mathew_Abraham1", "aff": "Stability AI;Yale University;MedARC;University of Minnesota - Twin Cities;University of Sydney, University of Sydney;MedARC;Columbia University;University of Waterloo;;;Stability AI", "aff_domain": "stability.ai;yale.edu;medarc.ai;umn.edu;uni.sydney.edu.au;medarc.ai;columbia.edu;uwaterloo.ca;;;stability.ai", "position": "Principal Researcher;Intern;Researcher;PhD student;PhD student;Researcher;MS student;Undergrad student;;;Research Director", "bibtex": "@inproceedings{\nscotti2024mindeye,\ntitle={MindEye2: Shared-Subject Models Enable f{MRI}-To-Image With 1 Hour of Data},\nauthor={Paul Steven Scotti and Mihir Tripathy and Cesar Torrico and Reese Kneeland and Tong Chen and Ashutosh Narang and Charan Santhirasegaran and Jonathan Xu and Thomas Naselaris and Kenneth A. Norman and Tanishq Mathew Abraham},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=65XKBGH5PO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8592538, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2810291302970296763&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 10, "email": "stability.ai;yale.edu;medarc.ai;umn.edu;uni.sydney.edu.au;medarc.ai;columbia.edu;uwaterloo.ca;;;stability.ai", "author_num": 11, "aff_unique_index": "0;1;2;3;4;2;5;6;0", "aff_unique_norm": "Stability AI;Yale University;MedARC;University of Minnesota;University of Sydney;Columbia University;University of Waterloo", "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://stability.ai;https://www.yale.edu;https://www.medarc.org;https://www.minnesota.edu;https://www.sydney.edu.au;https://www.columbia.edu;https://uwaterloo.ca", "aff_unique_abbr": "Stability AI;Yale;;UMN;USYD;Columbia;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Twin Cities", "aff_country_unique_index": "0;0;0;0;1;0;0;2;0", "aff_country_unique": "United States;Australia;Canada" }, { "title": "Position: An Inner Interpretability Framework for AI Inspired by Lessons from Cognitive Neuroscience", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34941", "id": "66KmnMhGU5", "proceeding": "https://proceedings.mlr.press/v235/vilas24a.html", "pdf": "https://openreview.net/pdf?id=66KmnMhGU5", "openreview": "https://openreview.net/forum?id=66KmnMhGU5", "author_site": "Martina G. 
Vilas, Federico Adolfi, David Poeppel, Gemma Roig", "tldr": "", "abstract": "Inner Interpretability is a promising emerging field tasked with uncovering the inner mechanisms of AI systems, though how to develop these mechanistic theories is still much debated. Moreover, recent critiques raise issues that question its usefulness to advance the broader goals of AI. However, it has been overlooked that these issues resemble those that have been grappled with in another field: Cognitive Neuroscience. Here we draw the relevant connections and highlight lessons that can be transferred productively between fields. Based on these, we propose a general conceptual framework and give concrete methodological strategies for building mechanistic explanations in AI inner interpretability research. With this conceptual framework, Inner Interpretability can fend off critiques and position itself on a productive path to explain AI systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Martina G. Vilas;Federico Adolfi;David Poeppel;Gemma Roig", "authorids": "~Martina_G._Vilas1;~Federico_Adolfi1;~David_Poeppel1;~Gemma_Roig1", "gender": "F;;M;F", "homepage": "https://martinagvilas.github.io/;;;http://www.cvai.cs.uni-frankfurt.de/", "dblp": ";;;58/9606", "google_scholar": ";;9EyT1mYAAAAJ;6MjMhT4AAAAJ", "orcid": "0000-0002-1097-8534;;;0000-0002-6439-8076", "linkedin": "martinagvilas;;;gemma-roig-5830b414/", "or_profile": "~Martina_G._Vilas1;~Federico_Adolfi1;~David_Poeppel1;~Gemma_Roig1", "aff": "Goethe University;;New York University;Johann Wolfgang Goethe Universit\u00e4t Frankfurt am Main", "aff_domain": "cs.uni-frankfurt.de;;nyu.edu;uni-frankfurt.de", "position": "PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nvilas2024position,\ntitle={Position: An Inner Interpretability Framework for {AI} Inspired by Lessons from Cognitive Neuroscience},\nauthor={Martina G. 
Vilas and Federico Adolfi and David Poeppel and Gemma Roig},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=66KmnMhGU5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1087377, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6974819136727392732&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "cs.uni-frankfurt.de;;nyu.edu;uni-frankfurt.de", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Goethe University Frankfurt;New York University;Johann Wolfgang Goethe University Frankfurt am Main", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-frankfurt.de;https://www.nyu.edu;https://www.uni-frankfurt.de", "aff_unique_abbr": "GU;NYU;JWGU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Frankfurt;;Frankfurt am Main", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Towards Efficient Exact Optimization of Language Model Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34940", "id": "66k81s33p3", "proceeding": "https://proceedings.mlr.press/v235/ji24c.html", "pdf": "https://openreview.net/pdf?id=66k81s33p3", "openreview": "https://openreview.net/forum?id=66k81s33p3", "author_site": "Haozhe Ji, Cheng Lu, Yilin Niu, Pei Ke, Hongning Wang, Jun Zhu, Jie Tang, Minlie Huang", "tldr": "", "abstract": "The alignment of language models with human preferences is vital for their application in real-world tasks. The problem is formulated as optimizing the model's policy to maximize the expected reward that reflects human preferences with minimal deviation from the initial policy. While considered as a straightforward solution, reinforcement learning (RL) suffers from high variance in policy updates, which impedes efficient policy improvement. Recently, direct preference optimization (DPO) was proposed to directly optimize the policy from preference data. However, we show that DPO derived based on the optimal solution of the problem leads to a compromised mean-seeking approximation of the optimal solution in practice. In this paper, we propose efficient exact optimization (EXO) of the alignment objective. EXO is guaranteed to optimize in the same direction as RL algorithms asymptotically for arbitrary policy parametrization. This leads to the same mode-seeking solution, while enables efficient optimization by circumventing the complexities of RL. We also compare our method to DPO with both theoretical and empirical analyses, and further demonstrate the advantages of our method over existing approaches on realistic human preference data. 
Code is available at https://github.com/haozheji/exact-optimization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haozhe Ji;Cheng Lu;Yilin Niu;Pei Ke;Hongning Wang;Jun Zhu;Jie Tang;Minlie Huang", "authorids": "~Haozhe_Ji2;~Cheng_Lu5;~Yilin_Niu1;~Pei_Ke2;~Hongning_Wang1;~Jun_Zhu2;~Jie_Tang1;~Minlie_Huang1", "gender": "M;M;M;M;M;M;;M", "homepage": "https://haozheji.github.io/;https://luchengthu.github.io/;;https://kepei1106.github.io/;http://www.cs.virginia.edu/~hw5x/;http://ml.cs.tsinghua.edu.cn/~jun;;http://coai.cs.tsinghua.edu.cn/hml", "dblp": "222/9546;91/1482-11;199/3657;10/2179;05/6545;50/2644-1;;", "google_scholar": "EE5Z7mUAAAAJ;vPE9VRoAAAAJ;;W_zPCtEAAAAJ;qkdvKNoAAAAJ;axsP38wAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;0000-0002-6524-9195;;;", "linkedin": "%E6%98%8A%E5%93%B2-%E8%AE%A1-69722313b/;;;;;;;", "or_profile": "~Haozhe_Ji2;~Cheng_Lu5;~Yilin_Niu1;~Pei_Ke2;~Hongning_Wang1;~Jun_Zhu2;~Jie_Tang1;~Minlie_Huang1", "aff": "Tsinghua University;Tsinghua University;;Tsinghua University;Tsinghua University;Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;;tsinghua.edu.cn", "position": "PhD student;PhD student;;Postdoc;Associate Professor;Professor;;Full Professor", "bibtex": "@inproceedings{\nji2024towards,\ntitle={Towards Efficient Exact Optimization of Language Model Alignment},\nauthor={Haozhe Ji and Cheng Lu and Yilin Niu and Pei Ke and Hongning Wang and Jun Zhu and Jie Tang and Minlie Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=66k81s33p3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4352318, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15100359811638383458&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;;tsinghua.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "How do Large Language Models Navigate Conflicts between Honesty and Helpfulness?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34939", "id": "685vj0lC9z", "proceeding": "https://proceedings.mlr.press/v235/liu24bb.html", "pdf": "https://openreview.net/pdf?id=685vj0lC9z", "openreview": "https://openreview.net/forum?id=685vj0lC9z", "author_site": "Ryan Liu, Theodore R Sumers, Ishita Dasgupta, Thomas Griffiths", "tldr": "", "abstract": "In day-to-day communication, people often approximate the truth --- for example, rounding the time or omitting details --- in order to be maximally helpful to the listener. How do large language models (LLMs) handle such nuanced trade-offs? To address this question, we use psychological models and experiments designed to characterize human behavior to analyze LLMs. We test a range of LLMs and explore how optimization for human preferences or inference-time reasoning affects these trade-offs. 
We find that reinforcement learning from human feedback improves both honesty and helpfulness, while chain-of-thought prompting skews LLMs towards helpfulness over honesty. Finally, GPT-4 Turbo demonstrates human-like response patterns including sensitivity to the conversational framing and the listener's decision context. Our findings reveal the conversational values internalized by LLMs and suggest that even these abstract values can, to a degree, be steered by zero-shot prompting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ryan Liu;Theodore Sumers;Ishita Dasgupta;Thomas L. Griffiths", "authorids": "~Ryan_Liu1;~Theodore_Sumers1;~Ishita_Dasgupta1;~Thomas_L._Griffiths1", "gender": "M;M;;", "homepage": "https://theryanl.github.io;https://www.tedsumers.info/;;http://cocosci.princeton.edu/tom/", "dblp": ";275/8880;169/6218;34/4472", "google_scholar": "s3McVn8AAAAJ;xZal_nUAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-6128-0291;;", "linkedin": "ryanchenliu/;;idasgupta6/;", "or_profile": "~Ryan_Liu1;~Theodore_Sumers1;~Ishita_Dasgupta1;~Thomas_L._Griffiths1", "aff": "Princeton University;Anthropic;Google DeepMind;Princeton University", "aff_domain": "princeton.edu;anthropic.com;deepmind.com;princeton.edu", "position": "PhD student;Researcher;Researcher;Professor", "bibtex": "@inproceedings{\nliu2024how,\ntitle={How do Large Language Models Navigate Conflicts between Honesty and Helpfulness?},\nauthor={Ryan Liu and Theodore Sumers and Ishita Dasgupta and Thomas L. Griffiths},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=685vj0lC9z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5984989, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6111475570798101374&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "princeton.edu;anthropic.com;deepmind.com;princeton.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Princeton University;Anthropic;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://www.princeton.edu;https://www.anthropic.com;https://deepmind.com", "aff_unique_abbr": "Princeton;Anthropic;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "An Iterative Min-Min Optimization Method for Sparse Bayesian Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34938", "id": "69RewQwWA9", "proceeding": "https://proceedings.mlr.press/v235/wang24al.html", "pdf": "https://openreview.net/pdf?id=69RewQwWA9", "openreview": "https://openreview.net/forum?id=69RewQwWA9", "author_site": "Yasen Wang, Junlin Li, Zuogong Yue, Ye Yuan", "tldr": "", "abstract": "As a well-known machine learning algorithm, sparse Bayesian learning (SBL) can find sparse representations in linear probabilistic models by imposing a sparsity-promoting prior on model coefficients. However, classical SBL algorithms lack the essential theoretical guarantees of global convergence. To address this issue, we propose an iterative Min-Min optimization method to solve the marginal likelihood function (MLF) of SBL based on the concave-convex procedure.
The method can optimize the hyperparameters related to both the prior and noise level analytically at each iteration by re-expressing MLF using auxiliary functions. In particular, we demonstrate that the method globally converges to a local minimum or saddle point of MLF. With rigorous theoretical guarantees, the proposed novel SBL algorithm outperforms classical ones in finding sparse representations on simulated and real-world examples, ranging from sparse signal recovery to system identification and kernel regression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yasen Wang;Junlin Li;Zuogong Yue;ye yuan", "authorids": "~Yasen_Wang1;~Junlin_Li4;~Zuogong_Yue1;~ye_yuan4", "gender": "M;M;M;M", "homepage": ";https://www.researchgate.net/profile/Junlin-Li-2;http://faculty.hust.edu.cn/yuezuogong/en/index.htm;https://yy311.github.io", "dblp": "166/9152;;158/6567;", "google_scholar": "https://scholar.google.com/citations?hl=en;;b3AVP4QAAAAJ;", "orcid": "0000-0002-0586-8381;0000-0003-2288-5383;0000-0001-8457-2900;", "linkedin": ";;zuogong-yue-2921a223/;", "or_profile": "~Yasen_Wang1;~Junlin_Li4;~Zuogong_Yue1;~ye_yuan4", "aff": "Huazhong University of Science and Technology;Fuyang Normal University;Huazhong University of Science and Technology;Huazhong University of Science and Technology, Tsinghua University", "aff_domain": "hust.edu.cn;fynu.edu.cn;hust.edu.cn;hust.edu.cn", "position": "PhD student;Lecturer;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024an,\ntitle={An Iterative Min-Min Optimization Method for Sparse Bayesian Learning},\nauthor={Yasen Wang and Junlin Li and Zuogong Yue and ye yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=69RewQwWA9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 394318, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15266749746389571451&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "hust.edu.cn;fynu.edu.cn;hust.edu.cn;hust.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Huazhong University of Science and Technology;Fuyang Normal University", "aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;http://www.fynu.edu.cn", "aff_unique_abbr": "HUST;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Neural NeRF Compression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34937", "id": "6BYD121JFO", "proceeding": "https://proceedings.mlr.press/v235/pham24a.html", "pdf": "https://openreview.net/pdf?id=6BYD121JFO", "openreview": "https://openreview.net/forum?id=6BYD121JFO", "author_site": "Tuan Pham, Stephan Mandt", "tldr": "", "abstract": "Neural Radiance Fields (NeRFs) have emerged as powerful tools for capturing detailed 3D scenes through continuous volumetric representations. Recent NeRFs utilize feature grids to improve rendering quality and speed; however, these representations introduce significant storage overhead. This paper presents a novel method for efficiently compressing a grid-based NeRF model, addressing the storage overhead concern. Our approach is based on the non-linear transform coding paradigm, employing neural compression for compressing the model's feature grids.
Due to the lack of training data involving many i.i.d. scenes, we design an encoder-free, end-to-end optimized approach for individual scenes, using lightweight decoders. To leverage the spatial inhomogeneity of the latent feature grids, we introduce an importance-weighted rate-distortion objective and a sparse entropy model employing a masking mechanism. Our experimental results validate that our proposed method surpasses existing works in terms of grid-based NeRF compression efficacy and reconstruction quality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tuan Pham;Stephan Mandt", "authorids": "~Tuan_Pham4;~Stephan_Mandt1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Tuan_Pham4;~Stephan_Mandt1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\npham2024neural,\ntitle={Neural Ne{RF} Compression},\nauthor={Tuan Pham and Stephan Mandt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6BYD121JFO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4798065, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15964129268626478439&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";", "author_num": 2 }, { "title": "Stationarity without mean reversion in improper Gaussian processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34936", "id": "6CV1N7hhpA", "proceeding": "https://proceedings.mlr.press/v235/ambrogioni24a.html", "pdf": "https://openreview.net/pdf?id=6CV1N7hhpA", "openreview": "https://openreview.net/forum?id=6CV1N7hhpA", "tldr": "", "abstract": "The behavior of a GP regression depends on the choice of covariance function. Stationary covariance functions are preferred in machine learning applications. However, (non-periodic) stationary covariance functions are always mean reverting and can therefore exhibit pathological behavior when applied to data that does not relax to a fixed global mean value. In this paper we show that it is possible to use improper GP priors with infinite variance to define processes that are stationary but not mean reverting. To this aim, we make use of non-positive kernels that can only be defined in this limit regime. The resulting posterior distributions can be computed analytically and involve a simple correction of the usual formulas. The main contribution of the paper is the introduction of a large family of smooth non-reverting covariance functions that closely resemble the kernels commonly used in the GP literature (e.g. squared exponential and Mat\u00e9rn class).
By analyzing both synthetic and real data, we demonstrate that these non-positive kernels solve some known pathologies of mean-reverting GP regression while retaining most of the favorable properties of ordinary smooth stationary kernels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Ambrogioni", "authorids": "~Luca_Ambrogioni1", "gender": "M", "homepage": "https://scholar.google.nl/citations?user=J9IABpQAAAAJ&hl=en", "dblp": "151/9813", "google_scholar": "https://scholar.google.nl/citations?user=J9IABpQAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Luca_Ambrogioni1", "aff": "Radboud University Nijmegen", "aff_domain": "ru.nl", "position": "Assistant Professor", "bibtex": "@inproceedings{\nambrogioni2024stationarity,\ntitle={Stationarity without mean reversion in improper Gaussian processes},\nauthor={Luca Ambrogioni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6CV1N7hhpA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1170538, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wbHykv4KuTkJ:scholar.google.com/&scioq=Stationarity+without+mean+reversion+in+improper+Gaussian+processes&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "ru.nl", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Radboud University", "aff_unique_dep": "", "aff_unique_url": "https://www.ru.nl/", "aff_unique_abbr": "RU", "aff_campus_unique_index": "0", "aff_campus_unique": "Nijmegen", "aff_country_unique_index": "0", "aff_country_unique": "Netherlands" }, { "title": "On Positivity Condition for Causal Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34935", "id": "6D0nyemiWk", "proceeding": "https://proceedings.mlr.press/v235/hwang24a.html", "pdf": "https://openreview.net/pdf?id=6D0nyemiWk", "openreview": "https://openreview.net/forum?id=6D0nyemiWk", "author_site": "Inwoo Hwang, Yesong Choe, Yeahoon Kwon, Sanghack Lee", "tldr": "", "abstract": "Identifying and estimating a causal effect is a fundamental task when researchers want to infer a causal effect using an observational study without experiments. A conventional assumption is the strict positivity of the given distribution, or so-called positivity (or overlap) under the unconfoundedness assumption, i.e., that the probabilities of treatments are positive. However, there exist many environments where the observational data neither exhibits strict positivity nor satisfies the unconfoundedness assumption. Against this background, we examine the graphical counterpart of the conventional positivity condition so as to license the use of an identification formula without strict positivity. In particular, we explore various approaches, including post-hoc analysis, do-calculus, $Q$-decomposition, and algorithmic methods, to yield a positivity condition for an identification formula, and we relate them, providing a comprehensive view.
We further discuss the design of a positivity-aware identification algorithm based on the theoretical characterization of identification formulas.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Inwoo Hwang;Yesong Choe;Yeahoon Kwon;Sanghack Lee", "authorids": "~Inwoo_Hwang1;yesong@snu.ac.kr;dataofyou@snu.ac.kr;~Sanghack_Lee1", "gender": ";;;M", "homepage": "https://iwhwang.github.io;;;http://www.sanghacklee.me", "dblp": "317/0732;;;20/1133", "google_scholar": "MuG6Le8AAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0001-7137-6126", "linkedin": ";;;sanghack-lee-65b52a28/", "or_profile": "~Inwoo_Hwang1;yesong@snu.ac.kr;dataofyou@snu.ac.kr;~Sanghack_Lee1", "aff": "Seoul National University;;;Seoul National University", "aff_domain": "snu.ac.kr;;;snu.ac.kr", "position": "PhD student;;;Assistant Professor", "bibtex": "@inproceedings{\nhwang2024on,\ntitle={On Positivity Condition for Causal Inference},\nauthor={Inwoo Hwang and Yesong Choe and Yeahoon Kwon and Sanghack Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6D0nyemiWk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 657569, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3612476116997733647&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "snu.ac.kr;;;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Sub-token ViT Embedding via Stochastic Resonance Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34934", "id": "6DBvBcW770", "proceeding": "https://proceedings.mlr.press/v235/lao24a.html", "pdf": "https://openreview.net/pdf?id=6DBvBcW770", "openreview": "https://openreview.net/forum?id=6DBvBcW770", "author_site": "Dong Lao, Yangchao Wu, Tian Yu Liu, Alex Wong, Stefano Soatto", "tldr": "", "abstract": "Vision Transformer (ViT) architectures represent images as collections of high-dimensional vectorized tokens, each corresponding to a rectangular non-overlapping patch. This representation trades spatial granularity for embedding dimensionality, and results in semantically rich but spatially coarsely quantized feature maps. In order to retrieve spatial details beneficial to fine-grained inference tasks, we propose a training-free method inspired by \"stochastic resonance.\" Specifically, we perform sub-token spatial transformations of the input data, and aggregate the resulting ViT features after applying the inverse transformation. The resulting \"Stochastic Resonance Transformer\" (SRT) retains the rich semantic information of the original representation, but grounds it on a finer-scale spatial domain, partly mitigating the coarse effect of spatial tokenization. SRT is applicable across any layer of any ViT architecture, consistently boosting performance on several tasks including segmentation, classification, depth estimation, and others by up to 14.9% without the need for any fine-tuning.
Code: https://github.com/donglao/srt.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dong Lao;Yangchao Wu;Tian Yu Liu;Alex Wong;Stefano Soatto", "authorids": "~Dong_Lao1;~Yangchao_Wu1;~Tian_Yu_Liu2;~Alex_Wong2;~Stefano_Soatto1", "gender": "M;M;;M;", "homepage": ";https://scholar.google.com/citations?view_op=list_works&hl=en&user=k_h1nbAAAAAJ;;https://vision.cs.yale.edu/members/alex-wong/;", "dblp": "180/5522;;;39/6537-1;", "google_scholar": "dvQXYW0AAAAJ;https://scholar.google.com/citations?view_op=list_works;;K9_XuM8AAAAJ;", "orcid": ";;;0000-0002-3157-6016;", "linkedin": ";;;;", "or_profile": "~Dong_Lao1;~Yangchao_Wu1;~Tian_Yu_Liu2;~Alex_Wong2;~Stefano_Soatto1", "aff": "University of California, Los Angeles;University of California, Los Angeles;;Yale University;", "aff_domain": "cs.ucla.edu;ucla.edu;;yale.edu;", "position": "Postdoc;PhD student;;Assistant Professor;", "bibtex": "@inproceedings{\nlao2024subtoken,\ntitle={Sub-token ViT Embedding via Stochastic Resonance Transformers},\nauthor={Dong Lao and Yangchao Wu and Tian Yu Liu and Alex Wong and Stefano Soatto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6DBvBcW770}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3556994, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16436480747636646688&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cs.ucla.edu;ucla.edu;;yale.edu;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Los Angeles;Yale University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.yale.edu", "aff_unique_abbr": "UCLA;Yale", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Monotone Individual Fairness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34933", "id": "6EF0bxcZvT", "proceeding": "https://proceedings.mlr.press/v235/bechavod24a.html", "pdf": "https://openreview.net/pdf?id=6EF0bxcZvT", "openreview": "https://openreview.net/forum?id=6EF0bxcZvT", "tldr": "", "abstract": "We revisit the problem of online learning with individual fairness, where an online learner strives to maximize predictive accuracy while ensuring that similar individuals are treated similarly. We first extend the frameworks of Gillen et al. (2018); Bechavod et al. (2020), which rely on feedback from human auditors regarding fairness violations, to allow for auditing schemes that can aggregate feedback from any number of auditors, using a rich class we term monotone aggregation functions, for which we also prove a useful characterization. Using our generalized framework, we present an oracle-efficient algorithm guaranteeing a bound of $\\mathcal{O}(T^\\frac{3}{4})$ simultaneously for regret and number of fairness violations. We then study an online classification setting where label feedback is available for positively-predicted individuals only, and present an algorithm guaranteeing a bound of $\\mathcal{O}(T^\\frac{5}{6})$ simultaneously for regret and number of fairness violations. In both settings, our algorithms improve on the best known bounds for oracle-efficient algorithms. 
Furthermore, our algorithms offer significant improvements in computational efficiency, greatly reducing the number of required calls to an (offline) optimization oracle, as opposed to previous algorithms which required $T$ such calls every round.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yahav Bechavod", "authorids": "~Yahav_Bechavod1", "gender": "M", "homepage": "https://yahavbe.github.io/", "dblp": "203/9048", "google_scholar": "https://scholar.google.co.il/citations?user=bsnma4wAAAAJ", "orcid": "", "linkedin": "yahav-bechavod-25305b194", "or_profile": "~Yahav_Bechavod1", "aff": "University of Pennsylvania", "aff_domain": "upenn.edu", "position": "Postdoc", "bibtex": "@inproceedings{\nbechavod2024monotone,\ntitle={Monotone Individual Fairness},\nauthor={Yahav Bechavod},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6EF0bxcZvT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 399656, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13478196712505247173&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "upenn.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Prismatic VLMs: Investigating the Design Space of Visually-Conditioned Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34932", "id": "6FXtu8clyp", "proceeding": "https://proceedings.mlr.press/v235/karamcheti24a.html", "pdf": "https://openreview.net/pdf?id=6FXtu8clyp", "openreview": "https://openreview.net/forum?id=6FXtu8clyp", "author_site": "Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, Dorsa Sadigh", "tldr": "", "abstract": "Visually-conditioned language models (VLMs) have seen growing adoption in applications such as visual dialogue, scene understanding, and robotic task planning; adoption that has fueled a wealth of new models such as LLaVa, InstructBLIP, and PaLI-3. Despite the volume of new releases, key design decisions around image preprocessing, architecture, and optimization are under-explored, making it challenging to understand what factors account for model performance \u2013 a challenge further complicated by the lack of objective, consistent evaluations. To address these gaps, we first compile a suite of standardized evaluations spanning visual question answering, object localization, and challenge sets that probe properties such as hallucination; evaluations that provide fine-grained insight into VLM capabilities. Second, we rigorously investigate VLMs along key design axes, including pretrained visual representations and training from base vs. instruct-tuned language models, amongst others.
We couple our analysis with three resource contributions: (1) a unified framework for evaluating VLMs, (2) optimized, flexible training code, and (3) checkpoints for all models, including a family of VLMs at the 7-13B scale that strictly outperform InstructBLIP and LLaVa v1.5, the state-of-the-art in open VLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siddharth Karamcheti;Suraj Nair;Ashwin Balakrishna;Percy Liang;Thomas Kollar;Dorsa Sadigh", "authorids": "~Siddharth_Karamcheti1;~Suraj_Nair1;~Ashwin_Balakrishna1;~Percy_Liang1;~Thomas_Kollar1;~Dorsa_Sadigh1", "gender": "M;M;M;;M;F", "homepage": "http://siddkaramcheti.com/;https://suraj-nair-1.github.io/;https://abalakrishna123.github.io/;https://cs.stanford.edu/~pliang/;http://tkollar.github.io;https://dorsa.fyi/", "dblp": "199/1922;;218/5246.html;04/1701;10/6653;117/3174", "google_scholar": "L5v2PHAAAAAJ;EHSuFcwAAAAJ;tfN6V84AAAAJ;pouyVyUAAAAJ;AEKT17QAAAAJ;ZaJEZpYAAAAJ", "orcid": ";;;;0000-0003-2598-8118;", "linkedin": ";;ashwin-balakrishna-9b71a357/;;;", "or_profile": "~Siddharth_Karamcheti1;~Suraj_Nair1;~Ashwin_Balakrishna1;~Percy_Liang1;~Thomas_Kollar1;~Dorsa_Sadigh1", "aff": "Stanford University;Toyota Research Institute;Toyota Research Institute;Stanford University;Toyota Research Institute;Stanford University", "aff_domain": "stanford.edu;tri.global;tri.global;stanford.edu;tri.global;stanford.edu", "position": "PhD student;Researcher;Researcher;Associate Professor;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nkaramcheti2024prismatic,\ntitle={Prismatic {VLM}s: Investigating the Design Space of Visually-Conditioned Language Models},\nauthor={Siddharth Karamcheti and Suraj Nair and Ashwin Balakrishna and Percy Liang and Thomas Kollar and Dorsa Sadigh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6FXtu8clyp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9724881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9396030288154519426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "stanford.edu;tri.global;tri.global;stanford.edu;tri.global;stanford.edu", "author_num": 6, "aff_unique_index": "0;1;1;0;1;0", "aff_unique_norm": "Stanford University;Toyota Research Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.tri.global", "aff_unique_abbr": "Stanford;TRI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Evaluation of Test-Time Adaptation Under Computational Time Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34931", "id": "6FtAXU4ean", "proceeding": "https://proceedings.mlr.press/v235/alfarra24a.html", "pdf": "https://openreview.net/pdf?id=6FtAXU4ean", "openreview": "https://openreview.net/forum?id=6FtAXU4ean", "author_site": "Motasem Alfarra, Hani Itani, Alejandro Pardo, Shyma Alhuwaider, Merey Ramazanova, Juan C Perez, zhipeng cai, Matthias M\u00fcller, Bernard Ghanem", "tldr": "", "abstract": "This paper proposes a novel online evaluation protocol for Test Time Adaptation (TTA) methods, which penalizes slower methods by providing them with fewer samples for adaptation. TTA methods leverage unlabeled data at test time to adapt to distribution shifts. 
Though many effective methods have been proposed, their impressive performance usually comes at the cost of significantly increased computation budgets. Current evaluation protocols overlook the effect of this extra computation cost, affecting their real-world applicability. To address this issue, we propose a more realistic evaluation protocol for TTA methods, where data is received in an online fashion from a constant-speed data stream, thereby accounting for the method's adaptation speed. We apply our proposed protocol to benchmark several TTA methods on multiple datasets and scenarios. Extensive experiments show that, when accounting for inference speed, simple and fast approaches can outperform more sophisticated but slower methods. For example, SHOT from 2020 outperforms the state-of-the-art method SAR from 2023 under our online setting. Our results reveal the importance of developing practical TTA methods that are both accurate and efficient.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Motasem Alfarra;Hani Itani;Alejandro Pardo;shyma yaser alhuwaider;Merey Ramazanova;Juan Camilo Perez;zhipeng cai;Matthias M\u00fcller;Bernard Ghanem", "authorids": "~Motasem_Alfarra1;~Hani_Itani1;~Alejandro_Pardo1;~shyma_yaser_alhuwaider1;~Merey_Ramazanova1;~Juan_Camilo_Perez1;~zhipeng_cai3;~Matthias_M\u00fcller1;~Bernard_Ghanem1", "gender": "M;M;M;F;;M;M;;M", "homepage": "https://motasemalfarra.netlify.app/;;https://www.alejandropardo.net;https://scholar.google.com/citations?hl=en&user=Gz5Iea8AAAAJ;https://meryusha.github.io/;https://www.juancprzs.com;https://zhipengcai.github.io;https://matthias.pw;https://ivul.kaust.edu.sa", "dblp": "255/5192;208/4215;212/9476;334/1560;304/3597;196/0064.html;;169/4686-1;37/2516", "google_scholar": "https://scholar.google.com/citations?hl=en;0GkfZ64AAAAJ;_lKVc3sAAAAJ;Gz5Iea8AAAAJ;tz0TYg4AAAAJ;zBbUubUAAAAJ;;AeMLOMEAAAAJ;rVsGTeEAAAAJ", "orcid": ";;;;;;;;0000-0002-5534-587X", "linkedin": ";;;;;;;;bernardghanem/", "or_profile": "~Motasem_Alfarra1;~Hani_Itani1;~Alejandro_Pardo1;~shyma_yaser_alhuwaider1;~Merey_Ramazanova1;~Juan_Camilo_Perez1;~zhipeng_cai3;~Matthias_M\u00fcller1;~Bernard_Ghanem1", "aff": "KAUST;King Abdullah University of Science and Technology;KAUST;King Abdullah University of Science and Technology;KAUST;KAUST;Intel;Apple;King Abdullah University of Science and Technology", "aff_domain": "kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;intel.com;apple.com;kaust.edu.sa", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nalfarra2024evaluation,\ntitle={Evaluation of Test-Time Adaptation Under Computational Time Constraints},\nauthor={Motasem Alfarra and Hani Itani and Alejandro Pardo and shyma yaser alhuwaider and Merey Ramazanova and Juan Camilo Perez and zhipeng cai and Matthias M{\\"u}ller and Bernard Ghanem},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6FtAXU4ean}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 822772, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16742415318592100828&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;kaust.edu.sa;intel.com;apple.com;kaust.edu.sa", "author_num": 9,
"aff_unique_index": "0;0;0;0;0;0;1;2;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Intel;Apple", "aff_unique_dep": ";Intel Corporation;Apple Inc.", "aff_unique_url": "https://www.kaust.edu.sa;https://www.intel.com;https://www.apple.com", "aff_unique_abbr": "KAUST;Intel;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1;1;0", "aff_country_unique": "Saudi Arabia;United States" }, { "title": "Geometry-Aware Instrumental Variable Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34930", "id": "6KLNiRdWH6", "proceeding": "https://proceedings.mlr.press/v235/kremer24a.html", "pdf": "https://openreview.net/pdf?id=6KLNiRdWH6", "openreview": "https://openreview.net/forum?id=6KLNiRdWH6", "author_site": "Heiner Kremer, Bernhard Sch\u00f6lkopf", "tldr": "", "abstract": "Instrumental variable (IV) regression can be approached through its formulation in terms of conditional moment restrictions (CMR). Building on variants of the generalized method of moments, most CMR estimators are implicitly based on approximating the population data distribution via reweightings of the empirical sample. While for large sample sizes, in the independent and identically distributed (IID) setting, reweightings can provide sufficient flexibility, they might fail to capture the relevant information in the presence of corrupted data or data prone to adversarial attacks. To address these shortcomings, we propose the Sinkhorn Method of Moments, an optimal transport-based IV estimator that takes into account the geometry of the data manifold through data-derivative information. We provide a simple plug-and-play implementation of our method that performs on par with related estimators in standard settings but improves robustness against data corruption and adversarial attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Heiner Kremer;Bernhard Sch\u00f6lkopf", "authorids": "~Heiner_Kremer1;~Bernhard_Sch\u00f6lkopf1", "gender": "M;", "homepage": "https://heinerkremer.github.io/;", "dblp": ";", "google_scholar": "I_9TrpgAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Heiner_Kremer1;~Bernhard_Sch\u00f6lkopf1", "aff": "Max Planck Institute for Intelligent Systems, Max-Planck Institute;", "aff_domain": "tuebingen.mpg.de;", "position": "PhD student;", "bibtex": "@inproceedings{\nkremer2024geometryaware,\ntitle={Geometry-Aware Instrumental Variable Regression},\nauthor={Heiner Kremer and Bernhard Sch{\\"o}lkopf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6KLNiRdWH6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1209654, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZGWxNW3zDysJ:scholar.google.com/&scioq=Geometry-Aware+Instrumental+Variable+Regression&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "tuebingen.mpg.de;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Intelligent Systems", "aff_unique_url": "https://www.mpi-is.mpg.de", "aff_unique_abbr": "MPI-IS", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Non-Vacuous Generalization Bounds for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34929",
"id": "6Kg9p8URlj", "proceeding": "https://proceedings.mlr.press/v235/lotfi24a.html", "pdf": "https://openreview.net/pdf?id=6Kg9p8URlj", "openreview": "https://openreview.net/forum?id=6Kg9p8URlj", "author_site": "Sanae Lotfi, Marc Finzi, Yilun Kuang, Tim G. J. Rudner, Micah Goldblum, Andrew Wilson", "tldr": "", "abstract": "Modern language models can contain billions of parameters, raising the question of whether they can generalize beyond the training data or simply parrot their training corpora. We provide the first non-vacuous generalization bounds for pretrained large language models (LLMs), indicating that language models are capable of discovering regularities that generalize to unseen data. In particular, we derive a compression bound that is valid for the unbounded log-likelihood loss using prediction smoothing, and we extend the bound to handle subsampling, making bound computation 900 times faster on massive datasets. To achieve the extreme level of compression required for non-vacuous bounds, we devise SubLoRA, a simple low-dimensional nonlinear parameterization that leads to non-vacuous generalization bounds for very large models with up to 849 million parameters. Finally, we use our bounds to understand LLM generalization and find that larger models have better generalization bounds and are more compressible than smaller models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanae Lotfi;Marc Anton Finzi;Yilun Kuang;Tim G. J. Rudner;Micah Goldblum;Andrew Gordon Wilson", "authorids": "~Sanae_Lotfi1;~Marc_Anton_Finzi1;~Yilun_Kuang1;~Tim_G._J._Rudner2;~Micah_Goldblum1;~Andrew_Gordon_Wilson1", "gender": "F;M;M;;Not Specified;Not Specified", "homepage": "https://sanaelotfi.github.io/;https://mfinzi.github.io;https://yilunkuang.github.io/;;https://cims.nyu.edu/~andrewgw;https://timrudner.com", "dblp": "281/6627;222/3062;;241/7231;65/10453;230/3480", "google_scholar": "0GyjMX4AAAAJ;ysMAhlwAAAAJ;XvIasgEAAAAJ;pGDKzuUAAAAJ;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ;https://scholar.google.de/citations?user=MbBntPgAAAAJ", "orcid": ";;;;;", "linkedin": "sanae-lotfi-636825127;;yilun-mark-kuang/;;;trudner", "or_profile": "~Sanae_Lotfi1;~Marc_Anton_Finzi1;~Yilun_Kuang1;~Micah_Goldblum1;~Andrew_Gordon_Wilson1;~Tim_Georg_Johann_Rudner1", "aff": "Microsoft;Carnegie Mellon University;New York University;New York University;New York University;New York University", "aff_domain": "microsoft.com;cmu.edu;nyu.edu;nyu.edu;nyu.edu;nyu.edu", "position": "Researcher;Postdoc;PhD student;Postdoc;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nlotfi2024nonvacuous,\ntitle={Non-Vacuous Generalization Bounds for Large Language Models},\nauthor={Sanae Lotfi and Marc Anton Finzi and Yilun Kuang and Tim G. J. 
Rudner and Micah Goldblum and Andrew Gordon Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6Kg9p8URlj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 443557, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12924325717022661669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "microsoft.com;cmu.edu;nyu.edu;nyu.edu;nyu.edu;nyu.edu", "author_num": 6, "aff_unique_index": "0;1;2;2;2;2", "aff_unique_norm": "Microsoft;Carnegie Mellon University;New York University", "aff_unique_dep": "Microsoft Corporation;;", "aff_unique_url": "https://www.microsoft.com;https://www.cmu.edu;https://www.nyu.edu", "aff_unique_abbr": "Microsoft;CMU;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fair Risk Control: A Generalized Framework for Calibrating Multi-group Fairness Risks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34928", "id": "6KtXzUUEp4", "proceeding": "https://proceedings.mlr.press/v235/zhang24be.html", "pdf": "https://openreview.net/pdf?id=6KtXzUUEp4", "openreview": "https://openreview.net/forum?id=6KtXzUUEp4", "author_site": "Lujing Zhang, Aaron Roth, Linjun Zhang", "tldr": "", "abstract": "This paper introduces a framework for post-processing machine learning models so that their predictions satisfy multi-group fairness guarantees. Based on the celebrated notion of multicalibration, we introduce $(s,g,\\alpha)$-GMC (Generalized Multi-Dimensional Multicalibration) for multi-dimensional mappings $s$, constraints $g$, and a pre-specified threshold level $\\alpha$. We propose associated algorithms to achieve this notion in general settings. This framework is then applied to diverse scenarios encompassing different fairness concerns, including false negative rate control in image segmentation, prediction set conditional uncertainty quantification in hierarchical classification, and de-biased text generation in language models.
We conduct numerical studies on several datasets and tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lujing Zhang;Aaron Roth;Linjun Zhang", "authorids": "~Lujing_Zhang1;~Aaron_Roth1;~Linjun_Zhang1", "gender": "F;M;M", "homepage": "https://misdrifter.github.io/FooFooFood/;http://www.cis.upenn.edu/~aaroth/;", "dblp": ";80/3311;", "google_scholar": ";https://scholar.google.com.tw/citations?user=kLUQrrYAAAAJ;TUAzs3sAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lujing_Zhang1;~Aaron_Roth1;~Linjun_Zhang1", "aff": "Peking University;University of Pennsylvania;Rutgers University", "aff_domain": "pku.edu.cn;upenn.edu;rutgers.edu", "position": "Undergrad student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024fair,\ntitle={Fair Risk Control: A Generalized Framework for Calibrating Multi-group Fairness Risks},\nauthor={Lujing Zhang and Aaron Roth and Linjun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6KtXzUUEp4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 983327, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14002496710737651674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;upenn.edu;rutgers.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Peking University;University of Pennsylvania;Rutgers University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.upenn.edu;https://www.rutgers.edu", "aff_unique_abbr": "Peking U;UPenn;Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "How Free is Parameter-Free Stochastic Optimization?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34927", "id": "6L4K5jmSJq", "proceeding": "https://proceedings.mlr.press/v235/attia24a.html", "pdf": "https://openreview.net/pdf?id=6L4K5jmSJq", "openreview": "https://openreview.net/forum?id=6L4K5jmSJq", "author_site": "Amit Attia, Tomer Koren", "tldr": "", "abstract": "We study the problem of parameter-free stochastic optimization, inquiring whether, and under what conditions, fully parameter-free methods exist: these are methods that achieve convergence rates competitive with optimally tuned methods, without requiring significant knowledge of the true problem parameters. Existing parameter-free methods can only be considered ``partially'' parameter-free, as they require some non-trivial knowledge of the true problem parameters, such as a bound on the stochastic gradient norms, a bound on the distance to a minimizer, etc. In the non-convex setting, we demonstrate that a simple hyperparameter search technique results in a fully parameter-free method that outperforms more sophisticated state-of-the-art algorithms. We also provide a similar result in the convex setting with access to noisy function values under mild noise assumptions.
Finally, assuming only access to stochastic gradients, we establish a lower bound that renders fully parameter-free stochastic convex optimization infeasible, and provide a method which is (partially) parameter-free up to the limit indicated by our lower bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amit Attia;Tomer Koren", "authorids": "~Amit_Attia1;~Tomer_Koren1", "gender": "M;M", "homepage": "https://amitattia.github.io;https://tomerkoren.github.io", "dblp": "284/8167;12/10044", "google_scholar": "nozAiIkAAAAJ;wGG1voYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Amit_Attia1;~Tomer_Koren1", "aff": "Tel Aviv University;Tel Aviv University", "aff_domain": "tau.ac.il;tau.ac.il", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nattia2024how,\ntitle={How Free is Parameter-Free Stochastic Optimization?},\nauthor={Amit Attia and Tomer Koren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6L4K5jmSJq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 399947, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14909149588547205886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tau.ac.il;tau.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Convex and Bilevel Optimization for Neural-Symbolic Inference and Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34926", "id": "6NQ77Vj3DT", "proceeding": "https://proceedings.mlr.press/v235/dickens24a.html", "pdf": "https://openreview.net/pdf?id=6NQ77Vj3DT", "openreview": "https://openreview.net/forum?id=6NQ77Vj3DT", "author_site": "Charles Dickens, Changyu Gao, Connor Pryor, Stephen Wright, Lise Getoor", "tldr": "", "abstract": "We leverage convex and bilevel optimization techniques to develop a general gradient-based parameter learning framework for neural-symbolic (NeSy) systems. We demonstrate our framework with NeuPSL, a state-of-the-art NeSy architecture. To achieve this, we propose a smooth primal and dual formulation of NeuPSL inference and show that learning gradients are functions of the optimal dual variables. Additionally, we develop a dual block coordinate descent algorithm for the new formulation that naturally exploits warm-starts. This leads to over $100 \\times$ learning runtime improvements over the current best NeuPSL inference method.
Finally, we provide extensive empirical evaluations across $8$ datasets covering a range of tasks and demonstrate that our learning framework achieves up to a $16$-percentage-point improvement in prediction performance over alternative learning methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Charles Andrew Dickens;Changyu Gao;Connor Pryor;Stephen Wright;Lise Getoor", "authorids": "~Charles_Andrew_Dickens1;~Changyu_Gao1;~Connor_Pryor1;~Stephen_Wright1;~Lise_Getoor1", "gender": "M;;M;M;F", "homepage": "https://users.soe.ucsc.edu/~cadicken/_site/;https://cyugao.github.io/;;https://wrightstephen.github.io/sw_proj/;https://getoor.soe.ucsc.edu/home", "dblp": ";339/8993;294/8960;75/2677;g/LiseGetoor", "google_scholar": ";;https://scholar.google.com/citations?hl=en;VFQRIOwAAAAJ;", "orcid": ";;;;", "linkedin": ";;connor-pryor-182764199/;;", "or_profile": "~Charles_Andrew_Dickens1;~Changyu_Gao1;~Connor_Pryor1;~Stephen_Wright1;~Lise_Getoor1", "aff": "University of California, Santa Cruz;University of Wisconsin - Madison;University of California, Santa Cruz;University of Wisconsin, Madison;University of Maryland, College Park", "aff_domain": "ucsc.edu;wisc.edu;ucsc.edu;wisc.edu;umd.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\ndickens2024convex,\ntitle={Convex and Bilevel Optimization for Neural-Symbolic Inference and Learning},\nauthor={Charles Andrew Dickens and Changyu Gao and Connor Pryor and Stephen Wright and Lise Getoor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6NQ77Vj3DT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 684007, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4402417743925395547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "ucsc.edu;wisc.edu;ucsc.edu;wisc.edu;umd.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "University of California, Santa Cruz;University of Wisconsin-Madison;University of Wisconsin;University of Maryland", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ucsc.edu;https://www.wisc.edu;https://www.wisc.edu;https://www.umd.edu", "aff_unique_abbr": "UCSC;UW-Madison;UW;UMD", "aff_campus_unique_index": "0;1;0;1;2", "aff_campus_unique": "Santa Cruz;Madison;College Park", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Total Variation Distance Meets Probabilistic Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34925", "id": "6OSLjErBhh", "proceeding": "https://proceedings.mlr.press/v235/bhattacharyya24a.html", "pdf": "https://openreview.net/pdf?id=6OSLjErBhh", "openreview": "https://openreview.net/forum?id=6OSLjErBhh", "author_site": "Arnab Bhattacharyya, Sutanu Gayen, Kuldeep S. Meel, Dimitrios Myrisiotis, A. Pavan, N. Vinodchandran", "tldr": "", "abstract": "In this paper, we establish a novel connection between total variation (TV) distance estimation and probabilistic inference. In particular, we present an efficient, structure-preserving reduction from relative approximation of TV distance to probabilistic inference over directed graphical models.
This reduction leads to a fully polynomial randomized approximation scheme (FPRAS) for estimating TV distances between same-structure distributions over any class of Bayes nets for which there is an efficient probabilistic inference algorithm. In particular, it leads to an FPRAS for estimating TV distances between distributions that are defined over a common Bayes net of small treewidth. Prior to this work, such approximation schemes only existed for estimating TV distances between product distributions. Our approach employs a new notion of *partial* couplings of high-dimensional distributions, which might be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arnab Bhattacharyya;Sutanu Gayen;Kuldeep S. Meel;Dimitrios Myrisiotis;A. Pavan;N. V. Vinodchandran", "authorids": "~Arnab_Bhattacharyya1;~Sutanu_Gayen1;~Kuldeep_S._Meel2;~Dimitrios_Myrisiotis1;~A._Pavan1;~N._V._Vinodchandran2", "gender": "M;M;M;;;", "homepage": "https://warwick.ac.uk/fac/sci/dcs/people/arnab_bhattacharyya/;;https://www.kuldeepmeel.com;https://dimyrisiotis.github.io/;https://www.cs.iastate.edu/pavan;", "dblp": "64/574.html;183/2766;https://dblp.org/pers/m/Meel:Kuldeep_S=.html;176/6922;88/1807;", "google_scholar": "eECXWqUAAAAJ;aqd7jKoAAAAJ;;BvR0TM8AAAAJ;4QIV0FUAAAAJ;", "orcid": ";0000-0003-3300-1627;;0000-0001-9585-1227;0000-0003-1665-5266;", "linkedin": ";;;dimitriosimyrisiotis;;", "or_profile": "~Arnab_Bhattacharyya1;~Sutanu_Gayen1;~Kuldeep_S._Meel2;~Dimitrios_Myrisiotis1;~A._Pavan1;~N._V._Vinodchandran2", "aff": "National University of Singapore;Indian Institute of Technology, Kanpur;University of Toronto;CNRS@CREATE LTD.;Iowa State University;", "aff_domain": "nus.edu.sg;iitk.ac.in;comp.nus.edu.sg;cnrsatcreate.cnrs.fr;iastate.edu;", "position": "Associate Professor;Assistant Professor;Associate Professor;Postdoc;Full Professor;", "bibtex": "@inproceedings{\nbhattacharyya2024total,\ntitle={Total Variation Distance Meets Probabilistic Inference},\nauthor={Arnab Bhattacharyya and Sutanu Gayen and Kuldeep S. Meel and Dimitrios Myrisiotis and A. Pavan and N. V. 
Vinodchandran},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6OSLjErBhh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 397237, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9979624660497540592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "nus.edu.sg;iitk.ac.in;comp.nus.edu.sg;cnrsatcreate.cnrs.fr;iastate.edu;", "author_num": 6, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "National University of Singapore;Indian Institute of Technology Kanpur;University of Toronto;CREATE;Iowa State University", "aff_unique_dep": ";;;CNRS;", "aff_unique_url": "https://www.nus.edu.sg;https://www.iitk.ac.in;https://www.utoronto.ca;https://www.create-ltd.com;https://www.iastate.edu", "aff_unique_abbr": "NUS;IIT Kanpur;U of T;CREATE;ISU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Kanpur", "aff_country_unique_index": "0;1;2;3;4", "aff_country_unique": "Singapore;India;Canada;France;United States" }, { "title": "Hyperbolic Geometric Latent Diffusion Model for Graph Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34924", "id": "6OkvBGqW62", "proceeding": "https://proceedings.mlr.press/v235/fu24c.html", "pdf": "https://openreview.net/pdf?id=6OkvBGqW62", "openreview": "https://openreview.net/forum?id=6OkvBGqW62", "author_site": "Xingcheng Fu, Yisen Gao, Yuecen Wei, Qingyun Sun, Hao Peng, Jianxin Li, Xianxian Li", "tldr": "", "abstract": "Diffusion models have made significant contributions to computer vision, recently sparking growing interest in the community regarding their application to graph generation. Existing discrete graph diffusion models exhibit high computational complexity and low training efficiency. A preferable and natural way is to directly diffuse the graph within the latent space. However, because the non-Euclidean structure of graphs is not isotropic in the latent space, existing latent diffusion models struggle to capture and preserve the topological information of graphs. To address the above challenges, we propose a novel geometric latent diffusion framework, HypDiff. Specifically, we first establish a geometric latent space with interpretability measures based on hyperbolic geometry, to define anisotropic latent diffusion processes for graphs. Then, we propose a geometric latent diffusion process that is constrained by both radial and angular geometric properties, thereby ensuring the preservation of the original topological properties in the generative graphs.
Extensive experimental results demonstrate the superior effectiveness of HypDiff for graph generation with various topologies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingcheng Fu;Yisen Gao;Yuecen Wei;Qingyun Sun;Hao Peng;Jianxin Li;Xianxian LI", "authorids": "~Xingcheng_Fu1;~Yisen_Gao1;~Yuecen_Wei1;~Qingyun_Sun2;~Hao_Peng7;~Jianxin_Li3;~Xianxian_LI2", "gender": "M;M;;F;M;M;M", "homepage": "https://fuxingcheng.github.io/;https://github.com/Eason-nuosen;;https://sunqysunqy.github.io/;https://penghao-bdsc.github.io/;http://myjianxin.github.io;http://www.cs.gxnu.edu.cn/lxx/listm.htm", "dblp": "236/7003;377/2092.html;330/4273;;69/7742-1;l/JianxinLi-2.html;81/4000.html", "google_scholar": "gN4tbgMAAAAJ;https://scholar.google.cz/citations?user=MMxR_uYAAAAJ;https://scholar.google.com.hk/citations?user=z2gWWkMAAAAJ;e2oYBzUAAAAJ;R25rbyQAAAAJ;EY2lqD0AAAAJ;", "orcid": "0000-0002-4643-8126;;;;0000-0003-0458-5977;0000-0001-5152-0055;", "linkedin": ";;;;;;", "or_profile": "~Xingcheng_Fu1;~Yisen_Gao1;~Yuecen_Wei1;~Qingyun_Sun2;~Hao_Peng7;~Jianxin_Li3;~Xianxian_LI2", "aff": "Guangxi Normal University;Beihang University;Beihang University;Beihang University;Beihang University;Beihang University ;Guangxi Normal University", "aff_domain": "gxnu.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;gxnu.edu.cn", "position": "Assistant Professor;Undergrad student;PhD student;Assistant Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfu2024hyperbolic,\ntitle={Hyperbolic Geometric Latent Diffusion Model for Graph Generation},\nauthor={Xingcheng Fu and Yisen Gao and Yuecen Wei and Qingyun Sun and Hao Peng and Jianxin Li and Xianxian LI},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6OkvBGqW62}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5879736, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12080563849177399279&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "gxnu.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;gxnu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;1;1;1;0", "aff_unique_norm": "Guangxi Normal University;Beihang University", "aff_unique_dep": ";", "aff_unique_url": "http://www.gxnu.edu.cn;http://www.buaa.edu.cn/", "aff_unique_abbr": ";BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Code as Reward: Empowering Reinforcement Learning with VLMs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34923", "id": "6P88DMUDvH", "proceeding": "https://proceedings.mlr.press/v235/venuto24a.html", "pdf": "https://openreview.net/pdf?id=6P88DMUDvH", "openreview": "https://openreview.net/forum?id=6P88DMUDvH", "author_site": "David Venuto, Mohammad Sami Nur Islam, Martin Klissarov, Doina Precup, Sherry Yang, Ankit Anand", "tldr": "", "abstract": "Pre-trained Vision-Language Models (VLMs) are able to understand visual concepts, describe and decompose complex tasks into sub-tasks, and provide feedback on task completion. In this paper, we aim to leverage these capabilities to support the training of reinforcement learning (RL) agents. 
In principle, VLMs are well suited for this purpose, as they can naturally analyze image-based observations and provide feedback (reward) on learning progress. However, inference in VLMs is computationally expensive, so querying them frequently to compute rewards would significantly slow down the training of an RL agent. To address this challenge, we propose a framework named Code as Reward (VLM-CaR). VLM-CaR produces dense reward functions from VLMs through code generation, thereby significantly reducing the computational burden of querying the VLM directly. We show that the dense rewards generated through our approach are very accurate across a diverse set of discrete and continuous environments, and can be more effective in training RL policies than the original sparse environment rewards.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Venuto;Mohammad Sami Nur Islam;Martin Klissarov;Doina Precup;Sherry Yang;Ankit Anand", "authorids": "~David_Venuto1;~Mohammad_Sami_Nur_Islam1;~Martin_Klissarov1;~Doina_Precup1;~Sherry_Yang1;~Ankit_Anand4", "gender": "Not Specified;Unspecified;F;F;M;M", "homepage": "https://www.linkedin.com/in/sami-nur-047027181/;https://mklissa.github.io;http://cs.mcgill.ca/~dprecup/;https://sherryy.github.io;https://sites.google.com/corp/view/ankitsanand/home;", "dblp": ";;p/DoinaPrecup;;;", "google_scholar": ";;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;7c1B_fIAAAAJ;;https://scholar.google.ca/citations?user=32rbUtYAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Mohammad_Sami_Nur_Islam1;~Martin_Klissarov1;~Doina_Precup1;~Sherry_Yang1;~Ankit_Anand1;~David_Anthony_Venuto1", "aff": "McGill University;McGill University;McGill University;University of California, Berkeley;Google DeepMind;Mila", "aff_domain": "mcgill.ca;mcgill.ca;mcgill.ca;berkeley.edu;deepmind.com;mila.quebec", "position": "Undergrad student;PhD student;Associate Professor;Student;Research Scientist;PhD student", "bibtex": "@inproceedings{\nvenuto2024code,\ntitle={Code as Reward: Empowering Reinforcement Learning with {VLM}s},\nauthor={David Venuto and Mohammad Sami Nur Islam and Martin Klissarov and Doina Precup and Sherry Yang and Ankit Anand},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6P88DMUDvH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1100455, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=810009854547728192&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "mcgill.ca;mcgill.ca;mcgill.ca;berkeley.edu;deepmind.com;mila.quebec", "author_num": 6, "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "McGill University;University of California, Berkeley;Google;Mila", "aff_unique_dep": ";;Google DeepMind;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://www.mcgill.ca;https://www.berkeley.edu;https://deepmind.com;https://mila.quebec", "aff_unique_abbr": "McGill;UC Berkeley;DeepMind;Mila", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;1;2;0", "aff_country_unique": "Canada;United States;United Kingdom" }, { "title": "Contamination-Resilient Anomaly Detection via Adversarial Learning on Partially-Observed Normal and Anomalous Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34922", "id": "6PTiCmGcNx", "proceeding":
"https://proceedings.mlr.press/v235/lv24b.html", "pdf": "https://openreview.net/pdf?id=6PTiCmGcNx", "openreview": "https://openreview.net/forum?id=6PTiCmGcNx", "author_site": "Wenxi Lv, Qinliang Su, Hai Wan, Hongteng Xu, Wenchao Xu", "tldr": "", "abstract": "Many existing anomaly detection methods assume the availability of a large-scale normal dataset. However, in many applications, resource constraints make it unrealistic to remove all anomalous samples from a large unlabeled dataset, resulting in contaminated datasets. To detect anomalies accurately under such scenarios, from the probabilistic perspective, the key question becomes how to learn the normal-data distribution from a contaminated dataset. To this end, we propose to collect two additional small datasets that are comprised of partially-observed normal and anomaly samples, and then use them to help learn the distribution under an adversarial learning scheme. We prove that under some mild conditions, the proposed method is able to learn the correct normal-data distribution. Then, we consider the overfitting issue caused by the small size of the two additional datasets, and a correctness-guaranteed flipping mechanism is further developed to alleviate it. Theoretical results under incompletely observed anomaly types are also presented. Extensive experimental results demonstrate that our method outperforms representative baselines when detecting anomalies under contaminated datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenxi Lv;Qinliang Su;Hai Wan;Hongteng Xu;Wenchao Xu", "authorids": "~Wenxi_Lv1;~Qinliang_Su3;~Hai_Wan3;~Hongteng_Xu1;~Wenchao_Xu1", "gender": "M;M;M;M;", "homepage": "https://www.researchgate.net/profile/Wenxi-Lv;https://cse.sysu.edu.cn/teacher/SuQinliang;;https://hongtengxu.github.io;", "dblp": "384/4273;87/7936;https://dblp.uni-trier.de/pid/54/977.html;38/10816;", "google_scholar": ";cuIweygAAAAJ;;7gYVOO8AAAAJ;", "orcid": "0009-0005-3258-8886;;0000-0001-5357-9130;0000-0003-4192-5360;", "linkedin": ";;;;", "or_profile": "~Wenxi_Lv1;~Qinliang_Su3;~Hai_Wan3;~Hongteng_Xu1;~Wenchao_Xu1", "aff": "Sun Yat-Sen University;SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY;Renmin University of China;", "aff_domain": "mail2.sysu.edu.cn;sysu.edu.cn;sysu.edu.cn;ruc.edu.cn;", "position": "MS student;Associate Professor;Full Professor;Associate Professor;", "bibtex": "@inproceedings{\nlv2024contaminationresilient,\ntitle={Contamination-Resilient Anomaly Detection via Adversarial Learning on Partially-Observed Normal and Anomalous Data},\nauthor={Wenxi Lv and Qinliang Su and Hai Wan and Hongteng Xu and Wenchao Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6PTiCmGcNx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 527002, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3169775896274516978&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "mail2.sysu.edu.cn;sysu.edu.cn;sysu.edu.cn;ruc.edu.cn;", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Sun Yat-sen University;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn/;http://www.ruc.edu.cn", "aff_unique_abbr": "SYSU;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Covert Malicious Finetuning:
Challenges in Safeguarding LLM Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34921", "id": "6PqWuSuWvX", "proceeding": "https://proceedings.mlr.press/v235/halawi24a.html", "pdf": "https://openreview.net/pdf?id=6PqWuSuWvX", "openreview": "https://openreview.net/forum?id=6PqWuSuWvX", "author_site": "Danny Halawi, Alexander Wei, Eric Wallace, Tony Wang, Nika Haghtalab, Jacob Steinhardt", "tldr": "", "abstract": "Black-box finetuning is an emerging interface for adapting state-of-the-art language models to user needs. However, such access may also let malicious actors undermine model safety. To demonstrate the challenge of defending finetuning interfaces, we introduce covert malicious finetuning, a method to compromise model safety via finetuning while evading detection. Our method constructs a malicious dataset where every individual datapoint appears innocuous, but finetuning on the dataset teaches the model to respond to encoded harmful requests with encoded harmful responses. Applied to GPT-4, our method produces a finetuned model that acts on harmful instructions 99% of the time and avoids detection by defense mechanisms such as dataset inspection, safety evaluations, and input/output classifiers. Our findings question whether black-box finetuning access can be secured against sophisticated adversaries.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Danny Halawi;Alexander Wei;Eric Wallace;Tony Tong Wang;Nika Haghtalab;Jacob Steinhardt", "authorids": "~Danny_Halawi1;~Alexander_Wei2;~Eric_Wallace1;~Tony_Tong_Wang1;~Nika_Haghtalab2;~Jacob_Steinhardt1", "gender": "M;;M;M;F;", "homepage": "https://dannyhalawi.me;https://www.alexwei.org;http://www.ericswallace.com/;https://tonytwang.net;https://people.eecs.berkeley.edu/~nika/;", "dblp": "321/4165;223/5928;218/6165;;;35/10625", "google_scholar": ";d5wGxRsAAAAJ;SgST3LkAAAAJ;YWiob00AAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Danny_Halawi1;~Alexander_Wei2;~Eric_Wallace1;~Tony_Tong_Wang1;~Nika_Haghtalab2;~Jacob_Steinhardt1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Massachusetts Institute of Technology;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;mit.edu;berkeley.edu;berkeley.edu", "position": "Researcher;PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhalawi2024covert,\ntitle={Covert Malicious Finetuning: Challenges in Safeguarding {LLM} Adaptation},\nauthor={Danny Halawi and Alexander Wei and Eric Wallace and Tony Tong Wang and Nika Haghtalab and Jacob Steinhardt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6PqWuSuWvX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 339124, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16724590244015745875&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;berkeley.edu;mit.edu;berkeley.edu;berkeley.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "UC Berkeley;MIT", 
"aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Confidence Aware Inverse Constrained Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34920", "id": "6TCeizkLJV", "proceeding": "https://proceedings.mlr.press/v235/ganapathi-subramanian24a.html", "pdf": "https://openreview.net/pdf?id=6TCeizkLJV", "openreview": "https://openreview.net/forum?id=6TCeizkLJV", "author_site": "Sriram Ganapathi Subramanian, Guiliang Liu, Mohammed Elmahgiubi, Kasra Rezaee, Pascal Poupart", "tldr": "", "abstract": "In coming up with solutions to real-world problems, humans implicitly adhere to constraints that are too numerous and complex to be specified completely. However, reinforcement learning (RL) agents need these constraints to learn the correct optimal policy in these settings. The field of Inverse Constraint Reinforcement Learning (ICRL) deals with this problem and provides algorithms that aim to estimate the constraints from expert demonstrations collected offline. Practitioners prefer to know a measure of confidence in the estimated constraints, before deciding to use these constraints, which allows them to only use the constraints that satisfy a desired level of confidence. However, prior works do not allow users to provide the desired level of confidence for the inferred constraints. This work provides a principled ICRL method that can take a confidence level with a set of expert demonstrations and outputs a constraint that is at least as constraining as the true underlying constraint with the desired level of confidence. Further, unlike previous methods, this method allows a user to know if the number of expert trajectories is insufficient to learn a constraint with a desired level of confidence, and therefore collect more expert trajectories as required to simultaneously learn constraints with the desired level of confidence and a policy that achieves the desired level of performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sriram Ganapathi Subramanian;Guiliang Liu;Mohammed Elmahgiubi;Kasra Rezaee;Pascal Poupart", "authorids": "~Sriram_Ganapathi_Subramanian1;~Guiliang_Liu1;~Mohammed_Elmahgiubi1;~Kasra_Rezaee1;~Pascal_Poupart2", "gender": "M;M;;M;M", "homepage": "https://sriramsubramanian.com;http://guiliang.me/;;;https://cs.uwaterloo.ca/~ppoupart", "dblp": "217/9729;220/5411;;189/9684;26/2122", "google_scholar": "O2jvQAYAAAAJ;CuMylvEAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=KhAJWroAAAAJ", "orcid": ";;;;", "linkedin": "sriram-ganapathi-subramanian-7518a9a2/;;mohammed-elmahgiubi/;kasrarezaee;", "or_profile": "~Sriram_Ganapathi_Subramanian1;~Guiliang_Liu1;~Mohammed_Elmahgiubi1;~Kasra_Rezaee1;~Pascal_Poupart2", "aff": "Vector Institute;The Chinese University of Hong Kong, Shenzhen;Huawei Technologies Ltd.;Huawei Noah's Ark Lab;University of Waterloo", "aff_domain": "vectorinstitute.ai;cuhk.edu.hk;huawei.com;huawei.com;uwaterloo.ca", "position": "Postdoc;Assistant Professor;Researcher;senior researcher ;Full Professor", "bibtex": "@inproceedings{\nsubramanian2024confidence,\ntitle={Confidence Aware Inverse Constrained Reinforcement Learning},\nauthor={Sriram Ganapathi Subramanian and Guiliang Liu and Mohammed Elmahgiubi and Kasra Rezaee and Pascal Poupart},\nbooktitle={Forty-first International Conference on Machine 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6TCeizkLJV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4463838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16461365047216356816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "vectorinstitute.ai;cuhk.edu.hk;huawei.com;huawei.com;uwaterloo.ca", "author_num": 5, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "Vector Institute;Chinese University of Hong Kong;Huawei;University of Waterloo", "aff_unique_dep": ";;Huawei Technologies;", "aff_unique_url": "https://vectorinstitute.ai/;https://www.cuhk.edu.cn;https://www.huawei.com;https://uwaterloo.ca", "aff_unique_abbr": "Vector Institute;CUHK;Huawei;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Canada;China" }, { "title": "Rethinking the Flat Minima Searching in Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34919", "id": "6TM62kpI5c", "proceeding": "https://proceedings.mlr.press/v235/lee24aa.html", "pdf": "https://openreview.net/pdf?id=6TM62kpI5c", "openreview": "https://openreview.net/forum?id=6TM62kpI5c", "author_site": "Taehwan Lee, Sung Whan Yoon", "tldr": "", "abstract": "Despite the success of federated learning (FL) in decentralized training, bolstering the generalization of models by overcoming heterogeneity across clients remains a major challenge. To improve the generalization of FL, a group of recent works pursues flatter minima of models by employing sharpness-aware minimization in the local training at the client side. However, we observe that the global model, i.e., the aggregated model, does not lie on flat minima of the global objective, even with the effort of flatness searching in local training, which we define as flatness discrepancy. 
By rethinking and theoretically analyzing flatness searching in FL through the lens of the discrepancy problem, we propose a method called Federated Learning for Global Flatness (FedGF) that explicitly pursues flatter minima of the global model, reducing the flatness discrepancy and yielding remarkable performance gains on heterogeneous FL benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taehwan Lee;Sung Whan Yoon", "authorids": "~Taehwan_Lee2;~Sung_Whan_Yoon1", "gender": "M;M", "homepage": "https://sites.google.com/view/swyoon89/people/students;https://sites.google.com/view/swyoon89", "dblp": ";129/0978", "google_scholar": ";https://scholar.google.co.kr/citations?user=6RwONs0AAAAJ", "orcid": ";0000-0002-7202-2837", "linkedin": ";", "or_profile": "~Taehwan_Lee2;~Sung_Whan_Yoon1", "aff": "Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology", "aff_domain": "unist.ac.kr;unist.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nlee2024rethinking,\ntitle={Rethinking the Flat Minima Searching in Federated Learning},\nauthor={Taehwan Lee and Sung Whan Yoon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6TM62kpI5c}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2163681, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2166936429003451255&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "unist.ac.kr;unist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Ulsan National Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.unist.ac.kr", "aff_unique_abbr": "UNIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Position: Do Not Explain Vision Models Without Context", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34918", "id": "6UGSDDPkJw", "proceeding": "https://proceedings.mlr.press/v235/tomaszewska24a.html", "pdf": "https://openreview.net/pdf?id=6UGSDDPkJw", "openreview": "https://openreview.net/forum?id=6UGSDDPkJw", "author_site": "Paulina Tomaszewska, Przemyslaw Biecek", "tldr": "", "abstract": "Does the stethoscope in the picture make the adjacent person a doctor or a patient? This, of course, depends on the contextual relationship of the two objects. If it\u2019s obvious, why don\u2019t explanation methods for vision models use contextual information? 
In this paper, we (1) review the most popular methods of explaining computer vision models by pointing out that they do not take into account context information, (2) show examples of failures of popular XAI methods, (3) provide examples of real-world use cases where spatial context plays a significant role, (4) propose new research directions that may lead to better use of context information in explaining computer vision models, (5) argue that a change in approach to explanations is needed from *where* to *how*.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paulina Tomaszewska;Przemyslaw Biecek", "authorids": "~Paulina_Tomaszewska1;~Przemyslaw_Biecek2", "gender": ";Not Specified", "homepage": ";http://biecek.pl/", "dblp": ";68/2414", "google_scholar": ";https://scholar.google.pl/citations?user=Af0O75cAAAAJ", "orcid": "0000-0001-6767-1018;0000-0001-8423-1823", "linkedin": ";pbiecek/", "or_profile": "~Paulina_Tomaszewska1;~Przemyslaw_Biecek1", "aff": "Warsaw University of Technology;University of Warsaw", "aff_domain": "pw.edu.pl;uw.edu.pl", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\ntomaszewska2024position,\ntitle={Position: Do Not Explain Vision Models Without Context},\nauthor={Paulina Tomaszewska and Przemyslaw Biecek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6UGSDDPkJw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7366831, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A-Y3fi_4oRMJ:scholar.google.com/&scioq=Position:+Do+Not+Explain+Vision+Models+Without+Context&hl=en&as_sdt=0,44", "gs_version_total": 5, "email": "pw.edu.pl;uw.edu.pl", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Warsaw University of Technology;University of Warsaw", "aff_unique_dep": ";", "aff_unique_url": "https://www.pw.edu.pl;https://www.uw.edu.pl", "aff_unique_abbr": "WUT;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Poland" }, { "title": "Sample as you Infer: Predictive Coding with Langevin Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34917", "id": "6VQXLUy4sQ", "proceeding": "https://proceedings.mlr.press/v235/zahid24a.html", "pdf": "https://openreview.net/pdf?id=6VQXLUy4sQ", "openreview": "https://openreview.net/forum?id=6VQXLUy4sQ", "author_site": "Umais Zahid, Qinghai Guo, Zafeirios Fountas", "tldr": "", "abstract": "We present Langevin Predictive Coding (LPC), a novel algorithm for deep generative model learning that builds upon the predictive coding framework of computational neuroscience. By injecting Gaussian noise into the predictive coding inference procedure and incorporating an encoder network initialization, we reframe the approach as an amortized Langevin sampling method for optimizing a tight variational lower bound. To increase robustness to sampling step size, we present a lightweight preconditioning technique inspired by Riemannian Langevin methods and adaptive SGD. 
We compare LPC against VAEs by training generative models on benchmark datasets; our experiments demonstrate superior sample quality and faster convergence for LPC in a fraction of SGD training iterations, while matching or exceeding VAE performance across key metrics like FID, diversity and coverage.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Umais Zahid;Qinghai Guo;Zafeirios Fountas", "authorids": "~Umais_Zahid1;~Qinghai_Guo1;~Zafeirios_Fountas1", "gender": "M;M;M", "homepage": "https://umais.me/;https://www.semanticscholar.org/author/Qinghai-Guo/47747957;http://zfountas.com/", "dblp": ";12/8502;", "google_scholar": ";;https://scholar.google.co.uk/citations?user=aaEGHR4AAAAJ", "orcid": ";0000-0003-4697-9464;0000-0002-6312-3409", "linkedin": "umaisz;;zfountas/", "or_profile": "~Umais_Zahid1;~Qinghai_Guo1;~Zafeirios_Fountas1", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.", "aff_domain": "huawei.com;huawei.com;huawei.com", "position": "Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nzahid2024sample,\ntitle={Sample as you Infer: Predictive Coding with Langevin Dynamics},\nauthor={Umais Zahid and Qinghai Guo and Zafeirios Fountas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6VQXLUy4sQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5235199, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12240588447042322887&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "huawei.com;huawei.com;huawei.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Disparate Impact on Group Accuracy of Linearization for Private Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34916", "id": "6VZOONPn8S", "proceeding": "https://proceedings.mlr.press/v235/das24d.html", "pdf": "https://openreview.net/pdf?id=6VZOONPn8S", "openreview": "https://openreview.net/forum?id=6VZOONPn8S", "author_site": "Saswat Das, Marco Romanelli, Ferdinando Fioretto", "tldr": "", "abstract": "Ensuring privacy-preserving inference on cryptographically secure data is a well-known computational challenge. To alleviate the bottleneck of costly cryptographic computations in non-linear activations, recent methods have suggested linearizing a targeted portion of these activations in neural networks. This technique results in significantly reduced runtimes with often negligible impacts on accuracy. In this paper, we demonstrate that such computational benefits may lead to increased fairness costs. Specifically, we find that reducing the number of ReLU activations disproportionately decreases the accuracy for minority groups compared to majority groups. To explain these observations, we provide a mathematical interpretation under restricted assumptions about the nature of the decision boundary, while also showing the prevalence of this problem across widely used datasets and architectures. 
Finally, we show how a simple procedure altering the finetuning step for linearized models can serve as an effective mitigation strategy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saswat Das;Marco Romanelli;Ferdinando Fioretto", "authorids": "~Saswat_Das2;~Marco_Romanelli1;~Ferdinando_Fioretto1", "gender": "M;;M", "homepage": "https://www.saswatdas.ml;;http://nandofioretto.com", "dblp": "324/8479;;119/6404", "google_scholar": "NFLeQxYAAAAJ;;ASf9Q04AAAAJ", "orcid": "0000-0002-6126-1699;;", "linkedin": ";;", "or_profile": "~Saswat_Das2;~Marco_Romanelli1;~Ferdinando_Fioretto1", "aff": "University of Virginia, Charlottesville;;University of Virginia, Charlottesville", "aff_domain": "virginia.edu;;virginia.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\ndas2024disparate,\ntitle={Disparate Impact on Group Accuracy of Linearization for Private Inference},\nauthor={Saswat Das and Marco Romanelli and Ferdinando Fioretto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6VZOONPn8S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1313002, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12869171628522961344&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "virginia.edu;;virginia.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Charlottesville", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning in Deep Factor Graphs with Gaussian Belief Propagation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34915", "id": "6WYk5R86Wl", "proceeding": "https://proceedings.mlr.press/v235/nabarro24a.html", "pdf": "https://openreview.net/pdf?id=6WYk5R86Wl", "openreview": "https://openreview.net/forum?id=6WYk5R86Wl", "author_site": "Seth Nabarro, Mark van der Wilk, Andrew Davison", "tldr": "", "abstract": "We propose an approach for learning in Gaussian factor graphs. We treat all relevant quantities (inputs, outputs, parameters, activations) as random variables in a graphical model, and view training and prediction as inference problems with different observed nodes. Our experiments show that these problems can be efficiently solved with belief propagation (BP), whose updates are inherently local, presenting exciting opportunities for distributed and asynchronous training. Our approach can be scaled to deep networks and provides a natural means of continual learning: use the BP-estimated posterior of the current task as a prior for the next. 
On a video denoising task, we demonstrate the benefit of learnable parameters over a classical factor graph approach, and we show encouraging performance of deep factor graphs for continual image classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seth Nabarro;Mark van der Wilk;Andrew Davison", "authorids": "~Seth_Nabarro1;~Mark_van_der_Wilk1;~Andrew_Davison1", "gender": "M;M;M", "homepage": "https://sethnabarro.github.io/;https://mvdw.uk;http://www.doc.ic.ac.uk/~ajd/", "dblp": "222/3164;142/2927;d/AndrewJDavison", "google_scholar": "1WWzamMAAAAJ;PKcjcT4AAAAJ;https://scholar.google.co.uk/citations?user=A0ae1agAAAAJ", "orcid": ";0000-0001-7947-6682;", "linkedin": "seth-nabarro-b95258aa/;;", "or_profile": "~Seth_Nabarro1;~Mark_van_der_Wilk1;~Andrew_Davison1", "aff": "Google;University of Oxford;Imperial College London", "aff_domain": "deepmind.com;cs.ox.ac.uk;imperial.ac.uk", "position": "Intern;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nnabarro2024learning,\ntitle={Learning in Deep Factor Graphs with Gaussian Belief Propagation},\nauthor={Seth Nabarro and Mark van der Wilk and Andrew Davison},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6WYk5R86Wl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3635752, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QukHU5rd4U4J:scholar.google.com/&scioq=Learning+in+Deep+Factor+Graphs+with+Gaussian+Belief+Propagation&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "deepmind.com;cs.ox.ac.uk;imperial.ac.uk", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;University of Oxford;Imperial College London", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.ox.ac.uk;https://www.imperial.ac.uk", "aff_unique_abbr": "Google;Oxford;ICL", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Multicalibration for Confidence Scoring in LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34914", "id": "6Wauue8pWd", "proceeding": "https://proceedings.mlr.press/v235/detommaso24a.html", "pdf": "https://openreview.net/pdf?id=6Wauue8pWd", "openreview": "https://openreview.net/forum?id=6Wauue8pWd", "author_site": "Gianluca Detommaso, Martin A Bertran, Riccardo Fogliato, Aaron Roth", "tldr": "", "abstract": "This paper proposes the use of "multicalibration" to yield interpretable and reliable confidence scores for outputs generated by large language models (LLMs). Multicalibration asks for calibration not just marginally, but simultaneously across various intersecting groupings of the data. We show how to form groupings for prompt/completion pairs that are correlated with the probability of correctness via two techniques: clustering within an embedding space, and "self-annotation": querying the LLM by asking it various yes-or-no questions about the prompt. We also develop novel variants of multicalibration algorithms that offer performance improvements by reducing their tendency to overfit. 
Through systematic benchmarking across various question answering datasets and LLMs, we show how our techniques can yield confidence scores that provide substantial improvements in fine-grained measures of both calibration and accuracy compared to existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gianluca Detommaso;Martin Bertran Lopez;Riccardo Fogliato;Aaron Roth", "authorids": "~Gianluca_Detommaso1;maberlop@amazon.com;~Riccardo_Fogliato1;~Aaron_Roth1", "gender": "M;;M;M", "homepage": "https://gianlucadetommaso.github.io/;;https://ricfog.github.io/;http://www.cis.upenn.edu/~aaroth/", "dblp": "222/3223;;259/1360;80/3311", "google_scholar": "68BNz2EAAAAJ;;pYPowr8AAAAJ;https://scholar.google.com.tw/citations?user=kLUQrrYAAAAJ", "orcid": ";;;", "linkedin": "gianluca-detommaso/;;;", "or_profile": "~Gianluca_Detommaso1;maberlop@amazon.com;~Riccardo_Fogliato1;~Aaron_Roth1", "aff": "Amazon;;Amazon;University of Pennsylvania", "aff_domain": "amazon.com;;amazon.com;upenn.edu", "position": "Sr Applied Scientist;;Researcher;Full Professor", "bibtex": "@inproceedings{\ndetommaso2024multicalibration,\ntitle={Multicalibration for Confidence Scoring in {LLM}s},\nauthor={Gianluca Detommaso and Martin Bertran Lopez and Riccardo Fogliato and Aaron Roth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6Wauue8pWd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 466309, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8249649163663606091&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "amazon.com;;amazon.com;upenn.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Amazon;University of Pennsylvania", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.upenn.edu", "aff_unique_abbr": "Amazon;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34913", "id": "6XH8R7YrSk", "proceeding": "https://proceedings.mlr.press/v235/xu24h.html", "pdf": "https://openreview.net/pdf?id=6XH8R7YrSk", "openreview": "https://openreview.net/forum?id=6XH8R7YrSk", "author_site": "Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, Yi Wu", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) is currently the most widely used method to align large language models (LLMs) with human preferences. Existing RLHF methods can be roughly categorized as either reward-based or reward-free. Novel applications such as ChatGPT and Claude leverage reward-based methods that first learn a reward model and apply actor-critic algorithms, such as Proximal Policy Optimization (PPO). However, in academic benchmarks, state-of-the-art results are often achieved via reward-free methods, such as Direct Preference Optimization (DPO). Is DPO truly superior to PPO? Why does PPO perform poorly on these benchmarks? In this paper, we first conduct both theoretical and empirical studies on the algorithmic properties of DPO and show that DPO may have fundamental limitations. 
Moreover, we comprehensively examine PPO and reveal the key factors behind its best performance in fine-tuning LLMs. Finally, we benchmark DPO and PPO across a collection of RLHF testbeds, ranging from dialogue to code generation. Experimental results demonstrate that PPO is able to surpass other alignment methods in all cases and achieve state-of-the-art results in challenging code competitions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shusheng Xu;Wei Fu;Jiaxuan Gao;Wenjie Ye;Weilin Liu;Zhiyu Mei;Guangju Wang;Chao Yu;Yi Wu", "authorids": "~Shusheng_Xu1;~Wei_Fu1;~Jiaxuan_Gao1;~Wenjie_Ye1;~Weilin_Liu1;~Zhiyu_Mei1;~Guangju_Wang1;~Chao_Yu1;~Yi_Wu1", "gender": "M;M;M;M;M;M;M;F;M", "homepage": ";https://garrett4wade.github.io/;https://github.com/samjia2000/;https://github.com/Anson529;https://nicsefc.ee.tsinghua.edu.cn/people/LiuWeilin;https://nuzant.github.io;https://www.linkedin.com/in/guangju-wang/;http://zoeyuchao.github.io;https://jxwuyi.weebly.com", "dblp": "121/0926;;304/2243;;;299/5277.html;;36/6789-5;", "google_scholar": "2J051LYAAAAJ;https://scholar.google.com/citations?hl=en;;;;;;BYoq_bwAAAAJ;dusV5HMAAAAJ", "orcid": ";;;;;;;0000-0001-6975-0158;", "linkedin": ";;;;;;;;", "or_profile": "~Shusheng_Xu1;~Wei_Fu1;~Jiaxuan_Gao1;~Wenjie_Ye1;~Weilin_Liu1;~Zhiyu_Mei1;~Guangju_Wang1;~Chao_Yu1;~Yi_Wu1", "aff": "Tsinghua University;Institute for Interdisciplinary Information Sciences, Tsinghua University, Tsinghua University;Tsinghua University;Institute for Interdisciplinary Information Sciences, Tsinghua University;Openpsi;Tsinghua University;Tianyancha;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;openpsi.com;tsinghua.edu.cn;tianyancha.com;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;Undergrad student;Researcher;PhD student;Principal Researcher;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nxu2024is,\ntitle={Is {DPO} Superior to {PPO} for {LLM} Alignment? 
A Comprehensive Study},\nauthor={Shusheng Xu and Wei Fu and Jiaxuan Gao and Wenjie Ye and Weilin Liu and Zhiyu Mei and Guangju Wang and Chao Yu and Yi Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6XH8R7YrSk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 783202, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4522885353908007418&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "tsinghua.edu.cn;mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;openpsi.com;tsinghua.edu.cn;tianyancha.com;mail.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;0;1;0;2;0;0", "aff_unique_norm": "Tsinghua University;Openpsi;Tianyancha", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://www.tianyancha.com", "aff_unique_abbr": "THU;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "Fair Federated Learning via the Proportional Veto Core", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34912", "id": "6Zgjrowepn", "proceeding": "https://proceedings.mlr.press/v235/ray-chaudhury24a.html", "pdf": "https://openreview.net/pdf?id=6Zgjrowepn", "openreview": "https://openreview.net/forum?id=6Zgjrowepn", "author_site": "Bhaskar Ray Chaudhury, Aniket Murhekar, Zhuowen Yuan, Bo Li, Ruta Mehta, Ariel Procaccia", "tldr": "", "abstract": "Previous work on fairness in federated learning introduced the notion of *core stability*, which provides utility-based fairness guarantees to any subset of participating agents. However, these guarantees require strong assumptions on agent utilities that render them impractical. To address this shortcoming, we measure the quality of output models in terms of their ordinal *rank* instead of their cardinal utility, and use this insight to adapt the classical notion of *proportional veto core (PVC)* from social choice theory to the federated learning setting. We prove that models that are *PVC-stable* exist in very general learning paradigms, even allowing non-convex model sets, as well as non-convex and non-concave loss functions. We also design Rank-Core-Fed, a distributed federated learning algorithm, to train a PVC-stable model. Finally, we demonstrate that Rank-Core-Fed outperforms baselines in terms of fairness on different datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bhaskar Ray Chaudhury;Aniket Murhekar;Zhuowen Yuan;Bo Li;Ruta Mehta;Ariel D. 
Procaccia", "authorids": "~Bhaskar_Ray_Chaudhury1;~Aniket_Murhekar1;~Zhuowen_Yuan1;~Bo_Li19;~Ruta_Mehta2;~Ariel_D._Procaccia1", "gender": "M;;M;F;F;M", "homepage": "https://www.bhaskar-ray-chaudhury.com/;https://aniket2.web.illinois.edu/;;http://boli.cs.illinois.edu/;http://rutamehta.cs.illinois.edu/;http://procaccia.info/", "dblp": "228/6594.html;;304/3576;50/3402-26;50/7864;p/ArielDProcaccia", "google_scholar": "-p5GvgcAAAAJ;bGOsYz4AAAAJ;F-r0bYQAAAAJ;K8vJkTcAAAAJ;;https://scholar.google.com.tw/citations?user=8ZpV-lkAAAAJ", "orcid": ";;;;;", "linkedin": ";aniket-murhekar-99381016b/;;;;", "or_profile": "~Bhaskar_Ray_Chaudhury1;~Aniket_Murhekar1;~Zhuowen_Yuan1;~Bo_Li19;~Ruta_Mehta2;~Ariel_Procaccia1", "aff": ";Google;University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;Harvard University", "aff_domain": ";google.com;illinois.edu;illinois.edu;illinois.edu;harvard.edu", "position": ";Intern;PhD student;Assistant Professor;Associate Professor;Gordon McKay Professor of Computer Science", "bibtex": "@inproceedings{\nchaudhury2024fair,\ntitle={Fair Federated Learning via the Proportional Veto Core},\nauthor={Bhaskar Ray Chaudhury and Aniket Murhekar and Zhuowen Yuan and Bo Li and Ruta Mehta and Ariel D. Procaccia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6Zgjrowepn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 364327, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8005532727878262386&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": ";google.com;illinois.edu;illinois.edu;illinois.edu;harvard.edu", "author_num": 6, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Google;University of Illinois Urbana-Champaign;Harvard University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://illinois.edu;https://www.harvard.edu", "aff_unique_abbr": "Google;UIUC;Harvard", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Mountain View;Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Causal Action Influence Aware Counterfactual Data Augmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34911", "id": "6Zl9rv6PDx", "proceeding": "https://proceedings.mlr.press/v235/armengol-urpi-24a.html", "pdf": "https://openreview.net/pdf?id=6Zl9rv6PDx", "openreview": "https://openreview.net/forum?id=6Zl9rv6PDx", "author_site": "N\u00faria Armengol Urp\u00ed, Marco Bagatella, Marin Vlastelica, Georg Martius", "tldr": "", "abstract": "Offline data are both valuable and practical resources for teaching robots complex behaviors. Ideally, learning agents should not be constrained by the scarcity of available demonstrations, but rather generalize beyond the training distribution. However, the complexity of real-world scenarios typically requires huge amounts of data to prevent neural network policies from picking up on spurious correlations and learning non-causal relationships. We propose CAIAC, a data augmentation method that can create feasible synthetic transitions from a fixed dataset without having access to online environment interactions. 
By utilizing principled methods for quantifying causal influence, we are able to perform counterfactual reasoning by swapping $\\textit{action}$-unaffected parts of the state-space between independent trajectories in the dataset. We empirically show that this leads to a substantial increase in robustness of offline learning algorithms against distributional shift.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "N\u00faria Armengol Urp\u00ed;Marco Bagatella;Marin Vlastelica;Georg Martius", "authorids": "~N\u00faria_Armengol_Urp\u00ed1;~Marco_Bagatella1;~Marin_Vlastelica1;~Georg_Martius1", "gender": "F;;M;M", "homepage": ";;https://uni-tuebingen.de/de/264672;https://jimimvp.github.io/", "dblp": ";;47/2706;226/9727", "google_scholar": "https://scholar.google.co.uk/citations?user=Cq6i6XwAAAAJ;;https://scholar.google.de/citations?user=b-JF-UIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-2959-4119", "linkedin": "nuriaarmengolurpi;marco-bagatella-9b8017197/;;mvlastelica/", "or_profile": "~N\u00faria_Armengol_Urp\u00ed1;~Marco_Bagatella1;~Georg_Martius1;~Marin_Vlastelica_Pogan\u010di\u01071", "aff": "ETHZ - ETH Zurich;Max Planck Institute for Intelligent Systems, Max Planck Institute for Intelligent Systems;Max Planck Institute for Intelligent Systems;Max Planck Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "ethz.ch;is.tue.mpg.de;tuebingen.mpg.de;tuebingen.mpg.de", "position": "PhD student;PhD student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nurp{\\'\\i}2024causal,\ntitle={Causal Action Influence Aware Counterfactual Data Augmentation},\nauthor={N{\\'u}ria Armengol Urp{\\'\\i} and Marco Bagatella and Marin Vlastelica and Georg Martius},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6Zl9rv6PDx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8275639, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9033677540302433941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ethz.ch;is.tue.mpg.de;tuebingen.mpg.de;tuebingen.mpg.de", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ETHZ;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "ViP: A Differentially Private Foundation Model for Computer Vision", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34910", "id": "6aKwVmHQI1", "proceeding": "https://proceedings.mlr.press/v235/yu24k.html", "pdf": "https://openreview.net/pdf?id=6aKwVmHQI1", "openreview": "https://openreview.net/forum?id=6aKwVmHQI1", "author_site": "Yaodong Yu, Maziar Sanjabi, Yi Ma, Kamalika Chaudhuri, Chuan Guo", "tldr": "", "abstract": "Artificial intelligence (AI) has seen a tremendous surge in capabilities thanks to the use of foundation models trained on internet-scale data. On the flip side, the uncurated nature of internet-scale data also poses significant privacy and legal risks, as they often contain personal information or copyrighted material that should not be trained on without permission. 
In this work, we propose, as a mitigation measure, a recipe to train foundation vision models via self-supervised learning with a differential privacy (DP) guarantee. We identify masked autoencoders as a suitable learning algorithm that aligns well with DP-SGD, and train *ViP*---a **Vi**sion transformer with differential **P**rivacy---under a strict privacy budget of $\\epsilon=8$ on the LAION400M dataset. We evaluate the quality of representation learned by ViP using standard downstream vision tasks; in particular, ViP achieves a (non-private) linear probing accuracy of 55.7% on ImageNet, comparable to that of end-to-end trained AlexNet (trained and evaluated on ImageNet). Our result suggests that scaling to internet-scale data can be practical for private learning. Code and DP pre-trained models are available at https://github.com/facebookresearch/ViP-MAE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaodong Yu;Maziar Sanjabi;Yi Ma;Kamalika Chaudhuri;Chuan Guo", "authorids": "~Yaodong_Yu4;~Maziar_Sanjabi1;~Yi_Ma4;~Kamalika_Chaudhuri1;~Chuan_Guo1", "gender": "M;M;M;F;M", "homepage": "https://yaodongyu.github.io;https://sites.google.com/view/maziar;http://people.eecs.berkeley.edu/~yima/;http://cseweb.ucsd.edu/users/kamalika;https://sites.google.com/view/chuanguo", "dblp": ";21/8577;;56/6435;", "google_scholar": "bZ9oyW8AAAAJ;bc_N2-oAAAAJ;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ;I-DJ7EsAAAAJ;0gp5M-kAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yaodong_Yu4;~Maziar_Sanjabi1;~Yi_Ma4;~Kamalika_Chaudhuri1;~Chuan_Guo1", "aff": "Electrical Engineering & Computer Science Department, University of California Berkeley;Meta;University of California, Berkeley;University of California, San Diego;Meta", "aff_domain": "eecs.berkeley.edu;meta.com;berkeley.edu;ucsd.edu;meta.com", "position": "PhD student;Researcher;Full Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nyu2024vip,\ntitle={ViP: A Differentially Private Foundation Model for Computer Vision},\nauthor={Yaodong Yu and Maziar Sanjabi and Yi Ma and Kamalika Chaudhuri and Chuan Guo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6aKwVmHQI1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1238263, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6269909239911795616&as_sdt=8000005&sciodt=0,19&hl=en", "gs_version_total": 8, "email": "eecs.berkeley.edu;meta.com;berkeley.edu;ucsd.edu;meta.com", "author_num": 5, "aff_unique_index": "0;1;0;2;1", "aff_unique_norm": "University of California, Berkeley;Meta;University of California, San Diego", "aff_unique_dep": "Electrical Engineering & Computer Science Department;Meta Platforms, Inc.;", "aff_unique_url": "https://www.berkeley.edu;https://meta.com;https://www.ucsd.edu", "aff_unique_abbr": "UC Berkeley;Meta;UCSD", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;San Diego", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Locally Estimated Global Perturbations are Better than Local Perturbations for Federated Sharpness-aware Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34909", "id": "6axTFAlzRV", "proceeding": "https://proceedings.mlr.press/v235/fan24c.html", "pdf": "https://openreview.net/pdf?id=6axTFAlzRV", "openreview": 
"https://openreview.net/forum?id=6axTFAlzRV", "author_site": "Ziqing Fan, Shengchao Hu, Jiangchao Yao, Gang Niu, Ya Zhang, Masashi Sugiyama, Yanfeng Wang", "tldr": "", "abstract": "In federated learning (FL), the multi-step update and data heterogeneity among clients often lead to a loss landscape with sharper minima, degenerating the performance of the resulted global model. Prevalent federated approaches incorporate sharpness-aware minimization (SAM) into local training to mitigate this problem. However, the local loss landscapes may not accurately reflect the flatness of global loss landscape in heterogeneous environments; as a result, minimizing local sharpness and calculating perturbations on client data might not align the efficacy of SAM in FL with centralized training. To overcome this challenge, we propose FedLESAM, a novel algorithm that locally estimates the direction of global perturbation on client side as the difference between global models received in the previous active and current rounds. Besides the improved quality, FedLESAM also speed up federated SAM-based approaches since it only performs once backpropagation in each iteration. Theoretically, we prove a slightly tighter bound than its original FedSAM by ensuring consistent perturbation. Empirically, we conduct comprehensive experiments on four federated benchmark datasets under three partition strategies to demonstrate the superior performance and efficiency of FedLESAM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziqing Fan;Shengchao Hu;Jiangchao Yao;Gang Niu;Ya Zhang;Masashi Sugiyama;Yanfeng Wang", "authorids": "~Ziqing_Fan1;~Shengchao_Hu1;~Jiangchao_Yao1;~Gang_Niu1;~Ya_Zhang1;~Masashi_Sugiyama1;~Yanfeng_Wang1", "gender": ";;M;M;F;M;M", "homepage": ";;https://sunarker.github.io/;https://niug1984.github.io;https://annzhanglion.github.io/;http://www.ms.k.u-tokyo.ac.jp/sugi/;https://cmic.sjtu.edu.cn/wangyanfeng/", "dblp": ";;166/5900;26/3367-1;85/3714-2;35/1228;55/5407-1.html", "google_scholar": ";;w8oDh9QAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;pbjw9sMAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;0000-0002-5390-9053;0000-0001-6658-6743;0000-0002-3196-2347", "linkedin": ";;;;;;", "or_profile": "~Ziqing_Fan1;~Shengchao_Hu1;~Jiangchao_Yao1;~Gang_Niu1;~Ya_Zhang1;~Masashi_Sugiyama1;~Yanfeng_Wang1", "aff": ";;Shanghai Artificial Intelligence Laboratory;Southeast University;Shanghai Jiaotong University;The University of Tokyo;Shanghai Jiaotong University", "aff_domain": ";;pjlab.org.cn;seu.edu.cn;sjtu.edu.cn;u-tokyo.ac.jp;sjtu.edu.cn", "position": ";;Researcher;Adjunct Full Professor;Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfan2024locally,\ntitle={Locally Estimated Global Perturbations are Better than Local Perturbations for Federated Sharpness-aware Minimization},\nauthor={Ziqing Fan and Shengchao Hu and Jiangchao Yao and Gang Niu and Ya Zhang and Masashi Sugiyama and Yanfeng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6axTFAlzRV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1893046, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11859149207976809103&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": 
";;pjlab.org.cn;seu.edu.cn;sjtu.edu.cn;u-tokyo.ac.jp;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Southeast University;Shanghai Jiao Tong University;University of Tokyo", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.shailab.org/;https://www.seu.edu.cn/;https://www.sjtu.edu.cn;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Shanghai AI Lab;SEU;SJTU;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;Japan" }, { "title": "Is Inverse Reinforcement Learning Harder than Standard Reinforcement Learning? A Theoretical Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34908", "id": "6dKUu2EkZy", "proceeding": "https://proceedings.mlr.press/v235/zhao24m.html", "pdf": "https://openreview.net/pdf?id=6dKUu2EkZy", "openreview": "https://openreview.net/forum?id=6dKUu2EkZy", "author_site": "Lei Zhao, Mengdi Wang, Yu Bai", "tldr": "", "abstract": "Inverse Reinforcement Learning (IRL)---the problem of learning reward functions from demonstrations of an *expert policy*---plays a critical role in developing intelligent systems. While widely used in applications, theoretical understandings of IRL present unique challenges and remain less developed compared with standard RL. For example, it remains open how to do IRL efficiently in standard *offline* settings with pre-collected data, where states are obtained from a *behavior policy* (which could be the expert policy itself), and actions are sampled from the expert policy. This paper provides the first line of results for efficient IRL in vanilla offline and online settings using polynomial samples and runtime. Our algorithms and analyses seamlessly adapt the pessimism principle commonly used in offline RL, and achieve IRL guarantees in stronger metrics than considered in existing work. We provide lower bounds showing that our sample complexities are nearly optimal. As an application, we also show that the learned rewards can *transfer* to another target MDP with suitable guarantees when the target MDP satisfies certain similarity assumptions with the original (source) MDP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lei Zhao;Mengdi Wang;Yu Bai", "authorids": "~Lei_Zhao18;~Mengdi_Wang1;~Yu_Bai1", "gender": "F;;M", "homepage": "http://mwang.princeton.edu;https://yubai.org;", "dblp": ";03/6325-17.html;", "google_scholar": ";owqhKD8AAAAJ;", "orcid": ";;", "linkedin": ";;https://www.linkedin.com/", "or_profile": "~Mengdi_Wang1;~Yu_Bai1;~Zhao_Lei2", "aff": "Princeton University;Salesforce Research;University of Science and Technology of China", "aff_domain": "princeton.edu;salesforce.com;ustc.edu.cn", "position": "Full Professor;Research Scientist;Undergrad student", "bibtex": "@inproceedings{\nzhao2024is,\ntitle={Is Inverse Reinforcement Learning Harder than Standard Reinforcement Learning? 
A Theoretical Perspective},\nauthor={Lei Zhao and Mengdi Wang and Yu Bai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6dKUu2EkZy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 764280, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18031395478558921297&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "princeton.edu;salesforce.com;ustc.edu.cn", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Princeton University;Salesforce;University of Science and Technology of China", "aff_unique_dep": ";Salesforce Research;", "aff_unique_url": "https://www.princeton.edu;https://research.salesforce.com;http://www.ustc.edu.cn", "aff_unique_abbr": "Princeton;Salesforce;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;China" }, { "title": "Subgraphormer: Unifying Subgraph GNNs and Graph Transformers via Graph Products", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34907", "id": "6djDWVTUEq", "proceeding": "https://proceedings.mlr.press/v235/bar-shalom24a.html", "pdf": "https://openreview.net/pdf?id=6djDWVTUEq", "openreview": "https://openreview.net/forum?id=6djDWVTUEq", "author_site": "Guy Bar Shalom, Beatrice Bevilacqua, Haggai Maron", "tldr": "", "abstract": "In the realm of Graph Neural Networks (GNNs), two exciting research directions have recently emerged: Subgraph GNNs and Graph Transformers. In this paper, we propose an architecture that integrates both approaches, dubbed *Subgraphormer*, which combines the enhanced expressive power, message-passing mechanisms, and aggregation schemes from Subgraph GNNs with attention and positional encodings, arguably the most important components in Graph Transformers. Our method is based on an intriguing new connection we reveal between Subgraph GNNs and product graphs, suggesting that Subgraph GNNs can be formulated as Message Passing Neural Networks (MPNNs) operating on a product of the graph with itself. We use this formulation to design our architecture: first, we devise an attention mechanism based on the connectivity of the product graph. Following this, we propose a novel and efficient positional encoding scheme for Subgraph GNNs, which we derive as a positional encoding for the product graph. 
Our experimental results demonstrate significant performance improvements over both Subgraph GNNs and Graph Transformers on a wide range of datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guy Bar-Shalom;Beatrice Bevilacqua;Haggai Maron", "authorids": "~Guy_Bar-Shalom1;~Beatrice_Bevilacqua1;~Haggai_Maron1", "gender": "M;F;M", "homepage": "https://barsguy.github.io/;http://beabevi.github.io/;https://haggaim.github.io/", "dblp": "321/1651;275/2364;181/6629", "google_scholar": "9Zvzm5MAAAAJ;;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Guy_Bar-Shalom1;~Beatrice_Bevilacqua1;~Haggai_Maron1", "aff": "Technion, Technion;Purdue University;NVIDIA", "aff_domain": "technion.ac.il;purdue.edu;nvidia.com", "position": "PhD student;PhD student;Research Scientist", "bibtex": "@inproceedings{\nbar-shalom2024subgraphormer,\ntitle={Subgraphormer: Unifying Subgraph {GNN}s and Graph Transformers via Graph Products},\nauthor={Guy Bar-Shalom and Beatrice Bevilacqua and Haggai Maron},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6djDWVTUEq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1781420, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10787199175375005769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "technion.ac.il;purdue.edu;nvidia.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Technion - Israel Institute of Technology;Purdue University;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.purdue.edu;https://www.nvidia.com", "aff_unique_abbr": "Technion;Purdue;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Israel;United States" }, { "title": "Faster Streaming and Scalable Algorithms for Finding Directed Dense Subgraphs in Large Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34906", "id": "6h6ovHcC9G", "proceeding": "https://proceedings.mlr.press/v235/mitrovic24a.html", "pdf": "https://openreview.net/pdf?id=6h6ovHcC9G", "openreview": "https://openreview.net/forum?id=6h6ovHcC9G", "author_site": "Slobodan Mitrovic, Theodore Pan", "tldr": "", "abstract": "Finding dense subgraphs is a fundamental algorithmic tool in data mining, community detection, and clustering. In this problem, the aim is to find an induced subgraph whose edge-to-vertex ratio is maximized. We show how to find a $(2+\\epsilon)$ approximation of the directed densest subgraph on randomized streams in a single pass while using $O(n \\cdot {\\rm poly} \\log n)$ memory on $n$-vertex graphs. In contrast, the approach by Bahmani et al. (VLDB 2012) uses $O(\\log n)$ passes and by Esfandiari et al. (2015) makes one pass but uses $O(n^{3/2})$ memory; both algorithms also apply to arbitrary-ordered streams. Our techniques extend to Massively Parallel Computation (MPC), yielding quadratic improvement over state-of-the-art by Bahmani et al. (VLDB 2012 and WAW 2014). We empirically show that the quality of our output is essentially the same as that of Bahmani et al. 
(VLDB 2012) while being $2$ times faster on large graphs, even on non-randomly ordered streams.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Slobodan Mitrovic;Theodore Pan", "authorids": "~Slobodan_Mitrovic1;~Theodore_Pan1", "gender": ";M", "homepage": ";", "dblp": ";361/1582", "google_scholar": ";", "orcid": ";", "linkedin": ";theodore-pan-a803ba220/", "or_profile": "~Slobodan_Mitrovic1;~Theodore_Pan1", "aff": ";University of California, Davis", "aff_domain": ";ucdavis.edu", "position": ";Undergrad student", "bibtex": "@inproceedings{\nmitrovic2024faster,\ntitle={Faster Streaming and Scalable Algorithms for Finding Directed Dense Subgraphs in Large Graphs},\nauthor={Slobodan Mitrovic and Theodore Pan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6h6ovHcC9G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 850558, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eSwRH1vpzt4J:scholar.google.com/&scioq=Faster+Streaming+and+Scalable+Algorithms+for+Finding+Directed+Dense+Subgraphs+in+Large+Graphs&hl=en&as_sdt=0,34", "gs_version_total": 6, "email": ";ucdavis.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of California, Davis", "aff_unique_dep": "", "aff_unique_url": "https://www.ucdavis.edu", "aff_unique_abbr": "UC Davis", "aff_campus_unique_index": "0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Scalable AI Safety via Doubly-Efficient Debate", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34905", "id": "6jmdOTRMIO", "proceeding": "https://proceedings.mlr.press/v235/brown-cohen24a.html", "pdf": "https://openreview.net/pdf?id=6jmdOTRMIO", "openreview": "https://openreview.net/forum?id=6jmdOTRMIO", "author_site": "Jonah Brown-Cohen, Geoffrey Irving, Georgios Piliouras", "tldr": "", "abstract": "The emergence of pre-trained AI systems with powerful capabilities across a diverse and ever-increasing set of complex domains has raised a critical challenge for AI safety as tasks can become too complicated for humans to judge directly. Irving et al. (2018) proposed a debate method in this direction with the goal of pitting the power of such AI models against each other until the problem of identifying (mis)-alignment is broken down into a manageable subtask. While the promise of this approach is clear, the original framework was based on the assumption that the honest strategy is able to simulate *deterministic* AI systems for an *exponential* number of steps, limiting its applicability. 
In this paper, we show how to address these challenges by designing a new set of debate protocols where the honest strategy can always succeed using a simulation of a *polynomial* number of steps, whilst being able to verify the alignment of *stochastic* AI systems, even when the dishonest strategy is allowed to use exponentially many simulation steps.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonah Brown-Cohen;Geoffrey Irving;Georgios Piliouras", "authorids": "~Jonah_Brown-Cohen1;~Geoffrey_Irving2;~Georgios_Piliouras1", "gender": "M;M;", "homepage": "https://jonahbc.github.io/;https://naml.us;", "dblp": "157/1513;95/4978;62/1236", "google_scholar": "fRc3A80AAAAJ;TrdtzgwAAAAJ;", "orcid": ";;", "linkedin": ";geoffreyirving;", "or_profile": "~Jonah_Brown-Cohen1;~Geoffrey_Irving2;~Georgios_Piliouras1", "aff": "Google DeepMind;Google DeepMind;Singapore University of Technology and Design", "aff_domain": "deepmind.com;deepmind.com;sutd.edu.sg", "position": "Researcher;Safety Researcher;Associate Professor", "bibtex": "@inproceedings{\nbrown-cohen2024scalable,\ntitle={Scalable {AI} Safety via Doubly-Efficient Debate},\nauthor={Jonah Brown-Cohen and Geoffrey Irving and Georgios Piliouras},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6jmdOTRMIO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 419593, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17236456443179009770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "deepmind.com;deepmind.com;sutd.edu.sg", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Google;Singapore University of Technology and Design", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.sutd.edu.sg", "aff_unique_abbr": "DeepMind;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;Singapore" }, { "title": "SelfVC: Voice Conversion With Iterative Refinement using Self Transformations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34904", "id": "6kMMgmeM2U", "proceeding": "https://proceedings.mlr.press/v235/neekhara24a.html", "pdf": "https://openreview.net/pdf?id=6kMMgmeM2U", "openreview": "https://openreview.net/forum?id=6kMMgmeM2U", "author_site": "Paarth Neekhara, Shehzeen Hussain, Rafael Valle, Boris Ginsburg, Rishabh Ranjan, Shlomo Dubnov, Farinaz Koushanfar, Julian McAuley", "tldr": "", "abstract": "We propose SelfVC, a training strategy to iteratively improve a voice conversion model with self-synthesized examples. Previous efforts on voice conversion focus on factorizing speech into explicitly disentangled representations that separately encode speaker characteristics and linguistic content. However, disentangling speech representations to capture such attributes using task-specific loss terms can lead to information loss. In this work, instead of explicitly disentangling attributes with loss terms, we present a framework to train a controllable voice conversion model on entangled speech representations derived from self-supervised learning (SSL) and speaker verification models. First, we develop techniques to derive prosodic information from the audio signal and SSL representations to train predictive submodules in the synthesis model. 
Next, we propose a training strategy to iteratively improve the synthesis model for voice conversion, by creating a challenging training objective using self-synthesized examples. We demonstrate that incorporating such self-synthesized examples during training improves the speaker similarity of generated speech as compared to a baseline voice conversion model trained solely on heuristically perturbed inputs. Our framework is trained without any text and achieves state-of-the-art results in zero-shot voice conversion on metrics evaluating naturalness, speaker similarity, and intelligibility of synthesized audio.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paarth Neekhara;Shehzeen Samarah Hussain;Rafael Valle;Boris Ginsburg;Rishabh Ranjan;Shlomo Dubnov;Farinaz Koushanfar;Julian McAuley", "authorids": "~Paarth_Neekhara1;~Shehzeen_Samarah_Hussain1;~Rafael_Valle1;~Boris_Ginsburg1;~Rishabh_Ranjan5;~Shlomo_Dubnov1;~Farinaz_Koushanfar1;~Julian_McAuley1", "gender": "M;;Not Specified;;M;M;F;M", "homepage": "https://paarthneekhara.github.io/;;http://rafaelvalle.github.io;;;http://dub.ucsd.edu;https://farinaz.eng.ucsd.edu/;http://cseweb.ucsd.edu/~jmcauley/", "dblp": "194/3168;;;;148/9922;89/4032;k/FarinazKoushanfar.html;29/3483", "google_scholar": "lbls-cUAAAAJ;;SktxU8IAAAAJ;;o7WaNbMAAAAJ;NJfiIl8AAAAJ;3XnMVUAAAAAJ;icbo4M0AAAAJ", "orcid": ";;;;;;0000-0003-0798-3794;0000-0003-0955-7588", "linkedin": ";;vallerafael/;;ranjan-rishabh;shlomo-dubnov-10141/;farinaz-koushanfar-9372a6a/;", "or_profile": "~Paarth_Neekhara1;~Shehzeen_Samarah_Hussain1;~Rafael_Valle1;~Boris_Ginsburg1;~Rishabh_Ranjan5;~Shlomo_Dubnov1;~Farinaz_Koushanfar1;~Julian_McAuley1", "aff": "NVIDIA;;NVIDIA;;;University of California, San Diego;University of California, San Diego;University of California, San Diego, University of California, San Diego", "aff_domain": "nvidia.com;;nvidia.com;;;ucsd.edu;ucsd.edu;eng.ucsd.edu", "position": "Researcher;;Senior Research Scientist;;;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nneekhara2024selfvc,\ntitle={Self{VC}: Voice Conversion With Iterative Refinement using Self Transformations},\nauthor={Paarth Neekhara and Shehzeen Samarah Hussain and Rafael Valle and Boris Ginsburg and Rishabh Ranjan and Shlomo Dubnov and Farinaz Koushanfar and Julian McAuley},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6kMMgmeM2U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1890251, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10338007784549955322&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nvidia.com;;nvidia.com;;;ucsd.edu;ucsd.edu;eng.ucsd.edu", "author_num": 8, "aff_unique_index": "0;0;1;1;1", "aff_unique_norm": "NVIDIA;University of California, San Diego", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.ucsd.edu", "aff_unique_abbr": "NVIDIA;UCSD", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Tackling Prevalent Conditions in Unsupervised Combinatorial Optimization: Cardinality, Minimum, Covering, and More", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34903", "id": "6n99bIxb3r", "proceeding": "https://proceedings.mlr.press/v235/bu24b.html", 
"pdf": "https://openreview.net/pdf?id=6n99bIxb3r", "openreview": "https://openreview.net/forum?id=6n99bIxb3r", "author_site": "Fanchen Bu, Hyeonsoo Jo, Soo Yong Lee, Sungsoo Ahn, Kijung Shin", "tldr": "", "abstract": "Combinatorial optimization (CO) is naturally discrete, making machine-learning techniques based on differentiable optimization inapplicable. Karalias & Loukas (2020) adapted the probabilistic method by Erd\u0151s & Spencer (1974), to incorporate CO into differentiable optimization. Their work ignited the research on unsupervised learning for CO, composed of two main components: probabilistic objectives and derandomization. However, each component confronts unique challenges. First, deriving objectives under complex conditions and constraints is nontrivial. Second, the derandomization process is underexplored, and the existing derandomization methods are either random sampling or naive rounding. In this work, we aim to tackle complex conditions in unsupervised CO. First, we concretize the targets for probabilistic objective construction and derandomization with theoretical justification. Then, for various complex conditions commonly involved in different CO problems, we derive nontrivial objectives and derandomization to meet the targets. Finally, we apply the derivations to various CO problems. Via extensive experiments on synthetic and real-world graphs, we validate the correctness of our derivations and show our empirical superiority w.r.t. both optimization quality and speed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fanchen Bu;Hyeonsoo Jo;Soo Yong Lee;Sungsoo Ahn;Kijung Shin", "authorids": "~Fanchen_Bu1;~Hyeonsoo_Jo1;~Soo_Yong_Lee1;~Sungsoo_Ahn1;~Kijung_Shin2", "gender": "M;M;M;M;M", "homepage": "https://github.com/bokveizen;https://hyeonsoojo.github.io/;https://syleetolow.notion.site/Soo-Yong-s-Homepage-2e5cfa74f1784bf4957e7ba0ab0fbc7a;https://sungsooahn.super.site/;https://kijungs.github.io/", "dblp": "270/0123;254/8496;348/9631;90/5164;153/2052", "google_scholar": "XjNu7-AAAAAJ;dx5_RmkAAAAJ;U3vZd0kAAAAJ;XTenHs0AAAAJ;https://scholar.google.co.kr/citations?user=Yp3Cz5AAAAAJ", "orcid": "0000-0003-0497-3902;0000-0002-9281-8672;0000-0001-7957-7600;;0000-0002-2872-1526", "linkedin": "fanchen-bu-1268a1255/;hyeonsoo-jo-203960179/;syleeheal/;;kijungshin/", "or_profile": "~Fanchen_Bu1;~Hyeonsoo_Jo1;~Soo_Yong_Lee1;~Sungsoo_Ahn1;~Kijung_Shin2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST;Pohang University of Science and Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.edu;postech.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nbu2024tackling,\ntitle={Tackling Prevalent Conditions in Unsupervised Combinatorial Optimization: Cardinality, Minimum, Covering, and More},\nauthor={Fanchen Bu and Hyeonsoo Jo and Soo Yong Lee and Sungsoo Ahn and Kijung Shin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6n99bIxb3r}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 743087, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4503149429423659176&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": 
"kaist.ac.kr;kaist.ac.kr;kaist.edu;postech.ac.kr;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.postech.ac.kr", "aff_unique_abbr": "KAIST;POSTECH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "GeoAB: Towards Realistic Antibody Design and Reliable Affinity Maturation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34902", "id": "6pHP51F55x", "proceeding": "https://proceedings.mlr.press/v235/lin24s.html", "pdf": "https://openreview.net/pdf?id=6pHP51F55x", "openreview": "https://openreview.net/forum?id=6pHP51F55x", "author_site": "Haitao Lin, Lirong Wu, Yufei Huang, Yunfan Liu, Odin Zhang, Yuanqing Zhou, Rui Sun, Stan Z Li", "tldr": "", "abstract": "Increasing works for antibody design are emerging to generate sequences and structures in Complementarity Determining Regions (CDRs), but problems still exist. We focus on two of them: (i) authenticity of the generated structure and (ii) rationality of the affinity maturation, and propose GeoAB as a solution. In specific, GeoAB-Designergenerates CDR structures with realistic internal geometries, composed of a generative geometry initializer (Geo-Initializer) and a position refiner (Geo-Refiner); GeoAB-Optimizer achieves affinity maturation by accurately predicting both the mutation effects and structures of mutant antibodies with the same network architecture as Geo-Refiner. Experiments show that GeoAB achieves state-of-the-art performance in CDR co-design and mutation effect predictions, and fulfills the discussed tasks effectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haitao Lin;Lirong Wu;Yufei Huang;Yunfan Liu;Odin Zhang;Yuanqing Zhou;Rui Sun;Stan Z. Li", "authorids": "~Haitao_Lin2;~Lirong_Wu1;~Yufei_Huang4;~Yunfan_Liu2;~Odin_Zhang1;yuanqzhou@zju.edu.cn;~Rui_Sun13;~Stan_Z._Li2", "gender": "M;;M;M;;;M;", "homepage": ";;https://2021.igem.org/Team:ZJU-China;https://github.com/XYxiyang;https://haotianzhangai4science.github.io/;;https://github.com/Siirui;", "dblp": "34/1040;15/10330;68/1946-2;170/8550-2;;;;", "google_scholar": "o5A23qIAAAAJ;Tk7TrCoAAAAJ;qmTjdwIAAAAJ;;ypnp3YwAAAAJ;;;", "orcid": ";;0009-0007-8184-4529;0009-0002-1639-5855;;;;", "linkedin": ";;;;;;;", "or_profile": "~Haitao_Lin2;~Lirong_Wu1;~Yufei_Huang4;~Yunfan_Liu2;~Odin_Zhang1;yuanqzhou@zju.edu.cn;~Rui_Sun13;~Stan_Z._Li2", "aff": "Westlake University;Westlake University;Zhejiang University;Zhejiang University;;;Tongji University;", "aff_domain": "westlake.edu.cn;westlake.edu.cn;zju.edu.cn;zju.edu.cn;;;tongji.edu.cn;", "position": "PhD student;PhD student;PhD student;PhD student;;;Undergrad student;", "bibtex": "@inproceedings{\nlin2024geoab,\ntitle={Geo{AB}: Towards Realistic Antibody Design and Reliable Affinity Maturation},\nauthor={Haitao Lin and Lirong Wu and Yufei Huang and Yunfan Liu and Odin Zhang and Yuanqing Zhou and Rui Sun and Stan Z. 
Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6pHP51F55x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1522346, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17668693239718232210&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "westlake.edu.cn;westlake.edu.cn;zju.edu.cn;zju.edu.cn;;;tongji.edu.cn;", "author_num": 8, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "Westlake University;Zhejiang University;Tongji University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.westlake.edu.cn;https://www.zju.edu.cn;https://www.tongji.edu.cn", "aff_unique_abbr": "WU;ZJU;Tongji", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Improved Operator Learning by Orthogonal Attention", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34901", "id": "6w7zkf9FBR", "proceeding": "https://proceedings.mlr.press/v235/xiao24c.html", "pdf": "https://openreview.net/pdf?id=6w7zkf9FBR", "openreview": "https://openreview.net/forum?id=6w7zkf9FBR", "author_site": "Zipeng Xiao, Zhongkai Hao, Bokai Lin, Zhijie Deng, Hang Su", "tldr": "", "abstract": "This work presents orthogonal attention for constructing neural operators to serve as surrogates to model the solutions of a family of Partial Differential Equations (PDEs). The motivation is that the kernel integral operator, which is usually at the core of neural operators, can be reformulated with orthonormal eigenfunctions. Inspired by the success of the neural approximation of eigenfunctions (Deng et al., 2022), we opt to directly parameterize the involved eigenfunctions with flexible neural networks (NNs), based on which the input function is then transformed by the rule of kernel integral. Surprisingly, the resulting NN module bears a striking resemblance to regular attention mechanisms, albeit without softmax. Instead, it incorporates an orthogonalization operation that provides regularization during model training and helps mitigate overfitting, particularly in scenarios with limited data availability. In practice, the orthogonalization operation can be implemented with minimal additional overheads. 
Experiments on six standard neural operator benchmark datasets comprising both regular and irregular geometries show that our method can outperform competing baselines by decent margins.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zipeng Xiao;Zhongkai Hao;Bokai Lin;Zhijie Deng;Hang Su", "authorids": "~Zipeng_Xiao1;~Zhongkai_Hao1;~Bokai_Lin1;~Zhijie_Deng1;~Hang_Su3", "gender": "M;M;M;M;M", "homepage": "https://github.com/xzppp;https://github.com/zhuanrangqun;https://thudzj.github.io/;;https://haozhongkai.github.io/", "dblp": ";;209/4959;26/5371-6;270/0220.html", "google_scholar": ";;J3dR0sUAAAAJ;dxN1_X0AAAAJ;dfSzq27ZiVoC", "orcid": ";;0000-0002-0932-1631;;", "linkedin": ";;;;", "or_profile": "~Zipeng_Xiao1;~Bokai_Lin1;~Zhijie_Deng1;~Hang_Su2;~Hao_Zhongkai1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Tsinghua University;Tsinghua University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn", "position": "MS student;Undergrad student;Assistant Professor;Associate Professor;PhD student", "bibtex": "@inproceedings{\nxiao2024improved,\ntitle={Improved Operator Learning by Orthogonal Attention},\nauthor={Zipeng Xiao and Zhongkai Hao and Bokai Lin and Zhijie Deng and Hang Su},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6w7zkf9FBR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 886373, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5762943687772372832&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 7, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "Shanghai Jiao Tong University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "SJTU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Leveraging Self-Consistency for Data-Efficient Amortized Bayesian Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34900", "id": "6wVlH96oMX", "proceeding": "https://proceedings.mlr.press/v235/schmitt24a.html", "pdf": "https://openreview.net/pdf?id=6wVlH96oMX", "openreview": "https://openreview.net/forum?id=6wVlH96oMX", "author_site": "Marvin Schmitt, Desi Ivanova, Daniel Habermann, Ullrich Koethe, Paul Buerkner, Stefan Radev", "tldr": "", "abstract": "We propose a method to improve the efficiency and accuracy of amortized Bayesian inference by leveraging universal symmetries in the joint probabilistic model of parameters and data. In a nutshell, we invert Bayes' theorem and estimate the marginal likelihood based on approximate representations of the joint model. Upon perfect approximation, the marginal likelihood is constant across all parameter values by definition. However, errors in approximate inference lead to undesirable variance in the marginal likelihood estimates across different parameter values. We penalize violations of this symmetry with a self-consistency loss which significantly improves the quality of approximate inference in low data regimes and can be used to augment the training of popular neural density estimators. 
We apply our method to a number of synthetic problems and realistic scientific models, discovering notable advantages in the context of both neural posterior and likelihood approximation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marvin Schmitt;Desi R. Ivanova;Daniel Habermann;Ullrich Koethe;Paul-Christian B\u00fcrkner;Stefan T. Radev", "authorids": "~Marvin_Schmitt1;~Desi_R._Ivanova1;~Daniel_Habermann1;~Ullrich_Koethe1;~Paul-Christian_B\u00fcrkner1;~Stefan_T._Radev1", "gender": ";F;M;M;;M", "homepage": ";https://desirivanova.com;https://daniel-habermann.de;https://hci.iwr.uni-heidelberg.de/vislearn/people/ullrich-koethe/;;https://faculty.rpi.edu/stefan-radev", "dblp": ";286/8335;358/7058;15/809;;", "google_scholar": ";AmX6sMIAAAAJ;;gt-yaNMAAAAJ;;JbDfkRkAAAAJ", "orcid": ";;0000-0003-3685-7287;0000-0001-6036-1287;;0000-0002-6702-9559", "linkedin": ";dr-ivanova/;;;;stefan-radev-21b713187/", "or_profile": "~Marvin_Schmitt1;~Desi_R._Ivanova1;~Daniel_Habermann1;~Ullrich_Koethe1;~Paul-Christian_B\u00fcrkner1;~Stefan_T._Radev1", "aff": ";University of Oxford;Technische Universit\u00e4t Dortmund;Heidelberg University;;Rensselaer Polytechnic Institute", "aff_domain": ";ox.ac.uk;tu-dortmund.de;uni-heidelberg.de;;rpi.edu", "position": ";PhD student;Postdoc;Adjunct Professor;;Assistant Professor", "bibtex": "@inproceedings{\nschmitt2024leveraging,\ntitle={Leveraging Self-Consistency for Data-Efficient Amortized Bayesian Inference},\nauthor={Marvin Schmitt and Desi R. Ivanova and Daniel Habermann and Ullrich Koethe and Paul-Christian B{\\\"u}rkner and Stefan T. Radev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6wVlH96oMX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1691127, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=953717708054211071&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": ";ox.ac.uk;tu-dortmund.de;uni-heidelberg.de;;rpi.edu", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Oxford;Technische Universit\u00e4t Dortmund;Heidelberg University;Rensselaer Polytechnic Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ox.ac.uk;https://www.tu-dortmund.de;https://www.uni-heidelberg.de;https://www.rpi.edu", "aff_unique_abbr": "Oxford;TU Dortmund;Uni Heidelberg;RPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "United Kingdom;Germany;United States" }, { "title": "Algorithmic Stability Unleashed: Generalization Bounds with Unbounded Losses", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34899", "id": "6yQ5mIYxjj", "proceeding": "https://proceedings.mlr.press/v235/li24cs.html", "pdf": "https://openreview.net/pdf?id=6yQ5mIYxjj", "openreview": "https://openreview.net/forum?id=6yQ5mIYxjj", "author_site": "Shaojie Li, Bowei Zhu, Yong Liu", "tldr": "", "abstract": "One of the central problems of statistical learning theory is quantifying the generalization ability of learning algorithms within a probabilistic framework. Algorithmic stability is a powerful tool for deriving generalization bounds; however, it typically builds on a critical assumption that losses are bounded. In this paper, we relax this condition to unbounded loss functions with subweibull diameter. 
This gives new generalization bounds for algorithmic stability and also includes existing results for subgaussian and subexponential diameters as special cases. Furthermore, we provide a refined stability analysis by developing generalization bounds which can be $\\sqrt{n}$-times faster than the previous results, where $n$ is the sample size. Our main technical contribution is general concentration inequalities for subweibull random variables, which may be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaojie Li;Bowei Zhu;Yong Liu", "authorids": "~Shaojie_Li2;~Bowei_Zhu1;~Yong_Liu7", "gender": "M;;M", "homepage": ";;https://iie-liuyong.github.io", "dblp": ";304/1543;29/4867-18", "google_scholar": ";;vVhmzbAAAAAJ", "orcid": ";;0000-0002-6739-621X", "linkedin": ";;", "or_profile": "~Shaojie_Li2;~Bowei_Zhu1;~Yong_Liu7", "aff": "Renmin University of China;Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;ruc.edu.cn", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nli2024algorithmic,\ntitle={Algorithmic Stability Unleashed: Generalization Bounds with Unbounded Losses},\nauthor={Shaojie Li and Bowei Zhu and Yong Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6yQ5mIYxjj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 355686, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3002706169684833497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ruc.edu.cn;ruc.edu.cn;ruc.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "PPFLOW: Target-Aware Peptide Design with Torsional Flow Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34898", "id": "70jplnkLMe", "proceeding": "https://proceedings.mlr.press/v235/lin24z.html", "pdf": "https://openreview.net/pdf?id=70jplnkLMe", "openreview": "https://openreview.net/forum?id=70jplnkLMe", "author_site": "Haitao Lin, Odin Zhang, Huifeng Zhao, Dejun Jiang, Lirong Wu, Zicheng Liu, Yufei Huang, Stan Z Li", "tldr": "", "abstract": "Therapeutic peptides have proven to have great pharmaceutical value and potential in recent decades. However, methods of AI-assisted peptide drug discovery are not fully explored. To fill the gap, we propose a target-aware peptide design method called PPFlow, based on conditional flow matching on torus manifolds, to model the internal geometries of torsion angles for the peptide structure design. Besides, we establish a protein-peptide binding dataset named PPBench2024 to fill the void of massive data for the task of structure-based peptide drug design and to allow the training of deep learning methods. 
Extensive experiments show that PPFlow reaches state-of-the-art performance in tasks of peptide drug generation and optimization in comparison with baseline models, and can be generalized to other tasks including docking and side-chain packing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haitao Lin;Odin Zhang;Huifeng Zhao;Dejun Jiang;Lirong Wu;Zicheng Liu;Yufei Huang;Stan Z. Li", "authorids": "~Haitao_Lin2;~Odin_Zhang1;~Huifeng_Zhao1;~Dejun_Jiang2;~Lirong_Wu1;~Zicheng_Liu2;~Yufei_Huang4;~Stan_Z._Li2", "gender": "M;;;;;M;M;M", "homepage": ";https://haotianzhangai4science.github.io/;https://cadd.zju.edu.cn;https://www.researchgate.net/profile/Jiang-Dejun;;;https://2021.igem.org/Team:ZJU-China;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "34/1040;;;;15/10330;l/ZichengLiu-6;68/1946-2;l/StanZLi", "google_scholar": "o5A23qIAAAAJ;ypnp3YwAAAAJ;;B1J94LwAAAAJ;Tk7TrCoAAAAJ;https://scholar.google.com/citations?hl=zh-CN;qmTjdwIAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;;;0009-0007-8184-4529;", "linkedin": ";;;;;;;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Haitao_Lin2;~Odin_Zhang1;~Huifeng_Zhao1;~Dejun_Jiang2;~Lirong_Wu1;~Zicheng_Liu2;~Yufei_Huang4;~Stan_Z._Li1", "aff": "Westlake University;;Zhejiang University;;Westlake University;Zhejiang University;Zhejiang University;Westlake University", "aff_domain": "westlake.edu.cn;;zju.edu.cn;;westlake.edu.cn;zju.edu.cn;zju.edu.cn;westlake.edu.cn", "position": "PhD student;;PhD student;;PhD student;PhD student;PhD student;Chair Professor", "bibtex": "@inproceedings{\nlin2024ppflow,\ntitle={{PPFLOW}: Target-Aware Peptide Design with Torsional Flow Matching},\nauthor={Haitao Lin and Odin Zhang and Huifeng Zhao and Dejun Jiang and Lirong Wu and Zicheng Liu and Yufei Huang and Stan Z. Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=70jplnkLMe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6763810, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10772760861664713101&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 11, "email": "westlake.edu.cn;;zju.edu.cn;;westlake.edu.cn;zju.edu.cn;zju.edu.cn;westlake.edu.cn", "author_num": 8, "aff_unique_index": "0;1;0;1;1;0", "aff_unique_norm": "Westlake University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.westlake.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "WU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Agnostic Sample Compression Schemes for Regression", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34897", "id": "71ktaA3ihI", "proceeding": "https://proceedings.mlr.press/v235/attias24b.html", "pdf": "https://openreview.net/pdf?id=71ktaA3ihI", "openreview": "https://openreview.net/forum?id=71ktaA3ihI", "author_site": "Idan Attias, Steve Hanneke, Aryeh Kontorovich, Menachem Sadigurschi", "tldr": "", "abstract": "We obtain the first positive results for bounded sample compression in the agnostic regression setting with the $\\ell_p$ loss, where $p\\in [1,\\infty]$. 
We construct a generic approximate sample compression scheme for real-valued function classes exhibiting exponential size in the fat-shattering dimension but independent of the sample size. Notably, for linear regression, an approximate compression of size linear in the dimension is constructed. Moreover, for $\\ell_1$ and $\\ell_\\infty$ losses, we can even exhibit an efficient exact sample compression scheme of size linear in the dimension. We further show that for every other $\\ell_p$ loss, $p\\in (1,\\infty)$, there does not exist an exact agnostic compression scheme of bounded size. This refines and generalizes a negative result of David, Moran, and Yehudayoff (2016) for the $\\ell_2$ loss. We close by posing general open questions: for agnostic regression with $\\ell_1$ loss, does every function class admit an exact compression scheme of polynomial size in the pseudo-dimension? For the $\\ell_2$ loss, does every function class admit an approximate compression scheme of polynomial size in the fat-shattering dimension? These questions generalize Warmuth's classic sample compression conjecture for realizable-case classification (Warmuth, 2003).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Idan Attias;Steve Hanneke;Aryeh Kontorovich;Menachem Sadigurschi", "authorids": "~Idan_Attias1;~Steve_Hanneke1;~Aryeh_Kontorovich1;~Menachem_Sadigurschi1", "gender": "M;M;;M", "homepage": "https://www.idanattias.com;http://www.stevehanneke.com;http://www.cs.bgu.ac.il/~karyeh/;https://menisadi.github.io/", "dblp": "228/6803;40/154;20/10289;https://dblp.org/pers/s/Sadigurschi:Menachem.html", "google_scholar": "-L6uUy0AAAAJ;fEhNO7YAAAAJ;https://scholar.google.co.il/citations?user=UNVQ5DsAAAAJ;dmGbLNYAAAAJ", "orcid": ";;;", "linkedin": ";;prof-aryeh-kontorovich-7b236055/;", "or_profile": "~Idan_Attias1;~Steve_Hanneke1;~Aryeh_Kontorovich1;~Menachem_Sadigurschi1", "aff": "Tel Aviv University;Purdue University;Ben Gurion University of the Negev;Ben Gurion University of the Negev, Technion", "aff_domain": "tau.ac.il;purdue.edu;bgu.ac.il;bgu.ac.il", "position": "PhD student;Assistant Professor;Professor;PhD student", "bibtex": "@inproceedings{\nattias2024agnostic,\ntitle={Agnostic Sample Compression Schemes for Regression},\nauthor={Idan Attias and Steve Hanneke and Aryeh Kontorovich and Menachem Sadigurschi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=71ktaA3ihI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 455930, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11546762857874908670&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tau.ac.il;purdue.edu;bgu.ac.il;bgu.ac.il", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Tel Aviv University;Purdue University;Ben Gurion University of the Negev", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tau.ac.il;https://www.purdue.edu;https://www.bgu.ac.il", "aff_unique_abbr": "TAU;Purdue;BGU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Israel;United States" }, { "title": "From Self-Attention to Markov Models: Unveiling the Dynamics of Generative Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34896", "id": "72oT4mPLUb", "proceeding": 
"https://proceedings.mlr.press/v235/ildiz24a.html", "pdf": "https://openreview.net/pdf?id=72oT4mPLUb", "openreview": "https://openreview.net/forum?id=72oT4mPLUb", "author_site": "Muhammed Emrullah Ildiz, Yixiao HUANG, Yingcong Li, Ankit Singh Rawat, Samet Oymak", "tldr": "", "abstract": "Modern language models rely on the transformer architecture and attention mechanism to perform language understanding and text generation. In this work, we study learning a 1-layer self-attention model from a set of prompts and the associated outputs sampled from the model. We first establish a formal link between the self-attention mechanism and Markov models under suitable conditions: Inputting a prompt to the self-attention model samples the output token according to a *context-conditioned Markov chain* (CCMC). *CCMC* is obtained by weighing the transition matrix of a standard Markov chain according to the sufficient statistics of the prompt/context. Building on this formalism, we develop identifiability/coverage conditions for the data distribution that guarantee consistent estimation of the latent model under a teacher-student setting and establish sample complexity guarantees under IID data. Finally, we study the problem of learning from a single output trajectory generated in response to an initial prompt. We characterize a *winner-takes-all* phenomenon where the generative process of self-attention evolves to sampling from a small set of *winner tokens* that dominate the context window. This provides a mathematical explanation to the tendency of modern LLMs to generate repetitive text.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Muhammed Emrullah Ildiz;Yixiao Huang;Yingcong Li;Ankit Singh Rawat;Samet Oymak", "authorids": "~Muhammed_Emrullah_Ildiz1;~Yixiao_Huang3;~Yingcong_Li1;~Ankit_Singh_Rawat1;~Samet_Oymak2", "gender": "M;M;;M;M", "homepage": ";https://yixiao-huang.github.io/;https://yingcong-li.github.io/;https://ankitsrawat.github.io/home/;https://sota.engin.umich.edu/", "dblp": ";130/6820-4;244/4435;https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;89/8771", "google_scholar": "-Mt1UfUAAAAJ;iTEcewwAAAAJ;9uWgjIUAAAAJ;http://scholar.google.com/citations?user=U0_ab4cAAAAJ;AY6InkoAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Muhammed_Emrullah_Ildiz1;~Yixiao_Huang3;~Yingcong_Li1;~Ankit_Singh_Rawat1;~Samet_Oymak1", "aff": "University of Michigan - Ann Arbor;University of California, Berkeley;University of Michigan - Ann Arbor;Google;University of Michigan - Ann Arbor", "aff_domain": "umich.edu;berkeley.edu;umich.edu;google.com;umich.edu", "position": "PhD student;PhD student;PhD student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nildiz2024from,\ntitle={From Self-Attention to Markov Models: Unveiling the Dynamics of Generative Transformers},\nauthor={Muhammed Emrullah Ildiz and Yixiao Huang and Yingcong Li and Ankit Singh Rawat and Samet Oymak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=72oT4mPLUb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1373634, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=701405927026046935&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "umich.edu;berkeley.edu;umich.edu;google.com;umich.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "University of 
Michigan;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.umich.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UM;UC Berkeley;Google", "aff_campus_unique_index": "0;1;0;2;0", "aff_campus_unique": "Ann Arbor;Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "EvoRainbow: Combining Improvements in Evolutionary Reinforcement Learning for Policy Search", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34895", "id": "75Hes6Zse4", "proceeding": "https://proceedings.mlr.press/v235/li24cp.html", "pdf": "https://openreview.net/pdf?id=75Hes6Zse4", "openreview": "https://openreview.net/forum?id=75Hes6Zse4", "author_site": "Pengyi Li, Yan Zheng, Hongyao Tang, Xian Fu, Jianye Hao", "tldr": "", "abstract": "Both Evolutionary Algorithms (EAs) and Reinforcement Learning (RL) have demonstrated powerful capabilities in policy search with different principles. A promising direction is to combine the respective strengths of both for efficient policy optimization. To this end, many works have proposed various mechanisms to integrate EAs and RL. However, it is still unclear which of these mechanisms are complementary and can be fully combined. In this paper, we revisit different mechanisms from five perspectives: 1) Interaction Mode, 2) Individual Architecture, 3) EAs and operators, 4) Impact of EA on RL, and 5) Fitness Surrogate and Usage. We evaluate the effectiveness of each mechanism and experimentally analyze the reasons for the more effective mechanisms. Using the most effective mechanisms, we develop EvoRainbow and EvoRainbow-Exp, which outperform strong baselines and provide state-of-the-art performance across various tasks with distinct characteristics. 
To promote community development, we release the code at https://github.com/yeshenpy/EvoRainbow.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengyi Li;YAN ZHENG;Hongyao Tang;Xian Fu;Jianye HAO", "authorids": "~Pengyi_Li1;~YAN_ZHENG1;~Hongyao_Tang1;~Xian_Fu1;~Jianye_HAO1", "gender": "M;M;M;M;M", "homepage": "https://yeshenpy.github.io/;https://yanzzzzz.github.io;https://bluecontra.github.io/;https://cyanwatts.github.io/;http://www.icdai.org/jianye.html", "dblp": "195/6948;10/2381-2;220/4275;54/1085;21/7664.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;yIqzRH4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": "0009-0009-8546-2346;;;;0000-0002-0422-8235", "linkedin": ";;;;", "or_profile": "~Pengyi_Li1;~YAN_ZHENG1;~Hongyao_Tang1;~Xian_Fu1;~Jianye_HAO1", "aff": "Tianjin University;Tianjin University, China;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn;mila.umontreal.ca;tju.edu.cn;tju.edu.cn", "position": "PhD student;Associate Professor;Postdoc;MS student;Associate Professor", "bibtex": "@inproceedings{\nli2024evorainbow,\ntitle={EvoRainbow: Combining Improvements in Evolutionary Reinforcement Learning for Policy Search},\nauthor={Pengyi Li and YAN ZHENG and Hongyao Tang and Xian Fu and Jianye HAO},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=75Hes6Zse4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3897036, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16767498533727196670&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "tju.edu.cn;tju.edu.cn;mila.umontreal.ca;tju.edu.cn;tju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Tianjin University;University of Montreal", "aff_unique_dep": ";Montreal Institute for Learning Algorithms", "aff_unique_url": "http://www.tju.edu.cn;https://www.mila.quebec", "aff_unique_abbr": "TJU;MILA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Canada" }, { "title": "Recovering the Pre-Fine-Tuning Weights of Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34894", "id": "761UxjOTHB", "proceeding": "https://proceedings.mlr.press/v235/horwitz24a.html", "pdf": "https://openreview.net/pdf?id=761UxjOTHB", "openreview": "https://openreview.net/forum?id=761UxjOTHB", "author_site": "Eliahu Horwitz, Jonathan Kahana, Yedid Hoshen", "tldr": "", "abstract": "The dominant paradigm in generative modeling consists of two steps: i) pre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained model with human values via fine-tuning. This practice is considered safe, as no current method can recover the unsafe, *pre-fine-tuning* model weights. In this paper, we demonstrate that this assumption is often false. Concretely, we present *Spectral DeTuning*, a method that can recover the weights of the pre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In contrast to previous attacks that attempt to recover pre-fine-tuning capabilities, our method aims to recover the exact pre-fine-tuning weights. 
Our approach exploits this new vulnerability against large-scale models such as a personalized Stable Diffusion and an aligned Mistral. The code is available at https://vision.huji.ac.il/spectral_detuning/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eliahu Horwitz;Jonathan Kahana;Yedid Hoshen", "authorids": "~Eliahu_Horwitz1;~Jonathan_Kahana1;~Yedid_Hoshen3", "gender": "M;M;M", "homepage": "https://horwitz.ai;;https://www.cs.huji.ac.il/~ydidh/", "dblp": "268/8318;317/0994;136/0280", "google_scholar": "NyLx5nIAAAAJ;;https://scholar.google.co.il/citations?user=6y1-qS4AAAAJ", "orcid": ";;", "linkedin": "eliahu-horwitz/;jonathan-kahana-a92b96221/;", "or_profile": "~Eliahu_Horwitz1;~Jonathan_Kahana1;~Yedid_Hoshen3", "aff": "Hebrew University of Jerusalem;Hebrew University of Jerusalem;Google", "aff_domain": "huji.ac.il;huji.ac.il;google.com", "position": "PhD student;PhD student;Researcher", "bibtex": "@inproceedings{\nhorwitz2024recovering,\ntitle={Recovering the Pre-Fine-Tuning Weights of Generative Models},\nauthor={Eliahu Horwitz and Jonathan Kahana and Yedid Hoshen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=761UxjOTHB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5434160, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9790485706109183480&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "huji.ac.il;huji.ac.il;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Hebrew University of Jerusalem;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.huji.ac.il;https://www.google.com", "aff_unique_abbr": "HUJI;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Jerusalem;Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Israel;United States" }, { "title": "The Pitfalls of Next-Token Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34893", "id": "76zq8Wkl6Z", "proceeding": "https://proceedings.mlr.press/v235/bachmann24a.html", "pdf": "https://openreview.net/pdf?id=76zq8Wkl6Z", "openreview": "https://openreview.net/forum?id=76zq8Wkl6Z", "author_site": "Gregor Bachmann, Vaishnavh Nagarajan", "tldr": "", "abstract": "Can a mere next-token predictor faithfully model human thinking? Our work is aimed at crystallizing this intuitive concern, which is currently fragmented in the literature. First, we emphasize isolating the two phases of next-token prediction that are often conflated: autoregression during inference vs. teacher-forcing during training. We argue that the previously-identified problem of \"exponential error accumulation\" is a symptom of autoregressive inference. But more concerningly, we identify that teacher-forcing can let the model fit the training data by cheating, causing total in-distribution failure. We design a minimal planning task where empirically both the Transformer and the Mamba architecture fail in this manner - remarkably, despite the task being easy to learn. Overall, our work consolidates these and other essential arguments surrounding next-token prediction. 
We hope this effort can ground future discussions and inspire explorations beyond the next-token prediction paradigm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gregor Bachmann;Vaishnavh Nagarajan", "authorids": "~Gregor_Bachmann1;~Vaishnavh_Nagarajan3", "gender": "M;M", "homepage": "http://www.da.inf.ethz.ch/people/GregorBachmann;https://vaishnavh.github.io/", "dblp": ";161/0079", "google_scholar": "bbGqqloAAAAJ;https://scholar.google.nl/citations?user=LrsjJfwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Gregor_Bachmann1;~Vaishnavh_Nagarajan1", "aff": "Swiss Federal Institute of Technology;Google", "aff_domain": "ethz.ch;google.com", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nbachmann2024the,\ntitle={The Pitfalls of Next-Token Prediction},\nauthor={Gregor Bachmann and Vaishnavh Nagarajan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=76zq8Wkl6Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1047131, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14911491618623343799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "ethz.ch;google.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ethz.ch;https://www.google.com", "aff_unique_abbr": "ETH Zurich;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Symmetry Induces Structure and Constraint of Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34892", "id": "7AF0AMI4AE", "proceeding": "https://proceedings.mlr.press/v235/ziyin24a.html", "pdf": "https://openreview.net/pdf?id=7AF0AMI4AE", "openreview": "https://openreview.net/forum?id=7AF0AMI4AE", "tldr": "", "abstract": "Due to common architecture designs, symmetries exist extensively in contemporary neural networks. In this work, we unveil the importance of the loss function symmetries in affecting, if not deciding, the learning behavior of machine learning models. We prove that every mirror-reflection symmetry, with reflection surface $O$, in the loss function leads to the emergence of a constraint on the model parameters $\\theta$: $O^T\\theta =0$. This constraint becomes satisfied when either the weight decay or gradient noise is large. Common instances of mirror symmetries in deep learning include rescaling, rotation, and permutation symmetry. As direct corollaries, we show that rescaling symmetry leads to sparsity, rotation symmetry leads to low rankness, and permutation symmetry leads to homogeneous ensembling. 
Then, we show that the theoretical framework can explain intriguing phenomena, such as the loss of plasticity and various collapse phenomena in neural networks, and suggest how symmetries can be used to design an elegant algorithm to enforce hard constraints in a differentiable way.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liu Ziyin", "authorids": "~Liu_Ziyin1", "gender": "", "homepage": "https://www.mit.edu/~ziyinl/", "dblp": "", "google_scholar": "NpN9oRMAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Liu_Ziyin1", "aff": "Massachusetts Institute of Technology", "aff_domain": "mit.edu", "position": "Postdoc", "bibtex": "@inproceedings{\nziyin2024symmetry,\ntitle={Symmetry Induces Structure and Constraint of Learning},\nauthor={Liu Ziyin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7AF0AMI4AE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3054890, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12013958151650756783&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mit.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Graphon Mean Field Games with a Representative Player: Analysis and Learning Algorithm", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34891", "id": "7C4EQqtb02", "proceeding": "https://proceedings.mlr.press/v235/zhou24u.html", "pdf": "https://openreview.net/pdf?id=7C4EQqtb02", "openreview": "https://openreview.net/forum?id=7C4EQqtb02", "author_site": "Fuzhong Zhou, Chenyu Zhang, Xu Chen, Xuan Di", "tldr": "", "abstract": "We propose a discrete time graphon game formulation on continuous state and action spaces using a representative player to study stochastic games with heterogeneous interaction among agents. This formulation admits both conceptual and mathematical advantages, compared to a widely adopted formulation using a continuum of players. We prove the existence and uniqueness of the graphon equilibrium with mild assumptions, and show that this equilibrium can be used to construct an approximate solution for the finite player game, which is challenging to analyze and solve due to the curse of dimensionality. 
An online oracle-free learning algorithm is developed to solve the equilibrium numerically, and sample complexity analysis is provided for its convergence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fuzhong Zhou;Chenyu Zhang;Xu Chen;Xuan Di", "authorids": "~Fuzhong_Zhou1;~Chenyu_Zhang2;~Xu_Chen29;~Xuan_Di1", "gender": "F;M;M;F", "homepage": "https://fuzhongzhou.github.io;https://zcysxy.github.io/;https://www.researchgate.net/profile/Xu-Chen-36;https://sharondi-columbia.wixsite.com/ditectlab", "dblp": "378/6140;136/1220-2;;", "google_scholar": "g6T9YxUAAAAJ;6MAGp1QAAAAJ;MU4xWsYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0002-0228-6877;0009-0005-3612-4894;0000-0002-1006-0926;0000-0003-2925-7697", "linkedin": ";;xu-chen-433600236/;", "or_profile": "~Fuzhong_Zhou1;~Chenyu_Zhang2;~Xu_Chen29;~Xuan_Di1", "aff": "Columbia University;Massachusetts Institute of Technology;;Columbia University", "aff_domain": "columbia.edu;mit.edu;;columbia.edu", "position": "PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nzhou2024graphon,\ntitle={Graphon Mean Field Games with a Representative Player: Analysis and Learning Algorithm},\nauthor={Fuzhong Zhou and Chenyu Zhang and Xu Chen and Xuan Di},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7C4EQqtb02}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5852303, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10098544733915385939&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "columbia.edu;mit.edu;;columbia.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Columbia University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.columbia.edu;https://web.mit.edu", "aff_unique_abbr": "Columbia;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Characterizing Truthfulness in Large Language Model Generations with Local Intrinsic Dimension", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34890", "id": "7DbIyQlfaO", "proceeding": "https://proceedings.mlr.press/v235/yin24c.html", "pdf": "https://openreview.net/pdf?id=7DbIyQlfaO", "openreview": "https://openreview.net/forum?id=7DbIyQlfaO", "author_site": "Fan Yin, Jayanth Srinivasa, Kai-Wei Chang", "tldr": "", "abstract": "We study how to characterize and predict the truthfulness of texts generated from large language models (LLMs), which serves as a crucial step in building trust between humans and LLMs. Although several approaches based on entropy or verbalized uncertainty have been proposed to calibrate model predictions, these methods are often intractable, sensitive to hyperparameters, and less reliable when applied in generative tasks with LLMs. In this paper, we suggest investigating internal activations and quantifying LLM's truthfulness using the local intrinsic dimension (LID) of model activations. Through experiments on four question answering (QA) datasets, we demonstrate the effectiveness of our proposed method. 
Additionally, we study intrinsic dimensions in LLMs and their relations with model layers, autoregressive language modeling, and the training of LLMs, revealing that intrinsic dimensions can be a powerful approach to understanding LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fan Yin;Jayanth Srinivasa;Kai-Wei Chang", "authorids": "~Fan_Yin1;~Jayanth_Srinivasa1;~Kai-Wei_Chang1", "gender": "M;M;M", "homepage": ";;http://kwchang.net", "dblp": ";285/5006;18/2428", "google_scholar": "klShdV0AAAAJ;HtNfeKYAAAAJ;fqDBtzYAAAAJ", "orcid": ";;0000-0001-5365-0072", "linkedin": "fan-y-60b666180/;;kai-wei-chang-41239040", "or_profile": "~Fan_Yin1;~Jayanth_Srinivasa1;~Kai-Wei_Chang1", "aff": "University of California, Los Angeles;Cisco;Amazon", "aff_domain": "cs.ucla.edu;cisco.com;amazon.com", "position": "PhD student;Researcher;Researcher", "bibtex": "@inproceedings{\nyin2024characterizing,\ntitle={Characterizing Truthfulness in Large Language Model Generations with Local Intrinsic Dimension},\nauthor={Fan Yin and Jayanth Srinivasa and Kai-Wei Chang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7DbIyQlfaO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 885406, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3580910911007993645&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "cs.ucla.edu;cisco.com;amazon.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Los Angeles;Cisco Systems;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.ucla.edu;https://www.cisco.com;https://www.amazon.com", "aff_unique_abbr": "UCLA;Cisco;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "DiffFPR: Diffusion Prior for Oversampled Fourier Phase Retrieval", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34889", "id": "7E4c2gyP0R", "proceeding": "https://proceedings.mlr.press/v235/li24bj.html", "pdf": "https://openreview.net/pdf?id=7E4c2gyP0R", "openreview": "https://openreview.net/forum?id=7E4c2gyP0R", "author_site": "Ji Li, Chao Wang", "tldr": "", "abstract": "This paper tackles the challenging Fourier phase retrieval problem, the *absolute uniqueness* of which does not hold. The existence of *equivalent solutions* (a.k.a. trivial solution ambiguity) hinders successful recovery, especially for multi-channel color images. The traditional iterative engine, such as the Relaxed Averaged Alternating Reflections (RAAR), can be applied to reconstruct the image channel-wise. However, due to the *relative uniqueness* of the solution, the restoration is not automatically aligned with the correct orientation for each channel, resulting in a reconstructed image that deviates significantly from the true solution manifold. To address this issue, by penalizing the mismatch of the image channels, a diffusion model as a strong prior of the color image is integrated into the iterative engine. The combination of the traditional iterative engine and the diffusion model provides an effective solution to the oversampled Fourier phase retrieval problem. The resulting algorithm, *DiffFPR*, is validated by experiments. 
The code is available at https://github.com/Chilie/DiffFPR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ji Li;Chao Wang", "authorids": "~Ji_Li3;~Chao_Wang35", "gender": "M;F", "homepage": "http://chilie.github.io/about.html;https://scholar.google.com/citations?hl=en&user=57qzWYMAAAAJ", "dblp": ";", "google_scholar": "utI10ZoAAAAJ;", "orcid": "0000-0001-5244-0778;", "linkedin": ";", "or_profile": "~Ji_Li3;~Chao_Wang35", "aff": "Capital Normal University;University of Kansas Medical Center", "aff_domain": "cnu.edu.cn;kumc.edu", "position": "Associate Professor;Postdoc", "bibtex": "@inproceedings{\nli2024difffpr,\ntitle={Diff{FPR}: Diffusion Prior for Oversampled Fourier Phase Retrieval},\nauthor={Ji Li and Chao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7E4c2gyP0R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 570840, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17526380868715866106&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "cnu.edu.cn;kumc.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Capital Normal University;University of Kansas Medical Center", "aff_unique_dep": ";", "aff_unique_url": "http://www.cnu.edu.cn;https://www.kumc.edu", "aff_unique_abbr": "CNU;KUMC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Position: Standardization of Behavioral Use Clauses is Necessary for the Adoption of Responsible Licensing of AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34888", "id": "7JKVPNEBkU", "proceeding": "https://proceedings.mlr.press/v235/mcduff24a.html", "pdf": "https://openreview.net/pdf?id=7JKVPNEBkU", "openreview": "https://openreview.net/forum?id=7JKVPNEBkU", "author_site": "Daniel McDuff, Tim Korjakow, Scott Cambo, Jesse Benjamin, Jenny Lee, Yacine Jernite, Carlos Mu\u00f1oz Ferrandis, Aaron Gokaslan, Alek Tarkowski, Joseph Lindley, A. Feder Cooper, Danish Contractor", "tldr": "", "abstract": "Growing concerns over negligent or malicious uses of AI have increased the appetite for tools that help manage the risks of the technology. In 2018, licenses with behavioral-use clauses (commonly referred to as Responsible AI Licenses) were proposed to give developers a framework for releasing AI assets while specifying restrictions on their use to mitigate negative applications. As of the end of 2023, on the order of 40,000 software and model repositories have adopted responsible AI licenses. Notable models licensed with behavioral use clauses include BLOOM and LLaMA2 (language), Stable Diffusion (image), and GRID (robotics). This paper explores why and how these licenses have been adopted, and why and how they have been adapted to fit particular use cases. We use a mixed-methods methodology of qualitative interviews, clustering of license clauses, and quantitative analysis of license adoption. Based on this evidence we take the position that responsible AI licenses need standardization to avoid confusing users or diluting their impact. At the same time, customization of behavioral restrictions is also appropriate in some contexts (e.g., medical domains). 
We advocate for \u201cstandardized customization\u201d that can meet users\u2019 needs and can be supported via tooling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel McDuff;Tim Korjakow;Scott Cambo;Jesse Josua Benjamin;Jenny Lee;Yacine Jernite;Carlos Mu\u00f1oz Ferrandis;Aaron Gokaslan;Alek Tarkowski;Joseph Lindley;A. Feder Cooper;Danish Contractor", "authorids": "~Daniel_McDuff1;tim.korjakow@gmx.de;scottallencambo@gmail.com;j.j.benjamin@lancaster.ac.uk;leewall@gmail.com;~Yacine_Jernite1;~Carlos_Mu\u00f1oz_Ferrandis1;~Aaron_Gokaslan1;alek@openfuture.eu;joseph.lindley@gmail.com;~A._Feder_Cooper1;~Danish_Contractor2", "gender": "M;;;;;M;;M;;;;", "homepage": "http://alumni.media.mit.edu/~djmcduff/;;;;;http://cs.nyu.edu/~jernite/yj/;;https://skylion007.github.io/;;;https://afedercooper.info;", "dblp": "63/9606;;;;;http://dblp.uni-trier.de/pers/hd/j/Jernite:Yacine;;220/6816;;;260/0514;", "google_scholar": "m7Jr-b4AAAAJ;;;;;AK_7EBgAAAAJ;;Mt2wyL4AAAAJ;;;https://scholar.google.ch/citations?hl=en;", "orcid": ";;;;;;;0000-0002-3575-2961;;;0000-0002-4892-681X;", "linkedin": ";;;;;;carlos-mu%C3%B1oz-ferrandis-a22592105/;aarongokaslan/;;;;", "or_profile": "~Daniel_McDuff1;tim.korjakow@gmx.de;scottallencambo@gmail.com;j.j.benjamin@lancaster.ac.uk;leewall@gmail.com;~Yacine_Jernite1;~Carlos_Mu\u00f1oz_Ferrandis1;~Aaron_Gokaslan1;alek@openfuture.eu;joseph.lindley@gmail.com;~A._Feder_Cooper1;~Danish_Contractor2", "aff": "Google;;;;;;;Cornell University;;;Cornell University;", "aff_domain": "google.com;;;;;;;cornell.edu;;;cornell.edu;", "position": "Principal Researcher;;;;;;;PhD student;;;PhD student;", "bibtex": "@inproceedings{\nmcduff2024position,\ntitle={Position: Standardization of Behavioral Use Clauses is Necessary for the Adoption of Responsible Licensing of {AI}},\nauthor={Daniel McDuff and Tim Korjakow and Scott Cambo and Jesse Josua Benjamin and Jenny Lee and Yacine Jernite and Carlos Mu{\\~n}oz Ferrandis and Aaron Gokaslan and Alek Tarkowski and Joseph Lindley and A. Feder Cooper and Danish Contractor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7JKVPNEBkU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 705744, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "google.com;;;;;;;cornell.edu;;;cornell.edu;", "author_num": 12, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;Cornell University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.cornell.edu", "aff_unique_abbr": "Google;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Convergence and Complexity Guarantee for Inexact First-order Riemannian Optimization Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34887", "id": "7KtFQnF368", "proceeding": "https://proceedings.mlr.press/v235/li24b.html", "pdf": "https://openreview.net/pdf?id=7KtFQnF368", "openreview": "https://openreview.net/forum?id=7KtFQnF368", "author_site": "Yuchen Li, Laura Balzano, Deanna Needell, Hanbaek Lyu", "tldr": "", "abstract": "We analyze inexact Riemannian gradient descent (RGD) where Riemannian gradients and retractions are inexactly (and cheaply) computed. 
Our focus is on understanding when inexact RGD converges and what its complexity is in the general nonconvex and constrained setting. We answer these questions in a general framework of tangential Block Majorization-Minimization (tBMM). We establish that tBMM converges to an $\\epsilon$-stationary point within $O(\\epsilon^{-2})$ iterations. Under a mild assumption, the results still hold when the subproblem is solved inexactly in each iteration, provided the total optimality gap is bounded. Our general analysis applies to a wide range of classical algorithms with Riemannian constraints, including inexact RGD and the proximal gradient method on Stiefel manifolds. We numerically validate that tBMM shows improved performance over existing methods when applied to various problems, including nonnegative tensor decomposition with Riemannian constraints, regularized nonnegative matrix factorization, and low-rank matrix recovery problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Li;Laura Balzano;Deanna Needell;Hanbaek Lyu", "authorids": "~Yuchen_Li11;~Laura_Balzano1;~Deanna_Needell2;~Hanbaek_Lyu1", "gender": "M;F;Not Specified;", "homepage": "https://yuchenli966.com/;http://web.eecs.umich.edu/~girasole/;https://www.math.ucla.edu/~deanna/index.html;https://www.hanbaeklyu.com", "dblp": ";25/6625;03/2691;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com/citations?hl=en;;gDFWvgQAAAAJ", "orcid": ";0000-0003-2914-123X;0000-0002-8058-8638;", "linkedin": ";;;", "or_profile": "~Yuchen_Li11;~Laura_Balzano1;~Deanna_Needell2;~Hanbaek_Lyu1", "aff": "University of Wisconsin - Madison;University of Michigan - Ann Arbor;University of California, Los Angeles;University of Wisconsin, Madison", "aff_domain": "wisc.edu;umich.edu;ucla.edu;wisc.edu", "position": "PhD student;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024convergence,\ntitle={Convergence and Complexity Guarantee for Inexact First-order Riemannian Optimization Algorithms},\nauthor={Yuchen Li and Laura Balzano and Deanna Needell and Hanbaek Lyu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7KtFQnF368}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1751358, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_Vj1ISK54fAJ:scholar.google.com/&scioq=Convergence+and+Complexity+Guarantee+for+Inexact+First-order+Riemannian+Optimization+Algorithms&hl=en&as_sdt=0,48", "gs_version_total": 8, "email": "wisc.edu;umich.edu;ucla.edu;wisc.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Wisconsin-Madison;University of Michigan;University of California, Los Angeles;University of Wisconsin", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.wisc.edu;https://www.umich.edu;https://www.ucla.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UM;UCLA;UW", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Madison;Ann Arbor;Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Double Momentum Method for Lower-Level Constrained Bilevel Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34886", "id": "7OPHCeXcSS", "proceeding": "https://proceedings.mlr.press/v235/shi24a.html", "pdf": 
"https://openreview.net/pdf?id=7OPHCeXcSS", "openreview": "https://openreview.net/forum?id=7OPHCeXcSS", "author_site": "Wanli Shi, Yi Chang, Bin Gu", "tldr": "", "abstract": "Bilevel optimization (BO) has recently gained prominence in many machine learning applications due to its ability to capture the nested structure inherent in these problems. Recently, many hypergradient methods have been proposed as effective solutions for solving large-scale problems. However, current hypergradient methods for the lower-level constrained bilevel optimization (LCBO) problems need very restrictive assumptions, namely, where optimality conditions satisfy the differentiability and invertibility conditions, and lack a solid analysis of the convergence rate. What's worse, existing methods require either double-loop updates, which are sometimes less efficient. To solve this problem, in this paper, we propose a new hypergradient of LCBO leveraging the theory of nonsmooth implicit function theorem instead of using the restrive assumptions. In addition, we propose a *single-loop single-timescale* algorithm based on the double-momentum method and adaptive step size method and prove it can return a $(\\delta, \\epsilon)$-stationary point with $\\tilde{\\mathcal{O}}(d_2^2\\epsilon^{-4})$ iterations. Experiments on two applications demonstrate the effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanli Shi;Yi Chang;Bin Gu", "authorids": "~Wanli_Shi1;~Yi_Chang4;~Bin_Gu1", "gender": "M;M;M", "homepage": ";http://www.yichang-cs.com;https://mbzuai.ac.ae/study/faculty/bin-gu/", "dblp": "245/9064;02/5438.html;29/1758-1", "google_scholar": "Li38vbwAAAAJ;https://scholar.google.com.hk/citations?user=drEkR50AAAAJ;Vo8OgCgAAAAJ", "orcid": ";0000-0003-2697-8093;0000-0001-6049-1815", "linkedin": ";;", "or_profile": "~Wanli_Shi1;~Yi_Chang4;~Bin_Gu1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;Jilin University, China;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "mbzuai.ac.ae;jlu.edu.cn;mbzuai.ac.ae", "position": "Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nshi2024double,\ntitle={Double Momentum Method for Lower-Level Constrained Bilevel Optimization},\nauthor={Wanli Shi and Yi Chang and Bin Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7OPHCeXcSS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 956039, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4091420892703425345&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mbzuai.ac.ae;jlu.edu.cn;mbzuai.ac.ae", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Jilin University", "aff_unique_dep": ";", "aff_unique_url": "https://mbzuai.ac.ae;http://www.jlu.edu.cn", "aff_unique_abbr": "MBZUAI;JLU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Arab Emirates;China" }, { "title": "Switching the Loss Reduces the Cost in Batch Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34885", "id": "7PXSc5fURu", "proceeding": "https://proceedings.mlr.press/v235/ayoub24a.html", "pdf": "https://openreview.net/pdf?id=7PXSc5fURu", "openreview": 
"https://openreview.net/forum?id=7PXSc5fURu", "author_site": "Alex Ayoub, Kaiwen Wang, Vincent Liu, Samuel Robertson, James McInerney, Dawen Liang, Nathan Kallus, Csaba Szepesvari", "tldr": "", "abstract": "We propose training fitted Q-iteration with log-loss (FQI-LOG) for batch reinforcement learning (RL). We show that the number of samples needed to learn a near-optimal policy with FQI-LOG scales with the accumulated cost of the optimal policy, which is zero in problems where acting optimally achieves the goal and incurs no cost. In doing so, we provide a general framework for proving small-cost bounds, i.e. bounds that scale with the optimal achievable cost, in batch RL. Moreover, we empirically verify that FQI-LOG uses fewer samples than FQI trained with squared loss on problems where the optimal policy reliably achieves the goal.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alex Ayoub;Kaiwen Wang;Vincent Liu;Samuel Robertson;James McInerney;Dawen Liang;Nathan Kallus;Csaba Szepesvari", "authorids": "~Alex_Ayoub1;~Kaiwen_Wang1;~Vincent_Liu3;smrobert@ualberta.ca;~James_McInerney2;~Dawen_Liang1;~Nathan_Kallus1;~Csaba_Szepesvari1", "gender": "M;M;;;;M;;M", "homepage": ";https://kaiwenw.github.io/;;;http://jamesmc.com;https://dawenl.github.io;http://nathankallus.com/;https://sites.ualberta.ca/~szepesva/", "dblp": "266/8071;220/3822;;;128/4650;63/10572;142/2900;http://dblp.uni-trier.de/pers/hd/s/Szepesv=aacute=ri:Csaba", "google_scholar": "eh0TSgYAAAAJ;HsMheBUAAAAJ;https://scholar.google.ca/citations?hl=en;;0rXgFbsAAAAJ;4c1ZNm4AAAAJ;K2WfIlsAAAAJ;https://scholar.google.ca/citations?user=zvC19mQAAAAJ", "orcid": ";;;;0009-0004-6025-5555;;0000-0003-1672-0507;", "linkedin": ";kaiwenw/;;;jemcinerney/;;;csaba-szepesvari-09376b1?trk=hp-identity-name", "or_profile": "~Alex_Ayoub1;~Kaiwen_Wang1;~Vincent_Liu3;smrobert@ualberta.ca;~James_McInerney2;~Dawen_Liang1;~Nathan_Kallus1;~Csaba_Szepesvari1", "aff": "NetFlix;Department of Computer Science, Cornell University;University of British Columbia;;Netflix;Netflix;Cornell University;Google DeepMind", "aff_domain": "netflix.com;cs.cornell.edu;ubc.ca;;netflix.com;netflix.com;cornell.edu;google.com", "position": "Intern;PhD student;Postdoc;;Snr Research Scientist;Research Scientist;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nayoub2024switching,\ntitle={Switching the Loss Reduces the Cost in Batch Reinforcement Learning},\nauthor={Alex Ayoub and Kaiwen Wang and Vincent Liu and Samuel Robertson and James McInerney and Dawen Liang and Nathan Kallus and Csaba Szepesvari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7PXSc5fURu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 538432, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6196660514586112022&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "netflix.com;cs.cornell.edu;ubc.ca;;netflix.com;netflix.com;cornell.edu;google.com", "author_num": 8, "aff_unique_index": "0;1;2;0;0;1;3", "aff_unique_norm": "Netflix;Cornell University;University of British Columbia;Google", "aff_unique_dep": ";Department of Computer Science;;Google DeepMind", "aff_unique_url": "https://www.netflix.com;https://www.cornell.edu;https://www.ubc.ca;https://deepmind.com", "aff_unique_abbr": "Netflix;Cornell;UBC;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;1;0;0;0;2", "aff_country_unique": "United States;Canada;United Kingdom" }, { "title": "Policy Learning for Balancing Short-Term and Long-Term Rewards", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34884", "id": "7Qf1uHTahP", "proceeding": "https://proceedings.mlr.press/v235/wu24x.html", "pdf": "https://openreview.net/pdf?id=7Qf1uHTahP", "openreview": "https://openreview.net/forum?id=7Qf1uHTahP", "author_site": "Peng Wu, Ziyu Shen, Feng Xie, Wang Zhongyao, Chunchen LIU, Yan Zeng", "tldr": "", "abstract": "Empirical researchers and decision-makers spanning various domains frequently seek profound insights into the long-term impacts of interventions. While the significance of long-term outcomes is undeniable, an overemphasis on them may inadvertently overshadow short-term gains. Motivated by this, this paper formalizes a new framework for learning the optimal policy that effectively balances both long-term and short-term rewards, where some long-term outcomes are allowed to be missing. In particular, we first present the identifiability of both rewards under mild assumptions. Next, we deduce the semiparametric efficiency bounds, along with the consistency and asymptotic normality of their estimators. We also reveal that short-term outcomes, if associated, contribute to improving the estimator of the long-term reward. Based on the proposed estimators, we develop a principled policy learning approach and further derive the convergence rates of regret and estimation errors associated with the learned policy. Extensive experiments are conducted to validate the effectiveness of the proposed method, demonstrating its practical applicability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peng Wu;Ziyu Shen;Feng Xie;Wang Zhongyao;Chunchen LIU;Yan Zeng", "authorids": "~Peng_Wu5;~Ziyu_Shen2;~Feng_Xie1;~Wang_Zhongyao1;~Chunchen_LIU2;~Yan_Zeng2", "gender": "M;F;M;M;F;", "homepage": "https://pengwu.site/;;https://fengxie.site/;;;https://scholar.google.com/citations?user=XyxLHCAAAAAJ&hl=zh-CN", "dblp": "15/6146-12;;11/4605-2;12/7774;;83/4665-2", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;stLFCtQAAAAJ;;IkbNsd4AAAAJ;XyxLHCAAAAAJ", "orcid": "0000-0001-7154-8880;0009-0009-5015-8117;0000-0001-7229-3955;;;0000-0001-7721-2560", "linkedin": ";;;https://cn.linkedin.com/in/zhongyao-wang-35082b21;chunchen-liu-76915766/;", "or_profile": "~Peng_Wu5;~Ziyu_Shen2;~Feng_Xie1;~Wang_Zhongyao1;~Chunchen_LIU2;~Yan_Zeng2", "aff": "Beijing Technology and Business University;Beijing Technology and Business University;Beijing Technology and Business University;Alibaba Group;Alibaba Group;Beijing Technology and Business University", "aff_domain": "btbu.edu.cn;btbu.edu;btbu.edu.cn;alibaba-inc.com;alibaba-inc.com;btbu.edu.cn", "position": "Associate Professor;MS student;Associate Professor;Researcher;Researcher;Lecturer", "bibtex": "@inproceedings{\nwu2024policy,\ntitle={Policy Learning for Balancing Short-Term and Long-Term Rewards},\nauthor={Peng Wu and Ziyu Shen and Feng Xie and Wang Zhongyao and Chunchen LIU and Yan Zeng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7Qf1uHTahP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 551324, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=222869954787242563&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "btbu.edu.cn;btbu.edu;btbu.edu.cn;alibaba-inc.com;alibaba-inc.com;btbu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Beijing Technology and Business University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.btbu.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "BTBU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Structured Chemistry Reasoning with Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34883", "id": "7R3pzxTSlg", "proceeding": "https://proceedings.mlr.press/v235/ouyang24a.html", "pdf": "https://openreview.net/pdf?id=7R3pzxTSlg", "openreview": "https://openreview.net/forum?id=7R3pzxTSlg", "author_site": "Siru Ouyang, Zhuosheng Zhang, Bing Yan, Xuan Liu, Yejin Choi, Jiawei Han, Lianhui Qin", "tldr": "", "abstract": "Large Language Models (LLMs) excel in diverse areas, yet struggle with complex scientific reasoning, especially in the field of chemistry. Different from the simple chemistry tasks (e.g., molecule classification) addressed in previous studies, complex chemistry problems require not only vast knowledge and precise calculation, but also compositional reasoning about rich dynamic interactions of different concepts (e.g., temperature changes). Our study shows that even advanced LLMs, like GPT-4, can fail easily in different ways. Interestingly, the errors often stem not from a lack of domain knowledge within the LLMs, but rather from the absence of an effective reasoning *structure* that guides the LLMs to elicit the right knowledge, incorporate the knowledge in step-by-step reasoning, and iteratively refine results for further improved quality. On this basis, we introduce StructChem, a simple yet effective prompting strategy that offers the desired guidance and substantially boosts the LLMs' chemical reasoning capability. Testing across four chemistry areas---quantum chemistry, mechanics, physical chemistry, and kinetics---StructChem substantially enhances GPT-4's performance, with up to 30% peak improvement. 
Our analysis also underscores the unique difficulties of precise grounded reasoning in science with LLMs, highlighting a need for more research in this area.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siru Ouyang;Zhuosheng Zhang;Bing Yan;Xuan Liu;Yejin Choi;Jiawei Han;Lianhui Qin", "authorids": "~Siru_Ouyang1;~Zhuosheng_Zhang1;~Bing_Yan2;~Xuan_Liu6;~Yejin_Choi1;~Jiawei_Han1;~Lianhui_Qin1", "gender": "F;M;F;M;F;M;F", "homepage": "https://ozyyshr.github.io;https://bcmi.sjtu.edu.cn/~zhangzs/;https://bingyan.me/;;https://yejinc.github.io/;http://hanj.cs.illinois.edu/;https://lianhui.ucsd.edu/", "dblp": "https://dblp.org/search/pid/api?q=author:Siru_Ouyang:;06/9708;;;89/579-1;h/JiaweiHan.html;184/3753", "google_scholar": "fetoihAAAAAJ;https://scholar.google.co.jp/citations?user=63LTQhgAAAAJ;337w9-V_WxIC;XbtWYioAAAAJ;vhP-tlcAAAAJ;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;smd19iIAAAAJ", "orcid": "0009-0001-1331-424X;0000-0002-4183-3645;;;;0000-0002-3629-2696;", "linkedin": ";;;;;;", "or_profile": "~Siru_Ouyang1;~Zhuosheng_Zhang1;~Bing_Yan2;~Xuan_Liu6;~Yejin_Choi1;~Jiawei_Han1;~Lianhui_Qin1", "aff": "University of Illinois Urbana-Champaign Champaign;Shanghai Jiaotong University;New York University;University of Illinois Urbana-Champaign;Department of Computer Science, University of Washington;University of Illinois at Urbana-Champaign (UIUC);Allen Institute for Artificial Intelligence", "aff_domain": "illinois.edu;sjtu.edu.cn;nyu.edu;illinois.edu;cs.washington.edu;illinois.edu;allenai.org", "position": "PhD student;Assistant Professor;PhD student;PhD student;Full Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\nouyang2024structured,\ntitle={Structured Chemistry Reasoning with Large Language Models},\nauthor={Siru Ouyang and Zhuosheng Zhang and Bing Yan and Xuan Liu and Yejin Choi and Jiawei Han and Lianhui Qin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7R3pzxTSlg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3226253, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3696879512594558544&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "illinois.edu;sjtu.edu.cn;nyu.edu;illinois.edu;cs.washington.edu;illinois.edu;allenai.org", "author_num": 7, "aff_unique_index": "0;1;2;0;3;0;4", "aff_unique_norm": "University of Illinois Urbana-Champaign;Shanghai Jiao Tong University;New York University;University of Washington;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";;;Department of Computer Science;", "aff_unique_url": "https://illinois.edu;https://www.sjtu.edu.cn;https://www.nyu.edu;https://www.washington.edu;https://allenai.org", "aff_unique_abbr": "UIUC;SJTU;NYU;UW;AI2", "aff_campus_unique_index": "0;2;3;2", "aff_campus_unique": "Champaign;;Urbana-Champaign;Seattle", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "AttNS: Attention-Inspired Numerical Solving For Limited Data Scenarios", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34882", "id": "7RHFdkAkVY", "proceeding": "https://proceedings.mlr.press/v235/huang24m.html", "pdf": "https://openreview.net/pdf?id=7RHFdkAkVY", "openreview": "https://openreview.net/forum?id=7RHFdkAkVY", "author_site": "Zhongzhan Huang, Mingfu Liang, Shanshan Zhong, Liang Lin", "tldr": "", 
"abstract": "We propose the attention-inspired numerical solver (AttNS), a concise method that helps the generalization and robustness issues faced by the AI-Hybrid numerical solver in solving differential equations due to limited data. AttNS is inspired by the effectiveness of attention modules in Residual Neural Networks (ResNet) in enhancing model generalization and robustness for conventional deep learning tasks. Drawing from the dynamical system perspective of ResNet, We seamlessly incorporate attention mechanisms into the design of numerical methods tailored for the characteristics of solving differential equations. Our results on benchmarks, ranging from high-dimensional problems to chaotic systems, showcase AttNS consistently enhancing various numerical solvers without any intricate model crafting. Finally, we analyze AttNS experimentally and theoretically, demonstrating its ability to achieve strong generalization and robustness while ensuring the convergence of the solver. This includes requiring less data compared to other advanced methods to achieve comparable generalization errors and better prevention of numerical explosion issues when solving differential equations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhongzhan Huang;Mingfu Liang;Shanshan Zhong;Liang Lin", "authorids": "~Zhongzhan_Huang1;~Mingfu_Liang1;~Shanshan_Zhong1;~Liang_Lin1", "gender": "M;M;;M", "homepage": "https://dedekinds.github.io/;https://mingfuliang.com/;;http://www.linliang.net", "dblp": "241/9753;241/9790;;", "google_scholar": "R-b68CEAAAAJ;_uUUvt4AAAAJ;;https://scholar.google.com.hk/citations?user=Nav8m8gAAAAJ", "orcid": ";0000-0001-6779-2418;;", "linkedin": ";;;", "or_profile": "~Zhongzhan_Huang1;~Mingfu_Liang1;~Shanshan_Zhong1;~Liang_Lin1", "aff": "Sun Yat-Sen University;Northwestern University;;SUN YAT-SEN UNIVERSITY", "aff_domain": "sysu.edu.cn;northwestern.edu;;sysu.edu.cn", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nhuang2024attns,\ntitle={Att{NS}: Attention-Inspired Numerical Solving For Limited Data Scenarios},\nauthor={Zhongzhan Huang and Mingfu Liang and Shanshan Zhong and Liang Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7RHFdkAkVY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 993028, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11166892139902117861&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 5, "email": "sysu.edu.cn;northwestern.edu;;sysu.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Sun Yat-sen University;Northwestern University", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.northwestern.edu", "aff_unique_abbr": "SYSU;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "A Geometric Decomposition of Finite Games: Convergence vs. 
Recurrence under Exponential Weights", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34881", "id": "7RSIGQRT1F", "proceeding": "https://proceedings.mlr.press/v235/legacci24a.html", "pdf": "https://openreview.net/pdf?id=7RSIGQRT1F", "openreview": "https://openreview.net/forum?id=7RSIGQRT1F", "author_site": "Davide Legacci, Panayotis Mertikopoulos, Bary Pradelski", "tldr": "", "abstract": "In view of the complexity of the dynamics of learning in games, we seek to decompose a game into simpler components where the dynamics' long-run behavior is well understood. A natural starting point for this is Helmholtz's theorem, which decomposes a vector field into a potential and an incompressible component. However, the geometry of game dynamics - and, in particular, the dynamics of exponential / multiplicative weights (EW) schemes - is not compatible with the Euclidean underpinnings of Helmholtz's theorem. This leads us to consider a specific Riemannian framework based on the so-called *Shahshahani metric*, and introduce the class of *incompressible games*, for which we establish the following results: First, in addition to being volume-preserving, the continuous-time EW dynamics in incompressible games admit a constant of motion and are *Poincar\u00e9 recurrent* - i.e., almost every trajectory of play comes arbitrarily close to its starting point infinitely often. Second, we establish a deep connection with a well-known decomposition of games into a potential and harmonic component (where the players' objectives are aligned and anti-aligned respectively): a game is incompressible if and only if it is harmonic, implying in turn that the EW dynamics lead to Poincar\u00e9 recurrence in harmonic games.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Davide Legacci;Panayotis Mertikopoulos;Bary Pradelski", "authorids": "davide.legacci@univ-grenoble-alpes.fr;~Panayotis_Mertikopoulos1;~Bary_Pradelski1", "gender": ";M;", "homepage": ";http://polaris.imag.fr/panayotis.mertikopoulos/;https://barypradelski.com/", "dblp": ";49/6721;46/11488", "google_scholar": ";xsusqPYAAAAJ;", "orcid": ";0000-0003-2026-9616;", "linkedin": ";;", "or_profile": "davide.legacci@univ-grenoble-alpes.fr;~Panayotis_Mertikopoulos1;~Bary_Pradelski1", "aff": ";French National Center for Scientific Research;CNRS", "aff_domain": ";imag.fr;cnrs.fr", "position": ";Principal Researcher;Researcher", "bibtex": "@inproceedings{\nlegacci2024a,\ntitle={A Geometric Decomposition of Finite Games: Convergence vs. 
Recurrence under Exponential Weights},\nauthor={Davide Legacci and Panayotis Mertikopoulos and Bary Pradelski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7RSIGQRT1F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4134180, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4152082156071838884&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 17, "email": ";imag.fr;cnrs.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "French National Center for Scientific Research;Centre National de la Recherche Scientifique", "aff_unique_dep": ";", "aff_unique_url": "https://www.cnrs.fr;https://www.cnrs.fr", "aff_unique_abbr": "CNRS;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Online Matrix Completion: A Collaborative Approach with Hott Items", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34880", "id": "7XZKzQtooN", "proceeding": "https://proceedings.mlr.press/v235/baby24a.html", "pdf": "https://openreview.net/pdf?id=7XZKzQtooN", "openreview": "https://openreview.net/forum?id=7XZKzQtooN", "author_site": "Dheeraj Baby, Soumyabrata Pal", "tldr": "", "abstract": "We investigate the low-rank matrix completion problem in an online setting with ${M}$ users, ${N}$ items, ${T}$ rounds, and an unknown rank-$r$ reward matrix ${R}\\in \\mathbb{R}^{{M}\\times {N}}$. This problem has been well-studied in the literature and has several applications in practice. In each round, we recommend ${S}$ carefully chosen distinct items to every user and observe noisy rewards. In the regime where ${M},{N} \\gg {T}$, we propose two distinct computationally efficient algorithms for recommending items to users and analyze them under the benign *hott items* assumption: 1) First, for ${S}=1$, under additional incoherence/smoothness assumptions on ${R}$, we propose the phased algorithm PhasedClusterElim. Our algorithm obtains a near-optimal per-user regret of $\\tilde{O}({N}{M}^{-1}(\\Delta^{-1}+\\Delta_{\\text{hott}}^{-2}))$, where $\\Delta_{\\text{hott}},\\Delta$ are problem-dependent gap parameters with $\\Delta_{\\text{hott}} \\gg \\Delta$ almost always. 2) Second, we consider a simplified setting with ${S}=r$ where we make significantly milder assumptions on ${R}$. Here, we introduce another phased algorithm, DeterminantElim, to derive a regret guarantee of $\\tilde{O}({N}{M}^{-1/r}\\Delta_{\\text{det}}^{-1})$, where $\\Delta_{\\text{det}}$ is another problem-dependent gap. 
Both algorithms crucially use collaboration among users to jointly eliminate sub-optimal items for groups of users successively in phases, but with distinctive and novel approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dheeraj Baby;Soumyabrata Pal", "authorids": "~Dheeraj_Baby1;~Soumyabrata_Pal1", "gender": ";M", "homepage": "https://dheeraj-b.github.io/home/;https://soumyabratap.github.io/", "dblp": ";206/6371", "google_scholar": "L3YF8nIAAAAJ;J4UxoTEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Dheeraj_Baby1;~Soumyabrata_Pal1", "aff": "University of California, Santa Barbara;Adobe Systems", "aff_domain": "cs.ucsb.edu;adobe.com", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nbaby2024online,\ntitle={Online Matrix Completion: A Collaborative Approach with Hott Items},\nauthor={Dheeraj Baby and Soumyabrata Pal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7XZKzQtooN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 669658, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1789851647445944409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.ucsb.edu;adobe.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Santa Barbara;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": "https://www.ucsb.edu;https://www.adobe.com", "aff_unique_abbr": "UCSB;Adobe", "aff_campus_unique_index": "0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Temporal Logic Specification-Conditioned Decision Transformer for Offline Safe Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34879", "id": "7bg10Jj3bG", "proceeding": "https://proceedings.mlr.press/v235/guo24j.html", "pdf": "https://openreview.net/pdf?id=7bg10Jj3bG", "openreview": "https://openreview.net/forum?id=7bg10Jj3bG", "author_site": "Zijian Guo, Weichao Zhou, Wenchao Li", "tldr": "", "abstract": "Offline safe reinforcement learning (RL) aims to train a constraint satisfaction policy from a fixed dataset. Current state-of-the-art approaches are based on supervised learning with a conditioned policy. However, these approaches fall short in real-world applications that involve complex tasks with rich temporal and logical structures. In this paper, we propose temporal logic Specification-conditioned Decision Transformer (SDT), a novel framework that harnesses the expressive power of signal temporal logic (STL) to specify complex temporal rules that an agent should follow and the sequential modeling capability of Decision Transformer (DT). Empirical evaluations on the DSRL benchmarks demonstrate the better capacity of SDT in learning safe and high-reward policies compared with existing approaches. 
In addition, SDT shows good alignment with respect to different desired degrees of satisfaction of the STL specification that it is conditioned on.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zijian Guo;Weichao Zhou;Wenchao Li", "authorids": "~Zijian_Guo1;~Weichao_Zhou1;~Wenchao_Li1", "gender": "M;M;", "homepage": "https://ja4822.github.io/zijianguo.github.io/;https://sites.google.com/view/zwc662/;http://sites.bu.edu/depend/", "dblp": "11/4679-2;207/8077;23/5721-1", "google_scholar": "TxBFprEAAAAJ;JdiJIF0AAAAJ;zwA5eokAAAAJ", "orcid": "0000-0002-9791-6749;0009-0002-0369-2113;", "linkedin": "zijian-guo/;;", "or_profile": "~Zijian_Guo1;~Weichao_Zhou1;~Wenchao_Li1", "aff": "Boston University, Boston University;Boston University;Boston University", "aff_domain": "bu.edu;bu.edu;bu.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nguo2024temporal,\ntitle={Temporal Logic Specification-Conditioned Decision Transformer for Offline Safe Reinforcement Learning},\nauthor={Zijian Guo and Weichao Zhou and Wenchao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7bg10Jj3bG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2120312, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5901815532993616854&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "bu.edu;bu.edu;bu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "0", "aff_campus_unique": "Boston;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Maestro: Uncovering Low-Rank Structures via Trainable Decomposition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34878", "id": "7bjyambg4x", "proceeding": "https://proceedings.mlr.press/v235/horvath24a.html", "pdf": "https://openreview.net/pdf?id=7bjyambg4x", "openreview": "https://openreview.net/forum?id=7bjyambg4x", "author_site": "Samuel Horv\u00e1th, Stefanos Laskaridis, Shashank Rajput, Hongyi Wang", "tldr": "", "abstract": "Deep Neural Networks (DNNs) have been a large driver for AI breakthroughs in recent years, ranging from self-driving cars to intelligent assistants. However, these models have been getting increasingly large as they become more accurate and safe. This means that their training becomes increasingly costly and time-consuming, and typically yields a single model to fit all targets. To mitigate this, various techniques have been proposed in the literature, including pruning, sparsification, or quantization of the model weights and updates. While achieving high compression rates, they often incur significant computational overheads at training or lead to a non-negligible accuracy penalty. Alternatively, factorization methods have been leveraged for low-rank compression of DNNs. Similarly, such techniques (e.g., SVD) frequently rely on heavy iterative decompositions of layers and are potentially sub-optimal for non-linear models, such as DNNs. We take a further step in designing efficient low-rank models and propose Maestro, a framework for trainable low-rank layers. 
Instead of iteratively applying a priori decompositions, the low-rank structure is baked into the training process through LoD, a low-rank ordered decomposition. Not only is this the first time importance ordering via sampling is applied to the decomposed DNN structure, but it also allows selecting ranks at a layer granularity. Our theoretical analysis demonstrates that LoD recovers the SVD decomposition of a linear mapping on uniformly distributed data and PCA for linear autoencoders. Applied to DNNs, Maestro enables the extraction of lower-footprint models that preserve performance. Simultaneously, it enables a graceful accuracy-latency tradeoff for deployment to even more constrained devices, without retraining.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel Horv\u00e1th;Stefanos Laskaridis;Shashank Rajput;Hongyi Wang", "authorids": "~Samuel_Horv\u00e1th1;~Stefanos_Laskaridis1;~Shashank_Rajput1;~Hongyi_Wang1", "gender": "M;;M;M", "homepage": "https://sites.google.com/view/samuelhorvath;https://stefanos.cc;https://pages.cs.wisc.edu/~srajput/;https://hwang595.github.io/", "dblp": "234/8604;241/6273;241/5361;15/832-1.html", "google_scholar": "k252J7kAAAAJ;https://scholar.google.co.uk/citations?user=TcVC--IAAAAJ;qEXxyDQAAAAJ;zYdZORsAAAAJ", "orcid": "0000-0003-0619-9260;;;", "linkedin": "samuel-horvath/;stevelaskaridis/;;hongyi-wang-b89651102/", "or_profile": "~Samuel_Horv\u00e1th1;~Stefanos_Laskaridis1;~Shashank_Rajput1;~Hongyi_Wang1", "aff": "MBZUAI;Brave Software;University of Wisconsin, Madison;Carnegie Mellon University", "aff_domain": "mbzuai.ac.ae;brave.com;wisc.edu;andrew.cmu.edu", "position": "Assistant Professor;Researcher;PhD student;Researcher", "bibtex": "@inproceedings{\nhorv{\\'a}th2024maestro,\ntitle={Maestro: Uncovering Low-Rank Structures via Trainable Decomposition},\nauthor={Samuel Horv{\\'a}th and Stefanos Laskaridis and Shashank Rajput and Hongyi Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7bjyambg4x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2007532, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8374317016626564409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mbzuai.ac.ae;brave.com;wisc.edu;andrew.cmu.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Brave Software;University of Wisconsin;Carnegie Mellon University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.mbzuai.ac.ae;https://www.brave.com;https://www.wisc.edu;https://www.cmu.edu", "aff_unique_abbr": "MBZUAI;Brave;UW;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Arab Emirates;United States" }, { "title": "NeuralIndicator: Implicit Surface Reconstruction from Neural Indicator Priors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34877", "id": "7ckuC9C2FZ", "proceeding": "https://proceedings.mlr.press/v235/huang24b.html", "pdf": "https://openreview.net/pdf?id=7ckuC9C2FZ", "openreview": "https://openreview.net/forum?id=7ckuC9C2FZ", "author_site": "Shi-Sheng Huang, Guo Chen, Li-heng Chen, Hua Huang", "tldr": "", "abstract": "Neural implicit surface reconstruction from unorganized points is still challenging, especially when 
the point clouds are incomplete and/or noisy and have complex topological structure. Unlike previous approaches that perform neural implicit surface learning relying on local shape priors, this paper proposes to utilize global shape priors to regularize the neural implicit function learning for more reliable surface reconstruction. To this end, we first introduce a differentiable module to generate a smooth indicator function, which globally encodes both the indicative prior and local SDFs of the entire input point cloud. Benefiting from this, we propose a new framework, called NeuralIndicator, to jointly learn the smooth indicator function and the neural implicit function simultaneously, using the global shape prior encoded by the smooth indicator function to effectively regularize the neural implicit function learning, towards reliable and high-fidelity surface reconstruction from unorganized points without any normal information. Extensive evaluations on synthetic and real-scan datasets show that our approach consistently outperforms previous approaches, especially when point clouds are incomplete and/or noisy and have complex topological structure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shi-Sheng Huang;Guo Chen;CHEN LI HENG;Hua Huang", "authorids": "~Shi-Sheng_Huang2;~Guo_Chen6;~CHEN_LI_HENG1;~Hua_Huang1", "gender": "M;M;M;M", "homepage": "https://shishenghuang.github.io/index/;https://github.com/Shanzhaguoo;https://github.com/CountNemoChan;https://vmcl.bnu.edu.cn/group/teacher/teacher01.html", "dblp": ";;;70/5618-1", "google_scholar": ";;;", "orcid": ";;;0000-0003-2587-1702", "linkedin": ";;;", "or_profile": "~Shi-Sheng_Huang2;~Guo_Chen6;~CHEN_LI_HENG1;~Hua_Huang1", "aff": "Beijing Normal University;Beijing Normal University;Beijing Normal University;Beijing Normal University", "aff_domain": "bnu.edu.cn;bnu.edu.cn;bnu.edu.cn;bnu.edu.cn", "position": "Associate Professor;Undergrad student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nhuang2024neuralindicator,\ntitle={NeuralIndicator: Implicit Surface Reconstruction from Neural Indicator Priors},\nauthor={Shi-Sheng Huang and Guo Chen and CHEN LI HENG and Hua Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7ckuC9C2FZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8007734, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6821841124208067508&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3, "email": "bnu.edu.cn;bnu.edu.cn;bnu.edu.cn;bnu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Beijing Normal University", "aff_unique_dep": "", "aff_unique_url": "https://www.bnu.edu.cn", "aff_unique_abbr": "BNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Model the World With Language", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34876", "id": "7dP6Yq9Uwv", "proceeding": "https://proceedings.mlr.press/v235/lin24g.html", "pdf": "https://openreview.net/pdf?id=7dP6Yq9Uwv", "openreview": "https://openreview.net/forum?id=7dP6Yq9Uwv", "author_site": "Jessy Lin, Yuqing Du, Olivia Watkins, Danijar Hafner, Pieter Abbeel, Dan Klein, Anca Dragan", "tldr": "", "abstract": "To interact with humans and act in the world, agents need to understand the range of 
language that people use and relate it to the visual world. While current agents can learn to execute simple language instructions, we aim to build agents that leverage diverse language---language like \"this button turns on the TV\" or \"I put the bowls away\"---that conveys general knowledge, describes the state of the world, provides interactive feedback, and more. Our key idea is that *agents should interpret such diverse language as a signal that helps them predict the future*: what they will observe, how the world will behave, and which situations will be rewarded. This perspective unifies language understanding with future prediction as a powerful self-supervised learning objective. We instantiate this in Dynalang, an agent that learns a multimodal world model to predict future text and image representations, and learns to act from imagined model rollouts. While current methods that learn language-conditioned policies degrade in performance with more diverse types of language, we show that Dynalang learns to leverage environment descriptions, game rules, and instructions to excel on tasks ranging from game-playing to navigating photorealistic home scans. Finally, we show that our method enables additional capabilities due to learning a generative model: Dynalang can be pretrained on text-only data, enabling learning from offline datasets, and generate language grounded in an environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jessy Lin;Yuqing Du;Olivia Watkins;Danijar Hafner;Pieter Abbeel;Dan Klein;Anca Dragan", "authorids": "~Jessy_Lin1;~Yuqing_Du1;~Olivia_Watkins1;~Danijar_Hafner1;~Pieter_Abbeel2;~Dan_Klein1;~Anca_Dragan1", "gender": ";;;;M;;F", "homepage": "https://jessylin.com;http://yuqingd.github.io;https://people.eecs.berkeley.edu/~oliviawatkins/;https://danijar.com;https://people.eecs.berkeley.edu/~pabbeel/;http://people.eecs.berkeley.edu/~klein/;http://www.ancadragan.com/", "dblp": "211/7575;218/5572;;184/8088;;;", "google_scholar": "jTMUPNkAAAAJ;;;VINmGpYAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;;", "orcid": ";;;0000-0002-9534-7271;;;", "linkedin": ";yuqingdu;;;;dan-klein/;", "or_profile": "~Jessy_Lin1;~Yuqing_Du1;~Olivia_Watkins1;~Danijar_Hafner1;~Pieter_Abbeel2;~Dan_Klein1;~Anca_Dragan1", "aff": "University of California, Berkeley;Google DeepMind;University of California, Berkeley;Google;Covariant;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;google.com;berkeley.edu;google.com;covariant.ai;berkeley.edu;berkeley.edu", "position": "PhD student;Researcher;PhD student;Researcher;Founder;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nlin2024learning,\ntitle={Learning to Model the World With Language},\nauthor={Jessy Lin and Yuqing Du and Olivia Watkins and Danijar Hafner and Pieter Abbeel and Dan Klein and Anca Dragan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7dP6Yq9Uwv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1376206, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18079008737666460104&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "berkeley.edu;google.com;berkeley.edu;google.com;covariant.ai;berkeley.edu;berkeley.edu", "author_num": 7, "aff_unique_index": "0;1;0;1;2;0;0", "aff_unique_norm": "University of California, 
Berkeley;Google;Covariant", "aff_unique_dep": ";Google DeepMind;", "aff_unique_url": "https://www.berkeley.edu;https://deepmind.com;", "aff_unique_abbr": "UC Berkeley;DeepMind;", "aff_campus_unique_index": "0;0;2;0;0", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;United Kingdom;" }, { "title": "Adaptive Text Watermark for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34875", "id": "7emOSb5UfX", "proceeding": "https://proceedings.mlr.press/v235/liu24e.html", "pdf": "https://openreview.net/pdf?id=7emOSb5UfX", "openreview": "https://openreview.net/forum?id=7emOSb5UfX", "author_site": "Yepeng Liu, Yuheng Bu", "tldr": "", "abstract": "The advancement of Large Language Models (LLMs) has led to increasing concerns about the misuse of AI-generated text, and watermarking LLM-generated text has emerged as a potential solution. However, it is challenging to generate high-quality watermarked text while maintaining robustness, security, and the ability to detect watermarks without prior knowledge of the prompt and model. This paper proposes an adaptive text watermarking strategy to address such a challenge. To improve the text quality and maintain robustness, we adaptively add watermarking to token distributions with high entropy measured by an auxiliary model and keep the low-entropy token distributions untouched. For the sake of security and to further minimize the watermark's impact on text quality, instead of using a fixed green/red list generated from a random secret key, which can be vulnerable to decryption and forgery, we adaptively scale up the output logits based on the semantic embedding of previously generated text using a well designed semantic mapping model. Our experiments involving various LLMs demonstrate that our approach achieves comparable robustness performance to existing watermark methods. 
Additionally, the text generated by our method has perplexity comparable to that of *un-watermarked* LLMs while maintaining sufficient security.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yepeng Liu;Yuheng Bu", "authorids": "~Yepeng_Liu1;~Yuheng_Bu1", "gender": ";M", "homepage": ";https://buyuheng.github.io/", "dblp": ";168/8338", "google_scholar": ";1jPQEVMAAAAJ", "orcid": ";0000-0002-3479-4553", "linkedin": ";bu-yuheng-36560039/", "or_profile": "~Yepeng_Liu1;~Yuheng_Bu1", "aff": ";University of Florida", "aff_domain": ";ufl.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nliu2024adaptive,\ntitle={Adaptive Text Watermark for Large Language Models},\nauthor={Yepeng Liu and Yuheng Bu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7emOSb5UfX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5801405, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3507345109032622654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";ufl.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Florida", "aff_unique_dep": "", "aff_unique_url": "https://www.ufl.edu", "aff_unique_abbr": "UF", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Faster Sampling via Stochastic Gradient Proximal Sampler", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34874", "id": "7gEcbhMqKU", "proceeding": "https://proceedings.mlr.press/v235/huang24aj.html", "pdf": "https://openreview.net/pdf?id=7gEcbhMqKU", "openreview": "https://openreview.net/forum?id=7gEcbhMqKU", "author_site": "Xunpeng Huang, Difan Zou, Hanze Dong, Yian Ma, Tong Zhang", "tldr": "", "abstract": "Stochastic gradients have been widely integrated into Langevin-based methods to improve their scalability and efficiency in solving large-scale sampling problems. However, the proximal sampler, which exhibits much faster convergence than Langevin-based algorithms in the deterministic setting (Lee et al., 2021), has yet to be explored in its stochastic variants. In this paper, we study the Stochastic Proximal Samplers (SPS) for sampling from non-log-concave distributions. We first establish a general framework for implementing stochastic proximal samplers and establish the convergence theory accordingly. We show that the convergence to the target distribution can be guaranteed as long as the second moment of the algorithm trajectory is bounded and restricted Gaussian oracles can be well approximated. We then provide two implementable variants based on Stochastic gradient Langevin dynamics (SGLD) and Metropolis-adjusted Langevin algorithm (MALA), giving rise to SPS-SGLD and SPS-MALA. We further show that SPS-SGLD and SPS-MALA can achieve $\\epsilon$-sampling error in total variation (TV) distance within $\\tilde{\\mathcal{O}}(d\\epsilon^{-2})$ and $\\tilde{\\mathcal{O}}(d^{1/2}\\epsilon^{-2})$ gradient complexities, which outperform the best-known result by at least an $\\tilde{\\mathcal{O}}(d^{1/3})$ factor. 
This enhancement in performance is corroborated by our empirical studies on synthetic data with various dimensions, demonstrating the efficiency of our proposed algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xunpeng Huang;Difan Zou;Hanze Dong;Yian Ma;Tong Zhang", "authorids": "~Xunpeng_Huang2;~Difan_Zou1;~Hanze_Dong1;~Yian_Ma1;~Tong_Zhang2", "gender": "M;M;M;M;M", "homepage": "https://xunpeng746.github.io;https://difanzou.github.io/;https://hendrydong.github.io/;https://sites.google.com/view/yianma;http://tongzhang-ml.org", "dblp": ";161/8923;228/7798;;07/4227-1", "google_scholar": "T2L6rKcAAAAJ;Cp4fcTQAAAAJ;g9WLzWoAAAAJ;A0TFlacAAAAJ;LurWtuYAAAAJ", "orcid": ";;;;0000-0002-5511-2558", "linkedin": ";;hanze-dong/;;", "or_profile": "~Xunpeng_Huang2;~Difan_Zou1;~Hanze_Dong1;~Yian_Ma1;~Tong_Zhang2", "aff": "Hong Kong University of Science and Technology;University of Hong Kong;SalesForce;University of California, San Diego;UIUC", "aff_domain": "ust.hk;hku.hk;salesforce.com;ucsd.edu;illinois.edu", "position": "PhD student;Assistant Professor;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024faster,\ntitle={Faster Sampling via Stochastic Gradient Proximal Sampler},\nauthor={Xunpeng Huang and Difan Zou and Hanze Dong and Yian Ma and Tong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7gEcbhMqKU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1276212, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2672386204232820790&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 10, "email": "ust.hk;hku.hk;salesforce.com;ucsd.edu;illinois.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Hong Kong University of Science and Technology;University of Hong Kong;Salesforce;University of California, San Diego;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ust.hk;https://www.hku.hk;https://www.salesforce.com;https://www.ucsd.edu;https://www.illinois.edu", "aff_unique_abbr": "HKUST;HKU;Salesforce;UCSD;UIUC", "aff_campus_unique_index": "0;0;2;3", "aff_campus_unique": "Hong Kong SAR;;San Diego;Urbana-Champaign", "aff_country_unique_index": "0;0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Adaptive Stabilization Based on Machine Learning for Column Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34873", "id": "7iH9RgMrzX", "proceeding": "https://proceedings.mlr.press/v235/shen24e.html", "pdf": "https://openreview.net/pdf?id=7iH9RgMrzX", "openreview": "https://openreview.net/forum?id=7iH9RgMrzX", "author_site": "Yunzhuang Shen, Yuan Sun, Xiaodong Li, Zhiguang Cao, Andrew Eberhard, Guangquan Zhang", "tldr": "", "abstract": "Column generation (CG) is a well-established method for solving large-scale linear programs. It involves iteratively optimizing a subproblem containing a subset of columns and using its dual solution to generate new columns with negative reduced costs. This process continues until the dual values converge to the optimal dual solution to the original problem. A natural phenomenon in CG is the heavy oscillation of the dual values during iterations, which can lead to a substantial slowdown in the convergence rate. 
*Stabilization* techniques are devised to accelerate the convergence of dual values by using information beyond the state of the current subproblem. However, there remains a significant gap in obtaining more accurate dual values at an earlier stage. To further narrow this gap, this paper introduces a novel method consisting of 1) a *machine learning* approach for accurate prediction of optimal dual solutions and 2) an *adaptive stabilization* technique that effectively capitalizes on accurate predictions. On the graph coloring problem, we show that our method achieves a significantly improved convergence rate compared to traditional methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunzhuang Shen;Yuan Sun;Xiaodong Li;Zhiguang Cao;Andrew Eberhard;Guangquan Zhang", "authorids": "~Yunzhuang_Shen1;~Yuan_Sun1;~Xiaodong_Li5;~Zhiguang_Cao1;~Andrew_Eberhard1;~Guangquan_Zhang2", "gender": ";M;M;M;M;", "homepage": ";https://scholars.latrobe.edu.au/y6sun;https://titan.csit.rmit.edu.au/~e46507/;https://zhiguangcaosg.github.io/;https://www.rmit.edu.au/contact/staff-contacts/academic-staff/e/eberhard-professor-andrew;", "dblp": ";75/5247-3;;178/8621;120/5748.html;", "google_scholar": ";B49vHtUAAAAJ;https://scholar.google.com.au/citations?user=AQewL04AAAAJ;https://scholar.google.com.sg/citations?user=2R-cOkYAAAAJ;https://scholar.google.com.au/citations?user=ZGZkJ5gAAAAJ;_1RMrhsAAAAJ", "orcid": ";0000-0003-2911-0070;;0000-0002-4499-759X;0000-0003-2977-3456;", "linkedin": ";;;;andrew-eberhard-13017132/?originalSubdomain=au;", "or_profile": "~Yunzhuang_Shen1;~Yuan_Sun1;~Xiaodong_Li5;~Zhiguang_Cao1;~Andrew_Eberhard1;~Guangquan_Zhang2", "aff": ";La Trobe University;Royal Melbourne Institute of Technology;Singapore Management University;Royal Melbourne Institute of Technology;University of Technology Sydney (UTS)", "aff_domain": ";latrobe.edu.au;rmit.edu.au;smu.edu.sg;rmit.edu.au;uts.edu.au", "position": ";Assistant Professor;Full Professor;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nshen2024adaptive,\ntitle={Adaptive Stabilization Based on Machine Learning for Column Generation},\nauthor={Yunzhuang Shen and Yuan Sun and Xiaodong Li and Zhiguang Cao and Andrew Eberhard and Guangquan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7iH9RgMrzX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 462052, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3251423488384356400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";latrobe.edu.au;rmit.edu.au;smu.edu.sg;rmit.edu.au;uts.edu.au", "author_num": 6, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "La Trobe University;Royal Melbourne Institute of Technology;Singapore Management University;University of Technology Sydney", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.latrobe.edu.au;https://www.rmit.edu.au;https://www.smu.edu.sg;https://www.uts.edu.au", "aff_unique_abbr": "LTU;RMIT;SMU;UTS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Australia;Singapore" }, { "title": "Offline-Boosted Actor-Critic: Adaptively Blending Optimal Historical Behaviors in Deep Off-Policy RL", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34872", "id": "7joG3i2pUR", 
"proceeding": "https://proceedings.mlr.press/v235/luo24e.html", "pdf": "https://openreview.net/pdf?id=7joG3i2pUR", "openreview": "https://openreview.net/forum?id=7joG3i2pUR", "author_site": "Yu Luo, Tianying Ji, Fuchun Sun, Jianwei Zhang, Huazhe Xu, Xianyuan Zhan", "tldr": "", "abstract": "Off-policy reinforcement learning (RL) has achieved notable success in tackling many complex real-world tasks, by leveraging previously collected data for policy learning. However, most existing off-policy RL algorithms fail to maximally exploit the information in the replay buffer, limiting sample efficiency and policy performance. In this work, we discover that concurrently training an offline RL policy based on the shared online replay buffer can sometimes outperform the original online learning policy, though the occurrence of such performance gains remains uncertain. This motivates a new possibility of harnessing the emergent outperforming offline optimal policy to improve online policy learning. Based on this insight, we present Offline-Boosted Actor-Critic (OBAC), a model-free online RL framework that elegantly identifies the outperforming offline policy through value comparison, and uses it as an adaptive constraint to guarantee stronger policy learning performance. Our experiments demonstrate that OBAC outperforms other popular model-free RL baselines and rivals advanced model-based RL methods in terms of sample efficiency and asymptotic performance across **53** tasks spanning **6** task suites.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Luo;Tianying Ji;Fuchun Sun;Jianwei Zhang;Huazhe Xu;Xianyuan Zhan", "authorids": "~Yu_Luo5;~Tianying_Ji2;~Fuchun_Sun1;~Jianwei_Zhang2;~Huazhe_Xu1;~Xianyuan_Zhan1", "gender": "M;F;M;M;M;M", "homepage": ";;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;https://tams.informatik.uni-hamburg.de/people/zhang/;http://hxu.rocks;http://zhanxianyuan.xyz/", "dblp": ";124/2199.html;;z/JianweiZhang1;164/9006;181/5081", "google_scholar": "https://scholar.google.com.hk/citations?user=KQjoQOMAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;;t9HPFawAAAAJ;pDMnGloAAAAJ", "orcid": "0000-0001-6229-4639;;;;;0000-0002-3683-0554", "linkedin": ";;;;;", "or_profile": "~Yu_Luo5;~Tianying_Ji2;~Fuchun_Sun1;~Jianwei_Zhang2;~Huazhe_Xu1;~Xianyuan_Zhan1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Universit\u00e4t Hamburg;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;uni-hamburg.de;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Full Professor;Full Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nluo2024offlineboosted,\ntitle={Offline-Boosted Actor-Critic: Adaptively Blending Optimal Historical Behaviors in Deep Off-Policy {RL}},\nauthor={Yu Luo and Tianying Ji and Fuchun Sun and Jianwei Zhang and Huazhe Xu and Xianyuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7joG3i2pUR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4074193, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9967873667865298060&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;uni-hamburg.de;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", 
"aff_unique_norm": "Tsinghua University;University of Hamburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.uni-hamburg.de", "aff_unique_abbr": "THU;UHH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Germany" }, { "title": "When Linear Attention Meets Autoregressive Decoding: Towards More Effective and Efficient Linearized Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34871", "id": "7mFSaP6IiN", "proceeding": "https://proceedings.mlr.press/v235/you24a.html", "pdf": "https://openreview.net/pdf?id=7mFSaP6IiN", "openreview": "https://openreview.net/forum?id=7mFSaP6IiN", "author_site": "Haoran You, Yichao Fu, Zheng Wang, Amir Yazdanbakhsh, Yingyan (Celine) Lin", "tldr": "", "abstract": "Autoregressive Large Language Models (LLMs) have achieved impressive performance in language tasks but face two significant bottlenecks: (1) quadratic complexity in the attention module as the number of tokens increases, and (2) limited efficiency due to the sequential processing nature of autoregressive LLMs during generation. While linear attention and speculative decoding offer potential solutions, their applicability and synergistic potential for enhancing autoregressive LLMs remain uncertain. We conduct the first comprehensive study on the efficacy of existing linear attention methods for autoregressive LLMs, integrating them with speculative decoding. We introduce an augmentation technique for linear attention that ensures compatibility with speculative decoding, enabling more efficient training and serving of LLMs. Extensive experiments and ablation studies involving seven existing linear attention models and five encoder/decoder-based LLMs consistently validate the effectiveness of our augmented linearized LLMs. Notably, our approach achieves up to a 6.67 reduction in perplexity on the LLaMA model and up to a 2$\\times$ speedup during generation compared to prior linear attention methods. 
Codes and models are available at https://github.com/GATECH-EIC/Linearized-LLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoran You;Yichao Fu;Zheng Wang;Amir Yazdanbakhsh;Yingyan Celine Lin", "authorids": "~Haoran_You1;~Yichao_Fu1;~Zheng_Wang38;~Amir_Yazdanbakhsh1;~Yingyan_Celine_Lin1", "gender": "M;;M;M;", "homepage": "http://haoranyou.com/;;https://zkbig.github.io/;https://www.ayazdan.com/;", "dblp": "230/4247;;;44/8745;", "google_scholar": "z5Eku1sAAAAJ;;MIBiy2gAAAAJ;Vdu_sqwAAAAJ;", "orcid": "0000-0002-2873-2153;;;0000-0001-8199-7671;", "linkedin": "haoran-you-b4b958165/;;;ayazdanb/;", "or_profile": "~Haoran_You1;~Yichao_Fu1;~Zheng_Wang38;~Amir_Yazdanbakhsh1;~Yingyan_Celine_Lin1", "aff": "Georgia Institute of Technology;;Georgia Institute of Technology;Google DeepMind;", "aff_domain": "gatech.edu;;gatech.edu;google.com;", "position": "PhD student;;MS student;Researcher;", "bibtex": "@inproceedings{\nyou2024when,\ntitle={When Linear Attention Meets Autoregressive Decoding: Towards More Effective and Efficient Linearized Large Language Models},\nauthor={Haoran You and Yichao Fu and Zheng Wang and Amir Yazdanbakhsh and Yingyan Celine Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7mFSaP6IiN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2876282, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9587173720616027049&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "gatech.edu;;gatech.edu;google.com;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Georgia Institute of Technology;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.gatech.edu;https://deepmind.com", "aff_unique_abbr": "Georgia Tech;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Predicting and Interpreting Energy Barriers of Metallic Glasses with Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34870", "id": "7rTbqkKvA6", "proceeding": "https://proceedings.mlr.press/v235/li24cm.html", "pdf": "https://openreview.net/pdf?id=7rTbqkKvA6", "openreview": "https://openreview.net/forum?id=7rTbqkKvA6", "author_site": "Haoyu Li, Shichang Zhang, Longwen Tang, Mathieu Bauchy, Yizhou Sun", "tldr": "", "abstract": "Metallic Glasses (MGs) are widely used materials that are stronger than steel while being shapeable as plastic. While understanding the structure-property relationship of MGs remains a challenge in materials science, studying their energy barriers (EBs) as an intermediary step shows promise. In this work, we utilize Graph Neural Networks (GNNs) to model MGs and study EBs. We contribute a new dataset for EB prediction and a novel Symmetrized GNN (SymGNN) model that is E(3)-invariant in expectation. SymGNN handles invariance by aggregating over orthogonal transformations of the graph structure. When applied to EB prediction, SymGNN is more accurate than molecular dynamics (MD) local-sampling methods and other machine-learning models. Compared to precise MD simulations, SymGNN reduces the inference time on new MGs from roughly **41 days** to **less than one second**. We apply explanation algorithms to reveal the relationship between structures and EBs. 
The structures that we identify through explanations match the medium-range order (MRO) hypothesis and possess unique topological properties. Our work enables effective prediction and interpretation of MG EBs, bolstering materials science research.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyu Li;Shichang Zhang;Longwen Tang;Mathieu Bauchy;Yizhou Sun", "authorids": "~Haoyu_Li4;~Shichang_Zhang2;~Longwen_Tang1;~Mathieu_Bauchy1;~Yizhou_Sun1", "gender": "M;M;M;;F", "homepage": "https://haoyuli02.github.io/;https://shichangzh.github.io/;https://scholar.google.com/citations?user=0FmJFb4AAAAJ&hl=zh-CN;http://www.lab-paris.com;http://web.cs.ucla.edu/~yzsun/", "dblp": ";234/4118;;;37/3868", "google_scholar": "vOwTylIAAAAJ;TYqG0x4AAAAJ;0FmJFb4AAAAJ;TsvYdEkAAAAJ;https://scholar.google.com.tw/citations?user=TQgOjK0AAAAJ", "orcid": ";0000-0003-0954-5018;;0000-0003-4600-0631;", "linkedin": "haoyu-li-4b6787235;shichang-zhang-4430a4106/;;bauchy/;", "or_profile": "~Haoyu_Li4;~Shichang_Zhang2;~Longwen_Tang1;~Mathieu_Bauchy1;~Yizhou_Sun1", "aff": "University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "ucla.edu;cs.ucla.edu;ucla.edu;ucla.edu;ucla.edu", "position": "Undergrad student;PhD student;Postdoc;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024predicting,\ntitle={Predicting and Interpreting Energy Barriers of Metallic Glasses with Graph Neural Networks},\nauthor={Haoyu Li and Shichang Zhang and Longwen Tang and Mathieu Bauchy and Yizhou Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7rTbqkKvA6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6358168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17275451219043130722&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "ucla.edu;cs.ucla.edu;ucla.edu;ucla.edu;ucla.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "DOGE: Domain Reweighting with Generalization Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34869", "id": "7rfZ6bMZq4", "proceeding": "https://proceedings.mlr.press/v235/fan24e.html", "pdf": "https://openreview.net/pdf?id=7rfZ6bMZq4", "openreview": "https://openreview.net/forum?id=7rfZ6bMZq4", "author_site": "Simin Fan, Matteo Pagliardini, Martin Jaggi", "tldr": "", "abstract": "The coverage and composition of the pretraining data significantly impact the generalization ability of Large Language Models (LLMs). Despite their importance, recent LLMs still rely on heuristics and trial and error to increase or reduce the influence of data domains. We propose DOmain reweighting with Generalization Estimation (DoGE), which optimizes the probability of sampling from each domain (domain weights) in a principled way. 
Our approach is a two-stage process consisting of (i) training a proxy model to obtain domain weights using a bi-level optimization algorithm and (ii) training a larger base model by sampling training domains according to the learnt domain weights. In our experiments, we extensively show how DoGE improves the generalization of the base model to any target data mixture. On the SlimPajama dataset, our base model achieves better perplexity and few-shot reasoning accuracies across 6 tasks compared to baseline methods. Moreover, when generalizing to out-of-domain target tasks unseen in the pretraining corpus (OOD domain), DoGE effectively identifies inter-domain dependencies and consistently achieves better test perplexity on the target domain.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simin Fan;Matteo Pagliardini;Martin Jaggi", "authorids": "~Simin_Fan1;~Matteo_Pagliardini1;~Martin_Jaggi1", "gender": "F;M;M", "homepage": "https://olivia-fsm.github.io/;;https://mlo.epfl.ch", "dblp": ";140/7789;17/4402", "google_scholar": "YFJJxpQAAAAJ;https://scholar.google.ch/citations?user=FXacC3oAAAAJ;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ", "orcid": "0000-0002-1490-9413;;0000-0003-1579-5558", "linkedin": ";;", "or_profile": "~Simin_Fan1;~Matteo_Pagliardini1;~Martin_Jaggi1", "aff": "EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;EPFL", "aff_domain": "epfl.ch;epfl.ch;epfl.ch", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nfan2024doge,\ntitle={{DOGE}: Domain Reweighting with Generalization Estimation},\nauthor={Simin Fan and Matteo Pagliardini and Martin Jaggi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7rfZ6bMZq4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5808289, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5797681520589461559&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "epfl.ch;epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Neural Collapse meets Differential Privacy: Curious behaviors of NoisyGD with Near-Perfect Representation Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34868", "id": "7rrN6E4KU0", "proceeding": "https://proceedings.mlr.press/v235/wang24cu.html", "pdf": "https://openreview.net/pdf?id=7rrN6E4KU0", "openreview": "https://openreview.net/forum?id=7rrN6E4KU0", "author_site": "Chendi Wang, Yuqing Zhu, Weijie Su, Yu-Xiang Wang", "tldr": "", "abstract": "A recent study by De et al. (2022) shows that large-scale representation learning through pre-training on a public dataset significantly enhances differentially private (DP) learning in downstream tasks. To explain this, we consider a layer-peeled model in representation learning, resulting in Neural Collapse (NC) phenomena. Within NC, we establish that the misclassification error is independent of dimension when the distance between actual and ideal features is below a threshold. 
We empirically evaluate feature quality in the last layer under different pre-trained models, showing that a more powerful pre-trained model improves feature representation. Moreover, we show that DP fine-tuning is less robust than non-DP fine-tuning, especially in the presence of perturbations. Supported by theoretical analyses and experiments, we suggest strategies like feature normalization and dimension reduction methods such as PCA to enhance DP fine-tuning robustness. Conducting PCA on last-layer features significantly improves testing accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chendi Wang;Yuqing Zhu;Weijie J Su;Yu-Xiang Wang", "authorids": "~Chendi_Wang2;~Yuqing_Zhu1;~Weijie_J_Su1;~Yu-Xiang_Wang1", "gender": "M;F;M;", "homepage": ";https://jeremy43.github.io/;http://stat.wharton.upenn.edu/~suw/;http://www.cs.ucsb.edu/~yuxiangw/publications.html", "dblp": ";;228/9127;62/1637-3.html", "google_scholar": ";QmMv9PIAAAAJ;Uhf4nBkAAAAJ;HGNZ1fkAAAAJ", "orcid": "0000-0001-5321-1846;;;", "linkedin": ";;;", "or_profile": "~Chendi_Wang2;~Yuqing_Zhu1;~Weijie_J_Su1;~Yu-Xiang_Wang1", "aff": "Shenzhen Research Institute of Big Data;TikTok;University of Pennsylvania;UC Santa Barbara", "aff_domain": "cuhk.edu.cn;tiktok.com;upenn.edu;ucsb.edu", "position": "Postdoc;Researcher;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024neural,\ntitle={Neural Collapse meets Differential Privacy: Curious behaviors of Noisy{GD} with Near-Perfect Representation Learning},\nauthor={Chendi Wang and Yuqing Zhu and Weijie J Su and Yu-Xiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7rrN6E4KU0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 653186, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m8cYqs8s0F0J:scholar.google.com/&scioq=Neural+Collapse+meets+Differential+Privacy:+Curious+behaviors+of+NoisyGD+with+Near-Perfect+Representation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "cuhk.edu.cn;tiktok.com;upenn.edu;ucsb.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Shenzhen Research Institute of Big Data;TikTok;University of Pennsylvania;University of California, Santa Barbara", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.sribd.cn;https://www.tiktok.com;https://www.upenn.edu;https://www.ucsb.edu", "aff_unique_abbr": ";TikTok;UPenn;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "A General Framework for Learning from Weak Supervision", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34867", "id": "7sgqXa4aNM", "proceeding": "https://proceedings.mlr.press/v235/chen24ar.html", "pdf": "https://openreview.net/pdf?id=7sgqXa4aNM", "openreview": "https://openreview.net/forum?id=7sgqXa4aNM", "author_site": "Hao Chen, Jindong Wang, Lei Feng, Xiang Li, Yidong Wang, Xing Xie, Masashi Sugiyama, Rita Singh, Bhiksha Raj", "tldr": "", "abstract": "Weakly supervised learning generally faces challenges in applicability to various scenarios with diverse weak supervision and in scalability due to the complexity of existing algorithms, thereby hindering practical deployment. 
This paper introduces a general framework for learning from weak supervision (GLWS) with a novel algorithm. Central to GLWS is an Expectation-Maximization (EM) formulation, adeptly accommodating various weak supervision sources, including instance partial labels, aggregate statistics, pairwise observations, and unlabeled data. We further present an advanced algorithm that significantly simplifies the EM computational demands using a Non-deterministic Finite Automaton (NFA) along with a forward-backward algorithm, which effectively reduces the time complexity from the quadratic or factorial scale often required in existing solutions to linear. The problem of learning from arbitrary weak supervision is therefore converted to modeling the weak supervision with an NFA. GLWS not only enhances the scalability of machine learning models but also demonstrates superior performance and versatility across 11 weak supervision scenarios. We hope our work paves the way for further advancements and practical deployment in this field.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Chen;Jindong Wang;Lei Feng;Xiang Li;Yidong Wang;Xing Xie;Masashi Sugiyama;Rita Singh;Bhiksha Raj", "authorids": "~Hao_Chen15;~Jindong_Wang1;~Lei_Feng1;~Xiang_Li35;~Yidong_Wang1;~Xing_Xie3;~Masashi_Sugiyama1;~Rita_Singh1;~Bhiksha_Raj1", "gender": "M;;M;;M;M;M;F;M", "homepage": "https://hhhhhhao.github.io/;;https://lfeng1995.github.io/;;https://qianlanwyd.github.io/;http://research.microsoft.com/en-us/people/xingx/;http://www.ms.k.u-tokyo.ac.jp/sugi/;http://mlsp.cs.cmu.edu/people/rsingh/index.html;https://www.cs.cmu.edu/directory/bhikshar/", "dblp": ";;76/847-6;;59/6759.html;08/6809-1;35/1228;;60/3996", "google_scholar": "tktqkhwAAAAJ;;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;;;5EQfAFIAAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;;", "orcid": ";;0000-0003-2839-5799;;;0000-0002-8608-8482;0000-0001-6658-6743;;", "linkedin": "haochen97/;;;;;xingx/;;;", "or_profile": "~Hao_Chen15;~Jindong_Wang1;~Lei_Feng1;~Xiang_Li35;~Yidong_Wang1;~Xing_Xie3;~Masashi_Sugiyama1;~Rita_Singh1;~Bhiksha_Raj1", "aff": "Carnegie Mellon University;;Singapore University of Technology and Design;;Peking University;Microsoft Research Asia;The University of Tokyo;School of Computer Science, Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "andrew.cmu.edu;;sutd.edu.sg;;pku.edu.cn;microsoft.com;u-tokyo.ac.jp;cs.cmu.edu;mbzuai.ac.ae", "position": "PhD student;;Assistant Professor;;PhD student;Senior Principal Researcher;Full Professor;Research Professor;Full Professor", "bibtex": "@inproceedings{\nchen2024a,\ntitle={A General Framework for Learning from Weak Supervision},\nauthor={Hao Chen and Jindong Wang and Lei Feng and Xiang Li and Yidong Wang and Xing Xie and Masashi Sugiyama and Rita Singh and Bhiksha Raj},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7sgqXa4aNM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1298471, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12509781191741282597&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "andrew.cmu.edu;;sutd.edu.sg;;pku.edu.cn;microsoft.com;u-tokyo.ac.jp;cs.cmu.edu;mbzuai.ac.ae", "author_num": 9, "aff_unique_index": "0;1;2;3;4;0;5", "aff_unique_norm": "Carnegie Mellon University;Singapore University of Technology 
and Design;Peking University;Microsoft;University of Tokyo;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;Research;;", "aff_unique_url": "https://www.cmu.edu;https://www.sutd.edu.sg;http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.u-tokyo.ac.jp;https://mbzuai.ac.ae", "aff_unique_abbr": "CMU;SUTD;Peking U;MSR Asia;UTokyo;MBZUAI", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Asia;Pittsburgh", "aff_country_unique_index": "0;1;2;2;3;0;4", "aff_country_unique": "United States;Singapore;China;Japan;United Arab Emirates" }, { "title": "Synergistic Integration of Coordinate Network and Tensorial Feature for Improving Neural Radiance Fields from Sparse Inputs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34866", "id": "7tyAO5tUF8", "proceeding": "https://proceedings.mlr.press/v235/kim24j.html", "pdf": "https://openreview.net/pdf?id=7tyAO5tUF8", "openreview": "https://openreview.net/forum?id=7tyAO5tUF8", "author_site": "Mingyu Kim, Kim Jun-Seong, Se-Young Yun, Jin-Hwa Kim", "tldr": "", "abstract": "The multi-plane representation has been highlighted for its fast training and inference across static and dynamic neural radiance fields. This approach constructs relevant features via projection onto learnable grids and interpolation of adjacent vertices. However, it has limitations in capturing low-frequency details and tends to overuse parameters for low-frequency features due to its bias toward fine details, despite its multi-resolution concept. This phenomenon leads to instability and inefficiency when training poses are sparse. In this work, we propose a method that synergistically integrates multi-plane representation with a coordinate-based MLP network known for its strong bias toward low-frequency signals. The coordinate-based network is responsible for capturing low-frequency details, while the multi-plane representation focuses on capturing fine-grained details. We demonstrate that using residual connections between them seamlessly preserves their inherent properties. Additionally, the proposed progressive training scheme accelerates the disentanglement of these two features. 
We demonstrate empirically that our proposed method not only outperforms baseline models for both static and dynamic NeRFs with sparse inputs, but also achieves comparable results with fewer parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingyu Kim;Kim Jun-Seong;Se-Young Yun;Jin-Hwa Kim", "authorids": "~Mingyu_Kim2;~Kim_Jun-Seong1;~Se-Young_Yun1;~Jin-Hwa_Kim1", "gender": "M;M;Unspecified;M", "homepage": ";https://fbsqkd.github.io;http://wityworks.com;https://mingyukim87.github.io", "dblp": "326/5743;23/8862;48/258;64/10624-2", "google_scholar": "CMqYPY8AAAAJ;X_IAjb8AAAAJ;https://scholar.google.co.kr/citations?user=3f2wPekAAAAJ;nrZYdwQAAAAJ", "orcid": ";;0000-0002-0423-0415;0000-0001-5082-7223", "linkedin": "junseong-kim-325120255/;seyoung-yun-395130ab/;;mingyu-kim-0380a2125/", "or_profile": "~Kim_Jun-Seong1;~Se-Young_Yun1;~Jin-Hwa_Kim1;~Kim_Mingyu1", "aff": "Pohang University of Science and Technology;KAIST;NAVER;Korea Advanced Institute of Science & Technology", "aff_domain": "postech.edu;kaist.ac.kr;navercorp.com;kaist.ac.kr", "position": "PhD student;Assistant Professor;Research Scientist;PhD student", "bibtex": "@inproceedings{\nkim2024synergistic,\ntitle={Synergistic Integration of Coordinate Network and Tensorial Feature for Improving Neural Radiance Fields from Sparse Inputs},\nauthor={Mingyu Kim and Kim Jun-Seong and Se-Young Yun and Jin-Hwa Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7tyAO5tUF8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7609153, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16366002340270236295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "postech.edu;kaist.ac.kr;navercorp.com;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Pohang University of Science and Technology;Korea Advanced Institute of Science and Technology;NAVER Corporation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.postech.ac.kr;https://www.kaist.ac.kr;https://www.naver.com", "aff_unique_abbr": "POSTECH;KAIST;NAVER", "aff_campus_unique_index": "0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Vector Quantization Pretraining for EEG Time Series with Random Projection and Phase Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34865", "id": "7uwLvFvpis", "proceeding": "https://proceedings.mlr.press/v235/gui24a.html", "pdf": "https://openreview.net/pdf?id=7uwLvFvpis", "openreview": "https://openreview.net/forum?id=7uwLvFvpis", "author_site": "Haokun Gui, Xiucheng Li, Xinyang Chen", "tldr": "", "abstract": "In this paper, we propose a BERT-style self-supervised learning model, VQ-MTM (Vector Quantization Masked Time-Series Modeling), for EEG time series data analysis. At its core, VQ-MTM comprises a theoretically grounded random-projection quantization module and a phase-aligning module guided by the Time-Phase-Shift Equivariance of the Fourier Transform. The two modules can generate well-defined semantic units (akin to words in natural language) for the corrupted and periodic time series, thus offering robust and consistent learning signals for EEG self-supervised learning. VQ-MTM also has low model complexity and can easily adapt to large-scale datasets. 
We conduct experiments on five real-world datasets, including two large-scale datasets, to verify the efficacy of our proposed model. The experimental results show that VQ-MTM consistently surpasses existing methods by large margins on both seizure detection and classification tasks. Our code is available at https://github.com/HaokunGUI/VQ_MTM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haokun GUI;Xiucheng Li;Xinyang Chen", "authorids": "~Haokun_GUI1;~Xiucheng_Li2;~Xinyang_Chen1", "gender": "M;M;Not Specified", "homepage": "https://haokungui.github.io/;https://xiucheng.org/;https://chenxinyang123.github.io/", "dblp": ";152/8201;242/3871-1", "google_scholar": ";https://scholar.google.com.sg/citations?user=qFSxE6YAAAAJ;qVxhGWUAAAAJ", "orcid": ";;0000-0001-6743-838X", "linkedin": ";;", "or_profile": "~Haokun_GUI1;~Xiucheng_Li2;~Xinyang_Chen1", "aff": "Harbin Institute of Technology, Shenzhen;Harbin Institute of Technology;Harbin Institute of Technology, Shenzhen", "aff_domain": "stu.hit.edu.cn;hit.edu.cn;hit.edu.cn", "position": "Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ngui2024vector,\ntitle={Vector Quantization Pretraining for {EEG} Time Series with Random Projection and Phase Alignment},\nauthor={Haokun GUI and Xiucheng Li and Xinyang Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7uwLvFvpis}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1009751, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7869667262750363256&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stu.hit.edu.cn;hit.edu.cn;hit.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harbin Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://en.hit.edu.cn/", "aff_unique_abbr": "HIT", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Shenzhen;Harbin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Boximator: Generating Rich and Controllable Motions for Video Synthesis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34864", "id": "7wgXuNOF0V", "proceeding": "https://proceedings.mlr.press/v235/wang24cr.html", "pdf": "https://openreview.net/pdf?id=7wgXuNOF0V", "openreview": "https://openreview.net/forum?id=7wgXuNOF0V", "author_site": "Jiawei Wang, Yuchen Zhang, Jiaxin Zou, Yan Zeng, Guoqiang Wei, Liping Yuan, Hang Li", "tldr": "", "abstract": "Generating rich and controllable motion is a pivotal challenge in video synthesis. We propose *Boximator*, a new approach for fine-grained motion control. Boximator introduces two constraint types: *hard box* and *soft box*. Users select objects in the conditional frame using hard boxes and then use either type of box to roughly or rigorously define the object\u2019s position, shape, or motion path in future frames. Boximator functions as a plug-in for existing video diffusion models. Its training process preserves the base model\u2019s knowledge by freezing the original weights and training only the control module. To address training challenges, we introduce a novel *self-tracking* technique that greatly simplifies the learning of box-object correlations. 
Empirically, Boximator achieves state-of-the-art video quality (FVD) scores, improving on two base models, and is further enhanced after incorporating box constraints. Its robust motion controllability is validated by drastic increases in the bounding box alignment metric. Human evaluation also shows that users favor Boximator generation results over the base model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiawei Wang;Yuchen Zhang;Jiaxin Zou;Yan Zeng;Guoqiang Wei;Liping Yuan;Hang Li", "authorids": "~Jiawei_Wang14;~Yuchen_Zhang1;~Jiaxin_Zou1;~Yan_Zeng1;~Guoqiang_Wei1;~Liping_Yuan2;~Hang_Li4", "gender": "M;M;F;M;F;M;M", "homepage": ";;;https://guoqiangwei.xyz/;;https://hangli-hl.github.io/;", "dblp": ";;83/4665-3.html;234/8900;04/40.html;https://dblp.org/pers/hd/l/Li_0001:Hang;09/5661-2", "google_scholar": "https://scholar.google.com.hk/citations?user=greXPJ8AAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?view_op=list_works;nTl5mSwAAAAJ;Om4Lag0AAAAJ", "orcid": ";0009-0009-7870-0174;;0000-0003-1846-5693;;0000-0001-9628-3487;", "linkedin": ";;;;;hang-li-84aa6314/;", "or_profile": "~Jiawei_Wang14;~Jiaxin_Zou1;~Yan_Zeng1;~Guoqiang_Wei1;~Liping_Yuan2;~Hang_Li4;~Yuchen_Zhang2", "aff": "ByteDance Inc.;ByteDance Ltd.;ByteDance;ByteDance;ByteDance Inc.;ByteDance Technology;ByteDance Inc.", "aff_domain": "bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com", "position": "Researcher;Researcher;Researcher;Researcher;Researcher;Head of Research;Researcher", "bibtex": "@inproceedings{\nwang2024boximator,\ntitle={Boximator: Generating Rich and Controllable Motions for Video Synthesis},\nauthor={Jiawei Wang and Yuchen Zhang and Jiaxin Zou and Yan Zeng and Guoqiang Wei and Liping Yuan and Hang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7wgXuNOF0V}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7066068, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6765650631207769657&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "ByteDance", "aff_unique_dep": "", "aff_unique_url": "https://www.bytedance.com", "aff_unique_abbr": "ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Risk Estimation in a Markov Cost Process: Lower and Upper Bounds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34863", "id": "7xzhKEPfBo", "proceeding": "https://proceedings.mlr.press/v235/thoppe24a.html", "pdf": "https://openreview.net/pdf?id=7xzhKEPfBo", "openreview": "https://openreview.net/forum?id=7xzhKEPfBo", "author_site": "Gugan Chandrashekhar Mallika Thoppe, Prashanth L.A., Sanjay Bhat", "tldr": "", "abstract": "We tackle the problem of estimating risk measures of the infinite-horizon discounted cost of a Markov cost process. The risk measures we study include variance, Value-at-Risk (VaR), and Conditional Value-at-Risk (CVaR). 
First, we show that estimating any of these risk measures with $\epsilon$-accuracy, either in the expected or the high-probability sense, requires at least $\Omega(1/\epsilon^2)$ samples. Then, using a truncation scheme, we derive an upper bound for the CVaR and variance estimation. This bound matches our lower bound up to logarithmic factors. Finally, we discuss an extension of our estimation scheme that covers more general risk measures satisfying a certain continuity criterion, such as spectral risk measures and utility-based shortfall risk. To the best of our knowledge, our work is the first to provide lower and upper bounds for estimating any risk measure beyond the mean within a Markovian setting. Our lower bounds also extend to the mean of the infinite-horizon discounted cost. Even in that case, our lower bound of $\Omega(1/\epsilon^2)$ improves upon the existing $\Omega(1/\epsilon)$ bound (Metelli et al., 2023).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gugan Thoppe;Prashanth L A;Sanjay P. Bhat", "authorids": "~Gugan_Thoppe1;~Prashanth_L_A1;~Sanjay_P._Bhat1", "gender": "M;M;M", "homepage": ";http://www.cse.iitm.ac.in/~prashla/;", "dblp": "117/3710;90/3161;", "google_scholar": "https://scholar.google.co.in/citations?user=X5zV3s8AAAAJ;https://scholar.google.co.in/citations?user=Q1YXWpoAAAAJ;CL6xIH8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Gugan_Thoppe1;~Prashanth_L_A1;~Sanjay_P._Bhat1", "aff": "Indian Institute of Science;Indian Institute of Technology Madras;Tata Consultancy Services Limited, India", "aff_domain": "iisc.ac.in;iitm.ac.in;tcs.com", "position": "Assistant Professor;Assistant Professor;Principal Scientist ", "bibtex": "@inproceedings{\nthoppe2024risk,\ntitle={Risk Estimation in a Markov Cost Process: Lower and Upper Bounds},\nauthor={Gugan Thoppe and Prashanth L A and Sanjay P. Bhat},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7xzhKEPfBo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 412329, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vj9lTlmYpZEJ:scholar.google.com/&scioq=Risk+Estimation+in+a+Markov+Cost+Process:+Lower+and+Upper+Bounds&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "iisc.ac.in;iitm.ac.in;tcs.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Indian Institute of Science;Indian Institute of Technology Madras;Tata Consultancy Services Limited", "aff_unique_dep": ";;", "aff_unique_url": "https://www.iisc.ac.in;https://www.iitm.ac.in;https://www.tcs.com", "aff_unique_abbr": "IISc;IIT Madras;TCS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madras", "aff_country_unique_index": "0;0;0", "aff_country_unique": "India" }, { "title": "Privacy Backdoors: Stealing Data with Corrupted Pretrained Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34862", "id": "7yixJXmzb8", "proceeding": "https://proceedings.mlr.press/v235/feng24h.html", "pdf": "https://openreview.net/pdf?id=7yixJXmzb8", "openreview": "https://openreview.net/forum?id=7yixJXmzb8", "author_site": "Shanglun Feng, Florian Tramer", "tldr": "", "abstract": "Practitioners commonly download pretrained machine learning models from open repositories and finetune them to fit specific applications. We show that this practice introduces a new risk of privacy backdoors. 
By tampering with a pretrained model\u2019s weights, an attacker can fully compromise the privacy of the finetuning data. We show how to build privacy backdoors for a variety of models, including transformers, which enable an attacker to reconstruct individual finetuning samples with guaranteed success! We further show that backdoored models allow for tight privacy attacks on models trained with differential privacy (DP). The common optimistic practice of training DP models with loose privacy guarantees is thus insecure if the model is not trusted. Overall, our work highlights a crucial and overlooked supply chain attack on machine learning privacy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shanglun Feng;Florian Tram\u00e8r", "authorids": "~Shanglun_Feng1;~Florian_Tram\u00e8r1", "gender": "M;M", "homepage": ";http://floriantramer.com", "dblp": ";158/7224", "google_scholar": ";https://scholar.google.ch/citations?user=ijH0-a8AAAAJ", "orcid": ";", "linkedin": "shanglun-feng-a95828250/;", "or_profile": "~Shanglun_Feng1;~Florian_Tramer1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\nfeng2024privacy,\ntitle={Privacy Backdoors: Stealing Data with Corrupted Pretrained Models},\nauthor={Shanglun Feng and Florian Tram{\\`e}r},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7yixJXmzb8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4139542, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16387858983629109909&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": "ethz.ch;ethz.ch", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Layerwise Change of Knowledge in Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34861", "id": "7zEoinErzQ", "proceeding": "https://proceedings.mlr.press/v235/cheng24b.html", "pdf": "https://openreview.net/pdf?id=7zEoinErzQ", "openreview": "https://openreview.net/forum?id=7zEoinErzQ", "author_site": "Xu Cheng, Lei Cheng, Zhaoran Peng, Yang Xu, Tian Han, Quanshi Zhang", "tldr": "", "abstract": "This paper aims to explain how a deep neural network (DNN) gradually extracts new knowledge and forgets noisy features through layers in forward propagation. Although no consensus has yet been reached on how to define the knowledge encoded by a DNN, previous studies have derived a series of mathematical results that support taking interactions as symbolic primitive inference patterns encoded by a DNN. We extend the definition of interactions and, for the first time, extract interactions encoded by intermediate layers. We quantify and track the newly emerged interactions and the forgotten interactions in each layer during the forward propagation, which sheds new light on the learning behavior of DNNs. 
The layer-wise change of interactions also reveals changes in the generalization capacity and in the instability of a DNN's feature representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xu Cheng;Lei Cheng;Zhaoran Peng;Yang Xu;Tian Han;Quanshi Zhang", "authorids": "~Xu_Cheng1;~Lei_Cheng2;~Zhaoran_Peng2;~Yang_Xu19;~Tian_Han1;~Quanshi_Zhang1", "gender": "F;M;;M;M;M", "homepage": "https://cx1208.github.io/ChengXuSJTU.github.io/;https://github.com/chengstones;;https://hthth0801.github.io/;http://qszhang.com;https://superposition09m.github.io/", "dblp": "30/828-5;;;65/4065-1;http://dblp.uni-trier.de/pers/hd/z/Zhang:Quanshi;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;Qtvu5t4AAAAJ;iFFhHK0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0001-5086-5673;;;;;", "linkedin": ";;zhaoran-peng-712b32264/;;;", "or_profile": "~Xu_Cheng1;~Lei_Cheng2;~Zhaoran_Peng2;~Tian_Han1;~Quanshi_Zhang1;~Yang_Xu20", "aff": "Nanjing University of Science and Technology;Shanghai Jiaotong University;Shanghai Jiaotong University;Stevens Institute of Technology;Shanghai Jiaotong University;Zhejiang University", "aff_domain": "njust.edu.cn;sjtu.edu.cn;sjtu.edu.cn;stevens.edu;sjtu.edu.cn;zju.edu.cn", "position": "Assistant Professor;Undergrad student;Undergrad student;Assistant Professor;Associate Professor;Undergrad student", "bibtex": "@inproceedings{\ncheng2024layerwise,\ntitle={Layerwise Change of Knowledge in Neural Networks},\nauthor={Xu Cheng and Lei Cheng and Zhaoran Peng and Yang Xu and Tian Han and Quanshi Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7zEoinErzQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4875667, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16693137472964295264&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "njust.edu.cn;sjtu.edu.cn;sjtu.edu.cn;stevens.edu;sjtu.edu.cn;zju.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;2;1;3", "aff_unique_norm": "Nanjing University of Science and Technology;Shanghai Jiao Tong University;Stevens Institute of Technology;Zhejiang University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.nust.edu.cn/;https://www.sjtu.edu.cn;https://www.stevens.edu;https://www.zju.edu.cn", "aff_unique_abbr": "NUST;SJTU;SIT;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Compositional Image Decomposition with Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34860", "id": "7zvl9mNQG2", "proceeding": "https://proceedings.mlr.press/v235/su24c.html", "pdf": "https://openreview.net/pdf?id=7zvl9mNQG2", "openreview": "https://openreview.net/forum?id=7zvl9mNQG2", "author_site": "Jocelin Su, Nan Liu, Yanbo Wang, Josh Tenenbaum, Yilun Du", "tldr": "", "abstract": "Given an image of a natural scene, we are able to quickly decompose it into a set of components such as objects, lighting, shadows, and foreground. We can then envision a scene where we combine certain components with those from other images, for instance a set of objects from our bedroom and animals from a zoo under the lighting conditions of a forest, even if we have never encountered such a scene before. 
In this paper, we present a method to decompose an image into such compositional components. Our approach, Decomp Diffusion, is an unsupervised method which, when given a single image, infers a set of different components in the image, each represented by a diffusion model. We demonstrate how components can capture different factors of the scene, ranging from global scene descriptors like shadows or facial expression to local scene descriptors like constituent objects. We further illustrate how inferred factors can be flexibly composed, even with factors inferred from other models, to generate a variety of scenes sharply different from those seen at training time. Code and visualizations are at https://energy-based-model.github.io/decomp-diffusion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jocelin Su;Nan Liu;Yanbo Wang;Joshua B. Tenenbaum;Yilun Du", "authorids": "~Jocelin_Su1;~Nan_Liu4;~Yanbo_Wang3;~Joshua_B._Tenenbaum1;~Yilun_Du1", "gender": ";;M;;", "homepage": ";;http://microelectronics.tudelft.nl/People/bio.php?id=810;;https://yilundu.github.io", "dblp": ";;;t/JoshuaBTenenbaum;204/4379", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Jocelin_Su1;~Nan_Liu4;~Yanbo_Wang3;~Joshua_B._Tenenbaum1;~Yilun_Du1", "aff": ";;Delft University of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": ";;tudelft.nl;mit.edu;mit.edu", "position": ";;PhD student;Professor;PhD student", "bibtex": "@inproceedings{\nsu2024compositional,\ntitle={Compositional Image Decomposition with Diffusion Models},\nauthor={Jocelin Su and Nan Liu and Yanbo Wang and Joshua B. Tenenbaum and Yilun Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7zvl9mNQG2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6993208, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16868049165076129421&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";;tudelft.nl;mit.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Delft University of Technology;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tudelft.nl;https://web.mit.edu", "aff_unique_abbr": "TU Delft;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Netherlands;United States" }, { "title": "DIDI: Diffusion-Guided Diversity for Offline Behavioral Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34859", "id": "8296yUBoXr", "proceeding": "https://proceedings.mlr.press/v235/liu24s.html", "pdf": "https://openreview.net/pdf?id=8296yUBoXr", "openreview": "https://openreview.net/forum?id=8296yUBoXr", "author_site": "Jinxin Liu, Xinghong Guo, Zifeng Zhuang, Donglin Wang", "tldr": "", "abstract": "In this paper, we propose a novel approach called DIffusion-guided DIversity (DIDI) for offline behavioral generation. The goal of DIDI is to learn a diverse set of skills from a mixture of label-free offline data. We achieve this by leveraging diffusion probabilistic models as priors to guide the learning process and regularize the policy. 
By optimizing a joint objective that incorporates diversity and diffusion-guided regularization, we encourage the emergence of diverse behaviors while maintaining similarity to the offline data. Experimental results in four decision-making domains (Push, Kitchen, Humanoid, and D4RL tasks) show that DIDI is effective in discovering diverse and discriminative skills. We also introduce skill stitching and skill interpolation, which highlight the generalist nature of the learned skill space. Further, by incorporating an extrinsic reward function, DIDI enables reward-guided behavior generation, facilitating the learning of diverse and optimal behaviors from sub-optimal data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinxin Liu;Xinghong Guo;Zifeng Zhuang;Donglin Wang", "authorids": "~Jinxin_Liu1;~Xinghong_Guo1;~Zifeng_Zhuang1;~Donglin_Wang1", "gender": ";M;M;M", "homepage": ";;;https://milab.westlake.edu.cn/", "dblp": ";;276/5034;", "google_scholar": ";clYA4F8AAAAJ;;https://scholar.google.ca/citations?user=-fo6wdwAAAAJ", "orcid": ";;;0000-0002-8188-3735", "linkedin": ";;;", "or_profile": "~Jinxin_Liu1;~Xinghong_Guo1;~Zifeng_Zhuang1;~Donglin_Wang1", "aff": ";;Zhejiang University;Westlake University", "aff_domain": ";;zju.edu.cn;westlake.edu.cn", "position": ";;PhD student;Associate Professor", "bibtex": "@inproceedings{\nliu2024didi,\ntitle={{DIDI}: Diffusion-Guided Diversity for Offline Behavioral Generation},\nauthor={Jinxin Liu and Xinghong Guo and Zifeng Zhuang and Donglin Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8296yUBoXr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8503363, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10241038766856691648&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": ";;zju.edu.cn;westlake.edu.cn", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Zhejiang University;Westlake University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.westlake.edu.cn", "aff_unique_abbr": "ZJU;WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Revitalizing Multivariate Time Series Forecasting: Learnable Decomposition with Inter-Series Dependencies and Intra-Series Variations Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34858", "id": "87CYNyCGOo", "proceeding": "https://proceedings.mlr.press/v235/yu24s.html", "pdf": "https://openreview.net/pdf?id=87CYNyCGOo", "openreview": "https://openreview.net/forum?id=87CYNyCGOo", "author_site": "Guoqi Yu, Jing Zou, Xiaowei Hu, Angelica I Aviles-Rivero, Jing Qin, Shujun Wang", "tldr": "", "abstract": "Predicting multivariate time series is crucial, demanding precise modeling of intricate patterns, including inter-series dependencies and intra-series variations. Distinctive trend characteristics in each time series pose challenges, and existing methods, relying on basic moving average kernels, may struggle with the non-linear structure and complex trends in real-world data. To address this, we introduce a learnable decomposition strategy to capture dynamic trend information more reasonably. 
Additionally, we propose a dual attention module tailored to capture inter-series dependencies and intra-series variations simultaneously for better time series forecasting, which is implemented by channel-wise self-attention and autoregressive self-attention. To evaluate the effectiveness of our method, we conducted experiments across eight open-source datasets and compared it with state-of-the-art methods. The comparison shows that our $\textbf{Leddam}$ ($\textbf{LE}$arnable $\textbf{D}$ecomposition and $\textbf{D}$ual $\textbf{A}$ttention $\textbf{M}$odule) not only demonstrates significant advancements in predictive performance, but the proposed decomposition strategy can also be plugged into other methods for a large performance boost, reducing MSE error by 11.87% to 48.56%. Code is available at this link: https://github.com/Levi-Ackman/Leddam.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guoqi Yu;Jing Zou;Xiaowei Hu;Angelica I Aviles-Rivero;Jing Qin;Shujun Wang", "authorids": "~Guoqi_Yu1;~Jing_Zou2;~Xiaowei_Hu3;~Angelica_I_Aviles-Rivero1;~Jing_Qin3;~Shujun_Wang1", "gender": "M;F;M;F;M;F", "homepage": "https://github.com/Levi-Ackman;https://github.com/zoujing925;https://xw-hu.github.io/;https://angelicaiaviles.wordpress.com/;https://harry-qinjing.github.io/;https://emma-sjwang.github.io/", "dblp": ";;151/5859-1;138/9507;00/1015-1;", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;;tUb4J0kAAAAJ;https://scholar.google.com/citations?hl=en;X3Wi7wkAAAAJ;https://scholar.google.com.hk/citations?user=WFELH6IAAAAJ", "orcid": ";;0000-0002-5708-7018;;0000-0002-7059-0929;0000-0003-1495-3278", "linkedin": ";;;;;", "or_profile": "~Guoqi_Yu1;~Jing_Zou2;~Xiaowei_Hu3;~Angelica_I_Aviles-Rivero1;~Jing_Qin3;~Shujun_Wang1", "aff": "University of Electronic Science and Technology of China;Hong Kong Polytechnic University;Shanghai Artificial Intelligence Laboratory;University of Cambridge;Hong Kong Polytechnic University;Hong Kong Polytechnic University", "aff_domain": "uestc.edu.cn;polyu.edu.hk;pjlab.org.cn;cam.ac.uk;polyu.edu.hk;polyu.edu.hk", "position": "Undergrad student;Postdoc;Researcher;Senior Research Associate;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nyu2024revitalizing,\ntitle={Revitalizing Multivariate Time Series Forecasting: Learnable Decomposition with Inter-Series Dependencies and Intra-Series Variations Modeling},\nauthor={Guoqi Yu and Jing Zou and Xiaowei Hu and Angelica I Aviles-Rivero and Jing Qin and Shujun Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=87CYNyCGOo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8186812, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2131357536733862726&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "uestc.edu.cn;polyu.edu.hk;pjlab.org.cn;cam.ac.uk;polyu.edu.hk;polyu.edu.hk", "author_num": 6, "aff_unique_index": "0;1;2;3;1;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Hong Kong Polytechnic University;Shanghai Artificial Intelligence Laboratory;University of Cambridge", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.polyu.edu.hk;http://www.shailab.org/;https://www.cam.ac.uk", "aff_unique_abbr": "UESTC;PolyU;Shanghai AI Lab;Cambridge", "aff_campus_unique_index": "1;2;1;1", 
"aff_campus_unique": ";Hong Kong SAR;Cambridge", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Unlocking the Power of Spatial and Temporal Information in Medical Multimodal Pre-training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34857", "id": "87ZrVHDqmR", "proceeding": "https://proceedings.mlr.press/v235/yang24v.html", "pdf": "https://openreview.net/pdf?id=87ZrVHDqmR", "openreview": "https://openreview.net/forum?id=87ZrVHDqmR", "author_site": "Jinxia Yang, Bing Su, Xin Zhao, Ji-Rong Wen", "tldr": "", "abstract": "Medical vision-language pre-training methods mainly leverage the correspondence between paired medical images and radiological reports. Although multi-view spatial images and temporal sequences of image-report pairs are available in off-the-shelf multi-modal medical datasets, most existing methods have not thoroughly tapped into such extensive supervision signals. In this paper, we introduce the Med-ST framework for fine-grained spatial and temporal modeling to exploit information from multiple spatial views of chest radiographs and temporal historical records. For spatial modeling, Med-ST employs the *Mixture of View Expert (MoVE)* architecture to integrate different visual features from both frontal and lateral views. To achieve a more comprehensive alignment, Med-ST not only establishes the global alignment between whole images and texts but also introduces modality-weighted local alignment between text tokens and spatial regions of images. For temporal modeling, we propose a novel cross-modal bidirectional cycle consistency objective by forward mapping classification (FMC) and reverse mapping regression (RMR). By perceiving temporal information from simple to complex, Med-ST can learn temporal semantics. Experimental results across four distinct tasks demonstrate the effectiveness of Med-ST, especially in temporal classification tasks. 
Our code and model are available at https://github.com/SVT-Yang/MedST.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinxia Yang;Bing Su;Xin Zhao;Ji-Rong Wen", "authorids": "~Jinxia_Yang2;~Bing_Su1;~Xin_Zhao10;~Ji-Rong_Wen1", "gender": "M;M;M;", "homepage": "https://gsai.ruc.edu.cn/bingsu;https://gsai.ruc.edu.cn/addons/teacher/index/info.html?user_id=5&ruccode=20140041&ln=cn;https://gsai.ruc.edu.cn/english/jrwen;", "dblp": "41/5270-1;https://dblp.uni-trier.de/pid/52/8700.html;w/JRWen;", "google_scholar": "https://scholar.google.com.sg/citations?user=d3g2VJQAAAAJ;JNhNacoAAAAJ;tbxCHJgAAAAJ;", "orcid": "0000-0001-8560-1910;0000-0002-8333-6196;0000-0002-9777-9676;0000-0002-2915-7103", "linkedin": ";;;", "or_profile": "~Bing_Su1;~Xin_Zhao10;~Ji-Rong_Wen1;~jinxia_yang1", "aff": "Renmin University of China;Renmin University of China;Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn;ruc.edu.cn;ruc.edu.cn", "position": "Associate Professor;Full Professor;Full Professor;MS student", "bibtex": "@inproceedings{\nyang2024unlocking,\ntitle={Unlocking the Power of Spatial and Temporal Information in Medical Multimodal Pre-training},\nauthor={Jinxia Yang and Bing Su and Xin Zhao and Ji-Rong Wen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=87ZrVHDqmR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2005323, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6123805948979252524&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ruc.edu.cn;ruc.edu.cn;ruc.edu.cn;ruc.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Understanding Forgetting in Continual Learning with Linear Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34856", "id": "89kZWloYQx", "proceeding": "https://proceedings.mlr.press/v235/ding24c.html", "pdf": "https://openreview.net/pdf?id=89kZWloYQx", "openreview": "https://openreview.net/forum?id=89kZWloYQx", "author_site": "Meng Ding, Kaiyi Ji, Di Wang, Jinhui Xu", "tldr": "", "abstract": "Continual learning, focused on sequentially learning multiple tasks, has gained significant attention recently. Despite the tremendous progress made in the past, the theoretical understanding, especially factors contributing to $\\textit{catastrophic forgetting}$, remains relatively unexplored. In this paper, we provide a general theoretical analysis of forgetting in the linear regression model via Stochastic Gradient Descent (SGD) applicable to both under-parameterized and overparameterized regimes. Our theoretical framework reveals some interesting insights into the intricate relationship between task sequence and algorithmic parameters, an aspect not fully captured in previous studies due to their restrictive assumptions. Specifically, we demonstrate that, given a sufficiently large data size, the arrangement of tasks in a sequence\u2014where tasks with larger eigenvalues in their population data covariance matrices are trained later\u2014tends to result in increased forgetting. 
Additionally, our findings highlight that an appropriate choice of step size will help mitigate forgetting in both under-parameterized and overparameterized settings. To validate our theoretical analysis, we conducted simulation experiments on both linear regression models and Deep Neural Networks (DNNs). Results from these simulations substantiate our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meng Ding;Kaiyi Ji;Di Wang;Jinhui Xu", "authorids": "~Meng_Ding3;~Kaiyi_Ji1;~Di_Wang1;~Jinhui_Xu1", "gender": "F;M;;M", "homepage": ";https://cse.buffalo.edu/~kaiyiji/;;https://www.cse.buffalo.edu/~jinhui/", "dblp": ";205/3164;;24/6437-1.html", "google_scholar": "Ipwvf8oAAAAJ;E0A3lSIAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Meng_Ding3;~Kaiyi_Ji1;~Di_Wang1;~Jinhui_Xu1", "aff": "State University of New York at Buffalo;State University of New York at Buffalo;;University at Buffalo, State University of New York", "aff_domain": "buffalo.edu;buffalo.edu;;buffalo.edu", "position": "PhD student;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nding2024understanding,\ntitle={Understanding Forgetting in Continual Learning with Linear Regression},\nauthor={Meng Ding and Kaiyi Ji and Di Wang and Jinhui Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=89kZWloYQx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1868403, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=563327614202053268&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "buffalo.edu;buffalo.edu;;buffalo.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "State University of New York at Buffalo;University at Buffalo", "aff_unique_dep": ";", "aff_unique_url": "https://www.buffalo.edu;https://www.buffalo.edu", "aff_unique_abbr": "SUNY Buffalo;UB", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "An Infinite-Width Analysis on the Jacobian-Regularised Training of a Neural Network", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34855", "id": "8AeuhCgRRv", "proceeding": "https://proceedings.mlr.press/v235/kim24ah.html", "pdf": "https://openreview.net/pdf?id=8AeuhCgRRv", "openreview": "https://openreview.net/forum?id=8AeuhCgRRv", "author_site": "Taeyoung Kim, Hongseok Yang", "tldr": "", "abstract": "The recent theoretical analysis of deep neural networks in their infinite-width limits has deepened our understanding of initialisation, feature learning, and training of those networks, and brought new practical techniques for finding appropriate hyperparameters, learning network weights, and performing inference. In this paper, we broaden this line of research by showing that this infinite-width analysis can be extended to the Jacobian of a deep neural network. We show that a multilayer perceptron (MLP) and its Jacobian at initialisation jointly converge to a Gaussian process (GP) as the widths of the MLP's hidden layers go to infinity and characterise this GP. 
We also prove that in the infinite-width limit, the evolution of the MLP under the so-called robust training (i.e., training with a regulariser on the Jacobian) is described by a linear first-order ordinary differential equation that is determined by a variant of the Neural Tangent Kernel. We experimentally show the relevance of our theoretical claims to wide finite networks, and empirically analyse the properties of the kernel regression solution to obtain insight into Jacobian regularisation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taeyoung Kim;Hongseok Yang", "authorids": "~Taeyoung_Kim2;~Hongseok_Yang2", "gender": "M;M", "homepage": "https://github.com/mekty2012;https://sites.google.com/view/hongseokyang/home", "dblp": ";82/5808", "google_scholar": ";cLuwH14AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Taeyoung_Kim2;~Hongseok_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nkim2024an,\ntitle={An Infinite-Width Analysis on the Jacobian-Regularised Training of a Neural Network},\nauthor={Taeyoung Kim and Hongseok Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8AeuhCgRRv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3357120, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LlG-akYWVKMJ:scholar.google.com/&scioq=An+Infinite-Width+Analysis+on+the+Jacobian-Regularised+Training+of+a+Neural+Network&hl=en&as_sdt=0,31", "gs_version_total": 9, "email": "kaist.ac.kr;kaist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Attack-free Evaluating and Enhancing Adversarial Robustness on Categorical Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34854", "id": "8ERo4jph0A", "proceeding": "https://proceedings.mlr.press/v235/zhou24i.html", "pdf": "https://openreview.net/pdf?id=8ERo4jph0A", "openreview": "https://openreview.net/forum?id=8ERo4jph0A", "author_site": "Yujun Zhou, Yufei Han, Haomin Zhuang, Hongyan Bao, Xiangliang Zhang", "tldr": "", "abstract": "Research on adversarial robustness has predominantly focused on continuous inputs, leaving categorical inputs, especially tabular attributes, less examined. To address this challenge, our work aims to evaluate and enhance the robustness of classification over categorical attributes against adversarial perturbations through efficient attack-free approaches. We propose a robustness evaluation metric named Integrated Gradient-Smoothed Gradient (IGSG). It is designed to evaluate the attributional sensitivity of each feature and the decision boundary of the classifier, two aspects that significantly influence adversarial risk, according to our theoretical analysis. Leveraging this metric, we develop an IGSG-based regularization to reduce adversarial risk by suppressing the sensitivity of categorical attributes.
We conduct extensive empirical studies over categorical datasets from various application domains. The results affirm the efficacy of both IGSG and IGSG-based regularization. Notably, IGSG-based regularization surpasses the state-of-the-art robust training methods by a margin of approximately 0.4% to 12.2% on average in terms of adversarial accuracy, especially on high-dimensional datasets. The code is available at https://github.com/YujunZhou/IGSG.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujun Zhou;Yufei Han;Haomin Zhuang;Hongyan Bao;Xiangliang Zhang", "authorids": "~Yujun_Zhou1;~Yufei_Han1;~Haomin_Zhuang1;~Hongyan_Bao1;~Xiangliang_Zhang1", "gender": "M;M;;;F", "homepage": "https://yujunzhou.github.io/;;https://zhmzm.github.io/;https://mine.kaust.edu.sa/Pages/Hongyan.aspx;https://sites.nd.edu/xiangliang-zhang/", "dblp": "162/3265-2;74/2507;344/1798;234/6902;74/1890-1", "google_scholar": "t0c7rQQAAAAJ;xdCvBg0AAAAJ;vXllNroAAAAJ;;BhRJe4wAAAAJ", "orcid": "0000-0003-1376-5187;;;;0000-0002-3574-5665", "linkedin": "yujun-zhou-zyj/;;;;", "or_profile": "~Yujun_Zhou1;~Yufei_Han1;~Haomin_Zhuang1;~Hongyan_Bao1;~Xiangliang_Zhang1", "aff": "University of Notre Dame;INRIA;University of Notre Dame;KAUST;University of Notre Dame", "aff_domain": "nd.edu;inria.fr;nd.edu;kaust.edu.sa;nd.edu", "position": "PhD student;Researcher;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nzhou2024attackfree,\ntitle={Attack-free Evaluating and Enhancing Adversarial Robustness on Categorical Data},\nauthor={Yujun Zhou and Yufei Han and Haomin Zhuang and Hongyan Bao and Xiangliang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8ERo4jph0A}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2861468, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5172045289081328261&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 11, "email": "nd.edu;inria.fr;nd.edu;kaust.edu.sa;nd.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "University of Notre Dame;INRIA;King Abdullah University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nd.edu;https://www.inria.fr;https://www.kaust.edu.sa", "aff_unique_abbr": "Notre Dame;INRIA;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "United States;France;Saudi Arabia" }, { "title": "Bridging discrete and continuous state spaces: Exploring the Ehrenfest process in time-continuous diffusion models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34853", "id": "8GYclcxQXB", "proceeding": "https://proceedings.mlr.press/v235/winkler24a.html", "pdf": "https://openreview.net/pdf?id=8GYclcxQXB", "openreview": "https://openreview.net/forum?id=8GYclcxQXB", "author_site": "Ludwig Winkler, Lorenz Richter, Manfred Opper", "tldr": "", "abstract": "Generative modeling via stochastic processes has led to remarkable empirical results as well as to recent advances in their theoretical understanding. In principle, both space and time of the processes can be discrete or continuous. In this work, we study time-continuous Markov jump processes on discrete state spaces and investigate their correspondence to state-continuous diffusion processes given by SDEs.
In particular, we revisit the $\\textit{Ehrenfest process}$, which converges to an Ornstein-Uhlenbeck process in the infinite state space limit. Likewise, we can show that the time-reversal of the Ehrenfest process converges to the time-reversed Ornstein-Uhlenbeck process. This observation bridges discrete and continuous state spaces and allows methods to be carried over from one setting to the other, such as loss functions that lead to improved convergence. Additionally, we suggest an algorithm for training the time-reversal of Markov jump processes which relies on conditional expectations and can thus be directly related to denoising score matching. We demonstrate our methods in multiple convincing numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ludwig Winkler;Lorenz Richter;Manfred Opper", "authorids": "~Ludwig_Winkler1;~Lorenz_Richter1;~Manfred_Opper1", "gender": "M;;", "homepage": "https://ludwigwinkler.github.io;;", "dblp": "171/8795;;", "google_scholar": "nu_vtVkAAAAJ;https://scholar.google.de/citations?user=uxlQvnUAAAAJ;", "orcid": "0000-0002-1354-4715;;", "linkedin": ";;", "or_profile": "~Ludwig_Winkler1;~Lorenz_Richter1;~Manfred_Opper1", "aff": "Technische Universit\u00e4t Berlin;Zuse Institute Berlin;", "aff_domain": "tu-berlin.de;zib.de;", "position": "PhD student;Postdoc;", "bibtex": "@inproceedings{\nwinkler2024bridging,\ntitle={Bridging discrete and continuous state spaces: Exploring the Ehrenfest process in time-continuous diffusion models},\nauthor={Ludwig Winkler and Lorenz Richter and Manfred Opper},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8GYclcxQXB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3464354, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9446009451119788814&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tu-berlin.de;zib.de;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Technische Universit\u00e4t Berlin;Zuse Institute Berlin", "aff_unique_dep": ";", "aff_unique_url": "https://www.tu-berlin.de;https://www.zib.de", "aff_unique_abbr": "TU Berlin;ZIB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Multi-Sender Persuasion: A Computational Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34852", "id": "8JFIKpzumn", "proceeding": "https://proceedings.mlr.press/v235/hossain24c.html", "pdf": "https://openreview.net/pdf?id=8JFIKpzumn", "openreview": "https://openreview.net/forum?id=8JFIKpzumn", "author_site": "Safwan Hossain, Tonghan Wang, Tao Lin, Yiling Chen, David Parkes, Haifeng Xu", "tldr": "", "abstract": "We consider *multiple senders* with informational advantage signaling to convince a single self-interested actor to take certain actions. Generalizing the seminal *Bayesian Persuasion* framework, such settings are ubiquitous in computational economics, multi-agent learning, and machine learning with multiple objectives. The core solution concept here is the Nash equilibrium of senders' signaling policies. Theoretically, we prove that finding an equilibrium in general is PPAD-Hard; in fact, even computing a sender's best response is NP-Hard. Given these intrinsic difficulties, we turn to finding local Nash equilibria.
We propose a novel differentiable neural network to approximate this game's non-linear and discontinuous utilities. Complementing this with the extra-gradient algorithm, we discover local equilibria that Pareto dominate full-revelation equilibria and those found by existing neural networks. Broadly, our theoretical and empirical contributions are of interest to a large class of economic problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Safwan Hossain;Tonghan Wang;Tao Lin;Yiling Chen;David C. Parkes;Haifeng Xu", "authorids": "~Safwan_Hossain1;~Tonghan_Wang1;~Tao_Lin2;~Yiling_Chen1;~David_C._Parkes1;~Haifeng_Xu1", "gender": "M;M;M;F;M;M", "homepage": "https://safwanhossain.github.io/;https://tonghanwang.github.io/;https://tao-l.github.io/;https://yiling.seas.harvard.edu/;https://parkes.seas.harvard.edu/;http://www.haifeng-xu.com/", "dblp": ";175/6039-1.html;64/4492-13;72/3762-1;p/DavidCParkes.html;04/1895", "google_scholar": "https://scholar.google.ca/citations?user=gyCQnUAAAAAJ;-AR1yc4AAAAJ;https://scholar.google.com/citations?hl=en;x_7xA0UAAAAJ;JUn8PgwAAAAJ;nLgg388AAAAJ", "orcid": ";;;;0000-0002-2701-3464;", "linkedin": ";;;;;", "or_profile": "~Safwan_Hossain1;~Tonghan_Wang1;~Tao_Lin2;~Yiling_Chen1;~David_C._Parkes1;~Haifeng_Xu1", "aff": "Harvard University;Tsinghua University;Harvard University;Harvard University;Harvard University;University of Chicago", "aff_domain": "harvard.edu;tsinghua.edu.cn;g.harvard.edu;fas.harvard.edu;harvard.edu;cs.uchicago.edu", "position": "PhD student;MS student;PhD student;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nhossain2024multisender,\ntitle={Multi-Sender Persuasion: A Computational Perspective},\nauthor={Safwan Hossain and Tonghan Wang and Tao Lin and Yiling Chen and David C.
Parkes and Haifeng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8JFIKpzumn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2043091, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11929189569418036764&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "harvard.edu;tsinghua.edu.cn;g.harvard.edu;fas.harvard.edu;harvard.edu;cs.uchicago.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "Harvard University;Tsinghua University;University of Chicago", "aff_unique_dep": ";;", "aff_unique_url": "https://www.harvard.edu;https://www.tsinghua.edu.cn;https://www.uchicago.edu", "aff_unique_abbr": "Harvard;THU;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Data-Efficient Molecular Generation with Hierarchical Textual Inversion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34851", "id": "8KeD4mEh3j", "proceeding": "https://proceedings.mlr.press/v235/kim24z.html", "pdf": "https://openreview.net/pdf?id=8KeD4mEh3j", "openreview": "https://openreview.net/forum?id=8KeD4mEh3j", "author_site": "Seojin Kim, Jaehyun Nam, Sihyun Yu, Younghoon Shin, Jinwoo Shin", "tldr": "", "abstract": "Developing an effective molecular generation framework even with a limited number of molecules is often important for its practical deployment, e.g., drug discovery, since acquiring task-related molecular data requires expensive and time-consuming experimental costs. To tackle this issue, we introduce Hierarchical Textual Inversion for Molecular Generation (HI-Mol), a novel data-efficient molecular generation method. HI-Mol is inspired by the importance of hierarchical information, e.g., both coarse- and fine-grained features, in understanding the molecule distribution. We propose to use multi-level embeddings to reflect such hierarchical features based on the adoption of the recent textual inversion technique in the visual domain, which achieves data-efficient image generation. Compared to the conventional textual inversion method in the image domain using a single-level token embedding, our multi-level token embeddings allow the model to effectively learn the underlying low-shot molecule distribution. We then generate molecules based on the interpolation of the multi-level token embeddings. Extensive experiments demonstrate the superiority of HI-Mol with notable data-efficiency. For instance, on QM9, HI-Mol outperforms the prior state-of-the-art method with 50x less training data. We also show the effectiveness of molecules generated by HI-Mol in low-shot molecular property prediction. 
Code is available at https://github.com/Seojin-Kim/HI-Mol.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seojin Kim;Jaehyun Nam;Sihyun Yu;Younghoon Shin;Jinwoo Shin", "authorids": "~Seojin_Kim2;~Jaehyun_Nam2;~Sihyun_Yu2;~Younghoon_Shin2;~Jinwoo_Shin1", "gender": "M;;M;M;M", "homepage": "https://alinlab.kaist.ac.kr/members.html;https://jaehyun513.github.io/;https://sihyun-yu.github.io;https://github.com/0HOON;https://sites.google.com/site/mijirim/", "dblp": "95/102;162/5227;287/4627;;31/7062", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Seojin_Kim2;~Jaehyun_Nam2;~Sihyun_Yu2;~Younghoon_Shin2;~Jinwoo_Shin1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Research, Google;Korea University;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;research.google.com;korea.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Intern;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nkim2024dataefficient,\ntitle={Data-Efficient Molecular Generation with Hierarchical Textual Inversion},\nauthor={Seojin Kim and Jaehyun Nam and Sihyun Yu and Younghoon Shin and Jinwoo Shin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8KeD4mEh3j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6047753, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16201387754646065141&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "kaist.ac.kr;kaist.ac.kr;research.google.com;korea.ac.kr;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Google;Korea University", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.kaist.ac.kr;https://research.google;https://www.korea.ac.kr", "aff_unique_abbr": "KAIST;Google;KU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Context-Guided Diffusion for Out-of-Distribution Molecular and Protein Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34850", "id": "8NfHmzo0Op", "proceeding": "https://proceedings.mlr.press/v235/klarner24a.html", "pdf": "https://openreview.net/pdf?id=8NfHmzo0Op", "openreview": "https://openreview.net/forum?id=8NfHmzo0Op", "author_site": "Leo Klarner, Tim G. J. Rudner, Garrett Morris, Charlotte Deane, Yee-Whye Teh", "tldr": "", "abstract": "Generative models have the potential to accelerate key steps in the discovery of novel molecular therapeutics and materials. Diffusion models have recently emerged as a powerful approach, excelling at unconditional sample generation and, with data-driven guidance, conditional generation within their training domain. Reliably sampling from high-value regions beyond the training data, however, remains an open challenge---with current methods predominantly focusing on modifying the diffusion process itself. 
In this paper, we develop context-guided diffusion (CGD), a simple plug-and-play method that leverages unlabeled data and smoothness constraints to improve the out-of-distribution generalization of guided diffusion models. We demonstrate that this approach leads to substantial performance gains across various settings, including continuous, discrete, and graph-structured diffusion processes with applications across drug discovery, materials science, and protein design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Leo Klarner;Tim G. J. Rudner;Garrett M Morris;Charlotte Deane;Yee Whye Teh", "authorids": "~Leo_Klarner2;~Tim_G._J._Rudner2;~Garrett_M_Morris1;~Charlotte_Deane1;~Yee_Whye_Teh2", "gender": ";;M;F;", "homepage": ";;https://www.stats.ox.ac.uk/all-people/garrett-morris/;https://www.stats.ox.ac.uk/~deane/;", "dblp": ";;;;", "google_scholar": ";;https://scholar.google.co.uk/citations?user=yCc_2IwAAAAJ;https://scholar.google.co.uk/citations?user=QAdcBnQAAAAJ;", "orcid": ";;0000-0003-1731-8405;0000-0003-1388-2252;", "linkedin": ";;garrettmorris/;charlotte-deane-27918614/;", "or_profile": "~Leo_Klarner2;~Tim_G._J._Rudner2;~Garrett_M_Morris1;~Charlotte_Deane1;~Yee_Whye_Teh2", "aff": ";;University of Oxford;University of Oxford;", "aff_domain": ";;ox.ac.uk;ox.ac.uk;", "position": ";;Associate Professor;Full Professor;", "bibtex": "@inproceedings{\nklarner2024contextguided,\ntitle={Context-Guided Diffusion for Out-of-Distribution Molecular and Protein Design},\nauthor={Leo Klarner and Tim G. J. Rudner and Garrett M Morris and Charlotte Deane and Yee Whye Teh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8NfHmzo0Op}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5314519, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8254616805448597046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;ox.ac.uk;ox.ac.uk;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Emergent Representations of Program Semantics in Language Models Trained on Programs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34849", "id": "8PTx4CpNoT", "proceeding": "https://proceedings.mlr.press/v235/jin24e.html", "pdf": "https://openreview.net/pdf?id=8PTx4CpNoT", "openreview": "https://openreview.net/forum?id=8PTx4CpNoT", "author_site": "Charles Jin, Martin Rinard", "tldr": "", "abstract": "We present evidence that language models (LMs) of code can learn to represent the formal semantics of programs, despite being trained only to perform next-token prediction. Specifically, we train a Transformer model on a synthetic corpus of programs written in a domain-specific language for navigating 2D grid world environments. Each program in the corpus is preceded by a (partial) specification in the form of several input-output grid world states. 
Despite providing no further inductive biases, we find that a probing classifier is able to extract increasingly accurate representations of the *unobserved, intermediate* grid world states from the LM hidden states over the course of training, suggesting the LM acquires an emergent ability to *interpret* programs in the formal sense. We also develop a novel interventional baseline that enables us to disambiguate what is represented by the LM as opposed to learned by the probe. We anticipate that this technique may be generally applicable to a broad range of *semantic* probing experiments. In summary, this paper does not propose any new techniques for training LMs of code, but develops an experimental framework for and provides insights into the acquisition and representation of formal semantics in statistical models of code.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Charles Jin;Martin Rinard", "authorids": "~Charles_Jin1;~Martin_Rinard1", "gender": ";Not Specified", "homepage": "https://charlesjin.com;http://people.csail.mit.edu/rinard/", "dblp": "245/5611;", "google_scholar": "WC99LxgAAAAJ;https://scholar.google.com.tw/citations?user=hxlxVEUAAAAJ", "orcid": "0000-0001-6871-5764;", "linkedin": ";", "or_profile": "~Charles_Jin1;~Martin_Rinard1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\njin2024emergent,\ntitle={Emergent Representations of Program Semantics in Language Models Trained on Programs},\nauthor={Charles Jin and Martin Rinard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8PTx4CpNoT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8843851, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10350156376281172704&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the Asymptotic Distribution of the Minimum Empirical Risk", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34848", "id": "8RwhTPACAO", "proceeding": "https://proceedings.mlr.press/v235/westerhout24a.html", "pdf": "https://openreview.net/pdf?id=8RwhTPACAO", "openreview": "https://openreview.net/forum?id=8RwhTPACAO", "author_site": "Jacob Westerhout, TrungTin Nguyen, Xin Guo, Hien Nguyen", "tldr": "", "abstract": "Empirical risk minimization (ERM) is a foundational framework for the estimation of solutions to statistical and machine learning problems. Characterizing the distributional properties of the minimum empirical risk (MER) provides valuable tools for conducting inference and assessing the goodness of model fit. We provide a comprehensive account of the asymptotic distribution for the order-$\\sqrt{n}$ blowup of the MER under generic and abstract assumptions, and present practical conditions under which our theorems hold. Our results improve upon and relax the assumptions made in previous works. 
Specifically, we provide asymptotic distributions for MERs for non-independent and identically distributed data, and when the loss functions may be discontinuous or indexed by non-Euclidean spaces. We further present results that enable the application of these asymptotics to statistical inference: specifically, the construction of consistent confidence sets using the bootstrap and of consistent hypothesis tests using penalized model selection. We illustrate the utility of our approach by applying our results to neural network problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jacob Westerhout;TrungTin Nguyen;Xin Guo;Hien Duy Nguyen", "authorids": "~Jacob_Westerhout1;~TrungTin_Nguyen1;~Xin_Guo5;~Hien_Duy_Nguyen1", "gender": "M;M;;M", "homepage": "https://scholar.google.com/citations?view_op=list_works&hl=en&user=uzh65-QAAAAJ;https://trung-tinnguyen.github.io/;https://hiendn.github.io/;https://people.smp.uq.edu.au/XinGuo/", "dblp": ";275/3643;;17/1430-3", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;NhiJDJsAAAAJ;https://scholar.google.com.au/citations?user=CXr0aV8AAAAJ;https://scholar.google.com.hk/citations?user=32pRl-YAAAAJ", "orcid": ";0000-0001-8433-5980;;0000-0002-7465-9356", "linkedin": ";trungtinnguyen0/;;", "or_profile": "~Jacob_Westerhout1;~TrungTin_Nguyen1;~Hien_Duy_Nguyen1;~Xin_GUO1", "aff": "University of Queensland;The University of Queensland;Kyushu University;The University of Queensland", "aff_domain": "uq.edu.au;uq.edu.au;kyushu-u.ac.jp;uq.edu.au", "position": "PhD student;Postdoc;Full Professor;Senior Lecturer in Mathematical Data Science", "bibtex": "@inproceedings{\nwesterhout2024on,\ntitle={On the Asymptotic Distribution of the Minimum Empirical Risk},\nauthor={Jacob Westerhout and TrungTin Nguyen and Xin Guo and Hien Duy Nguyen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8RwhTPACAO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 522349, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5389379421554397528&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uq.edu.au;uq.edu.au;kyushu-u.ac.jp;uq.edu.au", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Queensland;Kyushu University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uq.edu.au;https://www.kyushu-u.ac.jp", "aff_unique_abbr": "UQ;Kyushu U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Australia;Japan" }, { "title": "HyperFields: Towards Zero-Shot Generation of NeRFs from Text", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34847", "id": "8STOjGCkfH", "proceeding": "https://proceedings.mlr.press/v235/babu24a.html", "pdf": "https://openreview.net/pdf?id=8STOjGCkfH", "openreview": "https://openreview.net/forum?id=8STOjGCkfH", "author_site": "Sudarshan Babu, Richard Liu, Zi Yu Zhou, Michael Maire, Greg Shakhnarovich, Rana Hanocka", "tldr": "", "abstract": "We introduce HyperFields, a method for generating text-conditioned Neural Radiance Fields (NeRFs) with a single forward pass and (optionally) some fine-tuning.
Key to our approach are: (i) a dynamic hypernetwork, which learns a smooth mapping from text token embeddings to the space of NeRFs; (ii) NeRF distillation training, which distills scenes encoded in individual NeRFs into one dynamic hypernetwork. These techniques enable a single network to fit over a hundred unique scenes. We further demonstrate that HyperFields learns a more general map between text and NeRFs, and consequently is capable of predicting novel in-distribution and out-of-distribution scenes --- either zero-shot or with a few finetuning steps. Finetuning HyperFields benefits from accelerated convergence thanks to the learned general map, and is capable of synthesizing novel scenes 5 to 10 times faster than existing neural optimization-based methods. Our ablation experiments show that both the dynamic architecture and NeRF distillation are critical to the expressivity of HyperFields.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sudarshan Babu;Richard Liu;Avery Zhou;Michael Maire;Greg Shakhnarovich;Rana Hanocka", "authorids": "~Sudarshan_Babu1;~Richard_Liu1;~Avery_Zhou1;~Michael_Maire1;~Greg_Shakhnarovich1;~Rana_Hanocka1", "gender": "M;M;M;M;;M", "homepage": "https://people.cs.uchicago.edu/~sudarshan/;https://factoryofthesun.github.io/;https://www.linkedin.com/in/azhou/;http://people.cs.uchicago.edu/~mmaire/;https://people.cs.uchicago.edu/~ranahanocka/;http://ttic.edu/gregory/", "dblp": "164/6304;44/5359;;73/1498.html;167/2260;17/1926.html", "google_scholar": ";;;HXowq5YAAAAJ;3Bk5C9EAAAAJ;https://scholar.google.com.tw/citations?user=YLOz1kgAAAAJ", "orcid": ";;;;0000-0003-3214-3703;", "linkedin": ";;;;;", "or_profile": "~Sudarshan_Babu1;~Richard_Liu1;~Avery_Zhou1;~Michael_Maire1;~Rana_Hanocka1;~Gregory_Shakhnarovich2", "aff": "Toyota Technological Institute at Chicago;University of Chicago;;University of Chicago;University of Chicago;University of Chicago", "aff_domain": "ttic.edu;cs.uchicago.edu;;uchicago.edu;uchicago.edu;uchicago.edu", "position": "PhD student;PhD student;;Associate Professor;Assistant Professor;Professor, part time", "bibtex": "@inproceedings{\nbabu2024hyperfields,\ntitle={HyperFields: Towards Zero-Shot Generation of Ne{RF}s from Text},\nauthor={Sudarshan Babu and Richard Liu and Avery Zhou and Michael Maire and Greg Shakhnarovich and Rana Hanocka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8STOjGCkfH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6471812, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11413657046320631001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ttic.edu;cs.uchicago.edu;;uchicago.edu;uchicago.edu;uchicago.edu", "author_num": 6, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Toyota Technological Institute at Chicago;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.tti-chicago.org;https://www.uchicago.edu", "aff_unique_abbr": "TTI Chicago;UChicago", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Towards an Understanding of Stepwise Inference in Transformers: A Synthetic Graph Navigation Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34846", "id": "8VEGkphQaK", "proceeding": 
"https://proceedings.mlr.press/v235/khona24a.html", "pdf": "https://openreview.net/pdf?id=8VEGkphQaK", "openreview": "https://openreview.net/forum?id=8VEGkphQaK", "author_site": "Mikail Khona, Maya Okawa, Jan Hula, Rahul Ramesh, Kento Nishi, Robert Dick, Ekdeep Singh Lubana, Hidenori Tanaka", "tldr": "", "abstract": "Stepwise inference protocols, such as scratchpads and chain-of-thought, help language models solve complex problems by decomposing them into a sequence of simpler subproblems. To unravel the underlying mechanisms of stepwise inference we propose to study autoregressive Transformer models on a synthetic task that embodies the multi-step nature of problems where stepwise inference is generally most useful. Specifically, we define a graph navigation problem wherein a model is tasked with traversing a path from a start to a goal node on the graph. We find we can empirically reproduce and analyze several phenomena observed at scale: (i) the stepwise inference reasoning gap, the cause of which we find in the structure of the training data; (ii) a diversity-accuracy trade-off in model generations as sampling temperature varies; (iii) a simplicity bias in the model\u2019s output; and (iv) compositional generalization and a primacy bias with in-context exemplars. Overall, our work introduces a grounded, synthetic framework for studying stepwise inference and offers mechanistic hypotheses that can lay the foundation for a deeper understanding of this phenomenon.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mikail Khona;Maya Okawa;Jan Hula;Rahul Ramesh;Kento Nishi;Robert P. Dick;Ekdeep Singh Lubana;Hidenori Tanaka", "authorids": "~Mikail_Khona2;~Maya_Okawa1;~Jan_Hula1;~Rahul_Ramesh2;~Kento_Nishi1;~Robert_P._Dick1;~Ekdeep_Singh_Lubana1;~Hidenori_Tanaka1", "gender": ";;M;M;M;M;M;", "homepage": ";;;https://cis.upenn.edu/~rahulram;https://kentonishi.github.io/;http://robertdick.org/;https://ekdeepslubana.github.io/;https://sites.google.com/view/htanaka/home", "dblp": ";;;168/7029;;84/523.html;228/2683;", "google_scholar": ";;pCF6oo8AAAAJ;wCa6nygAAAAJ;iQoZSr4AAAAJ;;https://scholar.google.co.in/citations?user=OP7S3vsAAAAJ;f_pWOGIAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;kento-nishi-5696ab185/;;;", "or_profile": "~Mikail_Khona2;~Maya_Okawa1;~Jan_Hula1;~Rahul_Ramesh2;~Kento_Nishi1;~Robert_P._Dick1;~Ekdeep_Singh_Lubana1;~Hidenori_Tanaka1", "aff": ";;CIIRC, Czech Technical University, Czech Technical University of Prague;University of Pennsylvania;Harvard University;University of Michigan;University of Michigan;Physics & Informatics Lab, NTT Research, Inc.", "aff_domain": ";;ciirc.cvut.cz;upenn.edu;harvard.edu;umich.edu;umich.edu;ntt-research.com", "position": ";;Postdoc;PhD student;Undergrad student;Full Professor;PhD student;Senior Research Scientist", "bibtex": "@inproceedings{\nkhona2024towards,\ntitle={Towards an Understanding of Stepwise Inference in Transformers: A Synthetic Graph Navigation Model},\nauthor={Mikail Khona and Maya Okawa and Jan Hula and Rahul Ramesh and Kento Nishi and Robert P. 
Dick and Ekdeep Singh Lubana and Hidenori Tanaka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8VEGkphQaK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8286381, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15997730875882311200&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";;ciirc.cvut.cz;upenn.edu;harvard.edu;umich.edu;umich.edu;ntt-research.com", "author_num": 8, "aff_unique_index": "0;1;2;3;3;4", "aff_unique_norm": "Czech Technical University;University of Pennsylvania;Harvard University;University of Michigan;NTT Research, Inc.", "aff_unique_dep": "CIIRC;;;;Physics & Informatics Lab", "aff_unique_url": "https://www.cvut.cz;https://www.upenn.edu;https://www.harvard.edu;https://www.umich.edu;https://www.ntt-research.com", "aff_unique_abbr": "CTU;UPenn;Harvard;UM;NTT Research", "aff_campus_unique_index": "0", "aff_campus_unique": "Prague;", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Czech Republic;United States" }, { "title": "Rethinking Specificity in SBDD: Leveraging Delta Score and Energy-Guided Diffusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34845", "id": "8WSNl2XA9r", "proceeding": "https://proceedings.mlr.press/v235/gao24k.html", "pdf": "https://openreview.net/pdf?id=8WSNl2XA9r", "openreview": "https://openreview.net/forum?id=8WSNl2XA9r", "author_site": "Bowen Gao, Minsi Ren, Yuyan Ni, Yanwen Huang, Bo Qiang, Zhiming Ma, Wei-Ying Ma, Yanyan Lan", "tldr": "", "abstract": "In the field of Structure-based Drug Design (SBDD), deep learning-based generative models have achieved outstanding performance in terms of docking score. However, further study shows that existing molecular generative methods and docking scores both lack consideration of specificity, which means that generated molecules bind to almost every protein pocket with high affinity. To address this, we introduce the Delta Score, a new metric for evaluating the specificity of molecular binding. To further incorporate this insight for generation, we develop an innovative energy-guided approach using contrastive learning, with active compounds as decoys, to direct generative models toward creating molecules with high specificity.
Our empirical results show that this method not only enhances the delta score but also maintains or improves traditional docking scores, successfully bridging the gap between SBDD and real-world needs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Gao;Minsi Ren;Yuyan Ni;Yanwen Huang;Bo Qiang;Zhi-Ming Ma;Wei-Ying Ma;Yanyan Lan", "authorids": "~Bowen_Gao1;~Minsi_Ren1;~Yuyan_Ni1;~Yanwen_Huang2;~Bo_Qiang1;~Zhi-Ming_Ma1;~Wei-Ying_Ma2;~Yanyan_Lan2", "gender": "M;M;;;M;;M;", "homepage": "https://www.linkedin.com/in/bgao/;;https://nyyxxx.github.io/;https://github.com/AnnaKhuan;;http://homepage.amss.ac.cn/research/homePage/8eb59241e2e74d828fb84eec0efadba5/myHomePage.html;https://air.tsinghua.edu.cn/en/info/1046/1189.htm;", "dblp": ";313/3162;117/6286;53/6836;;;m/WYMa.html;00/6040.html", "google_scholar": "cTGzVe8AAAAJ;SodlECMAAAAJ;https://scholar.google.com/citations?hl=zh-CN;kKz2vv9_pEoC;7FQInvgAAAAJ;;SToCbu8AAAAJ;", "orcid": ";;;0009-0008-2338-4357;0000-0001-7428-4104;;;", "linkedin": ";https://www.linkedin.cn/incareer/in/ACoAADIyJsgBrvXiwCdovg-un2CwBcF8p4v5aas;;;;;wei-ying-ma-16a0171/;", "or_profile": "~Bowen_Gao1;~Minsi_Ren1;~Yuyan_Ni1;~Yanwen_Huang2;~Bo_Qiang1;~Zhi-Ming_Ma1;~Wei-Ying_Ma2;~Yanyan_Lan2", "aff": "Tsinghua University;Institute of Automation, Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peking University;Peking University;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Tsinghua University;Tsinghua University", "aff_domain": "mail.tsinghua.edu.cn;ia.ac.cn;ucas.ac.cn;pku.edu.cn;pku.edu.cn;amss.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Researcher;MS student;PhD student;PhD student;MS student;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\ngao2024rethinking,\ntitle={Rethinking Specificity in {SBDD}: Leveraging Delta Score and Energy-Guided Diffusion},\nauthor={Bowen Gao and Minsi Ren and Yuyan Ni and Yanwen Huang and Bo Qiang and Zhi-Ming Ma and Wei-Ying Ma and Yanyan Lan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8WSNl2XA9r}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2618361, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8270081034310883621&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mail.tsinghua.edu.cn;ia.ac.cn;ucas.ac.cn;pku.edu.cn;pku.edu.cn;amss.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 8, "aff_unique_index": "0;1;2;3;3;1;0;0", "aff_unique_norm": "Tsinghua University;Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peking University", "aff_unique_dep": ";Institute of Automation;;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ia.cas.cn;http://www.ucas.ac.cn;http://www.pku.edu.cn", "aff_unique_abbr": "THU;CAS;UCAS;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On Stronger Computational Separations Between Multimodal and Unimodal Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34844", "id": "8Z2xWhuT6R", "proceeding": "https://proceedings.mlr.press/v235/karchmer24a.html", "pdf": "https://openreview.net/pdf?id=8Z2xWhuT6R", "openreview": "https://openreview.net/forum?id=8Z2xWhuT6R", "tldr": "", 
"abstract": "Recently, multimodal machine learning has enjoyed huge empirical success (e.g. GPT-4). Motivated to develop theoretical justification for this empirical success, Lu (NeurIPS '23, ALT '24) introduces a theory of multimodal learning, and considers possible *separations* between theoretical models of multimodal and unimodal learning. In particular, Lu (ALT '24) shows a computational separation, which is relevant to *worst-case* instances of the learning task. In this paper, we give a stronger *average-case* computational separation, where for \"typical\" instances of the learning task, unimodal learning is computationally hard, but multimodal learning is easy. We then question how \"natural\" the average-case separation is. Would it be encountered in practice? To this end, we prove that under basic conditions, any given computational separation between average-case unimodal and multimodal learning tasks implies a corresponding cryptographic key agreement protocol. We suggest to interpret this as evidence that very strong *computational* advantages of multimodal learning may arise *infrequently* in practice, since they exist only for the \"pathological\" case of inherently cryptographic distributions. However, this does not apply to possible (super-polynomial) *statistical* advantages.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ari Karchmer", "authorids": "~Ari_Karchmer1", "gender": "M", "homepage": "https://arikarchmer.com", "dblp": "296/2173", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Ari_Karchmer1", "aff": "Boston University", "aff_domain": "bu.edu", "position": "PhD student", "bibtex": "@inproceedings{\nkarchmer2024on,\ntitle={On Stronger Computational Separations Between Multimodal and Unimodal Machine Learning},\nauthor={Ari Karchmer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8Z2xWhuT6R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 353461, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4547948388298115341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "bu.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "ALERT-Transformer: Bridging Asynchronous and Synchronous Machine Learning for Real-Time Event-based Spatio-Temporal Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34843", "id": "8ZDFn7BDaH", "proceeding": "https://proceedings.mlr.press/v235/turrero24a.html", "pdf": "https://openreview.net/pdf?id=8ZDFn7BDaH", "openreview": "https://openreview.net/forum?id=8ZDFn7BDaH", "author_site": "Carmen Martin-Turrero, Maxence Bouvier, Manuel Breitenstein, Pietro Zanuttigh, Vincent Parret", "tldr": "", "abstract": "We seek to enable classic processing of continuous ultra-sparse spatiotemporal data generated by event-based sensors with dense machine learning models. 
We propose a novel hybrid pipeline composed of asynchronous sensing and synchronous processing that combines several ideas: (1) an embedding based on PointNet models -- the ALERT module -- that can continuously integrate new and dismiss old events thanks to a leakage mechanism, (2) a flexible readout of the embedded data that allows any downstream model to be fed with always up-to-date features at any sampling rate, (3) exploiting the input sparsity in a patch-based approach inspired by Vision Transformer to optimize the efficiency of the method. These embeddings are then processed by a transformer model trained for object and gesture recognition. Using this approach, we achieve state-of-the-art performance with lower latency than competitors. We also demonstrate that our asynchronous model can operate at any desired sampling rate.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Carmen Martin Turrero;Maxence Bouvier;Manuel Breitenstein;Pietro Zanuttigh;Vincent Parret", "authorids": "cmartur@gmail.com;~Maxence_Bouvier1;manuel.breitenstein@gmail.com;~Pietro_Zanuttigh1;~Vincent_Parret1", "gender": ";M;;M;M", "homepage": ";https://maxencebouvier.github.io;;https://medialab.dei.unipd.it/members/pietro-zanuttigh/;", "dblp": ";;;18/797;", "google_scholar": ";AeceaFAAAAAJ;;https://scholar.google.it/citations?user=xk2N2wkAAAAJ;", "orcid": ";;;0000-0002-9502-2389;", "linkedin": ";maxence-bouvier/;;;vincent-parret-bb04ab153", "or_profile": "cmartur@gmail.com;~Maxence_Bouvier1;manuel.breitenstein@gmail.com;~Pietro_Zanuttigh1;~Vincent_Parret1", "aff": ";Sony Europe Ltd.;;Universita' degli studi di Padova;Sony Europe B.V.", "aff_domain": ";sony.com;;unipd.it;sony.com", "position": ";Researcher;;Associate Professor;Engineer", "bibtex": "@inproceedings{\nturrero2024alerttransformer,\ntitle={{ALERT}-Transformer: Bridging Asynchronous and Synchronous Machine Learning for Real-Time Event-based Spatio-Temporal Data},\nauthor={Carmen Martin Turrero and Maxence Bouvier and Manuel Breitenstein and Pietro Zanuttigh and Vincent Parret},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8ZDFn7BDaH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4251339, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15312782891620672842&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";sony.com;;unipd.it;sony.com", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Sony Europe;University of Padova", "aff_unique_dep": ";", "aff_unique_url": "https://www.sony.eu;https://www.unipd.it", "aff_unique_abbr": "Sony Europe;Unipd", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United Kingdom;Italy;Unknown" }, { "title": "On the Error-Propagation of Inexact Hotelling's Deflation for Principal Component Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34842", "id": "8dX4YnosqG", "proceeding": "https://proceedings.mlr.press/v235/liao24a.html", "pdf": "https://openreview.net/pdf?id=8dX4YnosqG", "openreview": "https://openreview.net/forum?id=8dX4YnosqG", "author_site": "Fangshuo Liao, J.
Lyle Kim, Cruz Barnum, Anastasios Kyrillidis", "tldr": "", "abstract": "Principal Component Analysis (PCA) aims to find subspaces spanned by the so-called *principal components* that best represent the variance in the dataset. The deflation method is a popular meta-algorithm that sequentially finds individual principal components, starting from the most important ones and working towards the less important ones. However, as deflation proceeds, numerical errors from the imprecise estimation of principal components propagate due to its sequential nature. This paper mathematically characterizes the error propagation of the inexact Hotelling's deflation method. We consider two scenarios: $i)$ when the sub-routine for finding the leading eigenvector is abstract and can represent various algorithms; and $ii)$ when power iteration is used as the sub-routine. In the latter case, the additional directional information from power iteration allows us to obtain a tighter error bound than the sub-routine agnostic case. For both scenarios, we explicitly characterize how the errors progress and affect subsequent principal component estimations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fangshuo Liao;Junhyung Lyle Kim;Cruz Barnum;Anastasios Kyrillidis", "authorids": "~Fangshuo_Liao1;~Junhyung_Lyle_Kim1;cruzjb2@illinois.edu;~Anastasios_Kyrillidis2", "gender": "M;M;;M", "homepage": "https://jasperliao.github.io/;http://jlylekim.github.io;;http://akyrillidis.github.io", "dblp": "308/2837;290/2228;;53/9879", "google_scholar": "WIwcFN8AAAAJ;Ku197mP8hmUC;;TEGzkZMAAAAJ", "orcid": ";;;", "linkedin": "fangshuo-liao-698043141/;;;", "or_profile": "~Fangshuo_Liao1;~Junhyung_Lyle_Kim1;cruzjb2@illinois.edu;~Anastasios_Kyrillidis2", "aff": "Rice University;Rice University;;Rice University", "aff_domain": "rice.edu;rice.edu;;rice.edu", "position": "PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nliao2024on,\ntitle={On the Error-Propagation of Inexact Hotelling's Deflation for Principal Component Analysis},\nauthor={Fangshuo Liao and Junhyung Lyle Kim and Cruz Barnum and Anastasios Kyrillidis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8dX4YnosqG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 807168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8261935891556949728&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "rice.edu;rice.edu;;rice.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Rice University", "aff_unique_dep": "", "aff_unique_url": "https://www.rice.edu", "aff_unique_abbr": "Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Individual Fairness in Graph Decomposition", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34841", "id": "8f8SI9X9ox", "proceeding": "https://proceedings.mlr.press/v235/munagala24a.html", "pdf": "https://openreview.net/pdf?id=8f8SI9X9ox", "openreview": "https://openreview.net/forum?id=8f8SI9X9ox", "author_site": "Kamesh Munagala, Govind S. 
Sankar", "tldr": "", "abstract": "In this paper, we consider classic randomized low diameter decomposition procedures for planar graphs that obtain connected clusters that are cohesive in that close by pairs of nodes are assigned to the same cluster with high probability. We consider the additional aspect of *individual fairness* -- pairs of nodes at comparable distances should be separated with comparable probability. We show that classic decomposition procedures do not satisfy this property. We present novel algorithms that achieve various trade-offs between this property and additional desiderata of connectivity of the clusters and optimality in number of clusters. We show that our individual fairness bounds may be difficult to improve by tying the improvement to resolving a major open question in metric embeddings. We finally show the efficacy of our algorithms on real planar networks modeling Congressional redistricting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kamesh Munagala;Govind S. Sankar", "authorids": "~Kamesh_Munagala2;~Govind_S._Sankar1", "gender": "M;M", "homepage": "https://users.cs.duke.edu/~gs259/;https://www.cs.duke.edu/~kamesh", "dblp": "293/6611;m/KameshMunagala.html", "google_scholar": ";PJQPzgcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Govind_S._Sankar1;~Kameshwar_Munagala1", "aff": "Department of Computer Science, Duke University;Duke University", "aff_domain": "cs.duke.edu;duke.edu", "position": "PhD student;Professor of Computer Science", "bibtex": "@inproceedings{\nmunagala2024individual,\ntitle={Individual Fairness in Graph Decomposition},\nauthor={Kamesh Munagala and Govind S. Sankar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8f8SI9X9ox}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 759428, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13516145295905513456&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "cs.duke.edu;duke.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Generalization Error of Graph Neural Networks in the Mean-field Regime", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34840", "id": "8h0x12p3zq", "proceeding": "https://proceedings.mlr.press/v235/aminian24a.html", "pdf": "https://openreview.net/pdf?id=8h0x12p3zq", "openreview": "https://openreview.net/forum?id=8h0x12p3zq", "author_site": "Gholamali Aminian, Yixuan He, Gesine Reinert, Lukasz Szpruch, Samuel Cohen", "tldr": "", "abstract": "This work provides a theoretical framework for assessing the generalization error of graph neural networks in the over-parameterized regime, where the number of parameters surpasses the quantity of data points. We explore two widely utilized types of graph neural networks: graph convolutional neural networks and message passing graph neural networks. Prior to this study, existing bounds on the generalization error in the over-parametrized regime were uninformative, limiting our understanding of over-parameterized network performance. 
Our novel approach involves deriving upper bounds within the mean-field regime for evaluating the generalization error of these graph neural networks. We establish upper bounds with a convergence rate of $O(1/n)$, where $n$ is the number of graph samples. These upper bounds offer a theoretical assurance of the networks' performance on unseen data in the challenging over-parameterized regime and overall contribute to our understanding of their performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gholamali Aminian;Yixuan He;Gesine Reinert;Lukasz Szpruch;Samuel N. Cohen", "authorids": "~Gholamali_Aminian1;~Yixuan_He2;~Gesine_Reinert1;~Lukasz_Szpruch1;~Samuel_N._Cohen1", "gender": "M;F;F;M;M", "homepage": ";https://sherylhyx.github.io/;http://www.stats.ox.ac.uk/~reinert/;https://www.maths.ed.ac.uk/~lszpruch/;https://people.maths.ox.ac.uk/cohens/", "dblp": "153/1970;226/6494;86/1736;57/9263;64/9100.html", "google_scholar": "UNNnTjsAAAAJ;SWme_nYAAAAJ;2gvyN5oAAAAJ;ljeA6CMAAAAJ;", "orcid": ";0000-0002-5990-0658;;;0000-0003-0539-6414", "linkedin": ";yixuan-he-sheryl/;gesine-reinert-77b64913/?originalSubdomain=uk;;", "or_profile": "~Gholamali_Aminian1;~Yixuan_He2;~Gesine_Reinert1;~Lukasz_Szpruch1;~Samuel_N_Cohen1", "aff": "Alan Turing Institute;University of Oxford;University of Oxford;University of Edinburgh, University of Edinburgh;University of Oxford", "aff_domain": "turing.ac.uk;ox.ac.uk;ox.ac.uk;ed.ac.uk;ox.ac.uk", "position": "Researcher;PhD student;Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\naminian2024generalization,\ntitle={Generalization Error of Graph Neural Networks in the Mean-field Regime},\nauthor={Gholamali Aminian and Yixuan He and Gesine Reinert and Lukasz Szpruch and Samuel N. Cohen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8h0x12p3zq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 577397, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3225152465427750123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "turing.ac.uk;ox.ac.uk;ox.ac.uk;ed.ac.uk;ox.ac.uk", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Alan Turing Institute;University of Oxford;University of Edinburgh", "aff_unique_dep": ";;", "aff_unique_url": "https://www.turing.ac.uk;https://www.ox.ac.uk;https://www.ed.ac.uk", "aff_unique_abbr": "ATI;Oxford;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Image Hijacks: Adversarial Images can Control Generative Models at Runtime", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34839", "id": "8ho1l6RZNB", "proceeding": "https://proceedings.mlr.press/v235/bailey24a.html", "pdf": "https://openreview.net/pdf?id=8ho1l6RZNB", "openreview": "https://openreview.net/forum?id=8ho1l6RZNB", "author_site": "Luke Bailey, Euan Ong, Stuart Russell, Scott Emmons", "tldr": "", "abstract": "Are foundation models secure against malicious actors? In this work, we focus on the image input to a vision-language model (VLM). We discover image hijacks, adversarial images that control the behaviour of VLMs at inference time, and introduce the general Behaviour Matching algorithm for training image hijacks. 
From this, we derive the Prompt Matching method, allowing us to train hijacks matching the behaviour of an arbitrary user-defined text prompt (e.g. 'the Eiffel Tower is now located in Rome') using a generic, off-the-shelf dataset unrelated to our choice of prompt. We use Behaviour Matching to craft hijacks for four types of attack: forcing VLMs to generate outputs of the adversary\u2019s choice, leak information from their context window, override their safety training, and believe false statements. We study these attacks against LLaVA, a state-of-the-art VLM based on CLIP and LLaMA-2, and find that all attack types achieve a success rate of over 80%. Moreover, our attacks are automated and require only small image perturbations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luke Bailey;Euan Ong;Stuart Russell;Scott Emmons", "authorids": "~Luke_Bailey1;~Euan_Ong1;~Stuart_Russell1;~Scott_Emmons1", "gender": "M;M;M;", "homepage": ";https://people.eecs.berkeley.edu/~russell/;http://scottemmons.com/;", "dblp": ";;180/5699;171/4369", "google_scholar": ";https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;LoT0z6oAAAAJ;rUr9LjMAAAAJ", "orcid": ";;0000-0002-7946-7046;", "linkedin": "euanong/;;scott-emmons-5258005b/;", "or_profile": "~Euan_Ong1;~Stuart_Russell1;~Scott_Emmons1;~Luke_James_Bailey1", "aff": ";University of California, Berkeley;University of California, Berkeley;Harvard University", "aff_domain": ";berkeley.edu;berkeley.edu;harvard.edu", "position": ";Full Professor;PhD student;Undergrad student", "bibtex": "@inproceedings{\nbailey2024image,\ntitle={Image Hijacks: Adversarial Images can Control Generative Models at Runtime},\nauthor={Luke Bailey and Euan Ong and Stuart Russell and Scott Emmons},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8ho1l6RZNB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1154121, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15153259270073141351&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": ";berkeley.edu;berkeley.edu;harvard.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Berkeley;Harvard University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.harvard.edu", "aff_unique_abbr": "UC Berkeley;Harvard", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Wukong: Towards a Scaling Law for Large-Scale Recommendation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34838", "id": "8iUgr2nuwo", "proceeding": "https://proceedings.mlr.press/v235/zhang24ao.html", "pdf": "https://openreview.net/pdf?id=8iUgr2nuwo", "openreview": "https://openreview.net/forum?id=8iUgr2nuwo", "author_site": "Buyun Zhang, Liang Luo, Yuxin Chen, Jade Nie, Xi Liu, Shen Li, Yanli Zhao, Yuchen Hao, Yantao Yao, Ellie Wen, Jongsoo Park, Maxim Naumov, Wenlin Chen", "tldr": "", "abstract": "Scaling laws play an instrumental role in the sustainable improvement in model quality. Unfortunately, recommendation models to date do not exhibit laws similar to those observed in the domain of large language models, due to the inefficiencies of their upscaling mechanisms.
This limitation poses significant challenges in adapting these models to increasingly complex real-world datasets. In this paper, we propose an effective network architecture based purely on stacked factorization machines, and a synergistic upscaling strategy, collectively dubbed Wukong, to establish a scaling law in the domain of recommendation. Wukong\u2019s unique design makes it possible to capture diverse interactions of any order simply through taller and wider layers. We conducted extensive evaluations on six public datasets, and our results demonstrate that Wukong consistently outperforms state-of-the-art models in quality. Further, we assessed Wukong\u2019s scalability on an internal, large-scale dataset. The results show that Wukong retains its superiority in quality over state-of-the-art models, while holding the scaling law across two orders of magnitude in model complexity, extending beyond 100 GFLOP/example, where prior art falls short.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Buyun Zhang;Liang Luo;Yuxin Chen;Jade Nie;Xi Liu;Shen Li;Yanli Zhao;Yuchen Hao;Yantao Yao;Ellie Dingqiao Wen;Jongsoo Park;Maxim Naumov;Wenlin Chen", "authorids": "~Buyun_Zhang1;~Liang_Luo2;~Yuxin_Chen10;~Jade_Nie1;~Xi_Liu1;~Shen_Li4;~Yanli_Zhao1;~Yuchen_Hao1;~Yantao_Yao1;~Ellie_Dingqiao_Wen1;~Jongsoo_Park1;~Maxim_Naumov2;~Wenlin_Chen1", "gender": "M;M;;F;M;M;F;M;M;F;M;M;M", "homepage": ";https://homes.cs.washington.edu/~liangluo/;https://yxchen.me;;;https://mrshenli.github.io/;;;;;https://sites.google.com/site/jongsoopark/;https://research.fb.com/people/naumov-maxim/;http://www.cse.wustl.edu/~wenlinchen/", "dblp": ";;;278/2712;https://dblp.uni-trier.de/pers/hd/l/Liu_0011:Xi;;;;;;;https://dblp.uni-trier.de/pid/79/2042.html?q=Maxim%20Naumov;117/4255", "google_scholar": "0zDyYGcAAAAJ;RbhS2h4AAAAJ;;;https://scholar.google.com/citations?hl=en;5qD1sLEAAAAJ;;ITTkydQAAAAJ;;Wa04AJoAAAAJ;dlX-GboAAAAJ;p5h2zh8AAAAJ;", "orcid": ";;;;;;;;;0000-0001-8229-2294;;0000-0002-6102-2903;", "linkedin": ";;;jadeqinie;xi-liu-2b0285173/;;yanli-zhao-27177453/;yuchen-hao-0bba0a39/;yaoyantao/;;;maxim-naumov;wenlinchen/", "or_profile": "~Buyun_Zhang1;~Liang_Luo2;~Yuxin_Chen10;~Jade_Nie1;~Xi_Liu1;~Shen_Li4;~Yanli_Zhao1;~Yuchen_Hao1;~Yantao_Yao1;~Ellie_Dingqiao_Wen1;~Jongsoo_Park1;~Maxim_Naumov2;~Wenlin_Chen1", "aff": "Meta;Meta;Meta;Meta;Meta AI;Facebook AI;Meta Platform Inc.;Meta;;;Meta Facebook;;Meta Facebook", "aff_domain": "meta.com;meta.com;meta.com;meta.com;fb.com;fb.com;meta.com;meta.com;;;meta.com;;fb.com", "position": "Researcher;Researcher;Researcher;Researcher;Research Scientist;Research Scientist;Software Engineer;Research Scientist;;;Researcher;;Research Scientist", "bibtex": "@inproceedings{\nzhang2024wukong,\ntitle={Wukong: Towards a Scaling Law for Large-Scale Recommendation},\nauthor={Buyun Zhang and Liang Luo and Yuxin Chen and Jade Nie and Xi Liu and Shen Li and Yanli Zhao and Yuchen Hao and Yantao Yao and Ellie Dingqiao Wen and Jongsoo Park and Maxim Naumov and Wenlin Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8iUgr2nuwo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 895507, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7839017503064827799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email":
"meta.com;meta.com;meta.com;meta.com;fb.com;fb.com;meta.com;meta.com;;;meta.com;;fb.com", "author_num": 13, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Optimal Coresets for Low-Dimensional Geometric Median", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34837", "id": "8iWDWQKxJ1", "proceeding": "https://proceedings.mlr.press/v235/afshani24a.html", "pdf": "https://openreview.net/pdf?id=8iWDWQKxJ1", "openreview": "https://openreview.net/forum?id=8iWDWQKxJ1", "author_site": "Peyman Afshani, Chris Schwiegelshohn", "tldr": "", "abstract": "We investigate coresets for approximating the cost with respect to median queries. In this problem, we are given a set of points $P\\subset \\mathbb{R}^d$ and median queries are $\\sum_{p\\in P} ||p-c||$ for any point $c\\in \\mathbb{R}^d$. Our goal is to compute a small weighted summary $S\\subset P$ such that the cost of any median query is approximated within a multiplicative $(1\\pm\\varepsilon)$ factor. We provide matching upper and lower bounds on the number of points contained in $S$ of the order $\\tilde{\\Theta}\\left(\\varepsilon^{-d/(d+1)}\\right)$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peyman Afshani;Chris Schwiegelshohn", "authorids": "~Peyman_Afshani1;~Chris_Schwiegelshohn1", "gender": "M;", "homepage": "http://www.cs.au.dk/~peyman/;https://cs.au.dk/~schwiegelshohn/", "dblp": ";https://dblp.uni-trier.de/pers/hd/s/Schwiegelshohn:Chris", "google_scholar": "https://scholar.google.com.tw/citations?user=DNIqUZEAAAAJ;X9Hl0LcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Peyman_Afshani1;~Chris_Schwiegelshohn1", "aff": "Aarhus University;Aarhus University", "aff_domain": "au.dk;cs.au.dk", "position": "Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nafshani2024optimal,\ntitle={Optimal Coresets for Low-Dimensional Geometric Median},\nauthor={Peyman Afshani and Chris Schwiegelshohn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8iWDWQKxJ1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 324082, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13026720871512163564&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "au.dk;cs.au.dk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Aarhus University", "aff_unique_dep": "", "aff_unique_url": "https://au.dk", "aff_unique_abbr": "AU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark" }, { "title": "SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34836", "id": "8kLzL5QBh2", "proceeding": "https://proceedings.mlr.press/v235/ilbert24a.html", "pdf": "https://openreview.net/pdf?id=8kLzL5QBh2", "openreview": "https://openreview.net/forum?id=8kLzL5QBh2", "author_site": "Romain Ilbert, Ambroise Odonnat, Vasilii Feofanov, Aladin Virmaux, Giuseppe Paolo, Themis Palpanas, 
Ievgen Redko", "tldr": "", "abstract": "Transformer-based architectures achieved breakthrough performance in natural language processing and computer vision, yet they remain inferior to simpler linear baselines in multivariate long-term forecasting. To better understand this phenomenon, we start by studying a toy linear forecasting problem for which we show that transformers are incapable of converging to their true solution despite their high expressive power. We further identify the attention of transformers as being responsible for this low generalization capacity. Building upon this insight, we propose a shallow lightweight transformer model that successfully escapes bad local minima when optimized with sharpness-aware optimization. We empirically demonstrate that this result extends to all commonly used real-world multivariate time series datasets. In particular, SAMformer surpasses current state-of-the-art methods and is on par with the biggest foundation model MOIRAI while having significantly fewer parameters. The code is available at https://github.com/romilbert/samformer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Romain Ilbert;Ambroise Odonnat;Vasilii Feofanov;Aladin Virmaux;Giuseppe Paolo;Themis Palpanas;Ievgen Redko", "authorids": "~Romain_Ilbert1;~Ambroise_Odonnat1;~Vasilii_Feofanov1;~Aladin_Virmaux1;~Giuseppe_Paolo1;~Themis_Palpanas1;~Ievgen_Redko2", "gender": ";M;M;;;Not Specified;", "homepage": "https://romilbert.github.io;https://ambroiseodt.github.io/;;https://avirmaux.github.io;https://www.giupaolo.com/;https://helios2.mi.parisdescartes.fr/~themisp/;", "dblp": ";359/3799;245/3361;192/8303;198/1004;p/ThemisPalpanas;150/3980", "google_scholar": "65uE37cAAAAJ;M_OS-3kAAAAJ;https://scholar.google.ru/citations?user=UIteS6oAAAAJ;5FxvLvwAAAAJ;https://scholar.google.fr/citations?user=khT6tDsAAAAJ;qUBdmWgAAAAJ;https://scholar.google.fr/citations?user=qJ1-XewAAAAJ", "orcid": "0000-0001-8572-6510;;0000-0002-5777-4205;;0000-0003-4201-5967;0000-0002-8031-0265;", "linkedin": "romain-ilbert/;ambroise-odonnat/;;;gpaolo93/;;", "or_profile": "~Romain_Ilbert1;~Ambroise_Odonnat1;~Vasilii_Feofanov1;~Aladin_Virmaux1;~Giuseppe_Paolo1;~Themis_Palpanas1;~Ievgen_Redko2", "aff": "University Paris Descartes;Huawei Technologies Ltd.;Huawei Noah's Ark Lab;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Universite Paris Cite;Huawei Technologies Ltd.", "aff_domain": "parisdescartes.fr;huawei.com;huawei.com;huawei.com;huawei.com;u-paris.fr;huawei.com", "position": "PhD student;Intern;Researcher;Researcher;Researcher;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nilbert2024samformer,\ntitle={{SAM}former: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention},\nauthor={Romain Ilbert and Ambroise Odonnat and Vasilii Feofanov and Aladin Virmaux and Giuseppe Paolo and Themis Palpanas and Ievgen Redko},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8kLzL5QBh2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1987146, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6444604334458762769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "parisdescartes.fr;huawei.com;huawei.com;huawei.com;huawei.com;u-paris.fr;huawei.com", "author_num": 7, "aff_unique_index": "0;1;1;1;1;2;1", 
"aff_unique_norm": "University Paris Descartes;Huawei;Universit\u00e9 Paris Cit\u00e9", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.univ-paris5.fr;https://www.huawei.com;https://www.universite-paris.fr", "aff_unique_abbr": "UPD;Huawei;UPC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;1", "aff_country_unique": "France;China" }, { "title": "Make-A-Shape: a Ten-Million-scale 3D Shape Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34835", "id": "8l1KYguM4w", "proceeding": "https://proceedings.mlr.press/v235/hui24a.html", "pdf": "https://openreview.net/pdf?id=8l1KYguM4w", "openreview": "https://openreview.net/forum?id=8l1KYguM4w", "author_site": "Ka-Hei Hui, Aditya Sanghi, Arianna Rampini, Kamal Rahimi Malekshan, Zhengzhe Liu, Hooman Shayani, Chi-Wing Fu", "tldr": "", "abstract": "The progression in large-scale 3D generative models has been impeded by significant resource requirements for training and challenges like inefficient representations. This paper introduces Make-A-Shape, a novel 3D generative model trained on a vast scale, using 10 million publicly-available shapes. We first innovate the wavelet-tree representation to encode high-resolution SDF shapes with minimal loss, leveraging our newly-proposed subband coefficient filtering scheme. We then design a subband coefficient packing scheme to facilitate diffusion-based generation and a subband adaptive training strategy for effective training on the large-scale dataset. Our generative framework is versatile, capable of conditioning on various input modalities such as images, point clouds, and voxels, enabling a variety of downstream applications, e.g., unconditional generation, completion, and conditional generation. 
Our approach clearly surpasses the existing baselines in delivering high-quality results and can efficiently generate shapes within two seconds for most conditions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ka-Hei Hui;Aditya Sanghi;Arianna Rampini;Kamal Rahimi Malekshan;Zhengzhe Liu;Hooman Shayani;Chi-Wing Fu", "authorids": "~Ka-Hei_Hui1;~Aditya_Sanghi1;~Arianna_Rampini1;~Kamal_Rahimi_Malekshan1;~Zhengzhe_Liu1;~Hooman_Shayani1;~Chi-Wing_Fu2", "gender": "M;M;F;Not Specified;M;M;", "homepage": "https://appsrv.cse.cuhk.edu.hk/~khhui/index.html;https://github.com/sanghiad;;;https://liuzhengzhe.github.io/;;", "dblp": "253/0180;;230/7989;;160/0247;62/5219;", "google_scholar": "jYFUixwAAAAJ;q0-11e25FxIC;https://scholar.google.it/citations?user=xI1O33gAAAAJ;https://scholar.google.ca/citations?user=AT71bHEAAAAJ;HBpZeWsAAAAJ;https://scholar.google.co.uk/citations?hl=en;", "orcid": ";;;;;;", "linkedin": ";;;kamalrahimi/;zhengzhe-liu-767493b3/?originalSubdomain=hk;;", "or_profile": "~Ka-Hei_Hui1;~Aditya_Sanghi1;~Arianna_Rampini1;~Kamal_Rahimi_Malekshan1;~Zhengzhe_Liu1;~Hooman_Shayani1;~Chi-Wing_Fu2", "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong;Autodesk;Autodesk;;The Chinese University of Hong Kong;Autodesk AI Lab;", "aff_domain": "cse.cuhk.edu.hk;autodesk.com;autodesk.com;;cuhk.edu.hk;autodesk.com;", "position": "PhD student;Researcher;Researcher;;PhD student;Principal Researcher;", "bibtex": "@inproceedings{\nhui2024makeashape,\ntitle={Make-A-Shape: a Ten-Million-scale 3D Shape Model},\nauthor={Ka-Hei Hui and Aditya Sanghi and Arianna Rampini and Kamal Rahimi Malekshan and Zhengzhe Liu and Hooman Shayani and Chi-Wing Fu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8l1KYguM4w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9714324, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2056422853214129363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cse.cuhk.edu.hk;autodesk.com;autodesk.com;;cuhk.edu.hk;autodesk.com;", "author_num": 7, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Chinese University of Hong Kong;Autodesk", "aff_unique_dep": "Department of Computer Science and Engineering;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.autodesk.com", "aff_unique_abbr": "CUHK;Autodesk", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Optimal Exact Recovery in Semi-Supervised Learning: A Study of Spectral Methods and Graph Convolutional Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34834", "id": "8m4V6Fx6ma", "proceeding": "https://proceedings.mlr.press/v235/wang24bt.html", "pdf": "https://openreview.net/pdf?id=8m4V6Fx6ma", "openreview": "https://openreview.net/forum?id=8m4V6Fx6ma", "author_site": "Haixiao Wang, Zhichao Wang", "tldr": "", "abstract": "We delve into the challenge of semi-supervised node classification on the Contextual Stochastic Block Model (CSBM) dataset. Here, nodes from the two-cluster Stochastic Block Model (SBM) are coupled with feature vectors, which are derived from a Gaussian Mixture Model (GMM) that corresponds to their respective node labels. 
With only a subset of the CSBM node labels accessible for training, our primary objective becomes the accurate classification of the remaining nodes. Venturing into the transductive learning landscape, we, for the first time, pinpoint the information-theoretical threshold for the exact recovery of all test nodes in CSBM. Concurrently, we design an optimal spectral estimator inspired by Principal Component Analysis (PCA) with the training labels and essential data from both the adjacency matrix and feature vectors. We also evaluate the efficacy of graph ridge regression and Graph Convolutional Networks (GCN) on this synthetic dataset. Our findings underscore that graph ridge regression and GCN possess the ability to achieve the information threshold of exact recovery in a manner akin to the optimal estimator when using the optimal weighted self-loops. This highlights the potential role of feature learning in augmenting the proficiency of GCN, especially in the realm of semi-supervised learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haixiao Wang;Zhichao Wang", "authorids": "~Haixiao_Wang1;~Zhichao_Wang3", "gender": ";M", "homepage": ";https://mathweb.ucsd.edu/~zhw036/", "dblp": ";02/10606", "google_scholar": ";IjXnDdoAAAAJ", "orcid": ";0000-0003-3886-5053", "linkedin": ";", "or_profile": "~Haixiao_Wang1;~Zhichao_Wang3", "aff": ";University of California, San Diego", "aff_domain": ";ucsd.edu", "position": ";PhD student", "bibtex": "@inproceedings{\nwang2024optimal,\ntitle={Optimal Exact Recovery in Semi-Supervised Learning: A Study of Spectral Methods and Graph Convolutional Networks},\nauthor={Haixiao Wang and Zhichao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8m4V6Fx6ma}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1937702, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-WwedkMawL0J:scholar.google.com/&scioq=Optimal+Exact+Recovery+in+Semi-Supervised+Learning:+A+Study+of+Spectral+Methods+and+Graph+Convolutional+Networks&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": ";ucsd.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Sharpness-Aware Data Generation for Zero-shot Quantization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34833", "id": "8mKXMnhnFW", "proceeding": "https://proceedings.mlr.press/v235/dung24a.html", "pdf": "https://openreview.net/pdf?id=8mKXMnhnFW", "openreview": "https://openreview.net/forum?id=8mKXMnhnFW", "author_site": "Hoang Dung, Cuong Pham, Trung Le, Jianfei Cai, Thanh-Toan Do", "tldr": "", "abstract": "Zero-shot quantization aims to learn a quantized model from a pre-trained full-precision model with no access to original real training data. The common idea in zero-shot quantization approaches is to generate synthetic data for quantizing the full-precision model. 
While it is well-known that deep neural networks with low sharpness have better generalization ability, none of the previous zero-shot quantization works considers the sharpness of the quantized model as a criterion for generating training data. This paper introduces a novel methodology that takes into account quantized model sharpness in synthetic data generation to enhance generalization. Specifically, we first demonstrate that sharpness minimization can be attained by maximizing gradient matching between the reconstruction loss gradients computed on synthetic and real validation data, under certain assumptions. We then circumvent the need for a real validation set by approximating this gradient matching with the gradient matching between each generated sample and its neighbors. Experimental evaluations on CIFAR-100 and ImageNet datasets demonstrate the superiority of the proposed method over the state-of-the-art techniques in low-bit quantization settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hoang Anh Dung;Cuong Pham;Trung Le;Jianfei Cai;Thanh-Toan Do", "authorids": "~Hoang_Anh_Dung1;~Cuong_Pham3;~Trung_Le2;~Jianfei_Cai1;~Thanh-Toan_Do4", "gender": "M;;M;M;", "homepage": ";;;https://jianfei-cai.github.io/;", "dblp": "295/8431;;;83/6096;", "google_scholar": "IiIiq0IAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=N6czCoUAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Hoang_Anh_Dung1;~Cuong_Pham3;~Trung_Le2;~Jianfei_Cai1;~Thanh-Toan_Do4", "aff": "Monash University;;Monash University;Monash University;", "aff_domain": "monash.edu.au;;monash.edu;monash.edu;", "position": "PhD student;;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\ndung2024sharpnessaware,\ntitle={Sharpness-Aware Data Generation for Zero-shot Quantization},\nauthor={Hoang Anh Dung and Cuong Pham and Trung Le and Jianfei Cai and Thanh-Toan Do},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8mKXMnhnFW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1160786, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Q5CzJc69AX8J:scholar.google.com/&scioq=Sharpness-Aware+Data+Generation+for+Zero-shot+Quantization&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "monash.edu.au;;monash.edu;monash.edu;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Australia" }, { "title": "EquiAV: Leveraging Equivariance for Audio-Visual Contrastive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34832", "id": "8nd1yBRCDl", "proceeding": "https://proceedings.mlr.press/v235/kim24v.html", "pdf": "https://openreview.net/pdf?id=8nd1yBRCDl", "openreview": "https://openreview.net/forum?id=8nd1yBRCDl", "author_site": "Jongsuk Kim, Hyeongkeun Lee, Kyeongha Rho, Junmo Kim, Joon Son Chung", "tldr": "", "abstract": "Recent advancements in self-supervised audio-visual representation learning have demonstrated its potential to capture rich and comprehensive representations.
However, despite the advantages of data augmentation verified in many learning methods, audio-visual learning has struggled to fully harness these benefits, as augmentations can easily disrupt the correspondence between input pairs. To address this limitation, we introduce EquiAV, a novel framework that leverages equivariance for audio-visual contrastive learning. Our approach begins with extending equivariance to audio-visual learning, facilitated by a shared attention-based transformation predictor. It enables the aggregation of features from diverse augmentations into a representative embedding, providing robust supervision. Notably, this is achieved with minimal computational overhead. Extensive ablation studies and qualitative results verify the effectiveness of our method. EquiAV outperforms previous works across various audio-visual benchmarks. The code is available on https://github.com/JongSuk1/EquiAV", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jongsuk Kim;Hyeongkeun Lee;Kyeongha Rho;Junmo Kim;Joon Son Chung", "authorids": "~Jongsuk_Kim1;~Hyeongkeun_Lee1;~Kyeongha_Rho1;~Junmo_Kim1;~Joon_Son_Chung1", "gender": "M;M;M;;M", "homepage": ";https://khrho325.github.io;https://siit.kaist.ac.kr/Faculty;https://mmai.io/joon/;https://siit.kaist.ac.kr/", "dblp": "325/1370;264/9498;40/240-2.html;160/2692.html;330/3774", "google_scholar": "rFmAVN4AAAAJ;;https://scholar.google.com.tw/citations?user=GdQtWNQAAAAJ;https://scholar.google.co.uk/citations?user=JJ_LQ0YAAAAJ;C1O5NFQAAAAJ", "orcid": ";;;0000-0001-7741-7275;", "linkedin": ";;;;", "or_profile": "~Hyeongkeun_Lee1;~Kyeongha_Rho1;~Junmo_Kim1;~Joon_Son_Chung1;~Jong_Suk_Kim1", "aff": "Korea Advanced Institute of Science & Technology;KAIST;Korea Advanced Institute of Science & Technology;KAIST;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "MS student;MS student;Associate Professor;Associate Professor;PhD student", "bibtex": "@inproceedings{\nkim2024equiav,\ntitle={Equi{AV}: Leveraging Equivariance for Audio-Visual Contrastive Learning},\nauthor={Jongsuk Kim and Hyeongkeun Lee and Kyeongha Rho and Junmo Kim and Joon Son Chung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8nd1yBRCDl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2471950, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11287292655590273951&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "HGCN2SP: Hierarchical Graph Convolutional Network for Two-Stage Stochastic Programming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34831", "id": "8onaVSFTEj", "proceeding": "https://proceedings.mlr.press/v235/wu24ag.html", "pdf": "https://openreview.net/pdf?id=8onaVSFTEj", "openreview": "https://openreview.net/forum?id=8onaVSFTEj", "author_site": "Yang Wu, Yifan Zhang, Zhenxing Liang, Jian Cheng", "tldr": 
"", "abstract": "Two-stage Stochastic Programming (2SP) is a standard framework for modeling decision-making problems under uncertainty. While numerous methods exist, solving such problems with many scenarios remains challenging. Selecting representative scenarios is a practical method for accelerating solutions. However, current approaches typically rely on clustering or Monte Carlo sampling, failing to integrate scenario information deeply and overlooking the significant impact of the scenario order on solving time. To address these issues, we develop HGCN2SP, a novel model with a hierarchical graph designed for 2SP problems, encoding each scenario and modeling their relationships hierarchically. The model is trained in a reinforcement learning paradigm to utilize the feedback of the solver. The policy network is equipped with a hierarchical graph convolutional network for feature encoding and an attention-based decoder for scenario selection in proper order. Evaluation of two classic 2SP problems demonstrates that HGCN2SP provides high-quality decisions in a short computational time. Furthermore, HGCN2SP exhibits remarkable generalization capabilities in handling large-scale instances, even with a substantial number of variables or scenarios that were unseen during the training phase.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Wu;Yifan Zhang;Zhenxing Liang;Jian Cheng", "authorids": "~Yang_Wu8;~Yifan_Zhang2;~Zhenxing_Liang1;~Jian_Cheng7", "gender": "M;M;M;M", "homepage": "https://github.com/samwu-learn;;https://github.com/Universe-Eterno;https://people.ucas.ac.cn/~chengjian?language=en", "dblp": ";57/4707-1.html;;14/6145-1", "google_scholar": ";6EmRro4AAAAJ;;ZGCIUJ8AAAAJ", "orcid": ";;;0000-0003-1289-2758", "linkedin": ";;;", "or_profile": "~Yang_Wu8;~Yifan_Zhang2;~Zhenxing_Liang1;~Jian_Cheng7", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of automation, Chinese academy of science;Jilin University;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;nlpr.ia.ac.cn;mails.jlu.edu.cn;ia.ac.cn", "position": "PhD student;Full Professor;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nwu2024hgcnsp,\ntitle={{HGCN}2{SP}: Hierarchical Graph Convolutional Network for Two-Stage Stochastic Programming},\nauthor={Yang Wu and Yifan Zhang and Zhenxing Liang and Jian Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8onaVSFTEj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7496919, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=128688712108348704&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "email": "ia.ac.cn;nlpr.ia.ac.cn;mails.jlu.edu.cn;ia.ac.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;Jilin University", "aff_unique_dep": "Institute of Automation;", "aff_unique_url": "http://www.ia.cas.cn;http://www.jlu.edu.cn", "aff_unique_abbr": "CAS;JLU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Position: Near to Mid-term Risks and Opportunities of Open-Source Generative AI", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34830", "id": "8q4EPdjTLE", "proceeding": "https://proceedings.mlr.press/v235/eiras24b.html", "pdf": 
"https://openreview.net/pdf?id=8q4EPdjTLE", "openreview": "https://openreview.net/forum?id=8q4EPdjTLE", "author_site": "Francisco Eiras, Aleksandar Petrov, Bertie Vidgen, Christian Schroeder, Fabio Pizzati, Katherine Elkins, Supratik Mukhopadhyay, Adel Bibi, Botos Csaba, Fabro Steibel, Fazl Barez, Genevieve Smith, Gianluca Guadagni, Jon Chun, Jordi Cabot, Joseph Marvin Imperial, Juan Arturo Nolazco Flores, Lori Landay, Matthew T Jackson, Paul R\u00f6ttger, Phil Torr, Trevor Darrell, Yong Suk Lee, Jakob Foerster", "tldr": "", "abstract": "In the next few years, applications of Generative AI are expected to revolutionize a number of different areas, ranging from science & medicine to education. The potential for these seismic changes has triggered a lively debate about potential risks and resulted in calls for tighter regulation, in particular from some of the major tech companies who are leading in AI development. While regulation is important, it is key that it does not put at risk the budding field of open-source Generative AI. We argue for the responsible open sourcing of generative AI models in the near and medium term. To set the stage, we first introduce an AI openness taxonomy system and apply it to 40 current large language models. We then outline differential benefits and risks of open versus closed source AI and present potential risk mitigation, ranging from best practices to calls for technical and scientific contributions. We hope that this report will add a much needed missing voice to the current public discourse on near to mid-term AI safety and other societal impact.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francisco Eiras;Aleksandar Petrov;Bertie Vidgen;Christian Schroeder de Witt;Fabio Pizzati;Katherine Elkins;Supratik Mukhopadhyay;Adel Bibi;Botos Csaba;Fabro Steibel;Fazl Barez;Genevieve Smith;Gianluca Guadagni;Jon Chun;Jordi Cabot;Joseph Marvin Imperial;Juan A. 
Nolazco-Flores;Lori Landay;Matthew Thomas Jackson;Paul Rottger;Philip Torr;Trevor Darrell;Yong Suk Lee;Jakob Nicolaus Foerster", "authorids": "~Francisco_Eiras1;~Aleksandar_Petrov1;~Bertie_Vidgen1;~Christian_Schroeder_de_Witt1;~Fabio_Pizzati1;~Katherine_Elkins1;~Supratik_Mukhopadhyay2;~Adel_Bibi1;~Botos_Csaba1;ofabro@itsrio.org;~Fazl_Barez1;~Genevieve_Smith1;~Gianluca_Guadagni1;~Jon_Chun1;~Jordi_Cabot1;~Joseph_Marvin_Imperial1;~Juan_A._Nolazco-Flores1;~Lori_Landay1;~Matthew_Thomas_Jackson1;~Paul_Rottger1;~Philip_Torr1;~Trevor_Darrell2;~Yong_Suk_Lee1;~Jakob_Nicolaus_Foerster1", "gender": "M;M;M;M;;F;Not Specified;M;M;;;F;;Unspecified;M;M;M;;M;M;;;M;M", "homepage": "https://fgirbal.github.io;https://p-petrov.com/;https://www.turing.ac.uk/people/researchers/bertie-vidgen;https://www.schroederdewitt.com;https://fabvio.github.io;https://katherineelkins.com;https://www.xprize.org/prizes/artificial-intelligence/teams/deepdrug;http://adelbibi.com;https://www.linkedin.com/in/botos-csaba/;;;https://www.linkedin.com/in/genevieve-smith-1b42b130/;;https://github.com/jon-chun;https://jordicabot.com;https://www.josephimperial.com;https://research.tec.mx/vivo-tec/display/PID_532;https://www.lorilanday.com;https://matthewtjackson.com;https://paulrottger.com/;http://www.robots.ox.ac.uk/~tvg/;;;https://www.jakobfoerster.com", "dblp": "218/5843;49/8105;;;241/5366;;;176/0964;236/6044;;;;;249/9217.html;18/948;246/4647;;;331/5748;282/4243;;;;176/5095", "google_scholar": "O_iJTgYAAAAJ;em54BT4AAAAJ;https://scholar.google.co.uk/citations?user=yRhnVoIAAAAJ;DE60h_0AAAAJ;kA_l7GYAAAAJ;bUSgS6IAAAAJ;;Q4j2laYAAAAJ;n68BdMgAAAAJ;;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;LaGRXwkAAAAJ;irs_5ekAAAAJ;https://scholar.google.com.my/citations?user=Sowc_GwAAAAJ;;SdGawnwAAAAJ;7rpmd9cAAAAJ;;;xp7eZHwAAAAJ;6z4lQzMAAAAJ", "orcid": ";;;;;0000-0001-9887-4854;;0000-0002-6169-3918;;;;0000-0002-0615-6378;0000-0003-4568-0487;0000-0002-5315-6784;0000-0003-2418-2489;0000-0003-1073-6129;0000-0002-4187-9352;0000-0003-4362-5791;;0009-0008-7115-6893;;;;", "linkedin": "franciscogirbaleiras/;aleksandar-petrov/;bertie-vidgen-001/;;;kate-elkins/;;adel-bibi-ba3671ab/;botos-csaba/;;;;gianluca-guadagni-2267aa166;jonchun2000/;jcabot/;joseph-marvin-imperial-9382b9a7/;juan-arturo-nolazco-flores-3a08861/;;matthew-t-jackson/;paul-rottger/;;;;", "or_profile": "~Francisco_Eiras1;~Aleksandar_Petrov1;~Bertie_Vidgen1;~Christian_Schroeder_de_Witt1;~Fabio_Pizzati1;~Katherine_Elkins1;~Supratik_Mukhopadhyay2;~Adel_Bibi1;~Botos_Csaba1;ofabro@itsrio.org;~Fazl_Barez1;~Genevieve_Smith1;~Gianluca_Guadagni1;~Jon_Chun1;~Jordi_Cabot1;~Joseph_Marvin_Imperial1;~Juan_A._Nolazco-Flores1;~Lori_Landay1;~Matthew_Thomas_Jackson1;~Paul_Rottger1;~Philip_Torr1;~Trevor_Darrell2;~Yong_Suk_Lee1;~Jakob_Nicolaus_Foerster1", "aff": "University of Oxford;Adobe Systems;MLCommons;University of Oxford;University of Oxford;Kenyon College;Louisiana State University;University of Oxford;University of Oxford;;;University of Oxford;University of Virginia, Charlottesville;Kenyon College;University of Luxemburg;University of Bath;Instituto Tecnol\u00f3gico y de Estudios Superiores de Monterrey;Berklee College of Music;Wayve;Bocconi University;University of Oxford;;University of Notre Dame;University of Oxford, University of Oxford", "aff_domain": 
"ox.ac.uk;adobe.com;mlcommons.org;oxford.ac.uk;ox.ac.uk;kenyon.edu;lsu.edu;ox.ac.uk;oxford.ac.uk;;;oxford.ac.uk;virginia.edu;kenyon.edu;uni.lu;bath.ac.uk;tec.mx;berklee.edu;wayve.ai;unibocconi.it;ox.ac.uk;;nd.edu;eng.ox.ac.uk", "position": "PhD student;Intern;Evaluation lead;Lecturer;Postdoc;Full Professor;Full Professor;Senior Researcher;PhD student;;;PhD student;Associate Professor;Researcher;Full Professor;PhD student;Full Professor;Full Professor;Intern;Postdoc;Full Professor;;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\neiras2024position,\ntitle={Position: Near to Mid-term Risks and Opportunities of Open-Source Generative {AI}},\nauthor={Francisco Eiras and Aleksandar Petrov and Bertie Vidgen and Christian Schroeder de Witt and Fabio Pizzati and Katherine Elkins and Supratik Mukhopadhyay and Adel Bibi and Botos Csaba and Fabro Steibel and Fazl Barez and Genevieve Smith and Gianluca Guadagni and Jon Chun and Jordi Cabot and Joseph Marvin Imperial and Juan A. Nolazco-Flores and Lori Landay and Matthew Thomas Jackson and Paul Rottger and Philip Torr and Trevor Darrell and Yong Suk Lee and Jakob Nicolaus Foerster},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8q4EPdjTLE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 398546, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 24, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5699484247964101783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "email": "ox.ac.uk;adobe.com;mlcommons.org;oxford.ac.uk;ox.ac.uk;kenyon.edu;lsu.edu;ox.ac.uk;oxford.ac.uk;;;oxford.ac.uk;virginia.edu;kenyon.edu;uni.lu;bath.ac.uk;tec.mx;berklee.edu;wayve.ai;unibocconi.it;ox.ac.uk;;nd.edu;eng.ox.ac.uk", "author_num": 24, "aff_unique_index": "0;1;2;0;0;3;4;0;0;0;5;3;6;7;8;9;10;11;0;12;0", "aff_unique_norm": "University of Oxford;Adobe;MLCommons;Kenyon College;Louisiana State University;University of Virginia;University of Luxembourg;University of Bath;Instituto Tecnol\u00f3gico y de Estudios Superiores de Monterrey;Berklee College of Music;Wayve;Bocconi University;University of Notre Dame", "aff_unique_dep": ";Adobe Systems Incorporated;;;;;;;;;;;", "aff_unique_url": "https://www.ox.ac.uk;https://www.adobe.com;https://mlcommons.org;https://www.kenyon.edu;https://www.lsu.edu;https://www.virginia.edu;https://wwwen.uniluxembourg.lu;https://www.bath.ac.uk;https://www.itesm.mx;https://www.berklee.edu;https://www.wayve.ai;https://www.bocconi.edu;https://www.nd.edu", "aff_unique_abbr": "Oxford;Adobe;MLCommons;Kenyon;LSU;UVA;Uni Lu;Bath;ITESM;Berklee;;Bocconi;Notre Dame", "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlottesville", "aff_country_unique_index": "0;1;1;0;0;1;1;0;0;0;1;1;2;0;3;1;0;4;0;1;0", "aff_country_unique": "United Kingdom;United States;Luxembourg;Mexico;Italy" }, { "title": "Approximate Nearest Neighbor Search with Window Filters", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34829", "id": "8t8zBaGFar", "proceeding": "https://proceedings.mlr.press/v235/engels24a.html", "pdf": "https://openreview.net/pdf?id=8t8zBaGFar", "openreview": "https://openreview.net/forum?id=8t8zBaGFar", "author_site": "Josh Engels, Ben Landrum, Shangdi Yu, Laxman Dhulipala, Julian Shun", "tldr": "", "abstract": "We define and investigate the problem of *c-approximate window search*: approximate nearest neighbor search where each point in the dataset has a numeric 
label, and the goal is to find nearest neighbors to queries within arbitrary label ranges. Many semantic search problems, such as image and document search with timestamp filters, or product search with cost filters, are natural examples of this problem. We propose and theoretically analyze a modular tree-based framework for transforming an index that solves the traditional c-approximate nearest neighbor problem into a data structure that solves window search. On standard nearest neighbor benchmark datasets equipped with random label values, adversarially constructed embeddings, and image search embeddings with real timestamps, we obtain up to a $75\\times$ speedup over existing solutions at the same level of recall.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joshua Engels;Ben Landrum;Shangdi Yu;Laxman Dhulipala;Julian Shun", "authorids": "~Joshua_Engels1;~Ben_Landrum1;~Shangdi_Yu1;~Laxman_Dhulipala1;~Julian_Shun1", "gender": "M;M;;;M", "homepage": "https://www.joshengels.com/;https://ben-landrum.com/;https://yushangdi.github.io/;;http://people.csail.mit.edu/jshun/", "dblp": "295/9447;;236/4340;;", "google_scholar": "yVPnVK8AAAAJ;7fkf1dwAAAAJ;3iCGYfQAAAAJ;;https://scholar.google.com.tw/citations?user=BGh9WU4AAAAJ", "orcid": ";0009-0003-4557-8850;0000-0002-8907-692X;;", "linkedin": ";;;;", "or_profile": "~Joshua_Engels1;~Ben_Landrum1;~Shangdi_Yu1;~Laxman_Dhulipala1;~Julian_Shun1", "aff": "Massachusetts Institute of Technology;University of Maryland, College Park;Massachusetts Institute of Technology;;Massachusetts Institute of Technology", "aff_domain": "mit.edu;umd.edu;mit.edu;;mit.edu", "position": "PhD student;MS student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nengels2024approximate,\ntitle={Approximate Nearest Neighbor Search with Window Filters},\nauthor={Joshua Engels and Ben Landrum and Shangdi Yu and Laxman Dhulipala and Julian Shun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8t8zBaGFar}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 853282, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6564456999885346261&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;umd.edu;mit.edu;;mit.edu", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of Maryland", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.umd.edu", "aff_unique_abbr": "MIT;UMD", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "MaxMin-RLHF: Alignment with Diverse Human Preferences", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34828", "id": "8tzjEMF0Vq", "proceeding": "https://proceedings.mlr.press/v235/chakraborty24b.html", "pdf": "https://openreview.net/pdf?id=8tzjEMF0Vq", "openreview": "https://openreview.net/forum?id=8tzjEMF0Vq", "author_site": "Souradip Chakraborty, Jiahao Qiu, Hui Yuan, Alec Koppel, Dinesh Manocha, Furong Huang, Amrit Singh Bedi, Mengdi Wang", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) aligns language models to human preferences by employing a singular reward model derived from preference data.
However, the single reward model overlooks the rich diversity of human preferences inherent in data collected from multiple users. In this work, we first derive an impossibility result of alignment with single reward RLHF, thereby highlighting its insufficiency in representing diverse human preferences. Next, we propose to learn a mixture of reward models via an expectation-maximization algorithm and solve a MaxMin alignment objective inspired by the Egalitarian principle in social choice theory to better honor diverse human preferences. We present comprehensive experimental results on small-scale (GPT-2) and large-scale (Tulu2-7B) language models and show the efficacy of the proposed approach in the presence of diversity among human preferences. We remark that our findings in this work are not limited to language models but extend to reinforcement learning in general.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Souradip Chakraborty;Jiahao Qiu;Hui Yuan;Alec Koppel;Dinesh Manocha;Furong Huang;Amrit Bedi;Mengdi Wang", "authorids": "~Souradip_Chakraborty1;~Jiahao_Qiu1;~Hui_Yuan2;~Alec_Koppel1;~Dinesh_Manocha3;~Furong_Huang1;~Amrit_Bedi1;~Mengdi_Wang1", "gender": "M;M;F;M;M;F;M;F", "homepage": "https://souradip-umd.github.io/;;;http://koppel.netlify.app/;https://www.cs.umd.edu/people/dmanocha;https://furong-huang.com;https://sites.google.com/view/amritsinghbedi/home;http://mwang.princeton.edu", "dblp": "264/5758;;21/780-2;149/0076;m/DineshManocha;72/8513;176/2707.html;", "google_scholar": "https://scholar.google.co.in/citations?user=pvETm1wAAAAJ;86dbUg4AAAAJ;https://scholar.google.com/citations?hl=en;8ClxyjIAAAAJ;X08l_4IAAAAJ;13yyuCcAAAAJ;91WLA6QAAAAJ;", "orcid": ";0009-0000-7752-4169;;0000-0003-2447-2873;0000-0001-7047-9801;;;", "linkedin": ";jiahao-qiu-6a6161224/;;alec-koppel-9860b697/;dinesh-manocha-2311846;;;", "or_profile": "~Souradip_Chakraborty1;~Jiahao_Qiu1;~Hui_Yuan2;~Alec_Koppel1;~Dinesh_Manocha3;~Furong_Huang1;~Amrit_Bedi1;~Mengdi_Wang1", "aff": "University of Maryland, College Park;Princeton University;Princeton University;J.P.
Morgan Chase;University of Maryland, College Park;University of Maryland;University of Maryland, College Park;Princeton University", "aff_domain": "umd.edu;princeton.edu;princeton.edu;jpmorgan.com;umd.edu;cs.umd.edu;umd.edu;princeton.edu", "position": "PhD student;PhD student;PhD student;Research Team Lead;Professor;Assistant Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nchakraborty2024maxminrlhf,\ntitle={MaxMin-{RLHF}: Alignment with Diverse Human Preferences},\nauthor={Souradip Chakraborty and Jiahao Qiu and Hui Yuan and Alec Koppel and Dinesh Manocha and Furong Huang and Amrit Bedi and Mengdi Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8tzjEMF0Vq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1664604, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3368976500556611224&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "umd.edu;princeton.edu;princeton.edu;jpmorgan.com;umd.edu;cs.umd.edu;umd.edu;princeton.edu", "author_num": 8, "aff_unique_index": "0;1;1;2;0;0;0;1", "aff_unique_norm": "University of Maryland;Princeton University;JPMorgan Chase & Co.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umd.edu;https://www.princeton.edu;https://www.jpmorganchase.com", "aff_unique_abbr": "UMD;Princeton;JPM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "CLLMs: Consistency Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34827", "id": "8uzBOVmh8H", "proceeding": "https://proceedings.mlr.press/v235/kou24a.html", "pdf": "https://openreview.net/pdf?id=8uzBOVmh8H", "openreview": "https://openreview.net/forum?id=8uzBOVmh8H", "author_site": "Siqi Kou, Lanxiang Hu, Zhezhi He, Zhijie Deng, Hao Zhang", "tldr": "", "abstract": "Jacobi decoding shows promise for more efficient LLM inference as it breaks the sequential nature of the LLM decoding process and transforms it into more parallelizable computation. However, in practice, it achieves little speedup compared to traditional autoregressive (AR) decoding, primarily because Jacobi decoding seldom accurately predicts more than one token in a single fixed-point iteration step. To address this, we develop a new approach aimed at realizing fast convergence from any state to the fixed point in a Jacobi trajectory. This is accomplished by refining the target LLM to consistently predict the fixed point given any state as input.
Extensive experiments demonstrate the effectiveness of our method, showing 2.4$\\times$ to 3.4$\\times$ improvements in generation speed while preserving generation quality across both domain-specific and open-domain benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siqi Kou;Lanxiang Hu;Zhezhi He;Zhijie Deng;Hao Zhang", "authorids": "~Siqi_Kou1;~Lanxiang_Hu1;~Zhezhi_He1;~Zhijie_Deng1;~Hao_Zhang2", "gender": ";M;M;M;M", "homepage": "https://github.com/karrykkk;https://snyhlxde1.github.io/;https://elliothe.github.io/;https://thudzj.github.io/;https://cseweb.ucsd.edu/~haozhang/", "dblp": ";;184/1264;209/4959;55/2270-25", "google_scholar": ";KufYmg8AAAAJ;https://scholar.google.com/citations?hl=en;J3dR0sUAAAAJ;H1d4BS8AAAAJ", "orcid": ";0000-0003-0641-3677;0000-0002-6357-236X;0000-0002-0932-1631;", "linkedin": ";hu-lanxiang/;;;", "or_profile": "~Siqi_Kou1;~Lanxiang_Hu1;~Zhezhi_He1;~Zhijie_Deng1;~Hao_Zhang2", "aff": "Shanghai Jiaotong University;University of California, San Diego;Shanghai Jiaotong University;Shanghai Jiaotong University;Carnegie Mellon University", "aff_domain": "sjtu.edu.cn;ucsd.edu;sjtu.edu.cn;sjtu.edu.cn;cmu.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nkou2024cllms,\ntitle={{CLLM}s: Consistency Large Language Models},\nauthor={Siqi Kou and Lanxiang Hu and Zhezhi He and Zhijie Deng and Hao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8uzBOVmh8H}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1216403, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15781709703698168428&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "sjtu.edu.cn;ucsd.edu;sjtu.edu.cn;sjtu.edu.cn;cmu.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Shanghai Jiao Tong University;University of California, San Diego;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ucsd.edu;https://www.cmu.edu", "aff_unique_abbr": "SJTU;UCSD;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Sequential Neural Score Estimation: Likelihood-Free Inference with Conditional Score Based Diffusion Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34826", "id": "8viuf9PdzU", "proceeding": "https://proceedings.mlr.press/v235/sharrock24a.html", "pdf": "https://openreview.net/pdf?id=8viuf9PdzU", "openreview": "https://openreview.net/forum?id=8viuf9PdzU", "author_site": "Louis Sharrock, Jack Simons, Song Liu, Mark Beaumont", "tldr": "", "abstract": "We introduce Sequential Neural Posterior Score Estimation (SNPSE), a score-based method for Bayesian inference in simulator-based models. Our method, inspired by the remarkable success of score-based methods in generative modelling, leverages conditional score-based diffusion models to generate samples from the posterior distribution of interest. The model is trained using an objective function which directly estimates the score of the posterior. 
We embed the model into a sequential training procedure, which guides simulations using the current approximation of the posterior at the observation of interest, thereby reducing the simulation cost. We also introduce several alternative sequential approaches, and discuss their relative merits. We then validate our method, as well as its amortised, non-sequential variant, on several numerical examples, demonstrating comparable or superior performance to existing state-of-the-art methods such as Sequential Neural Posterior Estimation (SNPE).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Louis Sharrock;Jack Simons;Song Liu;Mark Beaumont", "authorids": "~Louis_Sharrock1;~Jack_Simons1;~Song_Liu1;~Mark_Beaumont1", "gender": "M;;M;", "homepage": "https://louissharrock.github.io/;http://www.bristol.ac.uk/maths/;http://allmodelsarewrong.net;https://www.bristol.ac.uk/people/person/Mark-Beaumont-c51e682d-904b-45b4-bda5-9cf2213d4e9d/", "dblp": "304/5319;;80/1141-2;", "google_scholar": "O0xSdYcAAAAJ;DNI5ygoAAAAJ;;2K3F0MMAAAAJ", "orcid": "0000-0003-1691-1215;;;0000-0002-8773-2743", "linkedin": "louissharrock/;;;", "or_profile": "~Louis_Sharrock1;~Jack_Simons1;~Song_Liu1;~Mark_Beaumont1", "aff": "Lancaster University;University of Bristol;University of Bristol, UK;University of Bristol", "aff_domain": "lancaster.ac.uk;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk", "position": "Postdoc;PhD student;Lecturer;Full Professor", "bibtex": "@inproceedings{\nsharrock2024sequential,\ntitle={Sequential Neural Score Estimation: Likelihood-Free Inference with Conditional Score Based Diffusion Models},\nauthor={Louis Sharrock and Jack Simons and Song Liu and Mark Beaumont},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8viuf9PdzU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2862700, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7118558091020695604&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "lancaster.ac.uk;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Lancaster University;University of Bristol", "aff_unique_dep": ";", "aff_unique_url": "https://www.lancaster.ac.uk;https://www.bristol.ac.uk", "aff_unique_abbr": "Lancaster;Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "AquaLoRA: Toward White-box Protection for Customized Stable Diffusion Models via Watermark LoRA", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34825", "id": "8xKGZsnV2a", "proceeding": "https://proceedings.mlr.press/v235/feng24k.html", "pdf": "https://openreview.net/pdf?id=8xKGZsnV2a", "openreview": "https://openreview.net/forum?id=8xKGZsnV2a", "author_site": "Weitao Feng, Wenbo Zhou, Jiyan He, Jie Zhang, Tianyi Wei, Guanlin Li, Tianwei Zhang, Weiming Zhang, Nenghai Yu", "tldr": "", "abstract": "Diffusion models have achieved remarkable success in generating high-quality images. Recently, the open-source models represented by Stable Diffusion (SD) are thriving and are accessible for customization, giving rise to a vibrant community of creators and enthusiasts.
However, the widespread availability of customized SD models has led to copyright concerns, like unauthorized model distribution and unconsented commercial use. To address this, recent works aim to let SD models output watermarked content for post-hoc forensics. Unfortunately, none of them can achieve the challenging white-box protection, wherein the malicious user can easily remove or replace the watermarking module to defeat subsequent verification. To this end, we propose AquaLoRA, the first implementation under this scenario. Briefly, we merge watermark information into the U-Net of Stable Diffusion models via a watermark Low-Rank Adaptation (LoRA) module in a two-stage manner. For the watermark LoRA module, we devise a scaling matrix to achieve flexible message updates without retraining. To guarantee fidelity, we design Prior Preserving Fine-Tuning (PPFT) to ensure watermark learning with minimal impact on the model distribution, validated by proofs. Finally, we conduct extensive experiments and ablation studies to verify our design. Our code is available at github.com/Georgefwt/AquaLoRA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weitao Feng;Wenbo Zhou;Jiyan He;Jie Zhang;Tianyi Wei;Guanlin Li;Tianwei Zhang;Weiming Zhang;Nenghai Yu", "authorids": "weitaofeng@mail.ustc.edu.cn;~Wenbo_Zhou1;~Jiyan_He1;~Jie_Zhang11;~Tianyi_Wei1;~Guanlin_Li2;~Tianwei_Zhang1;~Weiming_Zhang2;~Nenghai_Yu1", "gender": ";M;;M;M;M;M;M;M", "homepage": ";http://staff.ustc.edu.cn/~welbeckz/;http://home.ustc.edu.cn/~hejiyan;https://zjzac.github.io/;;https://guanlinlee.github.io/;https://personal.ntu.edu.sg/tianwei.zhang/index.html;http://staff.ustc.edu.cn/~zhangwm/;", "dblp": ";;258/1955;84/6889-73;177/5554;;77/7902-4;;96/5144", "google_scholar": ";sPMWxr0AAAAJ;Ep5qE5QAAAAJ;7YkR3CoAAAAJ//;-wfXmM4AAAAJ;3LB0_wMAAAAJ;9vpiYDIAAAAJ;eTCfl6cAAAAJ;https://scholar.google.com.hk/citations?user=7620QAMAAAAJ", "orcid": ";;0009-0003-4539-1826;0000-0002-4230-1077;0000-0002-7976-8439;;;0000-0001-5576-6108;", "linkedin": ";;;;;;;;", "or_profile": "weitaofeng@mail.ustc.edu.cn;~Wenbo_Zhou1;~Jiyan_He1;~Jie_Zhang11;~Tianyi_Wei1;~Guanlin_Li2;~Tianwei_Zhang1;~Weiming_Zhang2;~Nenghai_Yu1", "aff": ";University of Science and Technology of China;University of Science and Technology of China;Nanyang Technological University;University of Science and Technology of China;Nanyang Technological University;Nanyang Technological University;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": ";ustc.edu.cn;ustc.edu;ntu.edu.sg;ustc.edu.cn;ntu.edu.sg;ntu.edu.sg;ustc.edu.cn;ustc.edu.cn", "position": ";Associate Professor;PhD student;Postdoc;PhD student;PhD student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfeng2024aqualora,\ntitle={AquaLo{RA}: Toward White-box Protection for Customized Stable Diffusion Models via Watermark Lo{RA}},\nauthor={Weitao Feng and Wenbo Zhou and Jiyan He and Jie Zhang and Tianyi Wei and Guanlin Li and Tianwei Zhang and Weiming Zhang and Nenghai Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8xKGZsnV2a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7745326, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14451440895997354742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email":
";ustc.edu.cn;ustc.edu;ntu.edu.sg;ustc.edu.cn;ntu.edu.sg;ntu.edu.sg;ustc.edu.cn;ustc.edu.cn", "author_num": 9, "aff_unique_index": "0;0;1;0;1;1;0;0", "aff_unique_norm": "University of Science and Technology of China;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "USTC;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "WISER: Weak Supervision and Supervised Representation Learning to Improve Drug Response Prediction in Cancer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34824", "id": "8ySQaphUYH", "proceeding": "https://proceedings.mlr.press/v235/shubham24a.html", "pdf": "https://openreview.net/pdf?id=8ySQaphUYH", "openreview": "https://openreview.net/forum?id=8ySQaphUYH", "author_site": "Kumar Shubham, Aishwarya Jayagopal, Syed Danish, Prathosh AP, Vaibhav Rajan", "tldr": "", "abstract": "Cancer, a leading cause of death globally, occurs due to genomic changes and manifests heterogeneously across patients. To advance research on personalized treatment strategies, the effectiveness of various drugs on cells derived from cancers ('cell lines') is experimentally determined in laboratory settings. Nevertheless, variations in the distribution of genomic data and drug responses between cell lines and humans arise due to biological and environmental differences. Moreover, while genomic profiles of many cancer patients are readily available, the scarcity of corresponding drug response data limits the ability to train machine learning models that can predict drug response in patients effectively. Recent cancer drug response prediction methods have largely followed the paradigm of unsupervised domain-invariant representation learning followed by a downstream drug response classification step. Introducing supervision in both stages is challenging due to heterogeneous patient response to drugs and limited drug response data. This paper addresses these challenges through a novel representation learning method in the first phase and weak supervision in the second. Experimental results on real patient data demonstrate the efficacy of our method WISER (Weak supervISion and supErvised Representation learning) over state-of-the-art alternatives on predicting personalized drug response. 
Our implementation is available at https://github.com/kyrs/WISER", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kumar Shubham;Aishwarya Jayagopal;Syed Mohammed Danish;Prathosh AP;Vaibhav Rajan", "authorids": "~Kumar_Shubham1;~Aishwarya_Jayagopal1;~Syed_Mohammed_Danish1;~Prathosh_AP1;~Vaibhav_Rajan2", "gender": "M;F;M;M;M", "homepage": "https://kyrs.github.io/;https://ajayago.github.io/;;https://sites.google.com/view/prathosh;", "dblp": ";;;218/5887;55/406", "google_scholar": "JBb0tXMAAAAJ;https://scholar.google.com.sg/citations?user=CnvrQUAAAAAJ;;https://scholar.google.co.in/citations?user=OEwV4bsAAAAJ;rBqhP-8AAAAJ", "orcid": ";0000-0002-5658-0724;;;0000-0002-6748-6864", "linkedin": ";;syed-danish-1769489a/;prathosh-ap-phd-50ab9511/;https://sg.linkedin.com/in/vaibhav-rajan-b76a4613", "or_profile": "~Kumar_Shubham1;~Aishwarya_Jayagopal1;~Syed_Mohammed_Danish1;~Prathosh_AP1;~Vaibhav_Rajan2", "aff": "Indian Institute of Science, Indian institute of science, Bangalore;National University of Singapore;Indian Institute of Technology, Patna;Indian Institute of Science, Indian institute of science, Bangalore;National University of Singapore", "aff_domain": "iisc.ac.in;u.nus.edu;iitp.ac.in;iisc.ac.in;nus.edu.sg", "position": "PhD student;PhD student;Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nshubham2024wiser,\ntitle={{WISER}: Weak Supervision and Supervised Representation Learning to Improve Drug Response Prediction in Cancer},\nauthor={Kumar Shubham and Aishwarya Jayagopal and Syed Mohammed Danish and Prathosh AP and Vaibhav Rajan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8ySQaphUYH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2298350, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15590613289236999173&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "iisc.ac.in;u.nus.edu;iitp.ac.in;iisc.ac.in;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "Indian Institute of Science;National University of Singapore;Indian Institute of Technology Patna", "aff_unique_dep": ";;", "aff_unique_url": "https://www.iisc.ac.in;https://www.nus.edu.sg;https://www.iitp.ac.in", "aff_unique_abbr": "IISc;NUS;IIT Patna", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Bangalore;;Patna", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "India;Singapore" }, { "title": "Knowledge Distillation with Auxiliary Variable", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34823", "id": "91QmrfztSP", "proceeding": "https://proceedings.mlr.press/v235/peng24a.html", "pdf": "https://openreview.net/pdf?id=91QmrfztSP", "openreview": "https://openreview.net/forum?id=91QmrfztSP", "author_site": "Bo Peng, zhen fang, Guangquan Zhang, Jie Lu", "tldr": "", "abstract": "Knowledge distillation (KD) provides an efficient framework for transferring knowledge from a teacher model to a student model by aligning their predictive distributions. The existing KD methods adopt the same strategy as the teacher to formulate the student's predictive distribution. However, employing the same distribution-modeling strategy typically causes sub-optimal knowledge transfer due to the discrepancy in model capacity between teacher and student models. 
Designing student-friendly teachers contributes to alleviating the capacity discrepancy, but it requires either complicated or student-specific training schemes. To resolve this dilemma, we propose to introduce an auxiliary variable to promote the student's ability to model the predictive distribution. The auxiliary variable is defined to be related to the target variables, which boosts the model's predictions. Specifically, we reformulate the predictive distribution with the auxiliary variable, deriving a novel objective function for KD. Theoretically, we provide insights to explain why the proposed objective function can outperform existing KD methods. Experimentally, we demonstrate that the proposed objective function can considerably and consistently outperform existing KD methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bo Peng;Zhen Fang;Guangquan Zhang;Jie Lu", "authorids": "~Bo_Peng24;~Zhen_Fang2;~Guangquan_Zhang2;~Jie_Lu4", "gender": ";M;;", "homepage": ";https://fang-zhen.github.io/index.html;;", "dblp": ";;;", "google_scholar": ";OzD6WJcAAAAJ;_1RMrhsAAAAJ;", "orcid": ";0000-0003-0602-6255;;", "linkedin": ";;;", "or_profile": "~Bo_Peng24;~Zhen_Fang2;~Guangquan_Zhang2;~Jie_Lu4", "aff": ";University of Technology Sydney;University of Technology Sydney (UTS);", "aff_domain": ";uts.edu.au;uts.edu.au;", "position": ";Assistant Professor;Associate Professor;", "bibtex": "@inproceedings{\npeng2024knowledge,\ntitle={Knowledge Distillation with Auxiliary Variable},\nauthor={Bo Peng and Zhen Fang and Guangquan Zhang and Jie Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=91QmrfztSP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 424514, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aP78HGCoFEkJ:scholar.google.com/&scioq=Knowledge+Distillation+with+Auxiliary+Variable&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": ";uts.edu.au;uts.edu.au;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Technology Sydney", "aff_unique_dep": "", "aff_unique_url": "https://www.uts.edu.au", "aff_unique_abbr": "UTS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "ReconBoost: Boosting Can Achieve Modality Reconcilement", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34822", "id": "93gjGDwqim", "proceeding": "https://proceedings.mlr.press/v235/hua24a.html", "pdf": "https://openreview.net/pdf?id=93gjGDwqim", "openreview": "https://openreview.net/forum?id=93gjGDwqim", "author_site": "Cong Hua, Qianqian Xu, Shilong Bao, Zhiyong Yang, Qingming Huang", "tldr": "", "abstract": "This paper explores a novel multi-modal *alternating* learning paradigm pursuing a reconciliation between the exploitation of uni-modal features and the exploration of cross-modal interactions. This is motivated by the fact that current paradigms of multi-modal learning tend to explore multi-modal features simultaneously. The resulting gradient prohibits further exploitation of the features in the weak modality, leading to modality competition, where the dominant modality overpowers the learning process. To address this issue, we study the modality-alternating learning paradigm to achieve reconcilement.
Specifically, we propose a new method called *ReconBoost* to update a fixed modality each time. Herein, the learning objective is dynamically adjusted with a reconcilement regularization against competition with the historical models. By choosing a KL-based reconcilement, we show that the proposed method resembles Friedman's Gradient-Boosting (GB) algorithm, where the updated learner can correct errors made by others and help enhance the overall performance. The major difference from classic GB is that we only preserve the newest model for each modality to avoid overfitting caused by ensembling strong learners. Furthermore, we propose a memory consolidation scheme and a global rectification scheme to make this strategy more effective. Experiments over six multi-modal benchmarks speak to the efficacy of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cong Hua;Qianqian Xu;Shilong Bao;Zhiyong Yang;Qingming Huang", "authorids": "~Cong_Hua2;~Qianqian_Xu2;~Shilong_Bao1;~Zhiyong_Yang1;~Qingming_Huang1", "gender": "F;M;M;M;", "homepage": "http://vipl.ict.ac.cn/people/~qianqianxu;https://statusrank.github.io/;https://joshuaas.github.io/;;https://qmhuang-ucas.github.io/", "dblp": "07/7627;143/0246;01/452-1.html;;68/4388", "google_scholar": "https://scholar.google.com.hk/citations?user=MjifS2MAAAAJ;https://scholar.google.com.hk/citations?user=5ZCgkQkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=J1vMnRgAAAAJ", "orcid": ";;0000-0002-4409-4999;0000-0003-2323-8866;", "linkedin": ";;;;", "or_profile": "~Qianqian_Xu2;~Shilong_Bao1;~Zhiyong_Yang1;~CONG_HUA1;~Qingming_Huang2", "aff": "Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ucas.ac.cn;ucas.ac.cn;ict.ac.cn;ucas.ac.cn", "position": "Full Professor;PhD student;Associate Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nhua2024reconboost,\ntitle={ReconBoost: Boosting Can Achieve Modality Reconcilement},\nauthor={Cong Hua and Qianqian Xu and Shilong Bao and Zhiyong Yang and Qingming Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=93gjGDwqim}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4461414, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6560825067504569931&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ict.ac.cn;ucas.ac.cn;ucas.ac.cn;ict.ac.cn;ucas.ac.cn", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_unique_dep": "Institute of Computing Technology;", "aff_unique_url": "http://www.ict.ac.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "CAS;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "How Does Goal Relabeling Improve Sample Efficiency?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34821", "id": "99UFZV2VpU", "proceeding": "https://proceedings.mlr.press/v235/zheng24a.html", "pdf": "https://openreview.net/pdf?id=99UFZV2VpU", "openreview":
"https://openreview.net/forum?id=99UFZV2VpU", "author_site": "Sirui Zheng, Chenjia Bai, Zhuoran Yang, Zhaoran Wang", "tldr": "", "abstract": "Hindsight experience replay and goal relabeling are successful in reinforcement learning (RL) since they enable agents to learn from failures. Despite their successes, we lack a theoretical understanding, such as (i) why hindsight experience replay improves sample efficiency and (ii) how to design a relabeling method that achieves sample efficiency. To this end, we construct an example to show the information-theoretical improvement in sample efficiency achieved by goal relabeling. Our example reveals that goal relabeling can enhance sample efficiency and exploit the rich information in observations through better hypothesis elimination. Based on these insights, we develop an RL algorithm called GOALIVE. To analyze the sample complexity of GOALIVE, we introduce a complexity measure, the goal-conditioned Bellman-Eluder (GOAL-BE) dimension, which characterizes the sample complexity of goal-conditioned RL problems. Compared to the Bellman-Eluder dimension, the goal-conditioned version offers an exponential improvement in the best case. To the best of our knowledge, our work provides the first characterization of the theoretical improvement in sample efficiency achieved by goal relabeling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sirui Zheng;Chenjia Bai;Zhuoran Yang;Zhaoran Wang", "authorids": "~Sirui_Zheng2;~Chenjia_Bai2;~Zhuoran_Yang1;~Zhaoran_Wang1", "gender": "M;M;M;Not Specified", "homepage": ";https://baichenjia.github.io/;https://zhuoranyang.github.io/;https://zhaoranwang.github.io/", "dblp": ";247/1943;;117/2756", "google_scholar": ";Rm_1y2kAAAAJ;;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ", "orcid": ";;;", "linkedin": "%E6%80%9D%E9%94%90-%E9%83%91-448756212/;;;", "or_profile": "~Sirui_Zheng2;~Chenjia_Bai2;~Zhuoran_Yang1;~Zhaoran_Wang1", "aff": "Northwestern University;Shanghai AI Laboratory;Yale University;Northwestern University", "aff_domain": "northwestern.edu;pjlab.org.cn;yale.edu;northwestern.edu", "position": "PhD student;Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzheng2024how,\ntitle={How Does Goal Relabeling Improve Sample Efficiency?},\nauthor={Sirui Zheng and Chenjia Bai and Zhuoran Yang and Zhaoran Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=99UFZV2VpU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 415736, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8359426920530428130&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 5, "email": "northwestern.edu;pjlab.org.cn;yale.edu;northwestern.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Northwestern University;Shanghai AI Laboratory;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northwestern.edu;https://www.shanghai-ai-lab.com;https://www.yale.edu", "aff_unique_abbr": "NU;SAIL;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Do Models Explain Themselves? 
Counterfactual Simulatability of Natural Language Explanations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34820", "id": "99jx5U81jx", "proceeding": "https://proceedings.mlr.press/v235/chen24bl.html", "pdf": "https://openreview.net/pdf?id=99jx5U81jx", "openreview": "https://openreview.net/forum?id=99jx5U81jx", "author_site": "Yanda Chen, Ruiqi Zhong, Narutatsu Ri, Chen Zhao, He He, Jacob Steinhardt, Zhou Yu, Kathleen McKeown", "tldr": "", "abstract": "Large language models (LLMs) are trained to imitate humans to explain human decisions. However, do LLMs explain themselves? Can they help humans build mental models of how LLMs process different inputs? To answer these questions, we propose to evaluate $\\textbf{counterfactual simulatability}$ of natural language explanations: whether an explanation can enable humans to precisely infer the model's outputs on diverse counterfactuals of the explained input. For example, if a model answers ''$\\textit{yes}$'' to the input question ''$\\textit{Can eagles fly?}$'' with the explanation ''$\\textit{all birds can fly}$'', then humans would infer from the explanation that it would also answer ''$\\textit{yes}$'' to the counterfactual input ''$\\textit{Can penguins fly?}$''. If the explanation is precise, then the model's answer should match humans' expectations. We implemented two metrics based on counterfactual simulatability: precision and generality. We generated diverse counterfactuals automatically using LLMs. We then used these metrics to evaluate state-of-the-art LLMs (e.g., GPT-4) on two tasks: multi-hop factual reasoning and reward modeling. We found that LLMs' explanations have low precision and that precision does not correlate with plausibility. Therefore, naively optimizing human approvals (e.g., RLHF) may be insufficient.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanda Chen;Ruiqi Zhong;Narutatsu Ri;Chen Zhao;He He;Jacob Steinhardt;Zhou Yu;Kathleen McKeown", "authorids": "~Yanda_Chen1;~Ruiqi_Zhong1;~Narutatsu_Ri1;~Chen_Zhao2;~He_He2;~Jacob_Steinhardt1;~Zhou_Yu1;~Kathleen_McKeown1", "gender": "M;M;M;M;;F;F;F", "homepage": "https://yandachen.github.io/;https://ruiqi-zhong.github.io;https://narutatsuri.github.io/;http://umiacs.umd.edu/~chenz/;;http://www.cs.columbia.edu/~zhouyu/;http://www.cs.columbia.edu/~kathy/;http://hhexiy.github.io", "dblp": "212/0154;222/3024;;81/3-9;35/10625;83/3205;m/KathleenMcKeown;08/8618-1", "google_scholar": "https://scholar.google.com/citations?hl=en;GskOShAAAAAJ;Pp2YYKcAAAAJ;zehsvT8AAAAJ;;https://scholar.google.com.tw/citations?user=jee2Dy0AAAAJ;https://scholar.google.com.tw/citations?user=ujDhg2sAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Yanda_Chen1;~Ruiqi_Zhong1;~Narutatsu_Ri1;~Chen_Zhao2;~Jacob_Steinhardt1;~Zhou_Yu1;~Kathleen_McKeown1;~He_He1", "aff": "Columbia University;University of California, Berkeley;Columbia University;NYU Shanghai;University of California, Berkeley;Columbia University;Columbia University;New York University", "aff_domain": "columbia.edu;berkeley.edu;columbia.edu;nyu.edu;berkeley.edu;columbia.edu;columbia.edu;nyu.edu", "position": "PhD student;PhD student;Undergrad student;Assistant Professor;Assistant Professor;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024do,\ntitle={Do Models Explain Themselves?
Counterfactual Simulatability of Natural Language Explanations},\nauthor={Yanda Chen and Ruiqi Zhong and Narutatsu Ri and Chen Zhao and He He and Jacob Steinhardt and Zhou Yu and Kathleen McKeown},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=99jx5U81jx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1420344, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9653584904783729950&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "columbia.edu;berkeley.edu;columbia.edu;nyu.edu;berkeley.edu;columbia.edu;columbia.edu;nyu.edu", "author_num": 8, "aff_unique_index": "0;1;0;2;1;0;0;3", "aff_unique_norm": "Columbia University;University of California, Berkeley;New York University Shanghai;New York University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.columbia.edu;https://www.berkeley.edu;https://shanghai.nyu.edu;https://www.nyu.edu", "aff_unique_abbr": "Columbia;UC Berkeley;NYU Shanghai;NYU", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Berkeley;Shanghai", "aff_country_unique_index": "0;0;0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "HexGen: Generative Inference of Large Language Model over Heterogeneous Environment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34819", "id": "9ANyvRtFGa", "proceeding": "https://proceedings.mlr.press/v235/jiang24f.html", "pdf": "https://openreview.net/pdf?id=9ANyvRtFGa", "openreview": "https://openreview.net/forum?id=9ANyvRtFGa", "author_site": "Youhe Jiang, Ran Yan, Xiaozhe Yao, Yang Zhou, Beidi Chen, Binhang Yuan", "tldr": "", "abstract": "Serving generative inference of the large language model is a crucial component of contemporary AI applications. In this paper, our focus lies in deploying such services in a heterogeneous and cross-datacenter setting to mitigate the substantial inference costs typically associated with a single centralized datacenter. Towards this end, we propose HexGen, a flexible distributed inference engine that uniquely supports the asymmetric partition of generative inference computations over both tensor model parallelism and pipeline parallelism, which allows for effective deployment across diverse GPUs interconnected by a fully heterogeneous network. We further propose a sophisticated scheduling algorithm grounded in constrained optimization that can adaptively assign asymmetric inference computation across the GPUs to fulfill inference requests while maintaining acceptable latency levels. We conduct an extensive empirical study to evaluate the efficiency of HexGen by serving the state-of-the-art Llama-2 (70B) model. 
The experimental results suggest that HexGen can choose to achieve up to $2.3\\times$ lower latency deadlines or tolerate up to $4\\times$ more traffic request rates compared with the homogeneous baseline given the same budget.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "YOUHE JIANG;Ran Yan;Xiaozhe Yao;Yang Zhou;Beidi Chen;Binhang Yuan", "authorids": "~YOUHE_JIANG1;ryanaf@connect.ust.hk;~Xiaozhe_Yao1;yangzho6@andrew.cmu.edu;~Beidi_Chen1;~Binhang_Yuan1", "gender": "M;;M;;F;M", "homepage": "https://youhe-jiang.github.io/;;https://about.yao.sh;;https://www.andrew.cmu.edu/user/beidic/;https://binhangyuan.github.io/site/", "dblp": ";;212/8935;;192/1339;141/0690.html", "google_scholar": ";;;;;TflKxcIAAAAJ", "orcid": ";;;;;0000-0002-3188-2769", "linkedin": ";;;;;", "or_profile": "~YOUHE_JIANG1;ryanaf@connect.ust.hk;~Xiaozhe_Yao1;yangzho6@andrew.cmu.edu;~Beidi_Chen1;~Binhang_Yuan1", "aff": "University of Cambridge;;Department of Computer Science, ETHZ - ETH Zurich;;Meta Facebook;Hong Kong University of Science and Technology", "aff_domain": "cam.ac.uk;;inf.ethz.ch;;fb.com;ust.hk", "position": "PhD student;;PhD student;;Researcher;Assistant Professor", "bibtex": "@inproceedings{\njiang2024hexgen,\ntitle={HexGen: Generative Inference of Large Language Model over Heterogeneous Environment},\nauthor={YOUHE JIANG and Ran Yan and Xiaozhe Yao and Yang Zhou and Beidi Chen and Binhang Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9ANyvRtFGa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 585623, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7711790134571266919&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "cam.ac.uk;;inf.ethz.ch;;fb.com;ust.hk", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Cambridge;ETH Zurich;Meta;Hong Kong University of Science and Technology", "aff_unique_dep": ";Department of Computer Science;Meta Platforms, Inc.;", "aff_unique_url": "https://www.cam.ac.uk;https://www.ethz.ch;https://meta.com;https://www.ust.hk", "aff_unique_abbr": "Cambridge;ETHZ;Meta;HKUST", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "Cambridge;Zurich;;Hong Kong SAR", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "United Kingdom;Switzerland;United States;China" }, { "title": "ConvNet vs Transformer, Supervised vs CLIP: Beyond ImageNet Accuracy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34818", "id": "9BGi9PEhNn", "proceeding": "https://proceedings.mlr.press/v235/vishniakov24a.html", "pdf": "https://openreview.net/pdf?id=9BGi9PEhNn", "openreview": "https://openreview.net/forum?id=9BGi9PEhNn", "author_site": "Kirill Vishniakov, Zhiqiang Shen, Zhuang Liu", "tldr": "", "abstract": "Modern computer vision offers a great variety of models to practitioners, and selecting a model from multiple options for specific applications can be challenging. Conventionally, competing model architectures and training protocols are compared by their classification accuracy on ImageNet. However, this single metric does not fully capture performance nuances critical for specialized tasks. 
In this work, we conduct an in-depth comparative analysis of model behaviors beyond ImageNet accuracy, for both ConvNet and Vision Transformer architectures, each across supervised and CLIP training paradigms. Although our selected models have similar ImageNet accuracies and compute requirements, we find that they differ in many other aspects: types of mistakes, output calibration, transferability, and feature invariance, among others. This diversity in model characteristics, not captured by traditional metrics, highlights the need for more nuanced analysis when choosing among different models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kirill Vishniakov;Zhiqiang Shen;Zhuang Liu", "authorids": "~Kirill_Vishniakov1;~Zhiqiang_Shen1;~Zhuang_Liu1", "gender": "M;;", "homepage": "https://kirill-vish.github.io/;;", "dblp": ";;", "google_scholar": "H02tLFMAAAAJ;;", "orcid": ";;", "linkedin": "https://linkedin.com/in/kirill-vishniakov-605aa0142;;", "or_profile": "~Kirill_Vishniakov1;~Zhiqiang_Shen1;~Zhuang_Liu1", "aff": "M42;;", "aff_domain": "m42.ae;;", "position": "Researcher;;", "bibtex": "@inproceedings{\nvishniakov2024convnet,\ntitle={ConvNet vs Transformer, Supervised vs {CLIP}: Beyond ImageNet Accuracy},\nauthor={Kirill Vishniakov and Zhiqiang Shen and Zhuang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9BGi9PEhNn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1848115, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17098610418011802232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "m42.ae;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "M42", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "" }, { "title": "A Contextual Combinatorial Bandit Approach to Negotiation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34817", "id": "9BWRs6XF8P", "proceeding": "https://proceedings.mlr.press/v235/li24az.html", "pdf": "https://openreview.net/pdf?id=9BWRs6XF8P", "openreview": "https://openreview.net/forum?id=9BWRs6XF8P", "author_site": "Yexin Li, Zhancun Mu, Siyuan Qi", "tldr": "", "abstract": "Learning effective negotiation strategies poses two key challenges: the exploration-exploitation dilemma and dealing with large action spaces. However, there is an absence of learning-based approaches that effectively address these challenges in negotiation. This paper introduces a comprehensive formulation to tackle various negotiation problems. Our approach leverages contextual combinatorial multi-armed bandits, with the bandits resolving the exploration-exploitation dilemma, and the combinatorial nature handles large action spaces. Building upon this formulation, we introduce NegUCB, a novel method that also handles common issues such as partial observations and complex reward functions in negotiation. NegUCB is contextual and tailored for full-bandit feedback without constraints on the reward functions. Under mild assumptions, it ensures a sub-linear regret upper bound. 
Experiments conducted on three negotiation tasks demonstrate the superiority of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yexin Li;Zhancun Mu;Siyuan Qi", "authorids": "~Yexin_Li1;~Zhancun_Mu1;~Siyuan_Qi1", "gender": "F;M;", "homepage": "https://liyexn.github.io/liyexin.github.io/;https://muzhancun.github.io;", "dblp": "176/1477;381/4972;177/5178", "google_scholar": "https://scholar.google.com/citations?hl=en;mwN8K4IAAAAJ;ePclJR4AAAAJ", "orcid": ";;0000-0002-4070-733X", "linkedin": ";;", "or_profile": "~Yexin_Li1;~Zhancun_Mu1;~Siyuan_Qi1", "aff": "State Key Laboratory of General Artificial Intelligence, BIGAI;Peking University;Beijing Institute for General Artificial Intelligence", "aff_domain": "bigai.ai;pku.edu.cn;bigai.ai", "position": "Researcher;Undergrad student;Researcher", "bibtex": "@inproceedings{\nli2024a,\ntitle={A Contextual Combinatorial Bandit Approach to Negotiation},\nauthor={Yexin Li and Zhancun Mu and Siyuan Qi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9BWRs6XF8P}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1812511, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4132737966711970056&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "bigai.ai;pku.edu.cn;bigai.ai", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "State Key Laboratory of General Artificial Intelligence;Peking University;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": "General Artificial Intelligence;;", "aff_unique_url": ";http://www.pku.edu.cn;http://www.bigaiai.org/", "aff_unique_abbr": "SKLGA;Peking U;BIGAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "QuIP$\\#$: Even Better LLM Quantization with Hadamard Incoherence and Lattice Codebooks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34816", "id": "9BrydUVcoe", "proceeding": "https://proceedings.mlr.press/v235/tseng24a.html", "pdf": "https://openreview.net/pdf?id=9BrydUVcoe", "openreview": "https://openreview.net/forum?id=9BrydUVcoe", "author_site": "Albert Tseng, Jerry Chee, Qingyao Sun, Volodymyr Kuleshov, Chris De Sa", "tldr": "", "abstract": "Post-training quantization (PTQ) reduces the memory footprint of LLMs by quantizing their weights to low-precision. In this work, we introduce QuIP#, a weight-only PTQ method that achieves state-of-the-art results in extreme compression regimes ($\\le$ 4 bits per weight) using three novel techniques. First, QuIP# improves QuIP's (Chee et al., 2023) incoherence processing by using the randomized Hadamard transform, which is faster and has better theoretical properties. Second, QuIP# uses vector quantization to take advantage of the ball-shaped sub-Gaussian distribution that incoherent weights possess: specifically, we introduce a set of hardware-efficient codebooks based on the highly symmetric $E_8$ lattice, which achieves the optimal 8-dimension unit ball packing. Third, QuIP# uses fine-tuning to improve fidelity to the original model. Our experiments show that QuIP# outperforms existing PTQ methods, enables new behaviors in PTQ scaling, and supports fast inference. 
Our code can be found at https://github.com/Cornell-RelaxML/quip-sharp.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Albert Tseng;Jerry Chee;Qingyao Sun;Volodymyr Kuleshov;Christopher De Sa", "authorids": "~Albert_Tseng1;~Jerry_Chee1;~Qingyao_Sun1;~Volodymyr_Kuleshov1;~Christopher_De_Sa2", "gender": ";;;;M", "homepage": "https://tsengalb99.github.io/;http://jerry-chee.github.io/;https://nalzok.github.io/;https://www.cs.cornell.edu/~kuleshov/;http://cs.cornell.edu/~cdesa", "dblp": "249/9439;207/8369;271/4259;81/8612;154/6336", "google_scholar": ";qyQpUAkAAAAJ;;RY_t8XAAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Albert_Tseng1;~Jerry_Chee1;~Qingyao_Sun1;~Volodymyr_Kuleshov1;~Christopher_De_Sa1", "aff": "Cornell University;Cornell University;Cornell University;Cornell University;Cornell University", "aff_domain": "cs.cornell.edu;cornell.edu;cornell.edu;cornell.edu;cornell.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ntseng2024quip,\ntitle={Qu{IP}\\${\\textbackslash}\\#\\$: Even Better {LLM} Quantization with Hadamard Incoherence and Lattice Codebooks},\nauthor={Albert Tseng and Jerry Chee and Qingyao Sun and Volodymyr Kuleshov and Christopher De Sa},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9BrydUVcoe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1149685, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9428905470797871609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cs.cornell.edu;cornell.edu;cornell.edu;cornell.edu;cornell.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Loss Shaping Constraints for Long-Term Time Series Forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34815", "id": "9CCoVyFuEp", "proceeding": "https://proceedings.mlr.press/v235/hounie24a.html", "pdf": "https://openreview.net/pdf?id=9CCoVyFuEp", "openreview": "https://openreview.net/forum?id=9CCoVyFuEp", "author_site": "Ignacio Hounie, Javier Porras-Valenzuela, Alejandro Ribeiro", "tldr": "", "abstract": "Several applications in time series forecasting require predicting multiple steps ahead. Despite the vast amount of literature on the topic, both classical and recent deep learning based approaches have mostly focused on minimising performance averaged over the predicted window. We observe that this can lead to disparate distributions of errors across forecasting steps, especially for recent transformer architectures trained on popular forecasting benchmarks. That is, optimising performance on average can lead to undesirably large errors at specific time-steps. In this work, we present a Constrained Learning approach for long-term time series forecasting that aims to find the best model in terms of average performance that respects a user-defined upper bound on the loss at each time-step.
We call our approach loss shaping constraints because it imposes constraints on the loss at each time step, and we leverage recent duality results to show that, despite its non-convexity, the resulting problem has a bounded duality gap. We propose a practical primal-dual algorithm to tackle it, and demonstrate that the proposed approach exhibits competitive average performance in time series forecasting benchmarks, while shaping the distribution of errors across the predicted window.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ignacio Hounie;Javier Porras-Valenzuela;Alejandro Ribeiro", "authorids": "~Ignacio_Hounie1;~Javier_Porras-Valenzuela1;~Alejandro_Ribeiro1", "gender": ";M;M", "homepage": ";https://www.linkedin.com/in/javier-porras-valenzuela/;https://alelab.seas.upenn.edu", "dblp": ";;32/15", "google_scholar": "V0h3OSYAAAAJ;;7mrPM4kAAAAJ", "orcid": ";;0000-0003-4230-9906", "linkedin": ";javier-porras-valenzuela/;", "or_profile": "~Ignacio_Hounie1;~Javier_Porras-Valenzuela1;~Alejandro_Ribeiro1", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;seas.upenn.edu;upenn.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nhounie2024loss,\ntitle={Loss Shaping Constraints for Long-Term Time Series Forecasting},\nauthor={Ignacio Hounie and Javier Porras-Valenzuela and Alejandro Ribeiro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9CCoVyFuEp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4142196, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2327243573422143670&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "upenn.edu;seas.upenn.edu;upenn.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "EvIL: Evolution Strategies for Generalisable Imitation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34814", "id": "9DMMvMTDur", "proceeding": "https://proceedings.mlr.press/v235/sapora24a.html", "pdf": "https://openreview.net/pdf?id=9DMMvMTDur", "openreview": "https://openreview.net/forum?id=9DMMvMTDur", "author_site": "Silvia Sapora, Gokul Swamy, Christopher Lu, Yee-Whye Teh, Jakob Foerster", "tldr": "", "abstract": "Oftentimes in imitation learning (IL), the environment we collect expert demonstrations in and the environment we want to deploy our learned policy in aren't exactly the same (e.g. demonstrations collected in simulation but deployment in the real world). Compared to policy-centric approaches to IL like behavioural cloning, reward-centric approaches like *inverse reinforcement learning* (IRL) often better replicate expert behaviour in new environments. This transfer is usually performed by optimising the recovered reward under the dynamics of the target environment. However, *(a)* we find that modern deep IL algorithms frequently recover rewards which induce policies far weaker than the expert, *even in the same environment the demonstrations were collected in*.
Furthermore, *(b)* these rewards are often quite poorly shaped, necessitating extensive environment interaction to optimise effectively. We provide simple and scalable fixes to both of these concerns. For *(a)*, we find that *reward model ensembles* combined with a slightly different training objective significantly improve re-training and transfer performance. For *(b)*, we propose a novel *evolution-strategies* based method (EvIL) to optimise for a reward-shaping term that speeds up re-training in the target environment, closing a gap left open by the classical theory of IRL. On a suite of continuous control tasks, we are able to re-train policies in target (and source) environments more interaction-efficiently than prior work.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Silvia Sapora;Gokul Swamy;Chris Lu;Yee Whye Teh;Jakob Nicolaus Foerster", "authorids": "~Silvia_Sapora1;~Gokul_Swamy1;~Chris_Lu1;~Yee_Whye_Teh2;~Jakob_Nicolaus_Foerster1", "gender": "F;;;M;M", "homepage": ";https://gokul.dev/;;https://www.jakobfoerster.com;http://csml.stats.ox.ac.uk/people/teh/", "dblp": ";31/11509;77/9579;176/5095;88/2483", "google_scholar": "FxdgVLkAAAAJ;Sbpra_AAAAAJ;4WLoIRsAAAAJ;6z4lQzMAAAAJ;https://scholar.google.co.uk/citations?user=y-nUzMwAAAAJ", "orcid": ";;;;", "linkedin": "silvia-sapora/?originalSubdomain=uk;;;;", "or_profile": "~Silvia_Sapora1;~Gokul_Swamy1;~Chris_Lu1;~Jakob_Nicolaus_Foerster1;~Yee_Whye_Teh1", "aff": "University of Oxford;Carnegie Mellon University;University of Oxford;University of Oxford, University of Oxford;University of Oxford", "aff_domain": "oxford.ac.uk;cmu.edu;ox.ac.uk;eng.ox.ac.uk;ox.ac.uk", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nsapora2024evil,\ntitle={Ev{IL}: Evolution Strategies for Generalisable Imitation Learning},\nauthor={Silvia Sapora and Gokul Swamy and Chris Lu and Yee Whye Teh and Jakob Nicolaus Foerster},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9DMMvMTDur}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3305572, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14464008521554697116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "oxford.ac.uk;cmu.edu;ox.ac.uk;eng.ox.ac.uk;ox.ac.uk", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Oxford;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.cmu.edu", "aff_unique_abbr": "Oxford;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "In-context Convergence of Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34813", "id": "9GLvXGkUE2", "proceeding": "https://proceedings.mlr.press/v235/huang24d.html", "pdf": "https://openreview.net/pdf?id=9GLvXGkUE2", "openreview": "https://openreview.net/forum?id=9GLvXGkUE2", "author_site": "Yu Huang, Yuan Cheng, Yingbin LIANG", "tldr": "", "abstract": "Transformers have recently revolutionized many domains in modern machine learning and one salient discovery is their remarkable in-context learning capability, where models can solve an unseen task by utilizing task-specific prompts without further parameter
fine-tuning. This also inspired recent theoretical studies aiming to understand the in-context learning mechanism of transformers, which, however, focused only on *linear* transformers. In this work, we take the first step toward studying the learning dynamics of a one-layer transformer with *softmax* attention trained via gradient descent in order to in-context learn linear function classes. We consider a structured data model, where each token is randomly sampled from a set of feature vectors in either a balanced or imbalanced fashion. For data with balanced features, we establish the finite-time convergence guarantee with near-zero prediction error by navigating our analysis over two phases of the training dynamics of the attention map. More notably, for data with imbalanced features, we show that the learning dynamics follow a stage-wise convergence process, where the transformer first converges to a near-zero prediction error for the query tokens of dominant features, and then converges later to a near-zero error for query tokens of under-represented features, via one and four training phases. Our proof features new techniques for analyzing the competing strengths of two types of attention weights, the change of which determines different training phases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Huang;Yuan Cheng;Yingbin Liang", "authorids": "~Yu_Huang3;~Yuan_Cheng6;~Yingbin_Liang1", "gender": "F;;F", "homepage": "https://yuhuang42.org/;;https://sites.google.com/view/yingbinliang/home", "dblp": "39/6301-23;;51/332", "google_scholar": ";5v47GU0AAAAJ;lGgLAiIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yu_Huang3;~Yuan_Cheng6;~Yingbin_Liang1", "aff": "The Wharton School, University of Pennsylvania;National University of Singapore;The Ohio State University", "aff_domain": "wharton.upenn.edu;u.nus.edu;osu.edu", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nhuang2024incontext,\ntitle={In-context Convergence of Transformers},\nauthor={Yu Huang and Yuan Cheng and Yingbin Liang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9GLvXGkUE2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1610057, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9211398343281116288&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "wharton.upenn.edu;u.nus.edu;osu.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Pennsylvania;National University of Singapore;Ohio State University", "aff_unique_dep": "The Wharton School;;", "aff_unique_url": "https://www.wharton.upenn.edu;https://www.nus.edu.sg;https://www.osu.edu", "aff_unique_abbr": "UPenn Wharton;NUS;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Singapore" }, { "title": "REST: Efficient and Accelerated EEG Seizure Analysis through Residual State Updates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34812", "id": "9GbAea74O6", "proceeding": "https://proceedings.mlr.press/v235/afzal24a.html", "pdf": "https://openreview.net/pdf?id=9GbAea74O6", "openreview": "https://openreview.net/forum?id=9GbAea74O6", "author_site": "Arshia Afzal, Grigorios Chrysos, Volkan Cevher, Mahsa Shoaran", "tldr": "", "abstract": "EEG-based seizure
detection models face challenges in terms of inference speed and memory efficiency, limiting their real-time implementation in clinical devices. This paper introduces a novel graph-based residual state update mechanism (REST) for real-time EEG signal analysis in applications such as epileptic seizure detection. By leveraging a combination of graph neural networks and recurrent structures, REST efficiently captures both non-Euclidean geometry and temporal dependencies within EEG data. Our model demonstrates high accuracy in both seizure detection and classification tasks. Notably, REST achieves a remarkable 9-fold acceleration in inference speed compared to state-of-the-art models, while simultaneously demanding substantially less memory than the smallest model employed for this task. These attributes position REST as a promising candidate for real-time implementation in clinical devices, such as Responsive Neurostimulation or seizure alert systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arshia Afzal;Grigorios Chrysos;Volkan Cevher;Mahsa Shoaran", "authorids": "~Arshia_Afzal1;~Grigorios_Chrysos1;~Volkan_Cevher1;~Mahsa_Shoaran1", "gender": "M;M;M;F", "homepage": "https://arshiaafzal.github.io/;https://grigorisg9gr.github.io/;http://lions.epfl.ch;https://www.epfl.ch/labs/inl/", "dblp": "317/7113;75/6117-2;70/5301;133/3958", "google_scholar": "https://scholar.google.com/citations?hl=en;1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ;9tu1zw4AAAAJ", "orcid": ";;;0000-0002-6426-4799", "linkedin": ";;;mahsa-shoaran-17588b9b/", "or_profile": "~Arshia_Afzal1;~Grigorios_Chrysos1;~Volkan_Cevher1;~Mahsa_Shoaran1", "aff": "EPFL - EPF Lausanne;University of Wisconsin - Madison;Amazon Development Center Germany;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;wisc.edu;amazon.de;epfl.ch", "position": "PhD student;Assistant Professor;Amazon Scholar;Assistant Professor", "bibtex": "@inproceedings{\nafzal2024rest,\ntitle={{REST}: Efficient and Accelerated {EEG} Seizure Analysis through Residual State Updates},\nauthor={Arshia Afzal and Grigorios Chrysos and Volkan Cevher and Mahsa Shoaran},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9GbAea74O6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8304251, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9766180330375668697&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "epfl.ch;wisc.edu;amazon.de;epfl.ch", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "EPFL;University of Wisconsin-Madison;Amazon;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";;Development Center;", "aff_unique_url": "https://www.epfl.ch;https://www.wisc.edu;https://www.amazon.de;https://www.epfl.ch", "aff_unique_abbr": "EPFL;UW-Madison;Amazon;EPFL", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Lausanne;Madison;", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Switzerland;United States;Germany" }, { "title": "Converting Transformers to Polynomial Form for Secure Inference Over Homomorphic Encryption", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34811", "id": "9HPoJ6ulgV", "proceeding": "https://proceedings.mlr.press/v235/zimerman24a.html", "pdf": "https://openreview.net/pdf?id=9HPoJ6ulgV", "openreview": 
"https://openreview.net/forum?id=9HPoJ6ulgV", "author_site": "Itamar Zimerman, Moran Baruch, Nir Drucker, Gilad Ezov, Omri Soceanu, Lior Wolf", "tldr": "", "abstract": "Designing privacy-preserving DL solutions is a major challenge within the AI community. Homomorphic Encryption (HE) has emerged as one of the most promising approaches in this realm, enabling the decoupling of knowledge between a model owner and a data owner. Despite extensive research and application of this technology, primarily in CNNs, applying HE on transformer models has been challenging because of the difficulties in converting these models into a polynomial form. We break new ground by introducing the first polynomial transformer, providing the first demonstration of secure inference over HE with full transformers. This includes a transformer architecture tailored for HE, alongside a novel method for converting operators to their polynomial equivalent. This innovation enables us to perform secure inference on LMs and ViTs with several datasts and tasks. Our techniques yield results comparable to traditional models, bridging the performance gap with transformers of similar scale and underscoring the viability of HE for state-of-the-art applications. Finally, we assess the stability of our models and conduct a series of ablations to quantify the contribution of each model component. Our code is publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Itamar Zimerman;Moran Baruch;Nir Drucker;Gilad Ezov;Omri Soceanu;Lior Wolf", "authorids": "~Itamar_Zimerman1;~Moran_Baruch1;~Nir_Drucker1;~Gilad_Ezov1;~Omri_Soceanu1;~Lior_Wolf1", "gender": "M;F;M;M;;M", "homepage": ";;https://sites.google.com/view/druckernir;;;http://www.cs.tau.ac.il/~wolf", "dblp": "294/8621;215/3832.html;179/7421;268/5905.html;;83/4103", "google_scholar": "01s_DpwAAAAJ;GJmx_7kAAAAJv;https://scholar.google.co.il/citations?user=QcbC7mwAAAAJ;;;UbFrXTsAAAAJ", "orcid": "0000-0001-8321-0609;;0000-0002-7273-4797;0000-0003-4579-8127;;0000-0001-5578-8892", "linkedin": ";;drucker-nir/;;;", "or_profile": "~Itamar_Zimerman1;~Moran_Baruch1;~Nir_Drucker1;~Gilad_Ezov1;~Omri_Soceanu1;~Lior_Wolf1", "aff": "International Business Machines;International Business Machines;International Business Machines;International Business Machines;;Tel Aviv University", "aff_domain": "ibm.com;ibm.com;ibm.com;ibm.com;;tau.ac.il", "position": "Researcher;Researcher;IBM Research, Israel;Researcher;;Full Professor", "bibtex": "@inproceedings{\nzimerman2024converting,\ntitle={Converting Transformers to Polynomial Form for Secure Inference Over Homomorphic Encryption},\nauthor={Itamar Zimerman and Moran Baruch and Nir Drucker and Gilad Ezov and Omri Soceanu and Lior Wolf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9HPoJ6ulgV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6122363, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1160185404276785322&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "ibm.com;ibm.com;ibm.com;ibm.com;;tau.ac.il", "author_num": 6, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "International Business Machines Corporation;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.tau.ac.il", "aff_unique_abbr": "IBM;TAU", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;Israel" }, { "title": "Open-Domain Text Evaluation via Contrastive Distribution Methods", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34810", "id": "9HdQr68Zyl", "proceeding": "https://proceedings.mlr.press/v235/lu24f.html", "pdf": "https://openreview.net/pdf?id=9HdQr68Zyl", "openreview": "https://openreview.net/forum?id=9HdQr68Zyl", "author_site": "Sidi Lu, Hongyi Liu, Asli Celikyilmaz, Tianlu Wang, Nanyun Peng", "tldr": "", "abstract": "Recent advancements in open-domain text generation, driven by the power of large pre-trained language models (LLMs), have demonstrated remarkable performance. However, assessing these models' generation quality remains a challenge. In this paper, we introduce a novel method for evaluating open-domain text generation called Contrastive Distribution Methods (CDM). Leveraging the connection between increasing model parameters and enhanced LLM performance, CDM creates a mapping from the _contrast_ of two probabilistic distributions -- one known to be superior to the other -- to quality measures. We investigate CDM for open-domain text generation evaluation under two paradigms: 1) _Generative_ CDM, which harnesses the contrast of two language models' distributions to generate synthetic examples for training discriminator-based metrics; 2) _Discriminative_ CDM, which directly uses distribution disparities between two language models for evaluation. Our experiments on coherence evaluation for multi-turn dialogue and commonsense evaluation for controllable generation demonstrate CDM's superior correlate with human judgment than existing automatic evaluation metrics, highlighting the strong performance and generalizability of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sidi Lu;Hongyi Liu;Asli Celikyilmaz;Tianlu Wang;Nanyun Peng", "authorids": "~Sidi_Lu1;~Hongyi_Liu4;~Asli_Celikyilmaz1;~Tianlu_Wang1;~Nanyun_Peng1", "gender": "M;M;F;F;F", "homepage": "https://sidilu.cn;;https://asli.us;https://tianlu-wang.github.io/;https://violetpeng.github.io/", "dblp": "206/6156;45/4076-8;15/3724;185/5529;117/4036", "google_scholar": "KHMrrfgAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;inzQqX8AAAAJ;XxRXvX0AAAAJ", "orcid": ";0009-0001-0810-474X;;;", "linkedin": ";;aslicelikyilmaz/;;", "or_profile": "~Sidi_Lu1;~Hongyi_Liu4;~Asli_Celikyilmaz1;~Tianlu_Wang1;~Nanyun_Peng1", "aff": "University of California, Los Angeles;Shanghai Jiaotong University;FAIR ;Meta;University of California, Los Angeles", "aff_domain": "ucla.edu;sjtu.edu.cn;meta.com;meta.com;ucla.edu", "position": "PhD student;Undergrad student;Principal Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nlu2024opendomain,\ntitle={Open-Domain Text Evaluation via Contrastive Distribution Methods},\nauthor={Sidi Lu and Hongyi Liu and Asli Celikyilmaz and Tianlu Wang and Nanyun Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9HdQr68Zyl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2290337, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QMMtRiMq3esJ:scholar.google.com/&scioq=Open-Domain+Text+Evaluation+via+Contrastive+Distribution+Methods&hl=en&as_sdt=0,5", "gs_version_total": 
7, "email": "ucla.edu;sjtu.edu.cn;meta.com;meta.com;ucla.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of California, Los Angeles;Shanghai Jiao Tong University;Meta", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.ucla.edu;https://www.sjtu.edu.cn;https://research.facebook.com", "aff_unique_abbr": "UCLA;SJTU;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Tilting the Odds at the Lottery: the Interplay of Overparameterisation and Curricula in Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34809", "id": "9L7BZiTtJR", "proceeding": "https://proceedings.mlr.press/v235/mannelli24a.html", "pdf": "https://openreview.net/pdf?id=9L7BZiTtJR", "openreview": "https://openreview.net/forum?id=9L7BZiTtJR", "author_site": "Stefano Mannelli, Yaraslau Ivashynka, Andrew Saxe, Luca Saglietti", "tldr": "", "abstract": "A wide range of empirical and theoretical works have shown that overparameterisation can amplify the performance of neural networks. According to the lottery ticket hypothesis, overparameterised networks have an increased chance of containing a sub-network that is well-initialised to solve the task at hand. A more parsimonious approach, inspired by animal learning, consists in guiding the learner towards solving the task by curating the order of the examples, ie. providing a curriculum. However, this learning strategy seems to be hardly beneficial in deep learning applications. In this work, we propose a theoretical analysis that connects curriculum learning and overparameterisation. In particular, we investigate their interplay in the online learning setting for a 2-layer network in the XOR-like Gaussian Mixture problem. 
Our results show that a high degree of overparameterisation---while simplifying the problem---can limit the benefit from curricula, providing a theoretical account of the ineffectiveness of curricula in deep learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stefano Sarao Mannelli;Yaraslau Ivashynka;Andrew M Saxe;Luca Saglietti", "authorids": "~Stefano_Sarao_Mannelli1;yaraslau.ivashynka@studbocconi.it;~Andrew_M_Saxe1;~Luca_Saglietti1", "gender": "M;;M;M", "homepage": "https://stefsmlab.github.io/;;https://www.saxelab.org;", "dblp": "232/3343;;39/6894;180/5743", "google_scholar": "https://scholar.google.it/citations?user=Kq272_MAAAAJ;;h0Al1fcAAAAJ;klxwxyUAAAAJ", "orcid": ";;0000-0002-9831-8812;", "linkedin": ";;;luca-saglietti-325208169/", "or_profile": "~Stefano_Sarao_Mannelli1;yaraslau.ivashynka@studbocconi.it;~Andrew_M_Saxe1;~Luca_Saglietti1", "aff": "University College London;;University College London, University of London;Bocconi University", "aff_domain": "ucl.ac.uk;;ucl.ac.uk;unibocconi.it", "position": "Postdoc;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nmannelli2024tilting,\ntitle={Tilting the Odds at the Lottery: the Interplay of Overparameterisation and Curricula in Neural Networks},\nauthor={Stefano Sarao Mannelli and Yaraslau Ivashynka and Andrew M Saxe and Luca Saglietti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9L7BZiTtJR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9782948, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1191526487338525807&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "email": "ucl.ac.uk;;ucl.ac.uk;unibocconi.it", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University College London;Bocconi University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucl.ac.uk;https://www.bocconi.edu", "aff_unique_abbr": "UCL;Bocconi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;Italy" }, { "title": "Accelerating Convergence in Bayesian Few-Shot Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34808", "id": "9PQnc6EWdL", "proceeding": "https://proceedings.mlr.press/v235/ke24a.html", "pdf": "https://openreview.net/pdf?id=9PQnc6EWdL", "openreview": "https://openreview.net/forum?id=9PQnc6EWdL", "author_site": "Tianjun Ke, Haoqun Cao, Feng Zhou", "tldr": "", "abstract": "Bayesian few-shot classification has been a focal point in the field of few-shot learning. This paper seamlessly integrates mirror descent-based variational inference into Gaussian process-based few-shot classification, addressing the challenge of non-conjugate inference. By leveraging non-Euclidean geometry, mirror descent achieves accelerated convergence by providing the steepest descent direction along the corresponding manifold. It also exhibits the parameterization invariance property concerning the variational distribution. Experimental results demonstrate competitive classification accuracy, improved uncertainty quantification, and faster convergence compared to baseline models. Additionally, we investigate the impact of hyperparameters and components. 
Code is publicly available at https://github.com/keanson/MD-BSFC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianjun Ke;Haoqun Cao;Feng Zhou", "authorids": "~Tianjun_Ke1;~Haoqun_Cao1;~Feng_Zhou9", "gender": ";M;", "homepage": "https://keanson.github.io/;https://kencao2007.github.io/;", "dblp": ";;", "google_scholar": "https://scholar.google.com/citations?hl=en;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tianjun_Ke1;~Haoqun_Cao1;~Feng_Zhou9", "aff": "School of Statistics, Renmin University of China;Renmin University of China;", "aff_domain": "stat.ruc.edu.cn;ruc.edu.cn;", "position": "Undergrad student;Undergrad student;", "bibtex": "@inproceedings{\nke2024accelerating,\ntitle={Accelerating Convergence in Bayesian Few-Shot Classification},\nauthor={Tianjun Ke and Haoqun Cao and Feng Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9PQnc6EWdL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 851914, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aSicYkK2wCIJ:scholar.google.com/&scioq=Accelerating+Convergence+in+Bayesian+Few-Shot+Classification&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "stat.ruc.edu.cn;ruc.edu.cn;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "School of Statistics", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Centralized Selection with Preferences in the Presence of Biases", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34807", "id": "9QRcp2ubDt", "proceeding": "https://proceedings.mlr.press/v235/celis24a.html", "pdf": "https://openreview.net/pdf?id=9QRcp2ubDt", "openreview": "https://openreview.net/forum?id=9QRcp2ubDt", "author_site": "L. Elisa Celis, Amit Kumar, Nisheeth K. Vishnoi, Shangyu Andrew Xu", "tldr": "", "abstract": "This paper considers the scenario in which there are multiple institutions, each with a limited capacity for candidates, and candidates, each with preferences over the institutions. A central entity evaluates the utility of each candidate to the institutions, and the goal is to select candidates for each institution in a way that maximizes utility while also considering the candidates' preferences. The paper focuses on the setting in which candidates are divided into multiple groups and the observed utilities of candidates in some groups are biased--systematically lower than their true utilities. The first result is that, in these biased settings, prior algorithms can lead to selections with sub-optimal true utility and significant discrepancies in the fraction of candidates from each group that get their preferred choices. Subsequently, an algorithm is presented along with proof that it produces selections that achieve near-optimal group fairness with respect to preferences while also nearly maximizing the true utility under distributional assumptions. Further, extensive empirical validation of these results in real-world and synthetic settings, in which the distributional assumptions may not hold, is presented.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "L. Elisa Celis;Amit Kumar;Nisheeth K.
Vishnoi;Andrew Xu", "authorids": "~L._Elisa_Celis2;~Amit_Kumar7;~Nisheeth_K._Vishnoi2;andrew.xu@yale.edu", "gender": ";M;;", "homepage": ";http://www.cse.iitd.ac.in/~amitk/;;", "dblp": ";k/AmitKumar1.html;;", "google_scholar": ";https://scholar.google.co.in/citations?user=Qu97aMEAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~L._Elisa_Celis2;~Amit_Kumar7;~Nisheeth_K._Vishnoi2;andrew.xu@yale.edu", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ncelis2024centralized,\ntitle={Centralized Selection with Preferences in the Presence of Biases},\nauthor={L. Elisa Celis and Amit Kumar and Nisheeth K. Vishnoi and Andrew Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9QRcp2ubDt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1391601, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17968642223643338670&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";;;", "author_num": 4 }, { "title": "SPP: Sparsity-Preserved Parameter-Efficient Fine-Tuning for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34806", "id": "9Rroj9GIOQ", "proceeding": "https://proceedings.mlr.press/v235/lu24p.html", "pdf": "https://openreview.net/pdf?id=9Rroj9GIOQ", "openreview": "https://openreview.net/forum?id=9Rroj9GIOQ", "author_site": "Xudong LU, Aojun Zhou, Yuhui Xu, Renrui Zhang, Peng Gao, Hongsheng Li", "tldr": "", "abstract": "Large Language Models (LLMs) have become pivotal in advancing the field of artificial intelligence, yet their immense sizes pose significant challenges for both fine-tuning and deployment. Current post-training pruning methods, while reducing the sizes of LLMs, often fail to maintain their original performance. To address these challenges, this paper introduces SPP, a **S**parsity-**P**reserved **P**arameter-efficient fine-tuning method. Different from existing post-training pruning approaches that struggle with performance retention, SPP proposes to employ lightweight learnable column and row matrices to optimize sparse LLM weights, *keeping the structure and sparsity of pruned pre-trained models intact*. By element-wise multiplication and residual addition, SPP ensures the consistency of model sparsity pattern and ratio during both training and weight-merging processes. We demonstrate the effectiveness of SPP by applying it to the LLaMA and LLaMA-2 model families with recent post-training pruning methods. Our results show that SPP significantly enhances the performance of models with different sparsity patterns (i.e. unstructured and N:M sparsity), especially for those with high sparsity ratios (e.g. 75%), making it a promising solution for the efficient fine-tuning of sparse LLMs. 
Code will be made available at https://github.com/Lucky-Lance/SPP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xudong Lu;Aojun Zhou;Yuhui Xu;Renrui Zhang;Peng Gao;Hongsheng Li", "authorids": "~Xudong_Lu1;~Aojun_Zhou2;~Yuhui_Xu2;~Renrui_Zhang1;~Peng_Gao3;~Hongsheng_Li3", "gender": "M;M;M;M;M;M", "homepage": "https://lucky-lance.github.io/;https://yuhuixu1993.github.io/;;http://www.ee.cuhk.edu.hk/~hsli;;", "dblp": ";;244/1748;27/7402-1;195/6034;", "google_scholar": "G9jWIggAAAAJ;https://scholar.google.com.hk/citations?user=42DgoIMAAAAJ;YlL3xN4AAAAJ;BN2Ze-QAAAAJ;cC8lXi8AAAAJ;miFIAFMAAAAJ", "orcid": "0009-0007-1699-6286;;;;;", "linkedin": ";;;;;", "or_profile": "~Xudong_Lu1;~Yuhui_Xu2;~Renrui_Zhang1;~Hongsheng_Li3;~Aojun_Zhou3;~Gao_Peng1", "aff": "The Chinese University of Hong Kong;SalesForce.com;MMLab of CUHK & Shanghai AI Laboratory;The Chinese University of Hong Kong;The Chinese University of Hong Kong;shanghai ai lab ", "aff_domain": "cuhk.edu.hk;salesforce.com;pjlab.org.cn;cuhk.edu.hk;cuhk.edu.hk;pjlab.org.cn", "position": "PhD student;Research Scientist;PhD student;Associate Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nlu2024spp,\ntitle={{SPP}: Sparsity-Preserved Parameter-Efficient Fine-Tuning for Large Language Models},\nauthor={Xudong Lu and Aojun Zhou and Yuhui Xu and Renrui Zhang and Peng Gao and Hongsheng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9Rroj9GIOQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 573062, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7953300488006484010&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cuhk.edu.hk;salesforce.com;pjlab.org.cn;cuhk.edu.hk;cuhk.edu.hk;pjlab.org.cn", "author_num": 6, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "Chinese University of Hong Kong;Salesforce;Shanghai AI Lab", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.salesforce.com;https://www.shanghaiailab.com", "aff_unique_abbr": "CUHK;Salesforce;Shanghai AI Lab", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Seizing Serendipity: Exploiting the Value of Past Success in Off-Policy Actor-Critic", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34805", "id": "9Tq4L3Go9f", "proceeding": "https://proceedings.mlr.press/v235/ji24d.html", "pdf": "https://openreview.net/pdf?id=9Tq4L3Go9f", "openreview": "https://openreview.net/forum?id=9Tq4L3Go9f", "author_site": "Tianying Ji, Yu Luo, Fuchun Sun, Xianyuan Zhan, Jianwei Zhang, Huazhe Xu", "tldr": "", "abstract": "Learning high-quality $Q$-value functions plays a key role in the success of many modern off-policy deep reinforcement learning (RL) algorithms. Previous works primarily focus on addressing the value overestimation issue, an outcome of adopting function approximators and off-policy learning. Deviating from the common viewpoint, we observe that $Q$-values are often underestimated in the latter stage of the RL training process, potentially hindering policy learning and reducing sample efficiency. 
We find that such a long-neglected phenomenon is often related to the use of inferior actions from the current policy in Bellman updates as compared to the more optimal action samples in the replay buffer. We propose the Blended Exploitation and Exploration (BEE) operator, a simple yet effective approach that updates the $Q$-value using both historical best-performing actions and the current policy. Based on BEE, the resulting practical algorithm BAC outperforms state-of-the-art methods in **over 50** continuous control tasks and achieves strong performance in failure-prone scenarios and **real-world robot** tasks. Benchmark results and videos are available at https://jity16.github.io/BEE/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianying Ji;Yu Luo;Fuchun Sun;Xianyuan Zhan;Jianwei Zhang;Huazhe Xu", "authorids": "~Tianying_Ji2;~Yu_Luo5;~Fuchun_Sun1;~Xianyuan_Zhan1;~Jianwei_Zhang2;~Huazhe_Xu1", "gender": "F;M;M;M;M;M", "homepage": ";;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;http://zhanxianyuan.xyz/;https://tams.informatik.uni-hamburg.de/people/zhang/;http://hxu.rocks", "dblp": "124/2199.html;;;181/5081;z/JianweiZhang1;164/9006", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=KQjoQOMAAAAJ;;pDMnGloAAAAJ;;t9HPFawAAAAJ", "orcid": ";0000-0001-6229-4639;;0000-0002-3683-0554;;", "linkedin": ";;;;;", "or_profile": "~Tianying_Ji2;~Yu_Luo5;~Fuchun_Sun1;~Xianyuan_Zhan1;~Jianwei_Zhang2;~Huazhe_Xu1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Universit\u00e4t Hamburg;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;uni-hamburg.de;tsinghua.edu.cn", "position": "PhD student;PhD student;Full Professor;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nji2024seizing,\ntitle={Seizing Serendipity: Exploiting the Value of Past Success in Off-Policy Actor-Critic},\nauthor={Tianying Ji and Yu Luo and Fuchun Sun and Xianyuan Zhan and Jianwei Zhang and Huazhe Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9Tq4L3Go9f}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10120757, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9080183421535615976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;uni-hamburg.de;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Tsinghua University;University of Hamburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.uni-hamburg.de", "aff_unique_abbr": "THU;UHH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "China;Germany" }, { "title": "Adaptively Perturbed Mirror Descent for Learning in Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34804", "id": "9U29U3cDKq", "proceeding": "https://proceedings.mlr.press/v235/abe24a.html", "pdf": "https://openreview.net/pdf?id=9U29U3cDKq", "openreview": "https://openreview.net/forum?id=9U29U3cDKq", "author_site": "Kenshi Abe, Kaito Ariu, Mitsuki Sakamoto, Atsushi Iwasaki", "tldr": "", "abstract": "This paper proposes a payoff perturbation technique for
the Mirror Descent (MD) algorithm in games where the gradient of the payoff functions is monotone in the strategy profile space, potentially containing additive noise. The optimistic family of learning algorithms, exemplified by optimistic MD, successfully achieves *last-iterate* convergence in scenarios devoid of noise, leading the dynamics to a Nash equilibrium. A recent re-emerging trend underscores the promise of the perturbation approach, where payoff functions are perturbed based on the distance from an anchoring, or *slingshot*, strategy. In response, we propose *Adaptively Perturbed MD* (APMD), which adjusts the magnitude of the perturbation by repeatedly updating the slingshot strategy at a predefined interval. This innovation empowers us to find a Nash equilibrium of the underlying game with guaranteed rates. Empirical demonstrations affirm that our algorithm exhibits significantly accelerated convergence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kenshi Abe;Kaito Ariu;Mitsuki Sakamoto;Atsushi Iwasaki", "authorids": "~Kenshi_Abe1;~Kaito_Ariu1;~Mitsuki_Sakamoto1;~Atsushi_Iwasaki2", "gender": "M;M;M;M", "homepage": "https://bakanaouji.github.io/;https://researchmap.jp/ariu?lang=en;;", "dblp": "254/2763;229/7578;243/6951;04/4799", "google_scholar": "rImmohoAAAAJ;https://scholar.google.co.jp/citations?user=4zXjxhsAAAAJ;https://scholar.google.co.jp/citations?user=wIuGfiEAAAAJ;OZunx7wAAAAJ", "orcid": ";;;", "linkedin": ";;https://www.linkedin.com/mwlite/in/%E5%85%85%E7%94%9F-%E5%9D%82%E6%9C%AC-1666bb233;", "or_profile": "~Kenshi_Abe1;~Kaito_Ariu1;~Mitsuki_Sakamoto1;~Atsushi_Iwasaki1", "aff": "CyberAgent, Inc.;CyberAgent, Inc.;CyberAgent, Inc.;University of Electro-Communications", "aff_domain": "cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp;uec.ac.jp", "position": "Research scientist;Research Scientist;Research Engineer;Associate Professor", "bibtex": "@inproceedings{\nabe2024adaptively,\ntitle={Adaptively Perturbed Mirror Descent for Learning in Games},\nauthor={Kenshi Abe and Kaito Ariu and Mitsuki Sakamoto and Atsushi Iwasaki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9U29U3cDKq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1560110, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4285682774635136193&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 9, "email": "cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp;uec.ac.jp", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "CyberAgent;University of Electro-Communications", "aff_unique_dep": ";", "aff_unique_url": "https://www.cyberagent.co.jp;https://www.uec.ac.jp", "aff_unique_abbr": "CyberAgent;UEC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "A Universal Class of Sharpness-Aware Minimization Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34803", "id": "9Ub6nLqdMo", "proceeding": "https://proceedings.mlr.press/v235/tahmasebi24b.html", "pdf": "https://openreview.net/pdf?id=9Ub6nLqdMo", "openreview": "https://openreview.net/forum?id=9Ub6nLqdMo", "author_site": "Behrooz Tahmasebi, Ashkan Soleymani, Dara Bahri, Stefanie Jegelka, Patrick Jaillet", "tldr": "", "abstract": "Recently, there has been a surge in interest in developing 
optimization algorithms for overparameterized models as achieving generalization is believed to require algorithms with suitable biases. This interest centers on minimizing sharpness of the original loss function; the Sharpness-Aware Minimization (SAM) algorithm has proven effective. However, most literature only considers a few sharpness measures, such as the maximum eigenvalue or trace of the training loss Hessian, which may not yield meaningful insights for non-convex optimization scenarios like neural networks. Additionally, many sharpness measures are sensitive to parameter invariances in neural networks, magnifying significantly under parameter rescaling. Motivated by these challenges, we introduce a new class of sharpness measures in this paper, leading to new sharpness-aware objective functions. We prove that these measures are *universally expressive*, allowing any function of the training loss Hessian matrix to be represented by appropriate hyperparameters. Furthermore, we show that the proposed objective functions explicitly bias towards minimizing their corresponding sharpness measures, and how they allow meaningful applications to models with parameter invariances (such as scale-invariances). Finally, as instances of our proposed general framework, we present *Frob-SAM* and *Det-SAM*, which are specifically designed to minimize the Frobenius norm and the determinant of the Hessian of the training loss, respectively. We also demonstrate the advantages of our general framework through extensive experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Behrooz Tahmasebi;Ashkan Soleymani;Dara Bahri;Stefanie Jegelka;Patrick Jaillet", "authorids": "~Behrooz_Tahmasebi1;~Ashkan_Soleymani1;~Dara_Bahri1;~Stefanie_Jegelka3;~Patrick_Jaillet1", "gender": "M;M;M;F;M", "homepage": "https://people.csail.mit.edu/bzt/;https://ashkansoleymani.lids.mit.edu/;http://www.dara.run;http://people.csail.mit.edu/stefje/;http://web.mit.edu/jaillet/www/", "dblp": "223/0884;270/3353.html;231/7656;38/7003;https://dblp.uni-trier.de/pers/hd/j/Jaillet:Patrick", "google_scholar": "ZXCO3DMAAAAJ;omHTV3MAAAAJ;j5PpTOwAAAAJ;gTWUZlsAAAAJ;ND0FM6EAAAAJ", "orcid": ";;;;0000-0002-8585-6566", "linkedin": ";;;;patrick-jaillet-1260445/", "or_profile": "~Behrooz_Tahmasebi1;~Ashkan_Soleymani1;~Dara_Bahri1;~Stefanie_Jegelka3;~Patrick_Jaillet1", "aff": "Microsoft Research ;Massachusetts Institute of Technology;Google Research;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "microsoft.com;mit.edu;google.com;mit.edu;mit.edu", "position": "Intern;PhD student;Research Scientist;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ntahmasebi2024a,\ntitle={A Universal Class of Sharpness-Aware Minimization Algorithms},\nauthor={Behrooz Tahmasebi and Ashkan Soleymani and Dara Bahri and Stefanie Jegelka and Patrick Jaillet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9Ub6nLqdMo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2851080, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15540065999511180394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "microsoft.com;mit.edu;google.com;mit.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Microsoft;Massachusetts Institute of Technology;Google", "aff_unique_dep":
"Microsoft Research;;Google Research", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://web.mit.edu;https://research.google", "aff_unique_abbr": "MSR;MIT;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Acquiring Diverse Skills using Curriculum Reinforcement Learning with Mixture of Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34802", "id": "9ZkUFSwlUH", "proceeding": "https://proceedings.mlr.press/v235/celik24a.html", "pdf": "https://openreview.net/pdf?id=9ZkUFSwlUH", "openreview": "https://openreview.net/forum?id=9ZkUFSwlUH", "author_site": "Onur Celik, Aleksandar Taranovic, Gerhard Neumann", "tldr": "", "abstract": "Reinforcement learning (RL) is a powerful approach for acquiring a good-performing policy. However, learning diverse skills is challenging in RL due to the commonly used Gaussian policy parameterization. We propose Diverse Skill Learning (Di-SkilL), an RL method for learning diverse skills using Mixture of Experts, where each expert formalizes a skill as a contextual motion primitive. Di-SkilL optimizes each expert and its associate context distribution to a maximum entropy objective that incentivizes learning diverse skills in similar contexts. The per-expert context distribution enables automatic curricula learning, allowing each expert to focus on its best-performing sub-region of the context space. To overcome hard discontinuities and multi-modalities without any prior knowledge of the environment's unknown context probability space, we leverage energy-based models to represent the per-expert context distributions and demonstrate how we can efficiently train them using the standard policy gradient objective. 
We show on challenging robot simulation tasks that Di-SkilL can learn diverse and performant skills.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Onur Celik;Aleksandar Taranovic;Gerhard Neumann", "authorids": "~Onur_Celik1;~Aleksandar_Taranovic1;~Gerhard_Neumann2", "gender": "M;M;M", "homepage": "https://alr.anthropomatik.kit.edu/21_69.php;;https://alr.anthropomatik.kit.edu/", "dblp": "243/5913;;60/4878", "google_scholar": "9jqaTcAAAAAJ;2IovJsIAAAAJ;https://scholar.google.com.tw/citations?user=GL360kMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Onur_Celik1;~Aleksandar_Taranovic1;~Gerhard_Neumann1", "aff": "Karlsruhe Institute of Technology;Karlsruher Institut f\u00fcr Technologie;Karlsruhe Institute of Technology", "aff_domain": "kit.edu;kit.edu;kit.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\ncelik2024acquiring,\ntitle={Acquiring Diverse Skills using Curriculum Reinforcement Learning with Mixture of Experts},\nauthor={Onur Celik and Aleksandar Taranovic and Gerhard Neumann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9ZkUFSwlUH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6405174, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8132111718767644968&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "kit.edu;kit.edu;kit.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Karlsruhe Institute of Technology;Karlsruher Institut f\u00fcr Technologie", "aff_unique_dep": ";", "aff_unique_url": "https://www.kit.edu;https://www.kit.edu", "aff_unique_abbr": "KIT;KIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Promptbreeder: Self-Referential Self-Improvement via Prompt Evolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34801", "id": "9ZxnPZGmPU", "proceeding": "https://proceedings.mlr.press/v235/fernando24a.html", "pdf": "https://openreview.net/pdf?id=9ZxnPZGmPU", "openreview": "https://openreview.net/forum?id=9ZxnPZGmPU", "author_site": "Chrisantha Fernando, Dylan Banarse, Henryk Michalewski, Simon Osindero, Tim Rockt\u00e4schel", "tldr": "", "abstract": "Popular prompt strategies like Chain-of-Thought Prompting can dramatically improve the reasoning abilities of Large Language Models (LLMs) in various domains. However, such hand-crafted prompt-strategies are often sub-optimal. In this paper, we present Promptbreeder, a general-purpose self-referential self-improvement mechanism that evolves and adapts prompts for a given domain. Driven by an LLM, Promptbreeder mutates a population of task-prompts, evaluates them for fitness on a training set, and repeats this process over multiple generations to evolve task-prompts. Crucially, the mutation of these task-prompts is governed by mutation-prompts that the LLM generates and improves throughout evolution in a self-referential way. That is, Promptbreeder is not just improving task-prompts, but it is also improving the mutation-prompts that improve these task-prompts. Promptbreeder outperforms state-of-the-art prompt strategies such as Chain-of-Thought and Plan-and-Solve Prompting on commonly used arithmetic and commonsense reasoning benchmarks. 
Furthermore, Promptbreeder is able to evolve intricate task-prompts for the challenging problem of hate speech classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chrisantha Fernando;Dylan Sunil Banarse;Henryk Michalewski;Simon Osindero;Tim Rockt\u00e4schel", "authorids": "~Chrisantha_Fernando1;~Dylan_Sunil_Banarse1;~Henryk_Michalewski1;~Simon_Osindero1;~Tim_Rockt\u00e4schel1", "gender": "M;M;M;Non-Binary;M", "homepage": ";https://2ne1.com;https://www.mimuw.edu.pl/~henrykm/;;http://rockt.ai", "dblp": ";;https://dblp.uni-trier.de/pers/hd/m/Michalewski:Henryk;05/5467;43/11537", "google_scholar": ";UPcOdkQAAAAJ;YdHW1ycAAAAJ;Jq8ZS5kAAAAJ;https://scholar.google.co.uk/citations?user=mWBY8aIAAAAJ", "orcid": ";;;;", "linkedin": ";;henryk-michalewski-8a230a27/;;rockt/", "or_profile": "~Chrisantha_Fernando1;~Dylan_Sunil_Banarse1;~Henryk_Michalewski1;~Simon_Osindero1;~Tim_Rocktaeschel1", "aff": ";Google DeepMind;Google DeepMind;Google;Google DeepMind", "aff_domain": ";deepmind.com;google.com;google.com;google.com", "position": ";Researcher;Researcher;Scientist;Senior Staff Research Scientist", "bibtex": "@inproceedings{\nfernando2024promptbreeder,\ntitle={Promptbreeder: Self-Referential Self-Improvement via Prompt Evolution},\nauthor={Chrisantha Fernando and Dylan Sunil Banarse and Henryk Michalewski and Simon Osindero and Tim Rockt{\\\"a}schel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9ZxnPZGmPU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4023274, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15203972626303466013&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": ";deepmind.com;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Curated LLM: Synergy of LLMs and Data Curation for tabular augmentation in low-data regimes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34800", "id": "9cG1oRnqNd", "proceeding": "https://proceedings.mlr.press/v235/seedat24a.html", "pdf": "https://openreview.net/pdf?id=9cG1oRnqNd", "openreview": "https://openreview.net/forum?id=9cG1oRnqNd", "author_site": "Nabeel Seedat, Nicolas Huynh, Boris van Breugel, M van der Schaar", "tldr": "", "abstract": "Machine Learning (ML) in low-data settings remains an underappreciated yet crucial problem. Hence, data augmentation methods to increase the sample size of datasets needed for ML are key to unlocking the transformative potential of ML in data-deprived regions and domains. Unfortunately, the limited training set constrains traditional tabular synthetic data generators in their ability to generate a large and diverse augmented dataset needed for ML tasks. To address this challenge, we introduce $\\texttt{CLLM}$, which leverages the prior knowledge of Large Language Models (LLMs) for data augmentation in the low-data regime. However, not all the data generated by LLMs will improve downstream utility, as for any generative model. 
Consequently, we introduce a principled curation mechanism, leveraging learning dynamics, coupled with confidence and uncertainty metrics, to obtain a high-quality dataset. Empirically, on multiple real-world datasets, we demonstrate the superior performance of $\\texttt{CLLM}$ in the low-data regime compared to conventional generators. Additionally, we provide insights into the LLM generation and curation mechanism, shedding light on the features that enable them to output high-quality augmented datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nabeel Seedat;Nicolas Huynh;Boris van Breugel;Mihaela van der Schaar", "authorids": "~Nabeel_Seedat1;~Nicolas_Huynh1;~Boris_van_Breugel2;~Mihaela_van_der_Schaar2", "gender": ";M;;F", "homepage": ";;;https://www.vanderschaar-lab.com", "dblp": "227/8368;134/9604;284/0835;", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;DZ3S--MAAAAJ", "orcid": ";;;", "linkedin": "nabeel-seedat/;;;", "or_profile": "~Nabeel_Seedat1;~Nicolas_Huynh1;~Boris_van_Breugel2;~Mihaela_van_der_Schaar2", "aff": "AstraZeneca;University of Cambridge;University of Cambridge;University of California, Los Angeles", "aff_domain": "astrazeneca.com;cam.ac.uk;cam.ac.uk;ucla.edu", "position": "Intern;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nseedat2024curated,\ntitle={Curated {LLM}: Synergy of {LLM}s and Data Curation for tabular augmentation in low-data regimes},\nauthor={Nabeel Seedat and Nicolas Huynh and Boris van Breugel and Mihaela van der Schaar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9cG1oRnqNd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2632006, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8739500508399817313&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "astrazeneca.com;cam.ac.uk;cam.ac.uk;ucla.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "AstraZeneca;University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.astrazeneca.com;https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "AZ;Cambridge;UCLA", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Cambridge;Los Angeles", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Sliding Down the Stairs: How Correlated Latent Variables Accelerate Learning with Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34799", "id": "9iGdh0wAgB", "proceeding": "https://proceedings.mlr.press/v235/bardone24a.html", "pdf": "https://openreview.net/pdf?id=9iGdh0wAgB", "openreview": "https://openreview.net/forum?id=9iGdh0wAgB", "author_site": "Lorenzo Bardone, Sebastian Goldt", "tldr": "", "abstract": "Neural networks extract features from data using stochastic gradient descent (SGD). In particular, higher-order input cumulants (HOCs) are crucial for their performance. However, extracting information from the $p$th cumulant of $d$-dimensional inputs is computationally hard: the number of samples required to recover a single direction from an order-$p$ tensor (tensor PCA) using SGD grows as $d^{p\u22121}$, which is prohibitive for high-dimensional inputs. 
This result raises the question of how neural networks extract relevant directions from the HOCs of their inputs efficiently. Here, we show that correlations between latent variables along the directions encoded in different input cumulants speed up learning from higher-order correlations. We show this effect analytically by deriving nearly sharp thresholds for the number of samples required by a single neuron to recover these directions using online SGD from a random start in high dimensions. Our analytical results are confirmed in simulations of two-layer neural networks and unveil a new mechanism for hierarchical learning in neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lorenzo Bardone;Sebastian Goldt", "authorids": "~Lorenzo_Bardone1;~Sebastian_Goldt1", "gender": "M;M", "homepage": "https://www.math.sissa.it/users/lorenzo-bardone;https://datascience.sissa.it/research-unit/12/theory-of-neural-networks", "dblp": ";234/8941", "google_scholar": ";R06wsMkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Lorenzo_Bardone1;~Sebastian_Goldt1", "aff": "International Higher School for Advanced Studies Trieste;SISSA", "aff_domain": "sissa.it;sissa.it", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbardone2024sliding,\ntitle={Sliding Down the Stairs: How Correlated Latent Variables Accelerate Learning with Neural Networks},\nauthor={Lorenzo Bardone and Sebastian Goldt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9iGdh0wAgB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 695553, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6158381018982193869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "sissa.it;sissa.it", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "International Higher School for Advanced Studies;Scuola Internazionale Superiore di Studi Avanzati", "aff_unique_dep": ";", "aff_unique_url": "https://www.sissa.it;https://www.sissa.it", "aff_unique_abbr": "SISSA;SISSA", "aff_campus_unique_index": "0", "aff_campus_unique": "Trieste;", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "title": "Online Linear Regression in Dynamic Environments via Discounting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34798", "id": "9iRGs3wBTy", "proceeding": "https://proceedings.mlr.press/v235/jacobsen24a.html", "pdf": "https://openreview.net/pdf?id=9iRGs3wBTy", "openreview": "https://openreview.net/forum?id=9iRGs3wBTy", "author_site": "Andrew Jacobsen, Ashok Cutkosky", "tldr": "", "abstract": "We develop algorithms for online linear regression which achieve optimal static and dynamic regret guarantees *even in the complete absence of prior knowledge*. We present a novel analysis showing that a discounted variant of the Vovk-Azoury-Warmuth forecaster achieves dynamic regret of the form $R_{T}(\\vec{u})\\le O\\Big(d\\log(T)\\vee \\sqrt{dP_{T}^{\\gamma}(\\vec{u})T}\\Big)$, where $P_{T}^{\\gamma}(\\vec{u})$ is a measure of variability of the comparator sequence, and show that the discount factor achieving this result can be learned on-the-fly. We show that this result is optimal by providing a matching lower bound.
We also extend our results to *strongly-adaptive* guarantees which hold over every sub-interval $[a,b]\\subseteq[1,T]$ simultaneously.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Jacobsen;Ashok Cutkosky", "authorids": "~Andrew_Jacobsen1;~Ashok_Cutkosky1", "gender": ";", "homepage": ";http://www.cs.stanford.edu/~ashokc", "dblp": "245/2567;191/6725", "google_scholar": "H2iBC18AAAAJ;h4AbGp0AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Andrew_Jacobsen1;~Ashok_Cutkosky1", "aff": "University of Alberta;Boston University", "aff_domain": "ualberta.ca;bu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\njacobsen2024online,\ntitle={Online Linear Regression in Dynamic Environments via Discounting},\nauthor={Andrew Jacobsen and Ashok Cutkosky},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9iRGs3wBTy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 549853, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9100375059842617899&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "ualberta.ca;bu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Alberta;Boston University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://www.bu.edu", "aff_unique_abbr": "UAlberta;BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Canada;United States" }, { "title": "Regularizing with Pseudo-Negatives for Continual Self-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34797", "id": "9jXS07TIBH", "proceeding": "https://proceedings.mlr.press/v235/cha24a.html", "pdf": "https://openreview.net/pdf?id=9jXS07TIBH", "openreview": "https://openreview.net/forum?id=9jXS07TIBH", "author_site": "Sungmin Cha, Kyunghyun Cho, Taesup Moon", "tldr": "", "abstract": "We introduce a novel Pseudo-Negative Regularization (PNR) framework for effective continual self-supervised learning (CSSL). Our PNR leverages pseudo-negatives obtained through model-based augmentation in a way that newly learned representations may not contradict what has been learned in the past. Specifically, for the InfoNCE-based contrastive learning methods, we define symmetric pseudo-negatives obtained from current and previous models and use them in both main and regularization loss terms. Furthermore, we extend this idea to non-contrastive learning methods which do not inherently rely on negatives. For these methods, a pseudo-negative is defined as the output from the previous model for a differently augmented version of the anchor sample and is asymmetrically applied to the regularization term. 
Extensive experimental results demonstrate that our PNR framework achieves state-of-the-art performance in representation learning during CSSL by effectively balancing the trade-off between plasticity and stability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sungmin Cha;Kyunghyun Cho;Taesup Moon", "authorids": "~Sungmin_Cha1;~Kyunghyun_Cho1;~Taesup_Moon1", "gender": "M;M;", "homepage": "https://sites.google.com/view/sungmin-cha/;http://kyunghyuncho.me;https://mindlab-snu.github.io/people/pi/", "dblp": "206/6287;41/9736;05/4084", "google_scholar": "i0PPhfAAAAAJ;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ;lQlioBoAAAAJ", "orcid": ";;0000-0002-9257-6503", "linkedin": ";;", "or_profile": "~Sungmin_Cha1;~Kyunghyun_Cho1;~Taesup_Moon1", "aff": "New York University;Genentech;Seoul National University", "aff_domain": "nyu.edu;gene.com;snu.ac.kr", "position": "Faculty Fellow;Senior Director of Frontier Research;Associate Professor", "bibtex": "@inproceedings{\ncha2024regularizing,\ntitle={Regularizing with Pseudo-Negatives for Continual Self-Supervised Learning},\nauthor={Sungmin Cha and Kyunghyun Cho and Taesup Moon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9jXS07TIBH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 977710, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16289964572305895280&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nyu.edu;gene.com;snu.ac.kr", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "New York University;Genentech;Seoul National University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nyu.edu;https://www.genentech.com;https://www.snu.ac.kr", "aff_unique_abbr": "NYU;Genentech;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;South Korea" }, { "title": "CARTE: Pretraining and Transfer for Tabular Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34796", "id": "9kArQnKLDp", "proceeding": "https://proceedings.mlr.press/v235/kim24d.html", "pdf": "https://openreview.net/pdf?id=9kArQnKLDp", "openreview": "https://openreview.net/forum?id=9kArQnKLDp", "author_site": "Myung Jun Kim, Leo Grinsztajn, Gael Varoquaux", "tldr": "", "abstract": "Pretrained deep-learning models are the go-to solution for images or text. However, for tabular data the standard is still to train tree-based models. Indeed, transfer learning on tables hits the challenge of *data integration*: finding correspondences, correspondences in the entries (*entity matching*) where different words may denote the same entity, correspondences across columns (*schema matching*), which may come in different orders, names... We propose a neural architecture that does not need such correspondences. As a result, we can pretrain it on background data that has not been matched. The architecture --CARTE for Context Aware Representation of Table Entries-- uses a graph representation of tabular (or relational) data to process tables with different columns, string embedding of entries and column names to model an open vocabulary, and a graph-attentional network to contextualize entries with column names and neighboring entries.
An extensive benchmark shows that CARTE facilitates learning, outperforming a solid set of baselines including the best tree-based models. CARTE also enables joint learning across tables with unmatched columns, enhancing a small table with bigger ones. CARTE opens the door to large pretrained models for tabular data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Myung Jun Kim;Leo Grinsztajn;Gael Varoquaux", "authorids": "~Myung_Jun_Kim1;~Leo_Grinsztajn1;~Gael_Varoquaux1", "gender": "M;M;M", "homepage": ";https://www.linkedin.com/in/l%C3%A9o-grinsztajn-339b5b173/;http://gael-varoquaux.info", "dblp": "22/3670;259/3203;36/7585", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;https://scholar.google.fr/citations?user=OGGu384AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Myung_Jun_Kim1;~Leo_Grinsztajn1;~Gael_Varoquaux1", "aff": "INRIA;;INRIA", "aff_domain": "inria.fr;;inria.fr", "position": "Postdoc;;Full Professor", "bibtex": "@inproceedings{\nkim2024carte,\ntitle={{CARTE}: Pretraining and Transfer for Tabular Learning},\nauthor={Myung Jun Kim and Leo Grinsztajn and Gael Varoquaux},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9kArQnKLDp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1149069, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2206428637773019721&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9, "email": "inria.fr;;inria.fr", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Skill Set Optimization: Reinforcing Language Model Behavior via Transferable Skills", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34795", "id": "9laB7ytoMp", "proceeding": "https://proceedings.mlr.press/v235/nottingham24a.html", "pdf": "https://openreview.net/pdf?id=9laB7ytoMp", "openreview": "https://openreview.net/forum?id=9laB7ytoMp", "author_site": "Kolby Nottingham, Bodhisattwa Prasad Majumder, Bhavana Dalvi, Sameer Singh, Peter Clark, Roy Fox", "tldr": "", "abstract": "Large language models (LLMs) have recently been used for sequential decision making in interactive environments. However, leveraging environment reward signals for continual LLM actor improvement is not straightforward. We propose Skill Set Optimization (SSO) for improving LLM actor performance through constructing and refining sets of transferable skills. SSO constructs skills by extracting common subtrajectories with high rewards and generating subgoals and instructions to represent each skill. These skills are provided to the LLM actor in-context to reinforce behaviors with high rewards. Then, SSO further refines the skill set by pruning skills that do not continue to result in high rewards. We evaluate our method in the classic videogame NetHack and the text environment ScienceWorld to demonstrate SSO's ability to optimize a set of skills and perform in-context policy improvement. 
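The graph representation in the CARTE record above can be pictured with a small sketch: each row becomes a star graph whose nodes and edge labels are string embeddings, so no entity or schema matching is required. `embed` is a hypothetical string-embedding function, and this encoding is a guess at the spirit of the construction, not the paper's exact graph.

```python
def row_to_star_graph(row, embed):
    """Encode one table row as a star graph: a center node plus one node
    per cell; cell values and column names are both embedded as strings,
    so tables with different columns share one representation space."""
    nodes = [embed("[CENTER]")]
    edges = []
    for j, (col, val) in enumerate(row.items(), start=1):
        nodes.append(embed(str(val)))          # entry node
        edges.append((0, j, embed(str(col))))  # edge labeled by column name
    return nodes, edges

# e.g. row_to_star_graph({"city": "Paris", "population": 2161000}, embed)
```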
SSO outperforms baselines by 40% in our custom NetHack task and outperforms the previous state-of-the-art in ScienceWorld by 35%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kolby Nottingham;Bodhisattwa Prasad Majumder;Bhavana Dalvi Mishra;Sameer Singh;Peter Clark;Roy Fox", "authorids": "~Kolby_Nottingham1;~Bodhisattwa_Prasad_Majumder1;~Bhavana_Dalvi_Mishra2;~Sameer_Singh1;~Peter_Clark1;~Roy_Fox1", "gender": "M;;M;M;M;F", "homepage": "http://kolbynottingham.com;https://www.majumderb.com/;http://sameersingh.org;https://allenai.org/team/peterc;https://royf.org;https://bhavanadalvi.github.io/", "dblp": "250/3133.html;138/6177;13/3568-1;34/1184;32/7007;78/6527", "google_scholar": "2o3QdBAAAAAJ;cEM1a5gAAAAJ;-hGZC54AAAAJ;o-5vyEsAAAAJ;FH9nKOAAAAAJ;9e0uFr4AAAAJ", "orcid": ";;0000-0003-0621-6323;;0000-0002-5562-3315;", "linkedin": "kolby-nottingham/;;sameersingh/;peter-clark-a8b556/;;", "or_profile": "~Kolby_Nottingham1;~Bodhisattwa_Prasad_Majumder1;~Sameer_Singh1;~Peter_Clark1;~Roy_Fox1;~Bhavana_Dalvi1", "aff": "University of California, Irvine;Allen Institute for Artificial Intelligence;University of California, Irvine;Allen Institute for Artificial Intelligence;University of California, Irvine;Allen Institute for Artificial Intelligence", "aff_domain": "uci.edu;allenai.org;uci.edu;allenai.org;uci.edu;allenai.org", "position": "PhD student;Researcher;Full Professor;Senior Research Manager;Assistant Professor;Lead Research Scientist", "bibtex": "@inproceedings{\nnottingham2024skill,\ntitle={Skill Set Optimization: Reinforcing Language Model Behavior via Transferable Skills},\nauthor={Kolby Nottingham and Bodhisattwa Prasad Majumder and Bhavana Dalvi Mishra and Sameer Singh and Peter Clark and Roy Fox},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9laB7ytoMp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 592283, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5440493320369513208&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "uci.edu;allenai.org;uci.edu;allenai.org;uci.edu;allenai.org", "author_num": 6, "aff_unique_index": "0;1;0;1;0;1", "aff_unique_norm": "University of California, Irvine;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "https://www.uci.edu;https://allenai.org", "aff_unique_abbr": "UCI;AI2", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Irvine;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Pi-DUAL: Using privileged information to distinguish clean from noisy labels", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34794", "id": "9oAXix8da9", "proceeding": "https://proceedings.mlr.press/v235/wang24bb.html", "pdf": "https://openreview.net/pdf?id=9oAXix8da9", "openreview": "https://openreview.net/forum?id=9oAXix8da9", "author_site": "Ke Wang, Guillermo Ortiz-Jimenez, Rodolphe Jenatton, Mark Collier, Efi Kokiopoulou, Pascal Frossard", "tldr": "", "abstract": "Label noise is a pervasive problem in deep learning that often compromises the generalization performance of trained models. Recently, leveraging privileged information (PI) -- information available only during training but not at test time -- has emerged as an effective approach to mitigate this issue.
Yet, existing PI-based methods have failed to consistently outperform their no-PI counterparts in terms of preventing overfitting to label noise. To address this deficiency, we introduce Pi-DUAL, an architecture designed to harness PI to distinguish clean from wrong labels. Pi-DUAL decomposes the output logits into a prediction term, based on conventional input features, and a noise-fitting term influenced solely by PI. A gating mechanism steered by PI adaptively shifts focus between these terms, allowing the model to implicitly separate the learning paths of clean and wrong labels. Empirically, Pi-DUAL achieves significant performance improvements on key PI benchmarks (e.g., +6.8% on ImageNet-PI), establishing a new state-of-the-art test set accuracy. Additionally, Pi-DUAL is a potent method for identifying noisy samples post-training, outperforming other strong methods at this task. Overall, Pi-DUAL is a simple, scalable and practical approach for mitigating the effects of label noise in a variety of real-world scenarios with PI.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ke Wang;Guillermo Ortiz-Jimenez;Rodolphe Jenatton;Mark Collier;Efi Kokiopoulou;Pascal Frossard", "authorids": "~Ke_Wang19;~Guillermo_Ortiz-Jimenez1;~Rodolphe_Jenatton3;~Mark_Collier1;~Efi_Kokiopoulou2;~Pascal_Frossard1", "gender": "M;;M;M;;", "homepage": "https://wang-kee.github.io/;http://gortizji.github.io;http://rodolphejenatton.com/;;;", "dblp": ";222/2737;68/8398;;;", "google_scholar": "wKBORzsAAAAJ;xAsJnG0AAAAJ;QIR6rygAAAAJ;U4rBrcgAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;mark-collier-aa446032/;;", "or_profile": "~Ke_Wang19;~Guillermo_Ortiz-Jimenez1;~Rodolphe_Jenatton3;~Mark_Collier1;~Efi_Kokiopoulou2;~Pascal_Frossard1", "aff": "EPFL - EPF Lausanne;Google DeepMind;Google;Google;;", "aff_domain": "epfl.ch;google.com;google.com;google.com;;", "position": "PhD student;Research Scientist;Senior research scientist;Researcher;;", "bibtex": "@inproceedings{\nwang2024pidual,\ntitle={Pi-{DUAL}: Using privileged information to distinguish clean from noisy labels},\nauthor={Ke Wang and Guillermo Ortiz-Jimenez and Rodolphe Jenatton and Mark Collier and Efi Kokiopoulou and Pascal Frossard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9oAXix8da9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8559872, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5880124482341331784&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "epfl.ch;google.com;google.com;google.com;;", "author_num": 6, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "EPFL;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.epfl.ch;https://deepmind.com", "aff_unique_abbr": "EPFL;DeepMind", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Lausanne;;Mountain View", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Switzerland;United Kingdom;United States" }, { "title": "Flextron: Many-in-One Flexible Large Language Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34793", "id": "9vKRhnflAs", "proceeding": "https://proceedings.mlr.press/v235/cai24e.html", "pdf": "https://openreview.net/pdf?id=9vKRhnflAs", "openreview": "https://openreview.net/forum?id=9vKRhnflAs", "author_site": "Ruisi Cai, Saurav Muralidharan, Greg Heinrich, Hongxu Yin, 
Zhangyang \u201cAtlas\u201d Wang, Jan Kautz, Pavlo Molchanov", "tldr": "", "abstract": "Training modern LLMs is extremely resource intensive, and customizing them for various deployment scenarios characterized by limited compute and memory resources through repeated training is impractical. In this paper, we introduce Flextron, a network architecture and post-training model optimization framework supporting flexible model deployment. The Flextron architecture utilizes a nested elastic structure to rapidly adapt to specific user-defined latency and accuracy targets during inference with no additional fine-tuning required. It is also input-adaptive, and can automatically route tokens through its sub-networks for improved performance and efficiency. We present a sample-efficient training method and associated routing algorithms for systematically transforming an existing trained LLM into a Flextron model. We evaluate Flextron on the GPT-3 and LLama-2 family of LLMs, and demonstrate superior performance over multiple end-to-end trained variants and other state-of-the-art elastic networks, all with a single pretraining run that consumes a mere 7.63% tokens compared to original pretraining.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruisi Cai;Saurav Muralidharan;Greg Heinrich;Hongxu Yin;Zhangyang Wang;Jan Kautz;Pavlo Molchanov", "authorids": "~Ruisi_Cai1;~Saurav_Muralidharan1;~Greg_Heinrich1;~Hongxu_Yin2;~Zhangyang_Wang1;~Jan_Kautz1;~Pavlo_Molchanov1", "gender": "F;M;M;M;;M;M", "homepage": "https://cairuisi.github.io;https://sauravm.com;;https://vita-group.github.io;http://jankautz.com;;https://hongxu-yin.github.io/", "dblp": "341/1491;31/8395;;119/4026;48/6214;165/8169.html;166/3425", "google_scholar": "B0chY1AAAAAJ;GXlChWcAAAAJ;VrjibvwAAAAJ;pxFyKAIAAAAJ;P9FclNEAAAAJ;J9PoyoIAAAAJ;4gdSoOYAAAAJ", "orcid": ";;;;;;", "linkedin": ";;gheinrich/;;;;", "or_profile": "~Ruisi_Cai1;~Saurav_Muralidharan1;~Greg_Heinrich1;~Zhangyang_Wang1;~Jan_Kautz1;~Pavlo_Molchanov1;~Hongxu_Yin1", "aff": "University of Texas at Austin;NVIDIA;NVIDIA;University of Texas at Austin;NVIDIA;NVIDIA Research;NVIDIA", "aff_domain": "utexas.edu;nvidia.com;nvidia.com;utexas.edu;nvidia.com;nvidia.com;nvidia.com", "position": "PhD student;Researcher;Researcher;Associate Professor;VP Research;Research Scientist;Senior Research Scientist", "bibtex": "@inproceedings{\ncai2024flextron,\ntitle={Flextron: Many-in-One Flexible Large Language Model},\nauthor={Ruisi Cai and Saurav Muralidharan and Greg Heinrich and Hongxu Yin and Zhangyang Wang and Jan Kautz and Pavlo Molchanov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9vKRhnflAs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1267088, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12287839898802427374&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "utexas.edu;nvidia.com;nvidia.com;utexas.edu;nvidia.com;nvidia.com;nvidia.com", "author_num": 7, "aff_unique_index": "0;1;1;0;1;1;1", "aff_unique_norm": "University of Texas at Austin;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.utexas.edu;https://www.nvidia.com", "aff_unique_abbr": "UT Austin;NVIDIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": 
"Chain-of-Thought Predictive Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34792", "id": "9xUpLGAOy9", "proceeding": "https://proceedings.mlr.press/v235/jia24c.html", "pdf": "https://openreview.net/pdf?id=9xUpLGAOy9", "openreview": "https://openreview.net/forum?id=9xUpLGAOy9", "author_site": "Zhiwei Jia, Vineet Thumuluri, Fangchen Liu, Linghao Chen, Zhiao Huang, Hao Su", "tldr": "", "abstract": "We study generalizable policy learning from demonstrations for complex low-level control (e.g., contact-rich object manipulations). We propose a novel hierarchical imitation learning method that utilizes sub-optimal demos. Firstly, we propose an observation space-agnostic approach that efficiently discovers the multi-step subskill decomposition of the demos in an unsupervised manner. By grouping temporarily close and functionally similar actions into subskill-level demo segments, the observations at the segment boundaries constitute a chain of planning steps for the task, which we refer to as the chain-of-thought (CoT). Next, we propose a Transformer-based design that effectively learns to predict the CoT as the subskill-level guidance. We couple action and subskill predictions via learnable prompt tokens and a hybrid masking strategy, which enable dynamically updated guidance at test time and improve feature representation of the trajectory for generalizable policy learning. Our method, Chain-of-Thought Predictive Control (CoTPC), consistently surpasses existing strong baselines on various challenging low-level manipulation tasks with sub-optimal demos. See project page at https://sites.google.com/view/cotpc.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiwei Jia;Vineet Thumuluri;Fangchen Liu;Linghao Chen;Zhiao Huang;Hao Su", "authorids": "~Zhiwei_Jia1;~Vineet_Thumuluri1;~Fangchen_Liu2;~Linghao_Chen2;~Zhiao_Huang1;~Hao_Su1", "gender": "M;;F;;M;M", "homepage": "https://www.zjia.xyz/;;https://fangchenliu.github.io/;https://ootts.github.io/;;http://ai.ucsd.edu/~haosu", "dblp": ";;;262/3716;172/1410;09/4945-1", "google_scholar": "nQhMGqAAAAAJ;;;;;1P8Zu04AAAAJ", "orcid": ";;;;;", "linkedin": "zhiweijia;;;;;", "or_profile": "~Zhiwei_Jia1;~Vineet_Thumuluri1;~Fangchen_Liu2;~Linghao_Chen2;~Zhiao_Huang1;~Hao_Su1", "aff": "Zoom Communications;;University of California, Berkeley;Zhejiang University;University of California, San Diego, University of California, San Diego;University of California, San Diego", "aff_domain": "zoom.com;;berkeley.edu;zju.edu.cn;eng.ucsd.edu;ucsd.edu", "position": "Researcher;;PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\njia2024chainofthought,\ntitle={Chain-of-Thought Predictive Control},\nauthor={Zhiwei Jia and Vineet Thumuluri and Fangchen Liu and Linghao Chen and Zhiao Huang and Hao Su},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9xUpLGAOy9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2541013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9612779628799289760&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "zoom.com;;berkeley.edu;zju.edu.cn;eng.ucsd.edu;ucsd.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Zoom Communications;University of California, Berkeley;Zhejiang University;University of California, San Diego", 
"aff_unique_dep": ";;;", "aff_unique_url": "https://zoom.us;https://www.berkeley.edu;https://www.zju.edu.cn;https://www.ucsd.edu", "aff_unique_abbr": "Zoom;UC Berkeley;ZJU;UCSD", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Berkeley;San Diego", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "When is Transfer Learning Possible?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34791", "id": "9yADTDHgGu", "proceeding": "https://proceedings.mlr.press/v235/phan24a.html", "pdf": "https://openreview.net/pdf?id=9yADTDHgGu", "openreview": "https://openreview.net/forum?id=9yADTDHgGu", "author_site": "My Phan, Kiant\u00e9 Brantley, Stephanie Milani, Soroush Mehri, Gokul Swamy, Geoff Gordon", "tldr": "", "abstract": "We present a general framework for transfer learning that is flexible enough to capture transfer in supervised, reinforcement, and imitation learning. Our framework enables new insights into the fundamental question of *when* we can successfully transfer learned information across problems. We model the learner as interacting with a sequence of problem instances, or *environments*, each of which is generated from a common structural causal model (SCM) by choosing the SCM's parameters from restricted sets. We derive a procedure that can propagate restrictions on SCM parameters through the SCM's graph structure to other parameters that we are trying to learn. The propagated restrictions then enable more efficient learning (i.e., transfer). By analyzing the procedure, we are able to challenge widely-held beliefs about transfer learning. First, we show that having *sparse* changes across environments is neither necessary nor sufficient for transfer. Second, we show an example where the common heuristic of *freezing* a layer in a network causes poor transfer performance. We then use our procedure to select a more refined set of parameters to freeze, leading to successful transfer learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "My Phan;Kiant\u00e9 Brantley;Stephanie Milani;Soroush Mehri;Gokul Swamy;Geoffrey J. Gordon", "authorids": "~My_Phan1;~Kiant\u00e9_Brantley2;~Stephanie_Milani1;~Soroush_Mehri1;~Gokul_Swamy1;~Geoffrey_J._Gordon1", "gender": ";;F;M;;", "homepage": ";;https://stephmilani.github.io/;;https://gokul.dev/;", "dblp": ";;239/6037;168/8645;31/11509;", "google_scholar": ";;vx68rkMAAAAJ;;Sbpra_AAAAAJ;", "orcid": ";;0000-0003-1150-4418;;;", "linkedin": ";;;;;", "or_profile": "~My_Phan1;~Kiant\u00e9_Brantley2;~Stephanie_Milani1;~Soroush_Mehri1;~Gokul_Swamy1;~Geoffrey_J._Gordon1", "aff": ";;Carnegie Mellon University;Elementera Inc.;Carnegie Mellon University;", "aff_domain": ";;cmu.edu;elementera.com;cmu.edu;", "position": ";;PhD student;Researcher;PhD student;", "bibtex": "@inproceedings{\nphan2024when,\ntitle={When is Transfer Learning Possible?},\nauthor={My Phan and Kiant{\\'e} Brantley and Stephanie Milani and Soroush Mehri and Gokul Swamy and Geoffrey J. 
Gordon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9yADTDHgGu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1413602, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gKCINOUzxrYJ:scholar.google.com/&scioq=When+is+Transfer+Learning+Possible%3F&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": ";;cmu.edu;elementera.com;cmu.edu;", "author_num": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Elementera Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;", "aff_unique_abbr": "CMU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Unsupervised Episode Generation for Graph Meta-learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34790", "id": "9zdTOOgutk", "proceeding": "https://proceedings.mlr.press/v235/jung24a.html", "pdf": "https://openreview.net/pdf?id=9zdTOOgutk", "openreview": "https://openreview.net/forum?id=9zdTOOgutk", "author_site": "Jihyeong Jung, Sangwoo Seo, Sungwon Kim, Chanyoung Park", "tldr": "", "abstract": "We propose an Unsupervised Episode Generation method called **Neighbors as Queries (NaQ)** to solve the Few-Shot Node-Classification (FSNC) task by *unsupervised Graph Meta-learning*. Doing so enables full utilization of the information of all nodes in a graph, which is not possible in current supervised meta-learning methods for FSNC due to the label-scarcity problem. In addition, unlike unsupervised Graph Contrastive Learning (GCL) methods that overlook the downstream task to be solved at the training phase, resulting in vulnerability to class imbalance of a graph, we adopt the episodic learning framework that allows the model to be aware of the downstream task format, i.e., FSNC. The proposed NaQ is a simple but effective *unsupervised* episode generation method that randomly samples nodes from a graph to make a support set, followed by similarity-based sampling of nodes to make the corresponding query set. Since NaQ is *model-agnostic*, any existing supervised graph meta-learning methods can be trained in an unsupervised manner, while sacrificing little of their performance and sometimes even improving it. Extensive experimental results demonstrate the effectiveness of our proposed unsupervised episode generation method for graph meta-learning towards the FSNC task.
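NaQ's episode construction, as described above, is simple enough to sketch directly: random support nodes, then each support node's most similar nodes as queries. Cosine similarity on raw node features is an assumption here, standing in for whatever similarity the authors actually use.

```python
import numpy as np

def naq_episode(features, support_size, query_size, rng):
    """Unsupervised episode: sample random nodes as the support set, then
    take each support node's most similar other nodes as its query set."""
    n = features.shape[0]
    support = rng.choice(n, size=support_size, replace=False)
    normed = features / np.linalg.norm(features, axis=1, keepdims=True)
    sims = normed[support] @ normed.T
    sims[np.arange(support_size), support] = -np.inf  # exclude self-matches
    queries = np.argsort(-sims, axis=1)[:, :query_size]
    return support, queries

# rng = np.random.default_rng(0)
# naq_episode(np.random.rand(100, 16), support_size=5, query_size=3, rng=rng)
```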
Our code is available at: https://github.com/JhngJng/NaQ-PyTorch.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jihyeong Jung;Sangwoo Seo;Sungwon Kim;Chanyoung Park", "authorids": "~Jihyeong_Jung1;~Sangwoo_Seo1;~Sungwon_Kim3;~Chanyoung_Park1", "gender": "M;M;M;M", "homepage": "https://github.com/JhngJng;https://github.com/tkddn8974;https://sung-won-kim.github.io;https://dsail.kaist.ac.kr/", "dblp": ";234/8568;59/5163-2;170/5430.html", "google_scholar": "uKNElOMAAAAJ;;https://scholar.google.co.kr/citations?hl=ko;lWk2LtQAAAAJ", "orcid": "0009-0001-0535-628X;;0000-0001-8605-2618;0000-0002-5957-5816", "linkedin": ";;sungwon-kim/;", "or_profile": "~Jihyeong_Jung1;~Sangwoo_Seo1;~Sungwon_Kim3;~Chanyoung_Park1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.edu;kaist.ac.kr;kaist.ac.kr", "position": "MS student;PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\njung2024unsupervised,\ntitle={Unsupervised Episode Generation for Graph Meta-learning},\nauthor={Jihyeong Jung and Sangwoo Seo and Sungwon Kim and Chanyoung Park},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9zdTOOgutk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5170441, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3973266689522289145&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.edu;kaist.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Quality Diversity through Human Feedback: Towards Open-Ended Diversity-Driven Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34789", "id": "9zlZuAAb08", "proceeding": "https://proceedings.mlr.press/v235/ding24h.html", "pdf": "https://openreview.net/pdf?id=9zlZuAAb08", "openreview": "https://openreview.net/forum?id=9zlZuAAb08", "author_site": "Li Ding, Jenny Zhang, Jeff Clune, Lee Spector, Joel Lehman", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) has shown potential in qualitative tasks where easily defined performance measures are lacking. However, there are drawbacks when RLHF is commonly used to optimize for average human preferences, especially in generative tasks that demand diverse model responses. Meanwhile, Quality Diversity (QD) algorithms excel at identifying diverse and high-quality solutions but often rely on manually crafted diversity metrics. This paper introduces Quality Diversity through Human Feedback (QDHF), a novel approach that progressively infers diversity metrics from human judgments of similarity among solutions, thereby enhancing the applicability and effectiveness of QD algorithms in complex and open-ended domains. 
Empirical studies show that QDHF significantly outperforms state-of-the-art methods in automatic diversity discovery and matches the efficacy of QD with manually crafted diversity metrics on standard benchmarks in robotics and reinforcement learning. Notably, in open-ended generative tasks, QDHF substantially enhances the diversity of text-to-image generation from a diffusion model and is more favorably received in user studies. We conclude by analyzing QDHF's scalability, robustness, and quality of derived diversity metrics, emphasizing its strength in open-ended optimization tasks. Code and tutorials are available at https://liding.info/qdhf.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Li Ding;Jenny Zhang;Jeff Clune;Lee Spector;Joel Lehman", "authorids": "~Li_Ding3;~Jenny_Zhang1;~Jeff_Clune3;~Lee_Spector2;~Joel_Lehman1", "gender": "M;;;;", "homepage": "https://liding.info;;;https://leespector.com;http://joellehman.com", "dblp": "58/4543-10;;;68/434;47/8285", "google_scholar": "https://scholar.google.com/citations?hl=en;;;wtKLtLUAAAAJ;GcvxHWQAAAAJ", "orcid": "0000-0002-1315-1196;;; 0000-0001-5299-4797;", "linkedin": "liding256;;;lee-spector-77990b9/;", "or_profile": "~Li_Ding3;~Jenny_Zhang1;~Jeff_Clune3;~Lee_Spector2;~Joel_Lehman1", "aff": "University of Massachusetts, Amherst;;;Amherst College;Carper.AI", "aff_domain": "umass.edu;;;amherst.edu;carper.ai", "position": "PhD student;;;Full Professor;Research Advisor", "bibtex": "@inproceedings{\nding2024quality,\ntitle={Quality Diversity through Human Feedback: Towards Open-Ended Diversity-Driven Optimization},\nauthor={Li Ding and Jenny Zhang and Jeff Clune and Lee Spector and Joel Lehman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9zlZuAAb08}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8853139, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16463259911321747671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "umass.edu;;;amherst.edu;carper.ai", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Massachusetts Amherst;Amherst College;Carper.AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umass.edu;https://www.amherst.edu;https://www.carper.ai", "aff_unique_abbr": "UMass Amherst;Amherst;Carper.AI", "aff_campus_unique_index": "0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Multiple Secrets in Mastermind", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34788", "id": "A0N39kgRZq", "proceeding": "https://proceedings.mlr.press/v235/prabhu24a.html", "pdf": "https://openreview.net/pdf?id=A0N39kgRZq", "openreview": "https://openreview.net/forum?id=A0N39kgRZq", "author_site": "Milind Prabhu, David Woodruff", "tldr": "", "abstract": "In the Generalized Mastermind problem, there is an unknown subset $H$ of the hypercube $\\{0,1\\}^d$ containing $n$ points. The goal is to learn $H$ by making a few queries to an oracle which, given a point $q$ in $\\{0,1\\}^d$, returns the point in $H$ nearest to $q$. We give a two-round adaptive algorithm for this problem that learns $H$ while making at most $\\exp(\\widetilde{O}(\\sqrt{d \\log n}))$ queries.
Furthermore, we show that any $r$-round adaptive randomized algorithm that learns $H$ with constant probability must make $\\exp(\\Omega(d^{3^{-(r-1)}}))$ queries even when the input has poly$(d)$ points; thus, any poly$(d)$-query algorithm must necessarily use $\\Omega(\\log \\log d)$ rounds of adaptivity. We give optimal query complexity bounds for the variant of the problem where queries are allowed to be from $\\{0,1,2\\}^d$. We also study a continuous variant of the problem in which $H$ is a subset of unit vectors in $\\mathbb{R}^d$ and one can query unit vectors in $\\mathbb{R}^d$. For this setting, we give an $O(n^{\\lfloor d/2 \\rfloor})$-query deterministic algorithm to learn the hidden set of points.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Milind Prabhu;David Woodruff", "authorids": "~Milind_Prabhu1;~David_Woodruff1", "gender": ";M", "homepage": ";http://www.cs.cmu.edu/~dwoodruf/", "dblp": ";w/DPWoodruff", "google_scholar": "vu73GNIAAAAJ;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Milind_Prabhu1;~David_Woodruff1", "aff": "University of Michigan - Ann Arbor;Carnegie Mellon University", "aff_domain": "umich.edu;cmu.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nprabhu2024learning,\ntitle={Learning Multiple Secrets in Mastermind},\nauthor={Milind Prabhu and David Woodruff},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A0N39kgRZq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 357999, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z0hDQBN1vOYJ:scholar.google.com/&scioq=Learning+Multiple+Secrets+in+Mastermind&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "umich.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Michigan;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.umich.edu;https://www.cmu.edu", "aff_unique_abbr": "UM;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Statistical Theory of Regularization-Based Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34787", "id": "A54CXWn9VB", "proceeding": "https://proceedings.mlr.press/v235/zhao24n.html", "pdf": "https://openreview.net/pdf?id=A54CXWn9VB", "openreview": "https://openreview.net/forum?id=A54CXWn9VB", "author_site": "Xuyang Zhao, Huiyuan Wang, Weiran Huang, Wei Lin", "tldr": "", "abstract": "We provide a statistical analysis of regularization-based continual learning on a sequence of linear regression tasks, with emphasis on how different regularization terms affect the model performance. We first derive the convergence rate for the oracle estimator obtained as if all data were available simultaneously. Next, we consider a family of generalized $\\ell_2$-regularization algorithms indexed by matrix-valued hyperparameters, which includes the minimum norm estimator and continual ridge regression as special cases. As more tasks are introduced, we derive an iterative update formula for the estimation error of generalized $\\ell_2$-regularized estimators, from which we determine the hyperparameters resulting in the optimal algorithm.
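The oracle in the Generalized Mastermind record above is easy to state in code, which may help in parsing the query bounds; this is only the problem setup, not either of the paper's algorithms.

```python
import numpy as np

def nearest_point_oracle(H, q):
    """Given a query q in {0,1}^d, return the point of the hidden set H
    nearest to q in Hamming distance (ties broken by first occurrence)."""
    dists = (H != q).sum(axis=1)
    return H[int(np.argmin(dists))]

H = np.array([[0, 0, 1], [1, 1, 0]])
print(nearest_point_oracle(H, np.array([1, 0, 0])))  # -> [1 1 0], at distance 1
```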
Interestingly, the choice of hyperparameters can effectively balance the trade-off between forward and backward knowledge transfer and adjust for data heterogeneity. Moreover, the estimation error of the optimal algorithm is derived explicitly, which is of the same order as that of the oracle estimator. In contrast, our lower bounds for the minimum norm estimator and continual ridge regression show their suboptimality. A byproduct of our theoretical analysis is the equivalence between early stopping and generalized $\\ell_2$-regularization in continual learning, which may be of independent interest. Finally, we conduct experiments to complement our theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuyang Zhao;Huiyuan Wang;Weiran Huang;Wei Lin", "authorids": "~Xuyang_Zhao2;~Huiyuan_Wang1;~Weiran_Huang1;~Wei_Lin3", "gender": "M;M;M;M", "homepage": "https://www.pku.edu.cn/;https://huiyuan-wang.github.io;https://www.weiranhuang.com;https://www.math.pku.edu.cn/teachers/linw/", "dblp": "116/1504;;170/0073-1;", "google_scholar": ";tAmcOpgAAAAJ;AjJ2rf8AAAAJ;DculdNkAAAAJ", "orcid": ";0009-0004-8796-7376;;0000-0002-7598-6199", "linkedin": ";;;", "or_profile": "~Xuyang_Zhao2;~Huiyuan_Wang1;~Weiran_Huang1;~Wei_Lin3", "aff": "Peking University;University of Pennsylvania;Shanghai AI Laboratory;Peking University", "aff_domain": "pku.edu.cn;upenn.edu;pjlab.org.cn;pku.edu.cn", "position": "PhD student;Postdoc;Consultant;Associate Professor", "bibtex": "@inproceedings{\nzhao2024a,\ntitle={A Statistical Theory of Regularization-Based Continual Learning},\nauthor={Xuyang Zhao and Huiyuan Wang and Weiran Huang and Wei Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A54CXWn9VB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 450915, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5256923546941096657&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;upenn.edu;pjlab.org.cn;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Peking University;University of Pennsylvania;Shanghai AI Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.upenn.edu;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "Peking U;UPenn;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Tuning-Free Stochastic Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34786", "id": "A6fmX9QCEa", "proceeding": "https://proceedings.mlr.press/v235/khaled24a.html", "pdf": "https://openreview.net/pdf?id=A6fmX9QCEa", "openreview": "https://openreview.net/forum?id=A6fmX9QCEa", "author_site": "Ahmed Khaled, Chi Jin", "tldr": "", "abstract": "Large-scale machine learning problems make the cost of hyperparameter tuning ever more prohibitive. This creates a need for algorithms that can tune themselves on-the-fly. We formalize the notion of *``tuning-free''* algorithms that can match the performance of optimally-tuned optimization algorithms up to polylogarithmic factors given only loose hints on the relevant problem parameters. We consider in particular algorithms that can match optimally-tuned Stochastic Gradient Descent (SGD). 
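Continual ridge regression, named as a special case in the record above, has a short closed form that makes the generalized $\ell_2$ family concrete: each task's solution is pulled toward the previous task's weights. A minimal sketch, with a scalar `lam` in place of the paper's matrix-valued hyperparameters:

```python
import numpy as np

def continual_ridge(tasks, lam=1.0):
    """Sequentially solve w_t = argmin_w ||y - X w||^2 + lam * ||w - w_prev||^2,
    whose closed form is (X^T X + lam I)^{-1} (X^T y + lam w_prev)."""
    w = None
    for X, y in tasks:
        d = X.shape[1]
        w_prev = np.zeros(d) if w is None else w
        w = np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ y + lam * w_prev)
    return w
```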
When the domain of optimization is bounded, we show tuning-free matching of SGD is possible and achieved by several existing algorithms. We prove that for the task of minimizing a convex and smooth or Lipschitz function over an unbounded domain, tuning-free optimization is impossible. We discuss conditions under which tuning-free optimization is possible even over unbounded domains. In particular, we show that the recently proposed DoG and DoWG algorithms are tuning-free when the noise distribution is sufficiently well-behaved. For the task of finding a stationary point of a smooth and potentially nonconvex function, we give a variant of SGD that matches the best-known high-probability convergence rate for tuned SGD at only an additional polylogarithmic cost. However, we also give an impossibility result that shows no algorithm can hope to match the optimal expected convergence rate for tuned SGD with high probability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ahmed Khaled;Chi Jin", "authorids": "~Ahmed_Khaled1;~Chi_Jin1", "gender": "M;M", "homepage": "https://www.akhaled.net;https://sites.google.com/view/cjin/home", "dblp": "154/3591-1;126/1802-1", "google_scholar": "Bc3wOdsAAAAJ;GINhGvwAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ahmed_Khaled1;~Chi_Jin1", "aff": "Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkhaled2024tuningfree,\ntitle={Tuning-Free Stochastic Optimization},\nauthor={Ahmed Khaled and Chi Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A6fmX9QCEa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 481881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14776053589763320381&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Graph2Tac: Online Representation Learning of Formal Math Concepts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34785", "id": "A7CtiozznN", "proceeding": "https://proceedings.mlr.press/v235/blaauwbroek24a.html", "pdf": "https://openreview.net/pdf?id=A7CtiozznN", "openreview": "https://openreview.net/forum?id=A7CtiozznN", "author_site": "Lasse Blaauwbroek, Mirek Ol\u0161\u00e1k, Jason Rute, Fidel I. Schaposnik Massolo, Jelle Piepenbrock, Vasily Pestun", "tldr": "", "abstract": "In proof assistants, the physical proximity between two formal mathematical concepts is a strong predictor of their mutual relevance. Furthermore, lemmas with close proximity regularly exhibit similar proof structures. We show that this _locality_ property can be exploited through online learning techniques to obtain solving agents that far surpass offline learners when asked to prove theorems in an unseen mathematical setting. 
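For the DoG algorithm mentioned in the tuning-free record above, the step size (as we recall it from Ivgi et al., 2023) is the maximum distance travelled from the start divided by the root of the cumulative squared gradient norms. A small sketch, with `r_eps` the usual small initial movement radius:

```python
import numpy as np

def dog(grad, x0, T, r_eps=1e-4):
    """Distance-over-Gradients: eta_t = rbar_t / sqrt(sum_i ||g_i||^2),
    where rbar_t tracks the farthest distance from x0 -- no tuned
    learning rate anywhere."""
    x, rbar, G = x0.astype(float).copy(), r_eps, 0.0
    for _ in range(T):
        g = grad(x)
        G += float(g @ g)
        x = x - (rbar / (np.sqrt(G) + 1e-12)) * g
        rbar = max(rbar, float(np.linalg.norm(x - x0)))
    return x

# e.g. dog(lambda x: 2 * x, np.array([5.0, -3.0]), T=200) approaches the origin
```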
We extensively benchmark two such online solvers implemented in the Tactician platform for the Coq proof assistant: First, Tactician's online $k$-nearest neighbor solver, which can learn from recent proofs, shows a $1.72\\times$ improvement in theorems proved over an offline equivalent. Second, we introduce a graph neural network, Graph2Tac, with a novel approach to build hierarchical representations for new definitions. Graph2Tac's online definition task realizes a $1.5\\times$ improvement in theorems solved over an offline baseline. The $k$-NN and Graph2Tac solvers rely on orthogonal online data, making them highly complementary. Their combination improves $1.27\\times$ over their individual performances. Both solvers outperform all other general purpose provers for Coq, including CoqHammer, Proverbot9001, and a transformer baseline by at least $1.48\\times$ and are available for practical use by end-users.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lasse Blaauwbroek;Mirek Ol\u0161\u00e1k;Jason Rute;Fidel Ivan Schaposnik Massolo;Jelle Piepenbrock;Vasily Pestun", "authorids": "~Lasse_Blaauwbroek1;~Mirek_Ol\u0161\u00e1k1;~Jason_Rute1;~Fidel_Ivan_Schaposnik_Massolo1;~Jelle_Piepenbrock1;~Vasily_Pestun1", "gender": "M;M;M;;;M", "homepage": "https://fr.linkedin.com/in/lasse-blaauwbroek-491306128;https://jasonrute.github.io;https://fidel-schaposnik.github.io;https://jellepiepenbrock.nl;https://pestun.ihes.fr;https://github.com/mirefek/", "dblp": ";141/9655;;285/5325.html;;", "google_scholar": ";Z-oVfDMAAAAJ;sQtdf8cAAAAJ;04Oz0iQAAAAJ;;", "orcid": ";0000-0002-6247-1882;0000-0002-9557-7296;;;", "linkedin": ";jason-rute;fidel-schaposnik/;;;", "or_profile": "~Lasse_Blaauwbroek1;~Jason_Rute1;~Fidel_Ivan_Schaposnik_Massolo1;~Jelle_Piepenbrock1;~Vasily_Pestun1;~Miroslav_Ol\u0161\u00e1k1", "aff": "IHES;IBM Research;;Technical University Prague;IHES;University of Cambridge", "aff_domain": "ihes.fr;research.ibm.com;;cvut.cz;ihes.fr;cam.ac.uk", "position": "Postdoc;Postdoc;;Researcher;Full Professor;Postdoc", "bibtex": "@inproceedings{\nblaauwbroek2024graphtac,\ntitle={Graph2Tac: Online Representation Learning of Formal Math Concepts},\nauthor={Lasse Blaauwbroek and Mirek Ol{\\v{s}}{\\'a}k and Jason Rute and Fidel Ivan Schaposnik Massolo and Jelle Piepenbrock and Vasily Pestun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A7CtiozznN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8395530, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8971815943165737380&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ihes.fr;research.ibm.com;;cvut.cz;ihes.fr;cam.ac.uk", "author_num": 6, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Institut des Hautes \u00c9tudes Scientifiques;IBM;Technical University of Prague;University of Cambridge", "aff_unique_dep": ";IBM Research;;", "aff_unique_url": "https://www.ihes.fr/;https://www.ibm.com/research;https://www.tup.cz;https://www.cam.ac.uk", "aff_unique_abbr": "IHES;IBM;TUP;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;2;0;3", "aff_country_unique": "France;United States;Czech Republic;United Kingdom" }, { "title": "A Statistical Framework for Data-dependent Retrieval-Augmented Models", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34784", "id": "A9MiJdetnZ", "proceeding": "https://proceedings.mlr.press/v235/basu24a.html", "pdf": "https://openreview.net/pdf?id=A9MiJdetnZ", "openreview": "https://openreview.net/forum?id=A9MiJdetnZ", "author_site": "Soumya Basu, Ankit Singh Rawat, Manzil Zaheer", "tldr": "", "abstract": "Modern ML systems increasingly augment input instances with additional relevant information to enhance final prediction. Despite growing interest in such retrieval-augmented models, their fundamental properties and training are not well understood. We propose a statistical framework to study such models with two components: 1) a retriever to identify the relevant information out of a large corpus via a data-dependent metric; and 2) a predictor that consumes the input instances along with the retrieved information to make the final predictions. We present a principled method for end-to-end training of both components and draw connections with various training approaches in the literature. Furthermore, we establish excess risk bounds for retrieval-augmented models while delineating the contributions of both retriever and predictor towards the model performance.We validate the utility of our proposed training methods along with the key takeaways from our statistical analysis on open domain question answering task where retrieval augmentation is important.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Soumya Basu;Ankit Singh Rawat;Manzil Zaheer", "authorids": "~Soumya_Basu2;~Ankit_Singh_Rawat1;~Manzil_Zaheer1", "gender": "M;M;M", "homepage": "https://basusoumya.github.io/;https://ankitsrawat.github.io/home/;https://www.aclweb.org/anthology/people/m/manzil-zaheer/", "dblp": "153/0318-1;https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;40/10701", "google_scholar": "VNQp_doAAAAJ;http://scholar.google.com/citations?user=U0_ab4cAAAAJ;A33FhJMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Soumya_Basu2;~Ankit_Singh_Rawat1;~Manzil_Zaheer1", "aff": "Google;Google;Google DeepMind", "aff_domain": "google.com;google.com;deepmind.com", "position": "SWE;Research Scientist;Researcher", "bibtex": "@inproceedings{\nbasu2024a,\ntitle={A Statistical Framework for Data-dependent Retrieval-Augmented Models},\nauthor={Soumya Basu and Ankit Singh Rawat and Manzil Zaheer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A9MiJdetnZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 537980, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:glMsYPWkT48J:scholar.google.com/&scioq=A+Statistical+Framework+for+Data-dependent+Retrieval-Augmented+Models&hl=en&as_sdt=0,48", "gs_version_total": 7, "email": "google.com;google.com;deepmind.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Learning Associative Memories with Gradient Descent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34783", "id": "A9fLbXLRTK", "proceeding": "https://proceedings.mlr.press/v235/cabannes24a.html", "pdf": "https://openreview.net/pdf?id=A9fLbXLRTK", 
"openreview": "https://openreview.net/forum?id=A9fLbXLRTK", "author_site": "Vivien Cabannnes, Berfin Simsek, Alberto Bietti", "tldr": "", "abstract": "This work focuses on the training dynamics of one associative memory module storing outer products of token embeddings. We reduce this problem to the study of a system of particles, which interact according to properties of the data distribution and correlations between embeddings. Through theory and experiments, we provide several insights. In overparameterized regimes, we obtain logarithmic growth of the ``classification margins.'' Yet, we show that imbalance in token frequencies and memory interferences due to correlated embeddings lead to oscillatory transitory regimes. The oscillations are more pronounced with large step sizes, which can create benign loss spikes, although these learning rates speed up the dynamics and accelerate the asymptotic convergence. We also find that underparameterized regimes lead to suboptimal memorization schemes. Finally, we assess the validity of our findings on small Transformer models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vivien Cabannes;Berfin Simsek;Alberto Bietti", "authorids": "~Vivien_Cabannes1;~Berfin_Simsek1;~Alberto_Bietti1", "gender": "Not Specified;F;M", "homepage": "https://viviencabannes.github.io/;https://www.bsimsek.com/;http://alberto.bietti.me", "dblp": ";244/2455;166/6461", "google_scholar": ";Ysi38KIAAAAJ;iT7Tp70AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vivien_Cabannes1;~Berfin_Simsek1;~Alberto_Bietti1", "aff": "META;New York University;Flatiron Institute", "aff_domain": "meta.com;nyu.edu;flatironinstitute.org", "position": "Postdoc;Assistant Professor;Researcher", "bibtex": "@inproceedings{\ncabannes2024learning,\ntitle={Learning Associative Memories with Gradient Descent},\nauthor={Vivien Cabannes and Berfin Simsek and Alberto Bietti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A9fLbXLRTK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1344034, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15371274638388992301&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "meta.com;nyu.edu;flatironinstitute.org", "author_num": 3, "aff_unique_index": "1;2", "aff_unique_norm": ";New York University;Flatiron Institute", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.nyu.edu;https://flatironinstitute.org", "aff_unique_abbr": ";NYU;Flatiron", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";United States" }, { "title": "Quantum Theory and Application of Contextual Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34782", "id": "A9hJvQHEEP", "proceeding": "https://proceedings.mlr.press/v235/mariella24a.html", "pdf": "https://openreview.net/pdf?id=A9hJvQHEEP", "openreview": "https://openreview.net/forum?id=A9hJvQHEEP", "author_site": "Nicola Mariella, Albert Akhriev, Francesco Tacchino, Christa Zoufal, Juan Gonzalez-Espitia, Benedek Harsanyi, Eugene Koskin, Ivano Tavernelli, Stefan Woerner, Marianna Rapsomaniki, Sergiy Zhuk, Jannis Born", "tldr": "", "abstract": "Optimal Transport (OT) has fueled machine learning (ML) across many domains. 
When paired data measurements $(\\boldsymbol{\\mu}, \\boldsymbol{\\nu})$ are coupled to covariates, a challenging conditional distribution learning setting arises. Existing approaches for learning a *global* transport map parameterized through a potentially unseen context utilize Neural OT and largely rely on Brenier's theorem. Here, we propose a first-of-its-kind quantum computing formulation for amortized optimization of contextualized transportation plans. We exploit a direct link between doubly stochastic matrices and unitary operators thus unravelling a natural connection between OT and quantum computation. We verify our method (QontOT) on synthetic and real data by predicting variations in cell type distributions conditioned on drug dosage. Importantly we conduct a 24-qubit hardware experiment on a task challenging for classical computers and report a performance that cannot be matched with our classical neural OT approach. In sum, this is a first step toward learning to predict contextualized transportation plans through quantum computing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicola Mariella;Albert Akhriev;Francesco Tacchino;Christa Zoufal;Juan Carlos Gonzalez-Espitia;Benedek Harsanyi;Eugene Koskin;Ivano Tavernelli;Stefan Woerner;Marianna Rapsomaniki;Sergiy Zhuk;Jannis Born", "authorids": "~Nicola_Mariella1;albert_akhriev@ie.ibm.com;fta@zurich.ibm.com;~Christa_Zoufal1;juan.carlos.gonzalez.espitia@ibm.com;benedek.harsanyi@ibm.com;eugin.koskin@gmail.com;ita@zurich.ibm.com;~Stefan_Woerner1;~Marianna_Rapsomaniki1;~Sergiy_Zhuk1;~Jannis_Born1", "gender": "M;;;F;;;;;;F;M;M", "homepage": ";;;;;;;;;;http://researcher.ibm.com/researcher/view.php?person=ie-sergiy.zhuk;", "dblp": ";;;;;;;;;;50/9226;230/4263", "google_scholar": ";;;4l3kCFEAAAAJ;;;;;;https://scholar.google.gr/citations?user=fFiBRAIAAAAJ;_U-MxYUAAAAJ;FHL-zfsAAAAJ", "orcid": "0000-0001-7268-1149;;; 0000-0003-4126-3141;;;;;;;0000-0002-0956-8909;0000-0001-8307-5670", "linkedin": ";;;christa-zoufal-433a31108/?originalSubdomain=ch;;;;;;;https://ie.linkedin.com/in/sergiy-zhuk-a0064517;", "or_profile": "~Nicola_Mariella1;albert_akhriev@ie.ibm.com;fta@zurich.ibm.com;~Christa_Zoufal1;juan.carlos.gonzalez.espitia@ibm.com;benedek.harsanyi@ibm.com;eugin.koskin@gmail.com;ita@zurich.ibm.com;~Stefan_Woerner1;~Marianna_Rapsomaniki1;~Sergiy_Zhuk1;~Jannis_Born1", "aff": "International Business Machines;;;International Business Machines;;;;;;IBM Research;IBM Research Europe;International Business Machines", "aff_domain": "ibm.com;;;ibm.com;;;;;;research.ibm.com;research.ibm.com;ibm.com", "position": "Researcher;;;Researcher;;;;;;Researcher;Research Manager;Researcher", "bibtex": "@inproceedings{\nmariella2024quantum,\ntitle={Quantum Theory and Application of Contextual Optimal Transport},\nauthor={Nicola Mariella and Albert Akhriev and Francesco Tacchino and Christa Zoufal and Juan Carlos Gonzalez-Espitia and Benedek Harsanyi and Eugene Koskin and Ivano Tavernelli and Stefan Woerner and Marianna Rapsomaniki and Sergiy Zhuk and Jannis Born},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A9hJvQHEEP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7593760, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5290166763776448442&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": 
"ibm.com;;;ibm.com;;;;;;research.ibm.com;research.ibm.com;ibm.com", "author_num": 12, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "International Business Machines Corporation;IBM", "aff_unique_dep": ";IBM Research", "aff_unique_url": "https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Unknown" }, { "title": "Learning Optimal Deterministic Policies with Stochastic Policy Gradients", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34781", "id": "ABt0jlLZtX", "proceeding": "https://proceedings.mlr.press/v235/montenegro24a.html", "pdf": "https://openreview.net/pdf?id=ABt0jlLZtX", "openreview": "https://openreview.net/forum?id=ABt0jlLZtX", "author_site": "Alessandro Montenegro, Marco Mussi, Alberto Maria Metelli, Matteo Papini", "tldr": "", "abstract": "Policy gradient (PG) methods are successful approaches to deal with continuous reinforcement learning (RL) problems. They learn stochastic parametric (hyper)policies by either exploring in the space of actions or in the space of parameters. Stochastic controllers, however, are often undesirable from a practical perspective because of their lack of robustness, safety, and traceability. In common practice, stochastic (hyper)policies are learned only to deploy their deterministic version. In this paper, we make a step towards the theoretical understanding of this practice. After introducing a novel framework for modeling this scenario, we study the global convergence to the best deterministic policy, under (weak) gradient domination assumptions. Then, we illustrate how to tune the exploration level used for learning to optimize the trade-off between the sample complexity and the performance of the deployed deterministic policy. 
Finally, we quantitatively compare action-based and parameter-based exploration, giving a formal guise to intuitive results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alessandro Montenegro;Marco Mussi;Alberto Maria Metelli;Matteo Papini", "authorids": "~Alessandro_Montenegro1;~Marco_Mussi1;~Alberto_Maria_Metelli2;~Matteo_Papini1", "gender": "M;M;M;M", "homepage": ";https://marcomussi.github.io/;https://albertometelli.github.io/;https://t3p.github.io/", "dblp": ";321/0756;209/4941;209/4897", "google_scholar": "CugD-ogAAAAJ;3gca-JUAAAAJ;R31IsPwAAAAJ;https://scholar.google.it/citations?user=A2WxZlsAAAAJ", "orcid": ";0000-0001-8356-6744;0000-0002-3424-5212;0000-0002-3807-3171", "linkedin": "alessandro-montenegro-3266291b7/;marcomussi95/;;matteo-papini/", "or_profile": "~Alessandro_Montenegro1;~Marco_Mussi1;~Alberto_Maria_Metelli2;~Matteo_Papini1", "aff": "Politecnico di Milano;Politecnico di Milano;Politecnico di Milano;Polytechnic Institute of Milan", "aff_domain": "polimi.it;polimi.it;polimi.it;polimi.it", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nmontenegro2024learning,\ntitle={Learning Optimal Deterministic Policies with Stochastic Policy Gradients},\nauthor={Alessandro Montenegro and Marco Mussi and Alberto Maria Metelli and Matteo Papini},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ABt0jlLZtX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5772337, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15510474492733319244&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "polimi.it;polimi.it;polimi.it;polimi.it", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Politecnico di Milano;Polytechnic Institute of Milan", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it;https://www.polimi.it/", "aff_unique_abbr": "Polimi;Politecnico di Milano", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Parsimonious Learning-Augmented Approximations for Dense Instances of $\\mathcal{NP}$-hard Problems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34780", "id": "AD5QC1BTJL", "proceeding": "https://proceedings.mlr.press/v235/bampis24a.html", "pdf": "https://openreview.net/pdf?id=AD5QC1BTJL", "openreview": "https://openreview.net/forum?id=AD5QC1BTJL", "author_site": "Evripidis Bampis, Bruno Escoffier, Michalis Xefteris", "tldr": "", "abstract": "The classical work of Arora et al. (1999) provides a scheme that gives, for any $\\epsilon>0$, a polynomial-time $(1-\\epsilon)$-approximation algorithm for dense instances of a family of $\\mathcal{NP}$-hard problems, such as Max-CUT and Max-$k$-SAT. In this paper we extend and speed up this scheme using a logarithmic number of one-bit predictions. We propose a learning-augmented framework which aims at finding fast algorithms that guarantee approximation consistency, smoothness, and robustness with respect to the prediction error.
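The practice formalized in the policy-gradient record above (learn a stochastic policy, deploy its deterministic version) is worth seeing side by side; a linear-Gaussian policy with action-based exploration is an illustrative choice here, not the paper's full framework.

```python
import numpy as np

def exploration_action(theta, sigma, s, rng):
    """Stochastic policy used during learning: mean action plus
    Gaussian exploration noise of scale sigma."""
    return theta @ s + sigma * rng.standard_normal(theta.shape[0])

def deployed_action(theta, s):
    """Deterministic policy actually deployed: the mean action."""
    return theta @ s
```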
We provide such algorithms, which moreover use predictions parsimoniously, for dense instances of various optimization problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Evripidis Bampis;Bruno Escoffier;Michalis Xefteris", "authorids": "evripidis.bampis@lip6.fr;bruno.escoffier@lip6.fr;~Michalis_Xefteris1", "gender": ";;M", "homepage": ";;https://mxef.github.io/", "dblp": ";;", "google_scholar": ";;Y1rFkrAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "evripidis.bampis@lip6.fr;bruno.escoffier@lip6.fr;~Michalis_Xefteris1", "aff": ";;Sorbonne Universit\u00e9 - Facult\u00e9 des Sciences (Paris VI)", "aff_domain": ";;sorbonne-universite.fr", "position": ";;PhD student", "bibtex": "@inproceedings{\nbampis2024parsimonious,\ntitle={Parsimonious Learning-Augmented Approximations for Dense Instances of \\${\\textbackslash}mathcal\\{{NP}\\}\\$-hard Problems},\nauthor={Evripidis Bampis and Bruno Escoffier and Michalis Xefteris},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AD5QC1BTJL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 350613, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15288533402146896026&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;sorbonne-universite.fr", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Sorbonne Universit\u00e9", "aff_unique_dep": "Facult\u00e9 des Sciences", "aff_unique_url": "https://www.sorbonne-universite.fr", "aff_unique_abbr": "Sorbonne U", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris VI", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "title": "Do Large Code Models Understand Programming Concepts? Counterfactual Analysis for Code Predicates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34779", "id": "ADnUzsmsLW", "proceeding": "https://proceedings.mlr.press/v235/hooda24a.html", "pdf": "https://openreview.net/pdf?id=ADnUzsmsLW", "openreview": "https://openreview.net/forum?id=ADnUzsmsLW", "author_site": "Ashish Hooda, Mihai Christodorescu, Miltiadis Allamanis, Aaron Wilson, Kassem Fawaz, Somesh Jha", "tldr": "", "abstract": "Large Language Models' success in text generation has also made them better at code generation and coding tasks. While a lot of work has demonstrated their remarkable performance on tasks such as code completion and editing, it is still unclear as to why. We help bridge this gap by exploring to what degree auto-regressive models understand the logical constructs of the underlying programs. We propose Counterfactual Analysis for Programming Concept Predicates (CACP) as a counterfactual testing framework to evaluate whether Large Code Models understand programming concepts. With only black-box access to the model, we use CACP to evaluate ten popular Large Code Models for four different programming concepts. 
Our findings suggest that current models lack understanding of concepts such as data flow and control flow.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ashish Hooda;Mihai Christodorescu;Miltiadis Allamanis;Aaron Wilson;Kassem Fawaz;Somesh Jha", "authorids": "~Ashish_Hooda1;~Mihai_Christodorescu1;~Miltiadis_Allamanis1;aaroncrwilson@google.com;~Kassem_Fawaz1;~Somesh_Jha1", "gender": ";M;;;;M", "homepage": "https://pages.cs.wisc.edu/~hooda;https://mihai.christodorescu.org;;;https://kassemfawaz.com;", "dblp": "279/6684;64/4993.html;;;97/535.html;j/SomeshJha", "google_scholar": "wCzkVGgAAAAJ;jRnIqvkAAAAJ;;;8TINuv4AAAAJ;BaI7l8QAAAAJ", "orcid": ";;;;0000-0002-4609-7691;", "linkedin": ";mihaichristodorescu/;;;kmfawaz/;", "or_profile": "~Ashish_Hooda1;~Mihai_Christodorescu1;~Miltiadis_Allamanis1;aaroncrwilson@google.com;~Kassem_Fawaz1;~Somesh_Jha1", "aff": "Department of Computer Science, University of Wisconsin - Madison;Google;;;University of Wisconsin - Madison;Department of Computer Science, University of Wisconsin, Madison", "aff_domain": "cs.wisc.edu;google.com;;;wisc.edu;cs.wisc.edu", "position": "PhD student;Researcher;;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nhooda2024do,\ntitle={Do Large Code Models Understand Programming Concepts? Counterfactual Analysis for Code Predicates},\nauthor={Ashish Hooda and Mihai Christodorescu and Miltiadis Allamanis and Aaron Wilson and Kassem Fawaz and Somesh Jha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ADnUzsmsLW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 599145, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6764858823992051557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cs.wisc.edu;google.com;;;wisc.edu;cs.wisc.edu", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Wisconsin-Madison;Google", "aff_unique_dep": "Department of Computer Science;Google", "aff_unique_url": "https://www.wisc.edu;https://www.google.com", "aff_unique_abbr": "UW-Madison;Google", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Madison;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Consistency of Kernel Methods with Dependent Observations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34778", "id": "AEHXvoOxV9", "proceeding": "https://proceedings.mlr.press/v235/massiani24a.html", "pdf": "https://openreview.net/pdf?id=AEHXvoOxV9", "openreview": "https://openreview.net/forum?id=AEHXvoOxV9", "author_site": "Pierre-Fran\u00e7ois Massiani, Sebastian Trimpe, Friedrich Solowjow", "tldr": "", "abstract": "The consistency of a learning method is usually established under the assumption that the observations are a realization of an independent and identically distributed (i.i.d.) or mixing process. Yet, kernel methods such as support vector machines (SVMs), Gaussian processes, or conditional kernel mean embeddings (CKMEs) all give excellent performance under sampling schemes that are obviously non-i.i.d., such as when data comes from a dynamical system. We propose the new notion of *empirical weak convergence (EWC)* as a general assumption explaining such phenomena for kernel methods. 
It assumes the existence of a random asymptotic data distribution and is a strict weakening of previous assumptions in the field. Our main results then establish consistency of SVMs, kernel mean embeddings, and general Hilbert-space valued empirical expectations with EWC data. Our analysis holds for both finite- and infinite-dimensional outputs, as we extend classical results of statistical learning to the latter case. In particular, it is also applicable to CKMEs. Overall, our results open new classes of processes to statistical learning and can serve as a foundation for a theory of learning beyond i.i.d. and mixing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pierre-Fran\u00e7ois Massiani;Sebastian Trimpe;Friedrich Solowjow", "authorids": "~Pierre-Fran\u00e7ois_Massiani2;~Sebastian_Trimpe1;~Friedrich_Solowjow1", "gender": "M;M;", "homepage": ";https://www.dsme.rwth-aachen.de/trimpe;https://www.dsme.rwth-aachen.de/cms/DSME/Das-Institut/Team-CMS-Artikel-/~jptyz/Friedrich-Solowjow/", "dblp": ";15/8135;217/1553", "google_scholar": "ax9cEIQAAAAJ;https://scholar.google.de/citations?user=9kzHZssAAAAJ;https://scholar.google.de/citations?user=gq_ESzoAAAAJ", "orcid": "0000-0002-8019-4401;0000-0002-2785-2487;", "linkedin": ";sebastian-trimpe-2472a0a3/;", "or_profile": "~Pierre-Fran\u00e7ois_Massiani2;~Sebastian_Trimpe1;~Friedrich_Solowjow1", "aff": "Rheinisch Westf\u00e4lische Technische Hochschule Aachen;RWTH Aachen University;Rheinisch Westf\u00e4lische Technische Hochschule Aachen", "aff_domain": "rwth-aachen.de;rwth-aachen.de;rwth-aachen.de", "position": "PhD student;Full Professor;Lecturer", "bibtex": "@inproceedings{\nmassiani2024on,\ntitle={On the Consistency of Kernel Methods with Dependent Observations},\nauthor={Pierre-Fran{\\c{c}}ois Massiani and Sebastian Trimpe and Friedrich Solowjow},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AEHXvoOxV9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 502489, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YwDOAWc9X70J:scholar.google.com/&scioq=On+the+Consistency+of+Kernel+Methods+with+Dependent+Observations&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "rwth-aachen.de;rwth-aachen.de;rwth-aachen.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "RWTH Aachen University", "aff_unique_dep": "", "aff_unique_url": "https://www.rwth-aachen.de", "aff_unique_abbr": "RWTH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Aachen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Understanding Diffusion Models by Feynman's Path Integral", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34777", "id": "AEqim4X0NV", "proceeding": "https://proceedings.mlr.press/v235/hirono24a.html", "pdf": "https://openreview.net/pdf?id=AEqim4X0NV", "openreview": "https://openreview.net/forum?id=AEqim4X0NV", "author_site": "Yuji Hirono, Akinori Tanaka, Kenji Fukushima", "tldr": "", "abstract": "Score-based diffusion models have proven effective in image generation and have gained widespread usage; however, the underlying factors contributing to the performance disparity between stochastic and deterministic (i.e., the probability flow ODEs) sampling schemes remain unclear. 
We introduce a novel formulation of diffusion models using Feynman's path integral, which is a formulation originally developed for quantum physics. We find that this formulation provides comprehensive descriptions of score-based generative models, and demonstrate the derivation of backward stochastic differential equations and loss functions. The formulation accommodates an interpolating parameter connecting stochastic and deterministic sampling schemes, and we identify this parameter as a counterpart of Planck's constant in quantum physics. This analogy enables us to apply the Wentzel\u2013Kramers\u2013Brillouin (WKB) expansion, a well-established technique in quantum physics, for evaluating the negative log-likelihood to assess the performance disparity between stochastic and deterministic sampling schemes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuji Hirono;Akinori Tanaka;Kenji Fukushima", "authorids": "~Yuji_Hirono1;~Akinori_Tanaka1;~Kenji_Fukushima1", "gender": "M;M;M", "homepage": "https://sites.google.com/site/yujihironooo/;;", "dblp": ";243/2791;", "google_scholar": "pK2n4xsAAAAJ;tj5TiyMAAAAJ;zOtJ5sQAAAAJ", "orcid": "0000-0001-6327-9131;;", "linkedin": ";;", "or_profile": "~Yuji_Hirono1;~Akinori_Tanaka1;~Kenji_Fukushima1", "aff": "Kyoto University;RIKEN;The University of Tokyo, Tokyo Institute of Technology", "aff_domain": "kyoto-u.ac.jp;riken.jp;u-tokyo.ac.jp", "position": "Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nhirono2024understanding,\ntitle={Understanding Diffusion Models by Feynman's Path Integral},\nauthor={Yuji Hirono and Akinori Tanaka and Kenji Fukushima},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AEqim4X0NV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2439177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16912160508447930208&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "kyoto-u.ac.jp;riken.jp;u-tokyo.ac.jp", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Kyoto University;RIKEN;University of Tokyo", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kyoto-u.ac.jp;https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Kyoto U;RIKEN;UTokyo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "DFlow: A Generative Model Combining Denoising AutoEncoder and Normalizing Flow for High Fidelity Waveform Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34776", "id": "AFAX28TdO4", "proceeding": "https://proceedings.mlr.press/v235/miao24d.html", "pdf": "https://openreview.net/pdf?id=AFAX28TdO4", "openreview": "https://openreview.net/forum?id=AFAX28TdO4", "author_site": "Chenfeng Miao, Qingying Zhu, Chen Minchuan, Wei Hu, Zijian Li, Shaojun Wang, Jing Xiao", "tldr": "", "abstract": "In this work, we present DFlow, a novel generative framework that combines Normalizing Flow (NF) with a Denoising AutoEncoder (DAE), for high-fidelity waveform generation. With a tactfully designed structure, DFlow seamlessly integrates the capabilities of both NF and DAE, resulting in significantly improved performance compared to the standard NF models.
Experimental results showcase DFlow's superiority, achieving the highest MOS score among the existing methods on commonly used datasets and the fastest synthesis speed among all likelihood models. We further demonstrate the generalization ability of DFlow by generating high-quality out-of-distribution audio samples, such as singing and music audio. Additionally, we extend the model capacity of DFlow by scaling up both the model size and training set size. Our large-scale universal vocoder, DFlow-XL, achieves highly competitive performance against the best universal vocoder, BigVGAN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenfeng Miao;Qingying Zhu;Minchuan Chen;Wei Hu;Zijian Li;Shaojun Wang;Jing Xiao", "authorids": "~Chenfeng_Miao1;~Qingying_Zhu1;~Minchuan_Chen1;~Wei_Hu15;~Zijian_Li8;~Shaojun_Wang1;~Jing_Xiao3", "gender": "M;F;M;M;M;M;M", "homepage": ";;https://www.linkedin.cn/incareer/in/ACoAAA2jMKYB8UxwqZl3KeQ-AcLS7XGfGCmXh1M;https://baike.baidu.com/item/%E8%83%A1%E7%8E%AE/22216769?fr=ge_ala;;;http://www.cs.cmu.edu/~jxiao/", "dblp": "270/4712;;263/5066.html;;;62/6040;67/4008-6.html", "google_scholar": "omEV9JwAAAAJ;;ru7ffmgAAAAJ;;https://scholar.google.com/citations?hl=en;;mcBd8KUAAAAJ", "orcid": ";0009-0003-0155-0709;;0009-0000-3976-8816;0000-0003-4092-5131;;0000-0001-9615-4749", "linkedin": ";https://www.linkedin.cn/incareer/in/ACoAACSp_VYBGnvbqI1F0cJwl1vj0fj2P76TgBg;;;zijian-li-9252b8b6/;;jing-xiao-8653051/", "or_profile": "~Chenfeng_Miao1;~Qingying_Zhu1;~Minchuan_Chen1;~Wei_Hu15;~Zijian_Li8;~Shaojun_Wang1;~Jing_Xiao3", "aff": "PingAn Technology;Pingan Technology;Pingan Technology;Pingan Technology;Meta;PAII Inc.;Pingan Group", "aff_domain": "pingan.com.cn;pingan.com.cn;pingan.com.cn;pingan.com.cn;meta.com;pingan.com.cn;pingan.com.cn", "position": "Researcher;Researcher;Researcher;CTO;Researcher;NLP chief scientist;Chief Scientist", "bibtex": "@inproceedings{\nmiao2024dflow,\ntitle={{DF}low: A Generative Model Combining Denoising AutoEncoder and Normalizing Flow for High Fidelity Waveform Generation},\nauthor={Chenfeng Miao and Qingying Zhu and Minchuan Chen and Wei Hu and Zijian Li and Shaojun Wang and Jing Xiao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AFAX28TdO4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KsVa9yjqjpsJ:scholar.google.com/&scioq=DFlow:+A+Generative+Model+Combining+Denoising+AutoEncoder+and+Normalizing+Flow+for+High+Fidelity+Waveform+Generation&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "pingan.com.cn;pingan.com.cn;pingan.com.cn;pingan.com.cn;meta.com;pingan.com.cn;pingan.com.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;1;2;3", "aff_unique_norm": "PingAn Technology;Meta;PAII Inc.;Ping An Group", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.pingan.com;https://meta.com;;https://www.pingan.com.cn", "aff_unique_abbr": "PingAn;Meta;;Ping An", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "A Diffusion Model Framework for Unsupervised Neural Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34775", "id": "AFfXlKFHXJ", "proceeding": 
"https://proceedings.mlr.press/v235/sanokowski24a.html", "pdf": "https://openreview.net/pdf?id=AFfXlKFHXJ", "openreview": "https://openreview.net/forum?id=AFfXlKFHXJ", "author_site": "Sebastian Sanokowski, Sepp Hochreiter, Sebastian Lehner", "tldr": "", "abstract": "Learning to sample from intractable distributions over discrete sets without relying on corresponding training data is a central problem in a wide range of fields, including Combinatorial Optimization. Currently, popular deep learning-based approaches rely primarily on generative models that yield exact sample likelihoods. This work introduces a method that lifts this restriction and opens the possibility to employ highly expressive latent variable models like diffusion models. Our approach is conceptually based on a loss that upper bounds the reverse Kullback-Leibler divergence and evades the requirement of exact sample likelihoods. We experimentally validate our approach in data-free Combinatorial Optimization and demonstrate that our method achieves a new state-of-the-art on a wide range of benchmark problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sebastian Sanokowski;Sepp Hochreiter;Sebastian Lehner", "authorids": "~Sebastian_Sanokowski1;~Sepp_Hochreiter1;~Sebastian_Lehner1", "gender": "M;M;", "homepage": ";https://www.jku.at/en/institute-for-machine-learning/about-us/team/sepp-hochreiter/;https://www.jku.at/institut-fuer-machine-learning/ueber-uns/team/dr-sebastian-lehner/", "dblp": "277/0779;h/SeppHochreiter.html;292/2938", "google_scholar": "9A8llhsAAAAJ;https://scholar.google.at/citations?user=tvUH3WMAAAAJ;gZO5TdUAAAAJ", "orcid": "0000-0001-8065-5805;0000-0001-7449-2528;", "linkedin": ";https://linkedin.com/in/sepp-hochreiter-41514846;", "or_profile": "~Sebastian_Sanokowski1;~Sepp_Hochreiter1;~Sebastian_Lehner1", "aff": "Johannes Kepler University Linz;Johannes Kepler University Linz;Johannes Kepler University Linz", "aff_domain": "jku.at;jku.at;jku.at", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@inproceedings{\nsanokowski2024a,\ntitle={A Diffusion Model Framework for Unsupervised Neural Combinatorial Optimization},\nauthor={Sebastian Sanokowski and Sepp Hochreiter and Sebastian Lehner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AFfXlKFHXJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3191560, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=93953557853879800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "jku.at;jku.at;jku.at", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Johannes Kepler University", "aff_unique_dep": "", "aff_unique_url": "https://www.jku.at", "aff_unique_abbr": "JKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Linz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Austria" }, { "title": "Learning Exceptional Subgroups by End-to-End Maximizing KL-Divergence", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34774", "id": "AG45XqwPKU", "proceeding": "https://proceedings.mlr.press/v235/xu24w.html", "pdf": "https://openreview.net/pdf?id=AG45XqwPKU", "openreview": "https://openreview.net/forum?id=AG45XqwPKU", "author_site": "Sascha Xu, Nils Philipp Walter, Janis Kalofolias, Jilles Vreeken", "tldr": "", "abstract": "Finding and describing sub-populations 
that are exceptional in terms of a target property has important applications in many scientific disciplines, from identifying disadvantaged demographic groups in census data to finding conductive molecules within gold nanoparticles. Current approaches to finding such subgroups require pre-discretized predictive variables, do not permit non-trivial target distributions, do not scale to large datasets, and struggle to find diverse results. To address these limitations, we propose SYFLOW, an end-to-end optimizable approach in which we leverage normalizing flows to model arbitrary target distributions and introduce a novel neural layer that results in easily interpretable subgroup descriptions. We demonstrate on synthetic data, real-world data, and via a case study, that SYFLOW reliably finds highly exceptional subgroups accompanied by insightful descriptions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sascha Xu;Nils Philipp Walter;Janis Kalofolias;Jilles Vreeken", "authorids": "~Sascha_Xu1;~Nils_Philipp_Walter1;~Janis_Kalofolias1;~Jilles_Vreeken2", "gender": "M;M;M;M", "homepage": ";;;https://vreeken.eu", "dblp": "247/3300;;194/4254;94/6462", "google_scholar": "https://scholar.google.de/citations?user=82xDR9IAAAAJ;https://scholar.google.de/citations?user=uMpszvoAAAAJ;;p5HEQfIAAAAJ", "orcid": "0009-0008-5191-0342;;0000-0002-5842-8750;0000-0002-2310-2806", "linkedin": "sascha-xu-36073216a/;;;jilles-vreeken-b3b05b58/", "or_profile": "~Sascha_Xu1;~Nils_Philipp_Walter1;~Janis_Kalofolias1;~Jilles_Vreeken2", "aff": "CISPA, saarland university, saarland informatics campus;CISPA, saarland university, saarland informatics campus;CISPA, Helmholtz Center for Information Security;CISPA Helmholtz Center for Information Security", "aff_domain": "cispa.saarland;cispa.saarland;cispa.saarland;cispa.de", "position": "PhD student;PhD student;PhD student;Tenured Faculty", "bibtex": "@inproceedings{\nxu2024learning,\ntitle={Learning Exceptional Subgroups by End-to-End Maximizing {KL}-Divergence},\nauthor={Sascha Xu and Nils Philipp Walter and Janis Kalofolias and Jilles Vreeken},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AG45XqwPKU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1594096, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13003679313300569398&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "email": "cispa.saarland;cispa.saarland;cispa.saarland;cispa.de", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Saarland University;Helmholtz Center for Information Security;CISPA Helmholtz Center for Information Security", "aff_unique_dep": "CISPA;CISPA;", "aff_unique_url": "https://www.uni-saarland.de;https://www.cispa.de/;https://www.cispa.de/", "aff_unique_abbr": "Saarland U;CISPA;CISPA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Position: Stop Making Unscientific AGI Performance Claims", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34773", "id": "AIXUuLCuMe", "proceeding": "https://proceedings.mlr.press/v235/altmeyer24a.html", "pdf": "https://openreview.net/pdf?id=AIXUuLCuMe", "openreview": "https://openreview.net/forum?id=AIXUuLCuMe", "author_site": "Patrick Altmeyer, Andrew Demetriou, 
Antony Bartlett, Cynthia C. S. Liem", "tldr": "", "abstract": "Developments in the field of Artificial Intelligence (AI), and particularly large language models (LLMs), have created a \u2018perfect storm\u2019 for observing \u2018sparks\u2019 of Artificial General Intelligence (AGI) that are spurious. Like simpler models, LLMs distill meaningful representations in their latent embeddings that have been shown to correlate with external variables. Nonetheless, the correlation of such representations has often been linked to human-like intelligence in the latter but not the former. We probe models of varying complexity including random projections, matrix decompositions, deep autoencoders and transformers: all of them successfully distill information that can be used to predict latent or external variables and yet none of them have previously been linked to AGI. We argue and empirically demonstrate that the finding of meaningful patterns in latent spaces of models cannot be seen as evidence in favor of AGI. Additionally, we review literature from the social sciences that shows that humans are prone to seek such patterns and anthropomorphize. We conclude that both the methodological setup and common public image of AI are ideal for the misinterpretation that correlations between model representations and some variables of interest are \u2018caused\u2019 by the model's understanding of underlying \u2018ground truth\u2019 relationships. We, therefore, call for the academic community to exercise extra caution, and to be keenly aware of principles of academic integrity, in interpreting and communicating about AI research outcomes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Patrick Altmeyer;Andrew M. Demetriou;Antony Bartlett;Cynthia C. S. Liem", "authorids": "~Patrick_Altmeyer1;~Andrew_M._Demetriou1;a.j.bartlett@tudelft.nl;~Cynthia_C._S._Liem2", "gender": "M;M;;", "homepage": "https://www.paltmeyer.com/;;;", "dblp": ";;;", "google_scholar": "e7KRRa8AAAAJ;TFnVkFQAAAAJ;;", "orcid": "0000-0003-4726-8613;0000-0002-0724-2278;;", "linkedin": "patrick-altmeyer-a2a25494/;andrew-m-demetriou/;;", "or_profile": "~Patrick_Altmeyer1;~Andrew_M._Demetriou1;a.j.bartlett@tudelft.nl;~Cynthia_C._S._Liem2", "aff": "Delft University of Technology;Delft University of Technology;;", "aff_domain": "tudelft.nl;tudelft.nl;;", "position": "PhD student;PhD student;;", "bibtex": "@inproceedings{\naltmeyer2024position,\ntitle={Position: Stop Making Unscientific {AGI} Performance Claims},\nauthor={Patrick Altmeyer and Andrew M. Demetriou and Antony Bartlett and Cynthia C. S.
Liem},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AIXUuLCuMe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3355148, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4093763960403463418&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 8, "email": "tudelft.nl;tudelft.nl;;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Reinforcement Learning within Tree Search for Fast Macro Placement", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34772", "id": "AJGwSx0RUV", "proceeding": "https://proceedings.mlr.press/v235/geng24b.html", "pdf": "https://openreview.net/pdf?id=AJGwSx0RUV", "openreview": "https://openreview.net/forum?id=AJGwSx0RUV", "author_site": "Zijie Geng, Jie Wang, Ziyan Liu, Siyuan Xu, Zhentao Tang, Mingxuan Yuan, Jianye Hao, Yongdong Zhang, Feng Wu", "tldr": "", "abstract": "Macro placement is a crucial step in modern chip design, and reinforcement learning (RL) has recently emerged as a promising technique for improving the placement quality. However, existing RL-based techniques are hindered by their low sample efficiency, requiring numerous online rollouts or substantial offline expert data to achieve bootstrap, which are often impractical in industrial scenarios. To address this challenge, we propose a novel sample-efficient framework, namely **EfficientPlace**, for fast macro placement. EfficientPlace integrates a global tree search algorithm to strategically direct the optimization process, as well as an RL agent for local policy learning to advance the tree search.
Experiments on commonly used benchmarks demonstrate that EfficientPlace achieves remarkable placement quality within a short timeframe, outperforming recent state-of-the-art approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zijie Geng;Jie Wang;Ziyan Liu;Siyuan Xu;Zhentao Tang;Mingxuan Yuan;Jianye HAO;Yongdong Zhang;Feng Wu", "authorids": "~Zijie_Geng1;~Jie_Wang1;~Ziyan_Liu2;~Siyuan_Xu5;~Zhentao_Tang1;~Mingxuan_Yuan1;~Jianye_HAO1;~Yongdong_Zhang2;~Feng_Wu1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "https://miralab.ai/people/zijie-geng/;http://staff.ustc.edu.cn/~jwangx;https://github.com/Leo-ux-jpg;;;;http://www.icdai.org/jianye.html;https://imcc.ustc.edu.cn/_upload/tpl/0d/13/3347/template3347/zhangyongdong.html;", "dblp": "320/7568;29/5259-5;;;195/1259.html;74/2356;21/7664.html;z/YongdongZhang;25/3972-1", "google_scholar": "https://scholar.google.com.hk/citations?user=Ga66HL4AAAAJ;OugG4dUAAAAJ;;;;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?user=hxGs4ukAAAAJ;5bInRDEAAAAJ", "orcid": ";;;0000-0001-6239-6774;0000-0002-2481-4119;0000-0002-2236-8784;0000-0002-0422-8235;0000-0003-0066-3448;", "linkedin": ";;;siyuanxu1991/;;;;;", "or_profile": "~Zijie_Geng1;~Jie_Wang1;~Ziyan_Liu2;~Siyuan_Xu5;~Zhentao_Tang1;~Mingxuan_Yuan1;~Jianye_HAO1;~Yongdong_Zhang2;~Feng_Wu1", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Tianjin University;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "mail.ustc.edu.cn;ustc.edu.cn;mail.ustc.edu.cn;huawei.com;huawei.com;huawei.com;tju.edu.cn;ustc.edu.cn;ustc.edu.cn", "position": "MS student;Full Professor;Undergrad student;Researcher;Researcher;Researcher;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\ngeng2024reinforcement,\ntitle={Reinforcement Learning within Tree Search for Fast Macro Placement},\nauthor={Zijie Geng and Jie Wang and Ziyan Liu and Siyuan Xu and Zhentao Tang and Mingxuan Yuan and Jianye HAO and Yongdong Zhang and Feng Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AJGwSx0RUV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4422019, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7138732990656367055&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "mail.ustc.edu.cn;ustc.edu.cn;mail.ustc.edu.cn;huawei.com;huawei.com;huawei.com;tju.edu.cn;ustc.edu.cn;ustc.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;1;1;1;2;0;0", "aff_unique_norm": "University of Science and Technology of China;Huawei;Tianjin University", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "USTC;Huawei;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Interplay of ROC and Precision-Recall AUCs: Theoretical Limits and Practical Implications in Binary Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34771", "id": "ALc7DmOTI2", "proceeding": 
"https://proceedings.mlr.press/v235/mihelich24a.html", "pdf": "https://openreview.net/pdf?id=ALc7DmOTI2", "openreview": "https://openreview.net/forum?id=ALc7DmOTI2", "author_site": "Martin Mihelich, Fran\u00e7ois Castagnos, Charles Dognin", "tldr": "", "abstract": "In this paper, we present two key theorems that should have significant implications for machine learning practitioners working with binary classification models. The first theorem provides a formula to calculate the maximum and minimum Precision-Recall AUC ($AUC_{PR}$) for a fixed Receiver Operating Characteristic AUC ($AUC_{ROC}$), demonstrating the variability of $AUC_{PR}$ even with a high $AUC_{ROC}$. This is particularly relevant for imbalanced datasets, where a good $AUC_{ROC}$ does not necessarily imply a high $AUC_{PR}$. The second theorem inversely establishes the bounds of $AUC_{ROC}$ given a fixed $AUC_{PR}$. Our findings highlight that in certain situations, especially for imbalanced datasets, it is more informative to prioritize $AUC_{PR}$ over $AUC_{ROC}$. Additionally, we introduce a method to determine when a higher $AUC_{ROC}$ in one model implies a higher $AUC_{PR}$ in another and vice versa, streamlining the model evaluation process.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Martin Mihelich;Fran\u00e7ois Castagnos;Charles Dognin", "authorids": "~Martin_Mihelich1;~Fran\u00e7ois_Castagnos1;~Charles_Dognin2", "gender": "M;;", "homepage": ";;", "dblp": ";;https://dblp.uni-trier.de/pers/hd/d/Dognin:Charles", "google_scholar": "vx_tA0UAAAAJ;;Q9IMhr0AAAAJ", "orcid": ";;", "linkedin": ";https://fr.linkedin.com/in/castafra;charlesdognin/", "or_profile": "~Martin_Mihelich1;~Fran\u00e7ois_Castagnos1;~charles_dognin1", "aff": ";Glanceable;Glanceable", "aff_domain": ";glanceable.io;glanceable.io", "position": ";Researcher;Principal Researcher", "bibtex": "@inproceedings{\nmihelich2024interplay,\ntitle={Interplay of {ROC} and Precision-Recall {AUC}s: Theoretical Limits and Practical Implications in Binary Classification},\nauthor={Martin Mihelich and Fran{\\c{c}}ois Castagnos and Charles Dognin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ALc7DmOTI2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 792682, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Hf9PH7ZbGQYJ:scholar.google.com/&scioq=Interplay+of+ROC+and+Precision-Recall+AUCs:+Theoretical+Limits+and+Practical+Implications+in+Binary+Classification&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": ";glanceable.io;glanceable.io", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Glanceable", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "title": "Constrained Ensemble Exploration for Unsupervised Skill Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34770", "id": "AOJCCFTlfJ", "proceeding": "https://proceedings.mlr.press/v235/bai24d.html", "pdf": "https://openreview.net/pdf?id=AOJCCFTlfJ", "openreview": "https://openreview.net/forum?id=AOJCCFTlfJ", "author_site": "Chenjia Bai, Rushuai Yang, Qiaosheng Zhang, Kang Xu, Yi Chen, Ting Xiao, Xuelong Li", "tldr": "", "abstract": "Unsupervised Reinforcement Learning (RL) provides a promising 
paradigm for learning useful behaviors via reward-free pre-training. Existing methods for unsupervised RL mainly conduct empowerment-driven skill discovery or entropy-based exploration. However, empowerment often leads to static skills, and pure exploration only maximizes the state coverage rather than learning useful behaviors. In this paper, we propose a novel unsupervised RL framework via an ensemble of skills, where each skill performs partition exploration based on the state prototypes. Thus, each skill can explore the clustered area locally, and the ensemble skills maximize the overall state coverage. We adopt state-distribution constraints for the skill occupancy and the desired cluster for learning distinguishable skills. Theoretical analysis is provided for the state entropy and the resulting skill distributions. Based on extensive experiments on several challenging tasks, we find our method learns well-explored ensemble skills and achieves superior performance in various downstream tasks compared to previous methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenjia Bai;Rushuai Yang;Qiaosheng Zhang;Kang Xu;Yi Chen;Ting Xiao;Xuelong Li", "authorids": "~Chenjia_Bai2;~Rushuai_Yang1;~Qiaosheng_Zhang2;~Kang_Xu2;~Yi_Chen18;~Ting_Xiao1;~Xuelong_Li2", "gender": "M;M;M;M;M;F;M", "homepage": "https://baichenjia.github.io/;https://github.com/Rooshy-yang;https://qiaoshengzhang.github.io/;https://kangxu023.github.io/;https://seng.hkust.edu.hk/about/people/faculty/yi-chen;https://scholar.google.com/citations?hl=zh-CN&user=XMIU7ygAAAAJ;", "dblp": "247/1943;;181/8458.html;295/1622;;;l/XuelongLi", "google_scholar": "Rm_1y2kAAAAJ;TiHUw0UAAAAJ;;7FTLsHUAAAAJ;https://scholar.google.ca/citations?user=Vjpq4aYAAAAJ;https://scholar.google.com/citations?hl=zh-CN;ahUibskAAAAJ", "orcid": ";;0000-0001-6114-8453;0000-0001-6040-3002;;0000-0003-3155-7664;", "linkedin": ";;;;;;", "or_profile": "~Chenjia_Bai2;~Rushuai_Yang1;~Qiaosheng_Zhang2;~Kang_Xu2;~Yi_Chen18;~Ting_Xiao1;~Xuelong_Li2", "aff": "Shanghai AI Laboratory;Hong Kong University of Science and Technology;Shanghai Artificial Intelligence Laboratory;Fudan University;Hong Kong University of Science and Technology;East China University of Science and Technology;Northwestern Polytechnical University", "aff_domain": "pjlab.org.cn;ust.hk;pjlab.org.cn;fudan.edu.cn;ust.hk;ecust.edu.cn;nwpu.edu.cn", "position": "Researcher;PhD student;Researcher;MS student;Assistant Professor;Lecturer;Full Professor", "bibtex": "@inproceedings{\nbai2024constrained,\ntitle={Constrained Ensemble Exploration for Unsupervised Skill Discovery},\nauthor={Chenjia Bai and Rushuai Yang and Qiaosheng Zhang and Kang Xu and Yi Chen and Ting Xiao and Xuelong Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AOJCCFTlfJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9702311, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5973272266065702190&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "pjlab.org.cn;ust.hk;pjlab.org.cn;fudan.edu.cn;ust.hk;ecust.edu.cn;nwpu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;3;1;4;5", "aff_unique_norm": "Shanghai AI Laboratory;Hong Kong University of Science and Technology;Shanghai Artificial Intelligence Laboratory;Fudan University;East China University of Science and Technology;Northwestern Polytechnical
University", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.ust.hk;http://www.shailab.org/;https://www.fudan.edu.cn;http://www.ecust.edu.cn;https://www.nwpu.edu.cn", "aff_unique_abbr": "SAIL;HKUST;Shanghai AI Lab;Fudan;ECUST;NWPU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Analysis for Abductive Learning and Neural-Symbolic Reasoning Shortcuts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34769", "id": "AQYabSOfci", "proceeding": "https://proceedings.mlr.press/v235/yang24ac.html", "pdf": "https://openreview.net/pdf?id=AQYabSOfci", "openreview": "https://openreview.net/forum?id=AQYabSOfci", "author_site": "Xiao-Wen Yang, Wen-Da Wei, Jie-Jing Shao, Yu-Feng Li, Zhi-Hua Zhou", "tldr": "", "abstract": "Abductive learning models (ABL) and neural-symbolic predictive models (NeSy) have been recently shown effective, as they allow us to infer labels that are consistent with some prior knowledge by reasoning over high-level concepts extracted from sub-symbolic inputs. However, their generalization ability is affected by reasoning shortcuts: high accuracy on given targets but leveraging intermediate concepts with unintended semantics. Although there have been techniques to alleviate reasoning shortcuts, theoretical efforts on this issue remain to be limited. This paper proposes a simple and effective analysis to quantify harm caused by it and how can mitigate it. We quantify three main factors in how NeSy algorithms are affected by reasoning shortcuts: the complexity of the knowledge base, the sample size, and the hypothesis space. In addition, we demonstrate that ABL can reduce shortcut risk by selecting specific distance functions in consistency optimization, thereby demonstrating its potential and approach to solving shortcut problems. Empirical studies demonstrate the rationality of the analysis. 
Moreover, the proposal is suitable for many ABL and NeSy algorithms and can be easily extended to handle other cases of reasoning shortcuts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiao-Wen Yang;Wen-Da Wei;Jie-Jing Shao;Yu-Feng Li;Zhi-Hua Zhou", "authorids": "~Xiao-Wen_Yang4;~Wen-Da_Wei1;~Jie-Jing_Shao1;~Yu-Feng_Li1;~Zhi-Hua_Zhou2", "gender": "M;;M;M;M", "homepage": "http://www.lamda.nju.edu.cn/shaojj/;https://ieeexplore.ieee.org/author/37089937205;https://www.lamda.nju.edu.cn/yangxw;https://cs.nju.edu.cn/liyf/index.htm;https://cs.nju.edu.cn/zhouzh/", "dblp": "299/4982;;165/2864.html;57/413;z/ZhiHuaZhou", "google_scholar": "k1tEDpQAAAAJ;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=rSVIHasAAAAJ", "orcid": "0000-0001-8107-114X;;0009-0007-4206-6242;0000-0002-2220-5248;0000-0003-0746-1494", "linkedin": ";;;;", "or_profile": "~Jie-Jing_Shao1;~wenda_wei2;~Xiao-wen_Yang3;~Yu-feng_Li2;~Zhi-hua_Zhou1", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024analysis,\ntitle={Analysis for Abductive Learning and Neural-Symbolic Reasoning Shortcuts},\nauthor={Xiao-Wen Yang and Wen-Da Wei and Jie-Jing Shao and Yu-Feng Li and Zhi-Hua Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AQYabSOfci}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4265200, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10812601655559283710&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "COALA: A Practical and Vision-Centric Federated Learning Platform", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34768", "id": "ATRnM8PyQX", "proceeding": "https://proceedings.mlr.press/v235/zhuang24c.html", "pdf": "https://openreview.net/pdf?id=ATRnM8PyQX", "openreview": "https://openreview.net/forum?id=ATRnM8PyQX", "author_site": "Weiming Zhuang, Jian Xu, Chen Chen, Jingtao Li, Lingjuan Lyu", "tldr": "", "abstract": "We present COALA, a vision-centric Federated Learning (FL) platform, and a suite of benchmarks for practical FL scenarios, which we categorize as task, data, and model levels. At the task level, COALA extends support from simple classification to 15 computer vision tasks, including object detection, segmentation, pose estimation, and more. It also facilitates federated multiple-task learning, allowing clients to train on multiple tasks simultaneously. At the data level, COALA goes beyond supervised FL to benchmark both semi-supervised FL and unsupervised FL. It also benchmarks feature distribution shifts other than commonly considered label distribution shifts. 
In addition to dealing with static data, it supports federated continual learning for continuously changing data in real-world scenarios. At the model level, COALA benchmarks FL with split models and different models in different clients. The COALA platform offers three degrees of customization for these practical FL scenarios, including configuration customization, component customization, and workflow customization. We conduct systematic benchmarking experiments for the practical FL scenarios and highlight potential opportunities for further advancements in FL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiming Zhuang;Jian Xu;Chen Chen;Jingtao Li;Lingjuan Lyu", "authorids": "~Weiming_Zhuang1;~Jian_Xu7;~Chen_Chen20;~Jingtao_Li1;~Lingjuan_Lyu1", "gender": ";M;M;M;F", "homepage": "https://weiming.me/;;https://cc233.github.io/;https://zlijingtao.github.io;https://sites.google.com/view/lingjuan-lyu", "dblp": "274/0724;73/1149-16;65/4423-43;;178/9876", "google_scholar": "lLuLAzEAAAAJ;5kjbGosAAAAJ;;JIBdJbAAAAAJ;", "orcid": ";0000-0001-6201-9215;0000-0001-7359-8515;0000-0003-4250-869X;", "linkedin": ";;;;", "or_profile": "~Weiming_Zhuang1;~Jian_Xu7;~Chen_Chen20;~Jingtao_Li1;~Lingjuan_Lyu1", "aff": "Sony Research;Tsinghua University;Sony AI;Sony AI;Sony", "aff_domain": "sony.com;mails.tsinghua.edu.cn;sony.com;sony.com;sony.com", "position": "Researcher;PhD student;Researcher;Researcher;scientist", "bibtex": "@inproceedings{\nzhuang2024coala,\ntitle={{COALA}: A Practical and Vision-Centric Federated Learning Platform},\nauthor={Weiming Zhuang and Jian Xu and Chen Chen and Jingtao Li and Lingjuan Lyu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ATRnM8PyQX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1269721, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6279380364060815894&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sony.com;mails.tsinghua.edu.cn;sony.com;sony.com;sony.com", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Sony;Tsinghua University;Sony Corporation", "aff_unique_dep": "Research;;", "aff_unique_url": "https://www.sony.com;https://www.tsinghua.edu.cn;https://www.sony.com", "aff_unique_abbr": "Sony;THU;Sony", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Japan;China" }, { "title": "Generative Enzyme Design Guided by Functionally Important Sites and Small-Molecule Substrates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34767", "id": "ATvN9JnqZ8", "proceeding": "https://proceedings.mlr.press/v235/song24k.html", "pdf": "https://openreview.net/pdf?id=ATvN9JnqZ8", "openreview": "https://openreview.net/forum?id=ATvN9JnqZ8", "author_site": "Zhenqiao Song, Yunlong Zhao, Wenxian Shi, Wengong Jin, Yang Yang, Lei Li", "tldr": "", "abstract": "Enzymes are genetically encoded biocatalysts capable of accelerating chemical reactions. How can we automatically design functional enzymes? In this paper, we propose EnzyGen, an approach to learn a unified model to design enzymes across all functional families. Our key idea is to generate an enzyme's amino acid sequence and their three-dimensional (3D) coordinates based on functionally important sites and substrates corresponding to a desired catalytic function.
These sites are automatically mined from enzyme databases. EnzyGen consists of a novel interleaving network of attention and neighborhood equivariant layers, which captures both long-range correlation in an entire protein sequence and local influence from nearest amino acids in 3D space. To learn the generative model, we devise a joint training objective, including a sequence generation loss, a position prediction loss and an enzyme-substrate interaction loss. We further construct EnzyBench, a dataset with 3157 enzyme families, covering all available enzymes within the protein data bank (PDB). Experimental results show that our EnzyGen consistently achieves the best performance across all 323 testing families, surpassing the best baseline by 10.79% in terms of substrate binding affinity. These findings demonstrate EnzyGen's superior capability in designing well-folded and effective enzymes binding to specific substrates with high affinities. Our code, model and dataset are provided at https://github.com/LeiLiLab/EnzyGen.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenqiao Song;Yunlong Zhao;Wenxian Shi;Wengong Jin;Yang Yang;Lei Li", "authorids": "~Zhenqiao_Song1;~Yunlong_Zhao3;~Wenxian_Shi1;~Wengong_Jin1;~Yang_Yang55;~Lei_Li11", "gender": "F;M;;;Not Specified;M", "homepage": "https://jocelynsong.github.io/;;;http://people.csail.mit.edu/wengong;https://yang.chem.ucsb.edu/;https://www.cs.cmu.edu/~leili", "dblp": "227/7889;;;173/6620;;13/7007-5.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;;IE5D8_QAAAAJ;https://scholar.google.com/citations?hl=en;BYXqAlwAAAAJ", "orcid": ";0009-0009-0330-6885;;;0000-0002-4956-2034;0000-0003-3095-9776", "linkedin": ";yunlong-zhao-5679991a0/;;;;", "or_profile": "~Zhenqiao_Song1;~Yunlong_Zhao3;~Wenxian_Shi1;~Wengong_Jin1;~Yang_Yang55;~Lei_Li11", "aff": "Carnegie Mellon University;Massachusetts Institute of Technology;;Broad Institute;University of California, Santa Barbara;School of Computer Science, Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;mit.edu;;broadinstitute.org;ucsb.edu;cs.cmu.edu", "position": "PhD student;Researcher;;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nsong2024generative,\ntitle={Generative Enzyme Design Guided by Functionally Important Sites and Small-Molecule Substrates},\nauthor={Zhenqiao Song and Yunlong Zhao and Wenxian Shi and Wengong Jin and Yang Yang and Lei Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ATvN9JnqZ8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3443185, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9502996672396544290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "andrew.cmu.edu;mit.edu;;broadinstitute.org;ucsb.edu;cs.cmu.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Carnegie Mellon University;Massachusetts Institute of Technology;Broad Institute;University of California, Santa Barbara", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cmu.edu;https://web.mit.edu;https://www.broadinstitute.org;https://www.ucsb.edu", "aff_unique_abbr": "CMU;MIT;Broad;UCSB", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Santa Barbara;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Experts Don't Cheat: Learning 
What You Don't Know By Predicting Pairs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34766", "id": "AVEc9LvSlO", "proceeding": "https://proceedings.mlr.press/v235/johnson24a.html", "pdf": "https://openreview.net/pdf?id=AVEc9LvSlO", "openreview": "https://openreview.net/forum?id=AVEc9LvSlO", "author_site": "Daniel D. Johnson, Daniel Tarlow, David Duvenaud, Chris Maddison", "tldr": "", "abstract": "Identifying how much a model $\\hat{p}\\_{Y|X}^{\\theta}$ knows about the stochastic real-world process $p\\_{Y|X}$ it was trained on is important to ensure it avoids producing incorrect or \"hallucinated\" answers or taking unsafe actions. But this is difficult for generative models because probabilistic predictions do not distinguish between per-response noise (aleatoric uncertainty) and lack of knowledge about the process (epistemic uncertainty), and existing epistemic uncertainty quantification techniques tend to be overconfident when the model underfits. We propose a general strategy for teaching a model to both approximate $p\\_{Y|X}$ and also estimate the remaining gaps between $\\hat{p}_{Y|X}^{\\theta}$ and $p\\_{Y|X}$: train it to predict *pairs* of independent responses drawn from the true conditional distribution, allow it to \"cheat\" by observing one response while predicting the other, then measure how much it cheats. Remarkably, we prove that being good at cheating (i.e. cheating whenever it improves your prediction) is equivalent to being *second-order calibrated*, a principled extension of ordinary calibration that allows us to construct provably-correct frequentist confidence intervals for $p\\_{Y|X}$ and detect incorrect responses with high probability. We demonstrate empirically that our approach accurately estimates how much models don't know across ambiguous image classification, (synthetic) language modeling, and partially-observable navigation tasks, outperforming existing techniques.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel D. Johnson;Daniel Tarlow;David Duvenaud;Chris J. Maddison", "authorids": "~Daniel_D._Johnson1;~Daniel_Tarlow1;~David_Duvenaud2;~Chris_J._Maddison1", "gender": "M;;M;", "homepage": "http://www.danieldjohnson.com;;https://www.cs.toronto.edu/~duvenaud/;", "dblp": "120/9868-1;;86/9380;", "google_scholar": "44R4pgMAAAAJ;;https://scholar.google.ca/citations?user=ZLpO3XQAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Daniel_D._Johnson1;~Daniel_Tarlow1;~David_Duvenaud2;~Chris_J._Maddison1", "aff": "Google;;Anthropic;", "aff_domain": "google.com;;anthropic.com;", "position": "Researcher;;Researcher;", "bibtex": "@inproceedings{\njohnson2024experts,\ntitle={Experts Don't Cheat: Learning What You Don't Know By Predicting Pairs},\nauthor={Daniel D. Johnson and Daniel Tarlow and David Duvenaud and Chris J. 
Maddison},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AVEc9LvSlO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8867508, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8230606856600414398&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;;anthropic.com;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Google;Anthropic", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.anthropic.com", "aff_unique_abbr": "Google;Anthropic", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Prior Mismatch and Adaptation in PnP-ADMM with a Nonconvex Convergence Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34765", "id": "AYWBRwsZ8z", "proceeding": "https://proceedings.mlr.press/v235/shoushtari24a.html", "pdf": "https://openreview.net/pdf?id=AYWBRwsZ8z", "openreview": "https://openreview.net/forum?id=AYWBRwsZ8z", "author_site": "Shirin Shoushtari, JIAMING LIU, Edward Chandler, Salman Asif, Ulugbek Kamilov", "tldr": "", "abstract": "Plug-and-Play (PnP) priors is a widely-used family of methods for solving imaging inverse problems by integrating physical measurement models with image priors specified using image denoisers. PnP methods have been shown to achieve state-of-the-art performance when the prior is obtained using powerful deep denoisers. Despite extensive work on PnP, the topic of distribution mismatch between the training and testing data has often been overlooked in the PnP literature. This paper presents a set of new theoretical and numerical results on the topic of prior distribution mismatch and domain adaptation for the alternating direction method of multipliers (ADMM) variant of PnP. Our theoretical result provides an explicit error bound for PnP-ADMM due to the mismatch between the desired denoiser and the one used for inference. Our analysis contributes to the work in the area by considering the mismatch under nonconvex data-fidelity terms and expansive denoisers. Our first set of numerical results quantifies the impact of the prior distribution mismatch on the performance of PnP-ADMM on the problem of image super-resolution. Our second set of numerical results considers a simple and effective domain adaption strategy that closes the performance gap due to the use of mismatched denoisers. Our results suggest the relative robustness of PnP-ADMM to prior distribution mismatch, while also showing that the performance gap can be significantly reduced with only a few training samples from the desired distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shirin Shoushtari;Jiaming Liu;Edward P. Chandler;M. Salman Asif;Ulugbek S. 
Kamilov", "authorids": "~Shirin_Shoushtari1;~Jiaming_Liu3;~Edward_P._Chandler1;~M._Salman_Asif1;~Ulugbek_S._Kamilov1", "gender": "F;M;M;M;Not Specified", "homepage": ";https://jiamingliu-jeremy.github.io/;;https://www.ece.ucr.edu/~sasif;https://ukmlv.github.io", "dblp": "321/1728;33/5934-1;;21/1910;73/9223", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;KEucBooAAAAJ;;Dl0puDcAAAAJ;https://scholar.google.com.tw/citations?user=3qYUSDwAAAAJ", "orcid": ";0000-0002-1042-4443;0009-0006-2650-1083;0000-0001-5993-3903;0000-0001-6770-3278", "linkedin": ";;edward-chandler-93279b1a3/;;", "or_profile": "~Shirin_Shoushtari1;~Jiaming_Liu3;~Edward_Pearson_Chandler1;~Salman_Asif1;~Ulugbek_Kamilov1", "aff": "Washington University, Saint Louis;Washington University, St. Louis;Washington University in St. Louis;University of California, Riverside;Google", "aff_domain": "wustl.edu;wustl.edu;wustl.edu;ucr.edu;google.com", "position": "PhD student;PhD student;PhD student;Associate Professor;Visiting Faculty Researcher", "bibtex": "@inproceedings{\nshoushtari2024prior,\ntitle={Prior Mismatch and Adaptation in PnP-{ADMM} with a Nonconvex Convergence Analysis},\nauthor={Shirin Shoushtari and Jiaming Liu and Edward P. Chandler and M. Salman Asif and Ulugbek S. Kamilov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AYWBRwsZ8z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8153285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=808533261507513277&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "wustl.edu;wustl.edu;wustl.edu;ucr.edu;google.com", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Washington University in St. Louis;University of California, Riverside;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://wustl.edu;https://www.ucr.edu;https://www.google.com", "aff_unique_abbr": "WUSTL;UCR;Google", "aff_campus_unique_index": "0;1;1;2;3", "aff_campus_unique": "Saint Louis;St. Louis;Riverside;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "X-Oscar: A Progressive Framework for High-quality Text-guided 3D Animatable Avatar Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34764", "id": "AYbXN9poJl", "proceeding": "https://proceedings.mlr.press/v235/ma24g.html", "pdf": "https://openreview.net/pdf?id=AYbXN9poJl", "openreview": "https://openreview.net/forum?id=AYbXN9poJl", "author_site": "Yiwei Ma, Zhekai Lin, Jiayi Ji, Yijun Fan, Xiaoshuai Sun, Rongrong Ji", "tldr": "", "abstract": "Recent advancements in automatic 3D avatar generation guided by text have made significant progress. However, existing methods have limitations such as oversaturation and low-quality output. To address these challenges, we propose X-Oscar, a progressive framework for generating high-quality animatable avatars from text prompts. It follows a sequential \"Geometry\u2192Texture\u2192Animation\" paradigm, simplifying optimization through step-by-step generation. To tackle oversaturation, we introduce Adaptive Variational Parameter (AVP), representing avatars as an adaptive distribution during training. 
Additionally, we present Avatar-aware Score Distillation Sampling (ASDS), a novel technique that incorporates avatar-aware noise into rendered images for improved generation quality during optimization. Extensive evaluations confirm the superiority of X-Oscar over existing text-to-3D and text-to-avatar approaches. Our anonymous project page: https://anonymous1440.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiwei Ma;Zhekai Lin;Jiayi Ji;Yijun Fan;Xiaoshuai Sun;Rongrong Ji", "authorids": "~Yiwei_Ma1;~Zhekai_Lin1;~Jiayi_Ji1;~Yijun_Fan2;~Xiaoshuai_Sun3;~Rongrong_Ji5", "gender": "M;M;M;F;M;M", "homepage": "https://xmu-xiaoma666.github.io/;https://github.com/LinZhekai;https://scholar.google.com/citations?user=xp_rICcAAAAJ&hl=zh-CN;https://github.com/zeroooooooow;https://sites.google.com/view/xssun;http://mac.xmu.edu.cn/rrji-en.html", "dblp": ";;250/9459;;26/5787.html;86/5681", "google_scholar": "KIDY5pUAAAAJ;;xp_rICcAAAAJ;;KPMK3B4AAAAJ;", "orcid": "0000-0002-8744-3423;;0000-0002-9956-6308;;0000-0003-3912-9306;", "linkedin": ";;;;;", "or_profile": "~Yiwei_Ma1;~Zhekai_Lin1;~Jiayi_Ji1;~Yijun_Fan2;~Xiaoshuai_Sun3;~Rongrong_Ji5", "aff": "Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "position": "PhD student;MS student;Postdoc;MS student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nma2024xoscar,\ntitle={X-Oscar: A Progressive Framework for High-quality Text-guided 3D Animatable Avatar Generation},\nauthor={Yiwei Ma and Zhekai Lin and Jiayi Ji and Yijun Fan and Xiaoshuai Sun and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AYbXN9poJl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5259350, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=536249237837178909&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Xiamen University", "aff_unique_dep": "", "aff_unique_url": "https://www.xmu.edu.cn", "aff_unique_abbr": "XMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Robustly Learning Single-Index Models via Alignment Sharpness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34763", "id": "AZ1tWCa9j3", "proceeding": "https://proceedings.mlr.press/v235/zarifis24a.html", "pdf": "https://openreview.net/pdf?id=AZ1tWCa9j3", "openreview": "https://openreview.net/forum?id=AZ1tWCa9j3", "author_site": "Nikos Zarifis, Puqian Wang, Ilias Diakonikolas, Jelena Diakonikolas", "tldr": "", "abstract": "We study the problem of learning Single-Index Models under the $L_2^2$ loss in the agnostic model. We give an efficient learning algorithm, achieving a constant factor approximation to the optimal loss, that succeeds under a range of distributions (including log-concave distributions) and a broad class of monotone and Lipschitz link functions. This is the first efficient constant factor approximate agnostic learner, even for Gaussian data and for any nontrivial class of link functions. 
Prior work for the case of unknown link function either works in the realizable setting or does not attain constant factor approximation. The main technical ingredient enabling our algorithm and analysis is a novel notion of a local error bound in optimization that we term *alignment sharpness* and that may be of broader interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikos Zarifis;Puqian Wang;Ilias Diakonikolas;Jelena Diakonikolas", "authorids": "~Nikos_Zarifis1;~Puqian_Wang1;~Ilias_Diakonikolas1;~Jelena_Diakonikolas2", "gender": ";;M;F", "homepage": ";;http://www.iliasdiakonikolas.org/;http://www.jelena-diakonikolas.com/", "dblp": ";;d/IliasDiakonikolas;147/5178", "google_scholar": ";;Vb3FLmkAAAAJ;J8ixfu8AAAAJ", "orcid": ";;;0000-0003-3439-0310", "linkedin": ";;;", "or_profile": "~Nikos_Zarifis1;~Puqian_Wang1;~Ilias_Diakonikolas1;~Jelena_Diakonikolas2", "aff": ";;University of Wisconsin - Madison;University of Wisconsin, Madison", "aff_domain": ";;wisc.edu;wisc.edu", "position": ";;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzarifis2024robustly,\ntitle={Robustly Learning Single-Index Models via Alignment Sharpness},\nauthor={Nikos Zarifis and Puqian Wang and Ilias Diakonikolas and Jelena Diakonikolas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AZ1tWCa9j3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 762805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4297721673510214196&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";;wisc.edu;wisc.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Revisiting Character-level Adversarial Attacks for Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34762", "id": "AZWqXfM6z9", "proceeding": "https://proceedings.mlr.press/v235/abad-rocamora24a.html", "pdf": "https://openreview.net/pdf?id=AZWqXfM6z9", "openreview": "https://openreview.net/forum?id=AZWqXfM6z9", "author_site": "Elias Abad Rocamora, Yongtao Wu, Fanghui Liu, Grigorios Chrysos, Volkan Cevher", "tldr": "", "abstract": "Adversarial attacks in Natural Language Processing apply perturbations at the character or token level. Token-level attacks, gaining prominence for their use of gradient-based methods, are susceptible to altering sentence semantics, leading to invalid adversarial examples. While character-level attacks easily maintain semantics, they have received less attention as they cannot easily adopt popular gradient-based methods, and are thought to be easy to defend. Challenging these beliefs, we introduce Charmer, an efficient query-based adversarial attack capable of achieving high attack success rate (ASR) while generating highly similar adversarial examples. Our method successfully targets both small (BERT) and large (Llama 2) models. Specifically, on BERT with SST-2, Charmer improves the ASR by $4.84$% points and the USE similarity by $8$% points with respect to the previous art. 
Our implementation is available at https://github.com/LIONS-EPFL/Charmer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elias Abad Rocamora;Yongtao Wu;Fanghui Liu;Grigorios Chrysos;Volkan Cevher", "authorids": "~Elias_Abad_Rocamora1;~Yongtao_Wu1;~Fanghui_Liu1;~Grigorios_Chrysos1;~Volkan_Cevher1", "gender": "M;M;M;M;M", "homepage": "https://megaelius.github.io/;https://www.epfl.ch/labs/lions/people/phds/yongtao-wu/;http://www.lfhsgre.org;https://grigorisg9gr.github.io/;http://lions.epfl.ch", "dblp": "329/4351;322/3726;119/1038;75/6117-2;70/5301", "google_scholar": "lHfp1OAAAAAJ;rLgDE9AAAAAJ;AKxBgssAAAAJ;1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;0000-0003-4133-7921;;", "linkedin": "el%C3%ADas-abad-rocamora-8587261b8/?originalSubdomain=es;;;;", "or_profile": "~Elias_Abad_Rocamora1;~Yongtao_Wu1;~Fanghui_Liu1;~Grigorios_Chrysos1;~Volkan_Cevher1", "aff": "EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;University of Warwick;University of Wisconsin - Madison;Amazon Development Center Germany", "aff_domain": "epfl.ch;epfl.ch;warwick.ac.uk;wisc.edu;amazon.de", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor;Amazon Scholar", "bibtex": "@inproceedings{\nrocamora2024revisiting,\ntitle={Revisiting Character-level Adversarial Attacks for Language Models},\nauthor={Elias Abad Rocamora and Yongtao Wu and Fanghui Liu and Grigorios Chrysos and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AZWqXfM6z9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6982856, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18376813133206936603&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9, "email": "epfl.ch;epfl.ch;warwick.ac.uk;wisc.edu;amazon.de", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne;University of Warwick;University of Wisconsin-Madison;Amazon", "aff_unique_dep": ";;;;Development Center", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.warwick.ac.uk;https://www.wisc.edu;https://www.amazon.de", "aff_unique_abbr": "EPFL;EPFL;Warwick;UW-Madison;Amazon", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Lausanne;;Madison", "aff_country_unique_index": "0;0;1;2;3", "aff_country_unique": "Switzerland;United Kingdom;United States;Germany" }, { "title": "PcLast: Discovering Plannable Continuous Latent States", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34761", "id": "AaTYLZQPyC", "proceeding": "https://proceedings.mlr.press/v235/koul24a.html", "pdf": "https://openreview.net/pdf?id=AaTYLZQPyC", "openreview": "https://openreview.net/forum?id=AaTYLZQPyC", "author_site": "ANURAG KOUL, Shivakanth Sujit, Shaoru Chen, Benjamin Evans, Lili Wu, Byron Xu, Rajan Chari, Riashat Islam, Raihan Seraj, Yonathan Efroni, Lekan Molu, Miroslav Dudik, John Langford, Alex Lamb", "tldr": "", "abstract": "Goal-conditioned planning benefits from learned low-dimensional representations of rich observations. While compact latent representations typically learned from variational autoencoders or inverse dynamics enable goal-conditioned decision making, they ignore state reachability, hampering their performance. 
In this paper, we learn a representation that associates reachable states together for effective planning and goal-conditioned policy learning. We first learn a latent representation with multi-step inverse dynamics (to remove distracting information), and then transform this representation to associate reachable states together in $\\ell_2$ space. Our proposals are rigorously tested in various simulation testbeds. Numerical results in reward-based settings show significant improvements in sampling efficiency. Further, in reward-free settings this approach yields layered state abstractions that enable computationally efficient hierarchical planning for reaching ad hoc goals with zero additional samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anurag Koul;Shivakanth Sujit;Shaoru Chen;Ben Evans;Lili Wu;Byron Xu;Rajan Chari;Riashat Islam;Raihan Seraj;Yonathan Efroni;Lekan P Molu;Miroslav Dud\u00edk;John Langford;Alex Lamb", "authorids": "~Anurag_Koul1;~Shivakanth_Sujit1;~Shaoru_Chen1;~Ben_Evans1;~Lili_Wu1;~Byron_Xu1;~Rajan_Chari1;~Riashat_Islam1;~Raihan_Seraj1;~Yonathan_Efroni2;~Lekan_P_Molu1;~Miroslav_Dud\u00edk1;~John_Langford1;~Alex_Lamb1", "gender": "M;M;;;M;M;M;M;;M;M;M;M;", "homepage": "http://koulanurag.github.io/;https://www.shaoru.site/;;https://github.com/byronxu99;;https://riashat.github.io/;http://raihan-seraj.github.io/;https://sites.google.com/view/yonathan-efroni/;;http://hunch.net/~jl;;https://bennevans.github.io/;https://scriptedonachip.com;https://shivakanthsujit.github.io/", "dblp": "209/9666;254/9531;91/1716;;;198/0459;;215/3475;30/2146;77/4488;;87/9175;164/5974.html;320/2346", "google_scholar": "K-Q0Xq4AAAAJ;PUIfJYcAAAAJ;x8fnPxAAAAAJ;;;https://scholar.google.ca/citations?user=2_4Rs44AAAAJ;https://scholar.google.ca/citations?user=gtWzuL0AAAAJ;pfTInEgAAAAJ;wYMTld8AAAAJ;LFiqVpwAAAAJ;https://scholar.google.ca/citations?user=BFzFy1YAAAAJ;JPQom2sAAAAJ;JeUaqqEAAAAJ;https://scholar.google.ca/citations?user=oXpK8V8AAAAJ", "orcid": ";;;;;;;;;;;;;0000-0002-1744-0841", "linkedin": "koulanurag/;;lili-wu-71456674;;rajanchari/;;http://linkedin.com/in/raihan-seraj/;;;;;bnevans/;awesome-lekan/;shivakanthsujit/", "or_profile": "~Anurag_Koul1;~Shaoru_Chen1;~Lili_Wu1;~Byron_Xu1;~Rajan_Chari1;~Riashat_Islam1;~Raihan_Seraj1;~Yonathan_Efroni2;~Miroslav_Dud\u00edk1;~John_Langford1;~Alex_Matthew_Lamb1;~Benjamin_Evans1;~Olalekan_Ogunmolu1;~Shiva_Kanth_Sujit1", "aff": "Microsoft;Microsoft Research;Microsoft Research NYC;Microsoft;;Saudi Data and AI Authority, Saudi Data and AI Authority;McGill University;Meta;Microsoft;Microsoft;;New York University;Brandeis University;Araya Inc", "aff_domain": "microsoft.com;microsoft.com;microsoft.com;microsoft.com;;sdaia.gov.sa;mcgill.ca;meta.com;microsoft.com;microsoft.com;;nyu.edu;brandeis.edu;araya.org", "position": "Postdoc;Postdoc;Data and applied scientist;Researcher;;Researcher;PhD student;Researcher;Full Professor;Researcher;;PhD student;Instructor;Researcher", "bibtex": "@inproceedings{\nkoul2024pclast,\ntitle={PcLast: Discovering Plannable Continuous Latent States},\nauthor={Anurag Koul and Shivakanth Sujit and Shaoru Chen and Ben Evans and Lili Wu and Byron Xu and Rajan Chari and Riashat Islam and Raihan Seraj and Yonathan Efroni and Lekan P Molu and Miroslav Dud{\\'\\i}k and John Langford and Alex Lamb},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AaTYLZQPyC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9400520, 
"rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 14, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15962485069591355743&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "microsoft.com;microsoft.com;microsoft.com;microsoft.com;;sdaia.gov.sa;mcgill.ca;meta.com;microsoft.com;microsoft.com;;nyu.edu;brandeis.edu;araya.org", "author_num": 14, "aff_unique_index": "0;0;0;0;1;2;3;0;0;4;5;6", "aff_unique_norm": "Microsoft;Saudi Data and AI Authority;McGill University;Meta;New York University;Brandeis University;Araya Inc", "aff_unique_dep": "Microsoft Corporation;;;Meta Platforms, Inc.;;;", "aff_unique_url": "https://www.microsoft.com;https://sdaia.gov.sa;https://www.mcgill.ca;https://meta.com;https://www.nyu.edu;https://www.brandeis.edu;", "aff_unique_abbr": "Microsoft;SDAIA;McGill;Meta;NYU;Brandeis;", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0;0;0;0;1;2;0;0;0;0;0", "aff_country_unique": "United States;Saudi Arabia;Canada;" }, { "title": "D\u00e9j\u00e0Vu: KV-cache Streaming for Fast, Fault-tolerant Generative LLM Serving", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34760", "id": "AbGbGZFYOD", "proceeding": "https://proceedings.mlr.press/v235/strati24a.html", "pdf": "https://openreview.net/pdf?id=AbGbGZFYOD", "openreview": "https://openreview.net/forum?id=AbGbGZFYOD", "author_site": "Foteini Strati, Sara McAllister, Amar Phanishayee, Jakub Tarnawski, Ana Klimovic", "tldr": "", "abstract": "Distributed LLM serving is costly and often underutilizes hardware accelerators due to three key challenges: bubbles in pipeline-parallel deployments caused by the bimodal latency of prompt and token processing, GPU memory overprovisioning, and long recovery times in case of failures. D\u00e9j\u00e0Vu addresses all these challenges using a versatile and efficient KV cache streaming library (D\u00e9j\u00e0VuLib). Using D\u00e9j\u00e0VuLib, we propose and implement efficient prompt-token disaggregation to reduce pipeline bubbles, microbatch swapping for efficient GPU memory management, and state replication for fault-tolerance. We highlight the efficacy of these solutions on a range of large models across cloud deployments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Foteini Strati;Sara McAllister;Amar Phanishayee;Jakub Tarnawski;Ana Klimovic", "authorids": "~Foteini_Strati1;~Sara_McAllister1;~Amar_Phanishayee1;~Jakub_Tarnawski1;~Ana_Klimovic1", "gender": "F;;M;M;F", "homepage": "https://fotstrt.github.io/;https://saramcallister.github.io/;https://aka.ms/amar;http://jakub.tarnawski.org/;https://anakli.inf.ethz.ch/", "dblp": ";;14/877;157/6045;140/2247.html", "google_scholar": ";;;ddHxUHoAAAAJ;i7jievkAAAAJ", "orcid": ";;;0000-0001-6175-5827;0000-0001-8559-0529", "linkedin": ";;;jakubtarnawski/;anaklimovic", "or_profile": "~Foteini_Strati1;~Sara_McAllister1;~Amar_Phanishayee1;~Jakub_Tarnawski1;~Ana_Klimovic1", "aff": "ETHZ - ETH Zurich;Carnegie Mellon University;Microsoft;Microsoft;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;cmu.edu;microsoft.com;microsoft.com;ethz.ch", "position": "PhD student;PhD student;Sr. 
Principal Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nstrati2024djvu,\ntitle={D\\'ej\\`aVu: {KV}-cache Streaming for Fast, Fault-tolerant Generative {LLM} Serving},\nauthor={Foteini Strati and Sara McAllister and Amar Phanishayee and Jakub Tarnawski and Ana Klimovic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AbGbGZFYOD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2245741, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1559993767182833015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "email": "ethz.ch;cmu.edu;microsoft.com;microsoft.com;ethz.ch", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "ETH Zurich;Carnegie Mellon University;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.ethz.ch;https://www.cmu.edu;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;CMU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "Counterfactual Metarules for Local and Global Recourse", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34759", "id": "Ad9msn1SKC", "proceeding": "https://proceedings.mlr.press/v235/bewley24a.html", "pdf": "https://openreview.net/pdf?id=Ad9msn1SKC", "openreview": "https://openreview.net/forum?id=Ad9msn1SKC", "author_site": "Tom Bewley, Salim I. Amoukou, Saumitra Mishra, Daniele Magazzeni, Manuela Veloso", "tldr": "", "abstract": "We introduce **T-CREx**, a novel model-agnostic method for local and global counterfactual explanation (CE), which summarises recourse options for both individuals and groups in the form of generalised rules. It leverages tree-based surrogate models to learn the counterfactual rules, alongside *metarules* denoting their regimes of optimality, providing both a global analysis of model behaviour and diverse recourse options for users. Experiments indicate that **T-CREx** achieves superior aggregate performance over existing rule-based baselines on a range of CE desiderata, while being orders of magnitude faster to run.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tom Bewley;Salim I. Amoukou;Saumitra Mishra;Daniele Magazzeni;Manuela Veloso", "authorids": "~Tom_Bewley1;~Salim_I._Amoukou1;~Saumitra_Mishra1;~Daniele_Magazzeni1;~Manuela_Veloso1", "gender": ";M;M;M;F", "homepage": "http://tombewley.com;https://salimamoukou.github.io/;https://sites.google.com/site/saumitramishrac4dm/;https://nms.kcl.ac.uk/daniele.magazzeni/;https://www.cs.cmu.edu/~mmv/", "dblp": ";289/1335;208/1387;14/4672;v/ManuelaMVeloso", "google_scholar": "OqPzZ08AAAAJ;JrHnICMAAAAJ;https://scholar.google.co.uk/citations?user=On6E6ogAAAAJ;;https://scholar.google.com.tw/citations?user=2FbkAzYAAAAJ", "orcid": ";;;;", "linkedin": ";slim-amk/;;;", "or_profile": "~Tom_Bewley1;~Salim_I._Amoukou1;~Saumitra_Mishra1;~Daniele_Magazzeni1;~Manuela_Veloso1", "aff": "J.P. Morgan;J.P. Morgan Chase;J.P. 
Morgan Chase;;School of Computer Science, Carnegie Mellon University", "aff_domain": "jpmorgan.com;jpmorgan.com;jpmorgan.com;;cs.cmu.edu", "position": "Researcher;Researcher;Researcher;;Full Professor", "bibtex": "@inproceedings{\nbewley2024counterfactual,\ntitle={Counterfactual Metarules for Local and Global Recourse},\nauthor={Tom Bewley and Salim I. Amoukou and Saumitra Mishra and Daniele Magazzeni and Manuela Veloso},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ad9msn1SKC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8481853, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11936392776993796326&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "jpmorgan.com;jpmorgan.com;jpmorgan.com;;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "J.P. Morgan;JPMorgan Chase & Co.;Carnegie Mellon University", "aff_unique_dep": ";;School of Computer Science", "aff_unique_url": "https://www.jpmorganchase.com;https://www.jpmorganchase.com;https://www.cmu.edu", "aff_unique_abbr": "JPM;JPM;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deciphering RNA Secondary Structure Prediction: A Probabilistic K-Rook Matching Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34758", "id": "Ada9Z68nvb", "proceeding": "https://proceedings.mlr.press/v235/tan24a.html", "pdf": "https://openreview.net/pdf?id=Ada9Z68nvb", "openreview": "https://openreview.net/forum?id=Ada9Z68nvb", "author_site": "Cheng Tan, Zhangyang Gao, Hanqun CAO, Xingran Chen, Wang Ge, Lirong Wu, Jun Xia, Jiangbin Zheng, Stan Z Li", "tldr": "", "abstract": "The secondary structure of ribonucleic acid (RNA) is more stable and accessible in the cell than its tertiary structure, making it essential for functional prediction. Although deep learning has shown promising results in this field, current methods suffer from poor generalization and high complexity. In this work, we reformulate RNA secondary structure prediction as a K-Rook problem, thereby simplifying the prediction process into probabilistic matching within a finite solution space. Building on this innovative perspective, we introduce RFold, a simple yet effective method that learns to predict the most matching K-Rook solution from the given sequence. RFold employs a bi-dimensional optimization strategy that decomposes the probabilistic matching problem into row-wise and column-wise components to reduce the matching complexity, simplifying the solving process while guaranteeing the validity of the output. Extensive experiments demonstrate that RFold achieves competitive performance and about eight times faster inference than state-of-the-art approaches. The code is available at https://github.com/A4Bio/RFold.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cheng Tan;Zhangyang Gao;Hanqun CAO;Xingran Chen;Ge Wang;Lirong Wu;Jun Xia;Jiangbin Zheng;Stan Z. 
Li", "authorids": "~Cheng_Tan1;~Zhangyang_Gao1;~Hanqun_CAO1;~Xingran_Chen1;~Ge_Wang3;~Lirong_Wu1;~Jun_Xia1;~Jiangbin_Zheng3;~Stan_Z._Li2", "gender": "M;M;;M;;;M;M;", "homepage": "https://chengtan9907.github.io/;;;https://www.chenxingran.com/;;;http://junxia97.github.io/;;", "dblp": "70/1533-12.html;275/3266;;203/8349;34/5591;15/10330;;;", "google_scholar": "6kTV6aMAAAAJ;4SclT-QAAAAJ;;X01oTv8AAAAJ;https://scholar.google.com.hk/citations?user=t9GUEMoAAAAJ;Tk7TrCoAAAAJ;aPKKpSYAAAAJ;;", "orcid": ";0000-0003-1026-6083;;;0000-0001-8553-6493;;;0000-0003-3305-0103;", "linkedin": ";;;;;;;;", "or_profile": "~Cheng_Tan1;~Zhangyang_Gao1;~Hanqun_CAO1;~Xingran_Chen1;~Ge_Wang3;~Lirong_Wu1;~Jun_Xia1;~Jiangbin_Zheng3;~Stan_Z._Li2", "aff": "Zhejiang University & Westlake University;Westlake University, China;;University of Michigan - Ann Arbor;WESTLAKE UNIVERSITY;Westlake University;Westlake University, China;Westlake University;", "aff_domain": "westlake.edu.cn;westlake.edu.cn;;umich.edu;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;", "position": "PhD student;PhD student;;MS student;PhD student;PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\ntan2024deciphering,\ntitle={Deciphering {RNA} Secondary Structure Prediction: A Probabilistic K-Rook Matching Perspective},\nauthor={Cheng Tan and Zhangyang Gao and Hanqun CAO and Xingran Chen and Ge Wang and Lirong Wu and Jun Xia and Jiangbin Zheng and Stan Z. Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ada9Z68nvb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1985975, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14637694439861796304&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "westlake.edu.cn;westlake.edu.cn;;umich.edu;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;", "author_num": 9, "aff_unique_index": "0;1;2;1;1;1;1", "aff_unique_norm": "Zhejiang University;Westlake University;University of Michigan", "aff_unique_dep": ";;", "aff_unique_url": "http://www.zju.edu.cn;https://www.westlake.edu.cn;https://www.umich.edu", "aff_unique_abbr": "ZJU;WU;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Two-timescale Derivative Free Optimization for Performative Prediction with Markovian Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34757", "id": "Aj18fUB6Th", "proceeding": "https://proceedings.mlr.press/v235/liu24aj.html", "pdf": "https://openreview.net/pdf?id=Aj18fUB6Th", "openreview": "https://openreview.net/forum?id=Aj18fUB6Th", "author_site": "Haitong LIU, Qiang Li, Hoi To Wai", "tldr": "", "abstract": "This paper studies the performative prediction problem where a learner aims to minimize the expected loss with a decision-dependent data distribution. Such setting is motivated when outcomes can be affected by the prediction model, e.g., in strategic classification. We consider a state-dependent setting where the data distribution evolves according to an underlying controlled Markov chain. We focus on stochastic derivative free optimization (DFO) where the learner is given access to a loss function evaluation oracle with the above Markovian data. 
We propose a two-timescale DFO($\\lambda$) algorithm that features (i) a sample accumulation mechanism that utilizes every observed sample to estimate the overall gradient of performative risk, and (ii) a two-timescale diminishing step size that balances the rates of DFO updates and bias reduction. Under a general non-convex optimization setting, we show that DFO($\\lambda$) requires ${\\cal O}( 1 /\\epsilon^3)$ samples (up to a log factor) to attain a near-stationary solution with expected squared gradient norm less than $\\epsilon > 0$. Numerical experiments verify our analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haitong LIU;Qiang LI;Hoi To Wai", "authorids": "~Haitong_LIU1;~Qiang_LI7;~Hoi_To_Wai1", "gender": "M;M;M", "homepage": ";;http://www1.se.cuhk.edu.hk/~htwai/", "dblp": ";;29/9875", "google_scholar": ";NjVNiJ8AAAAJ;https://scholar.google.com.hk/citations?user=5-J7LeMAAAAJ", "orcid": ";0009-0006-1024-1344;", "linkedin": "haitong-liu-7a0698276/;;", "or_profile": "~Haitong_LIU1;~Qiang_LI7;~Hoi_To_Wai1", "aff": ";Chinese University of Hong Kong;The Chinese University of Hong Kong", "aff_domain": ";se.cuhk.edu.hk;cuhk.edu.hk", "position": ";PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024twotimescale,\ntitle={Two-timescale Derivative Free Optimization for Performative Prediction with Markovian Data},\nauthor={Haitong LIU and Qiang LI and Hoi To Wai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Aj18fUB6Th}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 924798, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=368747994511129964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";se.cuhk.edu.hk;cuhk.edu.hk", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Knowledge Graphs Can be Learned with Just Intersection Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34756", "id": "Al5GlVytqi", "proceeding": "https://proceedings.mlr.press/v235/le24c.html", "pdf": "https://openreview.net/pdf?id=Al5GlVytqi", "openreview": "https://openreview.net/forum?id=Al5GlVytqi", "author_site": "Duy Le, Shaochen (Henry) Zhong, Zirui Liu, Shuai Xu, Vipin Chaudhary, Kaixiong Zhou, Zhaozhuo Xu", "tldr": "", "abstract": "Knowledge Graphs (KGs) are potent frameworks for knowledge representation and reasoning. Nevertheless, KGs are inherently incomplete, leaving numerous uncharted relationships and facts awaiting discovery. Deep learning methodologies have proven effective in enhancing KG completion by framing it as a link prediction task, where the goal is to discern the validity of a triple comprising a head, relation, and tail. The significance of structural information in assessing the validity of a triple within a KG is well-established. However, quantifying this structural information poses a challenge. We need to pinpoint the metric that encapsulates the structural information of a triple and smoothly incorporate this metric into the link prediction learning process. 
In this study, we recognize the critical importance of the intersection among the $k$-hop neighborhoods of the head, relation, and tail when determining the validity of a triple. To address this, we introduce a novel randomized algorithm designed to efficiently generate intersection features for candidate triples. Our experimental results demonstrate that a straightforward fully-connected network leveraging these intersection features can surpass the performance of established KG embedding models and even outperform graph neural network baselines. Additionally, we highlight the substantial training time efficiency gains achieved by our network trained on intersection features.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Duy Le;Shaochen Zhong;Zirui Liu;Shuai Xu;Vipin Chaudhary;Kaixiong Zhou;Zhaozhuo Xu", "authorids": "dhl64@case.edu;~Shaochen_Zhong1;~Zirui_Liu1;~Shuai_Xu2;~Vipin_Chaudhary2;~Kaixiong_Zhou1;~Zhaozhuo_Xu1", "gender": ";M;M;M;M;M;M", "homepage": ";https://openreview.net/profile?id=~Shaochen_Zhong1;https://zirui-ray-liu.github.io/;https://engineering.case.edu/profiles/sxx214;https://engineering.case.edu/profiles/vxc204;https://kaixiong-zhou.github.io/;https://ottovonxu.github.io/", "dblp": ";326/7286.html;196/8629-1.html;;c/VipinChaudhary.html;178/7315;195/4352", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;wu-vtI4AAAAJ;vJbjqpIAAAAJ;zMspIjIAAAAJ;7tDlVAsAAAAJ", "orcid": ";;;;0000-0001-9672-6225;0000-0001-5226-8736;", "linkedin": ";shaochen-henry-zhong-96a941249/;;;vipin-chaudhary-379529/;;", "or_profile": "dhl64@case.edu;~Shaochen_Zhong1;~Zirui_Liu1;~Shuai_Xu2;~Vipin_Chaudhary2;~Kaixiong_Zhou1;~Zhaozhuo_Xu1", "aff": ";Rice University;Rice University;Case Western Reserve University;Case Western Reserve University;Massachusetts Institute of Technology;Stevens Institute of Technology", "aff_domain": ";rice.edu;rice.edu;case.edu;case.edu;mit.edu;stevens.edu", "position": ";PhD student;PhD student;Assistant Professor;Full Professor;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nle2024knowledge,\ntitle={Knowledge Graphs Can be Learned with Just Intersection Features},\nauthor={Duy Le and Shaochen Zhong and Zirui Liu and Shuai Xu and Vipin Chaudhary and Kaixiong Zhou and Zhaozhuo Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Al5GlVytqi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1587849, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10013979147552875285&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";rice.edu;rice.edu;case.edu;case.edu;mit.edu;stevens.edu", "author_num": 7, "aff_unique_index": "0;0;1;1;2;3", "aff_unique_norm": "Rice University;Case Western Reserve University;Massachusetts Institute of Technology;Stevens Institute of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.rice.edu;https://www.case.edu;https://web.mit.edu;https://www.stevens.edu", "aff_unique_abbr": "Rice;CWRU;MIT;SIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Consistent Submodular Maximization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34755", "id": "AlJkqMnyjL", "proceeding": 
"https://proceedings.mlr.press/v235/duetting24a.html", "pdf": "https://openreview.net/pdf?id=AlJkqMnyjL", "openreview": "https://openreview.net/forum?id=AlJkqMnyjL", "author_site": "PAUL DUETTING, Federico Fusco, Silvio Lattanzi, Ashkan Norouzi-Fard, Morteza Zadimoghaddam", "tldr": "", "abstract": "Maximizing monotone submodular functions under cardinality constraints is a classic optimization task with several applications in data mining and machine learning. In this paper, we study this problem in a dynamic environment with consistency constraints: elements arrive in a streaming fashion, and the goal is maintaining a constant approximation to the optimal solution while having a stable solution (i.e., the number of changes between two consecutive solutions is bounded). In this setting, we provide algorithms with different trade-offs between consistency and approximation quality. We also complement our theoretical results with an experimental analysis showing the effectiveness of our algorithms in real-world instances.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paul Duetting;Federico Fusco;Silvio Lattanzi;Ashkan Norouzi-Fard;Morteza Zadimoghaddam", "authorids": "~Paul_Duetting1;~Federico_Fusco1;~Silvio_Lattanzi1;~Ashkan_Norouzi-Fard2;~Morteza_Zadimoghaddam1", "gender": ";M;M;;", "homepage": "http://paulduetting.com/;https://sites.google.com/uniroma1.it/federicofusco/home;https://sites.google.com/site/silviolattanzi/;;", "dblp": "https://dblp.org/pers/d/D=uuml=tting:Paul.html;243/5755;46/6611;https://dblp.org/pers/n/Norouzi=Fard:Ashkan;05/3431", "google_scholar": "Oqky1hIAAAAJ;https://scholar.google.co.il/citations?user=oaS8iAQAAAAJ;vxUZ4AUAAAAJ;-KdNGwgAAAAJ;", "orcid": ";0000-0001-6250-945X;;;", "linkedin": ";;;;", "or_profile": "~Paul_Duetting1;~Federico_Fusco1;~Silvio_Lattanzi1;~Ashkan_Norouzi-Fard2;~Morteza_Zadimoghaddam1", "aff": "Google;University of Roma \"La Sapienza\";Google;Google;", "aff_domain": "google.com;uniroma1.it;google.com;google.com;", "position": "Researcher;Lecturer;Researcher;Researcher;", "bibtex": "@inproceedings{\nduetting2024consistent,\ntitle={Consistent Submodular Maximization},\nauthor={Paul Duetting and Federico Fusco and Silvio Lattanzi and Ashkan Norouzi-Fard and Morteza Zadimoghaddam},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AlJkqMnyjL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 472226, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15166003732642785873&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "google.com;uniroma1.it;google.com;google.com;", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;University of Rome La Sapienza", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.uniroma1.it", "aff_unique_abbr": "Google;La Sapienza", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Mountain View;Rome", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Italy" }, { "title": "Why do Variational Autoencoders Really Promote Disentanglement?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34754", "id": "Ao9UUaScAU", "proceeding": "https://proceedings.mlr.press/v235/bhowal24a.html", "pdf": "https://openreview.net/pdf?id=Ao9UUaScAU", "openreview": 
"https://openreview.net/forum?id=Ao9UUaScAU", "author_site": "Pratik Bhowal, Achint Soni, Sirisha Rambhatla", "tldr": "", "abstract": "Despite not being designed for this purpose, the use of variational autoencoders (VAEs) has proven remarkably effective for disentangled representation learning (DRL). Recent research attributes this success to certain characteristics of the loss function that prevent latent space rotation, or hypothesize about the orthogonality properties of the decoder by drawing parallels with principal component analysis (PCA). This hypothesis, however, has only been tested experimentally for linear VAEs, and the theoretical justification still remains an open problem. Moreover, since real-world VAEs are often inherently non-linear due to the use of neural architectures, understanding DRL capabilities of real-world VAEs remains a critical task. Our work takes a step towards understanding disentanglement in real-world VAEs to theoretically establish how the orthogonality properties of the decoder promotes disentanglement in practical applications. Complementary to our theoretical contributions, our experimental results corroborate our analysis. Code is available at https://github.com/criticalml-uw/Disentanglement-in-VAE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pratik Bhowal;Achint Soni;Sirisha Rambhatla", "authorids": "~Pratik_Bhowal1;~Achint_Soni1;~Sirisha_Rambhatla1", "gender": "M;M;F", "homepage": ";https://www.trickyjustice.github.io;", "dblp": ";;123/4808.html", "google_scholar": "4O9R8isAAAAJ;https://scholar.google.ca/citations?user=onzZ8qUAAAAJ;EOSZeBMAAAAJ", "orcid": ";;", "linkedin": "pratik-bhowal-1066aa198/;achintsoni/;", "or_profile": "~Pratik_Bhowal1;~Achint_Soni1;~Sirisha_Rambhatla1", "aff": "NVIDIA;University of Waterloo;University of Waterloo", "aff_domain": "nvidia.com;uwaterloo.ca;uwaterloo.ca", "position": "System Software Developer;MS student;Assistant Professor", "bibtex": "@inproceedings{\nbhowal2024why,\ntitle={Why do Variational Autoencoders Really Promote Disentanglement?},\nauthor={Pratik Bhowal and Achint Soni and Sirisha Rambhatla},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ao9UUaScAU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1608513, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2162451837026575715&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "nvidia.com;uwaterloo.ca;uwaterloo.ca", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "NVIDIA;University of Waterloo", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://uwaterloo.ca", "aff_unique_abbr": "NVIDIA;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "FedBPT: Efficient Federated Black-box Prompt Tuning for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34753", "id": "AoYhtJ4A90", "proceeding": "https://proceedings.mlr.press/v235/sun24j.html", "pdf": "https://openreview.net/pdf?id=AoYhtJ4A90", "openreview": "https://openreview.net/forum?id=AoYhtJ4A90", "author_site": "Jingwei Sun, Ziyue Xu, Hongxu Yin, Dong Yang, Daguang Xu, Yudong Liu, Zhixu Du, Yiran Chen, Holger Roth", "tldr": "", "abstract": "Pre-trained 
language models (PLMs) have revolutionized the NLP landscape, achieving stellar performance across diverse tasks. These models, while benefiting from vast training data, often require fine-tuning on specific data to cater to distinct downstream tasks. However, this data adaptation process has inherent security and privacy concerns, primarily when leveraging user-generated, device-residing data. Federated learning (FL) provides a solution, allowing collaborative model fine-tuning without centralized data collection. However, applying FL to fine-tune PLMs is hampered by challenges, including restricted model parameter access due to high encapsulation, high computational requirements, and communication overheads. This paper introduces Federated Black-box Prompt Tuning (FedBPT), a framework designed to address these challenges. FedBPT allows the clients to treat the model as a black-box inference API. By focusing on training optimal prompts and utilizing gradient-free optimization methods, FedBPT reduces the number of exchanged variables, boosts communication efficiency, and minimizes computational and storage costs. Experiments highlight the framework's ability to drastically cut communication and memory costs while maintaining competitive performance. Ultimately, FedBPT presents a promising solution for efficient, privacy-preserving fine-tuning of PLMs in the age of large language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingwei Sun;Ziyue Xu;Hongxu Yin;Dong Yang;Daguang Xu;Yudong Liu;Zhixu Du;Yiran Chen;Holger R Roth", "authorids": "~Jingwei_Sun2;~Ziyue_Xu1;~Hongxu_Yin2;~Dong_Yang1;~Daguang_Xu2;~Yudong_Liu4;~Zhixu_Du1;~Yiran_Chen1;~Holger_R_Roth1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": ";;;https://daguangxu.net/;;https://doesnothave.com;https://ece.duke.edu/people/yiran-chen/;http://www.holgerroth.com;https://hongxu-yin.github.io/", "dblp": "66/7761-2;59/9160-1.html;33/412-5;;;;80/1641;42/8528;166/3425", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;gmUta74AAAAJ;PHvliUgAAAAJ;r_VHYHAAAAAJ;https://scholar.google.com/citations?hl=en;UpxfrVMAAAAJ;;https://scholar.google.co.jp/citations?user=pzNwAsEAAAAJ;4gdSoOYAAAAJ", "orcid": ";0000-0002-5728-6869;0000-0002-5031-4337;;0009-0006-3455-9939;;0000-0002-1486-8412;0000-0002-3662-8743;", "linkedin": ";xu-ziyue-89143515/;dong-yang-thu/;;yudong-liu-574966311;;;;", "or_profile": "~Jingwei_Sun2;~Ziyue_Xu1;~Dong_Yang1;~Daguang_Xu2;~Yudong_Liu4;~Zhixu_Du1;~Yiran_Chen1;~Holger_R_Roth1;~Hongxu_Yin1", "aff": "Duke University;NVIDIA;NVIDIA;NVIDIA;Duke University;Duke University, Duke University;Duke University;NVIDIA;NVIDIA", "aff_domain": "duke.edu;nvidia.com;nvidia.com;nvidia.com;duke.edu;ece.duke.edu;duke.edu;nvidia.com;nvidia.com", "position": "PhD student;Senior Scientist;Research Scientist;Research Manager;PhD student;PhD student;Professor;Principal Researcher;Senior Research Scientist", "bibtex": "@inproceedings{\nsun2024fedbpt,\ntitle={Fed{BPT}: Efficient Federated Black-box Prompt Tuning for Large Language Models},\nauthor={Jingwei Sun and Ziyue Xu and Hongxu Yin and Dong Yang and Daguang Xu and Yudong Liu and Zhixu Du and Yiran Chen and Holger R Roth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AoYhtJ4A90}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6477560, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 34, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9706697347838040158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "duke.edu;nvidia.com;nvidia.com;nvidia.com;duke.edu;ece.duke.edu;duke.edu;nvidia.com;nvidia.com", "author_num": 9, "aff_unique_index": "0;1;1;1;0;0;0;1;1", "aff_unique_norm": "Duke University;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.duke.edu;https://www.nvidia.com", "aff_unique_abbr": "Duke;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sequential Disentanglement by Extracting Static Information From A Single Sequence Element", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34752", "id": "AocOA4h3bu", "proceeding": "https://proceedings.mlr.press/v235/berman24a.html", "pdf": "https://openreview.net/pdf?id=AocOA4h3bu", "openreview": "https://openreview.net/forum?id=AocOA4h3bu", "author_site": "Nimrod Berman, Ilan Naiman, Idan Arbiv, Gal Fadlon, Omri Azencot", "tldr": "", "abstract": "One of the fundamental representation learning tasks is unsupervised sequential disentanglement, where latent codes of inputs are decomposed into a single static factor and a sequence of dynamic factors. To extract this latent information, existing methods condition the static and dynamic codes on the entire input sequence. Unfortunately, these models often suffer from information leakage, i.e., the dynamic vectors encode both static and dynamic information, or vice versa, leading to a non-disentangled representation. Attempts to alleviate this problem by reducing the dynamic dimension and adding auxiliary loss terms achieve only partial success. Instead, we propose a novel and simple architecture that mitigates information leakage by offering a simple and effective subtraction inductive bias while conditioning on a single sample. Remarkably, the resulting variational framework is simpler in terms of required loss terms, hyper-parameters, and data augmentation. 
We evaluate our method on multiple data-modality benchmarks including general time series, video, and audio, and we show results that surpass the state of the art on generation and prediction tasks in comparison to several strong baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nimrod Berman;Ilan Naiman;Idan Arbiv;Gal Fadlon;Omri Azencot", "authorids": "~Nimrod_Berman1;~Ilan_Naiman1;~Idan_Arbiv1;~Gal_Fadlon1;~Omri_Azencot1", "gender": "M;M;M;M;Unspecified", "homepage": ";https://www.linkedin.com/in/ilan-naiman-80071a190;;;http://omriazencot.com", "dblp": ";285/4824;;;132/3985.html", "google_scholar": ";Fglytk8AAAAJ;;;https://scholar.google.co.il/citations?user=MEGuRmAAAAAJ", "orcid": ";;;;", "linkedin": "nimrod-berman-a26250143/;ilan-naiman-80071a190;idan-arbiv/;gal-fadlon-89478a214/;omri-azencot-a8812417/", "or_profile": "~Nimrod_Berman1;~Ilan_Naiman1;~Idan_Arbiv1;~Gal_Fadlon1;~Omri_Azencot1", "aff": ";Ben Gurion University of the Negev, Technion;Ben-Gurion University of the Negev;Ben-Gurion University of the Negev;Ben-Gurion University of the Negev", "aff_domain": ";bgu.ac.il;bgu.ac.il;bgu.ac.il;bgu.ac.il", "position": ";PhD student;MS student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nberman2024sequential,\ntitle={Sequential Disentanglement by Extracting Static Information From A Single Sequence Element},\nauthor={Nimrod Berman and Ilan Naiman and Idan Arbiv and Gal Fadlon and Omri Azencot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AocOA4h3bu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4626678, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10947832926192701607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";bgu.ac.il;bgu.ac.il;bgu.ac.il;bgu.ac.il", "author_num": 5, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Ben Gurion University of the Negev;Ben-Gurion University of the Negev", "aff_unique_dep": ";", "aff_unique_url": "https://www.bgu.ac.il;https://www.bgu.ac.il", "aff_unique_abbr": "BGU;BGU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Weisfeiler Leman for Euclidean Equivariant Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34751", "id": "ApRKrKZJSk", "proceeding": "https://proceedings.mlr.press/v235/hordan24a.html", "pdf": "https://openreview.net/pdf?id=ApRKrKZJSk", "openreview": "https://openreview.net/forum?id=ApRKrKZJSk", "author_site": "Snir Hordan, Tal Amir, Nadav Dym", "tldr": "", "abstract": "The $k$-Weisfeiler-Leman ($k$-WL) graph isomorphism test hierarchy is a common method for assessing the expressive power of graph neural networks (GNNs). Recently, GNNs whose expressive power is equivalent to the $2$-WL test were proven to be universal on weighted graphs which encode $3\\mathrm{D}$ point cloud data, yet this result is limited to invariant continuous functions on point clouds. In this paper, we extend this result in three ways: Firstly, we show that PPGN can simulate $2$-WL uniformly on all point clouds with low complexity. Secondly, we show that $2$-WL tests can be extended to point clouds which include both positions and velocities, a scenario often encountered in applications. 
Finally, we provide a general framework for proving equivariant universality and leverage it to prove that a simple modification of this invariant PPGN architecture can be used to obtain a universal equivariant architecture that can approximate all continuous equivariant functions uniformly. Building on our results, we develop our WeLNet architecture, which sets new state-of-the-art results on the N-Body dynamics task and the GEOM-QM9 molecular conformation generation task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Snir Hordan;Tal Amir;Nadav Dym", "authorids": "~Snir_Hordan1;~Tal_Amir1;~Nadav_Dym1", "gender": "M;;M", "homepage": "https://snirhordan.github.io/;https://tal-amir.github.io/;https://nadavdym.github.io./", "dblp": "339/0274;;167/1176", "google_scholar": "T2YJQPoAAAAJ;https://scholar.google.co.il/citations?user=Lx2W9vMAAAAJ;https://scholar.google.co.il/citations?user=qOyXmMYAAAAJ", "orcid": ";0009-0003-1868-1860;", "linkedin": "senirhordan/;;", "or_profile": "~Snir_Hordan1;~Tal_Amir1;~Nadav_E_Dym1", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "campus.technion.ac.il;technion.ac.il;technion.ac.il", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nhordan2024weisfeiler,\ntitle={Weisfeiler Leman for Euclidean Equivariant Machine Learning},\nauthor={Snir Hordan and Tal Amir and Nadav Dym},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ApRKrKZJSk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 595880, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=508159414272564928&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "campus.technion.ac.il;technion.ac.il;technion.ac.il", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Token-Specific Watermarking with Enhanced Detectability and Semantic Coherence for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34750", "id": "AqBz54aFyj", "proceeding": "https://proceedings.mlr.press/v235/huo24a.html", "pdf": "https://openreview.net/pdf?id=AqBz54aFyj", "openreview": "https://openreview.net/forum?id=AqBz54aFyj", "author_site": "Mingjia Huo, Sai Ashish Somayajula, Youwei Liang, Ruisi Zhang, Farinaz Koushanfar, Pengtao Xie", "tldr": "", "abstract": "Large language models generate high-quality responses with potential misinformation, underscoring the need for regulation by distinguishing AI-generated and human-written texts. Watermarking is pivotal in this context: it embeds hidden markers, imperceptible to humans, in texts during the LLM inference phase. Achieving both the detectability of inserted watermarks and the semantic quality of generated texts is challenging. While current watermarking algorithms have made promising progress in this direction, there remains significant scope for improvement. 
To address these challenges, we introduce a novel multi-objective optimization (MOO) approach for watermarking that utilizes lightweight networks to generate token-specific watermarking logits and splitting ratios. By leveraging MOO to optimize for both detection and semantic objective functions, our method simultaneously achieves detectability and semantic integrity. Experimental results show that our method outperforms current watermarking techniques in enhancing the detectability of texts generated by LLMs while maintaining their semantic coherence. Our code is available at https://github.com/mignonjia/TS_watermark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingjia Huo;Sai Ashish Somayajula;Youwei Liang;Ruisi Zhang;Farinaz Koushanfar;Pengtao Xie", "authorids": "~Mingjia_Huo1;~Sai_Ashish_Somayajula1;~Youwei_Liang1;~Ruisi_Zhang2;~Farinaz_Koushanfar1;~Pengtao_Xie3", "gender": "F;M;M;F;F;M", "homepage": "https://mignonjia.github.io/;https://sai-ashish.github.io/website/;https://youweiliang.github.io/;http://ruisizhang.com/;https://farinaz.eng.ucsd.edu/;https://pengtaoxie.github.io/", "dblp": "242/8975;276/4407;257/5626;;k/FarinazKoushanfar.html;133/1998", "google_scholar": ";https://scholar.google.com/citations?hl=en;zMofZR4AAAAJ;n37X9CgAAAAJ;3XnMVUAAAAAJ;cnncomYAAAAJ", "orcid": ";;;;0000-0003-0798-3794;", "linkedin": ";sai-ashish-somayajula-750295140/;;;farinaz-koushanfar-9372a6a/;", "or_profile": "~Mingjia_Huo1;~Sai_Ashish_Somayajula1;~Youwei_Liang1;~Ruisi_Zhang2;~Farinaz_Koushanfar1;~Pengtao_Xie3", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego;Carnegie Mellon University", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu; ", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor;Graduate Student", "bibtex": "@inproceedings{\nhuo2024tokenspecific,\ntitle={Token-Specific Watermarking with Enhanced Detectability and Semantic Coherence for Large Language Models},\nauthor={Mingjia Huo and Sai Ashish Somayajula and Youwei Liang and Ruisi Zhang and Farinaz Koushanfar and Pengtao Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AqBz54aFyj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 800121, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17749962193791444071&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu; ", "author_num": 6, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "University of California, San Diego;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.cmu.edu", "aff_unique_abbr": "UCSD;CMU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Profile Reconstruction from Private Sketches", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34749", "id": "AqGCEHK9dZ", "proceeding": "https://proceedings.mlr.press/v235/wu24w.html", "pdf": "https://openreview.net/pdf?id=AqGCEHK9dZ", "openreview": "https://openreview.net/forum?id=AqGCEHK9dZ", "author_site": "Hao WU, Rasmus Pagh", "tldr": "", "abstract": "Given a multiset of 
$n$ items from $\\mathcal{D}$, the *profile reconstruction* problem is to estimate, for $t = 0, 1, \\dots, n$, the fraction $\\vec{f}[t]$ of items in $\\mathcal{D}$ that appear exactly $t$ times. We consider differentially private profile estimation in a distributed, space-constrained setting where we wish to maintain an updatable, private sketch of the multiset that allows us to compute an approximation of $\\vec{f} = (\\vec{f}[0], \\dots, \\vec{f}[n])$. Given a histogram privatized with discrete Laplace noise, we show how to ``reverse'' the noise using an approach of Dwork et al. (ITCS '10). We show how to speed up the algorithm from polynomial time to $O(d + n \\log n)$, and analyze the achievable error in the $\\ell_1$, $\\ell_2$ and $\\ell_\\infty$ norms. In all cases the dependency of the error on $d = |\\mathcal{D}|$ is $O( 1 / \\sqrt{d})$ --- we give an information-theoretic lower bound showing that this dependence on $d$ is asymptotically optimal among all private, updatable sketches for the profile reconstruction problem with a high-probability error guarantee.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao WU;Rasmus Pagh", "authorids": "~Hao_WU21;~Rasmus_Pagh1", "gender": "M;M", "homepage": ";https://www.rasmuspagh.net", "dblp": "72/4250-57;p/RasmusPagh", "google_scholar": "Aaui0ucAAAAJ;https://scholar.google.com.tw/citations?user=VO4oS8UAAAAJ", "orcid": ";0000-0002-1516-9306", "linkedin": "wuhaowujiang/;", "or_profile": "~Hao_WU21;~Rasmus_Pagh1", "aff": "Copenhagen University;University of Copenhagen", "aff_domain": "ku.dk;ku.dk", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nwu2024profile,\ntitle={Profile Reconstruction from Private Sketches},\nauthor={Hao WU and Rasmus Pagh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AqGCEHK9dZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 462365, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GP4c6RJKAaUJ:scholar.google.com/&scioq=Profile+Reconstruction+from+Private+Sketches&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "ku.dk;ku.dk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark" }, { "title": "Environment Design for Inverse Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34748", "id": "Ar0dsOMStE", "proceeding": "https://proceedings.mlr.press/v235/kleine-buening24a.html", "pdf": "https://openreview.net/pdf?id=Ar0dsOMStE", "openreview": "https://openreview.net/forum?id=Ar0dsOMStE", "author_site": "Thomas Kleine Buening, Victor Villin, Christos Dimitrakakis", "tldr": "", "abstract": "Learning a reward function from demonstrations suffers from low sample-efficiency. Even with abundant data, current inverse reinforcement learning methods that focus on learning from a single environment can fail to handle slight changes in the environment dynamics. We tackle these challenges through adaptive environment design.
In our framework, the learner repeatedly interacts with the expert, with the former selecting environments to identify the reward function as quickly as possible from the expert\u2019s demonstrations in said environments. This results in improvements in both sample-efficiency and robustness, as we show experimentally, for both exact and approximate inference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Kleine Buening;Victor Villin;Christos Dimitrakakis", "authorids": "~Thomas_Kleine_Buening1;~Victor_Villin1;~Christos_Dimitrakakis1", "gender": "M;M;M", "homepage": "https://thomasklbg.github.io/;;https://sites.google.com/site/christosdimitrakakis/", "dblp": "286/5270;304/3021;17/2535", "google_scholar": "1VT2sBgAAAAJ;;9Kw4t_kAAAAJ", "orcid": ";;0000-0002-5367-5189", "linkedin": "thomas-kleine-b%C3%BCning-594a4414a/;victor-villin/;", "or_profile": "~Thomas_Kleine_Buening1;~Victor_Villin1;~Christos_Dimitrakakis1", "aff": "University of Oslo, Norway;Universit\u00e9 de Neuch\u00e2tel;Chalmers University", "aff_domain": "uio.no;unine.ch;chalmers.se", "position": "PhD student;PhD student;Senior Researcher", "bibtex": "@inproceedings{\nbuening2024environment,\ntitle={Environment Design for Inverse Reinforcement Learning},\nauthor={Thomas Kleine Buening and Victor Villin and Christos Dimitrakakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ar0dsOMStE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4605383, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14015990790110367302&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "uio.no;unine.ch;chalmers.se", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Oslo;Universit\u00e9 de Neuch\u00e2tel;Chalmers University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uio.no;https://www.unine.ch;https://www.chalmers.se", "aff_unique_abbr": "UiO;UNINE;Chalmers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Norway;Switzerland;Sweden" }, { "title": "Trustless Audits without Revealing Data or Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34747", "id": "AtVtt9xsO1", "proceeding": "https://proceedings.mlr.press/v235/waiwitlikhit24a.html", "pdf": "https://openreview.net/pdf?id=AtVtt9xsO1", "openreview": "https://openreview.net/forum?id=AtVtt9xsO1", "author_site": "Suppakit Waiwitlikhit, Ion Stoica, Yi Sun, Tatsunori Hashimoto, Daniel Kang", "tldr": "", "abstract": "There is an increasing conflict between business incentives to hide models and data as trade secrets, and the societal need for algorithmic transparency. For example, a rightsholder who currently wishes to know whether their copyrighted works have been used during training must convince the model provider to allow a third party to audit the model and data. Finding a mutually agreeable third party is difficult, and the associated costs often make this approach impractical. In this work, we show that it is possible to simultaneously allow model providers to keep their models and data secret while allowing other parties to trustlessly audit properties of the model and data. 
We do this by designing a protocol called ZkAudit in which model providers publish cryptographic commitments of datasets and model weights, alongside a zero-knowledge proof (ZKP) certifying that published commitments are derived from training the model. Model providers can then respond to audit requests by privately computing any function F of the dataset (or model) and releasing the output of F alongside another ZKP certifying the correct execution of F. To enable ZkAudit, we develop new methods of computing ZKPs for SGD on modern neural nets for recommender systems and image classification models capable of high accuracies on ImageNet. Empirically, we show it is possible to provide trustless audits of DNNs, including copyright, censorship, and counterfactual audits with little to no loss in accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Suppakit Waiwitlikhit;Ion Stoica;Yi Sun;Tatsunori Hashimoto;Daniel Kang", "authorids": "~Suppakit_Waiwitlikhit3;~Ion_Stoica1;~Yi_Sun3;~Tatsunori_Hashimoto1;~Daniel_Kang1", "gender": "M;;M;;M", "homepage": "http://people.eecs.berkeley.edu/~istoica/;https://yisun.io/;https://thashim.github.io;https://ddkang.github.io/;https://punw.xyz", "dblp": "s/IonStoica;;;40/6300.html;", "google_scholar": "vN-is70AAAAJ;FdNHp8QAAAAJ;5ygiTwsAAAAJ;CpMjT0YAAAAJ;", "orcid": ";;;;", "linkedin": "ionstoica;;;;", "or_profile": "~Ion_Stoica1;~Yi_Sun3;~Tatsunori_Hashimoto1;~Daniel_Kang1;~Pun_Waiwitlikhit1", "aff": "University of California, Berkeley;University of Chicago;Stanford University;Department of Computer Science;Stanford University", "aff_domain": "berkeley.edu;statistics.uchicago.edu;stanford.edu;cs.illinois.edu;stanford.edu", "position": "Full Professor;Assistant Professor;Assistant Professor;Assistant Professor;Undergrad student", "bibtex": "@inproceedings{\nwaiwitlikhit2024trustless,\ntitle={Trustless Audits without Revealing Data or Models},\nauthor={Suppakit Waiwitlikhit and Ion Stoica and Yi Sun and Tatsunori Hashimoto and Daniel Kang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AtVtt9xsO1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 361921, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17365904962155707719&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "berkeley.edu;statistics.uchicago.edu;stanford.edu;cs.illinois.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "University of California, Berkeley;University of Chicago;Stanford University;Unknown Institution", "aff_unique_dep": ";;;Department of Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.uchicago.edu;https://www.stanford.edu;", "aff_unique_abbr": "UC Berkeley;UChicago;Stanford;", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Berkeley;;Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "title": "Principled Gradient-Based MCMC for Conditional Sampling of Text", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34746", "id": "AwLLSlJAeJ", "proceeding": "https://proceedings.mlr.press/v235/du24a.html", "pdf": "https://openreview.net/pdf?id=AwLLSlJAeJ", "openreview": "https://openreview.net/forum?id=AwLLSlJAeJ", "author_site": "Li Du, Afra Amini, Lucas Torroba Hennigen, Xinyan Velocity Yu, Holden Lee, Jason 
Eisner, Ryan Cotterell", "tldr": "", "abstract": "We consider the problem of sampling text from an energy-based model. This arises, for example, when sampling text from a neural language model subject to soft constraints. Although the target distribution is discrete, the internal computations of the energy function (given by the language model) are differentiable, so one would like to exploit gradient information within a method such as MCMC. Alas, all previous attempts to generalize gradient-based MCMC to text sampling fail to sample correctly from the target distribution. We propose a solution, along with variants, and study its theoretical properties. Through experiments on various forms of text generation, we demonstrate that our unbiased samplers are able to generate more fluent text while better adhering to the control objectives. The same methods could be used to sample from discrete energy-based models unrelated to text.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Li Du;Afra Amini;Lucas Torroba Hennigen;Xinyan Velocity Yu;Holden Lee;Jason Eisner;Ryan Cotterell", "authorids": "~Li_Du2;~Afra_Amini1;~Lucas_Torroba_Hennigen1;~Xinyan_Velocity_Yu1;~Holden_Lee1;~Jason_Eisner1;~Ryan_Cotterell1", "gender": "M;F;M;M;M;Not Specified;F", "homepage": ";;https://ltorroba.github.io/;http://holdenlee.github.io;http://cs.jhu.edu/~jason;https://rycolab.io/;https://velocitycavalry.github.io", "dblp": ";270/4959;267/9755;150/3407;37/3263;146/4361.html;165/9117-1", "google_scholar": "efDU43kAAAAJ;;Zhy1N1sAAAAJ;hR9rFHgAAAAJ;tjb2UccAAAAJ;DexOqtoAAAAJ;PoZv5KkAAAAJ", "orcid": ";;0000-0002-8197-9008;;0000-0002-8861-0772;;", "linkedin": ";afraamini;lucas-torroba-hennigen/;;;;", "or_profile": "~Li_Du2;~Afra_Amini1;~Lucas_Torroba_Hennigen1;~Holden_Lee1;~Jason_Eisner1;~Ryan_D_Cotterell1;~Xinyan_Yu2", "aff": "Johns Hopkins University;ETHZ - ETH Zurich;Massachusetts Institute of Technology;Johns Hopkins University;Microsoft;Swiss Federal Institute of Technology;University of Southern California", "aff_domain": "cs.jhu.edu;ethz.ch;mit.edu;jh.edu;microsoft.com;ethz.ch;usc.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Director of Research ;Assistant Professor;PhD student", "bibtex": "@inproceedings{\ndu2024principled,\ntitle={Principled Gradient-Based {MCMC} for Conditional Sampling of Text},\nauthor={Li Du and Afra Amini and Lucas Torroba Hennigen and Xinyan Velocity Yu and Holden Lee and Jason Eisner and Ryan Cotterell},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AwLLSlJAeJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 734638, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3542368365069265519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cs.jhu.edu;ethz.ch;mit.edu;jh.edu;microsoft.com;ethz.ch;usc.edu", "author_num": 7, "aff_unique_index": "0;1;2;0;3;4;5", "aff_unique_norm": "Johns Hopkins University;ETH Zurich;Massachusetts Institute of Technology;Microsoft;Swiss Federal Institute of Technology;University of Southern California", "aff_unique_dep": ";;;Microsoft Corporation;;", "aff_unique_url": "https://www.jhu.edu;https://www.ethz.ch;https://web.mit.edu;https://www.microsoft.com;https://www.ethz.ch;https://www.usc.edu", "aff_unique_abbr": "JHU;ETHZ;MIT;Microsoft;ETH Zurich;USC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los 
Angeles", "aff_country_unique_index": "0;1;0;0;0;1;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Learning Constraints from Offline Demonstrations via Superior Distribution Correction Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34745", "id": "Ax90jQPbgF", "proceeding": "https://proceedings.mlr.press/v235/quan24a.html", "pdf": "https://openreview.net/pdf?id=Ax90jQPbgF", "openreview": "https://openreview.net/forum?id=Ax90jQPbgF", "author_site": "Guorui Quan, Zhiqiang Xu, Guiliang Liu", "tldr": "", "abstract": "An effective approach for learning both safety constraints and control policies is Inverse Constrained Reinforcement Learning (ICRL). Previous ICRL algorithms commonly employ an online learning framework that permits unlimited sampling from an interactive environment. This setting, however, is infeasible in many realistic applications where data collection is dangerous and expensive. To address this challenge, we propose Inverse Constrained Superior Distribution Correction Estimation (ICSDICE) as an offline ICRL solver. ICSDICE extracts feasible constraints from superior distributions, thereby highlighting policies with expert-exceeding rewards maximization ability. To estimate these distributions, ICSDICE solves a regularized dual optimization problem for safe control by exploiting the observed reward signals and expert preferences. Striving for transferable constraints and unbiased estimations, ICSDICE actively encourages sparsity and incorporates a discounting effect within the learned and observed distributions. Empirical studies show that ICSDICE outperforms other baselines by accurately recovering the constraints and adapting to high-dimensional environments. The code is available at https://github.com/quangr/ICSDICE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guorui Quan;zhiqiang xu;Guiliang Liu", "authorids": "~Guorui_Quan1;~zhiqiang_xu1;~Guiliang_Liu1", "gender": ";M;M", "homepage": "https://github.com/quangr/;https://scholar.google.com/citations?user=0R20iBMAAAAJ&hl=en;http://guiliang.me/", "dblp": ";72/51-3.html;220/5411", "google_scholar": ";;CuMylvEAAAAJ", "orcid": ";0000-0002-5693-8933;", "linkedin": ";;", "or_profile": "~Guorui_Quan1;~zhiqiang_xu1;~Guiliang_Liu1", "aff": "The Chinese University of Hong Kong, Shenzhen;Mohamed bin Zayed University of Artificial Intelligence;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.hk;mbzuai.ac.ae;cuhk.edu.hk", "position": "Visiting Student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nquan2024learning,\ntitle={Learning Constraints from Offline Demonstrations via Superior Distribution Correction Estimation},\nauthor={Guorui Quan and zhiqiang xu and Guiliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ax90jQPbgF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 884078, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5445709520702637777&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 4, "email": "cuhk.edu.hk;mbzuai.ac.ae;cuhk.edu.hk", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.cn;https://mbzuai.ac.ae", 
"aff_unique_abbr": "CUHK;MBZUAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United Arab Emirates" }, { "title": "TimeMIL: Advancing Multivariate Time Series Classification via a Time-aware Multiple Instance Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34744", "id": "AxmefV2NEf", "proceeding": "https://proceedings.mlr.press/v235/chen24af.html", "pdf": "https://openreview.net/pdf?id=AxmefV2NEf", "openreview": "https://openreview.net/forum?id=AxmefV2NEf", "author_site": "Xiwen Chen, Peijie Qiu, Wenhui Zhu, Huayu Li, Hao Wang, Aristeidis Sotiras, Yalin Wang, Abolfazl Razi", "tldr": "", "abstract": "Deep neural networks, including transformers and convolutional neural networks (CNNs), have significantly improved multivariate time series classification (MTSC). However, these methods often rely on supervised learning, which does not fully account for the sparsity and locality of patterns in time series data (e.g., quantification of diseases-related anomalous points in ECG and abnormal detection in signal). To address this challenge, we formally discuss and reformulate MTSC as a weakly supervised problem, introducing a novel multiple-instance learning (MIL) framework for better localization of patterns of interest and modeling time dependencies within time series. Our novel approach, TimeMIL, formulates the temporal correlation and ordering within a time-aware MIL pooling, leveraging a tokenized transformer with a specialized learnable wavelet positional token. The proposed method surpassed 26 recent state-of-the-art MTSC methods, underscoring the effectiveness of the weakly supervised TimeMIL in MTSC. The code is available https://github.com/xiwenc1/TimeMIL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiwen Chen;Peijie Qiu;Wenhui Zhu;Huayu Li;Hao Wang;Aristeidis Sotiras;Yalin Wang;Abolfazl Razi", "authorids": "~Xiwen_Chen3;~Peijie_Qiu1;~Wenhui_Zhu1;~Huayu_Li2;~Hao_Wang68;~Aristeidis_Sotiras3;~Yalin_Wang3;~Abolfazl_Razi2", "gender": ";M;M;M;M;M;M;M", "homepage": ";https://scholar.google.com/citations?user=7HLmlHMAAAAJ&hl=en&citsig=AMD79oo-MqB0FFGuso3koVXYlItYOGEl-w;https://github.com/ChongQingNoSubway;;https://scholar.google.com/citations?user=sdcnttQAAAAJ&hl=en;https://mir.wustl.edu/sotiras-lab;http://gsl.lab.asu.edu;https://arazi.people.clemson.edu/", "dblp": ";331/3793;;;181/2812-176;28/7425;88/128-1;83/8819", "google_scholar": ";7HLmlHMAAAAJ;Se8aIO4YIp8C;J4Guh_EAAAAJ;sdcnttQAAAAJ;MsNwZ-IAAAAJ;F4tTgLQAAAAJ;DhwC8gsAAAAJ", "orcid": ";0000-0002-1591-5436;;;0000-0002-3035-3064;0000-0003-0795-8820;0000-0002-6241-735X;0000-0002-3330-6132", "linkedin": ";;wenhui-zhu-a7101b226/;;;;;abolfazl-razi-48886522/", "or_profile": "~Xiwen_Chen3;~Peijie_Qiu1;~Wenhui_Zhu1;~Huayu_Li2;~Hao_Wang68;~Aristeidis_Sotiras3;~Yalin_Wang3;~Abolfazl_Razi2", "aff": ";Washington University, Saint Louis;Arizona State University;University of Arizona;Clemson University;Washington University, Saint Louis;Arizona State University;Clemson University", "aff_domain": ";wustl.edu;asu.edu;arizona.edu;clemson.edu;wustl.edu;asu.edu;clemson.edu", "position": ";PhD student;PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2024timemil,\ntitle={Time{MIL}: Advancing Multivariate Time Series Classification via a Time-aware Multiple Instance Learning},\nauthor={Xiwen Chen and Peijie Qiu and Wenhui 
Zhu and Huayu Li and Hao Wang and Aristeidis Sotiras and Yalin Wang and Abolfazl Razi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AxmefV2NEf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4999068083869921182&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": ";wustl.edu;asu.edu;arizona.edu;clemson.edu;wustl.edu;asu.edu;clemson.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;0;1;3", "aff_unique_norm": "Washington University in St. Louis;Arizona State University;University of Arizona;Clemson University", "aff_unique_dep": ";;;", "aff_unique_url": "https://wustl.edu;https://www.asu.edu;https://www.arizona.edu;https://www.clemson.edu", "aff_unique_abbr": "WUSTL;ASU;UA;Clemson", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Saint Louis;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Weight Dynamics of Deep Normalized Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34743", "id": "AzUCfhJ9Bs", "proceeding": "https://proceedings.mlr.press/v235/ali-mehmeti-gopel24a.html", "pdf": "https://openreview.net/pdf?id=AzUCfhJ9Bs", "openreview": "https://openreview.net/forum?id=AzUCfhJ9Bs", "author_site": "Christian H.X. Ali Mehmeti-G\u00f6pel, Michael Wand", "tldr": "", "abstract": "Recent studies have shown that high disparities in effective learning rates (ELRs) across layers in deep neural networks can negatively affect trainability. We formalize how these disparities evolve over time by modeling weight dynamics (evolution of expected gradient and weight norms) of networks with normalization layers, predicting the evolution of layer-wise ELR ratios. We prove that when training with any constant learning rate, ELR ratios converge to 1, despite initial gradient explosion. We identify a \"critical learning rate\" beyond which ELR disparities widen, which only depends on current ELRs. To validate our findings, we devise a hyper-parameter-free warm-up method that successfully minimizes ELR spread quickly in theory and practice. Our experiments link ELR spread with trainability, a relationship that is most evident in very deep networks with significant gradient magnitude excursions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christian H.X. Ali Mehmeti-G\u00f6pel;Michael Wand", "authorids": "~Christian_H.X._Ali_Mehmeti-G\u00f6pel1;~Michael_Wand1", "gender": "M;M", "homepage": "https://github.com/c-ali;https://www.staff.uni-mainz.de/wandm", "dblp": ";85/72", "google_scholar": "EjpEF2EAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Christian_H.X._Ali_Mehmeti-G\u00f6pel1;~Michael_Wand1", "aff": ";University of Mainz", "aff_domain": ";uni-mainz.de", "position": ";Full Professor", "bibtex": "@inproceedings{\nmehmeti-g{\\\"o}pel2024on,\ntitle={On the Weight Dynamics of Deep Normalized Networks},\nauthor={Christian H.X. 
Ali Mehmeti-G{\\\"o}pel and Michael Wand},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AzUCfhJ9Bs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1515755, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14347397579644643442&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";uni-mainz.de", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Johannes Gutenberg University Mainz", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-mainz.de/", "aff_unique_abbr": "JGU", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "DISCRET: Synthesizing Faithful Explanations For Treatment Effect Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34742", "id": "B0xmynxt4f", "proceeding": "https://proceedings.mlr.press/v235/wu24n.html", "pdf": "https://openreview.net/pdf?id=B0xmynxt4f", "openreview": "https://openreview.net/forum?id=B0xmynxt4f", "author_site": "Yinjun Wu, Mayank Keoliya, Kan Chen, Neelay Velingker, Ziyang Li, Emily Getzen, Qi Long, Mayur Naik, Ravi Parikh, Eric Wong", "tldr": "", "abstract": "Designing faithful yet accurate AI models is challenging, particularly in the field of individual treatment effect estimation (ITE). ITE prediction models deployed in critical settings such as healthcare should ideally be (i) accurate, and (ii) provide faithful explanations. However, current solutions are inadequate: state-of-the-art black-box models do not supply explanations, post-hoc explainers for black-box models lack faithfulness guarantees, and self-interpretable models greatly compromise accuracy. To address these issues, we propose DISCRET, a self-interpretable ITE framework that synthesizes faithful, rule-based explanations for each sample. A key insight behind DISCRET is that explanations can serve dually as *database queries* to identify similar subgroups of samples. We provide a novel RL algorithm to efficiently synthesize these explanations from a large search space. We evaluate DISCRET on diverse tasks involving tabular, image, and text data. DISCRET outperforms the best self-interpretable models and has accuracy comparable to the best black-box models while providing faithful explanations. 
DISCRET is available at https://github.com/wuyinjun-1993/DISCRET-ICML2024.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yinjun Wu;Mayank Keoliya;Kan Chen;Neelay Velingker;Ziyang Li;Emily J Getzen;Qi Long;Mayur Naik;Ravi B Parikh;Eric Wong", "authorids": "~Yinjun_Wu1;~Mayank_Keoliya1;~Kan_Chen3;~Neelay_Velingker1;~Ziyang_Li2;~Emily_J_Getzen1;~Qi_Long1;~Mayur_Naik1;~Ravi_B_Parikh1;~Eric_Wong1", "gender": "M;M;M;M;M;F;M;M;M;M", "homepage": "https://wuyinjun-1993.github.io/;https://mkeoliya.github.io;https://sites.google.com/sas.upenn.edu/kanchen;https://www.linkedin.com/in/nvelingker/;https://liby99.github.io;https://sites.google.com/view/emilygetzen/home;https://www.med.upenn.edu/long-lab/;http://www.cis.upenn.edu/~mhnaik/;https://www.haclab.org/;http://riceric22.github.io/", "dblp": "169/1054;;;236/5641;;;47/7320;92/6794;;64/1811-1.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;aAQ9abEAAAAJ;;gfklepYAAAAJ;https://scholar.google.com.tw/citations?user=fmsV6nEAAAAJ;CEp7XO0AAAAJ;pWnTMRkAAAAJ", "orcid": ";;;;;;0000-0003-0660-5230;;;", "linkedin": ";;;;liby99/;;qi-long-9652a0125/;ai4code/;ravibparikh/;", "or_profile": "~Yinjun_Wu1;~Mayank_Keoliya1;~Kan_Chen3;~Neelay_Velingker1;~Ziyang_Li2;~Emily_J_Getzen1;~Qi_Long1;~Mayur_Naik1;~Ravi_B_Parikh1;~Eric_Wong1", "aff": "University of Pennsylvania;University of Pennsylvania;Harvard University;University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania Perelman School of Medicine;University of Pennsylvania;University of Pennsylvania;Emory University;University of Pennsylvania", "aff_domain": "seas.upenn.edu;upenn.edu;harvard.edu;upenn.edu;seas.upenn.edu;pennmedicine.upenn.edu;upenn.edu;upenn.edu;emory.edu;upenn.edu", "position": "Postdoc;PhD student;Postdoc;PhD student;PhD student;PhD student;Professor;Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwu2024discret,\ntitle={{DISCRET}: Synthesizing Faithful Explanations For Treatment Effect Estimation},\nauthor={Yinjun Wu and Mayank Keoliya and Kan Chen and Neelay Velingker and Ziyang Li and Emily J Getzen and Qi Long and Mayur Naik and Ravi B Parikh and Eric Wong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B0xmynxt4f}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6063164, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2424815178574112071&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 13, "email": "seas.upenn.edu;upenn.edu;harvard.edu;upenn.edu;seas.upenn.edu;pennmedicine.upenn.edu;upenn.edu;upenn.edu;emory.edu;upenn.edu", "author_num": 10, "aff_unique_index": "0;0;1;0;0;0;0;0;2;0", "aff_unique_norm": "University of Pennsylvania;Harvard University;Emory University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.upenn.edu;https://www.harvard.edu;https://www.emory.edu", "aff_unique_abbr": "UPenn;Harvard;Emory", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "NExT: Teaching Large Language Models to Reason about Code Execution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34741", "id": "B1W712hMBi", "proceeding": "https://proceedings.mlr.press/v235/ni24a.html", "pdf": 
"https://openreview.net/pdf?id=B1W712hMBi", "openreview": "https://openreview.net/forum?id=B1W712hMBi", "author_site": "Ansong Ni, Miltiadis Allamanis, Arman Cohan, Yinlin Deng, Kensen Shi, Charles Sutton, Pengcheng Yin", "tldr": "", "abstract": "A fundamental skill among human developers is the ability to understand and reason about program execution. As an example, a programmer can mentally simulate code execution in natural language to debug and repair code (aka. rubber duck debugging). However, large language models (LLMs) of code are typically trained on the surface textual form of programs, thus may lack a semantic understanding of how programs execute at run-time. To address this issue, we propose NExT, a method to teach LLMs to inspect the execution traces of programs (variable states of executed lines) and reason about their run-time behavior through chain-of-thought (CoT) rationales. Specifically, NExT uses self-training to bootstrap a synthetic training set of execution-aware rationales that lead to correct task solutions (e.g., fixed programs) without laborious manual annotation. Experiments on program repair tasks based on MBPP and HumanEval demonstrate that NExT improves the fix rate of a PaLM 2 model, by 26.1% and 10.3% absolute, respectively, with significantly improved rationale quality as verified by automated metrics and human raters. Our model can also generalize to scenarios where program traces are absent at test-time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ansong Ni;Miltiadis Allamanis;Arman Cohan;Yinlin Deng;Kensen Shi;Charles Sutton;Pengcheng Yin", "authorids": "~Ansong_Ni1;~Miltiadis_Allamanis1;~Arman_Cohan1;~Yinlin_Deng1;~Kensen_Shi1;~Charles_Sutton1;~Pengcheng_Yin1", "gender": "M;;M;F;M;M;M", "homepage": "https://niansong1996.github.io/;;http://www.armancohan.com;https://dengyinlin.github.io;;http://homepages.inf.ed.ac.uk/csutton/;https://pengcheng.in", "dblp": "202/1480;;160/1727;311/4562;135/8307;59/5879;130/7385", "google_scholar": "4IA1clAAAAAJ;;https://scholar.google.com/citations?hl=en;FsP6Nl0AAAAJ;LAL4SIMAAAAJ;https://scholar.google.co.uk/citations?user=hYtGXD0AAAAJ;t5lVb6sAAAAJ", "orcid": ";;;0000-0002-4628-4219;0000-0001-7140-7869;0000-0002-0041-3820;", "linkedin": ";;;;;charles-sutton-772aa126;pchyin/", "or_profile": "~Ansong_Ni1;~Miltiadis_Allamanis1;~Arman_Cohan1;~Yinlin_Deng1;~Kensen_Shi1;~Charles_Sutton1;~Pengcheng_Yin1", "aff": "Yale University;;Allen Institute for Artificial Intelligence;University of Illinois Urbana-Champaign;Google;University of Edinburgh;Google", "aff_domain": "yale.edu;;allenai.org;illinois.edu;google.com;ed.ac.uk;google.com", "position": "PhD student;;Research Scientist;PhD student;Software Engineer;Professor;Researcher", "bibtex": "@inproceedings{\nni2024next,\ntitle={{NE}xT: Teaching Large Language Models to Reason about Code Execution},\nauthor={Ansong Ni and Miltiadis Allamanis and Arman Cohan and Yinlin Deng and Kensen Shi and Charles Sutton and Pengcheng Yin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B1W712hMBi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1254698, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18097303260139004775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "yale.edu;;allenai.org;illinois.edu;google.com;ed.ac.uk;google.com", "author_num": 7, 
"aff_unique_index": "0;1;2;3;4;3", "aff_unique_norm": "Yale University;Allen Institute for Artificial Intelligence;University of Illinois Urbana-Champaign;Google;University of Edinburgh", "aff_unique_dep": ";;;Google;", "aff_unique_url": "https://www.yale.edu;https://allenai.org;https://illinois.edu;https://www.google.com;https://www.ed.ac.uk", "aff_unique_abbr": "Yale;AI2;UIUC;Google;Edinburgh", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Urbana-Champaign;Mountain View", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Learning Universal Predictors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34740", "id": "B1ajnQyZgK", "proceeding": "https://proceedings.mlr.press/v235/grau-moya24a.html", "pdf": "https://openreview.net/pdf?id=B1ajnQyZgK", "openreview": "https://openreview.net/forum?id=B1ajnQyZgK", "author_site": "Jordi Grau-Moya, Tim Genewein, Marcus Hutter, Laurent Orseau, Gregoire Deletang, Elliot Catt, Anian Ruoss, Li Kevin Wenliang, Christopher Mattern, Matthew Aitchison, Joel Veness", "tldr": "", "abstract": "Meta-learning has emerged as a powerful approach to train neural networks to learn new tasks quickly from limited data by pre-training them on a broad set of tasks. But, what are the limits of meta-learning? In this work, we explore the potential of amortizing the most powerful universal predictor, namely Solomonoff Induction (SI), into neural networks via leveraging (memory-based) meta-learning to its limits. We use Universal Turing Machines (UTMs) to generate training data used to expose networks to a broad range of patterns. We provide theoretical analysis of the UTM data generation processes and meta-training protocols. We conduct comprehensive experiments with neural architectures (e.g. LSTMs, Transformers) and algorithmic data generators of varying complexity and universality. 
Our results suggest that UTM data is a valuable resource for meta-learning, and that it can be used to train neural networks capable of learning universal prediction strategies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jordi Grau-Moya;Tim Genewein;Marcus Hutter;Laurent Orseau;Gregoire Deletang;Elliot Catt;Anian Ruoss;Li Kevin Wenliang;Christopher Mattern;Matthew Aitchison;Joel Veness", "authorids": "~Jordi_Grau-Moya2;~Tim_Genewein1;~Marcus_Hutter1;~Laurent_Orseau1;~Gregoire_Deletang1;~Elliot_Catt1;~Anian_Ruoss1;~Li_Kevin_Wenliang1;~Christopher_Mattern1;~Matthew_Aitchison1;~Joel_Veness2", "gender": ";M;;;;M;M;;;M;", "homepage": ";http://tim.inversetemperature.net/;http://www.hutter1.net/;;;;;https://kevin-w-li.github.io/;;;", "dblp": "116/3023;116/3039;h/MarcusHutter;79/1040;;204/2511;259/2083;255/7009;19/10437.html;;", "google_scholar": ";https://scholar.google.de/citations?user=peNTK9oAAAAJ;https://scholar.google.com.tw/citations?user=7hmCntEAAAAJ;;;d1JYeMIAAAAJ;gFkwD3kAAAAJ;https://scholar.google.co.uk/citations?user=MW45NMEAAAAJ;;81URpqMAAAAJ;", "orcid": ";;0000-0002-3263-4097;;;0000-0001-9411-927X;;;;;", "linkedin": "jordi-g-9a1b02104;;hutter1/;;;;anian-ruoss;;;;", "or_profile": "~Jordi_Grau-Moya2;~Tim_Genewein1;~Marcus_Hutter1;~Laurent_Orseau1;~Gregoire_Deletang1;~Elliot_Catt1;~Anian_Ruoss1;~Li_Kevin_Wenliang1;~Christopher_Mattern1;~Matthew_Aitchison1;~Joel_Veness2", "aff": "Google DeepMind;Google DeepMind;Australian National University;;;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Australian National University;", "aff_domain": "deepmind.com;google.com;anu.edu.au;;;deepmind.com;deepmind.com;deepmind.com;deepmind.com;anu.edu.au;", "position": "Researcher;Researcher;Full Professor;;;Researcher;Researcher;Researcher;Researcher;PhD student;", "bibtex": "@inproceedings{\ngrau-moya2024learning,\ntitle={Learning Universal Predictors},\nauthor={Jordi Grau-Moya and Tim Genewein and Marcus Hutter and Laurent Orseau and Gregoire Deletang and Elliot Catt and Anian Ruoss and Li Kevin Wenliang and Christopher Mattern and Matthew Aitchison and Joel Veness},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B1ajnQyZgK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1294732, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9873725797996232966&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "deepmind.com;google.com;anu.edu.au;;;deepmind.com;deepmind.com;deepmind.com;deepmind.com;anu.edu.au;", "author_num": 11, "aff_unique_index": "0;0;1;0;0;0;0;1", "aff_unique_norm": "Google;Australian National University", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.anu.edu.au", "aff_unique_abbr": "DeepMind;ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;1", "aff_country_unique": "United Kingdom;Australia" }, { "title": "LLaGA: Large Language and Graph Assistant", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34739", "id": "B48Pzc4oKi", "proceeding": "https://proceedings.mlr.press/v235/chen24bh.html", "pdf": "https://openreview.net/pdf?id=B48Pzc4oKi", "openreview": "https://openreview.net/forum?id=B48Pzc4oKi", "author_site": "Runjin Chen, Tong Zhao, Ajay Jaiswal, Neil Shah, Zhangyang \u201cAtlas\u201d Wang", "tldr": "", 
"abstract": "Graph Neural Networks (GNNs) have empowered the advance in graph-structured data analysis. Recently, the rise of Large Language Models (LLMs) like GPT-4 has heralded a new era in deep learning. However, their application to graph data poses distinct challenges due to the inherent difficulty of translating graph structures to language. To this end, we introduce the the **L**arge **L**anguage **a**nd **G**raph **A**ssistant (**LLaGA**), an innovative model that effectively integrates LLM capabilities to handle the complexities of graph-structured data. LLaGA retains the general-purpose nature of LLMs while adapting graph data into a format compatible with LLM input. LLaGA achieves this by reorganizing graph nodes to structure-aware sequences and then mapping these into the token embedding space through a versatile projector. LLaGA excels in versatility, generalizability and interpretability, allowing it to perform consistently well across different datasets and tasks, extend its ability to unseen datasets or tasks, and provide explanations for graphs. Our extensive experiments across popular graph benchmarks show that LLaGA delivers outstanding performance across four datasets and three tasks using one single model, surpassing state-of-the-art graph models in both supervised and zero-shot scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Runjin Chen;Tong Zhao;AJAY KUMAR JAISWAL;Neil Shah;Zhangyang Wang", "authorids": "~Runjin_Chen1;~Tong_Zhao3;~AJAY_KUMAR_JAISWAL1;~Neil_Shah2;~Zhangyang_Wang1", "gender": ";M;M;M;M", "homepage": ";https://tzhao.io/;https://ajay1994.github.io/;http://nshah.net;https://vita-group.github.io", "dblp": ";94/6503-3;30/9707;71/7771;119/4026", "google_scholar": ";05cRc-MAAAAJ;I783HxYAAAAJ;Qut69OgAAAAJ;pxFyKAIAAAAJ", "orcid": ";0000-0001-7660-1732;;0000-0003-3261-8430;", "linkedin": ";;;;", "or_profile": "~Runjin_Chen1;~Tong_Zhao3;~AJAY_KUMAR_JAISWAL1;~Neil_Shah2;~Zhangyang_Wang1", "aff": ";Snap Inc.;University of Texas, Austin;Snap Inc.;University of Texas at Austin", "aff_domain": ";snap.com;utexas.edu;snap.com;utexas.edu", "position": ";Researcher;PhD student;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nchen2024llaga,\ntitle={{LL}a{GA}: Large Language and Graph Assistant},\nauthor={Runjin Chen and Tong Zhao and AJAY KUMAR JAISWAL and Neil Shah and Zhangyang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B48Pzc4oKi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 545050, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2480317403211119466&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": ";snap.com;utexas.edu;snap.com;utexas.edu", "author_num": 5, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Snap Inc.;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.snapinc.com;https://www.utexas.edu", "aff_unique_abbr": "Snap;UT Austin", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "CCM: Real-Time Controllable Visual Content Creation Using Text-to-Image Consistency Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34738", "id": "B4rViOCoNf", "proceeding": "https://proceedings.mlr.press/v235/xiao24h.html", 
"pdf": "https://openreview.net/pdf?id=B4rViOCoNf", "openreview": "https://openreview.net/forum?id=B4rViOCoNf", "author_site": "Jie Xiao, Kai Zhu, Han Zhang, Zhiheng Liu, Yujun Shen, Zhantao Yang, Ruili Feng, Yu Liu, Xueyang Fu, Zheng-Jun Zha", "tldr": "", "abstract": "Consistency Models (CMs) have showed a promise in creating high-quality images with few steps. However, the way to add new conditional controls to the pre-trained CMs has not been explored. In this paper, we explore the pivotal subject of leveraging the generative capacity and efficiency of consistency models to facilitate controllable visual content creation via ControlNet. First, it is observed that ControlNet trained for diffusion models (DMs) can be directly applied to CMs for high-level semantic controls but sacrifice image low-level details and realism. To tackle with this issue, we develop a CMs-tailored training strategy for ControlNet using the consistency training. It is substantiated that ControlNet can be successfully established through the consistency training technique. Besides, a unified adapter can be trained utilizing the consistency training, which enhances the adaptation of DM's ControlNet. We quantitatively and qualitatively evaluate all strategies across various conditional controls, including sketch, hed, canny, depth, human pose, low-resolution image and masked image, with the pre-trained text-to-image latent consistency models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie Xiao;Kai Zhu;Han Zhang;Zhiheng Liu;Yujun Shen;Zhantao Yang;Ruili Feng;Yu Liu;Xueyang Fu;Zheng-Jun Zha", "authorids": "~Jie_Xiao3;~Kai_Zhu4;~Han_Zhang16;~Zhiheng_Liu1;~Yujun_Shen1;~Zhantao_Yang1;~Ruili_Feng1;~Yu_Liu23;~Xueyang_Fu1;~Zheng-Jun_Zha2", "gender": "M;;M;M;;M;;M;;M", "homepage": "https://jiexiaou.github.io/;;https://github.com/bibona;https://Johanan528.github.io;;;https://github.com/RuiLiFeng;https://github.com/liuyuyuil;;", "dblp": "15/3437-2;75/4078-4;26/4189-10;;;285/8489.html;20/9594;97/2274-63;;23/1818", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?hl=zh-CN;;LT5JwlcAAAAJ;;Fz3X5FwAAAAJ;;8zksQb4AAAAJ;;", "orcid": "0000-0002-5677-270X;;;;;0000-0003-2765-295X;;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Jie_Xiao3;~Kai_Zhu4;~Han_Zhang16;~Zhiheng_Liu1;~Yujun_Shen1;~Zhantao_Yang1;~Ruili_Feng1;~Yu_Liu23;~Xueyang_Fu1;~Zheng-Jun_Zha2", "aff": "University of Science and Technology of China;University of Science and Technology of China;Shanghai Jiaotong University;University of Science and Technology of China;;Shanghai Jiaotong University;University of Science and Technology of China;Alibaba Group;;University of Science and Technology of China", "aff_domain": "mail.ustc.edu.cn;ustc.edu.cn;sjtu.edu.cn;ustc.edu.cn;;sjtu.edu.cn;mail.ustc.edu.cn;alibaba-inc.com;;ustc.edu.cn", "position": "PhD student;Postdoc;PhD student;MS student;;PhD student;PhD student;Researcher;;Full Professor", "bibtex": "@inproceedings{\nxiao2024ccm,\ntitle={{CCM}: Real-Time Controllable Visual Content Creation Using Text-to-Image Consistency Models},\nauthor={Jie Xiao and Kai Zhu and Han Zhang and Zhiheng Liu and Yujun Shen and Zhantao Yang and Ruili Feng and Yu Liu and Xueyang Fu and Zheng-Jun Zha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B4rViOCoNf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9231162, "rating": "", "rating_avg": 0, "replies_avg": 0, 
"authors#_avg": 10, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13658194133310341159&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 4, "email": "mail.ustc.edu.cn;ustc.edu.cn;sjtu.edu.cn;ustc.edu.cn;;sjtu.edu.cn;mail.ustc.edu.cn;alibaba-inc.com;;ustc.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;0;1;0;2;0", "aff_unique_norm": "University of Science and Technology of China;Shanghai Jiao Tong University;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.sjtu.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "USTC;SJTU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Automated Statistical Model Discovery with Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34737", "id": "B5906M4Wnd", "proceeding": "https://proceedings.mlr.press/v235/li24v.html", "pdf": "https://openreview.net/pdf?id=B5906M4Wnd", "openreview": "https://openreview.net/forum?id=B5906M4Wnd", "author_site": "Michael Li, Emily Fox, Noah Goodman", "tldr": "", "abstract": "Statistical model discovery is a challenging search over a vast space of models subject to domain-specific constraints. Efficiently searching over this space requires expertise in modeling and the problem domain. Motivated by the domain knowledge and programming capabilities of large language models (LMs), we introduce a method for language model driven automated statistical model discovery. We cast our automated procedure within the principled framework of Box\u2019s Loop: the LM iterates between proposing statistical models represented as probabilistic programs, acting as a modeler, and critiquing those models, acting as a domain expert. By leveraging LMs, we do not have to define a domain-specific language of models or design a handcrafted search procedure, which are key restrictions of previous systems. We evaluate our method in three settings in probabilistic modeling: searching within a restricted space of models, searching over an open-ended space, and improving expert models under natural language constraints (e.g., this model should be interpretable to an ecologist). Our method identifies models on par with human expert designed models and extends classic models in interpretable ways. Our results highlight the promise of LM-driven model discovery.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Y. Li;Emily Fox;Noah Goodman", "authorids": "~Michael_Y._Li1;~Emily_Fox2;~Noah_Goodman1", "gender": "F;;", "homepage": "https://emilybfox.su.domains/;https://cocolab.stanford.edu/;https://michaelyli.github.io/", "dblp": "68/1212;96/1216;40/2032", "google_scholar": "OO-2710AAAAJ;OUpIbcQAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Emily_Fox2;~Noah_Goodman1;~Michael_Yifan_Li1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nli2024automated,\ntitle={Automated Statistical Model Discovery with Language Models},\nauthor={Michael Y. 
Li and Emily Fox and Noah Goodman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B5906M4Wnd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 739204, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18391238206759210513&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "stanford.edu;stanford.edu;stanford.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Random features models: a way to study the success of naive imputation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34736", "id": "B5g6y7JlMw", "proceeding": "https://proceedings.mlr.press/v235/ayme24a.html", "pdf": "https://openreview.net/pdf?id=B5g6y7JlMw", "openreview": "https://openreview.net/forum?id=B5g6y7JlMw", "author_site": "Alexis Ayme, Claire Boyer, Aymeric Dieuleveut, Erwan Scornet", "tldr": "", "abstract": "Constant (naive) imputation is still widely used in practice as it is an easy-to-use first technique to deal with missing data. Yet, this simple method could be expected to induce a large bias for prediction purposes, as the imputed input may strongly differ from the true underlying data. However, recent works suggest that this bias is low in the context of high-dimensional linear predictors when data is assumed to be missing completely at random (MCAR). This paper completes the picture for linear predictors by confirming the intuition that the bias is negligible and that, surprisingly, naive imputation also remains relevant in very low dimensions. To this end, we consider a unique underlying random features model, which offers a rigorous framework for studying predictive performances, whilst the dimension of the observed features varies. Building on these theoretical results, we establish finite-sample bounds on stochastic gradient descent (SGD) predictors applied to zero-imputed data, a strategy particularly well suited for large-scale learning.
Although the MCAR assumption may appear strong, we show that similar favorable behaviors occur in more complex missing data scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexis Ayme;Claire Boyer;Aymeric Dieuleveut;Erwan Scornet", "authorids": "~Alexis_Ayme1;~Claire_Boyer1;~Aymeric_Dieuleveut1;~Erwan_Scornet1", "gender": "M;;M;M", "homepage": "https://alexisayme.github.io;https://www.imo.universite-paris-saclay.fr/~claire.boyer/;http://www.cmap.polytechnique.fr/~aymeric.dieuleveut/;https://erwanscornet.github.io/", "dblp": ";;176/5034;176/1062", "google_scholar": ";;ge-OinUAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Alexis_Ayme1;~Claire_Boyer1;~Aymeric_Dieuleveut1;~Erwan_Scornet1", "aff": "LPSM;Sorbonne Universit\u00e9 ;\u00c9cole Polytechnique;Sorbonne Universit\u00e9 - Facult\u00e9 des Sciences (Paris VI)", "aff_domain": "sorbonne-universite.fr;sorbonne-universite.fr;polytechnique.edu;sorbonne-universite.fr", "position": "PhD student;Associate Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nayme2024random,\ntitle={Random features models: a way to study the success of naive imputation},\nauthor={Alexis Ayme and Claire Boyer and Aymeric Dieuleveut and Erwan Scornet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B5g6y7JlMw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 460787, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4859125721980750805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "sorbonne-universite.fr;sorbonne-universite.fr;polytechnique.edu;sorbonne-universite.fr", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Laboratoire de Physique des Solides et des Mat\u00e9riaux;Sorbonne Universit\u00e9;Ecole Polytechnique", "aff_unique_dep": "Physics;;", "aff_unique_url": "https://www.lpsm.paris-saclay.fr;https://www.sorbonne-universite.fr;https://www.polytechnique.edu", "aff_unique_abbr": "LPSM;Sorbonne U;X", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris VI", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Uncertainty for Active Learning on Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34735", "id": "BCEtumPYDt", "proceeding": "https://proceedings.mlr.press/v235/fuchsgruber24a.html", "pdf": "https://openreview.net/pdf?id=BCEtumPYDt", "openreview": "https://openreview.net/forum?id=BCEtumPYDt", "author_site": "Dominik Fuchsgruber, Tom Wollschl\u00e4ger, Bertrand Charpentier, Antonio Oroz, Stephan G\u00fcnnemann", "tldr": "", "abstract": "Uncertainty Sampling is an Active Learning strategy that aims to improve the data efficiency of machine learning models by iteratively acquiring labels of data points with the highest uncertainty. While it has proven effective for independent data, its applicability to graphs remains under-explored. We propose the first extensive study of Uncertainty Sampling for node classification: **(1)** We benchmark Uncertainty Sampling beyond predictive uncertainty and highlight a significant performance gap to other Active Learning strategies. **(2)** We develop ground-truth Bayesian uncertainty estimates in terms of the data generating process and prove their effectiveness in guiding Uncertainty Sampling toward optimal queries.
We confirm our results on synthetic data and design an approximate approach that consistently outperforms other uncertainty estimators on real datasets. **(3)** Based on this analysis, we relate pitfalls in modeling uncertainty to existing methods. Our analysis enables and informs the development of principled uncertainty estimation on graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dominik Fuchsgruber;Tom Wollschl\u00e4ger;Bertrand Charpentier;Antonio Oroz;Stephan G\u00fcnnemann", "authorids": "~Dominik_Fuchsgruber1;~Tom_Wollschl\u00e4ger1;~Bertrand_Charpentier2;~Antonio_Oroz1;~Stephan_G\u00fcnnemann1", "gender": "M;M;;M;M", "homepage": "https://github.com/dfuchsgruber;https://www.linkedin.com/in/wollschlaeger/;https://sharpenb.github.io/;https://www.linkedin.com/in/antoniooroz/;http://www.daml.in.tum.de", "dblp": "377/3314.html;332/0829;222/1875;;43/3011", "google_scholar": "https://scholar.google.de/citations?user=K-egQS0AAAAJ;https://scholar.google.com/citations?hl=en;0rqI-ycAAAAJ;;", "orcid": ";;;;", "linkedin": ";wollschlaeger/;bertrand-charpentier-76995ab6/;;", "or_profile": "~Dominik_Fuchsgruber1;~Tom_Wollschl\u00e4ger1;~Bertrand_Charpentier2;~Antonio_Oroz1;~Stephan_G\u00fcnnemann1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Valence Labs powered by recursion;Technical University Munich;Department of Informatics, Technische Universit\u00e4t M\u00fcnchen;Technical University Munich", "aff_domain": "tum.de;valencelabs.com;tum.de;in.tum.de;tum.de", "position": "PhD student;Researcher;PhD student;MS student;Professor", "bibtex": "@inproceedings{\nfuchsgruber2024uncertainty,\ntitle={Uncertainty for Active Learning on Graphs},\nauthor={Dominik Fuchsgruber and Tom Wollschl{\\\"a}ger and Bertrand Charpentier and Antonio Oroz and Stephan G{\\\"u}nnemann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BCEtumPYDt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 952557, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16177362924329470064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tum.de;valencelabs.com;tum.de;in.tum.de;tum.de", "author_num": 5, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Valence Labs;Technical University of Munich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;;https://www.tum.de", "aff_unique_abbr": "TUM;;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany;" }, { "title": "Position: The Platonic Representation Hypothesis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34734", "id": "BH8TYy0r6u", "proceeding": "https://proceedings.mlr.press/v235/huh24a.html", "pdf": "https://openreview.net/pdf?id=BH8TYy0r6u", "openreview": "https://openreview.net/forum?id=BH8TYy0r6u", "author_site": "Minyoung Huh, Brian Cheung, Tongzhou Wang, Phillip Isola", "tldr": "", "abstract": "We argue that representations in AI models, particularly deep networks, are converging. First, we survey many examples of convergence in the literature: over time and across multiple domains, the ways by which different neural networks represent data are becoming more aligned. 
Next, we demonstrate convergence across data modalities: as vision models and language models get larger, they measure distance between datapoints in increasingly similar ways. We hypothesize that this convergence is driving toward a shared statistical model of reality, akin to Plato's concept of an ideal reality. We term such a representation the platonic representation and discuss several possible selective pressures toward it. Finally, we discuss the implications of these trends, their limitations, and counterexamples to our analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minyoung Huh;Brian Cheung;Tongzhou Wang;Phillip Isola", "authorids": "~Minyoung_Huh1;~Brian_Cheung1;~Tongzhou_Wang1;~Phillip_Isola1", "gender": "M;M;M;M", "homepage": "https://people.csail.mit.edu/minhuh/;https://briancheung.github.io/;https://www.tongzhouwang.info/;http://web.mit.edu/phillipi/", "dblp": "220/3360;;201/8645;36/9988", "google_scholar": "2k18_1IAAAAJ;7N-ethYAAAAJ;14HASnUAAAAJ;ROILf3EAAAAJ", "orcid": ";;;0000-0002-1411-6704", "linkedin": ";;;phillip-isola-a9955b20/", "or_profile": "~Minyoung_Huh1;~Brian_Cheung1;~Tongzhou_Wang1;~Phillip_Isola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;Research Fellow;PhD student;Associate Professor", "bibtex": "@inproceedings{\nhuh2024position,\ntitle={Position: The Platonic Representation Hypothesis},\nauthor={Minyoung Huh and Brian Cheung and Tongzhou Wang and Phillip Isola},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BH8TYy0r6u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2010385, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11295819168983804176&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 5, "email": "mit.edu;mit.edu;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Relational Deep Learning - Graph Representation Learning on Relational Databases", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34733", "id": "BIMSHniyCP", "proceeding": "https://proceedings.mlr.press/v235/fey24a.html", "pdf": "https://openreview.net/pdf?id=BIMSHniyCP", "openreview": "https://openreview.net/forum?id=BIMSHniyCP", "author_site": "Matthias Fey, Weihua Hu, Kexin Huang, Jan Eric Lenssen, Rishabh Ranjan, Joshua Robinson, ZHITAO YING, Jiaxuan You, Jure Leskovec", "tldr": "", "abstract": "Much of the world's most valued data is stored in relational databases and data warehouses, where the data is organized into tables connected by primary-foreign key relations. However, building machine learning models using this data is both challenging and time-consuming because no ML algorithm can directly learn from multiple connected tables. Current approaches can only learn from a single table, so data must first be manually joined and aggregated into this format, the laborious process known as feature engineering.
Feature engineering is slow, error-prone, and leads to suboptimal models. Here we introduce Relational Deep Learning (RDL), a blueprint for end-to-end learning on relational databases. The key is to represent relational databases as temporal, heterogeneous graphs, with a node for each row in each table, and edges specified by primary-foreign key links. Graph Neural Networks then learn representations that leverage all input data, without any manual feature engineering. We also introduce RelBench, a benchmark and testing suite, demonstrating strong initial results. Overall, we define a new research area that generalizes graph machine learning and broadens its applicability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthias Fey;Weihua Hu;Kexin Huang;Jan Eric Lenssen;Rishabh Ranjan;Joshua Robinson;Rex Ying;Jiaxuan You;Jure Leskovec", "authorids": "~Matthias_Fey2;~Weihua_Hu1;~Kexin_Huang1;~Jan_Eric_Lenssen1;~Rishabh_Ranjan1;~Joshua_Robinson4;~Zhitao_Ying1;~Jiaxuan_You2;~Jure_Leskovec1", "gender": "M;M;M;M;M;M;;M;M", "homepage": "http://rusty1s.github.io;http://web.stanford.edu/~weihuahu/;https://www.kexinhuang.com/;https://janericlenssen.github.io/;https://rishabh-ranjan.github.io;https://www.cs.yale.edu/homes/ying-rex;http://cs.stanford.edu/~jure/;https://cs.stanford.edu/~jiaxuan/;https://joshrobinson.mit.edu/", "dblp": "180/9174;42/1232;;195/9868;;209/4936;l/JureLeskovec;192/4727;15/4759", "google_scholar": "https://scholar.google.de/citations?user=5HaSBN0AAAAJ;wAFMjfkAAAAJ;ogEXTOgAAAAJ;https://scholar.google.de/citations?user=enXCzCgAAAAJ;NNzQUrcAAAAJ;6fqNXooAAAAJ;Q_kKkIUAAAAJ;NDbMl7oAAAAJ;E02doCkAAAAJ", "orcid": ";;;0000-0003-4093-9840;;;0000-0002-5411-923X;;", "linkedin": ";weihua-hu-a8284228/;;jan-eric-lenssen-08700b190/;;rex-ying-92770148/;leskovec/;jiaxuan-you-5859b37b/;", "or_profile": "~Matthias_Fey2;~Weihua_Hu1;~Kexin_Huang1;~Jan_Eric_Lenssen1;~Rishabh_Ranjan1;~Zhitao_Ying1;~Jure_Leskovec1;~Jiaxuan_You1;~Joshua_David_Robinson1", "aff": "TU Dortmund University;;Stanford University;Kumo;Stanford University;Yale University;Kumo.AI;NVIDIA;Stanford University", "aff_domain": "udo.edu;;stanford.edu;kumo.ai;stanford.edu;yale.edu;kumo.ai;nvidia.com;stanford.edu", "position": "PhD student;;PhD student;Researcher;PhD student;Assistant Professor;Chief Scientist;Researcher;Postdoc", "bibtex": "@inproceedings{\nfey2024position,\ntitle={Position: Relational Deep Learning - Graph Representation Learning on Relational Databases},\nauthor={Matthias Fey and Weihua Hu and Kexin Huang and Jan Eric Lenssen and Rishabh Ranjan and Joshua Robinson and Rex Ying and Jiaxuan You and Jure Leskovec},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BIMSHniyCP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1028688, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9709903650035607853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "udo.edu;;stanford.edu;kumo.ai;stanford.edu;yale.edu;kumo.ai;nvidia.com;stanford.edu", "author_num": 9, "aff_unique_index": "0;1;2;1;3;4;5;1", "aff_unique_norm": "Technische Universit\u00e4t Dortmund;Stanford University;Kumo;Yale University;Kumo.AI;NVIDIA", "aff_unique_dep": ";;;;;NVIDIA Corporation", "aff_unique_url": "https://www.tu-dortmund.de;https://www.stanford.edu;;https://www.yale.edu;https://www.kumo.ai;https://www.nvidia.com",
"aff_unique_abbr": "TU Dortmund;Stanford;;Yale;Kumo.AI;NVIDIA", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Dortmund;Stanford;", "aff_country_unique_index": "0;1;1;1;1;1;1", "aff_country_unique": "Germany;United States;" }, { "title": "Scaling Tractable Probabilistic Circuits: A Systems Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34732", "id": "BIbjwcrg0V", "proceeding": "https://proceedings.mlr.press/v235/liu24a.html", "pdf": "https://openreview.net/pdf?id=BIbjwcrg0V", "openreview": "https://openreview.net/forum?id=BIbjwcrg0V", "author_site": "Anji Liu, Kareem Ahmed, Guy Van den Broeck", "tldr": "", "abstract": "Probabilistic Circuits (PCs) are a general framework for tractable deep generative models, which support exact and efficient probabilistic inference on their learned distributions. Recent modeling and training advancements have enabled their application to complex real-world tasks. However, the time and memory inefficiency of existing PC implementations hinders further scaling up. This paper proposes PyJuice, a general GPU implementation design for PCs that improves prior art in several regards. Specifically, PyJuice is 1-2 orders of magnitude faster than existing systems (including very recent ones) at training large-scale PCs. Moreover, PyJuice consumes 2-5x less GPU memory, which enables us to train larger models. At the core of our system is a compilation process that converts a PC into a compact representation amenable to efficient block-based parallelization, which significantly reduces IO and makes it possible to leverage Tensor Cores available in modern GPUs. Empirically, PyJuice can be used to improve state-of-the-art PCs trained on image (e.g., ImageNet32) and language (e.g., WikiText, CommonGen) datasets. We further establish a new set of baselines on natural image and language datasets by benchmarking existing PC structures but with much larger sizes and more training epochs, with the hope of incentivizing future research. 
Code is available at https://github.com/Tractables/pyjuice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anji Liu;Kareem Ahmed;Guy Van den Broeck", "authorids": "~Anji_Liu1;~Kareem_Ahmed2;~Guy_Van_den_Broeck1", "gender": "M;M;M", "homepage": "https://liuanji.github.io/;http://kareemahmed.com;http://web.cs.ucla.edu/~guyvdb/", "dblp": "227/8622;188/6144;96/7521.html", "google_scholar": "k_4zYecAAAAJ;hkM0hbIAAAAJ;d0KQ9z0AAAAJ", "orcid": ";;0000-0003-3434-2503", "linkedin": "anji-liu-7610b7190/;kareem-yousrii/;guyvdb", "or_profile": "~Anji_Liu1;~Kareem_Ahmed2;~Guy_Van_den_Broek1", "aff": "University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "ucla.edu;cs.ucla.edu;ucla.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nliu2024scaling,\ntitle={Scaling Tractable Probabilistic Circuits: A Systems Perspective},\nauthor={Anji Liu and Kareem Ahmed and Guy Van den Broeck},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BIbjwcrg0V}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4737441, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13920462921743289912&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ucla.edu;cs.ucla.edu;ucla.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Multi-View Stochastic Block Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34731", "id": "BJx1K4lAAX", "proceeding": "https://proceedings.mlr.press/v235/cohen-addad24b.html", "pdf": "https://openreview.net/pdf?id=BJx1K4lAAX", "openreview": "https://openreview.net/forum?id=BJx1K4lAAX", "author_site": "Vincent Cohen-Addad, Tommaso d'Orsi, Silvio Lattanzi, Rajai Nasser", "tldr": "", "abstract": "Graph clustering is a central topic in unsupervised learning with a multitude of practical applications. In recent years, multi-view graph clustering has gained a lot of attention for its applicability to real-world instances where one often has access to multiple data sources. In this paper we formalize a new family of models, called *multi-view stochastic block models*, that captures this setting. For this model, we first study efficient algorithms that naively work on the union of multiple graphs. Then, we introduce a new efficient algorithm that provably outperforms previous approaches by analyzing the structure of each graph separately.
Finally, we complement our results with an information-theoretic lower bound studying the limits of what can be done in this model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Cohen-Addad;Tommaso d'Orsi;Silvio Lattanzi;Rajai Nasser", "authorids": "~Vincent_Cohen-Addad1;~Tommaso_d'Orsi1;~Silvio_Lattanzi1;~Rajai_Nasser1", "gender": ";;M;M", "homepage": ";https://tommasodorsi.github.io;https://sites.google.com/site/silviolattanzi/;", "dblp": "136/5814;275/8135;46/6611;09/10672", "google_scholar": ";;vxUZ4AUAAAAJ;https://scholar.google.ch/citations?user=uzoQLkwAAAAJ", "orcid": ";;;0000-0003-0057-1201", "linkedin": ";;;rajai-nasser-5bab19102/", "or_profile": "~Vincent_Cohen-Addad1;~Tommaso_d'Orsi1;~Silvio_Lattanzi1;~Rajai_Nasser1", "aff": "Google;Bocconi University;Google;Google", "aff_domain": "google.com;unibocconi.it;google.com;google.com", "position": "Researcher;Assistant Professor;Researcher;Software Engineer", "bibtex": "@inproceedings{\ncohen-addad2024multiview,\ntitle={Multi-View Stochastic Block Models},\nauthor={Vincent Cohen-Addad and Tommaso d'Orsi and Silvio Lattanzi and Rajai Nasser},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BJx1K4lAAX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 711275, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17279172267207455569&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "google.com;unibocconi.it;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;Bocconi University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.bocconi.edu", "aff_unique_abbr": "Google;Bocconi", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Italy" }, { "title": "In-Context Learning Agents Are Asymmetric Belief Updaters", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34730", "id": "BNAvYSCrLD", "proceeding": "https://proceedings.mlr.press/v235/schubert24a.html", "pdf": "https://openreview.net/pdf?id=BNAvYSCrLD", "openreview": "https://openreview.net/forum?id=BNAvYSCrLD", "author_site": "Johannes A. Schubert, Akshay Kumar Jagadish, Marcel Binz, Eric Schulz", "tldr": "", "abstract": "We study the in-context learning dynamics of large language models (LLMs) using three instrumental learning tasks adapted from cognitive psychology. We find that LLMs update their beliefs in an asymmetric manner and learn more from better-than-expected outcomes than from worse-than-expected ones. Furthermore, we show that this effect reverses when learning about counterfactual feedback and disappears when no agency is implied. We corroborate these findings by investigating idealized in-context learning agents derived through meta-reinforcement learning, where we observe similar patterns. Taken together, our results contribute to our understanding of how in-context learning works by highlighting that the framing of a problem significantly influences how learning occurs, a phenomenon also observed in human cognition.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johannes A. 
Schubert;Akshay Kumar Jagadish;Marcel Binz;Eric Schulz", "authorids": "~Johannes_A._Schubert1;~Akshay_Kumar_Jagadish1;~Marcel_Binz1;~Eric_Schulz1", "gender": "M;M;M;M", "homepage": "https://github.com/jschbrt/;http://akshaykjagadish.com/;;https://cpilab.org", "dblp": ";384/4213.html;212/5102;124/0016", "google_scholar": ";B42Mr-sAAAAJ;https://scholar.google.de/citations?user=Lvm9Q8QAAAAJ;", "orcid": ";0000-0002-7897-9752;;", "linkedin": ";akshaykjagadish/;;", "or_profile": "~Johannes_A._Schubert1;~Akshay_Kumar_Jagadish1;~Marcel_Binz1;~Eric_Schulz1", "aff": ";Max Planck Institute for Biological Cybernetics;Helmholtz Zentrum M\u00fcnchen;Max Planck Institute for Biological Cybernetics", "aff_domain": ";tue.mpg.de;helmholtz-munich.de;tuebingen.mpg.de", "position": ";PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nschubert2024incontext,\ntitle={In-Context Learning Agents Are Asymmetric Belief Updaters},\nauthor={Johannes A. Schubert and Akshay Kumar Jagadish and Marcel Binz and Eric Schulz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BNAvYSCrLD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1280400, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16440572603420100325&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 9, "email": ";tue.mpg.de;helmholtz-munich.de;tuebingen.mpg.de", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Max Planck Institute for Biological Cybernetics;Helmholtz Zentrum M\u00fcnchen", "aff_unique_dep": "Biological Cybernetics;", "aff_unique_url": "https://www.biocybernetics.mpg.de;https://www.helmholtz-muenchen.de", "aff_unique_abbr": "MPIBC;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Probabilistic Time Series Modeling with Decomposable Denoising Diffusion Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34729", "id": "BNH8spaR3l", "proceeding": "https://proceedings.mlr.press/v235/yan24b.html", "pdf": "https://openreview.net/pdf?id=BNH8spaR3l", "openreview": "https://openreview.net/forum?id=BNH8spaR3l", "author_site": "Tijin Yan, Hengheng Gong, Yongping He, Yufeng Zhan, Yuanqing Xia", "tldr": "", "abstract": "Probabilistic time series modeling based on generative models has attracted considerable attention because of its wide applications and excellent performance. However, existing state-of-the-art models, based on stochastic differential equations, not only struggle to determine the drift and diffusion coefficients during the design process but also have slow generation speed. To tackle this challenge, we first propose the decomposable denoising diffusion model ($\\text{D}^3\\text{M}$) and prove that it is a general framework unifying denoising diffusion models and continuous flow models. Based on the new framework, we propose some simple but efficient probability paths with high generation speed. Furthermore, we design a module that combines a special state space model with linear gated attention modules for sequence modeling. It preserves inductive bias and simultaneously models both local and global dependencies.
Experimental results on 8 real-world datasets show that $\\text{D}^3\\text{M}$ reduces RMSE and CRPS by up to 4.6% and 4.3% compared with state-of-the-art methods on imputation tasks, and achieves results comparable to the state of the art on forecasting tasks with only 10 steps.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tijin Yan;Hengheng Gong;He YongPing;Yufeng Zhan;Yuanqing Xia", "authorids": "~Tijin_Yan1;~Hengheng_Gong1;~He_YongPing1;~Yufeng_Zhan1;~Yuanqing_Xia1", "gender": "M;M;;M;M", "homepage": "https://yantijin.github.io/;https://github.com/EmiyaXL;https://github.com/db520xx;;", "dblp": "274/3156;267/1200;;173/1777.html;69/2205", "google_scholar": "Wh5cvy4AAAAJ;;;;HtedN3oAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Tijin_Yan1;~Hengheng_Gong1;~He_YongPing1;~Yufeng_Zhan1;~Yuanqing_Xia1", "aff": "Beijing Institute of Technology;Beijing Institute of Technology;Beijing Institute of Technology;Beijing Institute of Technology;Beijing Institute of Technology", "aff_domain": "bit.edu.cn;bit.edu.cn;bit.edu.cn;bit.edu.cn;bit.edu.cn", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nyan2024probabilistic,\ntitle={Probabilistic Time Series Modeling with Decomposable Denoising Diffusion Model},\nauthor={Tijin Yan and Hengheng Gong and He YongPing and Yufeng Zhan and Yuanqing Xia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BNH8spaR3l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 873976, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5759435725570622344&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "bit.edu.cn;bit.edu.cn;bit.edu.cn;bit.edu.cn;bit.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Beijing Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.bit.edu.cn/", "aff_unique_abbr": "BIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On Least Square Estimation in Softmax Gating Mixture of Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34728", "id": "BO0jookxk8", "proceeding": "https://proceedings.mlr.press/v235/nguyen24f.html", "pdf": "https://openreview.net/pdf?id=BO0jookxk8", "openreview": "https://openreview.net/forum?id=BO0jookxk8", "author_site": "Huy Nguyen, Nhat Ho, Alessandro Rinaldo", "tldr": "", "abstract": "The mixture of experts (MoE) model is a statistical machine learning design that aggregates multiple expert networks using a softmax gating function in order to form a more intricate and expressive model. Despite their common use in several applications owing to their scalability, the mathematical and statistical properties of MoE models are complex and difficult to analyze. As a result, previous theoretical works have primarily focused on probabilistic MoE models by imposing the impractical assumption that the data are generated from a Gaussian MoE model. In this work, we investigate the performance of the least squares estimators (LSE) under a deterministic MoE model where the data are sampled according to a regression model, a setting that has remained largely unexplored.
We establish a condition called strong identifiability to characterize the convergence behavior of various types of expert functions. We demonstrate that the rates for estimating strongly identifiable experts, namely the widely used feedforward networks with activation functions $\\mathrm{sigmoid}(\\cdot)$ and $\\tanh(\\cdot)$, are substantially faster than those of polynomial experts, which we show to exhibit a surprisingly slow estimation rate. Our findings have important practical implications for expert selection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huy Nguyen;Nhat Ho;Alessandro Rinaldo", "authorids": "~Huy_Nguyen5;~Nhat_Ho1;~Alessandro_Rinaldo1", "gender": "M;M;M", "homepage": "https://huynm99.github.io/;https://nhatptnk8912.github.io/;https://arinaldo.github.io", "dblp": "48/6075;203/4479;75/5558", "google_scholar": "_YYwzhQAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ;tBIzO-EAAAAJ", "orcid": ";;", "linkedin": "huy-nguyen-081199/;nhat-pham-minh-ho-267b8164/;", "or_profile": "~Huy_Nguyen5;~Nhat_Ho1;~Alessandro_Rinaldo1", "aff": "Microsoft AI;University of Texas, Austin;University of Texas at Austin", "aff_domain": "microsoft.com;utexas.edu;utexas.edu", "position": "Intern;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nnguyen2024on,\ntitle={On Least Square Estimation in Softmax Gating Mixture of Experts},\nauthor={Huy Nguyen and Nhat Ho and Alessandro Rinaldo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BO0jookxk8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 658837, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7673687368640543894&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "microsoft.com;utexas.edu;utexas.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Microsoft;University of Texas at Austin", "aff_unique_dep": "Microsoft AI;", "aff_unique_url": "https://www.microsoft.com;https://www.utexas.edu", "aff_unique_abbr": "Microsoft;UT Austin", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Space Group Symmetry Informed Network for O(3) Equivariant Crystal Tensor Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34727", "id": "BOFjRnJ9mX", "proceeding": "https://proceedings.mlr.press/v235/yan24d.html", "pdf": "https://openreview.net/pdf?id=BOFjRnJ9mX", "openreview": "https://openreview.net/forum?id=BOFjRnJ9mX", "author_site": "Keqiang Yan, Alexandra Saxton, Xiaofeng Qian, Xiaoning Qian, Shuiwang Ji", "tldr": "", "abstract": "We consider the prediction of general tensor properties of crystalline materials, including dielectric, piezoelectric, and elastic tensors. A key challenge here is how to make the predictions satisfy the unique tensor equivariance to both O(3) and crystal space groups. To this end, we propose a General Materials Tensor Network (GMTNet), which is carefully designed to satisfy the required symmetries. To evaluate our method, we curate a dataset and establish evaluation metrics that are tailored to the intricacies of crystal tensor predictions.
Experimental results show that our GMTNet not only achieves promising performance on crystal tensors of various orders but also generates predictions fully consistent with the intrinsic crystal symmetries. Our code is publicly available as part of the AIRS library (https://github.com/divelab/AIRS).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Keqiang Yan;Alexandra Saxton;Xiaofeng Qian;Xiaoning Qian;Shuiwang Ji", "authorids": "~Keqiang_Yan2;~Alexandra_Saxton1;~Xiaofeng_Qian1;~Xiaoning_Qian2;~Shuiwang_Ji1", "gender": "M;F;;M;M", "homepage": ";;https://sites.google.com/tamu.edu/qian-group;https://www.ece.tamu.edu/~xqian;http://people.tamu.edu/~sji", "dblp": "272/6760;;266/1654;62/4504;84/6405", "google_scholar": "cv52C8oAAAAJ;;bK7fFKoAAAAJ;dXGlddgAAAAJ;BZGj6sAAAAAJ", "orcid": ";;0000-0003-1627-288X;0000-0002-4347-2476;0000-0002-4205-4563", "linkedin": ";alexandra-saxton-70705a202;;;shuiwang-ji-9a040715/", "or_profile": "~Keqiang_Yan2;~Alexandra_Saxton1;~Xiaofeng_Qian1;~Xiaoning_Qian2;~Shuiwang_Ji1", "aff": "Texas A&M University;Texas A&M University - College Station;Texas A&M University;Texas A&M;Texas A&M University", "aff_domain": "tamu.edu;tamu.edu;tamu.edu;tamu.edu;tamu.edu", "position": "PhD student;Undergrad student;Associate Professor;Full Professor;Professor", "bibtex": "@inproceedings{\nyan2024a,\ntitle={A Space Group Symmetry Informed Network for O(3) Equivariant Crystal Tensor Prediction},\nauthor={Keqiang Yan and Alexandra Saxton and Xiaofeng Qian and Xiaoning Qian and Shuiwang Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BOFjRnJ9mX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9049615, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6151378590888623091&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tamu.edu;tamu.edu;tamu.edu;tamu.edu;tamu.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "ULTRAFEEDBACK: Boosting Language Models with Scaled AI Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34726", "id": "BOorDpKHiJ", "proceeding": "https://proceedings.mlr.press/v235/cui24f.html", "pdf": "https://openreview.net/pdf?id=BOorDpKHiJ", "openreview": "https://openreview.net/forum?id=BOorDpKHiJ", "author_site": "Ganqu Cui, Lifan Yuan, Ning Ding, Guanming Yao, Bingxiang He, Wei Zhu, Yuan Ni, Guotong Xie, Ruobing Xie, Yankai Lin, Zhiyuan Liu, Maosong Sun", "tldr": "", "abstract": "Learning from human feedback has become a pivotal technique in aligning large language models (LLMs) with human preferences. However, acquiring vast and premium human feedback is bottlenecked by time, labor, and human capability, resulting in current datasets of small size or limited topic coverage. This further hinders feedback learning as well as alignment research within the open-source community. To address this issue, we explore how to go beyond human feedback and collect high-quality AI feedback automatically as a scalable alternative.
Specifically, we identify scale and diversity as the key factors for feedback data to take effect. Accordingly, we first broaden instructions and responses in both amount and breadth to encompass a wider range of user-assistant interactions. Then, we meticulously apply a series of techniques to mitigate annotation biases for more reliable AI feedback. We finally present UltraFeedback, a large-scale, high-quality, and diversified AI feedback dataset, which contains over 1 million GPT-4 feedback annotations for 250k user-assistant conversations, covering various aspects. Built upon UltraFeedback, we align a LLaMA-based model by best-of-$n$ sampling and reinforcement learning, demonstrating its exceptional performance on chat benchmarks. Our work validates the effectiveness of scaled AI feedback data in constructing strong open-source chat language models, serving as a solid foundation for future feedback learning research.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ganqu Cui;Lifan Yuan;Ning Ding;Guanming Yao;Bingxiang He;Wei Zhu;Yuan Ni;Guotong Xie;Ruobing Xie;Yankai Lin;Zhiyuan Liu;Maosong Sun", "authorids": "~Ganqu_Cui1;~Lifan_Yuan1;~Ning_Ding5;~Guanming_Yao1;~Bingxiang_He1;~Wei_Zhu7;~Yuan_Ni1;~Guotong_Xie4;~Ruobing_Xie2;~Yankai_Lin1;~Zhiyuan_Liu1;~Maosong_Sun1", "gender": "M;;M;;M;M;F;;M;M;M;M", "homepage": "https://cgq15.github.io/;;https://www.stingning.cn/;;https://hbx-hbx.github.io/;https://www.researchgate.net/profile/Wei-Zhu-111;;;http://nlp.csai.tsinghua.edu.cn/~xrb/;https://linyankai.github.io/;http://nlp.csai.tsinghua.edu.cn/~lzy;https://www.cs.tsinghua.edu.cn/csen/info/1312/4394.htm", "dblp": "232/3064;;;;322/5932;83/4805-16.html;70/2170;;178/8590;161/0001.html;53/3245-1;95/3291-1", "google_scholar": "3IVSzZgAAAAJ;;uZXQuYAAAAAJ;;mb36VikAAAAJ;EF5J_BYAAAAJ;;;j3OX8KUAAAAJ;https://scholar.google.com.hk/citations?user=j8K1FqEAAAAJ;dT0v5u0AAAAJ;https://scholar.google.com.tw/citations?user=zIgT0HMAAAAJ", "orcid": ";;;;;0000-0002-6389-6866;;;0000-0003-3170-5647;0000-0002-9182-8158;0000-0002-7709-2543;", "linkedin": ";;;;;;;;;;;", "or_profile": "~Ganqu_Cui1;~Lifan_Yuan1;~Ning_Ding5;~Guanming_Yao1;~Bingxiang_He1;~Wei_Zhu7;~Yuan_Ni1;~Guotong_Xie4;~Ruobing_Xie2;~Yankai_Lin1;~Zhiyuan_Liu1;~Maosong_Sun1", "aff": "Tsinghua University;;Tsinghua University;;Tsinghua University;University of Hong Kong;Pingan Technology;;Tencent;Renmin University of China;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;mail.tsinghua.edu.cn;;tsinghua.edu.cn;hku.hk;pingan.com.cn;;tencent.com;ruc.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;;Postdoc;;Undergrad student;Researcher;Researcher;;Senior researcher;Assistant Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ncui2024ultrafeedback,\ntitle={{ULTRAFEEDBACK}: Boosting Language Models with Scaled {AI} Feedback},\nauthor={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BOorDpKHiJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 569674, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11241453556273657765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": 
"tsinghua.edu.cn;;mail.tsinghua.edu.cn;;tsinghua.edu.cn;hku.hk;pingan.com.cn;;tencent.com;ruc.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 12, "aff_unique_index": "0;0;0;1;2;3;4;0;0", "aff_unique_norm": "Tsinghua University;University of Hong Kong;PingAn Technology;Tencent;Renmin University of China", "aff_unique_dep": ";;;Tencent Holdings Limited;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hku.hk;https://www.pingan.com;https://www.tencent.com;http://www.ruc.edu.cn", "aff_unique_abbr": "THU;HKU;;Tencent;RUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "VQDNA: Unleashing the Power of Vector Quantization for Multi-Species Genomic Sequence Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34725", "id": "BOunbuapcv", "proceeding": "https://proceedings.mlr.press/v235/li24bm.html", "pdf": "https://openreview.net/pdf?id=BOunbuapcv", "openreview": "https://openreview.net/forum?id=BOunbuapcv", "author_site": "Siyuan Li, Zedong Wang, Zicheng Liu, Di Wu, Cheng Tan, Jiangbin Zheng, Yufei Huang, Stan Z Li", "tldr": "", "abstract": "Similar to natural language models, pre-trained genome language models are proposed to capture the underlying intricacies within genomes with unsupervised sequence modeling. They have become essential tools for researchers and practitioners in biology. However, the hand-crafted tokenization policies used in these models may not encode the most discriminative patterns from the limited vocabulary of genomic data. In this paper, we introduce VQDNA, a general-purpose framework that renovates genome tokenization from the perspective of genome vocabulary learning. By leveraging vector-quantized codebook as learnable vocabulary, VQDNA can adaptively tokenize genomes into pattern-aware embeddings in an end-to-end manner. To further push its limits, we propose Hierarchical Residual Quantization (HRQ), where varying scales of codebooks are designed in a hierarchy to enrich the genome vocabulary in a coarse-to-fine manner. Extensive experiments on 32 genome datasets demonstrate VQDNA's superiority and favorable parameter efficiency compared to existing genome language models. Notably, empirical analysis of SARS-CoV-2 mutations reveals the fine-grained pattern awareness and biological significance of learned HRQ vocabulary, highlighting its untapped potential for broader applications in genomics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siyuan Li;Zedong Wang;Zicheng Liu;Di Wu;Cheng Tan;Jiangbin Zheng;Yufei Huang;Stan Z. 
Li", "authorids": "~Siyuan_Li6;~Zedong_Wang1;~Zicheng_Liu2;~Di_Wu10;~Cheng_Tan1;~Jiangbin_Zheng3;~Yufei_Huang4;~Stan_Z._Li2", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://lupin1998.github.io/;https://jacky1128.github.io;;;https://chengtan9907.github.io/;;https://2021.igem.org/Team:ZJU-China;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "63/9705-2;179/8811.html;l/ZichengLiu-6;;70/1533-12.html;;68/1946-2;l/StanZLi", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;6kTV6aMAAAAJ;;qmTjdwIAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-6806-2468;0009-0000-0112-0491;;;;0000-0003-3305-0103;0009-0007-8184-4529;", "linkedin": "https://www.linkedin.cn/incareer/in/siyuan-li-lupin1998/;;;;;;;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Siyuan_Li6;~Zedong_Wang1;~Zicheng_Liu2;~Di_Wu10;~Cheng_Tan1;~Jiangbin_Zheng3;~Yufei_Huang4;~Stan_Z._Li1", "aff": "Alibaba Group;Westlake University;Zhejiang University;Westlake University;Zhejiang University & Westlake University;Westlake University;Zhejiang University;Westlake University", "aff_domain": "alibaba-inc.com;westlake.edu;zju.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;zju.edu.cn;westlake.edu.cn", "position": "Intern;Intern;PhD student;PhD student;PhD student;PhD student;PhD student;Chair Professor", "bibtex": "@inproceedings{\nli2024vqdna,\ntitle={{VQDNA}: Unleashing the Power of Vector Quantization for Multi-Species Genomic Sequence Modeling},\nauthor={Siyuan Li and Zedong Wang and Zicheng Liu and Di Wu and Cheng Tan and Jiangbin Zheng and Yufei Huang and Stan Z. Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BOunbuapcv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 834799, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3466150300870967702&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "alibaba-inc.com;westlake.edu;zju.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;zju.edu.cn;westlake.edu.cn", "author_num": 8, "aff_unique_index": "0;1;2;1;2;1;2;1", "aff_unique_norm": "Alibaba Group;Westlake University;Zhejiang University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.alibaba.com;https://www.westlake.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "Alibaba;WU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Online Speculative Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34724", "id": "BPQHXwVNvl", "proceeding": "https://proceedings.mlr.press/v235/liu24y.html", "pdf": "https://openreview.net/pdf?id=BPQHXwVNvl", "openreview": "https://openreview.net/forum?id=BPQHXwVNvl", "author_site": "Xiaoxuan Liu, Lanxiang Hu, Peter Bailis, Alvin Cheung, Zhijie Deng, Ion Stoica, Hao Zhang", "tldr": "", "abstract": "Speculative decoding is a pivotal technique to accelerate the inference of large language models (LLMs) by employing a smaller draft model to predict the target model's outputs. 
However, its efficacy can be limited due to the low predictive accuracy of the draft model, particularly when faced with diverse text inputs and a significant capability gap between the draft and target models. We introduce online speculative decoding to address this challenge. The main idea is to continuously update the (multiple) draft model(s) on observed user query data. Adapting to the query distribution mitigates the shift between the draft model's training distribution and the query distribution, enabling the draft model to more accurately predict the target model's outputs. We develop a prototype of online speculative decoding based on knowledge distillation and evaluate it using both synthetic and real query data. The results show a substantial increase of 0.1 to 0.65 in the token acceptance rate, bringing a 1.42x to 2.17x latency reduction. Our code is available at https://github.com/LiuXiaoxuanPKU/OSD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoxuan Liu;Lanxiang Hu;Peter Bailis;Alvin Cheung;Zhijie Deng;Ion Stoica;Hao Zhang", "authorids": "~Xiaoxuan_Liu2;~Lanxiang_Hu1;~Peter_Bailis2;~Alvin_Cheung2;~Zhijie_Deng1;~Ion_Stoica1;~Hao_Zhang2", "gender": ";M;M;;M;M;M", "homepage": ";https://snyhlxde1.github.io/;http://www.bailis.org/;;https://thudzj.github.io/;http://people.eecs.berkeley.edu/~istoica/;https://cseweb.ucsd.edu/~haozhang/", "dblp": ";;47/8816;;209/4959;s/IonStoica;55/2270-25", "google_scholar": ";KufYmg8AAAAJ;qG1LVpQAAAAJ;;J3dR0sUAAAAJ;vN-is70AAAAJ;H1d4BS8AAAAJ", "orcid": ";0000-0003-0641-3677;;;0000-0002-0932-1631;;", "linkedin": ";hu-lanxiang/;;;;ionstoica;", "or_profile": "~Xiaoxuan_Liu2;~Lanxiang_Hu1;~Peter_Bailis2;~Alvin_Cheung2;~Zhijie_Deng1;~Ion_Stoica1;~Hao_Zhang2", "aff": ";University of California, San Diego;Stanford University;;Shanghai Jiaotong University;University of California, Berkeley;Carnegie Mellon University", "aff_domain": ";ucsd.edu;stanford.edu;;sjtu.edu.cn;berkeley.edu;cmu.edu", "position": ";PhD student;Adjunct Professor;;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nliu2024online,\ntitle={Online Speculative Decoding},\nauthor={Xiaoxuan Liu and Lanxiang Hu and Peter Bailis and Alvin Cheung and Zhijie Deng and Ion Stoica and Hao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BPQHXwVNvl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2222194, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4611335299573429966&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";ucsd.edu;stanford.edu;;sjtu.edu.cn;berkeley.edu;cmu.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of California, San Diego;Stanford University;Shanghai Jiao Tong University;University of California, Berkeley;Carnegie Mellon University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ucsd.edu;https://www.stanford.edu;https://www.sjtu.edu.cn;https://www.berkeley.edu;https://www.cmu.edu", "aff_unique_abbr": "UCSD;Stanford;SJTU;UC Berkeley;CMU", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "San Diego;Stanford;;Berkeley", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Mitigating Label Noise on Graphs via Topological Sample Selection", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34723", "id": "BRIcZiK5Fr", "proceeding": "https://proceedings.mlr.press/v235/wu24ae.html", "pdf": "https://openreview.net/pdf?id=BRIcZiK5Fr", "openreview": "https://openreview.net/forum?id=BRIcZiK5Fr", "author_site": "Yuhao Wu, Jiangchao Yao, Xiaobo Xia, Jun Yu, Ruxin Wang, Bo Han, Tongliang Liu", "tldr": "", "abstract": "Despite the success of the carefully-annotated benchmarks, the effectiveness of existing graph neural networks (GNNs) can be considerably impaired in practice when the real-world graph data is noisily labeled. Previous explorations in sample selection have been demonstrated as an effective way for robust learning with noisy labels, however, the conventional studies focus on i.i.d data, and when moving to non-iid graph data and GNNs, two notable challenges remain: (1) nodes located near topological class boundaries are very informative for classification but cannot be successfully distinguished by the heuristic sample selection. (2) there is no available measure that considers the graph topological information to promote sample selection in a graph. To address this dilemma, we propose a $\\textit{Topological Sample Selection}$ (TSS) method that boosts the informative sample selection process in a graph by utilising topological information. We theoretically prove that our procedure minimizes an upper bound of the expected risk under target clean distribution, and experimentally show the superiority of our method compared with state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhao Wu;Jiangchao Yao;Xiaobo Xia;Jun Yu;Ruxin Wang;Bo Han;Tongliang Liu", "authorids": "~Yuhao_Wu2;~Jiangchao_Yao1;~Xiaobo_Xia1;~Jun_Yu3;~Ruxin_Wang2;~Bo_Han1;~Tongliang_Liu1", "gender": "M;M;M;M;M;M;M", "homepage": "https://white1818.github.io/;https://sunarker.github.io/;https://xiaoboxia.github.io/;https://faculty.ustc.edu.cn/yujun_AI/en/index.htm;;https://tongliang-liu.github.io/;https://bhanml.github.io/", "dblp": ";166/5900;242/8072;50/5754-1.html;149/7989;150/6667;241/0472-3", "google_scholar": ";w8oDh9QAAAAJ;jRsugY0AAAAJ;efZyqyQAAAAJ;https://scholar.google.com.au/citations?user=n9qX0bUAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ", "orcid": ";;;0000-0002-3197-8103;;;", "linkedin": ";;;;;;", "or_profile": "~Yuhao_Wu2;~Jiangchao_Yao1;~Xiaobo_Xia1;~Jun_Yu3;~Ruxin_Wang2;~Tongliang_Liu1;~bo_han2", "aff": "University of Sydney;Shanghai Artificial Intelligence Laboratory;The University of Sydney;University of Science and Technology of China;Alibaba Group;Mohamed bin Zayed University of Artificial Intelligence;MBZUAI", "aff_domain": "usyd.edu.au;pjlab.org.cn;sydney.edu.au;ustc.edu.cn;alibaba-inc.com;mbzuai.ac.ae;mbzuai.ac.ae", "position": "PhD student;Researcher;PhD student;Associate Professor;Researcher;Affiliated Associate Professor;Researcher", "bibtex": "@inproceedings{\nwu2024mitigating,\ntitle={Mitigating Label Noise on Graphs via Topological Sample Selection},\nauthor={Yuhao Wu and Jiangchao Yao and Xiaobo Xia and Jun Yu and Ruxin Wang and Bo Han and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BRIcZiK5Fr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3091623, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6299613841677671997&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 7, "email": "usyd.edu.au;pjlab.org.cn;sydney.edu.au;ustc.edu.cn;alibaba-inc.com;mbzuai.ac.ae;mbzuai.ac.ae", "author_num": 7, "aff_unique_index": "0;1;0;2;3;4;4", "aff_unique_norm": "University of Sydney;Shanghai Artificial Intelligence Laboratory;University of Science and Technology of China;Alibaba Group;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.sydney.edu.au;http://www.shailab.org/;http://www.ustc.edu.cn;https://www.alibaba.com;https://mbzuai.ac.ae", "aff_unique_abbr": "USYD;Shanghai AI Lab;USTC;Alibaba;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;2;2", "aff_country_unique": "Australia;China;United Arab Emirates" }, { "title": "WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34722", "id": "BRfqYrikdo", "proceeding": "https://proceedings.mlr.press/v235/drouin24a.html", "pdf": "https://openreview.net/pdf?id=BRfqYrikdo", "openreview": "https://openreview.net/forum?id=BRfqYrikdo", "author_site": "Alexandre Drouin, Maxime Gasse, Massimo Caccia, Issam Laradji, Manuel Del Verme, Tom Marty, David Vazquez, Nicolas Chapados, Alexandre Lacoste", "tldr": "", "abstract": "We study the use of large language model-based agents for interacting with software via web browsers. Unlike prior work, we focus on measuring the agents' ability to perform tasks that span the typical daily work of knowledge workers utilizing enterprise software systems. To this end, we propose WorkArena, a remote-hosted benchmark of 33 tasks based on the widely-used ServiceNow platform. We also introduce BrowserGym, an environment for the design and evaluation of such agents, offering a rich set of actions as well as multimodal observations. Our empirical evaluation reveals that while current agents show promise on WorkArena, there remains a considerable gap towards achieving full task automation. Notably, our analysis uncovers a significant performance disparity between open and closed-source LLMs, highlighting a critical area for future exploration and development in the field.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexandre Drouin;Maxime Gasse;Massimo Caccia;Issam H. 
Laradji;Manuel Del Verme;Tom Marty;David Vazquez;Nicolas Chapados;Alexandre Lacoste", "authorids": "~Alexandre_Drouin2;~Maxime_Gasse2;~Massimo_Caccia1;~Issam_H._Laradji1;~Manuel_Del_Verme1;~Tom_Marty1;~David_Vazquez1;~Nicolas_Chapados1;~Alexandre_Lacoste1", "gender": "M;M;;M;;M;M;M;M", "homepage": "https://alexdrouin.com;http://www.maximegasse.com/;;https://issamlaradji.github.io/;;https://3rdcore.github.io/;http://www.david-vazquez.com;;", "dblp": "117/3861;118/4730;43/6338.html;142/0043;;;94/8653;58/1013;59/6239.html", "google_scholar": "https://scholar.google.ca/citations?user=LR6aJcEAAAAJ;https://scholar.google.fr/citations?user=s7m9rikAAAAJ;WaE4GicAAAAJ;https://scholar.google.ca/citations?user=8vRS7F0AAAAJ;JcOwyS0AAAAJ;-YXor_wAAAAJ;1jHvtfsAAAAJ;QdnjDj8AAAAJ;", "orcid": "0000-0001-7718-0319;0000-0001-6982-062X;;;;0009-0001-3468-3327;0000-0002-2845-8158;;", "linkedin": "drouinalexandre/;maxime-gasse-100a4a62/;;issam-laradji-67ba1a99/;;tom-marty/;https://www.linkedin.com/company/david-vazquez/;;", "or_profile": "~Alexandre_Drouin2;~Maxime_Gasse2;~Massimo_Caccia1;~Issam_H._Laradji1;~Manuel_Del_Verme1;~Tom_Marty1;~David_Vazquez1;~Nicolas_Chapados1;~Alexandre_Lacoste1", "aff": "ServiceNow Research ;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;ServiceNow Inc;ServiceNow;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;ServiceNow research;ServiceNow Research;ServiceNow", "aff_domain": "servicenow.com;mila.umontreal.ca;servicenow.com;servicenow.com;mila.umontreal.ca;mila.umontreal.ca;servicenow.com;servicenow.com;servicenow.com", "position": "Research Scientist;Researcher;Researcher;Researcher;PhD student;PhD student;Researcher;VP Research;Research Scientist", "bibtex": "@inproceedings{\ndrouin2024workarena,\ntitle={WorkArena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?},\nauthor={Alexandre Drouin and Maxime Gasse and Massimo Caccia and Issam H. 
Laradji and Manuel Del Verme and Tom Marty and David Vazquez and Nicolas Chapados and Alexandre Lacoste},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BRfqYrikdo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6971084, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9086055784610806930&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "servicenow.com;mila.umontreal.ca;servicenow.com;servicenow.com;mila.umontreal.ca;mila.umontreal.ca;servicenow.com;servicenow.com;servicenow.com", "author_num": 9, "aff_unique_index": "0;1;0;0;1;1;0;0;0", "aff_unique_norm": "ServiceNow;University of Montreal", "aff_unique_dep": "Research;Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.servicenow.com;https://www.mila.quebec", "aff_unique_abbr": "ServiceNow;MILA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;0;0;1;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "title": "Promises and Pitfalls of Generative Masked Language Modeling: Theoretical Framework and Practical Guidelines", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34721", "id": "BTkaKA74mS", "proceeding": "https://proceedings.mlr.press/v235/li24af.html", "pdf": "https://openreview.net/pdf?id=BTkaKA74mS", "openreview": "https://openreview.net/forum?id=BTkaKA74mS", "author_site": "Yuchen Li, Alexandre Kirchmeyer, Aashay Mehta, Yilong Qin, Boris Dadachev, Kishore Papineni, Sanjiv Kumar, Andrej Risteski", "tldr": "", "abstract": "Autoregressive language models are the currently dominant paradigm for text generation; however, they have some fundamental limitations that cannot be remedied by scale---for example, inherently sequential and unidirectional generation. While alternate classes of models have been explored, we have limited mathematical understanding of their fundamental power and limitations. In this paper we focus on Generative Masked Language Models (GMLMs), a non-autoregressive paradigm in which we train a model to fit conditional probabilities of the data distribution via masking, which are subsequently used as inputs to a Markov Chain to draw samples from the model. These models empirically strike a promising speed-quality trade-off as each step can typically be parallelized by decoding the entire sequence in parallel. We develop a mathematical framework for analyzing and improving such models, which sheds light on questions of sample complexity and inference speed and quality. Empirically, we adapt the T5 model for iteratively-refined parallel decoding, achieving 2-3x speedup in machine translation with minimal sacrifice in quality compared with autoregressive models. We run careful ablation experiments to give recommendations on key design choices, and make fine-grained observations on the common error modes in connection with our theory. Our mathematical analyses and empirical observations characterize both the potential and the limitations of this approach, and can be applied to future work on improving the understanding and performance of GMLMs. 
We release code for our experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Li;Alexandre Kirchmeyer;Aashay Mehta;Yilong Qin;Boris Dadachev;Kishore Papineni;Sanjiv Kumar;Andrej Risteski", "authorids": "~Yuchen_Li5;~Alexandre_Kirchmeyer1;~Aashay_Mehta1;~Yilong_Qin1;~Boris_Dadachev1;papineni@google.com;~Sanjiv_Kumar1;~Andrej_Risteski2", "gender": ";M;;M;M;;;M", "homepage": "https://yuchenli01.github.io/;;;https://www.yilongq.in/about;;;http://www.sanjivk.com/;", "dblp": ";;;301/7967;119/7889;;;63/11143", "google_scholar": "https://scholar.google.com/citations?hl=en;;SvvzMVUAAAAJ;CFeyF0EAAAAJ;CF1OyysAAAAJ;;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;;;", "linkedin": "yuchenli01/;alexandre-kirchmeyer/;;yilongqin/;;;;", "or_profile": "~Yuchen_Li5;~Alexandre_Kirchmeyer1;~Aashay_Mehta1;~Yilong_Qin1;~Boris_Dadachev1;papineni@google.com;~Sanjiv_Kumar1;~Andrej_Risteski2", "aff": "Microsoft;;;OpenAI;Google;;Google;Carnegie Mellon University", "aff_domain": "microsoft.com;;;openai.com;google.com;;google.com;cmu.edu", "position": "Intern;;;Researcher;Industry;;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nli2024promises,\ntitle={Promises and Pitfalls of Generative Masked Language Modeling: Theoretical Framework and Practical Guidelines},\nauthor={Yuchen Li and Alexandre Kirchmeyer and Aashay Mehta and Yilong Qin and Boris Dadachev and Kishore Papineni and Sanjiv Kumar and Andrej Risteski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BTkaKA74mS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 755869, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1957960178873824305&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "microsoft.com;;;openai.com;google.com;;google.com;cmu.edu", "author_num": 8, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "Microsoft;OpenAI;Google;Carnegie Mellon University", "aff_unique_dep": "Microsoft Corporation;;Google;", "aff_unique_url": "https://www.microsoft.com;https://openai.com;https://www.google.com;https://www.cmu.edu", "aff_unique_abbr": "Microsoft;OpenAI;Google;CMU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Two-sided Competing Matching Recommendation Markets With Quota and Complementary Preferences Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34720", "id": "BajM6YzKvm", "proceeding": "https://proceedings.mlr.press/v235/li24n.html", "pdf": "https://openreview.net/pdf?id=BajM6YzKvm", "openreview": "https://openreview.net/forum?id=BajM6YzKvm", "author_site": "Yuantong Li, Guang Cheng, Xiaowu Dai", "tldr": "", "abstract": "In this paper, we propose a new recommendation algorithm for addressing the problem of two-sided online matching markets with complementary preferences and quota constraints, where agents' preferences are unknown a priori and must be learned from data. The presence of mixed quota and complementary preferences constraints can lead to instability in the matching process, making this problem challenging to solve. To overcome this challenge, we formulate the problem as a bandit learning framework and propose the Multi-agent Multi-type Thompson Sampling (MMTS) algorithm. 
The algorithm combines the strengths of Thompson Sampling for exploration with a new double matching technique to provide a stable matching outcome. Our theoretical analysis demonstrates the effectiveness of MMTS as it can achieve stability and has a total $\\widetilde{\\mathcal{O}}(Q{\\sqrt{K_{\\max}T}})$-Bayesian regret with high probability, which exhibits linearity with respect to the firms' total quota $Q$, the square root of the maximum number of available workers of any type $\\sqrt{K_{\\max}}$, and the time horizon $T$. In addition, simulation studies also demonstrate MMTS' effectiveness in various settings. We provide the code used in our experiments at https://github.com/Likelyt/Double-Matching.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuantong Li;Guang Cheng;Xiaowu Dai", "authorids": "~Yuantong_Li1;~Guang_Cheng1;~Xiaowu_Dai1", "gender": "M;M;M", "homepage": "http://www.stat.ucla.edu/~guangcheng/;https://www.xiaowudai.org/;https://liyuantong93.com/home/", "dblp": "99/4812;232/3931;254/1270", "google_scholar": ";;wT8kLn4AAAAJ", "orcid": ";;0000-0001-7420-2332", "linkedin": ";;", "or_profile": "~Guang_Cheng1;~Xiaowu_Dai1;~Yuantong_Li_Li1", "aff": "University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "ucla.edu;ucla.edu;ucla.edu", "position": "Full Professor;Assistant Professor;PHD", "bibtex": "@inproceedings{\nli2024twosided,\ntitle={Two-sided Competing Matching Recommendation Markets With Quota and Complementary Preferences Constraints},\nauthor={Yuantong Li and Guang Cheng and Xiaowu Dai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BajM6YzKvm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1436324, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14282031041305482236&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ucla.edu;ucla.edu;ucla.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Causal Inference out of Control: Estimating Performativity without Treatment Randomization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34719", "id": "Bb8pOvWIe4", "proceeding": "https://proceedings.mlr.press/v235/cheng24d.html", "pdf": "https://openreview.net/pdf?id=Bb8pOvWIe4", "openreview": "https://openreview.net/forum?id=Bb8pOvWIe4", "author_site": "Gary Cheng, Moritz Hardt, Celestine Mendler-D\u00fcnner", "tldr": "", "abstract": "Regulators and academics are increasingly interested in the causal effect that algorithmic actions of a digital platform have on user consumption. In pursuit of estimating this effect from observational data, we identify a set of assumptions that permit causal identifiability without assuming randomized platform actions. Our results are applicable to platforms that rely on machine-learning-powered predictions and leverage knowledge from historical data. 
The key novelty of our approach is to explicitly model the dynamics of consumption over time, exploiting the repeated interaction of digital platforms with their participants to prove our identifiability results. By viewing the platform as a controller acting on a dynamical system, we can show that exogenous variation in consumption and appropriately responsive algorithmic control actions are sufficient for identifying the causal effect of interest. We complement our claims with an analysis of ready-to-use finite sample estimators and empirical investigations. More broadly, our results deriving identifiability conditions tailored to digital platform settings illustrate a fruitful interplay of control theory and causal inference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gary Cheng;Moritz Hardt;Celestine Mendler-D\u00fcnner", "authorids": "~Gary_Cheng2;~Moritz_Hardt1;~Celestine_Mendler-D\u00fcnner1", "gender": "M;Not Specified;", "homepage": "http://garycheng.me;http://mrtz.org/;http://celestine.ai/", "dblp": ";26/4683;176/5511", "google_scholar": "qArWV_wAAAAJ;adnTgaAAAAAJ;UqtDdZUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Gary_Cheng2;~Moritz_Hardt1;~Celestine_Mendler-D\u00fcnner1", "aff": "Stanford University;Max-Planck-Institute for Intelligent Systems, Max-Planck Institute;Max Planck Institute for Intelligent Systems", "aff_domain": "stanford.edu;is.mpg.de;tuebingen.mpg.de", "position": "PhD student;Principal Researcher;Group Lead", "bibtex": "@inproceedings{\ncheng2024causal,\ntitle={Causal Inference out of Control: Estimating Performativity without Treatment Randomization},\nauthor={Gary Cheng and Moritz Hardt and Celestine Mendler-D{\\\"u}nner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Bb8pOvWIe4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 829231, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:K5BLWVaqLEQJ:scholar.google.com/&scioq=Causal+Inference+out+of+Control:+Estimating+Performativity+without+Treatment+Randomization&hl=en&as_sdt=0,5", "gs_version_total": 3, "email": "stanford.edu;is.mpg.de;tuebingen.mpg.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Stanford University;Max-Planck-Institute for Intelligent Systems;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";Intelligent Systems;Intelligent Systems", "aff_unique_url": "https://www.stanford.edu;https://www.mpi-is.mpg.de;https://www.mpi-is.mpg.de", "aff_unique_abbr": "Stanford;MPI-IS;MPI-IS", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Germany" }, { "title": "Position: Open-Endedness is Essential for Artificial Superhuman Intelligence", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34718", "id": "Bc4vZ2CX7E", "proceeding": "https://proceedings.mlr.press/v235/hughes24a.html", "pdf": "https://openreview.net/pdf?id=Bc4vZ2CX7E", "openreview": "https://openreview.net/forum?id=Bc4vZ2CX7E", "author_site": "Edward Hughes, Michael Dennis, Jack Parker-Holder, Feryal Behbahani, Aditi Mavalankar, Yuge Shi, Tom Schaul, Tim Rockt\u00e4schel", "tldr": "", "abstract": "In recent years there has been a tremendous surge in the general capabilities of AI systems, mainly fuelled by training foundation models on internet-scale 
data. Nevertheless, the creation of open-ended, ever self-improving AI remains elusive. **In this position paper, we argue that the ingredients are now in place to achieve *open-endedness* in AI systems with respect to a human observer. Furthermore, we claim that such open-endedness is an essential property of any artificial superhuman intelligence (ASI).** We begin by providing a concrete formal definition of open-endedness through the lens of novelty and learnability. We then illustrate a path towards ASI via open-ended systems built on top of foundation models, capable of making novel, human-relevant discoveries. We conclude by examining the safety implications of generally-capable open-ended AI. We expect that open-ended foundation models will prove to be an increasingly fertile and safety-critical area of research in the near future.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edward Hughes;Michael D Dennis;Jack Parker-Holder;Feryal Behbahani;Aditi Mavalankar;Yuge Shi;Tom Schaul;Tim Rockt\u00e4schel", "authorids": "~Edward_Hughes1;~Michael_D_Dennis1;~Jack_Parker-Holder1;~Feryal_Behbahani1;~Aditi_Mavalankar1;~Yuge_Shi2;~Tom_Schaul2;~Tim_Rockt\u00e4schel1", "gender": "M;M;M;F;F;M;M;F", "homepage": "http://www.edwardhughes.io;;https://jparkerholder.github.io/;https://feryal.github.io;https://aditimavalankar.github.io/;http://rockt.ai;http://schaul.site44.com/;https://yugeten.github.io/", "dblp": "217/2003;;237/9793.html;;168/1704;43/11537;50/254;227/4684", "google_scholar": "3tj5358AAAAJ;WXXu26AAAAAJ;;;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=mWBY8aIAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=t6B_Z7MAAAAJ", "orcid": ";;;;;;0000-0002-2961-8782;", "linkedin": ";;;;;rockt/;schaul/;", "or_profile": "~Edward_Hughes1;~Michael_D_Dennis1;~Jack_Parker-Holder1;~Feryal_Behbahani1;~Aditi_Mavalankar1;~Tim_Rocktaeschel1;~Tom_Schaul1;~Yuge_Shi1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;University of Oxford", "aff_domain": "deepmind.com;google.com;google.com;google.com;google.com;google.com;google.com;ox.ac.uk", "position": "Researcher;Researcher;Researcher;Research Scientist;Research Scientist;Senior Staff Research Scientist;Researcher;PhD student", "bibtex": "@inproceedings{\nhughes2024position,\ntitle={Position: Open-Endedness is Essential for Artificial Superhuman Intelligence},\nauthor={Edward Hughes and Michael D Dennis and Jack Parker-Holder and Feryal Behbahani and Aditi Mavalankar and Yuge Shi and Tom Schaul and Tim Rockt{\\\"a}schel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Bc4vZ2CX7E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 789374, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14650424116306189671&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "deepmind.com;google.com;google.com;google.com;google.com;google.com;google.com;ox.ac.uk", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0;1", "aff_unique_norm": "Google;University of Oxford", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.ox.ac.uk", "aff_unique_abbr": "DeepMind;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Private Truly-Everlasting Robust-Prediction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34717", "id": "BdQTCAuT6L", "proceeding": "https://proceedings.mlr.press/v235/stemmer24a.html", "pdf": "https://openreview.net/pdf?id=BdQTCAuT6L", "openreview": "https://openreview.net/forum?id=BdQTCAuT6L", "tldr": "", "abstract": "Private everlasting prediction (PEP), recently introduced by Naor et al. [2023], is a model for differentially private learning in which the learner never publicly releases a hypothesis. Instead, it provides black-box access to a \"prediction oracle\" that can predict the labels of an *endless stream* of unlabeled examples drawn from the underlying distribution. Importantly, PEP provides privacy both for the initial training set and for the endless stream of classification queries. We present two conceptual modifications to the definition of PEP, as well as new constructions exhibiting significant improvements over prior work. Specifically, we incorporate robustness against poisoning attacks into the definition of PEP; we present a relaxed privacy definition, suitable for PEP, that allows us to disconnect the privacy parameter $\\delta$ from the number of total time steps $T$; and we present new constructions for axis-aligned rectangles and decision-stumps exhibiting improved sample complexity and runtime.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Uri Stemmer", "authorids": "~Uri_Stemmer1", "gender": "", "homepage": "https://www.uri.co.il/", "dblp": "125/8532", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Uri_Stemmer1", "aff": "Tel Aviv University", "aff_domain": "tau.ac.il", "position": "Assistant Professor", "bibtex": "@inproceedings{\nstemmer2024private,\ntitle={Private Truly-Everlasting Robust-Prediction},\nauthor={Uri Stemmer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BdQTCAuT6L}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 488436, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1158703648596576565&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tau.ac.il", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "title": "Position: Technical Research and Talent is Needed for Effective AI Governance", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34716", "id": "Be2B6f0ps1", "proceeding": "https://proceedings.mlr.press/v235/reuel24a.html", "pdf": "https://openreview.net/pdf?id=Be2B6f0ps1", "openreview": "https://openreview.net/forum?id=Be2B6f0ps1", "author_site": "Anka Reuel, Lisa Soder, Benjamin Bucknall, Trond Undheim", "tldr": "", "abstract": "In light of recent advancements in AI capabilities and the increasingly widespread integration of AI systems into society, governments worldwide are actively seeking to mitigate the potential harms and risks associated with these technologies through regulation and other governance tools. 
However, there exist significant gaps between governance aspirations and the current state of the technical tooling necessary for their realisation. In this position paper, we survey policy documents published by public-sector institutions in the EU, US, and China to highlight specific areas of disconnect between the technical requirements necessary for enacting proposed policy actions, and the current technical state of the art. Our analysis motivates a call for tighter integration of the AI/ML research community within AI governance in order to i) catalyse technical research aimed at bridging the gap between current and supposed technical underpinnings of regulatory action, as well as ii) increase the level of technical expertise within governing institutions so as to inform and guide effective governance of AI.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anka Reuel;Lisa Soder;Benjamin Bucknall;Trond Arne Undheim", "authorids": "~Anka_Reuel1;~Lisa_Soder1;~Benjamin_Bucknall1;trondun@stanford.edu", "gender": ";F;M;", "homepage": ";https://www.linkedin.com/in/lisa-soder-642a38135/;;", "dblp": ";;;", "google_scholar": ";;R27P_noAAAAJ;", "orcid": ";;;", "linkedin": ";lisa-soder-642a38135/;ben-s-bucknall/;", "or_profile": "~Anka_Reuel1;~Lisa_Soder1;~Benjamin_Bucknall1;trondun@stanford.edu", "aff": ";Interface;University of Oxford;", "aff_domain": ";interface-eu.org;eng.ox.ac.uk;", "position": ";Researcher;PhD student;", "bibtex": "@inproceedings{\nreuel2024position,\ntitle={Position: Technical Research and Talent is Needed for Effective {AI} Governance},\nauthor={Anka Reuel and Lisa Soder and Benjamin Bucknall and Trond Arne Undheim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Be2B6f0ps1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 270973, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17582734724234760606&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 5, "email": ";interface-eu.org;eng.ox.ac.uk;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Interface;University of Oxford", "aff_unique_dep": ";", "aff_unique_url": ";https://www.ox.ac.uk", "aff_unique_abbr": ";Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";United Kingdom" }, { "title": "Safe Reinforcement Learning using Finite-Horizon Gradient-based Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34715", "id": "BiENLaUwlK", "proceeding": "https://proceedings.mlr.press/v235/dai24d.html", "pdf": "https://openreview.net/pdf?id=BiENLaUwlK", "openreview": "https://openreview.net/forum?id=BiENLaUwlK", "author_site": "Juntao Dai, Yaodong Yang, Qian Zheng, Gang Pan", "tldr": "", "abstract": "A key aspect of Safe Reinforcement Learning (Safe RL) involves estimating the constraint condition for the next policy, which is crucial for guiding the optimization of safe policy updates. However, the existing *Advantage-based Estimation* (ABE) method relies on the infinite-horizon discounted advantage function. This dependence leads to catastrophic errors in finite-horizon scenarios with non-discounted constraints, resulting in safety-violation updates. 
In response, we propose the first estimation method for finite-horizon non-discounted constraints in deep Safe RL, termed *Gradient-based Estimation* (GBE), which relies on the analytic gradient derived along trajectories. Our theoretical and empirical analyses demonstrate that GBE can effectively estimate constraint changes over a finite horizon. Constructing a surrogate optimization problem with GBE, we developed a novel Safe RL algorithm called *Constrained Gradient-based Policy Optimization* (CGPO). CGPO identifies feasible optimal policies by iteratively resolving sub-problems within trust regions. Our empirical results reveal that CGPO, unlike baseline algorithms, successfully estimates the constraint functions of subsequent policies, thereby ensuring the efficiency and feasibility of each update.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Juntao Dai;Yaodong Yang;Qian Zheng;Gang Pan", "authorids": "~Juntao_Dai1;~Yaodong_Yang1;~Qian_Zheng5;~Gang_Pan1", "gender": "M;M;;", "homepage": "https://person.zju.edu.cn/jtdai;https://www.yangyaodong.com;;", "dblp": ";170/1496-1;;", "google_scholar": ";https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;;", "orcid": ";0000-0001-8132-5613;;", "linkedin": ";yaodong-yang;;", "or_profile": "~Juntao_Dai1;~Yaodong_Yang1;~Qian_Zheng5;~Gang_Pan1", "aff": "Zhejiang University;Peking University;;", "aff_domain": "zju.edu.cn;pku.edu.cn;;", "position": "PhD student;Assistant Professor;;", "bibtex": "@inproceedings{\ndai2024safe,\ntitle={Safe Reinforcement Learning using Finite-Horizon Gradient-based Estimation},\nauthor={Juntao Dai and Yaodong Yang and Qian Zheng and Gang Pan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BiENLaUwlK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3745973, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4960312141399879806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;pku.edu.cn;;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Zhejiang University;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "ZJU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Efficient World Models with Context-Aware Tokenization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34714", "id": "BiWIERWBFX", "proceeding": "https://proceedings.mlr.press/v235/micheli24a.html", "pdf": "https://openreview.net/pdf?id=BiWIERWBFX", "openreview": "https://openreview.net/forum?id=BiWIERWBFX", "author_site": "Vincent Micheli, Eloi Alonso, Fran\u00e7ois Fleuret", "tldr": "", "abstract": "Scaling up deep Reinforcement Learning (RL) methods presents a significant challenge. Following developments in generative modelling, model-based RL positions itself as a strong contender. Recent advances in sequence modelling have led to effective transformer-based world models, albeit at the price of heavy computations due to the long sequences of tokens required to accurately simulate environments. 
In this work, we propose $\\Delta$-IRIS, a new agent with a world model architecture composed of a discrete autoencoder that encodes stochastic deltas between time steps and an autoregressive transformer that predicts future deltas by summarizing the current state of the world with continuous tokens. In the Crafter benchmark, $\\Delta$-IRIS sets a new state of the art at multiple frame budgets, while being an order of magnitude faster to train than previous attention-based approaches. We release our code and models at https://github.com/vmicheli/delta-iris.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Micheli;Eloi Alonso;Fran\u00e7ois Fleuret", "authorids": "~Vincent_Micheli1;~Eloi_Alonso1;~Fran\u00e7ois_Fleuret2", "gender": ";M;M", "homepage": ";;https://fleuret.org/francois/", "dblp": ";237/9666;90/5265", "google_scholar": "Dx7fMZ4AAAAJ;Ya4KugcAAAAJ;https://scholar.google.ch/citations?user=Bj1tRlsAAAAJ", "orcid": ";;0000-0001-9457-7393", "linkedin": "vincent-micheli/;eloialonso/;francois-fleuret/", "or_profile": "~Vincent_Micheli1;~Eloi_Alonso1;~Francois_Fleuret1", "aff": "University of Geneva, Switzerland;University of Geneva;University of Geneva", "aff_domain": "unige.ch;unige.ch;unige.ch", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nmicheli2024efficient,\ntitle={Efficient World Models with Context-Aware Tokenization},\nauthor={Vincent Micheli and Eloi Alonso and Fran{\\c{c}}ois Fleuret},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BiWIERWBFX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 990525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5849364765007176165&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "unige.ch;unige.ch;unige.ch", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Geneva", "aff_unique_dep": "", "aff_unique_url": "https://www.unige.ch", "aff_unique_abbr": "UNIGE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Proactive Detection of Voice Cloning with Localized Watermarking", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34713", "id": "Bic3Vmy2DG", "proceeding": "https://proceedings.mlr.press/v235/san-roman24a.html", "pdf": "https://openreview.net/pdf?id=Bic3Vmy2DG", "openreview": "https://openreview.net/forum?id=Bic3Vmy2DG", "author_site": "Robin San Roman, Pierre Fernandez, Hady Elsahar, Alexandre Defossez, Teddy Furon, Tuan Tran", "tldr": "", "abstract": "In the rapidly evolving field of speech generative models, there is a pressing need to ensure audio authenticity against the risks of voice cloning. We present AudioSeal, the first audio watermarking technique designed specifically for localized detection of AI-generated speech. AudioSeal employs a generator / detector architecture trained jointly with a localization loss to enable localized watermark detection up to the sample level, and a novel perceptual loss inspired by auditory masking that enables AudioSeal to achieve better imperceptibility. AudioSeal achieves state-of-the-art performance in terms of robustness to real-life audio manipulations and imperceptibility based on automatic and human evaluation metrics. 
Additionally, AudioSeal is designed with a fast, single-pass detector that significantly surpasses existing models in speed, achieving detection up to two orders of magnitude faster, making it ideal for large-scale and real-time applications. Code is available at https://github.com/facebookresearch/audioseal", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Robin San Roman;Pierre Fernandez;Hady Elsahar;Alexandre D\u00e9fossez;Teddy Furon;Tuan Tran", "authorids": "~Robin_San_Roman1;~Pierre_Fernandez1;~Hady_Elsahar2;~Alexandre_D\u00e9fossez1;~Teddy_Furon1;~Tuan_Tran5", "gender": "M;M;Unspecified;M;M;M", "homepage": ";https://pierrefdz.github.io/;http://hadyelsahar.io;https://ai.honu.io/;http://people.rennes.inria.fr/Teddy.Furon;https://github.com/antoine-tran", "dblp": "289/7209;309/5729;144/6739;156/0054;00/3862;249/6465", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;osCX1YQAAAAJ;SbcM6bsAAAAJ;https://scholar.google.fr/citations?user=DubNUU0AAAAJ;https://scholar.google.com/citations?hl=fr;https://scholar.google.de/citations?user=jk2BlO4AAAAJ", "orcid": ";0000-0003-3890-2248;;;0000-0002-1565-765X;", "linkedin": ";pierrefdz/;hadyelsahar/;;;https://linkedin.com/in/antoinetrande", "or_profile": "~Robin_San_Roman1;~Pierre_Fernandez1;~Hady_Elsahar2;~Alexandre_D\u00e9fossez1;~Teddy_Furon1;~Tuan_A._Tran1", "aff": "FAIR;Universit\u00e9 Rennes 1;Meta Facebook;Kyutai;INRIA;Meta Platform", "aff_domain": "meta.com;univ-rennes1.fr;fb.com;kyutai.org;inria.fr;meta.com", "position": "PhD student;PhD student;Researcher;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nroman2024proactive,\ntitle={Proactive Detection of Voice Cloning with Localized Watermarking},\nauthor={Robin San Roman and Pierre Fernandez and Hady Elsahar and Alexandre D{\\'e}fossez and Teddy Furon and Tuan Tran},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Bic3Vmy2DG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 782111, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1870520239929036017&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "email": "meta.com;univ-rennes1.fr;fb.com;kyutai.org;inria.fr;meta.com", "author_num": 6, "aff_unique_index": "0;1;0;2;3;0", "aff_unique_norm": "Meta;Universit\u00e9 Rennes 1;Kyutai;INRIA", "aff_unique_dep": "Facebook AI Research;;;", "aff_unique_url": "https://research.facebook.com;https://www.univ-rennes1.fr;https://kyutai.org;https://www.inria.fr", "aff_unique_abbr": "FAIR;UR1;Kyutai;INRIA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Rennes", "aff_country_unique_index": "0;1;0;1;1;0", "aff_country_unique": "United States;France" }, { "title": "FuRL: Visual-Language Models as Fuzzy Rewards for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34712", "id": "BmPWtzL7Eq", "proceeding": "https://proceedings.mlr.press/v235/fu24j.html", "pdf": "https://openreview.net/pdf?id=BmPWtzL7Eq", "openreview": "https://openreview.net/forum?id=BmPWtzL7Eq", "author_site": "Yuwei Fu, Haichao Zhang, di wu, Wei Xu, Benoit Boulet", "tldr": "", "abstract": "In this work, we investigate how to leverage pre-trained visual-language models (VLM) for online Reinforcement Learning (RL). 
In particular, we focus on sparse reward tasks with pre-defined textual task descriptions. We first identify the problem of reward misalignment when applying VLM as a reward in RL tasks. To address this issue, we introduce a lightweight fine-tuning method, named Fuzzy VLM reward-aided RL (FuRL), based on reward alignment and relay RL. Specifically, we enhance the performance of SAC/DrQ baseline agents on sparse reward tasks by fine-tuning VLM representations and using relay RL to avoid local minima. Extensive experiments on the Meta-world benchmark tasks demonstrate the efficacy of the proposed method. Code is available at: https://github.com/fuyw/FuRL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuwei Fu;Haichao Zhang;Di Wu;Wei Xu;Benoit Boulet", "authorids": "~Yuwei_Fu1;~Haichao_Zhang4;~Di_Wu11;~Wei_Xu13;~Benoit_Boulet1", "gender": "M;M;M;M;M", "homepage": "http://mcgillialab.com/people/;http://mcgillialab.com/people/;;https://www.mcgill.ca/ece/benoit-boulet;https://sites.google.com/site/hczhang1/", "dblp": "200/1646;52/328-44.html;;;", "google_scholar": ";https://scholar.google.ca/citations?user=IbcoTsgAAAAJ;Gxz1fqwAAAAJ;https://scholar.google.ca/citations?user=kkGyLY4AAAAJ;_OsT-RgAAAAJ", "orcid": ";;;0000-0002-3191-3967;", "linkedin": ";;;benoit-boulet-97078012/;", "or_profile": "~Yuwei_Fu1;~Di_Wu11;~Wei_Xu13;~Benoit_Boulet1;~Haichao_Zhang2", "aff": "McGill University;McGill University;Horizon Robotics;McGill University;Horizon Robotics", "aff_domain": "mcgill.ca;mcgill.ca;horizon.auto;mcgill.ca;horizon.ai", "position": "PhD student;Adjunct Professor;Researcher;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nfu2024furl,\ntitle={Fu{RL}: Visual-Language Models as Fuzzy Rewards for Reinforcement Learning},\nauthor={Yuwei Fu and Haichao Zhang and Di Wu and Wei Xu and Benoit Boulet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BmPWtzL7Eq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9931816, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3398056862278275032&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "mcgill.ca;mcgill.ca;horizon.auto;mcgill.ca;horizon.ai", "author_num": 5, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "McGill University;Horizon Robotics", "aff_unique_dep": ";", "aff_unique_url": "https://www.mcgill.ca;https://www.horizon-robotics.com/", "aff_unique_abbr": "McGill;Horizon Robotics", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "Canada;China" }, { "title": "Weighted distance nearest neighbor condensing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34711", "id": "BoPj12CnAn", "proceeding": "https://proceedings.mlr.press/v235/gottlieb24a.html", "pdf": "https://openreview.net/pdf?id=BoPj12CnAn", "openreview": "https://openreview.net/forum?id=BoPj12CnAn", "author_site": "Lee-Ad Gottlieb, Timor Sharabi, Roi Weiss", "tldr": "", "abstract": "The problem of nearest neighbor condensing has enjoyed a long history of study, both in its theoretical and practical aspects. 
In this paper, we introduce the problem of weighted distance nearest neighbor condensing, where one assigns weights to each point of the condensed set, and then new points are labeled based on their weighted distance nearest neighbor in the condensed set. We study the theoretical properties of this new model, and show that it can produce dramatically better condensing than the standard nearest neighbor rule, yet is characterized by generalization bounds almost identical to the latter. We then suggest a condensing heuristic for our new problem. We demonstrate Bayes consistency for this heuristic, and also show promising empirical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lee-Ad Gottlieb;Timor Sharabi;Roi Weiss", "authorids": "~Lee-Ad_Gottlieb1;timorsharabi@gmail.com;~Roi_Weiss1", "gender": "F;;", "homepage": "https://www.ariel.ac.il/wp/lee-ad-gottlieb/;;", "dblp": "09/1539;;", "google_scholar": "https://scholar.google.co.il/citations?user=HV6OPfcAAAAJ;;https://scholar.google.co.il/citations?user=w3ieXocAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lee-Ad_Gottlieb1;timorsharabi@gmail.com;~Roi_Weiss1", "aff": "Ariel University;;Ariel University Center of Samaria", "aff_domain": "ariel.ac.il;;ariel.ac.il", "position": "Associate Professor;;Lecturer", "bibtex": "@inproceedings{\ngottlieb2024weighted,\ntitle={Weighted distance nearest neighbor condensing},\nauthor={Lee-Ad Gottlieb and Timor Sharabi and Roi Weiss},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BoPj12CnAn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 447730, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MFqRDGJqN4kJ:scholar.google.com/&scioq=Weighted+distance+nearest+neighbor+condensing&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "ariel.ac.il;;ariel.ac.il", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Ariel University", "aff_unique_dep": "", "aff_unique_url": "https://www.ariel.ac.il", "aff_unique_abbr": "Ariel U", "aff_campus_unique_index": "1", "aff_campus_unique": ";Samaria", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Selecting Large Language Model to Fine-tune via Rectified Scaling Law", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34710", "id": "Bq2THeNXRr", "proceeding": "https://proceedings.mlr.press/v235/lin24j.html", "pdf": "https://openreview.net/pdf?id=Bq2THeNXRr", "openreview": "https://openreview.net/forum?id=Bq2THeNXRr", "author_site": "Haowei Lin, Baizhou Huang, Haotian Ye, Qinyu Chen, Zihao Wang, Sujian Li, Jianzhu Ma, Xiaojun Wan, James Zou, Yitao Liang", "tldr": "", "abstract": "The ever-growing ecosystem of LLMs has posed a challenge in selecting the most appropriate pre-trained model to fine-tune amidst a sea of options. Given constrained resources, fine-tuning all models and making selections afterward is unrealistic. In this work, we formulate this resource-constrained selection task into predicting fine-tuning performance and illustrate its natural connection with Scaling Law. Unlike pre-training, we find that the fine-tuning scaling curve includes not just the well-known \"power phase\" but also the previously unobserved \"pre-power phase\". 
We also explain why existing Scaling Law fails to capture this phase transition phenomenon both theoretically and empirically. To address this, we introduce the concept of \"pre-learned data size\" into our Rectified Scaling Law, which overcomes theoretical limitations and fits experimental results much better. By leveraging our law, we propose a novel LLM selection algorithm that selects the near-optimal model with hundreds of times less resource consumption, while other methods may provide negatively correlated selection. The project page is available at rectified-scaling-law.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haowei Lin;Baizhou Huang;Haotian Ye;Qinyu Chen;Zihao Wang;Sujian Li;Jianzhu Ma;Xiaojun Wan;James Zou;Yitao Liang", "authorids": "~Haowei_Lin1;~Baizhou_Huang1;~Haotian_Ye1;~Qinyu_Chen2;~Zihao_Wang23;~Sujian_Li1;~Jianzhu_Ma2;~Xiaojun_Wan1;~James_Zou1;~Yitao_Liang1", "gender": "M;M;M;M;M;F;M;M;;M", "homepage": "https://linhaowei1.github.io/;;https://haotianye.com;https://morganchen.site;https://zhwang4ai.github.io/;https://pku-tangent.github.io/;https://majianzhu.com/;https://wanxiaojun.github.io;;https://web.cs.ucla.edu/~yliang/", "dblp": "235/2798;329/4291;284/0539;;;05/4288;24/9080.html;07/1521;;173/4969", "google_scholar": "Ng-DmJgAAAAJ;1Zx1wi8AAAAJ;VU4chlsAAAAJ;;I0D-EgQAAAAJ;https://scholar.google.com.tw/citations?user=RvBDhSwAAAAJ;;lTTeBdkAAAAJ;23ZXZvEAAAAJ;KVzR1XEAAAAJ", "orcid": "0009-0006-9809-4835;;;;0000-0001-8396-3707;;;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Haowei_Lin1;~Baizhou_Huang1;~Haotian_Ye1;~Qinyu_Chen2;~Zihao_Wang23;~Sujian_Li1;~Jianzhu_Ma2;~Xiaojun_Wan1;~James_Zou1;~Yitao_Liang1", "aff": "Peking University;Peking University;Stanford University;Peking University;Peking University;Peking University;Tsinghua University;Peking University;Stanford University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn;tsinghua.edu.cn;pku.edu.cn;stanford.edu;pku.edu.cn", "position": "PhD student;PhD student;PhD student;MS student;PhD student;Associate Professor;Associate Professor;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nlin2024selecting,\ntitle={Selecting Large Language Model to Fine-tune via Rectified Scaling Law},\nauthor={Haowei Lin and Baizhou Huang and Haotian Ye and Qinyu Chen and Zihao Wang and Sujian Li and Jianzhu Ma and Xiaojun Wan and James Zou and Yitao Liang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Bq2THeNXRr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6241676, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10073943626118813212&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn;tsinghua.edu.cn;pku.edu.cn;stanford.edu;pku.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;0;0;0;2;0;1;0", "aff_unique_norm": "Peking University;Stanford University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Peking U;Stanford;THU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;0;0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Dynamic Anisotropic 
Smoothing for Noisy Derivative-Free Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34709", "id": "BrCrnaCYDc", "proceeding": "https://proceedings.mlr.press/v235/reifenstein24a.html", "pdf": "https://openreview.net/pdf?id=BrCrnaCYDc", "openreview": "https://openreview.net/forum?id=BrCrnaCYDc", "author_site": "Sam Reifenstein, Timothee Leleu, Yoshihisa Yamamoto", "tldr": "", "abstract": "We propose a novel algorithm that extends the methods of ball smoothing and Gaussian smoothing for noisy derivative-free optimization by accounting for the heterogeneous curvature of the objective function. The algorithm dynamically adapts the shape of the smoothing kernel to approximate the Hessian of the objective function around a local optimum. This approach significantly reduces the error in estimating the gradient from noisy evaluations through sampling. We demonstrate the efficacy of our method through numerical experiments on artificial problems. Additionally, we show improved performance when tuning NP-hard combinatorial optimization solvers compared to existing state-of-the-art heuristic derivative-free and Bayesian optimization methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sam Reifenstein;Timothee Leleu;Yoshihisa Yamamoto", "authorids": "~Sam_Reifenstein1;~Timothee_Leleu1;yoyshihisa.yamamoto@ntt-research.com", "gender": "M;;", "homepage": "https://ntt-research.com/phi-people/reifenstein-profile/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sam_Reifenstein1;~Timothee_Leleu1;yoyshihisa.yamamoto@ntt-research.com", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nreifenstein2024dynamic,\ntitle={Dynamic Anisotropic Smoothing for Noisy Derivative-Free Optimization},\nauthor={Sam Reifenstein and Timothee Leleu and Yoshihisa Yamamoto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BrCrnaCYDc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1887567, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16144264128199295581&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": ";;", "author_num": 3 }, { "title": "Debiased Offline Representation Learning for Fast Online Adaptation in Non-stationary Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34708", "id": "BrZPj9rEpN", "proceeding": "https://proceedings.mlr.press/v235/zhang24bc.html", "pdf": "https://openreview.net/pdf?id=BrZPj9rEpN", "openreview": "https://openreview.net/forum?id=BrZPj9rEpN", "author_site": "Xinyu Zhang, Wenjie Qiu, Yi-Chen Li, lei yuan, Chengxing Jia, Zongzhang Zhang, Yang Yu", "tldr": "", "abstract": "Developing policies that can adapt to non-stationary environments is essential for real-world reinforcement learning applications. Nevertheless, learning such adaptable policies in offline settings, with only a limited set of pre-collected trajectories, presents significant challenges. A key difficulty arises because the limited offline data makes it hard for the context encoder to differentiate between changes in the environment dynamics and shifts in the behavior policy, often leading to context misassociations. 
To address this issue, we introduce a novel approach called debiased offline representation learning for fast online adaptation (DORA). DORA incorporates an information bottleneck principle that maximizes mutual information between the dynamics encoding and the environmental data, while minimizing mutual information between the dynamics encoding and the actions of the behavior policy. We present a practical implementation of DORA, leveraging tractable bounds of the information bottleneck principle. Our experimental evaluation across six benchmark MuJoCo tasks with variable parameters demonstrates that DORA not only achieves a more precise dynamics encoding but also significantly outperforms existing baselines in terms of performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyu Zhang;Wenjie Qiu;Yi-Chen Li;Lei Yuan;Chengxing Jia;Zongzhang Zhang;Yang Yu", "authorids": "~Xinyu_Zhang16;~Wenjie_Qiu3;~Yi-Chen_Li1;~Lei_Yuan2;~Chengxing_Jia1;~Zongzhang_Zhang1;~Yang_Yu5", "gender": "M;;M;M;M;M;M", "homepage": "http://www.lamda.nju.edu.cn/zhangxinyu/;https://www.lamda.nju.edu.cn/qiuwj/;http://www.lamda.nju.edu.cn/liyc/;http://www.lamda.nju.edu.cn/yuanl/;http://www.lamda.nju.edu.cn/jiacx/;http://www.lamda.nju.edu.cn/zhangzz;http://www.lamda.nju.edu.cn/yuy", "dblp": ";369/4775;143/7158-1;23/6750-1;;90/8724;46/2181-1", "google_scholar": ";;https://scholar.google.com.hk/citations?user=OA3GmbQAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;sG7WEAgAAAAJ;PG2lDSwAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Xinyu_Zhang16;~Wenjie_Qiu3;~Yi-Chen_Li1;~Lei_Yuan2;~Chengxing_Jia1;~Zongzhang_Zhang1;~Yang_Yu2", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "MS student;MS student;PhD student;Researcher;PhD student;Associate Professor;Professor", "bibtex": "@inproceedings{\nzhang2024debiased,\ntitle={Debiased Offline Representation Learning for Fast Online Adaptation in Non-stationary Dynamics},\nauthor={Xinyu Zhang and Wenjie Qiu and Yi-Chen Li and Lei Yuan and Chengxing Jia and Zongzhang Zhang and Yang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BrZPj9rEpN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1141285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5195036711834589341&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A connection between Tempering and Entropic Mirror Descent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34707", "id": "BtbijvkWLC", "proceeding": "https://proceedings.mlr.press/v235/chopin24a.html", "pdf": "https://openreview.net/pdf?id=BtbijvkWLC", "openreview": "https://openreview.net/forum?id=BtbijvkWLC", "author_site": "Nicolas Chopin, Francesca R Crucinio, Anna 
Korba", "tldr": "", "abstract": "This paper explores the connections between tempering (for Sequential Monte Carlo; SMC) and entropic mirror descent to sample from a target probability distribution whose unnormalized density is known. We establish that tempering SMC corresponds to entropic mirror descent applied to the reverse Kullback-Leibler (KL) divergence and obtain convergence rates for the tempering iterates. Our result motivates the tempering iterates from an optimization point of view, showing that tempering can be seen as a descent scheme of the KL divergence with respect to the Fisher-Rao geometry, in contrast to Langevin dynamics that perform descent of the KL with respect to the Wasserstein-2 geometry. We exploit the connection between tempering and mirror descent iterates to justify common practices in SMC and derive adaptive tempering rules that improve over other alternative benchmarks in the literature.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicolas Chopin;Francesca Crucinio;Anna Korba", "authorids": "~Nicolas_Chopin1;francesca.crucinio@gmail.com;~Anna_Korba2", "gender": "M;;", "homepage": "https://nchopin.github.io/;;", "dblp": "24/7601;;182/8959.html", "google_scholar": "https://scholar.google.co.uk/citations?user=pXG4LfoAAAAJ;;https://scholar.google.fr/citations?user=dbH6E3kAAAAJ", "orcid": "0000-0002-0628-5815;;", "linkedin": ";;", "or_profile": "~Nicolas_Chopin1;francesca.crucinio@gmail.com;~Anna_Korba2", "aff": "Ecole Nationale de la Statistique et de l'Administration Economique;;Ensae ParisTech", "aff_domain": "ensae.fr;;ensae.fr", "position": "Full Professor;;Assistant Professor", "bibtex": "@inproceedings{\nchopin2024a,\ntitle={A connection between Tempering and Entropic Mirror Descent},\nauthor={Nicolas Chopin and Francesca Crucinio and Anna Korba},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BtbijvkWLC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1222143, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11563984263774683703&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ensae.fr;;ensae.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Ecole Nationale de la Statistique et de l'Administration Economique;ENSAE ParisTech", "aff_unique_dep": ";", "aff_unique_url": "https://ensae.fr;https://www.ensae.fr", "aff_unique_abbr": "ENSAE;Ensae", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Uncertainty-Aware Reward-Free Exploration with General Function Approximation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34706", "id": "BvBdYSIkpb", "proceeding": "https://proceedings.mlr.press/v235/zhang24ci.html", "pdf": "https://openreview.net/pdf?id=BvBdYSIkpb", "openreview": "https://openreview.net/forum?id=BvBdYSIkpb", "author_site": "Junkai Zhang, Weitong Zhang, Dongruo Zhou, Quanquan Gu", "tldr": "", "abstract": "Mastering multiple tasks through exploration and learning in an environment poses a significant challenge in reinforcement learning (RL). Unsupervised RL has been introduced to address this challenge by training policies with intrinsic rewards rather than extrinsic rewards. 
However, current intrinsic reward designs and unsupervised RL algorithms often overlook the heterogeneous nature of collected samples, thereby diminishing their sample efficiency. To overcome this limitation, in this paper, we propose a reward-free RL algorithm called GFA-RFE. The key idea behind our algorithm is an uncertainty-aware intrinsic reward for exploring the environment and an uncertainty-weighted learning process to handle heterogeneous uncertainty in different samples. Theoretically, we show that in order to find an $\\epsilon$-optimal policy, GFA-RFE needs to collect $\\tilde{O} (H^2 \\log N_{\\mathcal{F}} (\\epsilon) \\text{dim} (\\mathcal{F}) / \\epsilon^2 )$ episodes, where $\\mathcal{F}$ is the value function class with covering number $N_{\\mathcal{F}} (\\epsilon)$ and generalized eluder dimension $\\text{dim} (\\mathcal{F})$. Such a result outperforms all existing reward-free RL algorithms. We further implement and evaluate GFA-RFE across various domains and tasks in the DeepMind Control Suite. Experimental results show that GFA-RFE outperforms or is comparable to the performance of state-of-the-art unsupervised RL algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junkai Zhang;Weitong Zhang;Dongruo Zhou;Quanquan Gu", "authorids": "~Junkai_Zhang2;~Weitong_Zhang2;~Dongruo_Zhou1;~Quanquan_Gu1", "gender": ";;M;M", "homepage": ";;;http://web.cs.ucla.edu/~qgu/", "dblp": ";;215/3401;50/4597", "google_scholar": ";;1780wr0AAAAJ;GU9HgNAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Junkai_Zhang2;~Weitong_Zhang2;~Dongruo_Zhou1;~Quanquan_Gu1", "aff": ";;Indiana University;University of California, Los Angeles", "aff_domain": ";;iu.edu;cs.ucla.edu", "position": ";;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2024uncertaintyaware,\ntitle={Uncertainty-Aware Reward-Free Exploration with General Function Approximation},\nauthor={Junkai Zhang and Weitong Zhang and Dongruo Zhou and Quanquan Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BvBdYSIkpb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 904006, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Pyvy4pCRJwoJ:scholar.google.com/&scioq=Uncertainty-Aware+Reward-Free+Exploration+with+General+Function+Approximation&hl=en&as_sdt=0,44", "gs_version_total": 8, "email": ";;iu.edu;cs.ucla.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Indiana University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.indiana.edu;https://www.ucla.edu", "aff_unique_abbr": "IU;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Evolution of Heuristics: Towards Efficient Automatic Algorithm Design Using Large Language Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34705", "id": "BwAkaxqiLB", "proceeding": "https://proceedings.mlr.press/v235/liu24bs.html", "pdf": "https://openreview.net/pdf?id=BwAkaxqiLB", "openreview": "https://openreview.net/forum?id=BwAkaxqiLB", "author_site": "Fei Liu, Tong Xialiang, Mingxuan Yuan, Xi Lin, Fu Luo, Zhenkun Wang, Zhichao Lu, Qingfu Zhang", "tldr": "", "abstract": "Heuristics are widely used for dealing with complex search and 
optimization problems. However, manual design of heuristics can often be very labour-intensive and requires rich working experience and knowledge. This paper proposes Evolution of Heuristics (EoH), a novel evolutionary paradigm that leverages both Large Language Models (LLMs) and Evolutionary Computation (EC) methods for Automatic Heuristic Design (AHD). EoH represents the ideas of heuristics in natural language, termed thoughts. They are then translated into executable code by LLMs. The evolution of both thoughts and code in an evolutionary search framework makes it very effective and efficient for generating high-performance heuristics. Experiments on three widely studied combinatorial optimization benchmark problems demonstrate that EoH outperforms commonly used handcrafted heuristics and other recent AHD methods including FunSearch. In particular, the heuristic produced by EoH with a low computational budget (in terms of the number of queries to LLMs) significantly outperforms widely-used human hand-crafted baseline algorithms for the online bin packing problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fei Liu;Tong Xialiang;Mingxuan Yuan;Xi Lin;Fu Luo;Zhenkun Wang;Zhichao Lu;Qingfu Zhang", "authorids": "~Fei_Liu14;~Tong_Xialiang2;~Mingxuan_Yuan1;~Xi_Lin2;~Fu_Luo1;~Zhenkun_Wang1;~Zhichao_Lu1;~Qingfu_Zhang1", "gender": "M;M;M;M;M;M;M;M", "homepage": ";;https://xi-l.github.io/;;https://faculty.sustech.edu.cn/wangzk3/en/;https://www.cs.cityu.edu.hk/~zhichalu/;https://www.cs.cityu.edu.hk/~qzhan7/index.html;https://feiliu36.github.io/", "dblp": "https://dblp.uni-trier.de/pid/245/5977.html;74/2356;43/489-1;52/9546;96/9114;144/1417;98/1240.html;64/1350-44", "google_scholar": ";https://scholar.google.com/citations?hl=en;QB_MUboAAAAJ;i2TLiM8AAAAJ;https://scholar.google.com.sg/citations?user=r9ezy2gAAAAJ;tIFWBcQAAAAJ;https://scholar.google.co.uk/citations?user=nhL9PHwAAAAJ;wS0G_qQAAAAJ", "orcid": ";0000-0002-2236-8784;;0000-0002-3161-6348;0000-0003-1152-6780;0000-0002-4618-3573;;0000-0001-6719-0409", "linkedin": ";;;;;zhichao-lu-728037b4/;;", "or_profile": "~Tong_Xialiang2;~Mingxuan_Yuan1;~Xi_Lin2;~Fu_Luo1;~Zhenkun_Wang1;~Zhichao_Lu1;~Qingfu_Zhang1;~Fei_LIU13", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;City University of Hong Kong;Southern University of Science and Technology;Southern University of Science and Technology;City University of Hong Kong;City University of Hong Kong;City University of Hong Kong", "aff_domain": "huawei.com;huawei.com;cityu.edu.hk;sustech.edu;sustech.edu.cn;cityu.edu.hk;cityu.edu.hk;cityu.edu.hk", "position": "Researcher;Researcher;Postdoc;MS student;Assistant Professor;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nliu2024evolution,\ntitle={Evolution of Heuristics: Towards Efficient Automatic Algorithm Design Using Large Language Model},\nauthor={Fei Liu and Tong Xialiang and Mingxuan Yuan and Xi Lin and Fu Luo and Zhenkun Wang and Zhichao Lu and Qingfu Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BwAkaxqiLB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 651933, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4986604692423026989&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "email": 
"huawei.com;huawei.com;cityu.edu.hk;sustech.edu;sustech.edu.cn;cityu.edu.hk;cityu.edu.hk;cityu.edu.hk", "author_num": 8, "aff_unique_index": "0;0;1;2;2;1;1;1", "aff_unique_norm": "Huawei;City University of Hong Kong;Southern University of Science and Technology", "aff_unique_dep": "Huawei Technologies;;", "aff_unique_url": "https://www.huawei.com;https://www.cityu.edu.hk;https://www.sustech.edu.cn", "aff_unique_abbr": "Huawei;CityU;SUSTech", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "RIME: Robust Preference-based Reinforcement Learning with Noisy Preferences", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34704", "id": "BxAvcnlS8O", "proceeding": "https://proceedings.mlr.press/v235/cheng24k.html", "pdf": "https://openreview.net/pdf?id=BxAvcnlS8O", "openreview": "https://openreview.net/forum?id=BxAvcnlS8O", "author_site": "Jie Cheng, Gang Xiong, Xingyuan Dai, Qinghai Miao, Yisheng Lv, Fei-Yue Wang", "tldr": "", "abstract": "Preference-based Reinforcement Learning (PbRL) circumvents the need for reward engineering by harnessing human preferences as the reward signal. However, current PbRL methods excessively depend on high-quality feedback from domain experts, which results in a lack of robustness. In this paper, we present RIME, a robust PbRL algorithm for effective reward learning from noisy preferences. Our method utilizes a sample selection-based discriminator to dynamically filter out noise and ensure robust training. To counteract the cumulative error stemming from incorrect selection, we suggest a warm start for the reward model, which additionally bridges the performance gap during the transition from pre-training to online training in PbRL. Our experiments on robotic manipulation and locomotion tasks demonstrate that RIME significantly enhances the robustness of the state-of-the-art PbRL method. 
Code is available at https://github.com/CJReinforce/RIME_ICML2024.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie Cheng;Gang Xiong;Xingyuan Dai;Qinghai Miao;Yisheng Lv;Fei-Yue Wang", "authorids": "~Jie_Cheng4;~Gang_Xiong2;~Xingyuan_Dai1;~Qinghai_Miao1;~Yisheng_Lv1;~Fei-Yue_Wang2", "gender": "M;;M;M;M;M", "homepage": ";;;https://people.ucas.edu.cn/~lvyisheng;http://www.ia.cas.cn/sourcedb_ia_cas/en/iaexpert/200908/t20090804_2310468.html;https://teacher.ucas.ac.cn/~gxiong", "dblp": ";203/8062;33/1250;;;96/372-1", "google_scholar": ";R4Rn7dMAAAAJ;;RRKqjKAAAAAJ;;", "orcid": "0009-0008-5373-7563;;0000-0003-1213-1123;;;", "linkedin": ";;;;;", "or_profile": "~Jie_Cheng4;~Xingyuan_Dai1;~Qinghai_Miao1;~Yisheng_Lv1;~Fei-Yue_Wang2;~Xiong_Gang1", "aff": "University of Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Science", "aff_domain": "ucas.ac.cn;ia.ac.cn;ucas.ac.cn;ucas.ac.cn;ia.ac.cn;ia.ac.cn", "position": "PhD student;Assistant Professor;Associate Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\ncheng2024rime,\ntitle={{RIME}: Robust Preference-based Reinforcement Learning with Noisy Preferences},\nauthor={Jie Cheng and Gang Xiong and Xingyuan Dai and Qinghai Miao and Yisheng Lv and Fei-Yue Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BxAvcnlS8O}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7705361, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14955669378796932275&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "ucas.ac.cn;ia.ac.cn;ucas.ac.cn;ucas.ac.cn;ia.ac.cn;ia.ac.cn", "author_num": 6, "aff_unique_index": "0;1;0;0;1;1", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ia.cas.cn", "aff_unique_abbr": "UCAS;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Toward Availability Attacks in 3D Point Clouds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34703", "id": "C0sGIO2MZN", "proceeding": "https://proceedings.mlr.press/v235/zhu24i.html", "pdf": "https://openreview.net/pdf?id=C0sGIO2MZN", "openreview": "https://openreview.net/forum?id=C0sGIO2MZN", "author_site": "Yifan Zhu, Yibo Miao, Yinpeng Dong, Xiao-Shan Gao", "tldr": "", "abstract": "Despite the great progress of 3D vision, data privacy and security issues in 3D deep learning have not been systematically explored. In the domain of 2D images, many availability attacks have been proposed to prevent data from being illicitly learned by unauthorized deep models. However, unlike images represented on a fixed-dimensional grid, point clouds are characterized as unordered and unstructured sets, posing a significant challenge in designing an effective availability attack for 3D deep learning. 
In this paper, we theoretically show that extending 2D availability attacks directly to 3D point clouds under distance regularization is susceptible to degeneracy, rendering the generated poisons weaker or even ineffective. This is because, in the bi-level optimization, introducing a regularization term can cause the update directions to go out of control. To address this issue, we propose a novel Feature Collision Error-Minimization (FC-EM) method, which creates additional shortcuts in the feature space, inducing different update directions to prevent the degeneracy of bi-level optimization. Moreover, we provide a theoretical analysis that demonstrates the effectiveness of the FC-EM attack. Extensive experiments on typical point cloud datasets, a 3D intracranial aneurysm medical dataset, and a 3D face dataset verify the superiority and practicality of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Zhu;Yibo Miao;Yinpeng Dong;Xiao-Shan Gao", "authorids": "~Yifan_Zhu6;~Yibo_Miao1;~Yinpeng_Dong2;~Xiao-Shan_Gao2", "gender": ";M;M;M", "homepage": "https://github.com/hala64;http://www.amss.ac.cn/;https://dongyp13.github.io;http://www.mmrc.iss.ac.cn/~xgao/", "dblp": ";332/0699;183/0980;13/3109", "google_scholar": "https://scholar.google.com/citations?hl=en;;6_4ad84AAAAJ;_se7GmUAAAAJ", "orcid": ";;;0000-0003-2021-9395", "linkedin": ";;;", "or_profile": "~Yifan_Zhu6;~Yibo_Miao1;~Yinpeng_Dong2;~Xiao-Shan_Gao2", "aff": "Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Intel;Tsinghua University;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences", "aff_domain": "amss.ac.cn;intel.com;tsinghua.edu.cn;amss.ac.cn", "position": "PhD student;Intern;Postdoc;Full Professor", "bibtex": "@inproceedings{\nzhu2024toward,\ntitle={Toward Availability Attacks in 3D Point Clouds},\nauthor={Yifan Zhu and Yibo Miao and Yinpeng Dong and Xiao-Shan Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C0sGIO2MZN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2200167, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=105377579251867398&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "amss.ac.cn;intel.com;tsinghua.edu.cn;amss.ac.cn", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Chinese Academy of Sciences;Intel;Tsinghua University", "aff_unique_dep": "Academy of Mathematics and Systems Science;Intel Corporation;", "aff_unique_url": "http://www.cas.cn;https://www.intel.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CAS;Intel;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Causal Effect Identification in LiNGAM Models with Latent Confounders", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34702", "id": "C1iNBLIClt", "proceeding": "https://proceedings.mlr.press/v235/tramontano24a.html", "pdf": "https://openreview.net/pdf?id=C1iNBLIClt", "openreview": "https://openreview.net/forum?id=C1iNBLIClt", "author_site": "Daniele Tramontano, Yaroslav Kivva, Saber Salehkaleybar, Mathias Drton, Negar Kiyavash", "tldr": "", "abstract": "We study the generic identifiability of causal effects in linear non-Gaussian acyclic 
models (LiNGAM) with latent variables. We consider the problem in two main settings: When the causal graph is known a priori, and when it is unknown. In both settings, we provide a complete graphical characterization of the identifiable direct or total causal effects among observed variables. Moreover, we propose efficient algorithms to certify the graphical conditions. Finally, we propose an adaptation of the reconstruction independent component analysis (RICA) algorithm that estimates the causal effects from the observational data given the causal graph. Experimental results show the effectiveness of the proposed method in estimating the causal effects.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniele Tramontano;Yaroslav Kivva;Saber Salehkaleybar;Mathias Drton;Negar Kiyavash", "authorids": "~Daniele_Tramontano1;~Yaroslav_Kivva1;~Saber_Salehkaleybar1;~Mathias_Drton2;~Negar_Kiyavash1", "gender": "M;M;;M;F", "homepage": "https://sites.google.com/view/danieletramontano/home-page;https://people.epfl.ch/yaroslav.kivva/?lang=en;;https://www.math.cit.tum.de/en/math/people/professors/drton-mathias/;https://people.epfl.ch/negar.kiyavash?lang=en", "dblp": "326/6242;314/7107;;78/3067;85/4976", "google_scholar": ";QiJQSTIAAAAJ;;CjRMyA4AAAAJ;7tBDvOwAAAAJ", "orcid": ";;;0000-0001-5614-3025;0000-0002-8545-7709", "linkedin": ";;;;", "or_profile": "~Daniele_Tramontano1;~Yaroslav_Kivva1;~Saber_Salehkaleybar1;~Mathias_Drton2;~Negar_Kiyavash1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Swiss Federal Institute of Technology Lausanne (EPFL);;Technische Universit\u00e4t M\u00fcnchen;EPFL - EPF Lausanne", "aff_domain": "tum.de;epfl.ch;;tum.de;epfl.ch", "position": "PhD student;PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\ntramontano2024causal,\ntitle={Causal Effect Identification in Li{NGAM} Models with Latent Confounders},\nauthor={Daniele Tramontano and Yaroslav Kivva and Saber Salehkaleybar and Mathias Drton and Negar Kiyavash},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C1iNBLIClt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1401473, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11307947624018964558&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "tum.de;epfl.ch;;tum.de;epfl.ch", "author_num": 5, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Swiss Federal Institute of Technology Lausanne;EPFL", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "TUM;EPFL;EPFL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Germany;Switzerland" }, { "title": "AlphaZero-Like Tree-Search can Guide Large Language Model Decoding and Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34701", "id": "C4OpREezgj", "proceeding": "https://proceedings.mlr.press/v235/wan24c.html", "pdf": "https://openreview.net/pdf?id=C4OpREezgj", "openreview": "https://openreview.net/forum?id=C4OpREezgj", "author_site": "Ziyu Wan, Xidong Feng, Muning Wen, Stephen Mcaleer, Ying Wen, Weinan Zhang, Jun Wang", "tldr": "", "abstract": "Recent works like Tree-of-Thought (ToT) and Reasoning via Planning (RAP) aim to augment the 
multi-step reasoning capabilities of LLMs by using tree-search algorithms. These methods rely on prompting a pre-trained model to serve as a value function and focus on problems with low search depth. As a result, these methods cannot benefit from in-domain training and rely only on the pretraining process \u2014 they will not work in domains where the pre-trained LLM does not have enough knowledge to serve as an effective value function or in domains that require long-horizon planning. To address these limitations, we present an AlphaZero-like tree-search learning framework for LLMs (termed TS-LLM), systematically illustrating how tree-search with a learned value function can guide LLM decoding. TS-LLM distinguishes itself in two key ways. (1) Leveraging a learned value function and AlphaZero-like algorithms, our approach is generally adaptable to a wide range of tasks, language models of any size, and varying search depths. (2) Our approach can guide LLMs during both inference and training, iteratively improving the LLMs. Empirical results across reasoning, planning, alignment, and decision-making tasks show that TS-LLM outperforms existing approaches and can handle trees with a depth of 64.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyu Wan;Xidong Feng;Muning Wen;Stephen Marcus McAleer;Ying Wen;Weinan Zhang;Jun Wang", "authorids": "~Ziyu_Wan2;~Xidong_Feng1;~Muning_Wen2;~Stephen_Marcus_McAleer1;~Ying_Wen1;~Weinan_Zhang1;~Jun_Wang2", "gender": "M;;M;M;M;M;M", "homepage": "https://github.com/ziyuwan;https://waterhorse1.github.io/;https://github.com/morning9393;https://www.andrew.cmu.edu/user/smcaleer/;https://yingwen.io;http://wnzhang.net;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": ";;295/0261;;41/4203-1;28/10261-1;w/JunWang12", "google_scholar": "VEtZ7gYAAAAJ;JfOLNu8AAAAJ;Zt1WFtQAAAAJ;iEFL4-YAAAAJ;_A1CxG8AAAAJ;Qzss0GEAAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": ";;0009-0000-7868-1262;;0000-0003-1247-2382;0000-0002-0127-2425;", "linkedin": ";;;stephen-mcaleer/;wenying45;;", "or_profile": "~Ziyu_Wan2;~Xidong_Feng1;~Muning_Wen2;~Stephen_Marcus_McAleer1;~Ying_Wen1;~Weinan_Zhang1;~Jun_Wang2", "aff": "Shanghai Jiaotong University;University College London;Shanghai Jiaotong University;Carnegie Mellon University;Shanghai Jiaotong University;Shanghai Jiaotong University;University College London", "aff_domain": "sjtu.edu.cn;ucl.ac.uk;sjtu.edu.cn;cmu.edu;sjtu.edu.cn;sjtu.edu.cn;ucl.ac.uk", "position": "PhD student;PhD student;PhD student;Postdoc;Associate Professor;Associate Professor;Professor", "bibtex": "@inproceedings{\nwan2024alphazerolike,\ntitle={AlphaZero-Like Tree-Search can Guide Large Language Model Decoding and Training},\nauthor={Ziyu Wan and Xidong Feng and Muning Wen and Stephen Marcus McAleer and Ying Wen and Weinan Zhang and Jun Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C4OpREezgj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 716155, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12391024127440976227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "sjtu.edu.cn;ucl.ac.uk;sjtu.edu.cn;cmu.edu;sjtu.edu.cn;sjtu.edu.cn;ucl.ac.uk", "author_num": 7, "aff_unique_index": "0;1;0;2;0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;University College London;Carnegie Mellon University", 
"aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ucl.ac.uk;https://www.cmu.edu", "aff_unique_abbr": "SJTU;UCL;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0;0;1", "aff_country_unique": "China;United Kingdom;United States" }, { "title": "Learning Latent Dynamic Robust Representations for World Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34700", "id": "C4jkx6AgWc", "proceeding": "https://proceedings.mlr.press/v235/sun24n.html", "pdf": "https://openreview.net/pdf?id=C4jkx6AgWc", "openreview": "https://openreview.net/forum?id=C4jkx6AgWc", "author_site": "Ruixiang Sun, Hongyu Zang, Xin Li, Riashat Islam", "tldr": "", "abstract": "Visual Model-Based Reinforcement Learning (MBRL) promises to encapsulate agent's knowledge about the underlying dynamics of the environment, enabling learning a world model as a useful planner. However, top MBRL agents such as Dreamer often struggle with visual pixel-based inputs in the presence of exogenous or irrelevant noise in the observation space, due to failure to capture task-specific features while filtering out irrelevant spatio-temporal details. To tackle this problem, we apply a spatio-temporal masking strategy, a bisimulation principle, combined with latent reconstruction, to capture endogenous task-specific aspects of the environment for world models, effectively eliminating non-essential information. Joint training of representations, dynamics, and policy often leads to instabilities. To further address this issue, we develop a Hybrid Recurrent State-Space Model (HRSSM) structure, enhancing state representation robustness for effective policy learning. Our empirical evaluation demonstrates significant performance improvements over existing methods in a range of visually complex control tasks such as Maniskill with exogenous distractors from the Matterport environment. 
Our code is available at https://github.com/bit1029public/HRSSM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruixiang Sun;Hongyu Zang;Xin Li;Riashat Islam", "authorids": "~Ruixiang_Sun1;~Hongyu_Zang1;~Xin_Li31;~Riashat_Islam1", "gender": ";M;F;M", "homepage": "https://github.com/ygjldd;https://zanghyu.github.io/;https://cs.bit.edu.cn/szdw/jsml/js/lixin/index.htm;https://riashat.github.io/", "dblp": "79/5969-3.html;212/2592.html;09/1365-33.html;198/0459", "google_scholar": "zh1wlmQAAAAJ;2kmSy50AAAAJ;https://scholar.google.com/citations?hl=zh-TW;https://scholar.google.ca/citations?user=2_4Rs44AAAAJ", "orcid": "0009-0006-4979-1976;;0000-0003-4257-4347;", "linkedin": ";;;", "or_profile": "~Ruixiang_Sun1;~Hongyu_Zang1;~Xin_Li31;~Riashat_Islam1", "aff": "Beijing Institute of Technology;Meituan;Beijing Institute of Technology;Saudi Data and AI Authority, Saudi Data and AI Authority", "aff_domain": "bit.edu.cn;meituan.com;bit.edu.cn;sdaia.gov.sa", "position": "Undergrad student;Researcher;Associate Professor;Researcher", "bibtex": "@inproceedings{\nsun2024learning,\ntitle={Learning Latent Dynamic Robust Representations for World Models},\nauthor={Ruixiang Sun and Hongyu Zang and Xin Li and Riashat Islam},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C4jkx6AgWc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7796020, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8259388870947624849&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "bit.edu.cn;meituan.com;bit.edu.cn;sdaia.gov.sa", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Beijing Institute of Technology;Meituan;Saudi Data and AI Authority", "aff_unique_dep": ";;", "aff_unique_url": "http://www.bit.edu.cn/;https://www.meituan.com;https://sdaia.gov.sa", "aff_unique_abbr": "BIT;Meituan;SDAIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;Saudi Arabia" }, { "title": "Parameter-Efficient Fine-Tuning with Controls", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34699", "id": "C4nalr0DoE", "proceeding": "https://proceedings.mlr.press/v235/zhang24y.html", "pdf": "https://openreview.net/pdf?id=C4nalr0DoE", "openreview": "https://openreview.net/forum?id=C4nalr0DoE", "author_site": "Chi Zhang, Jingpu Cheng, Yanyu Xu, Qianxiao Li", "tldr": "", "abstract": "In contrast to the prevailing interpretation of Low-Rank Adaptation (LoRA) as a means of simulating weight changes in model adaptation, this paper introduces an alternative perspective by framing it as a control process. Specifically, we conceptualize lightweight matrices in LoRA as control modules tasked with perturbing the original, complex, yet frozen blocks on downstream tasks. Building upon this new understanding, we conduct a thorough analysis on the controllability of these modules, where we identify and establish sufficient conditions that facilitate their effective integration into downstream controls. Moreover, the control modules are redesigned by incorporating nonlinearities through a parameter-free attention mechanism. This modification allows for the intermingling of tokens within the controllers, enhancing the adaptability and performance of the system. 
Empirical findings substantiate that, without introducing any additional parameters, this approach surpasses the existing LoRA algorithms across all assessed datasets and rank configurations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chi Zhang;Cheng Jingpu;Yanyu Xu;Qianxiao Li", "authorids": "~Chi_Zhang25;~Cheng_Jingpu1;~Yanyu_Xu1;~Qianxiao_Li1", "gender": ";M;M;M", "homepage": ";;https://svip-lab.github.io/team/xuyy.html;https://blog.nus.edu.sg/qianxiaoli/", "dblp": ";;188/7560;172/0930.html", "google_scholar": ";;Y3hU1AYAAAAJ;https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";0000-0003-4164-9474;;0000-0002-3903-3737", "linkedin": ";;;", "or_profile": "~Chi_Zhang25;~Cheng_Jingpu1;~Yanyu_Xu1;~Qianxiao_Li1", "aff": ";National University of Singapore;Institute of High Performance Computing, Singapore, A*STAR;National University of Singapore", "aff_domain": ";u.nus.edu;ihpc.a-star.edu.sg;nus.edu.sg", "position": ";PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024parameterefficient,\ntitle={Parameter-Efficient Fine-Tuning with Controls},\nauthor={Chi Zhang and Cheng Jingpu and Yanyu Xu and Qianxiao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C4nalr0DoE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 413626, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=560298837766702615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": ";u.nus.edu;ihpc.a-star.edu.sg;nus.edu.sg", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;Institute of High Performance Computing", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.ihpc.a-star.edu.sg", "aff_unique_abbr": "NUS;IHPC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Scalable Online Exploration via Coverability", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34698", "id": "C64clssMVU", "proceeding": "https://proceedings.mlr.press/v235/amortila24a.html", "pdf": "https://openreview.net/pdf?id=C64clssMVU", "openreview": "https://openreview.net/forum?id=C64clssMVU", "author_site": "Philip Amortila, Dylan Foster, Akshay Krishnamurthy", "tldr": "", "abstract": "Exploration is a major challenge in reinforcement learning, especially for high-dimensional domains that require function approximation. We propose exploration objectives---policy optimization objectives that enable downstream maximization of any reward function---as a conceptual framework to systematize the study of exploration. We introduce a new objective, L1-Coverage, which generalizes previous exploration schemes and supports three fundamental desiderata: 1. *Intrinsic complexity control.* L1-Coverage is associated with a structural parameter, L1-Coverability, which reflects the intrinsic statistical difficulty of the underlying MDP, subsuming Block and Low-Rank MDPs. 2. *Efficient planning.* For a known MDP, L1-Coverage efficiently reduces to standard policy optimization, allowing flexible integration with off-the-shelf methods such as policy gradient and Q-learning approaches. 3. 
*Efficient exploration.* L1-Coverage enables the first computationally efficient model-based and model-free algorithms for online (reward-free or reward-driven) reinforcement learning in MDPs with low coverability. Empirically, we find that L1-Coverage effectively drives off-the-shelf policy optimization algorithms to explore the state space.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Philip Amortila;Dylan J Foster;Akshay Krishnamurthy", "authorids": "~Philip_Amortila1;~Dylan_J_Foster1;~Akshay_Krishnamurthy1", "gender": "M;;M", "homepage": "https://www.philipamortila.com;http://dylanfoster.net;https://www.cics.umass.edu/~akshay/", "dblp": "222/2989;167/4271;85/8024", "google_scholar": "NZQkB8sAAAAJ;RqwU8xsAAAAJ;https://scholar.google.com.tw/citations?user=K0kaNvkAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Philip_Amortila1;~Dylan_J_Foster1;~Akshay_Krishnamurthy1", "aff": "University of Illinois, Urbana Champaign;Microsoft Research;Microsoft Research", "aff_domain": "illinois.edu;microsoft.com;research.microsoft.com", "position": "PhD student;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\namortila2024scalable,\ntitle={Scalable Online Exploration via Coverability},\nauthor={Philip Amortila and Dylan J Foster and Akshay Krishnamurthy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C64clssMVU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1455031, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10984668262040992824&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "illinois.edu;microsoft.com;research.microsoft.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UIUC;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Factored-Reward Bandits with Intermediate Observations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34697", "id": "C7Z8EhZ6bl", "proceeding": "https://proceedings.mlr.press/v235/mussi24a.html", "pdf": "https://openreview.net/pdf?id=C7Z8EhZ6bl", "openreview": "https://openreview.net/forum?id=C7Z8EhZ6bl", "author_site": "Marco Mussi, Simone Drago, Marcello Restelli, Alberto Maria Metelli", "tldr": "", "abstract": "In several real-world sequential decision problems, at every step, the learner is required to select different actions. Every action affects a specific part of the system and generates an observable intermediate effect. In this paper, we introduce the Factored-Reward Bandits (FRBs), a novel setting able to effectively capture and exploit the structure of this class of scenarios, where the reward is computed as the product of the action intermediate observations. We characterize the statistical complexity of the learning problem in the FRBs, by deriving worst-case and asymptotic instance-dependent regret lower bounds. Then, we devise and analyze two regret minimization algorithms. 
The former, F-UCB, is an anytime optimistic approach that matches the worst-case lower bound (up to logarithmic factors) but fails to perform optimally from the instance-dependent perspective. The latter, F-Track, is a bound-tracking approach that enjoys optimal asymptotic instance-dependent regret guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marco Mussi;Simone Drago;Marcello Restelli;Alberto Maria Metelli", "authorids": "~Marco_Mussi1;~Simone_Drago1;~Marcello_Restelli1;~Alberto_Maria_Metelli2", "gender": "M;M;M;M", "homepage": "https://marcomussi.github.io/;;http://home.deib.polimi.it/restelli/;https://albertometelli.github.io/", "dblp": "321/0756;383/9682;64/1011;209/4941", "google_scholar": "3gca-JUAAAAJ;;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ;R31IsPwAAAAJ", "orcid": "0000-0001-8356-6744;;0000-0002-6322-1076;0000-0002-3424-5212", "linkedin": "marcomussi95/;drago-simone/;;", "or_profile": "~Marco_Mussi1;~Simone_Drago1;~Marcello_Restelli1;~Alberto_Maria_Metelli2", "aff": "Politecnico di Milano;Polytechnic Institute of Milan;Politecnico di Milano;Politecnico di Milano", "aff_domain": "polimi.it;polimi.it;polimi.it;polimi.it", "position": "PhD student;Researcher;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nmussi2024factoredreward,\ntitle={Factored-Reward Bandits with Intermediate Observations},\nauthor={Marco Mussi and Simone Drago and Marcello Restelli and Alberto Maria Metelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=C7Z8EhZ6bl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1021976, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15160013079766328098&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "polimi.it;polimi.it;polimi.it;polimi.it", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Politecnico di Milano;Polytechnic Institute of Milan", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it;https://www.polimi.it/", "aff_unique_abbr": "Polimi;Politecnico di Milano", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Fast Peer Adaptation with Context-aware Exploration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34696", "id": "CBcNl5Eo32", "proceeding": "https://proceedings.mlr.press/v235/ma24n.html", "pdf": "https://openreview.net/pdf?id=CBcNl5Eo32", "openreview": "https://openreview.net/forum?id=CBcNl5Eo32", "author_site": "Long Ma, Yuanfei Wang, Fangwei Zhong, Song-Chun Zhu, Yizhou Wang", "tldr": "", "abstract": "Adapting quickly to unknown peers (partners or opponents) with different strategies is a key challenge in multi-agent games. To do so, it is crucial for the agent to probe and identify the peer\u2019s strategy efficiently, as this is the prerequisite for carrying out the best response in adaptation. However, exploring the strategies of unknown peers is difficult, especially when the games are partially observable and have a long horizon. In this paper, we propose a peer identification reward, which rewards the learning agent based on how well it can identify the behavior pattern of the peer over the historical context, such as observations over multiple episodes. 
This reward motivates the agent to learn a context-aware policy for effective exploration and fast adaptation, i.e., to actively seek and collect informative feedback from peers when uncertain about their policies and to exploit the context to perform the best response when confident. We evaluate our method on diverse testbeds that involve competitive (Kuhn Poker), cooperative (PO-Overcooked), or mixed (Predator-Prey-W) games with peer agents. We demonstrate that our method induces more active exploration behavior, achieving faster adaptation and better outcomes than existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Long Ma;Yuanfei Wang;Fangwei Zhong;Song-Chun Zhu;Yizhou Wang", "authorids": "~Long_Ma5;~Yuanfei_Wang1;~Fangwei_Zhong3;~Song-Chun_Zhu1;~Yizhou_Wang1", "gender": "M;M;M;M;M", "homepage": "https://sites.google.com/view/long-ma-homepage/home;https://yuanfei-wang.github.io/;https://zhusongchun.net/;https://cfcs.pku.edu.cn/wangyizhou/;https://fangweizhong.xyz/", "dblp": ";47/10626;10/10313;71/3387-1;207/1900", "google_scholar": ";;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ;831z_VcAAAAJ;ejDz1bYAAAAJ", "orcid": ";0009-0008-8908-1981;;;0000-0002-0428-4552", "linkedin": ";;;;", "or_profile": "~Long_Ma5;~Yuanfei_Wang1;~Song-Chun_Zhu1;~Yizhou_Wang1;~fangwei_zhong1", "aff": "Peking University;Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;PhD student;Full Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\nma2024fast,\ntitle={Fast Peer Adaptation with Context-aware Exploration},\nauthor={Long Ma and Yuanfei Wang and Fangwei Zhong and Song-Chun Zhu and Yizhou Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CBcNl5Eo32}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5235039236083485082&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Pedestrian Attribute Recognition as Label-balanced Multi-label Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34695", "id": "CD2xl1L5es", "proceeding": "https://proceedings.mlr.press/v235/zhou24j.html", "pdf": "https://openreview.net/pdf?id=CD2xl1L5es", "openreview": "https://openreview.net/forum?id=CD2xl1L5es", "author_site": "Yibo Zhou, Hai-Miao Hu, Yirong Xiang, Xiaokang Zhang, Haotian Wu", "tldr": "", "abstract": "Rooted in the scarcity of most attributes, realistic pedestrian attribute datasets exhibit unduly skewed data distributions, which give rise to two types of model failure: (1) label imbalance: model predictions lean heavily towards the majority labels; (2) semantics imbalance: the model easily overfits to the under-represented attributes due to their insufficient semantic diversity. 
To render perfect label balancing, we propose a novel framework that successfully decouples label-balanced data re-sampling from the curse of attribute co-occurrence, i.e., we equalize the sampling prior of an attribute without biasing those of the co-occurring attributes. To diversify the attribute semantics and mitigate the feature noise, we propose a Bayesian feature augmentation method to introduce true in-distribution novelty. Handling both imbalances jointly, our work achieves the best accuracy on various popular benchmarks, and, importantly, with a minimal computational budget.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yibo Zhou;Hai-Miao Hu;Yirong Xiang;Xiaokang Zhang;Haotian Wu", "authorids": "~Yibo_Zhou1;~Hai-Miao_Hu1;~Yirong_Xiang1;~Xiaokang_Zhang2;~Haotian_Wu5", "gender": "M;M;F;M;M", "homepage": ";http://shi.buaa.edu.cn/huhaimiao/en/index.htm;;https://github.com/XKZ2023;", "dblp": ";39/7528;;;", "google_scholar": ";;;;", "orcid": "0000-0001-7964-7517;0000-0001-6811-9209;0000-0001-6887-5610;;0009-0007-8821-9283", "linkedin": ";;;;", "or_profile": "~Yibo_Zhou1;~Hai-Miao_Hu1;~Yirong_Xiang1;~Xiaokang_Zhang2;~Haotian_Wu5", "aff": "Beihang University;Beihang University;University of Manchester;Beihang University;Beihang University", "aff_domain": "buaa.edu.cn;buaa.edu.cn;manchester.ac.uk;buaa.edu.cn;buaa.edu.cn", "position": "PhD student;Full Professor;PhD student;PhD student;MS student", "bibtex": "@inproceedings{\nzhou2024pedestrian,\ntitle={Pedestrian Attribute Recognition as Label-balanced Multi-label Learning},\nauthor={Yibo Zhou and Hai-Miao Hu and Yirong Xiang and Xiaokang Zhang and Haotian Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CD2xl1L5es}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 914047, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2362531451793639147&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "buaa.edu.cn;buaa.edu.cn;manchester.ac.uk;buaa.edu.cn;buaa.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Beihang University;University of Manchester", "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.manchester.ac.uk", "aff_unique_abbr": "BUAA;UoM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Accelerating Iterative Retrieval-augmented Language Model Serving with Speculation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34694", "id": "CDnv4vg02f", "proceeding": "https://proceedings.mlr.press/v235/zhang24cq.html", "pdf": "https://openreview.net/pdf?id=CDnv4vg02f", "openreview": "https://openreview.net/forum?id=CDnv4vg02f", "author_site": "Zhihao Zhang, Alan Zhu, Lijie Yang, Yihua Xu, Lanting Li, Phitchaya Phothilimthana, Zhihao Jia", "tldr": "", "abstract": "This paper introduces RaLMSpec, a framework that accelerates iterative retrieval-augmented language model (RaLM) serving with *speculative retrieval* and *batched verification*. RaLMSpec further introduces several important systems optimizations, including prefetching, an optimal speculation stride scheduler, and asynchronous verification. The combination of these techniques allows RaLMSpec to significantly outperform existing systems. 
For document-level iterative RaLM serving, evaluation over three LLMs on four QA datasets shows that RaLMSpec improves over existing approaches by $1.75$-$2.39\\times$, $1.04$-$1.39\\times$, and $1.31$-$1.77\\times$ when the retriever is an exact dense retriever, approximate dense retriever, and sparse retriever respectively. For token-level iterative RaLM (KNN-LM) serving, RaLMSpec is up to $7.59\\times$ and $2.45\\times$ faster than existing methods for exact dense and approximate dense retrievers, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhihao Zhang;Alan Zhu;Lijie Yang;Yihua Xu;Lanting Li;Phitchaya Mangpo Phothilimthana;Zhihao Jia", "authorids": "~Zhihao_Zhang2;~Alan_Zhu1;~Lijie_Yang1;~Yihua_Xu1;~Lanting_Li1;~Phitchaya_Mangpo_Phothilimthana1;~Zhihao_Jia2", "gender": ";M;M;M;F;F;M", "homepage": ";https://az1326.github.io/;https://derrickylj.github.io/;;;https://mangpo.net/;https://www.cs.cmu.edu/~zhihaoj2/", "dblp": "91/5464;358/6490;98/7434-3;;;127/3128;", "google_scholar": "https://scholar.google.com/citations?hl=en;;5Gx-kFQAAAAJ;;6xxmZs0AAAAJ;7Fxbm0AAAAAJ;0IWLFR4AAAAJ", "orcid": ";0009-0001-8694-9246;;;0000-0003-2434-9285;;", "linkedin": ";az1326/;lijie-yang-drk/;yihua-bruce-xu-9a37381a5/;;;", "or_profile": "~Zhihao_Zhang2;~Alan_Zhu1;~Lijie_Yang1;~Yihua_Xu1;~Lanting_Li1;~Phitchaya_Mangpo_Phothilimthana1;~Zhihao_Jia2", "aff": "Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;Carnegie Mellon University;University of California, Berkeley;Carnegie Mellon University;Google;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cs.cmu.edu;cmu.edu;berkeley.edu;andrew.cmu.edu;google.com;cs.cmu.edu", "position": "PhD student;Undergrad student;Undergrad student;Undergrad student;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024accelerating,\ntitle={Accelerating Iterative Retrieval-augmented Language Model Serving with Speculation},\nauthor={Zhihao Zhang and Alan Zhu and Lijie Yang and Yihua Xu and Lanting Li and Phitchaya Mangpo Phothilimthana and Zhihao Jia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CDnv4vg02f}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2160673, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10929672550102264356&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "andrew.cmu.edu;cs.cmu.edu;cmu.edu;berkeley.edu;andrew.cmu.edu;google.com;cs.cmu.edu", "author_num": 7, "aff_unique_index": "0;0;0;1;0;2;0", "aff_unique_norm": "Carnegie Mellon University;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.cmu.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "CMU;UC Berkeley;Google", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Pittsburgh;Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Purifying Quantization-conditioned Backdoors via Layer-wise Activation Correction with Distribution Approximation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34693", "id": "CEfr3h68KU", "proceeding": "https://proceedings.mlr.press/v235/li24e.html", "pdf": "https://openreview.net/pdf?id=CEfr3h68KU", "openreview": "https://openreview.net/forum?id=CEfr3h68KU", 
"author_site": "Boheng Li, Yishuo Cai, Jisong Cai, Yiming Li, Han Qiu, Run Wang, Tianwei Zhang", "tldr": "", "abstract": "Model quantization is a compression technique that converts a full-precision model to a more compact low-precision version for better storage. Despite the great success of quantization, recent studies revealed the feasibility of malicious exploiting model quantization via implanting quantization-conditioned backdoors (QCBs). These special backdoors remain dormant in full-precision models but are exposed upon quantization. Unfortunately, existing defenses have limited effects on mitigating QCBs. In this paper, we conduct an in-depth analysis of QCBs. We reveal an intriguing characteristic of QCBs, where activation of backdoor-related neurons on even benign samples enjoy a distribution drift after quantization, although this drift is more significant on poisoned samples. Motivated by this finding, we propose to purify the backdoor-exposed quantized model by aligning its layer-wise activation with its full-precision version. To further exploit the more pronounced activation drifts on poisoned samples, we design an additional module to layer-wisely approximate poisoned activation distribution based on batch normalization statistics of the full-precision model. Extensive experiments are conducted, verifying the effectiveness of our defense. Our code is publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boheng Li;Yishuo Cai;Jisong Cai;Yiming Li;Han Qiu;Run Wang;Tianwei Zhang", "authorids": "~Boheng_Li1;~Yishuo_Cai1;~Jisong_Cai1;~Yiming_Li1;~Han_Qiu3;~Run_Wang1;~Tianwei_Zhang1", "gender": "M;M;M;M;M;M;M", "homepage": "https://antigonerandy.github.io;https://walkeralan123.github.io/;;http://liyiming.tech;https://qiuhan.info;http://wangrun.github.io/;https://personal.ntu.edu.sg/tianwei.zhang/index.html", "dblp": "329/5678;;;l/YimingLi-4;15/4507-1;01/1318-1;77/7902-4", "google_scholar": ";;dTrpq94AAAAJ;mSW7kU8AAAAJ;https://scholar.google.fr/citations?user=6JWNv6gAAAAJ;LpuwwNUAAAAJ;9vpiYDIAAAAJ", "orcid": "0000-0001-9921-7215;;;0000-0002-2258-265X;;0000-0002-2842-5137;", "linkedin": ";;;yiming-li-thu/;;;", "or_profile": "~Boheng_Li1;~Yishuo_Cai1;~Jisong_Cai1;~Yiming_Li1;~Han_Qiu3;~Run_Wang1;~Tianwei_Zhang1", "aff": "Wuhan University;Central South University;Wuhan University;Zhejiang University;Tsinghua University;Wuhan University;Nanyang Technological University", "aff_domain": "whu.edu.cn;csu.edu.cn;whu.edu.cn;zju.edu.cn;tsinghua.edu.cn;whu.edu.cn;ntu.edu.sg", "position": "Undergrad student;Undergrad student;Undergrad student;Research Professor;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024purifying,\ntitle={Purifying Quantization-conditioned Backdoors via Layer-wise Activation Correction with Distribution Approximation},\nauthor={Boheng Li and Yishuo Cai and Jisong Cai and Yiming Li and Han Qiu and Run Wang and Tianwei Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CEfr3h68KU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1226802, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17360480669352317670&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "whu.edu.cn;csu.edu.cn;whu.edu.cn;zju.edu.cn;tsinghua.edu.cn;whu.edu.cn;ntu.edu.sg", "author_num": 7, "aff_unique_index": 
"0;1;0;2;3;0;4", "aff_unique_norm": "Wuhan University;Central South University;Zhejiang University;Tsinghua University;Nanyang Technological University", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.whu.edu.cn/;https://www.csu.edu.cn;https://www.zju.edu.cn;https://www.tsinghua.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "WHU;CSU;ZJU;THU;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Self-cognitive Denoising in the Presence of Multiple Noisy Label Sources", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34692", "id": "CG44RLeXt1", "proceeding": "https://proceedings.mlr.press/v235/sun24o.html", "pdf": "https://openreview.net/pdf?id=CG44RLeXt1", "openreview": "https://openreview.net/forum?id=CG44RLeXt1", "author_site": "Yi-Xuan Sun, Ya-Lin Zhang, BIN HAN, Longfei Li, JUN ZHOU", "tldr": "", "abstract": "The strong performance of neural networks typically hinges on the availability of extensive labeled data, yet acquiring ground-truth labels is often challenging. Instead, noisy supervisions from multiple sources, e.g., by multiple well-designed rules, are more convenient to collect. In this paper, we focus on the realistic problem of learning from multiple noisy label sources, and argue that prior studies have overlooked the crucial *self-cognition* ability of neural networks, i.e., the inherent capability of autonomously distinguishing noise during training. We theoretically analyze this ability of neural networks when meeting multiple noisy label sources, which reveals that neural networks possess the capability to recognize both instance-wise noise within each single noisy label source and annotator-wise quality among multiple noisy label sources. Inspired by the theoretical analyses, we introduce an approach named Self-cognitive Denoising for Multiple noisy label sources (SDM), which exploits the self-cognition ability of neural networks to denoise during training. Furthermore, we build a selective distillation module following the theoretical insights to optimize computational efficiency. 
The experiments on various datasets demonstrate the superiority of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi-Xuan Sun;Ya-Lin Zhang;BIN HAN;Longfei Li;JUN ZHOU", "authorids": "~Yi-Xuan_Sun1;~Ya-Lin_Zhang1;~BIN_HAN2;~Longfei_Li1;~JUN_ZHOU6", "gender": ";;M;M;M", "homepage": ";;;;https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en", "dblp": ";;;139/8073;99/3847-11", "google_scholar": ";;IzmdkvMCESkC;;mCVvloEAAAAJ", "orcid": ";;;;0000-0001-6033-6102", "linkedin": ";;;;", "or_profile": "~Yi-Xuan_Sun1;~Ya-Lin_Zhang1;~BIN_HAN2;~Longfei_Li1;~JUN_ZHOU6", "aff": ";;Ant Group;ant group;Ant Group", "aff_domain": ";;antgroup.com;antgroup.com;antgroup.com", "position": ";;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nsun2024selfcognitive,\ntitle={Self-cognitive Denoising in the Presence of Multiple Noisy Label Sources},\nauthor={Yi-Xuan Sun and Ya-Lin Zhang and BIN HAN and Longfei Li and JUN ZHOU},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CG44RLeXt1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 624005, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5058413617869289952&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 4, "email": ";;antgroup.com;antgroup.com;antgroup.com", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Ant Group", "aff_unique_dep": "", "aff_unique_url": "https://www.antgroup.com", "aff_unique_abbr": "Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "TSLANet: Rethinking Transformers for Time Series Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34691", "id": "CGR3vpX63X", "proceeding": "https://proceedings.mlr.press/v235/eldele24a.html", "pdf": "https://openreview.net/pdf?id=CGR3vpX63X", "openreview": "https://openreview.net/forum?id=CGR3vpX63X", "author_site": "Emadeldeen Eldele, Mohamed Ragab, Zhenghua Chen, Min Wu, Xiaoli Li", "tldr": "", "abstract": "Time series data, characterized by its intrinsic long and short-range dependencies, poses a unique challenge across analytical applications. While Transformer-based models excel at capturing long-range dependencies, they face limitations in noise sensitivity, computational efficiency, and overfitting with smaller datasets. In response, we introduce a novel **T**ime **S**eries **L**ightweight **A**daptive **Net**work (**TSLANet**), as a universal convolutional model for diverse time series tasks. Specifically, we propose an Adaptive Spectral Block, harnessing Fourier analysis to enhance feature representation and to capture both long-term and short-term interactions while mitigating noise via adaptive thresholding. Additionally, we introduce an Interactive Convolution Block and leverage self-supervised learning to refine the capacity of TSLANet for decoding complex temporal patterns and improve its robustness on different datasets. Our comprehensive experiments demonstrate that TSLANet outperforms state-of-the-art models in various tasks spanning classification, forecasting, and anomaly detection, showcasing its resilience and adaptability across a spectrum of noise levels and data sizes. 
The code is available at https://github.com/emadeldeen24/TSLANet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emadeldeen Eldele;Mohamed Ragab;Zhenghua Chen;Min Wu;Xiaoli Li", "authorids": "~Emadeldeen_Eldele1;~Mohamed_Ragab1;~Zhenghua_Chen2;~Min_Wu2;~Xiaoli_Li1", "gender": "M;M;M;M;M", "homepage": "https://emadeldeen24.github.io/;http://mohamed-ragab.netlify.app;https://zhenghuantu.github.io/;https://sites.google.com/site/wumincf/;https://personal.ntu.edu.sg/xlli/", "dblp": "295/9208.html;237/3528-2.html;03/7457.html;16/0-8;l/XiaoliLi.html", "google_scholar": "2LdeHIYAAAAJ;nNeT_NUAAAAJ;https://scholar.google.com.sg/citations?user=WUgu3nwAAAAJ;https://scholar.google.com.sg/citations?user=Hji1uWQAAAAJ;E3yQKloAAAAJ", "orcid": "0000-0002-9282-0991;0000-0002-2138-4395;0000-0002-1719-0328;0000-0003-0977-3600;0000-0002-0762-6562", "linkedin": "emadeldeen-eldele-phd-1a291a301/;mohamedragab1/;;;li-xiaoli-41027ba/", "or_profile": "~Emadeldeen_Eldele1;~Mohamed_Ragab1;~Zhenghua_Chen2;~Min_Wu2;~Xiaoli_Li1", "aff": "Agency for Science, Technology and Research;, A*STAR;I2R, A*STAR;Institute for Infocomm Research (I2R), A*STAR;A*STAR", "aff_domain": "cfar.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;a-star.edu.sg", "position": "Research Scientist;Researcher;Researcher;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\neldele2024tslanet,\ntitle={{TSLAN}et: Rethinking Transformers for Time Series Representation Learning},\nauthor={Emadeldeen Eldele and Mohamed Ragab and Zhenghua Chen and Min Wu and Xiaoli Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CGR3vpX63X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1331383, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17441987548700879016&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "cfar.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;a-star.edu.sg", "author_num": 5, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Agency for Science, Technology and Research;A*STAR;Institute for Infocomm Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.a-star.edu.sg;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "A*STAR;A*STAR;I2R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Longitudinal Targeted Minimum Loss-based Estimation with Temporal-Difference Heterogeneous Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34690", "id": "CHz7WshPcp", "proceeding": "https://proceedings.mlr.press/v235/shirakawa24a.html", "pdf": "https://openreview.net/pdf?id=CHz7WshPcp", "openreview": "https://openreview.net/forum?id=CHz7WshPcp", "author_site": "Toru Shirakawa, Yi Li, Yulun Wu, Sky Qiu, Yuxuan Li, Mingduo Zhao, Hiroyasu Iso, Mark van der Laan", "tldr": "", "abstract": "We propose Deep Longitudinal Targeted Minimum Loss-based Estimation (Deep LTMLE), a novel approach to estimate the counterfactual mean of the outcome under dynamic treatment policies in longitudinal problem settings. Our approach utilizes a transformer architecture with heterogeneous type embedding trained using temporal-difference learning. 
After obtaining an initial estimate using the transformer, we statistically correct for the bias commonly associated with machine learning algorithms, following the targeted minimum loss-based estimation (TMLE) framework. Furthermore, our method facilitates statistical inference by providing 95% confidence intervals grounded in asymptotic statistical theory. Simulation results demonstrate our method's superior performance over existing approaches, particularly in complex, long time-horizon scenarios. It remains effective in small-sample, short-duration contexts, matching the performance of asymptotically efficient estimators. To demonstrate our method in practice, we applied it to estimate counterfactual mean outcomes for standard versus intensive blood pressure management strategies in a real-world cardiovascular epidemiology cohort study.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Toru Shirakawa;Yi Li;Yulun Wu;Sky Qiu;Yuxuan Li;Mingduo Zhao;Hiroyasu Iso;Mark J. van der Laan", "authorids": "~Toru_Shirakawa1;~Yi_Li38;~Yulun_Wu1;~Sky_Qiu1;~Yuxuan_Li6;~Mingduo_Zhao1;~Hiroyasu_Iso1;~Mark_J._van_der_Laan1", "gender": "M;M;;M;F;;;M", "homepage": "https://shirakawatoru.github.io/;;https://github.com/yulun-rayn;;;;https://www.ighp.ncgm.go.jp/eng/index.html;http://www.stat.berkeley.edu/~laan", "dblp": ";;;;;;;36/5857", "google_scholar": ";;5QJJxS4AAAAJ;;;;;-zaDQ10AAAAJ", "orcid": ";0000-0003-4602-5572;;0009-0006-5944-0782;;;;", "linkedin": ";;yu-lun-wu/;;yuxuan-li-b05b84220/;mingduo-zhao3/;;mark-van-der-laan-2560553/", "or_profile": "~Toru_Shirakawa1;~Yi_Li38;~Yulun_Wu1;~Sky_Qiu1;~Yuxuan_Li6;~Mingduo_Zhao1;~Hiroyasu_Iso1;~Mark_J._van_der_Laan1", "aff": "Osaka University Graduate School of Medicine;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Columbia University;University of California, Berkeley;Osaka Medical College;University of California, Berkeley", "aff_domain": "osaka-u.ac.jp;berkeley.edu;berkeley.edu;berkeley.edu;columbia.edu;berkeley.edu;osaka-med.ac.jp;berkeley.edu", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;Emeritus;Professor", "bibtex": "@inproceedings{\nshirakawa2024longitudinal,\ntitle={Longitudinal Targeted Minimum Loss-based Estimation with Temporal-Difference Heterogeneous Transformer},\nauthor={Toru Shirakawa and Yi Li and Yulun Wu and Sky Qiu and Yuxuan Li and Mingduo Zhao and Hiroyasu Iso and Mark J. 
van der Laan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CHz7WshPcp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 753538, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4956046615380136535&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "osaka-u.ac.jp;berkeley.edu;berkeley.edu;berkeley.edu;columbia.edu;berkeley.edu;osaka-med.ac.jp;berkeley.edu", "author_num": 8, "aff_unique_index": "0;1;1;1;2;1;3;1", "aff_unique_norm": "Osaka University;University of California, Berkeley;Columbia University;Osaka Medical College", "aff_unique_dep": "Graduate School of Medicine;;;", "aff_unique_url": "https://www.osaka-u.ac.jp;https://www.berkeley.edu;https://www.columbia.edu;https://www.osakamedicollege.com", "aff_unique_abbr": "OU;UC Berkeley;Columbia;OMC", "aff_campus_unique_index": "0;1;1;1;1;1", "aff_campus_unique": "Osaka;Berkeley;", "aff_country_unique_index": "0;1;1;1;1;1;0;1", "aff_country_unique": "Japan;United States" }, { "title": "Position: On the Possibilities of AI-Generated Text Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34689", "id": "CJbhtpcyGL", "proceeding": "https://proceedings.mlr.press/v235/chakraborty24a.html", "pdf": "https://openreview.net/pdf?id=CJbhtpcyGL", "openreview": "https://openreview.net/forum?id=CJbhtpcyGL", "author_site": "Souradip Chakraborty, Amrit Singh Bedi, Sicheng Zhu, Bang An, Dinesh Manocha, Furong Huang", "tldr": "", "abstract": "Our study addresses the challenge of distinguishing human-written text from Large Language Model (LLM) outputs. We provide evidence that this differentiation is consistently feasible, except when human and machine text distributions are indistinguishable across their entire support. Employing information theory, we show that while detecting machine-generated text becomes harder as it nears human quality, it remains possible with adequate text data. We introduce guidelines on the required text data quantity, either through sample size or sequence length, for reliable AI text detection, through derivations of sample complexity bounds. This research paves the way for advanced detection methods. Our comprehensive empirical tests, conducted across various datasets (Xsum, Squad, IMDb, and Kaggle FakeNews) and with several state-of-the-art text generators (GPT-2, GPT-3.5-Turbo, Llama, Llama-2-13B-Chat-HF, Llama-2-70B-Chat-HF), assess the viability of enhanced detection methods against detectors like RoBERTa-Large/Base-Detector and GPTZero, with increasing sample sizes and sequence lengths. 
Our findings align with OpenAI's empirical data related to sequence length, marking the first theoretical substantiation for these observations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Souradip Chakraborty;Amrit Bedi;Sicheng Zhu;Bang An;Dinesh Manocha;Furong Huang", "authorids": "~Souradip_Chakraborty1;~Amrit_Bedi1;~Sicheng_Zhu1;~Bang_An1;~Dinesh_Manocha3;~Furong_Huang1", "gender": "M;M;M;;M;F", "homepage": "https://souradip-umd.github.io/;https://sites.google.com/view/amritsinghbedi/home;https://schzhu.github.io/;https://bangann.github.io/;https://www.cs.umd.edu/people/dmanocha;https://furong-huang.com", "dblp": "264/5758;176/2707.html;;188/0741;m/DineshManocha;72/8513", "google_scholar": "https://scholar.google.co.in/citations?user=pvETm1wAAAAJ;91WLA6QAAAAJ;;3ce6z_sAAAAJ;X08l_4IAAAAJ;13yyuCcAAAAJ", "orcid": ";;;;0000-0001-7047-9801;", "linkedin": ";;;;dinesh-manocha-2311846;", "or_profile": "~Souradip_Chakraborty1;~Amrit_Bedi1;~Sicheng_Zhu1;~Bang_An1;~Dinesh_Manocha3;~Furong_Huang1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland", "aff_domain": "umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;cs.umd.edu", "position": "PhD student;Researcher;PhD student;PhD student;Professor;Assistant Professor", "bibtex": "@inproceedings{\nchakraborty2024position,\ntitle={Position: On the Possibilities of {AI}-Generated Text Detection},\nauthor={Souradip Chakraborty and Amrit Bedi and Sicheng Zhu and Bang An and Dinesh Manocha and Furong Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CJbhtpcyGL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2275813, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15288498989023120603&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 5, "email": "umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;cs.umd.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Replicable Learning of Large-Margin Halfspaces", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34688", "id": "CKCzfU9YKE", "proceeding": "https://proceedings.mlr.press/v235/kalavasis24a.html", "pdf": "https://openreview.net/pdf?id=CKCzfU9YKE", "openreview": "https://openreview.net/forum?id=CKCzfU9YKE", "author_site": "Alkis Kalavasis, Amin Karbasi, Kasper Green Larsen, Grigoris Velegkas, Felix Zhou", "tldr": "", "abstract": "We provide an efficient replicable algorithm for the problem of learning large-margin halfspaces. Our results improve upon the algorithms provided by Impagliazzo, Lei, Pitassi, and Sorrell (STOC, 2022). We design the first dimension-independent replicable algorithm for this task which runs in polynomial time, is proper, and has strictly improved sample complexity compared to the one achieved by Impagliazzo et al. (STOC, 2022) with respect to all the relevant parameters. 
Moreover, our algorithm has sample complexity that is optimal with respect to the accuracy parameter $\\epsilon$. Departing from the requirement of polynomial time algorithms, using the DP-to-Replicability reduction of Bun et al. (STOC 2023), we show how to obtain a replicable algorithm for large-margin halfspaces with improved sample complexity with respect to the margin parameter $\\tau$, but running time doubly exponential in $1/\\tau^2$ and worse sample complexity dependence on $\\epsilon$ than our previous algorithm. We then design an improved algorithm with better sample complexity than both of our previous algorithms and running time exponential in $1/\\tau^{2}.$", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alkis Kalavasis;Amin Karbasi;Kasper Green Larsen;Grigoris Velegkas;Felix Zhou", "authorids": "~Alkis_Kalavasis1;~Amin_Karbasi3;~Kasper_Green_Larsen1;~Grigoris_Velegkas1;~Felix_Zhou1", "gender": "M;;;M;", "homepage": "https://alkisk.github.io/;;;;", "dblp": "269/9425;;;254/1885;", "google_scholar": "NgVIFJwAAAAJ;;;Ty1kgP0AAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Alkis_Kalavasis1;~Amin_Karbasi3;~Kasper_Green_Larsen1;~Grigoris_Velegkas1;~Felix_Zhou1", "aff": "Yale University;;;Yale University;", "aff_domain": "yale.edu;;;yale.edu;", "position": "Postdoc;;;PhD student;", "bibtex": "@inproceedings{\nkalavasis2024replicable,\ntitle={Replicable Learning of Large-Margin Halfspaces},\nauthor={Alkis Kalavasis and Amin Karbasi and Kasper Green Larsen and Grigoris Velegkas and Felix Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CKCzfU9YKE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 430767, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4315446751823160973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "yale.edu;;;yale.edu;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Yale University", "aff_unique_dep": "", "aff_unique_url": "https://www.yale.edu", "aff_unique_abbr": "Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "An Independence-promoting Loss for Music Generation with Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34687", "id": "CLJZI5kDhX", "proceeding": "https://proceedings.mlr.press/v235/lemercier24a.html", "pdf": "https://openreview.net/pdf?id=CLJZI5kDhX", "openreview": "https://openreview.net/forum?id=CLJZI5kDhX", "author_site": "Jean-Marie Lemercier, Simon Rouard, Jade Copet, Yossi Adi, Alexandre Defossez", "tldr": "", "abstract": "Music generation schemes using language modeling rely on a vocabulary of audio tokens, generally provided as codes in a discrete latent space learnt by an auto-encoder. Multi-stage quantizers are often employed to produce these tokens, therefore the decoding strategy used for token prediction must be adapted to account for multiple codebooks: either it should model the joint distribution over all codebooks, or fit the product of the codebook marginal distributions. Modelling the joint distribution requires a costly increase in the number of auto-regressive steps, while fitting the product of the marginals yields an inexact model unless the codebooks are mutually independent. 
In this work, we introduce an independence-promoting loss to regularize the auto-encoder used as the tokenizer in language models for music generation. The proposed loss is a proxy for mutual information based on the maximum mean discrepancy principle, applied in reproducing kernel Hilbert spaces. Our criterion is simple to implement and train, and it is generalizable to other multi-stream codecs. We show that it reduces the statistical dependence between codebooks during auto-encoding. This leads to an increase in the generated music quality when modelling the product of the marginal distributions, while generating audio much faster than the joint distribution model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jean-Marie Lemercier;Simon Rouard;Jade Copet;Yossi Adi;Alexandre D\u00e9fossez", "authorids": "~Jean-Marie_Lemercier1;srouard@sfr.fr;~Jade_Copet1;~Yossi_Adi1;~Alexandre_D\u00e9fossez1", "gender": "M;;;M;M", "homepage": "https://www.inf.uni-hamburg.de/en/inst/ab/sp/people/lemercier.html;;;http://adiyoss.github.io/;https://ai.honu.io/", "dblp": ";;;171/0957.html;156/0054", "google_scholar": "dJFuXCQAAAAJ;;GRMLwjAAAAAJ;https://scholar.google.co.il/citations?user=4W-HuYYAAAAJ;https://scholar.google.fr/citations?user=DubNUU0AAAAJ", "orcid": "0000-0002-8704-7658;;;0000-0003-2237-3898;", "linkedin": "jean-marie-lemercier-55ab96137/;;jadecopet/?locale=en_US;yossi-adi-31a32858?trk=nav_responsive_tab_profile_pic;", "or_profile": "~Jean-Marie_Lemercier1;srouard@sfr.fr;~Jade_Copet1;~Yossi_Adi1;~Alexandre_D\u00e9fossez1", "aff": "Universit\u00e4t Hamburg;;Facebook AI Research;Meta;Kyutai", "aff_domain": "uni-hamburg.de;;facebook.com;meta.com;kyutai.org", "position": "PhD student;;Research Engineering Manager;Research Scientist;Researcher", "bibtex": "@inproceedings{\nlemercier2024an,\ntitle={An Independence-promoting Loss for Music Generation with Language Models},\nauthor={Jean-Marie Lemercier and Simon Rouard and Jade Copet and Yossi Adi and Alexandre D{\\'e}fossez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CLJZI5kDhX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 456575, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17800568185487469571&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "uni-hamburg.de;;facebook.com;meta.com;kyutai.org", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University of Hamburg;Meta;Kyutai", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.uni-hamburg.de;https://research.facebook.com;https://kyutai.org", "aff_unique_abbr": "UHH;FAIR;Kyutai", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Germany;United States;France" }, { "title": "Discrete Diffusion Modeling by Estimating the Ratios of the Data Distribution", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34686", "id": "CNicRIVIPA", "proceeding": "https://proceedings.mlr.press/v235/lou24a.html", "pdf": "https://openreview.net/pdf?id=CNicRIVIPA", "openreview": "https://openreview.net/forum?id=CNicRIVIPA", "author_site": "Aaron Lou, Chenlin Meng, Stefano Ermon", "tldr": "", "abstract": "Despite their groundbreaking performance for many generative modeling tasks, diffusion models have fallen short on 
discrete data domains such as natural language. Crucially, standard diffusion models rely on the well-established theory of score matching, but efforts to generalize this to discrete structures have not yielded the same empirical gains. In this work, we bridge this gap by proposing score entropy, a novel loss that naturally extends score matching to discrete spaces, integrates seamlessly to build discrete diffusion models, and significantly boosts performance. Experimentally, we test our Score Entropy Discrete Diffusion models (SEDD) on standard language modeling tasks. For comparable model sizes, SEDD beats existing language diffusion paradigms (reducing perplexity by $25$-$75$%) and is competitive with autoregressive models, in particular outperforming GPT-2. Furthermore, compared to autoregressive models, SEDD generates faithful text without requiring distribution annealing techniques like temperature scaling (around $6$-$8\\times$ better generative perplexity than un-annealed GPT-2), can trade compute and quality (similar quality with $32\\times$ fewer network evaluations), and enables controllable infilling (matching nucleus sampling quality while enabling other strategies besides left to right prompting).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron Lou;Chenlin Meng;Stefano Ermon", "authorids": "~Aaron_Lou1;~Chenlin_Meng1;~Stefano_Ermon1", "gender": "M;F;M", "homepage": "https://aaronlou.com;https://chenlin9.github.io/;http://cs.stanford.edu/~ermon/", "dblp": "232/3858;227/2517;47/8135", "google_scholar": ";nEFU7wIAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Aaron_Lou1;~Chenlin_Meng1;~Stefano_Ermon1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nlou2024discrete,\ntitle={Discrete Diffusion Modeling by Estimating the Ratios of the Data Distribution},\nauthor={Aaron Lou and Chenlin Meng and Stefano Ermon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CNicRIVIPA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1014488, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3426085124559920770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "stanford.edu;stanford.edu;stanford.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Interacting Diffusion Processes for Event Sequence Forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34685", "id": "CQH63IbI5o", "proceeding": "https://proceedings.mlr.press/v235/zeng24f.html", "pdf": "https://openreview.net/pdf?id=CQH63IbI5o", "openreview": "https://openreview.net/forum?id=CQH63IbI5o", "author_site": "Mai Zeng, Florence Regol, Mark Coates", "tldr": "", "abstract": "Neural Temporal Point Processes (TPPs) have emerged as the primary framework for predicting sequences of events that occur at irregular time intervals, but their sequential nature can hamper performance for long-horizon forecasts. 
To address this, we introduce a novel approach that incorporates a diffusion generative model. The model facilitates sequence-to-sequence prediction, allowing multi-step predictions based on historical event sequences. In contrast to previous approaches, our model directly learns the joint probability distribution of types and inter-arrival times for multiple events. The model is composed of two diffusion processes, one for the time intervals and one for the event types. These processes interact through their respective denoising functions, which can take as input intermediate representations from both processes, allowing the model to learn complex interactions. We demonstrate that our proposal outperforms state-of-the-art baselines for long-horizon forecasting of TPPs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mai Zeng;florence regol;Mark Coates", "authorids": "~Mai_Zeng1;~florence_regol1;~Mark_Coates1", "gender": "M;F;M", "homepage": ";;http://www.ece.mcgill.ca/~mcoate/", "dblp": ";251/8836;c/MarkCoates", "google_scholar": "TZdzLikAAAAJ;https://scholar.google.ca/citations?user=goNKjhkAAAAJ;https://scholar.google.ca/citations?user=qxWORNoAAAAJ", "orcid": ";;0000-0001-5030-1379", "linkedin": ";;", "or_profile": "~Mai_Zeng1;~florence_regol1;~Mark_Coates1", "aff": ";McGill University;McGill University", "aff_domain": ";mcgill.ca;mcgill.ca", "position": ";PhD student;Full Professor", "bibtex": "@inproceedings{\nzeng2024interacting,\ntitle={Interacting Diffusion Processes for Event Sequence Forecasting},\nauthor={Mai Zeng and florence regol and Mark Coates},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CQH63IbI5o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1407032, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7086785383023298640&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 8, "email": ";mcgill.ca;mcgill.ca", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "McGill University", "aff_unique_dep": "", "aff_unique_url": "https://www.mcgill.ca", "aff_unique_abbr": "McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Quantum Algorithms and Lower Bounds for Finite-Sum Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34684", "id": "CQI3f1U9X1", "proceeding": "https://proceedings.mlr.press/v235/zhang24bz.html", "pdf": "https://openreview.net/pdf?id=CQI3f1U9X1", "openreview": "https://openreview.net/forum?id=CQI3f1U9X1", "author_site": "Yexin Zhang, Chenyi Zhang, Cong Fang, Liwei Wang, Tongyang Li", "tldr": "", "abstract": "Finite-sum optimization has wide applications in machine learning, covering important problems such as support vector machines, regression, etc. In this paper, we initiate the study of solving finite-sum optimization problems by quantum computing. Specifically, let $f_1,\\ldots,f_n:\\mathbb{R}^d\\to\\mathbb{R}$ be $\\ell$-smooth convex functions and $\\psi:\\mathbb{R}^d\\to\\mathbb{R}$ be a $\\mu$-strongly convex proximal function. The goal is to find an $\\epsilon$-optimal point for $F(\\mathbf{x})=\\frac{1}{n}\\sum_{i=1}^n f_i(\\mathbf{x})+\\psi(\\mathbf{x})$. 
We give a quantum algorithm with complexity $\\tilde{O}\\big(n+\\sqrt{d}+\\sqrt{\\ell/\\mu}\\big(n^{1/3}d^{1/3}+n^{-2/3}d^{5/6}\\big)\\big)$, improving the classical tight bound $\\tilde{\\Theta}\\big(n+\\sqrt{n\\ell/\\mu}\\big)$. We also prove a quantum lower bound $\\tilde{\\Omega}(n+n^{3/4}(\\ell/\\mu)^{1/4})$ when $d$ is large enough. Both our quantum upper and lower bounds can extend to the cases where $\\psi$ is not necessarily strongly convex, or each $f_i$ is Lipschitz but not necessarily smooth. In addition, when $F$ is nonconvex, our quantum algorithm can find an $\\epsilon$-critical point using $\\tilde{O}(n+\\ell(d^{1/3}n^{1/3}+\\sqrt{d})/\\epsilon^2)$ queries.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yexin Zhang;Chenyi Zhang;Cong Fang;Liwei Wang;Tongyang Li", "authorids": "~Yexin_Zhang2;~Chenyi_Zhang2;~Cong_Fang1;~Liwei_Wang1;~Tongyang_Li1", "gender": "M;M;M;M;M", "homepage": "https://mystictides.github.io/;https://chenyizhang2000.github.io;https://congfang-ml.github.io/;http://www.liweiwang-pku.com/;https://www.tongyangli.com/", "dblp": ";;140/6568;;142/1312", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;N2M9RPoAAAAJ;VZHxoh8AAAAJ;ny0ZgiQAAAAJ", "orcid": ";;;;0000-0002-0338-413X", "linkedin": ";;;;", "or_profile": "~Yexin_Zhang2;~Chenyi_Zhang2;~Cong_Fang1;~Liwei_Wang1;~Tongyang_Li1", "aff": "Peking University;Stanford University;Peking University;Peking University;Peking University", "aff_domain": "stu.pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;PhD student;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024quantum,\ntitle={Quantum Algorithms and Lower Bounds for Finite-Sum Optimization},\nauthor={Yexin Zhang and Chenyi Zhang and Cong Fang and Liwei Wang and Tongyang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CQI3f1U9X1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 488863, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1326114144397682051&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "stu.pku.edu.cn;stanford.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Peking University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu", "aff_unique_abbr": "Peking U;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Efficient Black-box Adversarial Attacks via Bayesian Optimization Guided by a Function Prior", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34683", "id": "CR6Sl80cn8", "proceeding": "https://proceedings.mlr.press/v235/cheng24h.html", "pdf": "https://openreview.net/pdf?id=CR6Sl80cn8", "openreview": "https://openreview.net/forum?id=CR6Sl80cn8", "author_site": "Shuyu Cheng, Yibo Miao, Yinpeng Dong, Xiao Yang, Xiao-Shan Gao, Jun Zhu", "tldr": "", "abstract": "This paper studies the challenging black-box adversarial attack that aims to generate adversarial examples against a black-box model by only using output feedback of the model to input queries. 
Some previous methods improve the query efficiency by incorporating the gradient of a surrogate white-box model into query-based attacks due to the adversarial transferability. However, the localized gradient is not informative enough, making these methods still query-intensive. In this paper, we propose a Prior-guided Bayesian Optimization (P-BO) algorithm that leverages the surrogate model as a global function prior in black-box adversarial attacks. As the surrogate model contains rich prior information of the black-box one, P-BO models the attack objective with a Gaussian process whose mean function is initialized as the surrogate model's loss. Our theoretical analysis on the regret bound indicates that the performance of P-BO may be affected by a bad prior. Therefore, we further propose an adaptive integration strategy to automatically adjust a coefficient on the function prior by minimizing the regret bound. Extensive experiments on image classifiers and large vision-language models demonstrate the superiority of the proposed algorithm in reducing queries and improving attack success rates compared with the state-of-the-art black-box attacks. Code is available at https://github.com/yibo-miao/PBO-Attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuyu Cheng;Yibo Miao;Yinpeng Dong;Xiao Yang;Xiao-Shan Gao;Jun Zhu", "authorids": "~Shuyu_Cheng1;~Yibo_Miao1;~Yinpeng_Dong2;~Xiao_Yang4;~Xiao-Shan_Gao2;~Jun_Zhu2", "gender": "M;M;M;M;M;M", "homepage": ";http://www.amss.ac.cn/;https://dongyp13.github.io;https://ml.cs.tsinghua.edu.cn/~xiaoyang/;http://www.mmrc.iss.ac.cn/~xgao/;http://ml.cs.tsinghua.edu.cn/~jun", "dblp": ";332/0699;183/0980;57/33851;13/3109;50/2644-1", "google_scholar": "IshqtzsAAAAJ;;6_4ad84AAAAJ;bwkwp0MAAAAJ;_se7GmUAAAAJ;axsP38wAAAAJ", "orcid": ";;;0000-0001-9502-9962;0000-0003-2021-9395;", "linkedin": ";;;;;", "or_profile": "~Shuyu_Cheng1;~Yibo_Miao1;~Yinpeng_Dong2;~Xiao_Yang4;~Xiao-Shan_Gao2;~Jun_Zhu2", "aff": ";Intel;Tsinghua University;Tsinghua University;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Tsinghua University", "aff_domain": ";intel.com;tsinghua.edu.cn;mail.tsinghua.edu.cn;amss.ac.cn;mail.tsinghua.edu.cn", "position": ";Intern;Postdoc;Postdoc;Full Professor;Professor", "bibtex": "@inproceedings{\ncheng2024efficient,\ntitle={Efficient Black-box Adversarial Attacks via Bayesian Optimization Guided by a Function Prior},\nauthor={Shuyu Cheng and Yibo Miao and Yinpeng Dong and Xiao Yang and Xiao-Shan Gao and Jun Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CR6Sl80cn8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1744734, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1889303681683186486&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": ";intel.com;tsinghua.edu.cn;mail.tsinghua.edu.cn;amss.ac.cn;mail.tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Intel;Tsinghua University;Chinese Academy of Sciences", "aff_unique_dep": "Intel Corporation;;Academy of Mathematics and Systems Science", "aff_unique_url": "https://www.intel.com;https://www.tsinghua.edu.cn;http://www.cas.cn", "aff_unique_abbr": "Intel;THU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": 
"United States;China" }, { "title": "CrossGET: Cross-Guided Ensemble of Tokens for Accelerating Vision-Language Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34682", "id": "CSIfCpXhCF", "proceeding": "https://proceedings.mlr.press/v235/shi24e.html", "pdf": "https://openreview.net/pdf?id=CSIfCpXhCF", "openreview": "https://openreview.net/forum?id=CSIfCpXhCF", "author_site": "Dachuan Shi, Chaofan Tao, Anyi Rao, Zhendong Yang, Chun Yuan, Jiaqi Wang", "tldr": "", "abstract": "Recent vision-language models have achieved tremendous advances. However, their computational costs are also escalating dramatically, making model acceleration exceedingly critical. To pursue more efficient vision-language Transformers, this paper introduces Cross-Guided Ensemble of Tokens (CrossGET), a general acceleration framework for vision-language Transformers. This framework adaptively combines tokens in real-time during inference, significantly reducing computational costs while maintaining high performance. CrossGET features two primary innovations: 1) Cross-Guided Matching and Ensemble. CrossGET leverages cross-modal guided token matching and ensemble to effectively utilize cross-modal information, achieving wider applicability across both modality-independent models, e.g., CLIP, and modality-dependent ones, e.g., BLIP2. 2) Complete-Graph Soft Matching. CrossGET introduces an algorithm for the token-matching mechanism, ensuring reliable matching results while facilitating parallelizability and high efficiency. Extensive experiments have been conducted on various vision-language tasks, such as image-text retrieval, visual reasoning, image captioning, and visual question answering. The performance on both classic multimodal architectures and emerging multimodal LLMs demonstrates the framework's effectiveness and versatility. 
The code is available at https://github.com/sdc17/CrossGET.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dachuan Shi;Chaofan Tao;Anyi Rao;Zhendong Yang;Chun Yuan;Jiaqi Wang", "authorids": "~Dachuan_Shi2;~Chaofan_Tao1;~Anyi_Rao2;~Zhendong_Yang2;~Chun_Yuan1;~Jiaqi_Wang1", "gender": "M;M;M;M;M;M", "homepage": "https://www.dachuanshi.com;;http://anyirao.com/;;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;https://myownskyw7.github.io/", "dblp": "283/0549;239/5831;211/7941;14/1820;;44/740-3", "google_scholar": "https://scholar.google.com/citations?hl=en;gjmfLroAAAAJ;8lKr7j4AAAAJ;M9qKrogAAAAJ;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;https://scholar.google.com.hk/citations?user=GDvt570AAAAJ", "orcid": ";;0000-0003-1004-7753;;;", "linkedin": ";;anyirao/;;;", "or_profile": "~Dachuan_Shi2;~Chaofan_Tao1;~Anyi_Rao2;~Zhendong_Yang2;~Chun_Yuan1;~Jiaqi_Wang1", "aff": "Tsinghua University;The University of Hong Kong;Stanford University; Tsinghua University;Tsinghua University;Shanghai AI Laboratory", "aff_domain": "tsinghua.edu.cn;hku.hk;stanford.edu;mails.tsinghua.edu.cn;tsinghua.edu.cn;pjlab.org.cn", "position": "MS student;PhD Student;Postdoc;MS student;Full Professor;Research Scientist", "bibtex": "@inproceedings{\nshi2024crossget,\ntitle={Cross{GET}: Cross-Guided Ensemble of Tokens for Accelerating Vision-Language Transformers},\nauthor={Dachuan Shi and Chaofan Tao and Anyi Rao and Zhendong Yang and Chun Yuan and Jiaqi Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CSIfCpXhCF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2309330, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14864616039322790915&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;hku.hk;stanford.edu;mails.tsinghua.edu.cn;tsinghua.edu.cn;pjlab.org.cn", "author_num": 6, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "Tsinghua University;University of Hong Kong;Stanford University;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hku.hk;https://www.stanford.edu;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "THU;HKU;Stanford;SAIL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Stanford", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Understanding Unimodal Bias in Multimodal Deep Linear Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34681", "id": "CTEMHDSwIj", "proceeding": "https://proceedings.mlr.press/v235/zhang24aa.html", "pdf": "https://openreview.net/pdf?id=CTEMHDSwIj", "openreview": "https://openreview.net/forum?id=CTEMHDSwIj", "author_site": "Yedi Zhang, Peter Latham, Andrew Saxe", "tldr": "", "abstract": "Using multiple input streams simultaneously to train multimodal neural networks is intuitively advantageous but practically challenging. A key challenge is unimodal bias, where a network overly relies on one modality and ignores others during joint training. We develop a theory of unimodal bias with multimodal deep linear networks to understand how architecture and data statistics influence this bias. 
This is the first work to calculate the duration of the unimodal phase in learning as a function of the depth at which modalities are fused within the network, dataset statistics, and initialization. We show that the deeper the layer at which fusion occurs, the longer the unimodal phase. A long unimodal phase can lead to a generalization deficit and permanent unimodal bias in the overparametrized regime. Our results, derived for multimodal linear networks, extend to nonlinear networks in certain settings. Taken together, this work illuminates pathologies of multimodal learning under joint training, showing that late and intermediate fusion architectures can give rise to long unimodal phases and permanent unimodal bias. Our code is available at: https://yedizhang.github.io/unimodal-bias.html.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yedi Zhang;Peter E. Latham;Andrew M Saxe", "authorids": "~Yedi_Zhang3;~Peter_E._Latham1;~Andrew_M_Saxe1", "gender": ";M;M", "homepage": ";http://www.gatsby.ucl.ac.uk/~pel/;https://www.saxelab.org", "dblp": ";18/4059;39/6894", "google_scholar": ";;h0Al1fcAAAAJ", "orcid": ";;0000-0002-9831-8812", "linkedin": ";;", "or_profile": "~Yedi_Zhang3;~Peter_E._Latham1;~Andrew_M_Saxe1", "aff": ";;University College London, University of London", "aff_domain": ";;ucl.ac.uk", "position": ";;Full Professor", "bibtex": "@inproceedings{\nzhang2024understanding,\ntitle={Understanding Unimodal Bias in Multimodal Deep Linear Networks},\nauthor={Yedi Zhang and Peter E. Latham and Andrew M Saxe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CTEMHDSwIj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1779859, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=470535706711609332&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 9, "email": ";;ucl.ac.uk", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Active Preference Learning for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34680", "id": "CTgEV6qgUy", "proceeding": "https://proceedings.mlr.press/v235/muldrew24a.html", "pdf": "https://openreview.net/pdf?id=CTgEV6qgUy", "openreview": "https://openreview.net/forum?id=CTgEV6qgUy", "author_site": "William Muldrew, Peter Hayes, Mingtian Zhang, David Barber", "tldr": "", "abstract": "As large language models (LLMs) become more capable, fine-tuning techniques for aligning with human intent are increasingly important. A key consideration for aligning these models is how to most effectively use human resources, or model resources in the case where LLMs themselves are used as oracles. Reinforcement learning from Human or AI preferences (RLHF/RLAIF) is the most prominent example of such a technique, but is complex and often unstable. Direct Preference Optimization (DPO) has recently been proposed as a simpler and more stable alternative. In this work, we develop an active learning strategy for DPO to make better use of preference labels. 
We propose a practical acquisition function for prompt/completion pairs based on the predictive entropy of the language model and a measure of certainty of the implicit preference model optimized by DPO. We demonstrate how our approach improves both the rate of learning and final performance of fine-tuning on pairwise preference data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Muldrew;Peter Hayes;Mingtian Zhang;David Barber", "authorids": "william.muldrew.22@alumni.ucl.ac.uk;~Peter_Hayes1;~Mingtian_Zhang1;~David_Barber2", "gender": ";;M;", "homepage": ";;http://tomo.wiki;", "dblp": ";;230/8340;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "william.muldrew.22@alumni.ucl.ac.uk;~Peter_Hayes1;~Mingtian_Zhang1;~David_Barber2", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmuldrew2024active,\ntitle={Active Preference Learning for Large Language Models},\nauthor={William Muldrew and Peter Hayes and Mingtian Zhang and David Barber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CTgEV6qgUy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1465261, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1611207786358951496&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";;;", "author_num": 4 }, { "title": "Improving Generalization in Offline Reinforcement Learning via Adversarial Data Splitting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34679", "id": "CV9PiQGt0i", "proceeding": "https://proceedings.mlr.press/v235/wang24aj.html", "pdf": "https://openreview.net/pdf?id=CV9PiQGt0i", "openreview": "https://openreview.net/forum?id=CV9PiQGt0i", "author_site": "Da Wang, Lin Li, Wei Wei, Qixian Yu, Jianye Hao, Jiye Liang", "tldr": "", "abstract": "Offline Reinforcement Learning (RL) commonly suffers from the out-of-distribution (OOD) overestimation issue due to the distribution shift. Prior work gradually shifts its focus from suppressing OOD overestimation to avoiding overly conservative learning from suboptimal behavior policies to improve generalization. However, most approaches explicitly delimit boundaries for OOD actions based on the support in the dataset, which can potentially impede the data near these boundaries from acquiring realistic estimates. This paper investigates how to loosen the rigid demarcation of OOD boundaries, adaptively extracting knowledge from empirical data to implicitly improve the model's generalization to nearby unseen data. We introduce an adversarial data splitting (ADS) framework that forces the model to generalize across distribution shifts simulated by splitting the dataset into train/validation subsets. Specifically, ADS is modeled as a min-max optimization problem inspired by meta-learning and solved by iterating over the following two steps. First, we train the model on the train-subset to minimize its loss on the validation-subset. Then, we adversarially generate the \"hardest\" train/validation subsets with the maximum distribution shift, making the model incapable of generalization at that splitting. We derive a generalization error bound for theoretically understanding ADS and verify the effectiveness with extensive experiments. 
Code is available at https://github.com/DkING-lv6/ADS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Da Wang;Lin Li;Wei Wei;Qixian Yu;Jianye HAO;Jiye Liang", "authorids": "~Da_Wang2;~Lin_Li17;~Wei_Wei13;~Qixian_Yu1;~Jianye_HAO1;~Jiye_Liang1", "gender": "M;F;M;;M;M", "homepage": ";http://cs.sxu.edu.cn/faculty/lecturer/4526/index.htm;;https://github.com/sxu-QixianYu;http://www.icdai.org/jianye.html;https://jiyeliang.github.io/index.html", "dblp": "10/3366;;24/4105-18.html;;21/7664.html;80/6535", "google_scholar": "7Nlkm7IAAAAJ;;;;;iGc61hUAAAAJ", "orcid": ";;;0009-0008-2766-0638;0000-0002-0422-8235;0000-0001-5887-9327", "linkedin": ";;;;;", "or_profile": "~Da_Wang2;~Lin_Li17;~Wei_Wei13;~Qixian_Yu1;~Jianye_HAO1;~Jiye_Liang1", "aff": "Shanxi University;;shanxi university;Shanxi University;Tianjin University;Shanxi University", "aff_domain": "sxu.edu.cn;;sxu.edu.cn;sxu.edu.cn;tju.edu.cn;sxu.edu.cn", "position": "PhD student;;Full Professor;MS student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024improving,\ntitle={Improving Generalization in Offline Reinforcement Learning via Adversarial Data Splitting},\nauthor={Da Wang and Lin Li and Wei Wei and Qixian Yu and Jianye HAO and Jiye Liang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CV9PiQGt0i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 810269, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Xj8WnDwk22sJ:scholar.google.com/&scioq=Improving+Generalization+in+Offline+Reinforcement+Learning+via+Adversarial+Data+Splitting&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "sxu.edu.cn;;sxu.edu.cn;sxu.edu.cn;tju.edu.cn;sxu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Shanxi University;Tianjin University", "aff_unique_dep": ";", "aff_unique_url": "http://www.sxu.edu.cn;http://www.tju.edu.cn", "aff_unique_abbr": "SXU;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "CosPGD: an efficient white-box adversarial attack for pixel-wise prediction tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34678", "id": "CXZqGJonmt", "proceeding": "https://proceedings.mlr.press/v235/agnihotri24b.html", "pdf": "https://openreview.net/pdf?id=CXZqGJonmt", "openreview": "https://openreview.net/forum?id=CXZqGJonmt", "author_site": "Shashank Agnihotri, Steffen Jung, Margret Keuper", "tldr": "", "abstract": "While neural networks allow highly accurate predictions in many tasks, their lack of robustness towards even slight input perturbations often hampers their deployment. Adversarial attacks such as the seminal _projected gradient descent_ (PGD) offer an effective means to evaluate a model's robustness and dedicated solutions have been proposed for attacks on semantic segmentation or optical flow estimation. While they attempt to increase the attack's efficiency, a further objective is to balance its effect, so that it acts on the entire image domain instead of isolated point-wise predictions. This often comes at the cost of optimization stability and thus efficiency. Here, we propose CosPGD, an attack that encourages more balanced errors over the entire image domain while increasing the attack's overall efficiency. 
To this end, CosPGD leverages a simple alignment score computed from any pixel-wise prediction and its target to scale the loss in a smooth and fully differentiable way. It leads to efficient evaluations of a model's robustness for semantic segmentation as well as regression models (such as optical flow, disparity estimation, or image restoration), and allows CosPGD to outperform the previous SotA attack on semantic segmentation. We provide code for the CosPGD algorithm and example usage at https://github.com/shashankskagnihotri/cospgd.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shashank Agnihotri;Steffen Jung;Margret Keuper", "authorids": "~Shashank_Agnihotri1;~Steffen_Jung1;~Margret_Keuper1", "gender": "M;M;F", "homepage": "https://www.uni-mannheim.de/dws/people/researchers/phd-students/shashank/;http://jung.vision;https://www.vc.informatik.uni-siegen.de/en/keuper-margret", "dblp": ";252/0087-1;95/7589", "google_scholar": "vhm_xu8AAAAJ;x5ovaJcAAAAJ;https://scholar.google.de/citations?user=KMqMQAcAAAAJ", "orcid": "0000-0001-6097-8551;0000-0001-8021-791X;0000-0002-8437-7993", "linkedin": "shashank-agnihotri/;jung-vision/;", "or_profile": "~Shashank_Agnihotri1;~Steffen_Jung1;~Margret_Keuper1", "aff": "Universit\u00e4t Siegen;Saarland Informatics Campus, Max-Planck Institute;Max Planck Institute for Informatics", "aff_domain": "uni-siegen.de;mpi-inf.mpg.de;mpi-inf.mpg", "position": "PhD student;PhD student;Researcher", "bibtex": "@inproceedings{\nagnihotri2024cospgd,\ntitle={Cos{PGD}: an efficient white-box adversarial attack for pixel-wise prediction tasks},\nauthor={Shashank Agnihotri and Steffen Jung and Margret Keuper},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CXZqGJonmt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3312457, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2686630512059451311&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 9, "email": "uni-siegen.de;mpi-inf.mpg.de;mpi-inf.mpg", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Siegen;Max-Planck Institute;Max Planck Institute for Informatics", "aff_unique_dep": ";Informatics;", "aff_unique_url": "https://www.uni-siegen.de;https://www.mpi-sws.org;https://mpi-inf.mpg.de", "aff_unique_abbr": "Uni Siegen;MPI-SWS;MPII", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saarland", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Reservoir Computing for Short High-Dimensional Time Series: an Application to SARS-CoV-2 Hospitalization Forecast", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34677", "id": "CY0lFwD4qx", "proceeding": "https://proceedings.mlr.press/v235/ferte24a.html", "pdf": "https://openreview.net/pdf?id=CY0lFwD4qx", "openreview": "https://openreview.net/forum?id=CY0lFwD4qx", "author_site": "Thomas Fert\u00e9, Dutartre Dan, Boris Hejblum, Romain Griffier, Vianney Jouhet, Rodolphe Thi\u00e9baut, Pierrick Legrand, Xavier Hinaut", "tldr": "", "abstract": "In this work, we aimed to forecast the number of SARS-CoV-2 hospitalized patients at 14 days to help anticipate the bed requirements of a large-scale hospital using public data and electronic health records data. 
Previous attempts led to mixed performance in this high-dimensional setting; we introduce a novel approach to time series forecasting by providing an alternative to conventional methods to deal with the high number of potential features of interest (409 predictors). We integrate Reservoir Computing (RC) with feature selection using a genetic algorithm (GA) to gather optimal non-linear combinations of inputs to improve prediction in a sample-efficient context. We illustrate that the RC-GA combination exhibits excellent performance in forecasting SARS-CoV-2 hospitalizations. This approach outperformed the use of RC alone and other conventional methods: LSTM, Transformers, Elastic-Net, XGBoost. Notably, this work marks the pioneering use of RC (along with GA) in the realm of short and high-dimensional time series, positioning it as a competitive and innovative approach in comparison to standard methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Fert\u00e9;Dan Dutartre;Boris P Hejblum;Romain Griffier;Vianney Jouhet;Rodolphe Thi\u00e9baut;Pierrick Legrand;Xavier Hinaut", "authorids": "~Thomas_Fert\u00e91;dan.dutartre@inria.fr;~Boris_P_Hejblum1;romain.griffier@chu-bordeaux.fr;vianney.jouhet@chu-bordeaux.fr;rodolphe.thiebaut@u-bordeaux.fr;pierrick.legrand@u-bordeaux.fr;~Xavier_Hinaut1", "gender": "M;;M;;;;;M", "homepage": ";;https://borishejblum.science;;;;;https://www.xavierhinaut.com", "dblp": ";;;;;;;118/8222", "google_scholar": ";;;;;;;pNW4eZAAAAAJ", "orcid": "0000-0001-8455-4665;;;;;;;0000-0002-1924-1184", "linkedin": ";;;;;;;", "or_profile": "~Thomas_Fert\u00e91;dan.dutartre@inria.fr;~Boris_P_Hejblum1;romain.griffier@chu-bordeaux.fr;vianney.jouhet@chu-bordeaux.fr;rodolphe.thiebaut@u-bordeaux.fr;pierrick.legrand@u-bordeaux.fr;~Xavier_Hinaut1", "aff": "University of Bordeaux;;University of Bordeaux;;;;;INRIA", "aff_domain": "u-bordeaux.fr;;u-bordeaux.fr;;;;;inria.fr", "position": "PhD student;;Assistant Professor;;;;;Researcher", "bibtex": "@inproceedings{\nfert{\\'e}2024reservoir,\ntitle={Reservoir Computing for Short High-Dimensional Time Series: an Application to {SARS}-CoV-2 Hospitalization Forecast},\nauthor={Thomas Fert{\\'e} and Dan Dutartre and Boris P Hejblum and Romain Griffier and Vianney Jouhet and Rodolphe Thi{\\'e}baut and Pierrick Legrand and Xavier Hinaut},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CY0lFwD4qx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2140851, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5526923647881389775&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "u-bordeaux.fr;;u-bordeaux.fr;;;;;inria.fr", "author_num": 8, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Bordeaux;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-bordeaux.fr;https://www.inria.fr", "aff_unique_abbr": "UBordeaux;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Convergence of Some Convex Message Passing Algorithms to a Fixed Point", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34676", "id": "CaxQ5IbHgF", "proceeding": "https://proceedings.mlr.press/v235/voracek24a.html", "pdf": "https://openreview.net/pdf?id=CaxQ5IbHgF", "openreview": 
"https://openreview.net/forum?id=CaxQ5IbHgF", "author_site": "V\u00e1clav Vor\u00e1\u010dek, Tom\u00e1\u0161 Werner", "tldr": "", "abstract": "A popular approach to the MAP inference problem in graphical models is to minimize an upper bound obtained from a dual linear programming or Lagrangian relaxation by (block-)coordinate descent. This is also known as convex/convergent message passing; examples are max-sum diffusion and sequential tree-reweighted message passing (TRW-S). Convergence properties of these methods are currently not fully understood. They have been proved to converge to the set characterized by local consistency of active constraints, with unknown convergence rate; however, it was not clear if the iterates converge at all (to any point). We prove a stronger result (conjectured before but never proved): the iterates converge to a fixed point of the method. Moreover, we show that the algorithm terminates within $\\mathcal{O}(1/\\varepsilon)$ iterations. We first prove this for a version of coordinate descent applied to a general piecewise-affine convex objective. Then we show that several convex message passing methods are special cases of this method. Finally, we show that a slightly different version of coordinate descent can cycle.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vaclav Voracek;Tomas Werner", "authorids": "~Vaclav_Voracek1;~Tomas_Werner1", "gender": "M;M", "homepage": ";http://cmp.felk.cvut.cz/~werner/", "dblp": "292/8831.html;02/227.html", "google_scholar": "Db13d44AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-6161-7157", "linkedin": ";", "or_profile": "~Vaclav_Voracek1;~Tomas_Werner1", "aff": "University of Tuebingen;Czech Technical Univeresity in Prague", "aff_domain": "uni-tuebingen.de;fel.cvut.cz", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nvoracek2024convergence,\ntitle={Convergence of Some Convex Message Passing Algorithms to a Fixed Point},\nauthor={Vaclav Voracek and Tomas Werner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CaxQ5IbHgF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 362189, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CYfms2ghrBkJ:scholar.google.com/&scioq=Convergence+of+Some+Convex+Message+Passing+Algorithms+to+a+Fixed+Point&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "uni-tuebingen.de;fel.cvut.cz", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Tuebingen;Czech Technical University in Prague", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.ctu.cz", "aff_unique_abbr": "Uni T\u00fcbingen;CTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Czech Republic" }, { "title": "Bayesian Program Learning by Decompiling Amortized Knowledge", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34675", "id": "CbIRQgAYE4", "proceeding": "https://proceedings.mlr.press/v235/palmarini24a.html", "pdf": "https://openreview.net/pdf?id=CbIRQgAYE4", "openreview": "https://openreview.net/forum?id=CbIRQgAYE4", "author_site": "Alessandro Palmarini, Christopher Lucas, Siddharth N", "tldr": "", "abstract": "DreamCoder is an inductive program synthesis system that, whilst solving 
problems, learns to simplify search in an iterative wake-sleep procedure. The cost of search is amortized by training a neural search policy, reducing search breadth and effectively \"compiling\" useful information to compose program solutions across tasks. Additionally, a library of program components is learnt to compress and express discovered solutions in fewer components, reducing search depth. We present a novel approach for library learning that directly leverages the neural search policy, effectively \"decompiling\" its amortized knowledge to extract relevant program components. This provides stronger amortized inference: the amortized knowledge learnt to reduce search breadth is now also used to reduce search depth. We integrate our approach with DreamCoder and demonstrate faster domain proficiency with improved generalization on a range of domains, particularly when fewer example solutions are available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alessandro B. Palmarini;Christopher G. Lucas;Siddharth N", "authorids": "~Alessandro_B._Palmarini1;~Christopher_G._Lucas1;~Siddharth_N1", "gender": ";M;M", "homepage": "http://christopherglucas.com;https://homepages.inf.ed.ac.uk/snaraya3/;https://www.santafe.edu/people/profile/alessandro-palmarini", "dblp": "69/3093;67/8366;", "google_scholar": ";V7D7hxMAAAAJ;gxB8nYQAAAAJ", "orcid": ";0000-0003-4911-7333;", "linkedin": ";;", "or_profile": "~Christopher_G._Lucas1;~Siddharth_N1;~Alessandro_Blair_Palmarini1", "aff": "University of Edinburgh, University of Edinburgh;University of Edinburgh;Santa Fe Institute", "aff_domain": "ed.ac.uk;ed.ac.uk;santafe.edu", "position": "Assistant Professor;Reader (Associate Professor);Intern", "bibtex": "@inproceedings{\npalmarini2024bayesian,\ntitle={Bayesian Program Learning by Decompiling Amortized Knowledge},\nauthor={Alessandro B. Palmarini and Christopher G. Lucas and Siddharth N},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CbIRQgAYE4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9983331, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wkUBAIg4UE0J:scholar.google.com/&scioq=Bayesian+Program+Learning+by+Decompiling+Amortized+Knowledge&hl=en&as_sdt=0,14", "gs_version_total": 5, "email": "ed.ac.uk;ed.ac.uk;santafe.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Edinburgh;Santa Fe Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://www.santafe.edu", "aff_unique_abbr": "Edinburgh;SFI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Online Isolation Forest", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34674", "id": "CbIZatwz9z", "proceeding": "https://proceedings.mlr.press/v235/leveni24a.html", "pdf": "https://openreview.net/pdf?id=CbIZatwz9z", "openreview": "https://openreview.net/forum?id=CbIZatwz9z", "author_site": "Filippo Leveni, Guilherme Weigert Cassales, Bernhard Pfahringer, Albert Bifet, Giacomo Boracchi", "tldr": "", "abstract": "The anomaly detection literature is abundant with offline methods, which require repeated access to data in memory, and impose impractical assumptions when applied to a streaming context. 
Existing online anomaly detection methods also generally fail to address these constraints, resorting to periodic retraining to adapt to the online context. We propose Online-iForest, a novel method explicitly designed for streaming conditions that seamlessly tracks the data generating process as it evolves over time. Experimental validation on real-world datasets demonstrated that Online-iForest is on par with online alternatives and closely rivals state-of-the-art offline anomaly detection techniques that undergo periodic retraining. Notably, Online-iForest consistently outperforms all competitors in terms of efficiency, making it a promising solution in applications where fast identification of anomalies is of primary importance such as cybersecurity, fraud and fault detection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Filippo Leveni;Guilherme Weigert Cassales;Bernhard Pfahringer;Albert Bifet;Giacomo Boracchi", "authorids": "~Filippo_Leveni1;~Guilherme_Weigert_Cassales1;~Bernhard_Pfahringer1;~Albert_Bifet1;~Giacomo_Boracchi2", "gender": "M;M;M;M;M", "homepage": ";https://profiles.waikato.ac.nz/guilherme.weigertcassales;https://profiles.waikato.ac.nz/bernhard.pfahringer;https://albertbifet.com/;http://home.deib.polimi.it/boracchi/", "dblp": "291/7909;257/5616;10/140;48/1070;53/1616", "google_scholar": "dYX0BRwAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.nz/citations?user=PEv3OQUAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.it/citations?user=lBa_mnYAAAAJ", "orcid": "0009-0007-7745-5686;0000-0003-4029-2047;0000-0002-3732-5787;0000-0002-8339-7773;0000-0002-1650-3054", "linkedin": ";;;abifet/;giacomoboracchi/", "or_profile": "~Filippo_Leveni1;~Guilherme_Weigert_Cassales1;~Bernhard_Pfahringer1;~Albert_Bifet1;~Giacomo_Boracchi2", "aff": "Politecnico di Milano;University of Waikato;The University of Waikato;T\u00e9l\u00e9com Paris;Polytechnic Institute of Milan", "aff_domain": "polimi.it;waikato.ac.nz;waikato.ac.nz;telecom-paris.fr;polimi.it", "position": "PhD student;Postdoc;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nleveni2024online,\ntitle={Online Isolation Forest},\nauthor={Filippo Leveni and Guilherme Weigert Cassales and Bernhard Pfahringer and Albert Bifet and Giacomo Boracchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CbIZatwz9z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2686358, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18367796601293336012&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "polimi.it;waikato.ac.nz;waikato.ac.nz;telecom-paris.fr;polimi.it", "author_num": 5, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "Politecnico di Milano;University of Waikato;T\u00e9l\u00e9com Paris;Polytechnic Institute of Milan", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.polimi.it;https://www.waikato.ac.nz;https://www.telecom-paris.fr;https://www.polimi.it/", "aff_unique_abbr": "Polimi;UoW;T\u00e9l\u00e9com Paris;Politecnico di Milano", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;0", "aff_country_unique": "Italy;New Zealand;France" }, { "title": "OAK: Enriching Document Representations using Auxiliary Knowledge for Extreme Classification", "status": "Poster", "track": "main", 
"site": "https://icml.cc/virtual/2024/poster/34673", "id": "Cbacx90Wkt", "proceeding": "https://proceedings.mlr.press/v235/mohan24a.html", "pdf": "https://openreview.net/pdf?id=Cbacx90Wkt", "openreview": "https://openreview.net/forum?id=Cbacx90Wkt", "author_site": "Shikhar Mohan, Deepak Saini, Anshul Mittal, Sayak Ray Chowdhury, Bhawna Paliwal, Jian Jiao, Manish Gupta, Manik Varma", "tldr": "", "abstract": "The objective in eXtreme Classification (XC) is to find relevant labels for a document from an exceptionally large label space. Most XC application scenarios have rich auxiliary data associated with the input documents, e.g., frequently clicked webpages for search queries in sponsored search. Unfortunately, most of the existing XC methods do not use any auxiliary data. In this paper, we propose a novel framework, Online Auxiliary Knowledge (OAK), which harnesses auxiliary information linked to the document to improve XC accuracy. OAK stores information learnt from the auxiliary data in a knowledge bank and during a forward pass, retrieves relevant auxiliary knowledge embeddings for a given document. An enriched embedding is obtained by fusing these auxiliary knowledge embeddings with the document's embedding, thereby enabling much more precise candidate label selection and final classification. OAK training involves three stages. (1) Training a linker module to link documents to relevant auxiliary data points. (2) Learning an embedding for documents enriched using linked auxiliary information. (3) Using the enriched document embeddings to learn the final classifiers. OAK outperforms current state-of-the-art XC methods by up to $\\sim 5 \\%$ on academic datasets, and by $\\sim 3 \\%$ on an auxiliary data-augmented variant of LF-ORCAS-800K dataset in Precision@1. 
OAK also demonstrates statistically significant improvements in sponsored search metrics when deployed on a large scale search engine.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikhar Mohan;Deepak Saini;Anshul Mittal;Sayak Ray Chowdhury;Bhawna Paliwal;Jian Jiao;Manish Gupta;Manik Varma", "authorids": "~Shikhar_Mohan1;~Deepak_Saini2;~Anshul_Mittal2;~Sayak_Ray_Chowdhury1;~Bhawna_Paliwal1;~Jian_Jiao2;~Manish_Gupta4;~Manik_Varma1", "gender": "M;M;M;M;F;M;M;M", "homepage": ";https://deepaksaini119.github.io/;http://anshulmittal.org;https://sites.google.com/view/sayakraychowdhury/home;;;http://manikvarma.org;https://sites.google.com/view/manishg/", "dblp": "304/4729;289/2228;;195/8152;302/2497;29/265-7.html;07/918.html;g/ManishGupta1.html", "google_scholar": ";3FICzBYAAAAJ;8TDNQMQAAAAJ;Q0_CaxYAAAAJ;7BxDLWcAAAAJ;D6KwmF8AAAAJ;https://scholar.google.gr/citations?user=2efybZkAAAAJ;https://scholar.google.co.in/citations?user=eX9PSu0AAAAJ", "orcid": "0009-0007-1689-0457;0000-0002-6057-4351;0000-0002-4137-0126;;;0000-0003-4779-9588;0000-0003-4516-6613;0000-0002-2843-3110", "linkedin": "shikhar-mohan-a38606171/;;anshumitts/;sayak-ray-chowdhury-54878154/;;jian-jiao-82897810/;;manishsgupta/", "or_profile": "~Shikhar_Mohan1;~Deepak_Saini2;~Anshul_Mittal2;~Sayak_Ray_Chowdhury1;~Bhawna_Paliwal1;~Jian_Jiao2;~Manik_Varma1;~Manish_Gupta1", "aff": "Microsoft Research;Microsoft;Indian Institute of Technology Delhi;Microsoft Research;Microsoft Research;Microsoft;Microsoft Research;Microsoft", "aff_domain": "research.microsoft.com;microsoft.com;iitd.ac.in;microsoft.com;microsoft.com;microsoft.com;research.microsoft.com;microsoft.com", "position": "Research Fellow;Researcher;PhD student;Postdoc;Research Engineer;Principal Researcher;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\nmohan2024oak,\ntitle={{OAK}: Enriching Document Representations using Auxiliary Knowledge for Extreme Classification},\nauthor={Shikhar Mohan and Deepak Saini and Anshul Mittal and Sayak Ray Chowdhury and Bhawna Paliwal and Jian Jiao and Manish Gupta and Manik Varma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Cbacx90Wkt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 668165, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11055522446834467376&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "research.microsoft.com;microsoft.com;iitd.ac.in;microsoft.com;microsoft.com;microsoft.com;research.microsoft.com;microsoft.com", "author_num": 8, "aff_unique_index": "0;0;1;0;0;0;0;0", "aff_unique_norm": "Microsoft;Indian Institute of Technology Delhi", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.iitd.ac.in", "aff_unique_abbr": "MSR;IIT Delhi", "aff_campus_unique_index": "1", "aff_campus_unique": ";Delhi", "aff_country_unique_index": "0;0;1;0;0;0;0;0", "aff_country_unique": "United States;India" }, { "title": "Improving Robustness to Multiple Spurious Correlations by Multi-Objective Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34672", "id": "CbbTF6tDhW", "proceeding": "https://proceedings.mlr.press/v235/kim24l.html", "pdf": "https://openreview.net/pdf?id=CbbTF6tDhW", "openreview": "https://openreview.net/forum?id=CbbTF6tDhW", "author_site": "Nayeong Kim, Juwon Kang, 
Sungsoo Ahn, Jungseul Ok, Suha Kwak", "tldr": "", "abstract": "We study the problem of training an unbiased and accurate model given a dataset with multiple biases. This problem is challenging since the multiple biases cause multiple undesirable shortcuts during training, and even worse, mitigating one may exacerbate the other. We propose a novel training method to tackle this challenge. Our method first groups training data so that different groups induce different shortcuts, and then optimizes a linear combination of group-wise losses while adjusting their weights dynamically to alleviate conflicts between the groups in performance; this approach, rooted in multi-objective optimization theory, encourages convergence to the minimax Pareto solution. We also present a new benchmark with multiple biases, dubbed MultiCelebA, for evaluating debiased training methods under realistic and challenging scenarios. Our method achieved the best performance on three datasets with multiple biases, and also showed superior performance on conventional single-bias datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nayeong Kim;Juwon Kang;Sungsoo Ahn;Jungseul Ok;Suha Kwak", "authorids": "~Nayeong_Kim1;~Juwon_Kang1;~Sungsoo_Ahn1;~Jungseul_Ok2;~Suha_Kwak3", "gender": "F;M;M;M;M", "homepage": "https://nayeong-v-kim.github.io/;http://cvlab.postech.ac.kr/lab/members.php;https://sungsooahn.super.site/;https://sites.google.com/view/jungseulok;https://suhakwak.github.io/", "dblp": "322/6051;214/2255;90/5164;117/3448;65/6173", "google_scholar": "XqX08VAAAAAJ;nwN9X2UAAAAJ;XTenHs0AAAAJ;KWG3UUMAAAAJ;-gscDIEAAAAJ", "orcid": ";;;0000-0003-4742-2473;", "linkedin": "nayeong-kim-ab1245106/;;;;", "or_profile": "~Nayeong_Kim1;~Juwon_Kang1;~Sungsoo_Ahn1;~Jungseul_Ok2;~Suha_Kwak3", "aff": "POSTECH;POSTECH;Pohang University of Science and Technology;POSTECH;POSTECH", "aff_domain": "postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.ac.kr", "position": "PhD student;PhD student;Assistant Professor;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nkim2024improving,\ntitle={Improving Robustness to Multiple Spurious Correlations by Multi-Objective Optimization},\nauthor={Nayeong Kim and Juwon Kang and Sungsoo Ahn and Jungseul Ok and Suha Kwak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CbbTF6tDhW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2378421, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16888605883592740252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.ac.kr;postech.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Learning Iterative Reasoning through Energy Diffusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34671", "id": "CduFAALvGe", "proceeding": "https://proceedings.mlr.press/v235/du24f.html", "pdf": "https://openreview.net/pdf?id=CduFAALvGe", "openreview": "https://openreview.net/forum?id=CduFAALvGe", "author_site": "Yilun Du,
Jiayuan Mao, Josh Tenenbaum", "tldr": "", "abstract": "We introduce iterative reasoning through energy diffusion (IRED), a novel framework for learning to reason for a variety of tasks by formulating reasoning and decision-making problems with energy-based optimization. IRED learns energy functions to represent the constraints between input conditions and desired outputs. After training, IRED adapts the number of optimization steps during inference based on problem difficulty, enabling it to solve problems outside its training distribution --- such as more complex Sudoku puzzles, matrix completion with large value magnitudes, and path finding in larger graphs. Key to our method\u2019s success are two novel techniques: learning a sequence of annealed energy landscapes for easier inference and a combination of score function and energy landscape supervision for faster and more stable training. Our experiments show that IRED outperforms existing methods in continuous-space reasoning, discrete-space reasoning, and planning tasks, particularly in more challenging scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yilun Du;Jiayuan Mao;Joshua B. Tenenbaum", "authorids": "~Yilun_Du1;~Jiayuan_Mao1;~Joshua_B._Tenenbaum1", "gender": ";F;", "homepage": "https://yilundu.github.io;http://jiayuanm.com;", "dblp": "204/4379;200/8283;t/JoshuaBTenenbaum", "google_scholar": ";-xaOIZIAAAAJ;", "orcid": ";0000-0003-4798-3748;", "linkedin": ";;", "or_profile": "~Yilun_Du1;~Jiayuan_Mao1;~Joshua_B._Tenenbaum1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\ndu2024learning,\ntitle={Learning Iterative Reasoning through Energy Diffusion},\nauthor={Yilun Du and Jiayuan Mao and Joshua B. Tenenbaum},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CduFAALvGe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3671480, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2848952209700039499&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Reference Neural Operators: Learning the Smooth Dependence of Solutions of PDEs on Geometric Deformations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34670", "id": "CecY6XiUfu", "proceeding": "https://proceedings.mlr.press/v235/cheng24c.html", "pdf": "https://openreview.net/pdf?id=CecY6XiUfu", "openreview": "https://openreview.net/forum?id=CecY6XiUfu", "author_site": "Ze Cheng, Zhongkai Hao, Wang Xiaoqiang, Jianing Huang, Youjia Wu, Xudan Liu, Yiru Zhao, LIU SONGMING, Hang Su", "tldr": "", "abstract": "For partial differential equations on domains of arbitrary shapes, existing works of neural operators attempt to learn a mapping from geometries to solutions.
It often requires a large dataset of geometry-solution pairs in order to obtain a sufficiently accurate neural operator. However, for many industrial applications, e.g., engineering design optimization, it can be prohibitive to satisfy the requirement since even a single simulation may take hours or days of computation. To address this issue, we propose *reference neural operators* (RNO), a novel way of implementing neural operators, i.e., to learn the smooth dependence of solutions on geometric deformations. Specifically, given a reference solution, RNO can predict solutions corresponding to arbitrary deformations of the referred geometry. This approach turns out to be much more data efficient. Through extensive experiments, we show that RNO can learn the dependence across various types and different numbers of geometry objects with relatively small datasets. RNO outperforms baseline models in accuracy by a large margin and achieves up to 80% error reduction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ze Cheng;Zhongkai Hao;Xiaoqiang Wang;Jianing Huang;Youjia Wu;Xudan Liu;Yiru Zhao;Songming Liu;Hang Su", "authorids": "~Ze_Cheng2;~Zhongkai_Hao1;xiaoqiang.wang2@cn.bosch.com;jianing.huang@cn.bosch.com;~Youjia_Wu1;xudan.liu@cn.bosch.com;yiru.zhao@cn.bosch.com;~Songming_Liu1;~Hang_Su3", "gender": "M;;;;M;;;M;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;285/4585;", "google_scholar": "Kbwi2AMAAAAJ;;;;;;;6urFg8kAAAAJ;", "orcid": ";;;;0000-0002-7399-1306;;;;", "linkedin": ";;;;;;;%E6%9D%BE%E9%93%AD-%E5%88%98-7b8339254/;", "or_profile": "~Ze_Cheng2;~Zhongkai_Hao1;xiaoqiang.wang2@cn.bosch.com;jianing.huang@cn.bosch.com;~Youjia_Wu1;xudan.liu@cn.bosch.com;yiru.zhao@cn.bosch.com;~Songming_Liu1;~Hang_Su3", "aff": "Bosch Artificial Intelligence Center;;;;Bosch;;;Tsinghua University;", "aff_domain": "cn.bosch.com;;;;bosch.com;;;mails.tsinghua.edu.cn;", "position": "Researcher;;;;Researcher;;;PhD student;", "bibtex": "@inproceedings{\ncheng2024reference,\ntitle={Reference Neural Operators: Learning the Smooth Dependence of Solutions of {PDE}s on Geometric Deformations},\nauthor={Ze Cheng and Zhongkai Hao and Xiaoqiang Wang and Jianing Huang and Youjia Wu and Xudan Liu and Yiru Zhao and Songming Liu and Hang Su},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CecY6XiUfu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1911330, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9655914017255580677&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "email": "cn.bosch.com;;;;bosch.com;;;mails.tsinghua.edu.cn;", "author_num": 9, "aff_unique_index": "0;1;2", "aff_unique_norm": "Bosch;Robert Bosch GmbH;Tsinghua University", "aff_unique_dep": "Artificial Intelligence Center;;", "aff_unique_url": "https://www.bosch.com;https://www.bosch.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Bosch AI;Bosch;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Germany;China" }, { "title": "Interpreting and Improving Large Language Models in Arithmetic Calculation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34669", "id": "CfOtiepP8s", "proceeding": "https://proceedings.mlr.press/v235/zhang24bk.html", "pdf": "https://openreview.net/pdf?id=CfOtiepP8s", "openreview":
"https://openreview.net/forum?id=CfOtiepP8s", "author_site": "Wei Zhang, Wan Chaoqun, Yonggang Zhang, Yiu Ming Cheung, Xinmei Tian, Xu Shen, Jieping Ye", "tldr": "", "abstract": "Large language models (LLMs) have demonstrated remarkable potential across numerous applications and have shown an emergent ability to tackle complex reasoning tasks, such as mathematical computations. However, even for the simplest arithmetic calculations, the intrinsic mechanisms behind LLMs remains mysterious, making it challenging to ensure reliability. In this work, we delve into uncovering a specific mechanism by which LLMs execute calculations. Through comprehensive experiments, we find that LLMs frequently involve a small fraction (<5%) of attention heads, which play a pivotal role in focusing on operands and operators during calculation processes. Subsequently, the information from these operands is processed through multi-layer perceptrons (MLPs), progressively leading to the final solution. These pivotal heads/MLPs, though identified on a specific dataset, exhibit transferability across different datasets and even distinct tasks. This insight prompted us to investigate the potential benefits of selectively fine-tuning these essential heads/MLPs to boost the LLMs' computational performance. We empirically find that such precise tuning can yield notable enhancements on mathematical prowess, without compromising the performance on non-mathematical tasks. Our work serves as a preliminary exploration into the arithmetic calculation abilities inherent in LLMs, laying a solid foundation to reveal more intricate mathematical tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Zhang;Chaoqun Wan;Yonggang Zhang;Yiu-ming Cheung;Xinmei Tian;Xu Shen;Jieping Ye", "authorids": "~Wei_Zhang58;~Chaoqun_Wan2;~Yonggang_Zhang1;~Yiu-ming_Cheung1;~Xinmei_Tian1;~Xu_Shen1;~Jieping_Ye4", "gender": ";;M;;F;M;M", "homepage": ";;https://yonggangzhangben.github.io/index.html;;https://faculty.ustc.edu.cn/tianxinmei1/zh_CN/index.htm;;http://yelabs.net/", "dblp": ";;27/6859-3;;03/5204-1;09/10130-1.html;03/5454", "google_scholar": ";;XSbEr98AAAAJ;;https://scholar.google.com.au/citations?hl=zh-CN;38jwGs8AAAAJ;T9AzhwcAAAAJ", "orcid": ";;0000-0002-4080-7592;;0000-0002-5952-8753;;0000-0001-8662-5818", "linkedin": ";;;;;;", "or_profile": "~Wei_Zhang58;~Chaoqun_Wan2;~Yonggang_Zhang1;~Yiu-ming_Cheung1;~Xinmei_Tian1;~Xu_Shen1;~Jieping_Ye4", "aff": ";;Hong Kong Baptist University;;University of Science and Technology of China;Alibaba Group;Alibaba Group", "aff_domain": ";;hkbu.edu.hk;;ustc.edu.cn;alibaba-inc.com;alibaba-inc.com", "position": ";;Postdoc;;Full Professor;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nzhang2024interpreting,\ntitle={Interpreting and Improving Large Language Models in Arithmetic Calculation},\nauthor={Wei Zhang and Chaoqun Wan and Yonggang Zhang and Yiu-ming Cheung and Xinmei Tian and Xu Shen and Jieping Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CfOtiepP8s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7829873, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11203321937999602617&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": ";;hkbu.edu.hk;;ustc.edu.cn;alibaba-inc.com;alibaba-inc.com", "author_num": 7, "aff_unique_index": "0;1;2;2", "aff_unique_norm": 
"Hong Kong Baptist University;University of Science and Technology of China;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hkbu.edu.hk;http://www.ustc.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "HKBU;USTC;Alibaba", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Provable Interactive Learning with Hindsight Instruction Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34668", "id": "CgO2cuWWLV", "proceeding": "https://proceedings.mlr.press/v235/misra24a.html", "pdf": "https://openreview.net/pdf?id=CgO2cuWWLV", "openreview": "https://openreview.net/forum?id=CgO2cuWWLV", "author_site": "Dipendra Misra, Aldo Pacchiano, Robert Schapire", "tldr": "", "abstract": "We study interactive learning in a setting where the agent has to generate a response (e.g., an action or trajectory) given a context and an instruction. In contrast, to typical approaches that train the system using reward or expert supervision on response, we study _learning with hindsight labeling_ where a teacher provides an instruction that is most suitable for the agent's generated response. This hindsight labeling of instruction is often easier to provide than providing expert supervision of the optimal response which may require expert knowledge or can be impractical to elicit. We initiate the theoretical analysis of _interactive learning with hindsight labeling_. We first provide a lower bound showing that in general, the regret of any algorithm must scale with the size of the agent's response space. Next, we study a specialized setting where the underlying instruction-response distribution can be decomposed as a low-rank matrix. We introduce an algorithm called LORIL for this setting and show that it is a no-regret algorithm with the regret scaling with $\\sqrt{T}$ and depends on the _intrinsic rank_ but does not depend on the agent's response space. We provide experiments showing the performance of LORIL in practice for 2 domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dipendra Misra;Aldo Pacchiano;Robert E. Schapire", "authorids": "~Dipendra_Misra1;~Aldo_Pacchiano1;~Robert_E._Schapire1", "gender": "M;M;", "homepage": "https://dipendramisra.com/;https://www.aldopacchiano.ai;", "dblp": "218/6569;129/6338;", "google_scholar": "rIoPIFsAAAAJ;no_BfYgAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dipendra_Misra1;~Aldo_Pacchiano1;~Robert_E._Schapire1", "aff": "Microsoft Research;Broad Institute;", "aff_domain": "microsoft.com;broadinstitute.org;", "position": "Researcher;Postdoc;", "bibtex": "@inproceedings{\nmisra2024provable,\ntitle={Provable Interactive Learning with Hindsight Instruction Feedback},\nauthor={Dipendra Misra and Aldo Pacchiano and Robert E. 
Schapire},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CgO2cuWWLV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1008541, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3532622567130936546&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "microsoft.com;broadinstitute.org;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Broad Institute", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.broadinstitute.org", "aff_unique_abbr": "MSR;Broad", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "IOI: Invisible One-Iteration Adversarial Attack on No-Reference Image- and Video-Quality Metrics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34667", "id": "Chy4rSqy4Y", "proceeding": "https://proceedings.mlr.press/v235/shumitskaya24a.html", "pdf": "https://openreview.net/pdf?id=Chy4rSqy4Y", "openreview": "https://openreview.net/forum?id=Chy4rSqy4Y", "author_site": "Ekaterina Shumitskaya, Anastasia Antsiferova, Dmitriy Vatolin", "tldr": "", "abstract": "No-reference image- and video-quality metrics are widely used in video processing benchmarks. The robustness of learning-based metrics under video attacks has not been widely studied. In addition to being successful, attacks on metrics that can be employed in video processing benchmarks must be fast and imperceptible. This paper introduces an Invisible One-Iteration (IOI) adversarial attack on no-reference image and video quality metrics. The proposed method uses two modules to ensure high visual quality and temporal stability of adversarial videos and runs for one iteration, which makes it fast. We compared our method with eight prior approaches using image and video datasets via objective and subjective tests. Our method exhibited superior visual quality across various attacked metric architectures while maintaining comparable attack success and speed. We made the code available on GitHub: https://github.com/katiashh/ioi-attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ekaterina Shumitskaya;Anastasia Antsiferova;Dmitriy S. Vatolin", "authorids": "~Ekaterina_Shumitskaya1;~Anastasia_Antsiferova1;~Dmitriy_S._Vatolin1", "gender": "F;F;M", "homepage": ";;", "dblp": "332/1835;215/2614;60/2567", "google_scholar": "5JyXGA4AAAAJ;lJ-GGU8AAAAJ;https://scholar.google.ru/citations?user=545J9E4AAAAJ", "orcid": "0000-0002-6453-5616;0000-0002-1272-5135;0000-0002-8893-9340", "linkedin": ";;dmitriyvatolin/", "or_profile": "~Ekaterina_Shumitskaya1;~Anastasia_Antsiferova1;~Dmitriy_S._Vatolin1", "aff": "Moscow State University, Lomonosov Moscow State University;MSU Institute for Artificial Intelligence;Moscow State University, Lomonosov Moscow State University", "aff_domain": "cs.msu.ru;iai.msu.ru;cs.msu.ru", "position": "MS student;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\nshumitskaya2024ioi,\ntitle={{IOI}: Invisible One-Iteration Adversarial Attack on No-Reference Image- and Video-Quality Metrics},\nauthor={Ekaterina Shumitskaya and Anastasia Antsiferova and Dmitriy S.
Vatolin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Chy4rSqy4Y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7530717, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11287278288277149264&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cs.msu.ru;iai.msu.ru;cs.msu.ru", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Lomonosov Moscow State University;Moscow State University", "aff_unique_dep": ";Institute for Artificial Intelligence", "aff_unique_url": "https://www.msu.ru;https://www.msu.ru", "aff_unique_abbr": "MSU;MSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Russian Federation" }, { "title": "Statistical Inference Under Constrained Selection Bias", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34666", "id": "CiZN2OATRp", "proceeding": "https://proceedings.mlr.press/v235/cortes-gomez24a.html", "pdf": "https://openreview.net/pdf?id=CiZN2OATRp", "openreview": "https://openreview.net/forum?id=CiZN2OATRp", "author_site": "Santiago Cortes-Gomez, Mateo Dulce Rubio, Carlos Miguel Pati\u00f1o, Bryan Wilder", "tldr": "", "abstract": "Large-scale datasets are increasingly being used to inform decision making. While this effort aims to ground policy in real-world evidence, challenges have arisen as selection bias and other forms of distribution shifts often plague observational data. Previous attempts to provide robust inference have given guarantees depending on a user-specified amount of possible distribution shift (e.g., the maximum KL divergence between the observed and target distributions). However, decision makers will often have additional knowledge about the target distribution which constrains the kind of possible shifts. To leverage such information, we propose a framework that enables statistical inference in the presence of selection bias which obeys user-specified constraints in the form of functions whose expectation is known under the target distribution. The output is high-probability bounds on the value of an estimand for the target distribution. Hence, our method leverages domain knowledge in order to partially identify a wide class of estimands.
We analyze the computational and statistical properties of methods to estimate these bounds and show that our method can produce informative bounds on a variety of simulated and semisynthetic tasks, as well as in a real-world use case.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Santiago Cortes-Gomez;Mateo Dulce Rubio;Carlos Miguel Pati\u00f1o;Bryan Wilder", "authorids": "~Santiago_Cortes-Gomez1;~Mateo_Dulce_Rubio1;~Carlos_Miguel_Pati\u00f1o1;~Bryan_Wilder2", "gender": "M;M;M;", "homepage": "https://secg5.github.io;https://mdulcer.github.io/;https://cmpatino.github.io/;https://bryanwilder.github.io/", "dblp": "182/6679;286/1864;275/0038;164/1648", "google_scholar": ";https://scholar.google.com.co/citations?hl=es;https://scholar.google.es/citations?hl=es;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Santiago_Cortes-Gomez1;~Mateo_Dulce_Rubio1;~Carlos_Miguel_Pati\u00f1o1;~Bryan_Wilder2", "aff": "School of Computer Science, Carnegie Mellon University;Carnegie Mellon University;University of Amsterdam;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;andrew.cmu.edu;uva.nl;cmu.edu", "position": "PhD student;PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\ncortes-gomez2024statistical,\ntitle={Statistical Inference Under Constrained Selection Bias},\nauthor={Santiago Cortes-Gomez and Mateo Dulce Rubio and Carlos Miguel Pati{\\~n}o and Bryan Wilder},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CiZN2OATRp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1647502, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10749153550703004364&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cs.cmu.edu;andrew.cmu.edu;uva.nl;cmu.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;University of Amsterdam", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.cmu.edu;https://www.uva.nl", "aff_unique_abbr": "CMU;UvA", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Netherlands" }, { "title": "Accelerating Parallel Sampling of Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34665", "id": "CjVWen8aJL", "proceeding": "https://proceedings.mlr.press/v235/tang24f.html", "pdf": "https://openreview.net/pdf?id=CjVWen8aJL", "openreview": "https://openreview.net/forum?id=CjVWen8aJL", "author_site": "Zhiwei Tang, Jiasheng Tang, Hao Luo, Fan Wang, Tsung-Hui Chang", "tldr": "", "abstract": "Diffusion models have emerged as state-of-the-art generative models for image generation. However, sampling from diffusion models is usually time-consuming due to the inherent autoregressive nature of their sampling process. In this work, we propose a novel approach that accelerates the sampling of diffusion models by parallelizing the autoregressive process. Specifically, we reformulate the sampling process as solving a system of triangular nonlinear equations through fixed-point iteration. With this innovative formulation, we explore several systematic techniques to further reduce the iteration steps required by the solving process. 
Applying these techniques, we introduce ParaTAA, a universal and training-free parallel sampling algorithm that can leverage extra computational and memory resources to increase the sampling speed. Our experiments demonstrate that ParaTAA can decrease the inference steps required by common sequential sampling algorithms such as DDIM and DDPM by a factor of 4$\\sim$14. Notably, when applying ParaTAA with 100-step DDIM for Stable Diffusion, a widely-used text-to-image diffusion model, it can produce the same images as the sequential sampling in only 7 inference steps. The code is available at https://github.com/TZW1998/ParaTAA-Diffusion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiwei Tang;Jiasheng Tang;Hao Luo;Fan Wang;Tsung-Hui Chang", "authorids": "~Zhiwei_Tang1;~Jiasheng_Tang1;~Hao_Luo1;~Fan_Wang6;~Tsung-Hui_Chang1", "gender": "M;;M;F;", "homepage": "https://zhiweitang-ml.bio;;http://luohao.site/;;", "dblp": ";220/4083;14/3727-4;;", "google_scholar": "GN-N9c8AAAAJ;;7QvWnzMAAAAJ;WCRGTHsAAAAJ;", "orcid": ";;0000-0002-6405-4011;0000-0001-7320-1119;", "linkedin": ";;;;", "or_profile": "~Zhiwei_Tang1;~Jiasheng_Tang1;~Hao_Luo1;~Fan_Wang6;~Tsung-Hui_Chang1", "aff": "Chinese University of Hong Kong, Shenzhen;Alibaba Group, DAMO Academy;Alibaba Group;Alibaba Group;", "aff_domain": "link.cuhk.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;", "position": "PhD student;Researcher;Researcher;Senior Staff Algorithm Engineer;", "bibtex": "@inproceedings{\ntang2024accelerating,\ntitle={Accelerating Parallel Sampling of Diffusion Models},\nauthor={Zhiwei Tang and Jiasheng Tang and Hao Luo and Fan Wang and Tsung-Hui Chang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CjVWen8aJL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1533918, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6919575691107745855&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "link.cuhk.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;", "author_num": 5, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Chinese University of Hong Kong;Alibaba Group", "aff_unique_dep": ";DAMO Academy", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.alibaba-group.com", "aff_unique_abbr": "CUHK;Alibaba", "aff_campus_unique_index": "0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Towards Global Optimality for Practical Average Reward Reinforcement Learning without Mixing Time Oracles", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34664", "id": "ClWdplZ12B", "proceeding": "https://proceedings.mlr.press/v235/patel24b.html", "pdf": "https://openreview.net/pdf?id=ClWdplZ12B", "openreview": "https://openreview.net/forum?id=ClWdplZ12B", "author_site": "Bhrij Patel, Wesley A. Suttle, Alec Koppel, Vaneet Aggarwal, Brian Sadler, Dinesh Manocha, Amrit Singh Bedi", "tldr": "", "abstract": "In the context of average-reward reinforcement learning, the requirement for oracle knowledge of the mixing time, a measure of the duration a Markov chain under a fixed policy needs to achieve its stationary distribution, poses a significant challenge for the global convergence of policy gradient methods.
This requirement is particularly problematic due to the difficulty and expense of estimating mixing time in environments with large state spaces, leading to the necessity of impractically long trajectories for effective gradient estimation in practical applications. To address this limitation, we consider the Multi-level Actor-Critic (MAC) framework, which incorporates a Multi-level Monte-Carlo (MLMC) gradient estimator. With our approach, we effectively alleviate the dependency on mixing time knowledge, a first for global convergence in average-reward MDPs. Furthermore, our approach exhibits the tightest available dependence of $\\mathcal{O}(\\sqrt{\\tau_{mix}})$ known from prior work. With a 2D grid world goal-reaching navigation experiment, we demonstrate that MAC outperforms the existing state-of-the-art policy gradient-based method for average reward settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bhrij Patel;Wesley A Suttle;Alec Koppel;Vaneet Aggarwal;Brian M. Sadler;Dinesh Manocha;Amrit Bedi", "authorids": "~Bhrij_Patel1;~Wesley_A_Suttle1;~Alec_Koppel1;~Vaneet_Aggarwal1;~Brian_M._Sadler1;~Dinesh_Manocha3;~Amrit_Bedi1", "gender": "M;M;M;M;M;M;", "homepage": "https://bridge00.github.io/;http://koppel.netlify.app/;;https://oden.utexas.edu/people/directory/Brian-Sadler/;https://www.cs.umd.edu/people/dmanocha;https://sites.google.com/view/amritsinghbedi/home;http://www.wesleysuttle.com", "dblp": "264/9735;149/0076;91/6560;26/3347;m/DineshManocha;176/2707.html;238/0223", "google_scholar": "rV_6eoIAAAAJ;8ClxyjIAAAAJ;;s9eCQn4AAAAJ;X08l_4IAAAAJ;91WLA6QAAAAJ;Tf6oDygAAAAJ", "orcid": "0000-0002-8296-4537;0000-0003-2447-2873;;0000-0002-9564-3812;0000-0001-7047-9801;;", "linkedin": "bhrijpatel;alec-koppel-9860b697/;;brian-sadler-5909102a/;dinesh-manocha-2311846;;", "or_profile": "~Bhrij_Patel1;~Alec_Koppel1;~Vaneet_Aggarwal1;~Brian_M._Sadler1;~Dinesh_Manocha3;~Amrit_Bedi1;~Wesley_Suttle1", "aff": "University of Maryland, College Park;J.P. Morgan Chase;Purdue University;US Army Research Laboratory;University of Maryland, College Park;University of Maryland, College Park;Army Research Laboratory", "aff_domain": "cs.umd.edu;jpmorgan.com;purdue.edu;army.mil;umd.edu;umd.edu;army.mil", "position": "PhD student;Research Team Lead;Full Professor;Principal Researcher;Professor;Researcher;Postdoc", "bibtex": "@inproceedings{\npatel2024towards,\ntitle={Towards Global Optimality for Practical Average Reward Reinforcement Learning without Mixing Time Oracles},\nauthor={Bhrij Patel and Wesley A Suttle and Alec Koppel and Vaneet Aggarwal and Brian M.
Sadler and Dinesh Manocha and Amrit Bedi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ClWdplZ12B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 681489, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6041324150780282908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cs.umd.edu;jpmorgan.com;purdue.edu;army.mil;umd.edu;umd.edu;army.mil", "author_num": 7, "aff_unique_index": "0;1;2;3;0;0;4", "aff_unique_norm": "University of Maryland;JPMorgan Chase & Co.;Purdue University;US Army Research Laboratory;Army Research Laboratory", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.umd.edu;https://www.jpmorganchase.com;https://www.purdue.edu;https://www.arl.army.mil;https://www.arl.army.mil", "aff_unique_abbr": "UMD;JPM;Purdue;ARL;ARL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "How Well Can LLMs Negotiate? NegotiationArena Platform and Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34663", "id": "CmOmaxkt8p", "proceeding": "https://proceedings.mlr.press/v235/bianchi24a.html", "pdf": "https://openreview.net/pdf?id=CmOmaxkt8p", "openreview": "https://openreview.net/forum?id=CmOmaxkt8p", "author_site": "Federico Bianchi, Patrick John Chia, Mert Yuksekgonul, Jacopo Tagliabue, Dan Jurafsky, James Zou", "tldr": "", "abstract": "Negotiation is the basis of social interactions; humans negotiate everything from the price of cars to how to share common resources. With rapidly growing interest in using large language models (LLMs) to act as agents on behalf of human users, such LLM agents would also need to be able to negotiate. In this paper, we study how well LLMs can negotiate with each other. We develop NegotiationArena: a flexible framework for evaluating and probing the negotiation abilities of LLM agents. We implemented three types of scenarios in NegotiationArena to assess LLM's behaviors in allocating shared resources (ultimatum games), aggregate resources (trading games) and buy/sell goods (price negotiations). Each scenario allows for multiple turns of flexible dialogues between LLM agents to allow for more complex negotiations. Interestingly, LLM agents can significantly boost their negotiation outcomes by employing certain behavioral tactics. For example, by pretending to be desolate and desperate, LLMs can improve their payoffs by 20% when negotiating against the standard GPT-4. We also quantify irrational negotiation behaviors exhibited by the LLM agents, many of which also appear in humans.
Together, NegotiationArena offers a new environment to investigate LLM interactions, enabling new insights into LLM's theory of mind, irrationality, and reasoning abilities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Federico Bianchi;Patrick John Chia;Mert Yuksekgonul;Jacopo Tagliabue;Dan Jurafsky;James Zou", "authorids": "~Federico_Bianchi1;~Patrick_John_Chia1;~Mert_Yuksekgonul1;~Jacopo_Tagliabue1;~Dan_Jurafsky1;~James_Zou1", "gender": ";M;M;M;M;", "homepage": "https://federicobianchi.io;https://patrickjohncyh.github.io/;https://cs.stanford.edu/~merty;http://www.jacopotagliabue.it/;http://web.stanford.edu/~jurafsky/;", "dblp": "122/8815-1;;249/5558;;31/985;", "google_scholar": "1okGjb8AAAAJ;;https://scholar.google.com/citations?hl=en;;uZg9l58AAAAJ;23ZXZvEAAAAJ", "orcid": "0000-0003-0776-361X;;;;;", "linkedin": "federico-bianchi-3b7998121/;;;jacopotagliabue/;;", "or_profile": "~Federico_Bianchi1;~Patrick_John_Chia1;~Mert_Yuksekgonul1;~Jacopo_Tagliabue1;~Dan_Jurafsky1;~James_Zou1", "aff": "Stanford University;Massachusetts Institute of Technology;Microsoft;Coveo;Stanford University;Stanford University", "aff_domain": "stanford.edu;mit.edu;microsoft.com;coveo.com;stanford.edu;stanford.edu", "position": "Postdoc;Undergrad student;Intern;Director of AI;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nbianchi2024how,\ntitle={How Well Can {LLM}s Negotiate? NegotiationArena Platform and Analysis},\nauthor={Federico Bianchi and Patrick John Chia and Mert Yuksekgonul and Jacopo Tagliabue and Dan Jurafsky and James Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CmOmaxkt8p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1028761, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12141212129471865510&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "stanford.edu;mit.edu;microsoft.com;coveo.com;stanford.edu;stanford.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;0;0", "aff_unique_norm": "Stanford University;Massachusetts Institute of Technology;Microsoft;Coveo", "aff_unique_dep": ";;Microsoft Corporation;", "aff_unique_url": "https://www.stanford.edu;https://web.mit.edu;https://www.microsoft.com;https://www.coveo.com", "aff_unique_abbr": "Stanford;MIT;Microsoft;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;Canada" }, { "title": "Implicit Bias of AdamW: $\\ell_\\infty$-Norm Constrained Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34662", "id": "CmXkdlO6JJ", "proceeding": "https://proceedings.mlr.press/v235/xie24e.html", "pdf": "https://openreview.net/pdf?id=CmXkdlO6JJ", "openreview": "https://openreview.net/forum?id=CmXkdlO6JJ", "author_site": "Shuo Xie, Zhiyuan Li", "tldr": "", "abstract": "Adam with decoupled weight decay, also known as AdamW, is widely acclaimed for its superior performance in language modeling tasks, surpassing Adam with $\\ell_2$ regularization in terms of generalization and optimization. However, this advantage is not theoretically well-understood. One challenge here is that though intuitively Adam with $\\ell_2$ regularization optimizes the $\\ell_2$ regularized loss, it is not clear if AdamW optimizes a specific objective.
In this work, we make progress toward understanding the benefit of AdamW by showing that it implicitly performs constrained optimization. More concretely, we show that in the full-batch setting, if AdamW converges with any non-increasing learning rate schedule whose partial sum diverges, it must converge to a KKT point of the original loss under the constraint that the $\\ell_\\infty$ norm of the parameter is bounded by the inverse of the weight decay factor. This result is built on the observation that Adam can be viewed as a smoothed version of SignGD, which is the normalized steepest descent with respect to $\\ell_\\infty$ norm, and a surprising connection between normalized steepest descent with weight decay and Frank-Wolfe.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuo Xie;Zhiyuan Li", "authorids": "~Shuo_Xie2;~Zhiyuan_Li2", "gender": ";M", "homepage": "https://shuox.ttic.edu/;https://zhiyuanli.ttic.edu", "dblp": ";l/ZhiyuanLi", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Shuo_Xie2;~Zhiyuan_Li2", "aff": "Toyota Technological Institute at Chicago;Toyota Technological Institute at Chicago", "aff_domain": "ttic.edu;ttic.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxie2024implicit,\ntitle={Implicit Bias of AdamW: $\\ell_\\infty$-Norm Constrained Optimization},\nauthor={Shuo Xie and Zhiyuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CmXkdlO6JJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1945694, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "ttic.edu;ttic.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Toyota Technological Institute at Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.tti-chicago.org", "aff_unique_abbr": "TTI Chicago", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Beyond Point Prediction: Score Matching-based Pseudolikelihood Estimation of Neural Marked Spatio-Temporal Point Process", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34661", "id": "CpI37NA7MO", "proceeding": "https://proceedings.mlr.press/v235/li24cb.html", "pdf": "https://openreview.net/pdf?id=CpI37NA7MO", "openreview": "https://openreview.net/forum?id=CpI37NA7MO", "author_site": "Zichong Li, Qunzhi Xu, Zhenghao Xu, Yajun Mei, Tuo Zhao, Hongyuan Zha", "tldr": "", "abstract": "Spatio-temporal point processes (STPPs) are potent mathematical tools for modeling and predicting events with both temporal and spatial features. Despite their versatility, most existing methods for learning STPPs either assume a restricted form of the spatio-temporal distribution, or suffer from inaccurate approximations of the intractable integral in the likelihood training objective. These issues typically arise from the normalization term of the probability density function. Moreover, existing works only provide point prediction for events without quantifying their uncertainty, such as confidence intervals for the event's arrival time and confidence regions for the event's location, which is crucial given the considerable randomness of the data.
To tackle these challenges, we introduce SMASH: a Score MAtching-based pSeudolikeliHood estimator for learning marked STPPs. Specifically, our framework adopts a normalization-free objective by estimating the pseudolikelihood of marked STPPs through score-matching and predicts confidence intervals/regions for event time and location by generating samples through a score-based sampling algorithm. The superior performance of our proposed framework is demonstrated through extensive experiments on both point and confidence interval/region prediction of events.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zichong Li;Qunzhi Xu;Zhenghao Xu;Yajun Mei;Tuo Zhao;Hongyuan Zha", "authorids": "~Zichong_Li2;~Qunzhi_Xu1;~Zhenghao_Xu1;~Yajun_Mei1;~Tuo_Zhao2;~Hongyuan_Zha1", "gender": "M;M;M;M;;M", "homepage": "https://github.com/zichongli5/zichongli5.github.io;https://qunzhixu.github.io/;https://www.isye.gatech.edu/users/zhenghao-xu;http://www.isye.gatech.edu/~ymei/;;http://www2.isye.gatech.edu/~tzhao80", "dblp": ";273/9948;357/5585;;z/HongyuanZha;", "google_scholar": ";eTG_8wgAAAAJ;FRegzp4AAAAJ;cRoarqgAAAAJ;n1DQMIsAAAAJ;EJXN6tYAAAAJ", "orcid": ";;0000-0001-8076-5166;;;", "linkedin": ";;;;;", "or_profile": "~Zichong_Li2;~Qunzhi_Xu1;~Zhenghao_Xu1;~Yajun_Mei1;~Hongyuan_Zha1;~Tuo_Zhao1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;The Chinese University of Hong Kong, Shenzhen;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu;cuhk.edu.cn;gatech.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024beyond,\ntitle={Beyond Point Prediction: Score Matching-based Pseudolikelihood Estimation of Neural Marked Spatio-Temporal Point Process},\nauthor={Zichong Li and Qunzhi Xu and Zhenghao Xu and Yajun Mei and Tuo Zhao and Hongyuan Zha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CpI37NA7MO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 869891, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17120680573696680404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "gatech.edu;gatech.edu;gatech.edu;gatech.edu;cuhk.edu.cn;gatech.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Georgia Institute of Technology;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "Georgia Tech;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Generating Chain-of-Thoughts with a Pairwise-Comparison Approach to Searching for the Most Promising Intermediate Thought", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34660", "id": "CpcaL75UgY", "proceeding": "https://proceedings.mlr.press/v235/zhang24t.html", "pdf": "https://openreview.net/pdf?id=CpcaL75UgY", "openreview": "https://openreview.net/forum?id=CpcaL75UgY", "author_site": "Zhen-Yu Zhang, Siwei Han, Huaxiu Yao, Gang Niu, Masashi Sugiyama", "tldr": "", "abstract": "To improve the ability of large language models (LLMs) to tackle complex reasoning problems,
chain-of-thoughts (CoT) methods were proposed to guide LLMs to reason step-by-step, enabling problem solving from simple to complex. State-of-the-art methods for generating such a chain involve interactive collaboration, where the learner generates candidate intermediate thoughts, evaluated by the LLM, guiding the generation of subsequent thoughts. However, a widespread yet understudied problem is that the evaluation from the LLM is typically noisy and unreliable, potentially misleading the generation process in selecting promising intermediate thoughts. In this paper, motivated by Vapnik's principle, we use pairwise-comparison evaluation instead of point-wise scoring to search for promising intermediate thoughts with the noisy feedback from the LLM. In each round, we randomly pair intermediate thoughts and directly prompt the LLM to select the more promising one from each pair, allowing us to identify the most promising thoughts through an iterative process. To further alleviate the noise in the comparison, we incorporate techniques from ensemble learning and dueling bandits, proposing two variants of the algorithm. Experiments on three real-world tasks demonstrate the effectiveness of our proposed algorithm and verify the rationale of the pairwise comparison mechanism.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhen-Yu Zhang;Siwei Han;Huaxiu Yao;Gang Niu;Masashi Sugiyama", "authorids": "~Zhen-Yu_Zhang1;~Siwei_Han1;~Huaxiu_Yao1;~Gang_Niu1;~Masashi_Sugiyama1", "gender": "M;F;M;M;M", "homepage": "https://zhangzy07.github.io/;https://lillianwei-h.github.io;http://huaxiuyao.mystrikingly.com;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": ";224/7950;197/1635;26/3367-1;35/1228", "google_scholar": "https://scholar.google.co.jp/citations?user=JP8qCpUAAAAJ;oT1QQs8AAAAJ;A20BZnQAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": "0000-0003-2101-1836;;;;0000-0001-6658-6743", "linkedin": ";\u5072\u851a-\u97e9-4553142b7/;huaxiuyao/;;", "or_profile": "~Zhen-Yu_Zhang1;~Siwei_Han1;~Huaxiu_Yao1;~Gang_Niu1;~Masashi_Sugiyama1", "aff": "RIKEN;Fudan University;Department of Computer Science, University of North Carolina at Chapel Hill;Southeast University;The University of Tokyo", "aff_domain": "riken.jp;fudan.edu.cn;cs.unc.edu;seu.edu.cn;u-tokyo.ac.jp", "position": "Postdoc;Undergrad student;Assistant Professor;Adjunct Full Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024generating,\ntitle={Generating Chain-of-Thoughts with a Pairwise-Comparison Approach to Searching for the Most Promising Intermediate Thought},\nauthor={Zhen-Yu Zhang and Siwei Han and Huaxiu Yao and Gang Niu and Masashi Sugiyama},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CpcaL75UgY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 403139, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4991661074642026771&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "riken.jp;fudan.edu.cn;cs.unc.edu;seu.edu.cn;u-tokyo.ac.jp", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "RIKEN;Fudan University;University of North Carolina at Chapel Hill;Southeast University;University of Tokyo", "aff_unique_dep": ";;Department of Computer Science;;", "aff_unique_url": 
"https://www.riken.jp;https://www.fudan.edu.cn;https://www.unc.edu;https://www.seu.edu.cn/;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "RIKEN;Fudan;UNC Chapel Hill;SEU;UTokyo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "Japan;China;United States" }, { "title": "Implicit Compressibility of Overparametrized Neural Networks Trained with Heavy-Tailed SGD", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34659", "id": "CpgKRKBUTl", "proceeding": "https://proceedings.mlr.press/v235/wan24a.html", "pdf": "https://openreview.net/pdf?id=CpgKRKBUTl", "openreview": "https://openreview.net/forum?id=CpgKRKBUTl", "author_site": "Yijun Wan, Melih Barsbey, Abdellatif Zaidi, Umut Simsekli", "tldr": "", "abstract": "Neural network compression has been an increasingly important subject, not only due to its practical relevance, but also due to its theoretical implications, as there is an explicit connection between compressibility and generalization error. Recent studies have shown that the choice of the hyperparameters of stochastic gradient descent (SGD) can have an effect on the compressibility of the learned parameter vector. These results, however, rely on unverifiable assumptions and the resulting theory does not provide a practical guideline due to its implicitness. In this study, we propose a simple modification for SGD, such that the outputs of the algorithm will be provably compressible without making any nontrivial assumptions. We consider a one-hidden-layer neural network trained with SGD, and show that if we inject additive heavy-tailed noise to the iterates at each iteration, for _any_ compression rate, there exists a level of overparametrization such that the output of the algorithm will be compressible with high probability. To achieve this result, we make two main technical contributions: (i) we prove a \"propagation of chaos\" result for a class of heavy-tailed stochastic differential equations, and (ii) we derive error estimates for their Euler discretization. 
Our experiments suggest that the proposed approach not only achieves increased compressibility with various models and datasets, but also leads to robust test performance under pruning, even in more realistic architectures that lie beyond our theoretical setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yijun Wan;Melih Barsbey;Abdellatif Zaidi;Umut Simsekli", "authorids": "~Yijun_Wan1;~Melih_Barsbey1;~Abdellatif_Zaidi1;~Umut_Simsekli1", "gender": ";;M;M", "homepage": ";;http://www-syscom.univ-mlv.fr/~zaidi/;https://www.di.ens.fr/~simsekli/", "dblp": ";;07/3113;https://dblp.org/pers/s/Simsekli:Umut.html", "google_scholar": ";;;https://scholar.google.fr/citations?user=CuArAkgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yijun_Wan1;~Melih_Barsbey1;~Abdellatif_Zaidi1;~Umut_Simsekli1", "aff": ";;Universit\u00e9 Gustave Eiffel;INRIA", "aff_domain": ";;univ-eiffel.fr;inria.fr", "position": ";;Associate Professor;Research Faculty", "bibtex": "@inproceedings{\nwan2024implicit,\ntitle={Implicit Compressibility of Overparametrized Neural Networks Trained with Heavy-Tailed {SGD}},\nauthor={Yijun Wan and Melih Barsbey and Abdellatif Zaidi and Umut Simsekli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CpgKRKBUTl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 707548, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4677395675619385908&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";;univ-eiffel.fr;inria.fr", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Universit\u00e9 Gustave Eiffel;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.univ-gustave-eiffel.fr;https://www.inria.fr", "aff_unique_abbr": "UGE;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Meta Evidential Transformer for Few-Shot Open-Set Recognition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34658", "id": "CquFGSIU6w", "proceeding": "https://proceedings.mlr.press/v235/sapkota24a.html", "pdf": "https://openreview.net/pdf?id=CquFGSIU6w", "openreview": "https://openreview.net/forum?id=CquFGSIU6w", "author_site": "Hitesh Sapkota, Krishna Neupane, Qi Yu", "tldr": "", "abstract": "Few-shot open-set recognition (FSOSR) aims to detect instances from unseen classes by utilizing a small set of labeled instances from closed-set classes. Accurately rejecting instances from open-set classes in the few-shot setting is fundamentally more challenging due to the weaker supervised signals resulting from fewer labels. Transformer-based few-shot methods exploit attention mapping to achieve a consistent representation. However, the softmax-generated attention map normalizes over all instances, assigning unnecessarily high attention weights to instances that are not close to the closed-set classes, which negatively impacts detection performance. In addition, open-set samples that are similar to a certain closed-set class also pose a significant challenge to most existing FSOSR models. To address these challenges, we propose a novel Meta Evidential Transformer (MET) based FSOSR model that uses an evidential open-set loss to learn more compact closed-set class representations by effectively leveraging similar closed-set classes. 
MET further integrates an evidence-to-variance ratio to detect fundamentally challenging tasks and uses an evidence-guided cross-attention mechanism to better separate the difficult open-set samples. Experiments on real-world datasets demonstrate consistent improvement over existing competitive methods in unseen class recognition without deteriorating closed-set performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hitesh Sapkota;Krishna Prasad Neupane;Qi Yu", "authorids": "~Hitesh_Sapkota1;~Krishna_Prasad_Neupane1;~Qi_Yu1", "gender": "M;M;M", "homepage": "https://hiteshsapkota.github.io/;;https://www.rit.edu/mining/", "dblp": "251/4284;;58/6957-1", "google_scholar": "0FKsBXYAAAAJ;8UHcQU0AAAAJ;L3gWdfEAAAAJ", "orcid": ";;0000-0002-0426-5407", "linkedin": "hitesh-sapkota-2226051ba/;;", "or_profile": "~Hitesh_Sapkota1;~Krishna_Prasad_Neupane1;~Qi_Yu1", "aff": "Amazon;;Rochester Institute of Technology", "aff_domain": "amazon.com;;rit.edu", "position": "Researcher;;Professor", "bibtex": "@inproceedings{\nsapkota2024meta,\ntitle={Meta Evidential Transformer for Few-Shot Open-Set Recognition},\nauthor={Hitesh Sapkota and Krishna Prasad Neupane and Qi Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CquFGSIU6w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2024013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14116196602709478813&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "amazon.com;;rit.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;Rochester Institute of Technology", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.rit.edu", "aff_unique_abbr": "Amazon;RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Should we be going MAD? A Look at Multi-Agent Debate Strategies for LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34657", "id": "CrUmgUaAQp", "proceeding": "https://proceedings.mlr.press/v235/smit24a.html", "pdf": "https://openreview.net/pdf?id=CrUmgUaAQp", "openreview": "https://openreview.net/forum?id=CrUmgUaAQp", "author_site": "Andries Smit, Nathan Grinsztajn, Paul Duckworth, Thomas Barrett, Arnu Pretorius", "tldr": "", "abstract": "Recent advancements in large language models (LLMs) underscore their potential for responding to inquiries in various domains. However, ensuring that generative agents provide accurate and reliable answers remains an ongoing challenge. In this context, multi-agent debate (MAD) has emerged as a promising strategy for enhancing the truthfulness of LLMs. We benchmark a range of debating and prompting strategies to explore the trade-offs between cost, time, and accuracy. Importantly, we find that multi-agent debating systems, in their current form, do not reliably outperform other proposed prompting strategies, such as self-consistency and ensembling using multiple reasoning paths. However, when performing hyperparameter tuning, several MAD systems, such as Multi-Persona, perform better. This suggests that MAD protocols might not be inherently worse than other approaches, but that they are more sensitive to different hyperparameter settings and difficult to optimize. 
We build on these results to offer insights into improving debating strategies, such as adjusting agent agreement levels, which can significantly enhance performance and even surpass all other non-debate protocols we evaluated. We provide an open-source repository to the community with several state-of-the-art protocols together with evaluation scripts to benchmark across popular research datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andries Petrus Smit;Nathan Grinsztajn;Paul Duckworth;Thomas D Barrett;Arnu Pretorius", "authorids": "~Andries_Petrus_Smit1;~Nathan_Grinsztajn1;~Paul_Duckworth1;~Thomas_D_Barrett1;~Arnu_Pretorius1", "gender": "M;M;M;M;M", "homepage": ";https://nathangrinsztajn.github.io/;http://www.robots.ox.ac.uk/~scpd/;;", "dblp": ";;179/2160;248/8263;188/4368", "google_scholar": ";yVHIYEYAAAAJ;I64MZDoAAAAJ;nJa1KGIAAAAJ;zZ6ydrAAAAAJ", "orcid": ";0000-0001-6817-5972;0000-0001-9052-6919;0000-0001-6241-3028;", "linkedin": "andries-petrus-smit-856901170;nathan-grinsztajn-960379139/?locale=en_US;;tom-barrett-62b180a2/;arnupretorius/", "or_profile": "~Andries_Petrus_Smit1;~Nathan_Grinsztajn1;~Paul_Duckworth1;~Thomas_D_Barrett1;~Arnu_Pretorius1", "aff": "InstaDeep;InstaDeep;InstaDeep;InstaDeep;InstaDeep", "aff_domain": "instadeep.com;instadeep.com;instadeep.com;instadeep.com;instadeep.com", "position": "Researcher;Researcher;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nsmit2024should,\ntitle={Should we be going {MAD}? A Look at Multi-Agent Debate Strategies for {LLM}s},\nauthor={Andries Petrus Smit and Nathan Grinsztajn and Paul Duckworth and Thomas D Barrett and Arnu Pretorius},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CrUmgUaAQp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 838258, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12782295297021980791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "instadeep.com;instadeep.com;instadeep.com;instadeep.com;instadeep.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "InstaDeep", "aff_unique_dep": "", "aff_unique_url": "https://www.instadeep.com", "aff_unique_abbr": "InstaDeep", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Reflective Policy Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34656", "id": "Cs0Xy6WETl", "proceeding": "https://proceedings.mlr.press/v235/gan24b.html", "pdf": "https://openreview.net/pdf?id=Cs0Xy6WETl", "openreview": "https://openreview.net/forum?id=Cs0Xy6WETl", "author_site": "Yaozhong Gan, yan renye, zhe wu, Junliang Xing", "tldr": "", "abstract": "On-policy reinforcement learning methods, like Trust Region Policy Optimization (TRPO) and Proximal Policy Optimization (PPO), often demand extensive data per update, leading to sample inefficiency. This paper introduces Reflective Policy Optimization (RPO), a novel on-policy extension that amalgamates past and future state-action information for policy optimization. This approach empowers the agent for introspection, allowing modifications to its actions within the current state. 
Theoretical analysis confirms that RPO monotonically improves policy performance and contracts the solution space, consequently expediting the convergence procedure. Empirical results demonstrate RPO's feasibility and efficacy in two reinforcement learning benchmarks, culminating in superior sample efficiency. The source code of this work is available at https://github.com/Edgargan/RPO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaozhong Gan;Renye Yan;Zhe Wu;Junliang Xing", "authorids": "~Yaozhong_Gan1;~Renye_Yan1;~Zhe_Wu6;~Junliang_Xing1", "gender": "M;M;;M", "homepage": ";https://ieeexplore.ieee.org/author/37088970082;https://github.com/GoooKuuu;http://people.ucas.ac.cn/~jlxing?language=en", "dblp": "234/8610;;;43/7659.html", "google_scholar": "n46Z5BsAAAAJ;;;jSwNd3MAAAAJ", "orcid": ";;;0000-0001-6801-0510", "linkedin": ";;;https://www.linkedin.cn/incareer/in/ACoAAAvlU14B40ZWH1pxg5JJDtQ6LlgMYkp0e5s", "or_profile": "~Yaozhong_Gan1;~Renye_Yan1;~Zhe_Wu6;~Junliang_Xing1", "aff": "Qiyuan Laboratory;Peking University;Qiyuan laboratory;Tsinghua University", "aff_domain": "qiyuanlab.com;pku.edu.cn;qiyuanlab.com;tsinghua.edu.cn", "position": "Assistant Professor;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\ngan2024reflective,\ntitle={Reflective Policy Optimization},\nauthor={Yaozhong Gan and Renye Yan and Zhe Wu and Junliang Xing},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Cs0Xy6WETl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6828770, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12218754483796030376&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "qiyuanlab.com;pku.edu.cn;qiyuanlab.com;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Qiyuan Laboratory;Peking University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": ";http://www.pku.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": ";Peking U;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", "aff_country_unique": ";China" }, { "title": "How Deep Networks Learn Sparse and Hierarchical Data: the Sparse Random Hierarchy Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34655", "id": "CtEWswTjUd", "proceeding": "https://proceedings.mlr.press/v235/tomasini24a.html", "pdf": "https://openreview.net/pdf?id=CtEWswTjUd", "openreview": "https://openreview.net/forum?id=CtEWswTjUd", "author_site": "Umberto Tomasini, Matthieu Wyart", "tldr": "", "abstract": "Understanding what makes high-dimensional data learnable is a fundamental question in machine learning. On the one hand, it is believed that the success of deep learning lies in its ability to build a hierarchy of representations that become increasingly more abstract with depth, going from simple features like edges to more complex concepts. On the other hand, learning to be insensitive to invariances of the task, such as smooth transformations for image datasets, has been argued to be important for deep networks and it strongly correlates with their performance. In this work, we aim to explain this correlation and unify these two viewpoints. 
We show that by introducing sparsity to generative hierarchical models of data, the task acquires insensitivity to spatial transformations that are discrete versions of smooth transformations. In particular, we introduce the Sparse Random Hierarchy Model (SRHM), where we observe and rationalize that a hierarchical representation mirroring the hierarchical model is learnt precisely when such insensitivity is learnt, thereby explaining the strong correlation between the latter and performance. Moreover, we quantify how the sample complexity of CNNs learning the SRHM depends on both the sparsity and hierarchical structure of the task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Umberto Maria Tomasini;Matthieu Wyart", "authorids": "~Umberto_Maria_Tomasini1;~Matthieu_Wyart2", "gender": ";M", "homepage": ";http://pcsl.epfl.ch/", "dblp": ";26/11007", "google_scholar": "cQqn1TQAAAAJ;https://scholar.google.ch/citations?user=1TttZYYAAAAJ", "orcid": ";0000-0003-0644-0990", "linkedin": "umberto-maria-tomasini-4289671aa;", "or_profile": "~Umberto_Maria_Tomasini1;~Matthieu_Wyart2", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;epfl.ch", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\ntomasini2024how,\ntitle={How Deep Networks Learn Sparse and Hierarchical Data: the Sparse Random Hierarchy Model},\nauthor={Umberto Maria Tomasini and Matthieu Wyart},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CtEWswTjUd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1420126, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12876592531278782628&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "epfl.ch;epfl.ch", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Floating Anchor Diffusion Model for Multi-motif Scaffolding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34654", "id": "CtgJUQxmEo", "proceeding": "https://proceedings.mlr.press/v235/liu24av.html", "pdf": "https://openreview.net/pdf?id=CtgJUQxmEo", "openreview": "https://openreview.net/forum?id=CtgJUQxmEo", "author_site": "Ke Liu, Weian Mao, Shuaike Shen, Xiaoran Jiao, Zheng Sun, Hao Chen, Chunhua Shen", "tldr": "", "abstract": "Motif scaffolding seeks to design scaffold structures for constructing proteins with functions derived from the desired motif, which is crucial for the design of vaccines and enzymes. Previous works approach the problem by inpainting or conditional generation. Both of them can only scaffold motifs with fixed positions, and the conditional generation cannot guarantee the presence of motifs. However, prior knowledge of the relative motif positions in a protein is not readily available, and constructing a protein with multiple functions in one protein is more general and significant because of the synergies between functions. We propose a Floating Anchor Diffusion (FADiff) model. FADiff allows motifs to float rigidly and independently in the process of diffusion, which guarantees the presence of motifs and automates the motif position design. 
Our experiments demonstrate the efficacy of FADiff with high success rates and designable novel scaffolds. To the best of our knowledge, FADiff is the first work to tackle the challenge of scaffolding multiple motifs without relying on expert knowledge of the relative motif positions in the protein. Code is available at https://github.com/aim-uofa/FADiff.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ke Liu;Weian Mao;Shuaike Shen;Xiaoran Jiao;Zheng Sun;Hao Chen;Chunhua Shen", "authorids": "~Ke_Liu3;~Weian_Mao2;~Shuaike_Shen1;~Xiaoran_Jiao1;~Zheng_Sun7;~Hao_Chen17;~Chunhua_Shen2", "gender": ";M;;;M;;", "homepage": ";;;http://github.com/kamzero;https://ringhalsun.github.io/;;", "dblp": ";289/1631;;;;;", "google_scholar": ";Qu-QXTsAAAAJ;;;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Ke_Liu3;~Weian_Mao2;~Shuaike_Shen1;~Xiaoran_Jiao1;~Zheng_Sun7;~Hao_Chen17;~Chunhua_Shen2", "aff": ";University of Adelaide;;Zhejiang University;Swansea University;;", "aff_domain": ";adelaide.edu.au;;zju.edu.cn;swansea.ac.uk;;", "position": ";PhD student;;PhD student;MS student;;", "bibtex": "@inproceedings{\nliu2024floating,\ntitle={Floating Anchor Diffusion Model for Multi-motif Scaffolding},\nauthor={Ke Liu and Weian Mao and Shuaike Shen and Xiaoran Jiao and Zheng Sun and Hao Chen and Chunhua Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CtgJUQxmEo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3994469, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2126438443090736476&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";adelaide.edu.au;;zju.edu.cn;swansea.ac.uk;;", "author_num": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Adelaide;Zhejiang University;Swansea University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.adelaide.edu.au;https://www.zju.edu.cn;https://www.swansea.ac.uk", "aff_unique_abbr": "Adelaide;ZJU;Swansea", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Australia;China;United Kingdom" }, { "title": "Unlock the Cognitive Generalization of Deep Reinforcement Learning via Granular Ball Representation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34653", "id": "CtyLla0DU8", "proceeding": "https://proceedings.mlr.press/v235/liu24u.html", "pdf": "https://openreview.net/pdf?id=CtyLla0DU8", "openreview": "https://openreview.net/forum?id=CtyLla0DU8", "author_site": "Jiashun Liu, Jianye Hao, Yi Ma, Shuyin Xia", "tldr": "", "abstract": "The policies learned by humans in simple scenarios can be deployed in complex scenarios with the same task logic through limited feature alignment training, a process referred to as cognitive generalization or systematic generalization. Thus, a plausible conjecture is that unlocking cognitive generalization in DRL could enable effective generalization of policies from simple to complex scenarios through reward-agnostic fine-tuning. This would eliminate the need for designing reward functions in complex scenarios, thus reducing environment-building costs. In this paper, we propose a general framework to enhance the cognitive generalization ability of standard DRL methods. 
Our framework builds a cognitive latent space in a simple scenario, then segments the latent space to cluster samples with similar environmental influences into the same subregion. During the fine-tuning in the complex scenario, the policy uses the cognitive latent space to align each new sample with same-subregion samples collected from the simple scenario and approximates the rewards and Q values of the new samples for policy updates. Based on this framework, we propose *Granular Ball Reinforcement Learning* (GBRL), a practical algorithm via Variational Autoencoder (VAE) and Granular Ball Representation. GBRL achieves effective policy generalization on various difficult scenarios with the same task logic.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiashun Liu;Jianye HAO;Yi Ma;Shuyin Xia", "authorids": "~Jiashun_Liu1;~Jianye_HAO1;~Yi_Ma5;~Shuyin_Xia1", "gender": "M;M;;M", "homepage": ";http://www.icdai.org/jianye.html;https://mayi1996.top/;", "dblp": ";21/7664.html;69/1112-5.html;154/1976", "google_scholar": "iAYyfMAAAAAJ;;TdVWzqgAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0002-0422-8235;0000-0001-9375-6605;", "linkedin": ";;;", "or_profile": "~Jiashun_Liu1;~Jianye_HAO1;~Yi_Ma5;~Shuyin_Xia1", "aff": "Tianjin University;Tianjin University;Tianjin University;Chongqing University of Post and Telecommunications", "aff_domain": "tju.edu.cn;tju.edu.cn;tju.edu.cn;cqupt.edu.cn", "position": "MS student;Associate Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024unlock,\ntitle={Unlock the Cognitive Generalization of Deep Reinforcement Learning via Granular Ball Representation},\nauthor={Jiashun Liu and Jianye HAO and Yi Ma and Shuyin Xia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CtyLla0DU8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 997043, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5139312777426584628&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "tju.edu.cn;tju.edu.cn;tju.edu.cn;cqupt.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Tianjin University;Chongqing University of Post and Telecommunications", "aff_unique_dep": ";", "aff_unique_url": "http://www.tju.edu.cn;http://www.cqupt.edu.cn", "aff_unique_abbr": "TJU;CQUPT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Adapting Pretrained ViTs with Convolution Injector for Visuo-Motor Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34652", "id": "CuiRGtVI55", "proceeding": "https://proceedings.mlr.press/v235/hwang24c.html", "pdf": "https://openreview.net/pdf?id=CuiRGtVI55", "openreview": "https://openreview.net/forum?id=CuiRGtVI55", "author_site": "Dongyoon Hwang, Byungkun Lee, Hojoon Lee, Hyunseung Kim, Jaegul Choo", "tldr": "", "abstract": "Vision Transformers (ViT), when paired with large-scale pretraining, have shown remarkable performance across various computer vision tasks, primarily due to their weak inductive bias. However, while such weak inductive bias aids in pretraining scalability, this may hinder the effective adaptation of ViTs for visuo-motor control tasks as a result of the absence of control-centric inductive biases. 
Such absent inductive biases include spatial locality and translation equivariance bias which convolutions naturally offer. To this end, we introduce Convolution Injector (CoIn), an add-on module that injects convolutions which are rich in locality and equivariance biases into a pretrained ViT for effective adaptation in visuo-motor control. We evaluate CoIn with three distinct types of pretrained ViTs (CLIP, MVP, VC-1) across 12 varied control tasks within three separate domains (Adroit, MetaWorld, DMC), and demonstrate that CoIn consistently enhances control task performance across all experimented environments and models, validating the effectiveness of providing pretrained ViTs with control-centric biases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongyoon Hwang;Byungkun Lee;Hojoon Lee;Hyunseung Kim;Jaegul Choo", "authorids": "~Dongyoon_Hwang1;~Byungkun_Lee1;~Hojoon_Lee1;~Hyunseung_Kim1;~Jaegul_Choo1", "gender": "M;M;M;M;M", "homepage": ";;https://joonleesky.github.io/;;https://sites.google.com/site/jaegulchoo/", "dblp": ";;;244/0949;07/2074", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;RFjZjzkAAAAJ;;https://scholar.google.com/citations?view_op=list_works;GHJYsLEAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Dongyoon_Hwang1;~Byungkun_Lee1;~Hojoon_Lee1;~Hyunseung_Kim1;~Jaegul_Choo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Sony AI;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;sony.com;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Intern;PhD student;Associate Professor", "bibtex": "@inproceedings{\nhwang2024adapting,\ntitle={Adapting Pretrained ViTs with Convolution Injector for Visuo-Motor Control},\nauthor={Dongyoon Hwang and Byungkun Lee and Hojoon Lee and Hyunseung Kim and Jaegul Choo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CuiRGtVI55}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8231360, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IMdUSPJwlXIJ:scholar.google.com/&scioq=Adapting+Pretrained+ViTs+with+Convolution+Injector+for+Visuo-Motor+Control&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.ac.kr;sony.com;kaist.ac.kr;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Sony", "aff_unique_dep": ";Sony AI", "aff_unique_url": "https://www.kaist.ac.kr;https://www.sony.com", "aff_unique_abbr": "KAIST;Sony AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "South Korea;Japan" }, { "title": "Standardized Interpretable Fairness Measures for Continuous Risk Scores", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34651", "id": "CvRu2inbGV", "proceeding": "https://proceedings.mlr.press/v235/becker24a.html", "pdf": "https://openreview.net/pdf?id=CvRu2inbGV", "openreview": "https://openreview.net/forum?id=CvRu2inbGV", "author_site": "Ann-Kristin Becker, Oana Dumitrasc, Klaus Broelemann", "tldr": "", "abstract": "We propose a standardized version of fairness measures for continuous scores with a reasonable interpretation based 
on the Wasserstein distance. Our measures are easily computable and well suited for quantifying and interpreting the strength of group disparities as well as for comparing biases across different models, datasets, or time points. We derive a link between the different families of existing fairness measures for scores and show that the proposed standardized fairness measures outperform ROC-based fairness measures because they are more explicit and can quantify significant biases that ROC-based fairness measures miss.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ann-Kristin Becker;Oana Dumitrasc;Klaus Broelemann", "authorids": "~Ann-Kristin_Becker1;~Oana_Dumitrasc1;~Klaus_Broelemann1", "gender": ";;", "homepage": ";https://www.linkedin.com/in/oana-dumitrasc-34091872/;", "dblp": ";;00/7271.html", "google_scholar": "https://scholar.google.de/citations?user=RrljcokAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ann-Kristin_Becker1;~Oana_Dumitrasc1;~Klaus_Broelemann1", "aff": "SCHUFA Holding AG;Schufa Holding AG;SCHUFA", "aff_domain": "schufa.de;schufa.de;schufa.de", "position": "Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nbecker2024standardized,\ntitle={Standardized Interpretable Fairness Measures for Continuous Risk Scores},\nauthor={Ann-Kristin Becker and Oana Dumitrasc and Klaus Broelemann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CvRu2inbGV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 666192, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1245252129915911095&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "schufa.de;schufa.de;schufa.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "SCHUFA Holding AG", "aff_unique_dep": "", "aff_unique_url": "https://www.schufa.de", "aff_unique_abbr": "SCHUFA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations for Vision Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34650", "id": "Cw6Xl0g8a5", "proceeding": "https://proceedings.mlr.press/v235/wang24bo.html", "pdf": "https://openreview.net/pdf?id=Cw6Xl0g8a5", "openreview": "https://openreview.net/forum?id=Cw6Xl0g8a5", "author_site": "Hengyi Wang, Shiwei Tan, Hao Wang", "tldr": "", "abstract": "Vision transformers (ViTs) have emerged as a significant area of focus, particularly for their capacity to be jointly trained with large language models and to serve as robust vision foundation models. Yet, the development of trustworthy explanation methods for ViTs has lagged, particularly in the context of post-hoc interpretations of ViT predictions. Existing sub-image selection approaches, such as feature-attribution and conceptual models, fall short in this regard. This paper proposes five desiderata for explaining ViTs -- faithfulness, stability, sparsity, multi-level structure, and parsimony -- and demonstrates the inadequacy of current methods in meeting these criteria comprehensively. 
We introduce a variational Bayesian explanation framework, dubbed ProbAbilistic Concept Explainers (PACE), which models the distributions of patch embeddings to provide trustworthy post-hoc conceptual explanations. Our qualitative analysis reveals the distributions of patch-level concepts, elucidating the effectiveness of ViTs by modeling the joint distribution of patch embeddings and ViT's predictions. Moreover, these patch-level explanations bridge the gap between image-level and dataset-level explanations, thus completing the multi-level structure of PACE. Through extensive experiments on both synthetic and real-world datasets, we demonstrate that PACE surpasses state-of-the-art methods in terms of the defined desiderata.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hengyi Wang;Shiwei Tan;Hao Wang", "authorids": "~Hengyi_Wang1;~Shiwei_Tan1;~Hao_Wang3", "gender": "M;M;M", "homepage": "https://carbonated-law-fad.notion.site/Hengyi-Wang-76cf0e03fa5648cdaf45218e69e840cc;;http://www.wanghao.in", "dblp": "215/1801;380/3073;w/HaoWang-14", "google_scholar": "eKs-ZGQAAAAJ;;NrOA9QoAAAAJ", "orcid": ";;", "linkedin": ";shiwei-tan-4a3520275/;", "or_profile": "~Hengyi_Wang1;~Shiwei_Tan1;~Hao_Wang4", "aff": "Rutgers University;Rutgers University;Rutgers University", "aff_domain": "rutgers.edu;rutgers.edu;cs.rutgers.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024probabilistic,\ntitle={Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations for Vision Foundation Models},\nauthor={Hengyi Wang and Shiwei Tan and Hao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Cw6Xl0g8a5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5884274, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=763563660049109852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "rutgers.edu;rutgers.edu;cs.rutgers.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Information Complexity of Stochastic Convex Optimization: Applications to Generalization, Memorization, and Tracing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34649", "id": "CyEJn71Z00", "proceeding": "https://proceedings.mlr.press/v235/attias24a.html", "pdf": "https://openreview.net/pdf?id=CyEJn71Z00", "openreview": "https://openreview.net/forum?id=CyEJn71Z00", "author_site": "Idan Attias, Gintare Karolina Dziugaite, Mahdi Haghifam, Roi Livni, Daniel Roy", "tldr": "", "abstract": "In this work, we investigate the interplay between memorization and learning in the context of *stochastic convex optimization* (SCO). We define memorization via the information a learning algorithm reveals about its training data points. We then quantify this information using the framework of conditional mutual information (CMI) proposed by Steinke and Zakynthinou (2020). Our main result is a precise characterization of the tradeoff between the accuracy of a learning algorithm and its CMI, answering an open question posed by Livni (2023). 
We show that, in the $L^2$ Lipschitz--bounded setting and under strong convexity, every learner with an excess error $\\epsilon$ has CMI bounded below by $\\Omega(1/\\epsilon^2)$ and $\\Omega(1/\\epsilon)$, respectively. We further demonstrate the essential role of memorization in learning problems in SCO by designing an adversary capable of accurately identifying a significant fraction of the training samples in specific SCO problems. Finally, we enumerate several implications of our results, such as a limitation of generalization bounds based on CMI and the incompressibility of samples in SCO problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Idan Attias;Gintare Karolina Dziugaite;Mahdi Haghifam;Roi Livni;Daniel M. Roy", "authorids": "~Idan_Attias1;~Gintare_Karolina_Dziugaite1;~Mahdi_Haghifam2;~Roi_Livni1;~Daniel_M._Roy1", "gender": "M;F;M;Not Specified;M", "homepage": "https://www.idanattias.com;http://gkdz.org/;https://mhaghifam.github.io/mahdihaghifam/;https://www.rlivni.sites.tau.ac.il/;http://danroy.org", "dblp": "228/6803;163/1774;183/6215;59/11348;04/2068", "google_scholar": "-L6uUy0AAAAJ;5K1QB_8AAAAJ;https://scholar.google.com/schhp?hl=en;xhU85M4AAAAJ;https://scholar.google.ca/citations?user=vA6ZQ_AAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Idan_Attias1;~Gintare_Karolina_Dziugaite1;~Mahdi_Haghifam2;~Roi_Livni1;~Daniel_M_Roy1", "aff": "Tel Aviv University;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Northeastern University;Tel Aviv University;University of Toronto", "aff_domain": "tau.ac.il;mila.umontreal.ca;neu.edu;tau.ac.il;utoronto.ca", "position": "PhD student;Member;Postdoc;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nattias2024information,\ntitle={Information Complexity of Stochastic Convex Optimization: Applications to Generalization, Memorization, and Tracing},\nauthor={Idan Attias and Gintare Karolina Dziugaite and Mahdi Haghifam and Roi Livni and Daniel M. 
Roy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CyEJn71Z00}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 466019, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=880593846316486420&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tau.ac.il;mila.umontreal.ca;neu.edu;tau.ac.il;utoronto.ca", "author_num": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Tel Aviv University;University of Montreal;Northeastern University;University of Toronto", "aff_unique_dep": ";Montreal Institute for Learning Algorithms;;", "aff_unique_url": "https://www.tau.ac.il;https://www.umontreal.ca;https://www.northeastern.edu;https://www.utoronto.ca", "aff_unique_abbr": "TAU;UM;NEU;U of T", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "Israel;Canada;United States" }, { "title": "Outlier-robust Kalman Filtering through Generalised Bayes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34648", "id": "D2MNVeVh5J", "proceeding": "https://proceedings.mlr.press/v235/duran-martin24a.html", "pdf": "https://openreview.net/pdf?id=D2MNVeVh5J", "openreview": "https://openreview.net/forum?id=D2MNVeVh5J", "author_site": "Gerardo Duran-Martin, Matias Altamirano, Alex Shestopaloff, Leandro S\u00e1nchez-Betancourt, Jeremias Knoblauch, Matt Jones, Francois-Xavier Briol, Kevin Murphy", "tldr": "", "abstract": "We derive a novel, provably robust, efficient, and closed-form Bayesian update rule for online filtering in state-space models in the presence of outliers and misspecified measurement models. Our method combines generalised Bayesian inference with filtering methods such as the extended and ensemble Kalman filter. We use the former to show robustness and the latter to ensure computational efficiency in the case of nonlinear models. Our method matches or outperforms other robust filtering methods (such as those based on variational Bayes) at a much lower computational cost. 
We show this empirically on a range of filtering problems with outlier measurements, such as object tracking, state estimation in high-dimensional chaotic systems, and online learning of neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gerardo Duran-Martin;Matias Altamirano;Alex Shestopaloff;Leandro S\u00e1nchez-Betancourt;Jeremias Knoblauch;Matt Jones;Francois-Xavier Briol;Kevin Patrick Murphy", "authorids": "~Gerardo_Duran-Martin1;~Matias_Altamirano2;~Alex_Shestopaloff1;~Leandro_S\u00e1nchez-Betancourt1;~Jeremias_Knoblauch1;~Matt_Jones1;~Francois-Xavier_Briol1;~Kevin_Patrick_Murphy1", "gender": "M;M;;M;M;M;M;", "homepage": "https://grdm.io;https://maltamiranomontero.github.io/;;https://leandro-sbetancourt.github.io;https://jeremiasknoblauch.github.io/;http://Matt.Colorado.edu;https://fxbriol.github.io;https://www.cs.ubc.ca/~murphyk/", "dblp": ";;220/1933.html;;220/5462;;https://dblp.uni-trier.de/pid/173/4982;26/2599", "google_scholar": "jVajee8AAAAJ;qxVZ-mIAAAAJ;;https://scholar.google.co.uk/citations?user=RLQF_UMAAAAJ;https://scholar.google.co.uk/citations?user=4TPsxlsAAAAJ;Q7FDrMIAAAAJ;https://scholar.google.co.uk/citations?user=yLBYtAwAAAAJ;MxxZkEcAAAAJ", "orcid": ";;;0000-0001-6447-7105;;;0000-0002-0181-2559;", "linkedin": ";;;leandro-sanchez-betancourt/;;;;", "or_profile": "~Gerardo_Duran-Martin1;~Matias_Altamirano2;~Alex_Shestopaloff1;~Leandro_S\u00e1nchez-Betancourt1;~Jeremias_Knoblauch1;~Matt_Jones1;~Francois-Xavier_Briol1;~Kevin_Patrick_Murphy1", "aff": "Queen Mary University;University College London, University of London;Queen Mary, University of London;University of Oxford;;University of Colorado Boulder;University College London, University of London;Google", "aff_domain": "qmul.ac.uk;ucl.ac.uk;qmul.ac.uk;ox.ac.uk;;colorado.edu;ucl.ac.uk;google.com", "position": "PhD student;PhD student;Senior Lecturer;Researcher;;Full Professor;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\nduran-martin2024outlierrobust,\ntitle={Outlier-robust Kalman Filtering through Generalised Bayes},\nauthor={Gerardo Duran-Martin and Matias Altamirano and Alex Shestopaloff and Leandro S{\\'a}nchez-Betancourt and Jeremias Knoblauch and Matt Jones and Francois-Xavier Briol and Kevin Patrick Murphy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D2MNVeVh5J}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5735229, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9368055209794536353&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "qmul.ac.uk;ucl.ac.uk;qmul.ac.uk;ox.ac.uk;;colorado.edu;ucl.ac.uk;google.com", "author_num": 8, "aff_unique_index": "0;1;2;3;4;1;5", "aff_unique_norm": "Queen Mary University of London;University College London;Queen Mary, University of London;University of Oxford;University of Colorado;Google", "aff_unique_dep": ";;;;;Google", "aff_unique_url": "https://www.qmul.ac.uk;https://www.ucl.ac.uk;https://www.qmul.ac.uk;https://www.ox.ac.uk;https://www.colorado.edu;https://www.google.com", "aff_unique_abbr": "QMUL;UCL;QMUL;Oxford;CU;Google", "aff_campus_unique_index": "0;0;2;3", "aff_campus_unique": "London;;Boulder;Mountain View", "aff_country_unique_index": "0;0;0;0;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "A Fine-grained Analysis of Fitted Q-evaluation: Beyond Parametric Models", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34647", "id": "D32aTei4p5", "proceeding": "https://proceedings.mlr.press/v235/wang24be.html", "pdf": "https://openreview.net/pdf?id=D32aTei4p5", "openreview": "https://openreview.net/forum?id=D32aTei4p5", "author_site": "Jiayi Wang, Zhengling Qi, Raymond K. W. Wong", "tldr": "", "abstract": "In this paper, we delve into the statistical analysis of the fitted Q-evaluation (FQE) method, which focuses on estimating the value of a target policy using offline data generated by some behavior policy. We provide a comprehensive theoretical understanding of FQE estimators under both parametric and non-parametric models on the Q-function. Specifically, we address three key questions related to FQE that remain largely unexplored in the current literature: (1) Is the optimal convergence rate for estimating the policy value regarding the sample size $n$ ($n^{\u22121/2}$) achievable for FQE under a nonparametric model with a fixed horizon ($T$ )? (2) How does the error bound depend on the horizon T ? (3) What is the role of the probability ratio function in improving the convergence of FQE estimators? Specifically, we show that under the completeness assumption of Q-functions, which is mild in the non-parametric setting, the estimation errors for policy value using both parametric and non-parametric FQE estimators can achieve an optimal rate in terms of n. The corresponding error bounds in terms of both $n$ and $T$ are also established. With an additional realizability assumption on ratio functions, the rate of estimation errors can be improved from $T^{ 1.5}/\\sqrt{n}$ to $T /\\sqrt{n}$, which matches the sharpest known bound in the current literature under the tabular setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiayi Wang;Zhengling Qi;Raymond K. W. Wong", "authorids": "~Jiayi_Wang7;~Zhengling_Qi1;~Raymond_K._W._Wong1", "gender": "F;;", "homepage": "https://jiayiwang1017.github.io/;https://sites.google.com/view/statsqizl/home?authuser=0;", "dblp": ";173/0201;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jiayi_Wang7;~Zhengling_Qi1;~Raymond_K._W._Wong1", "aff": "University of Texas at Dallas;George Washington University;", "aff_domain": "utdallas.edu;gwu.edu;", "position": "Assistant Professor;Assistant Professor;", "bibtex": "@inproceedings{\nwang2024a,\ntitle={A Fine-grained Analysis of Fitted Q-evaluation: Beyond Parametric Models},\nauthor={Jiayi Wang and Zhengling Qi and Raymond K. W. 
Wong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D32aTei4p5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 580726, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:z7uzPmbrbpwJ:scholar.google.com/&scioq=A+Fine-grained+Analysis+of+Fitted+Q-evaluation:+Beyond+Parametric+Models&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "utdallas.edu;gwu.edu;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Texas at Dallas;George Washington University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utdallas.edu;https://www.gwu.edu", "aff_unique_abbr": "UT Dallas;GWU", "aff_campus_unique_index": "0", "aff_campus_unique": "Dallas;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Generalized Neural Collapse for a Large Number of Classes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34646", "id": "D4B7kkB89m", "proceeding": "https://proceedings.mlr.press/v235/jiang24i.html", "pdf": "https://openreview.net/pdf?id=D4B7kkB89m", "openreview": "https://openreview.net/forum?id=D4B7kkB89m", "author_site": "Jiachen Jiang, Jinxin Zhou, Peng Wang, Qing Qu, Dustin Mixon, Chong You, Zhihui Zhu", "tldr": "", "abstract": "Neural collapse provides an elegant mathematical characterization of learned last layer representations (a.k.a. features) and classifier weights in deep classification models. Such results not only provide insights but also motivate new techniques for improving practical deep models. However, most of the existing empirical and theoretical studies in neural collapse focus on the case that the number of classes is small relative to the dimension of the feature space. This paper extends neural collapse to cases where the number of classes are much larger than the dimension of feature space, which broadly occur for language models, retrieval systems, and face recognition applications. We show that the features and classifier exhibit a generalized neural collapse phenomenon, where the minimum one-vs-rest margins is maximized. We provide empirical study to verify the occurrence of generalized neural collapse in practical deep neural networks. Moreover, we provide theoretical study to show that the generalized neural collapse provably occurs under unconstrained feature model with spherical constraint, under certain technical conditions on feature dimension and number of classes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiachen Jiang;Jinxin Zhou;Peng Wang;Qing Qu;Dustin G. 
Mixon;Chong You;Zhihui Zhu", "authorids": "~Jiachen_Jiang1;~Jinxin_Zhou2;~Peng_Wang23;~Qing_Qu2;~Dustin_G._Mixon1;~Chong_You2;~Zhihui_Zhu1", "gender": "M;;M;M;;M;M", "homepage": "https://kongwanbianjinyu.github.io/;;https://peng8wang.github.io/;https://qingqu.engin.umich.edu/;;https://sites.google.com/view/cyou;https://zhihuizhu.github.io/", "dblp": ";;95/4442-98;127/6874-1;;164/7311;71/8081", "google_scholar": "MNgI0PUAAAAJ;;baF3HKUAAAAJ;JfblW3MAAAAJ;;Mfrpm_IAAAAJ;gmSwszcAAAAJ", "orcid": ";;0000-0002-6799-0745;0000-0001-9136-558X;;;", "linkedin": "jiachen-jiang-b5916221b/;;;qing-q-1a0b9746/;;;", "or_profile": "~Jiachen_Jiang1;~Jinxin_Zhou2;~Peng_Wang23;~Qing_Qu2;~Dustin_G._Mixon1;~Chong_You2;~Zhihui_Zhu1", "aff": "Ohio State University, Columbus;;University of Michigan - Ann Arbor;University of Michigan;;Google;Ohio State University, Columbus", "aff_domain": "osu.edu;;umich.edu;umich.edu;;google.com;osu.edu", "position": "PhD student;;Postdoc;Assistant Professor;;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\njiang2024generalized,\ntitle={Generalized Neural Collapse for a Large Number of Classes},\nauthor={Jiachen Jiang and Jinxin Zhou and Peng Wang and Qing Qu and Dustin G. Mixon and Chong You and Zhihui Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D4B7kkB89m}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8521449, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14730372549421542554&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "osu.edu;;umich.edu;umich.edu;;google.com;osu.edu", "author_num": 7, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Ohio State University;University of Michigan;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.osu.edu;https://www.umich.edu;https://www.google.com", "aff_unique_abbr": "OSU;UM;Google", "aff_campus_unique_index": "0;1;3;0", "aff_campus_unique": "Columbus;Ann Arbor;;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning-Efficient Yet Generalizable Collaborative Filtering for Item Recommendation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34645", "id": "D5IRvFF1lN", "proceeding": "https://proceedings.mlr.press/v235/pu24a.html", "pdf": "https://openreview.net/pdf?id=D5IRvFF1lN", "openreview": "https://openreview.net/forum?id=D5IRvFF1lN", "author_site": "Yuanhao Pu, Xiaolong Chen, Xu Huang, Jin Chen, Defu Lian, Enhong Chen", "tldr": "", "abstract": "The weighted squared loss is a common component in several Collaborative Filtering (CF) algorithms for item recommendation, including the representative implicit Alternating Least Squares (iALS). Despite its widespread use, this loss function lacks a clear connection to ranking objectives such as Discounted Cumulative Gain (DCG), posing a fundamental challenge in explaining the exceptional ranking performance observed in these algorithms. In this work, we make a breakthrough by establishing a connection between squared loss and ranking metrics through a Taylor expansion of the DCG-consistent surrogate loss\u2014softmax loss. We also discover a new surrogate squared loss function, namely Ranking-Generalizable Squared (RG$^2$) loss, and conduct thorough theoretical analyses on the DCG-consistency of the proposed loss function. 
Later, we present an example of utilizing the RG$^2$ loss with Matrix Factorization (MF), coupled with a generalization upper bound and an ALS optimization algorithm that leverages closed-form solutions over all items. Experimental results over three public datasets demonstrate the effectiveness of the RG$^2$ loss, exhibiting ranking performance on par with, or even surpassing, the softmax loss while achieving faster convergence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuanhao Pu;Xiaolong Chen;Xu Huang;Jin Chen;Defu Lian;Enhong Chen", "authorids": "~Yuanhao_Pu1;~Xiaolong_Chen5;~Xu_Huang2;~Jin_Chen4;~Defu_Lian1;~Enhong_Chen1", "gender": "M;M;M;F;M;M", "homepage": "https://yuanhao53.github.io;https://github.com/Xiuchen519;https://xuhwang.github.io;https://herecj.github.io/;https://faculty.ustc.edu.cn/liandefu/en/index.htm;http://staff.ustc.edu.cn/~cheneh", "dblp": "345/6264;;00/2922-8.html;03/5287-8;87/10734;07/258", "google_scholar": "ojWGHZkAAAAJ;;fXFMb-gAAAAJ;h-v_wdIAAAAJ;QW0ad4sAAAAJ;Q9h02J0AAAAJ", "orcid": "0000-0002-9485-5573;0009-0007-2714-6404;0000-0003-4354-334X;0000-0001-6440-2242;0000-0002-3507-9607;0000-0002-4835-4102", "linkedin": ";;;;;", "or_profile": "~Yuanhao_Pu1;~Xiaolong_Chen5;~Xu_Huang2;~Jin_Chen4;~Defu_Lian1;~Enhong_Chen1", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;Hong Kong University of Science and Technology;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "mail.ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ust.hk;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;MS student;PhD student;Research Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\npu2024learningefficient,\ntitle={Learning-Efficient Yet Generalizable Collaborative Filtering for Item Recommendation},\nauthor={Yuanhao Pu and Xiaolong Chen and Xu Huang and Jin Chen and Defu Lian and Enhong Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D5IRvFF1lN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1136596, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12552034837451964555&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "mail.ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ust.hk;ustc.edu.cn;ustc.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "University of Science and Technology of China;Hong Kong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ust.hk", "aff_unique_abbr": "USTC;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Improved Dimensionality Dependence for Zeroth-Order Optimisation over Cross-Polytopes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34644", "id": "D7wi9LIE6i", "proceeding": "https://proceedings.mlr.press/v235/shao24a.html", "pdf": "https://openreview.net/pdf?id=D7wi9LIE6i", "openreview": "https://openreview.net/forum?id=D7wi9LIE6i", "tldr": "", "abstract": "This work proposes an algorithm improving the dimensionality dependence for gradient-free optimisation over cross-polytopes, which has many applications such as 
adversarial attacks, explainable AI and sparse regression. For bandit convex optimisation with two-point feedback over cross-polytopes, the state-of-the-art algorithms have a dimensionality dependence of $\\mathcal{O}(\\sqrt{d\\log d})$, while the known lower bound is of the form $\\Omega(\\sqrt{d(\\log d)^{-1}})$. We propose a mirror descent algorithm equipped with a symmetric version of the negative $\\frac{1}{2}$-Tsallis entropy. Combined with an $\\ell_1$-ellipsoidal smoothing-based gradient estimator, the proposed algorithm guarantees a dimensionality dependence of $\\mathcal{O}(\\sqrt{d})$, which improves on the state-of-the-art algorithms by a factor of $\\sqrt{\\log d}$. The idea can be further applied to optimising non-smooth and non-convex functions. We propose an algorithm whose convergence depends on $\\mathcal{O}(d)$, which is the best-known dimensionality dependence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weijia Shao", "authorids": "~Weijia_Shao1", "gender": "M", "homepage": "https://scholar.google.de/citations?user=4SCGvgoAAAAJ&hl=en&oi=ao", "dblp": "", "google_scholar": "https://scholar.google.de/citations?user=4SCGvgoAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Weijia_Shao1", "aff": "Federal Institute for Occupational Safety and Health", "aff_domain": "baua.de", "position": "Postdoc", "bibtex": "@inproceedings{\nshao2024improved,\ntitle={Improved Dimensionality Dependence for Zeroth-Order Optimisation over Cross-Polytopes},\nauthor={Weijia Shao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D7wi9LIE6i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 390957, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_7sfNGwbFK4J:scholar.google.com/&scioq=Improved+Dimensionality+Dependence+for+Zeroth-Order+Optimisation+over+Cross-Polytopes&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "baua.de", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Federal Institute for Occupational Safety and Health", "aff_unique_dep": "", "aff_unique_url": "https://www.baua.de", "aff_unique_abbr": "BAuA", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Reprompting: Automated Chain-of-Thought Prompt Inference Through Gibbs Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34643", "id": "D8zn1DnTuj", "proceeding": "https://proceedings.mlr.press/v235/xu24b.html", "pdf": "https://openreview.net/pdf?id=D8zn1DnTuj", "openreview": "https://openreview.net/forum?id=D8zn1DnTuj", "author_site": "Weijia Xu, Andrzej Banburski-Fahey, Nebojsa Jojic", "tldr": "", "abstract": "We introduce Reprompting, an iterative sampling algorithm that automatically learns the Chain-of-Thought (CoT) recipes for a given task without human intervention. Through Gibbs sampling, Reprompting infers the CoT recipes that work consistently well for a set of training samples by iteratively sampling new recipes using previously sampled recipes as parent prompts to solve other training problems. We conduct extensive experiments on 20 challenging reasoning tasks. Results show that Reprompting outperforms human-written CoT prompts substantially by +9.4 points on average.
It also achieves consistently better performance than the state-of-the-art prompt optimization and decoding algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weijia Xu;Andrzej Banburski;Nebojsa Jojic", "authorids": "~Weijia_Xu3;~Andrzej_Banburski1;~Nebojsa_Jojic1", "gender": "F;M;", "homepage": "https://weijia-xu.github.io;http://www.andrzejbanburski.com;www.research.microsoft.com/~jojic", "dblp": "68/4886;194/5464;20/1944", "google_scholar": "6uUw43gAAAAJ;;", "orcid": "0000-0002-5732-8926;;", "linkedin": ";;", "or_profile": "~Weijia_Xu3;~Andrzej_Banburski1;~Nebojsa_Jojic1", "aff": "Microsoft Research;Microsoft;Microsoft Research", "aff_domain": "research.microsoft.com;microsoft.com; ", "position": "Researcher;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nxu2024reprompting,\ntitle={Reprompting: Automated Chain-of-Thought Prompt Inference Through Gibbs Sampling},\nauthor={Weijia Xu and Andrzej Banburski and Nebojsa Jojic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D8zn1DnTuj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2335781, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10565548451033148331&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "research.microsoft.com;microsoft.com; ", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarially Robust Deep Multi-View Clustering: A Novel Attack and Defense Framework", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34642", "id": "D9EfAkQCzh", "proceeding": "https://proceedings.mlr.press/v235/huang24ai.html", "pdf": "https://openreview.net/pdf?id=D9EfAkQCzh", "openreview": "https://openreview.net/forum?id=D9EfAkQCzh", "author_site": "Haonan Huang, Guoxu Zhou, Yanghang Zheng, Yuning Qiu, Andong Wang, Qibin Zhao", "tldr": "", "abstract": "Deep Multi-view Clustering (DMVC) stands out as a widely adopted technique aiming at enhanced clustering performance by leveraging diverse data sources. However, the critical issue of vulnerability to adversarial attacks is unexplored due to the lack of well-defined attack objectives. To fill this crucial gap, this paper is the first work to investigate the possibility of adversarial attacks on DMVC models. Specifically, we introduce an adversarial attack with Generative Adversarial Networks (GANs) with the aim to maximally change the complementarity and consistency of multiple views, thus leading to wrong clustering. Building upon this adversarial context, in the realm of defense, we propose a novel Adversarially Robust Deep Multi-View Clustering by leveraging adversarial training. Based on the analysis from an information-theoretic perspective, we design an Attack Mitigator that provides a foundation to guarantee the adversarial robustness of our DMVC models. Experiments conducted on multi-view datasets confirmed that our attack framework effectively reduces the clustering performance of the target model. 
Furthermore, our proposed adversarially robust method is also demonstrated to be an effective defense against such attacks. This work is a pioneer in exploring adversarial threats and advancing both theoretical understanding and practical strategies for robust multi-view clustering. Code is available at https://github.com/libertyhhn/AR-DMVC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haonan Huang;Guoxu Zhou;Yanghang Zheng;Yuning Qiu;Andong Wang;Qibin Zhao", "authorids": "~Haonan_Huang1;~Guoxu_Zhou1;~Yanghang_Zheng1;~Yuning_Qiu1;~Andong_Wang1;~Qibin_Zhao1", "gender": "M;M;M;M;M;M", "homepage": "https://libertyhhn.github.io/;https://teacher.gdut.edu.cn/gxzhou/zh_CN/index.htm;https://www.researchgate.net/scientific-contributions/Yanghang-Zheng-2262816920;https://www.researchgate.net/profile/Yuning-Qiu-3;https://www.patternrecognition.asia/wad/;https://qibinzhao.github.io", "dblp": "280/0596;33/7727;;210/1002;190/5540;13/1193", "google_scholar": "8zy4LVYAAAAJ;BIUkSFEAAAAJ;;https://scholar.google.com.hk/citations?user=zGb0k1MAAAAJ;vuPyxGwAAAAJ;https://scholar.google.co.jp/citations?hl=en", "orcid": ";;;0000-0003-0268-0890;;0000-0002-4442-3182", "linkedin": ";;;;;", "or_profile": "~Haonan_Huang1;~Guoxu_Zhou1;~Yanghang_Zheng1;~Yuning_Qiu1;~Andong_Wang1;~Qibin_Zhao1", "aff": "Guangdong University of Technology;Guangdong University of Technology;Guangdong University of Technology;RIKEN;RIKEN AIP;RIKEN", "aff_domain": "gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;riken.jp;riken.jp;riken.jp", "position": "PhD student;Full Professor;MS student;SPDR;Postdoc;Team Leader", "bibtex": "@inproceedings{\nhuang2024adversarially,\ntitle={Adversarially Robust Deep Multi-View Clustering: A Novel Attack and Defense Framework},\nauthor={Haonan Huang and Guoxu Zhou and Yanghang Zheng and Yuning Qiu and Andong Wang and Qibin Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=D9EfAkQCzh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9955517, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11211904118125211372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;riken.jp;riken.jp;riken.jp", "author_num": 6, "aff_unique_index": "0;0;0;1;1;1", "aff_unique_norm": "Guangdong University of Technology;RIKEN", "aff_unique_dep": ";", "aff_unique_url": "http://www.gdut.edu.cn;https://www.riken.jp", "aff_unique_abbr": "GDUT;RIKEN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;1", "aff_country_unique": "China;Japan" }, { "title": "Mollification Effects of Policy Gradient Methods", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34641", "id": "DA2AiCiCaM", "proceeding": "https://proceedings.mlr.press/v235/wang24x.html", "pdf": "https://openreview.net/pdf?id=DA2AiCiCaM", "openreview": "https://openreview.net/forum?id=DA2AiCiCaM", "author_site": "Tao Wang, Sylvia Herbert, Sicun Gao", "tldr": "", "abstract": "Policy gradient methods have enabled deep reinforcement learning (RL) to approach challenging continuous control problems, even when the underlying systems involve highly nonlinear dynamics that generate complex non-smooth optimization landscapes. 
We develop a rigorous framework for understanding how policy gradient methods mollify non-smooth optimization landscapes to enable effective policy search, as well as its downside: while making the objective function smoother and easier to optimize, the stochastic objective deviates further from the original problem. We demonstrate the equivalence between policy gradient methods and solving backward heat equations. Given the ill-posedness of backward heat equations in PDE theory, we present a fundamental challenge to the use of policy gradient under stochasticity. Moreover, we make the connection between this limitation and the uncertainty principle in harmonic analysis to understand the effects of exploration with stochastic policies in RL. We also provide experimental results to illustrate both the positive and negative aspects of mollification effects in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Wang;Sylvia Lee Herbert;Sicun Gao", "authorids": "~Tao_Wang27;~Sylvia_Lee_Herbert1;~Sicun_Gao1", "gender": ";F;M", "homepage": "https://taowang0.github.io/;https://sylviaherbert.com;", "dblp": ";192/3242;22/8296", "google_scholar": ";;", "orcid": ";0000-0002-3863-8945;", "linkedin": ";;", "or_profile": "~Tao_Wang27;~Sylvia_Lee_Herbert1;~Sicun_Gao1", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024mollification,\ntitle={Mollification Effects of Policy Gradient Methods},\nauthor={Tao Wang and Sylvia Lee Herbert and Sicun Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DA2AiCiCaM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9245423, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:egxxOk-lQxMJ:scholar.google.com/&scioq=Mollification+Effects+of+Policy+Gradient+Methods&hl=en&as_sdt=0,34", "gs_version_total": 5, "email": "ucsd.edu;ucsd.edu;ucsd.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "High-Probability Convergence for Composite and Distributed Stochastic Minimization and Variational Inequalities with Heavy-Tailed Noise", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34640", "id": "DBI6AuCD4a", "proceeding": "https://proceedings.mlr.press/v235/gorbunov24a.html", "pdf": "https://openreview.net/pdf?id=DBI6AuCD4a", "openreview": "https://openreview.net/forum?id=DBI6AuCD4a", "author_site": "Eduard Gorbunov, Abdurakhmon Sadiev, Marina Danilova, Samuel Horv\u00e1th, Gauthier Gidel, Pavel Dvurechenskii, Alexander Gasnikov, Peter Richtarik", "tldr": "", "abstract": "High-probability analysis of stochastic first-order optimization methods under mild assumptions on the noise has been gaining a lot of attention in recent years. Typically, gradient clipping is one of the key algorithmic ingredients to derive good high-probability guarantees when the noise is heavy-tailed.
However, if implemented naively, clipping can spoil the convergence of the popular methods for composite and distributed optimization (Prox-SGD/Parallel SGD) even in the absence of any noise. Due to this reason, many works on high-probability analysis consider only unconstrained non-distributed problems, and the existing results for composite/distributed problems do not include some important special cases (like strongly convex problems) and are not optimal. To address this issue, we propose new stochastic methods for composite and distributed optimization based on the clipping of stochastic gradient differences and prove tight high-probability convergence results (including nearly optimal ones) for the new methods. In addition, we also develop new methods for composite and distributed variational inequalities and analyze the high-probability convergence of these methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eduard Gorbunov;Abdurakhmon Sadiev;Marina Danilova;Samuel Horv\u00e1th;Gauthier Gidel;Pavel Dvurechensky;Alexander Gasnikov;Peter Richt\u00e1rik", "authorids": "~Eduard_Gorbunov1;~Abdurakhmon_Sadiev1;~Marina_Danilova1;~Samuel_Horv\u00e1th1;~Gauthier_Gidel1;~Pavel_Dvurechensky1;~Alexander_Gasnikov1;~Peter_Richt\u00e1rik1", "gender": "M;M;F;M;M;;M;M", "homepage": "https://eduardgorbunov.github.io;https://sadiev.netlify.app;https://marinadanya.github.io/;https://sites.google.com/view/samuelhorvath;https://gauthiergidel.github.io/;http://wias-berlin.de/people/dvureche/?lang=1;https://arxiv.org/search/?query=Gasnikov&searchtype=all&source=header;https://richtarik.org", "dblp": "215/5512.html;264/9455;;234/8604;188/6326;164/7242;153/1930;62/8001", "google_scholar": "https://scholar.google.ru/citations?user=85j2RqQAAAAJ;R-xZRIAAAAAJ;mAip6kUAAAAJ;k252J7kAAAAJ;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ;28MSou8AAAAJ;AmeE8qkAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0003-0619-9260;;0000-0003-1201-2343;;0000-0003-4380-5848", "linkedin": ";;;samuel-horvath/;;;;richtarik/", "or_profile": "~Eduard_Gorbunov1;~Abdurakhmon_Sadiev1;~Marina_Danilova1;~Samuel_Horv\u00e1th1;~Gauthier_Gidel1;~Pavel_Dvurechensky1;~Alexander_Vladimirovich_Gasnikov1;~Peter_Richtarik1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;King Abdullah University of Science and Technology;Moscow Institute of Physics and Technology;MBZUAI;Mila - Quebec Artificial Intelligence Institute;Weierstrass Institute for Applied Analysis and Stochastics;Moscow Institute of Physics and Technology;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "mbzuai.ac.ae;kaust.edu.sa;mipt.ru;mbzuai.ac.ae;mila.quebec;wias-berlin.de;mipt.ru;kaust.edu.sa", "position": "Postdoc;PhD student;Researcher;Assistant Professor;Assistant Professor;Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ngorbunov2024highprobability,\ntitle={High-Probability Convergence for Composite and Distributed Stochastic Minimization and Variational Inequalities with Heavy-Tailed Noise},\nauthor={Eduard Gorbunov and Abdurakhmon Sadiev and Marina Danilova and Samuel Horv{\\'a}th and Gauthier Gidel and Pavel Dvurechensky and Alexander Gasnikov and Peter Richt{\\'a}rik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DBI6AuCD4a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1108522, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, 
"gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3668356256689388675&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 14, "email": "mbzuai.ac.ae;kaust.edu.sa;mipt.ru;mbzuai.ac.ae;mila.quebec;wias-berlin.de;mipt.ru;kaust.edu.sa", "author_num": 8, "aff_unique_index": "0;1;2;0;3;4;2;1", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;King Abdullah University of Science and Technology;Moscow Institute of Physics and Technology;Quebec Artificial Intelligence Institute;Weierstrass Institute for Applied Analysis and Stochastics", "aff_unique_dep": ";;;Artificial Intelligence;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.kast.kau.edu.sa;https://www.mipt.ru/en;https://mila.quebec;https://www.wias-berlin.de/", "aff_unique_abbr": "MBZUAI;KAUST;MIPT;Mila;WIAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;3;4;2;1", "aff_country_unique": "United Arab Emirates;Saudi Arabia;Russian Federation;Canada;Germany" }, { "title": "StyDeSty: Min-Max Stylization and Destylization for Single Domain Generalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34639", "id": "DBlkjCDg2i", "proceeding": "https://proceedings.mlr.press/v235/liu24ad.html", "pdf": "https://openreview.net/pdf?id=DBlkjCDg2i", "openreview": "https://openreview.net/forum?id=DBlkjCDg2i", "author_site": "Songhua Liu, Xin Jin, Xingyi Yang, Jingwen Ye, Xinchao Wang", "tldr": "", "abstract": "Single domain generalization (single DG) aims at learning a robust model generalizable to unseen domains from only one training domain, making it a highly ambitious and challenging task. State-of-the-art approaches have mostly relied on data augmentations, such as adversarial perturbation and style enhancement, to synthesize new data and thus increase robustness. Nevertheless, they have largely overlooked the underlying coherence between the augmented domains, which in turn leads to inferior results in real-world scenarios. In this paper, we propose a simple yet effective scheme, termed as *StyDeSty*, to explicitly account for the alignment of the source and pseudo domains in the process of data augmentation, enabling them to interact with each other in a self-consistent manner and further giving rise to a latent domain with strong generalization power. The heart of StyDeSty lies in the interaction between a *stylization* module for generating novel stylized samples using the source domain, and a *destylization* module for transferring stylized and source samples to a latent domain to learn content-invariant features. The stylization and destylization modules work adversarially and reinforce each other. During inference, the destylization module transforms the input sample with an arbitrary style shift to the latent domain, in which the downstream tasks are carried out. Specifically, the location of the destylization layer within the backbone network is determined by a dedicated neural architecture search (NAS) strategy. We evaluate StyDeSty on multiple benchmarks and demonstrate that it yields encouraging results, outperforming the state of the art by up to 13.44% on classification accuracy. 
Code is available at https://github.com/Huage001/StyDeSty.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songhua Liu;Xin Jin;Xingyi Yang;Jingwen Ye;Xinchao Wang", "authorids": "~Songhua_Liu2;~Xin_Jin8;~Xingyi_Yang1;~Jingwen_Ye1;~Xinchao_Wang1", "gender": "M;M;M;F;M", "homepage": "http://121.37.94.87;http://home.ustc.edu.cn/~jinxustc/;https://adamdad.github.io/;https://jngwenye.github.io/;https://sites.google.com/site/sitexinchaowang/", "dblp": "42/8978;68/3340-14;;200/7853;", "google_scholar": "AnYh2rAAAAAJ;byaSC-kAAAAJ;1n2OPtwAAAAJ;8GQnNP0AAAAJ;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": ";0000-0002-1820-8358;;;", "linkedin": ";;;;", "or_profile": "~Songhua_Liu2;~Xin_Jin8;~Xingyi_Yang1;~Jingwen_Ye1;~Xinchao_WANG3", "aff": "National University of Singapore;Eastern Institute of Technology, Ningbo;National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "u.nus.edu;eitech.edu.cn;nus.edu;nus.edu.sg;nus.edu", "position": "PhD student;Assistant Professor;PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nliu2024stydesty,\ntitle={StyDeSty: Min-Max Stylization and Destylization for Single Domain Generalization},\nauthor={Songhua Liu and Xin Jin and Xingyi Yang and Jingwen Ye and Xinchao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DBlkjCDg2i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2292634, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6478234324912901113&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "u.nus.edu;eitech.edu.cn;nus.edu;nus.edu.sg;nus.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "National University of Singapore;Eastern Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.eit.edu.cn", "aff_unique_abbr": "NUS;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ningbo", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Singapore;China" }, { "title": "TroVE: Inducing Verifiable and Efficient Toolboxes for Solving Programmatic Tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34638", "id": "DCNCwaMJjI", "proceeding": "https://proceedings.mlr.press/v235/wang24az.html", "pdf": "https://openreview.net/pdf?id=DCNCwaMJjI", "openreview": "https://openreview.net/forum?id=DCNCwaMJjI", "author_site": "Zhiruo Wang, Graham Neubig, Daniel Fried", "tldr": "", "abstract": "Language models (LMs) can solve tasks such as answering questions about tables or images by writing programs. However, using primitive functions often leads to verbose and error-prone programs, and higher-level functions require expert design. To enable better solutions without human labor, we ask code LMs to curate reusable high-level functions, and use them to write solutions. We present TROVE, a training-free method of inducing a verifiable and efficient toolbox of functions, by generating via using, growing, and periodically trimming the toolbox. On 11 datasets from math, table question answering, and image reasoning tasks, TROVE consistently yields simpler solutions with higher accuracy than baselines using CodeLLaMa and previous methods using GPT, while using 79-98% smaller toolboxes.
TROVE further enables 31% faster and 13% more accurate human verification than baselines. With the same pipeline, it creates diverse functions for varied tasks and datasets, providing insights into their individual characteristics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiruo Wang;Graham Neubig;Daniel Fried", "authorids": "~Zhiruo_Wang1;~Graham_Neubig1;~Daniel_Fried1", "gender": "F;M;M", "homepage": "https://zorazrw.github.io;http://phontron.com;https://dpfried.github.io/", "dblp": "249/2286;03/8155;117/4804", "google_scholar": "https://scholar.google.com/citations?hl=en;wlosgkoAAAAJ;sJDqACEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhiruo_Wang1;~Graham_Neubig1;~Daniel_Fried1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024trove,\ntitle={Tro{VE}: Inducing Verifiable and Efficient Toolboxes for Solving Programmatic Tasks},\nauthor={Zhiruo Wang and Graham Neubig and Daniel Fried},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DCNCwaMJjI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1182841, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11151820101010471223&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cmu.edu;cmu.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Imitation Learning in Discounted Linear MDPs without exploration assumptions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34637", "id": "DChQpB4AJy", "proceeding": "https://proceedings.mlr.press/v235/viano24a.html", "pdf": "https://openreview.net/pdf?id=DChQpB4AJy", "openreview": "https://openreview.net/forum?id=DChQpB4AJy", "author_site": "Luca Viano, EFSTRATIOS PANTELEIMON SKOULAKIS, Volkan Cevher", "tldr": "", "abstract": "We present a new algorithm for imitation learning in infinite horizon linear MDPs dubbed ILARL which greatly improves the bound on the number of trajectories that the learner needs to sample from the environment. In particular, we remove exploration assumptions required in previous works and we improve the dependence on the desired accuracy $\\epsilon$ from $\\mathcal{O}(\\epsilon^{-5})$ to $\\mathcal{O}(\\epsilon^{-4})$. Our result relies on a connection between imitation learning and online learning in MDPs with adversarial losses. For the latter setting, we present the first result for infinite horizon linear MDPs, which may be of independent interest. Moreover, we are able to provide a strengthened result for the finite horizon case, where we achieve $\\mathcal{O}(\\epsilon^{-2})$.
Numerical experiments with linear function approximation show that ILARL outperforms other commonly used algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Viano;Stratis Skoulakis;Volkan Cevher", "authorids": "~Luca_Viano1;~Stratis_Skoulakis2;~Volkan_Cevher1", "gender": ";M;M", "homepage": "https://scholar.google.com/citations?hl=en&user=e9Bpg5gAAAAJ;http://www.corelab.ntua.gr/~sskoul/;http://lions.epfl.ch", "dblp": "268/8179;183/0979.html;70/5301", "google_scholar": "E_dAUKEAAAAJ;Juo2Tk8AAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Luca_Viano1;~Stratis_Skoulakis2;~Volkan_Cevher1", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne;Amazon Development Center Germany", "aff_domain": "epfl.ch;epfl.ch;amazon.de", "position": "PhD student;Postdoc;Amazon Scholar", "bibtex": "@inproceedings{\nviano2024imitation,\ntitle={Imitation Learning in Discounted Linear {MDP}s without exploration assumptions},\nauthor={Luca Viano and Stratis Skoulakis and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DChQpB4AJy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1009241, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11641761520982395799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "epfl.ch;epfl.ch;amazon.de", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "EPFL;Amazon", "aff_unique_dep": ";Development Center", "aff_unique_url": "https://www.epfl.ch;https://www.amazon.de", "aff_unique_abbr": "EPFL;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "Constrained Exploration via Reflected Replica Exchange Stochastic Gradient Langevin Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34636", "id": "DCmahCZJYb", "proceeding": "https://proceedings.mlr.press/v235/zheng24d.html", "pdf": "https://openreview.net/pdf?id=DCmahCZJYb", "openreview": "https://openreview.net/forum?id=DCmahCZJYb", "author_site": "Haoyang Zheng, Hengrong Du, Qi Feng, Wei Deng, Guang Lin", "tldr": "", "abstract": "Replica exchange stochastic gradient Langevin dynamics (reSGLD) is an effective sampler for non-convex learning in large-scale datasets. However, the simulation may encounter stagnation issues when the high-temperature chain delves too deeply into the distribution tails. To tackle this issue, we propose reflected reSGLD (r2SGLD): an algorithm tailored for constrained non-convex exploration by utilizing reflection steps within a bounded domain. Theoretically, we observe that reducing the diameter of the domain enhances mixing rates, exhibiting a *quadratic* behavior. Empirically, we test its performance through extensive experiments, including identifying dynamical systems with physical constraints, simulations of constrained multi-modal distributions, and image classification tasks.
The theoretical and empirical findings highlight the crucial role of constrained exploration in improving the simulation efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyang Zheng;Hengrong Du;Qi Feng;Wei Deng;Guang Lin", "authorids": "~Haoyang_Zheng2;~Hengrong_Du1;~Qi_Feng3;~Wei_Deng1;~Guang_Lin1", "gender": "M;M;M;M;M", "homepage": "https://haoyangzheng.github.io/;https://hengrongdu.netlify.app/;https://sites.google.com/site/qifengmath/;https://waynedw.github.io/;http://www.math.purdue.edu/~lin491/", "dblp": "201/8708;366/8373;;69/508-2;", "google_scholar": "cq_f7MUAAAAJ;OtzsCPcAAAAJ;bNZM-X4AAAAJ;IYiyxssAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-6835-8242;0000-0003-2392-8963;;;0000-0002-0976-1987", "linkedin": "haoyangzheng/;;;;", "or_profile": "~Haoyang_Zheng2;~Hengrong_Du1;~Qi_Feng3;~Wei_Deng1;~Guang_Lin1", "aff": "Purdue University;Vanderbilt University;Florida State University;Morgan Stanley;Purdue University", "aff_domain": "purdue.edu;vanderbilt.edu;fsu.edu;morganstanley.com;purdue.edu", "position": "PhD student;Postdoc;Assistant Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\nzheng2024constrained,\ntitle={Constrained Exploration via Reflected Replica Exchange Stochastic Gradient Langevin Dynamics},\nauthor={Haoyang Zheng and Hengrong Du and Qi Feng and Wei Deng and Guang Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DCmahCZJYb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6433797, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6171739427315951595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "purdue.edu;vanderbilt.edu;fsu.edu;morganstanley.com;purdue.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Purdue University;Vanderbilt University;Florida State University;Morgan Stanley", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.purdue.edu;https://www.vanderbilt.edu;https://www.fsu.edu;https://www.morganstanley.com", "aff_unique_abbr": "Purdue;Vanderbilt;FSU;Morgan Stanley", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sharp Rates in Dependent Learning Theory: Avoiding Sample Size Deflation for the Square Loss", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34635", "id": "DHtF8Y6PqS", "proceeding": "https://proceedings.mlr.press/v235/ziemann24a.html", "pdf": "https://openreview.net/pdf?id=DHtF8Y6PqS", "openreview": "https://openreview.net/forum?id=DHtF8Y6PqS", "author_site": "Ingvar Ziemann, Stephen Tu, George J. Pappas, Nikolai Matni", "tldr": "", "abstract": "In this work, we study statistical learning with dependent data and square loss in a hypothesis class with tail decay in Orlicz space: $\\mathscr{F}\\subset L_{\\Psi_p}$. Our inquiry is motivated by the search for a sharp noise interaction term, or variance proxy, in learning with dependent (e.g. $\\beta$-mixing) data. Typical non-asymptotic results exhibit variance proxies that are deflated *multiplicatively* in the mixing time of the underlying covariates process. 
We show that whenever the topologies of $L^2$ and $\\Psi_p$ are comparable on our hypothesis class $\\mathscr{F}$, the empirical risk minimizer achieves a rate that only depends on the complexity of the class and second-order statistics in its leading term. We refer to this as a *near mixing-free rate*, since direct dependence on mixing is relegated to an additive higher order term. Our approach, reliant on mixed tail generic chaining, allows us to obtain sharp, instance-optimal rates. Examples that satisfy our framework include, for instance, sub-Gaussian linear regression and bounded smoothness classes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ingvar Ziemann;Stephen Tu;George J. Pappas;Nikolai Matni", "authorids": "~Ingvar_Ziemann1;~Stephen_Tu1;~George_J._Pappas1;~Nikolai_Matni2", "gender": "M;;M;M", "homepage": "https://www.kth.se/profile/ziemann;https://stephentu.github.io/;http://www.georgejpappas.org/;https://nikolaimatni.github.io", "dblp": "247/4222;09/8165;p/GeorgeJPappas;52/8135", "google_scholar": "https://scholar.google.se/citations?user=_RBAS2IAAAAJ;JQcDmB8AAAAJ;https://scholar.google.com.tw/citations?user=Kia-4B0AAAAJ;ZDPCh_EAAAAJ", "orcid": ";;0000-0001-9081-0637;", "linkedin": ";;;", "or_profile": "~Ingvar_Ziemann1;~Stephen_Tu1;~George_Pappas1;~Nikolai_Matni1", "aff": "University of Pennsylvania;University of Southern California;School of Engineering and Applied Science, University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania", "aff_domain": "upenn.edu;usc.edu;seas.upenn.edu;seas.upenn.edu", "position": "Postdoc;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nziemann2024sharp,\ntitle={Sharp Rates in Dependent Learning Theory: Avoiding Sample Size Deflation for the Square Loss},\nauthor={Ingvar Ziemann and Stephen Tu and George J. Pappas and Nikolai Matni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DHtF8Y6PqS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 530527, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4810750783846420709&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 8, "email": "upenn.edu;usc.edu;seas.upenn.edu;seas.upenn.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Pennsylvania;University of Southern California", "aff_unique_dep": ";", "aff_unique_url": "https://www.upenn.edu;https://www.usc.edu", "aff_unique_abbr": "UPenn;USC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deconstructing the Goldilocks Zone of Neural Network Initialization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34634", "id": "DJXt63RLO1", "proceeding": "https://proceedings.mlr.press/v235/vysogorets24a.html", "pdf": "https://openreview.net/pdf?id=DJXt63RLO1", "openreview": "https://openreview.net/forum?id=DJXt63RLO1", "author_site": "Artem Vysogorets, Anna Dawid, Julia Kempe", "tldr": "", "abstract": "The second-order properties of the training loss have a massive impact on the optimization dynamics of deep learning models.
Fort & Scherlis (2019) discovered that a large excess of positive curvature and local convexity of the loss Hessian is associated with highly trainable initial points located in a region coined the \"Goldilocks zone\". Only a handful of subsequent studies touched upon this relationship, so it remains largely unexplained. In this paper, we present a rigorous and comprehensive analysis of the Goldilocks zone for homogeneous neural networks. In particular, we derive the fundamental condition resulting in excess of positive curvature of the loss, explaining and refining its conventionally accepted connection to the initialization norm. Further, we relate the excess of positive curvature to model confidence, low initial loss, and a previously unknown type of vanishing cross-entropy loss gradient. To understand the importance of excessive positive curvature for trainability of deep networks, we optimize fully-connected and convolutional architectures outside the Goldilocks zone and analyze the emergent behaviors. We find that strong model performance is not perfectly aligned with the Goldilocks zone, calling for further research into this relationship.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Artem M Vysogorets;Anna Dawid;Julia Kempe", "authorids": "~Artem_M_Vysogorets1;~Anna_Dawid1;~Julia_Kempe1", "gender": "M;F;", "homepage": "https://artem.vysogorets.org;https://annadawid.com/;", "dblp": ";253/4676;", "google_scholar": ";_wfcFmoAAAAJ;", "orcid": ";0000-0001-9498-1732;", "linkedin": "avysogorets/;;", "or_profile": "~Artem_M_Vysogorets1;~Anna_Dawid1;~Julia_Kempe1", "aff": "New York University;Flatiron Institute;", "aff_domain": "nyu.edu;flatironinstitute.org;", "position": "PhD student;Postdoc;", "bibtex": "@inproceedings{\nvysogorets2024deconstructing,\ntitle={Deconstructing the Goldilocks Zone of Neural Network Initialization},\nauthor={Artem M Vysogorets and Anna Dawid and Julia Kempe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DJXt63RLO1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8546609, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:joVMsvMSDTYJ:scholar.google.com/&scioq=Deconstructing+the+Goldilocks+Zone+of+Neural+Network+Initialization&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "nyu.edu;flatironinstitute.org;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "New York University;Flatiron Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://flatironinstitute.org", "aff_unique_abbr": "NYU;Flatiron", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Deep Demonstration Tracing: Learning Generalizable Imitator Policy for Runtime Imitation from a Single Demonstration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34633", "id": "DJdVzxemdA", "proceeding": "https://proceedings.mlr.press/v235/chen24ax.html", "pdf": "https://openreview.net/pdf?id=DJdVzxemdA", "openreview": "https://openreview.net/forum?id=DJdVzxemdA", "author_site": "Xiong-Hui Chen, Junyin Ye, Hang Zhao, Yi-Chen Li, Xu-Hui Liu, Haoran Shi, Yu-Yan Xu, Zhihao Ye, Si-Hang Yang, Yang Yu, Anqi Huang, Kai Xu, Zongzhang Zhang", "tldr": "", "abstract": "One-shot imitation learning (OSIL) is to learn an imitator agent 
that can execute multiple tasks with only a single demonstration. In real-world scenarios, the environment is dynamic, e.g., unexpected changes can occur after the demonstration. Thus, achieving generalization of the imitator agent is crucial as agents would inevitably face situations unseen in the provided demonstrations. While traditional OSIL methods excel in relatively stationary settings, their adaptability to such unforeseen changes, which calls for a higher level of generalization ability from the imitator agents, is limited and rarely discussed. In this work, we present a new algorithm called Deep Demonstration Tracing (DDT). In DDT, we propose a demonstration transformer architecture to encourage agents to adaptively trace suitable states in demonstrations. In addition, it integrates OSIL into a meta-reinforcement-learning training paradigm, providing regularization for policies in unexpected situations. We evaluate DDT on a new navigation task suite and robotics tasks, demonstrating its superior performance over existing OSIL methods across all evaluated tasks in dynamic environments with unforeseen changes. The project page is at https://osil-ddt.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiong-Hui Chen;Junyin Ye;Hang Zhao;Yi-Chen Li;Xu-Hui Liu;Haoran Shi;Yu-Yan Xu;Zhihao Ye;Si-Hang Yang;Yang Yu;Anqi Huang;Kai Xu;Zongzhang Zhang", "authorids": "~Xiong-Hui_Chen1;~Junyin_Ye1;~Hang_Zhao3;~Yi-Chen_Li1;~Xu-Hui_Liu1;~Haoran_Shi2;~Yu-Yan_Xu1;~Zhihao_Ye3;~Si-Hang_Yang1;~Yang_Yu5;~Anqi_Huang1;~Kai_Xu5;~Zongzhang_Zhang1", "gender": "M;M;M;M;;M;;M;M;M;M;M;M", "homepage": "http://www.lamda.nju.edu.cn/chenxh/;http://www.lamda.nju.edu.cn/yejy/;;http://www.lamda.nju.edu.cn/liyc/;http://www.lamda.nju.edu.cn/liuxh/;;;https://github.com/Vigilans;;;http://www.lamda.nju.edu.cn/zhangzz;http://kevinkaixu.net/;http://www.lamda.nju.edu.cn/yuy", "dblp": "241/7938;;;143/7158-1;292/7577;;https://dblp.org/rec/conf/ijcai/XuSWY22;;;;90/8724;Xu_0004:Kai;46/2181-1", "google_scholar": "H5pguCYAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=OA3GmbQAAAAJ;;;;;;;sG7WEAgAAAAJ;https://scholar.google.com.hk/citations?user=GuVkg-8AAAAJ;PG2lDSwAAAAJ", "orcid": ";;;;;;;;;;;;", "linkedin": ";;;;;https://www.linkedin.cn/in/%E6%B5%A9%E7%84%B6-%E6%96%BD-455853261;%E7%8E%89%E7%87%95-%E8%AE%B8-205459276/;;si-hang-yang-aa0796235/;anqihuang777;;;", "or_profile": "~Xiong-Hui_Chen1;~Junyin_Ye1;~Hang_Zhao3;~Yi-Chen_Li1;~Xu-Hui_Liu1;~Haoran_Shi2;~Yu-Yan_Xu1;~Zhihao_Ye3;~Si-Hang_Yang1;~Anqi_Huang1;~Zongzhang_Zhang1;~Kevin_Xu1;~Yang_Yu2", "aff": "Nanjing University;Nanjing University;National University of Defense Technology;Nanjing University;Nanjing University;;;Nanjing University;;Nanjing University of Science and Technology;Nanjing University;National University of Defense Technology;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nudt.edu.cn;nju.edu.cn;nju.edu.cn;;;nju.edu.cn;;njust.edu.cn;nju.edu.cn;nudt.edu.cn;nju.edu.cn", "position": "PhD student;MS student;PhD student;PhD student;PhD student;;;MS student;;PhD student;Associate Professor;Professor;Professor", "bibtex": "@inproceedings{\nchen2024deep,\ntitle={Deep Demonstration Tracing: Learning Generalizable Imitator Policy for Runtime Imitation from a Single Demonstration},\nauthor={Xiong-Hui Chen and Junyin Ye and Hang Zhao and Yi-Chen Li and Xu-Hui Liu and Haoran Shi and Yu-Yan Xu and Zhihao Ye and Si-Hang Yang and Yang Yu and Anqi Huang and Kai Xu and Zongzhang
Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DJdVzxemdA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9640226259525269762&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "email": "nju.edu.cn;nju.edu.cn;nudt.edu.cn;nju.edu.cn;nju.edu.cn;;;nju.edu.cn;;njust.edu.cn;nju.edu.cn;nudt.edu.cn;nju.edu.cn", "author_num": 13, "aff_unique_index": "0;0;1;0;0;0;2;0;1;0", "aff_unique_norm": "Nanjing University;National University of Defense Technology;Nanjing University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nju.edu.cn;http://www.nudt.edu.cn/;http://www.nust.edu.cn/", "aff_unique_abbr": "Nanjing U;NUDT;NUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Evaluating Quantized Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34632", "id": "DKKg5EFAFr", "proceeding": "https://proceedings.mlr.press/v235/li24bb.html", "pdf": "https://openreview.net/pdf?id=DKKg5EFAFr", "openreview": "https://openreview.net/forum?id=DKKg5EFAFr", "author_site": "Shiyao Li, Xuefei Ning, Luning Wang, Tengxuan Liu, Xiangsheng Shi, Shengen Yan, Guohao Dai, Huazhong Yang, Yu Wang", "tldr": "", "abstract": "Post-training quantization (PTQ) has emerged as a promising technique to reduce the cost of large language models (LLMs). Specifically, PTQ can effectively mitigate memory consumption and reduce computational overhead in LLMs. To meet the requirements of both high efficiency and performance across diverse scenarios, a comprehensive evaluation of quantized LLMs is essential to guide the selection of quantization methods. This paper presents a thorough evaluation of these factors by evaluating the effect of PTQ on Weight, Activation, and KV Cache on 11 model families, including OPT, LLaMA2, Falcon, Bloomz, Mistral, ChatGLM, Vicuna, LongChat, StableLM, Gemma, and Mamba, with parameters ranging from 125M to 180B. The evaluation encompasses five types of tasks: basic NLP, emergent ability, trustworthiness, dialogue, and long-context tasks. Moreover, we also evaluate the state-of-the-art (SOTA) quantization methods to demonstrate their applicability. Based on the extensive experiments, we systematically summarize the effect of quantization, provide recommendations to apply quantization techniques, and point out future directions. 
The code can be found at https://github.com/thu-nics/qllm-eval.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiyao Li;Xuefei Ning;Luning Wang;Tengxuan Liu;Xiangsheng Shi;Shengen Yan;Guohao Dai;Huazhong Yang;Yu Wang", "authorids": "~Shiyao_Li2;~Xuefei_Ning1;~Luning_Wang2;~Tengxuan_Liu1;~Xiangsheng_Shi1;~Shengen_Yan1;~Guohao_Dai4;~Huazhong_Yang2;~Yu_Wang3", "gender": "M;Not Specified;M;M;M;M;M;M;M", "homepage": "http://nicsefc.ee.tsinghua.edu.cn/people/ShiyaoLi;https://nics-effalg.com/ningxuefei/;https://wln20.github.io;https://nicsefc.ee.tsinghua.edu.cn/people/TengxuanLiu;https://nicsefc.ee.tsinghua.edu.cn/people/xiangshengshi;;https://nicsefc.ee.tsinghua.edu.cn/people/guohao-dai/;http://web.ee.tsinghua.edu.cn/yanghuazhong/en/index.htm;https://nicsefc.ee.tsinghua.edu.cn", "dblp": ";202/9525;;371/4537;;117/6968;147/1470;94/1128.html;w/YuWang2.html", "google_scholar": ";oVslpJsAAAAJ;kvTOYG8AAAAJ;;;SvE3bdUAAAAJ;gz3Tkl0AAAAJ;;https://scholar.google.com.hk/citations?user=j8JGVvoAAAAJ", "orcid": ";;;;;;;0000-0003-2421-353X;0000-0001-6108-5157", "linkedin": ";;wangluning/;;;;;;", "or_profile": "~Shiyao_Li2;~Xuefei_Ning1;~Luning_Wang2;~Tengxuan_Liu1;~Xiangsheng_Shi1;~Shengen_Yan1;~Guohao_Dai4;~Huazhong_Yang2;~Yu_Wang3", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;;Tsinghua University;Shanghai Jiaotong University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;;tsinghua.edu.cn;sjtu.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Research Assistant Professor;Undergrad student;Undergrad student;;Associate Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024evaluating,\ntitle={Evaluating Quantized Large Language Models},\nauthor={Shiyao Li and Xuefei Ning and Luning Wang and Tengxuan Liu and Xiangsheng Shi and Shengen Yan and Guohao Dai and Huazhong Yang and Yu Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DKKg5EFAFr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2018083, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12534909123866896212&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;;tsinghua.edu.cn;sjtu.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;0;0;1;0;0", "aff_unique_norm": "Tsinghua University;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "THU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Posterior Sampling-Based Bayesian Optimization with Tighter Bayesian Regret Bounds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34631", "id": "DKOHE4n8jk", "proceeding": "https://proceedings.mlr.press/v235/takeno24a.html", "pdf": "https://openreview.net/pdf?id=DKOHE4n8jk", "openreview": "https://openreview.net/forum?id=DKOHE4n8jk", "author_site": "Shion Takeno, Yu Inatsu, Masayuki Karasuyama, Ichiro Takeuchi", "tldr": "", "abstract": "Among various acquisition functions (AFs) in Bayesian optimization (BO), Gaussian process
upper confidence bound (GP-UCB) and Thompson sampling (TS) are well-known options with established theoretical properties regarding Bayesian cumulative regret (BCR). Recently, it has been shown that a randomized variant of GP-UCB achieves a tighter BCR bound compared with GP-UCB, which we call the tighter BCR bound for brevity. Inspired by this study, this paper first shows that TS achieves the tighter BCR bound. On the other hand, GP-UCB and TS often suffer in practice from manual hyperparameter tuning and over-exploration issues, respectively. Therefore, we analyze yet another AF called the probability of improvement from the maximum of a sample path (PIMS). We show that PIMS achieves the tighter BCR bound and avoids hyperparameter tuning, unlike GP-UCB. Furthermore, through a wide range of experiments, we demonstrate the effectiveness of PIMS in mitigating the practical issues of GP-UCB and TS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shion Takeno;Yu Inatsu;Masayuki Karasuyama;Ichiro Takeuchi", "authorids": "~Shion_Takeno1;~Yu_Inatsu1;~Masayuki_Karasuyama1;~Ichiro_Takeuchi1", "gender": "M;;M;M", "homepage": "https://takeno1995.github.io/myhomepage/;;https://www-als.ics.nitech.ac.jp/~karasuyama/index_E.html;https://www.mlds.mae.nagoya-u.ac.jp/takeuchi/index.en.html", "dblp": ";;23/5508.html;36/4181", "google_scholar": "https://scholar.google.co.jp/citations?user=oGaC1SgAAAAJ;VkCw9O4AAAAJ;https://scholar.google.de/citations?user=6pgd_q8AAAAJ;IwBHa3gAAAAJ", "orcid": ";;0000-0002-6177-3686;0009-0005-1905-2366", "linkedin": ";;;", "or_profile": "~Shion_Takeno1;~Yu_Inatsu1;~Masayuki_Karasuyama1;~Ichiro_Takeuchi1", "aff": "RIKEN;Nagoya Institute of Technology;Nagoya Institute of Technology;RIKEN", "aff_domain": "riken.jp;nitech.ac.jp;nitech.ac.jp;riken.jp", "position": "Postdoc;Assistant Professor;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\ntakeno2024posterior,\ntitle={Posterior Sampling-Based Bayesian Optimization with Tighter Bayesian Regret Bounds},\nauthor={Shion Takeno and Yu Inatsu and Masayuki Karasuyama and Ichiro Takeuchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DKOHE4n8jk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 731498, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5923208917718329050&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "riken.jp;nitech.ac.jp;nitech.ac.jp;riken.jp", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "RIKEN;Nagoya Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.riken.jp;https://www.nitech.ac.jp", "aff_unique_abbr": "RIKEN;NIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "All-in-one simulation-based inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34630", "id": "DL79HYCFFq", "proceeding": "https://proceedings.mlr.press/v235/gloeckler24a.html", "pdf": "https://openreview.net/pdf?id=DL79HYCFFq", "openreview": "https://openreview.net/forum?id=DL79HYCFFq", "author_site": "Manuel Gloeckler, Michael Deistler, Christian Weilbach, Frank Wood, Jakob Macke", "tldr": "", "abstract": "Amortized Bayesian inference trains neural networks to solve stochastic inference problems using model
simulations, thereby making it possible to rapidly perform Bayesian inference for any newly observed data. However, current simulation-based amortized inference methods are simulation-hungry and inflexible: They require the specification of a fixed parametric prior, simulator, and inference tasks ahead of time. Here, we present a new amortized inference method---the Simformer---which overcomes these limitations. By training a probabilistic diffusion model with transformer architectures, the Simformer outperforms current state-of-the-art amortized inference approaches on benchmark tasks and is substantially more flexible: It can be applied to models with function-valued parameters, it can handle inference scenarios with missing or unstructured data, and it can sample arbitrary conditionals of the joint distribution of parameters and data, including both posterior and likelihood. We showcase the performance and flexibility of the Simformer on simulators from ecology, epidemiology, and neuroscience, and demonstrate that it opens up new possibilities and application domains for amortized Bayesian inference on simulation-based models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manuel Gloeckler;Michael Deistler;Christian Dietrich Weilbach;Frank Wood;Jakob H. Macke", "authorids": "~Manuel_Gloeckler1;~Michael_Deistler1;~Christian_Dietrich_Weilbach1;~Frank_Wood2;~Jakob_H._Macke1", "gender": "M;M;M;M;M", "homepage": "https://michaeldeistler.github.io/;https://whilo.github.io/;http://www.robots.ox.ac.uk/~fwood/;http://www.mackelab.org;https://manuelgloeckler.github.io/", "dblp": "243/5747;;44/4750;97/11106;", "google_scholar": "Q24H-zYAAAAJ;;d4yNzXIAAAAJ;FKOqtF8AAAAJ;0Vdv0H0AAAAJ", "orcid": "0000-0002-3573-0404;;;0000-0001-5154-8912;", "linkedin": ";;frank-wood-43529114?trk=hp-identity-name;;", "or_profile": "~Michael_Deistler1;~Christian_Dietrich_Weilbach1;~Frank_Wood2;~Jakob_H_Macke1;~Manuel_Gl\u00f6ckler1", "aff": "University of Tuebingen;;University of British Columbia;University of Tuebingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;;cs.ubc.ca;uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;;Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\ngloeckler2024allinone,\ntitle={All-in-one simulation-based inference},\nauthor={Manuel Gloeckler and Michael Deistler and Christian Dietrich Weilbach and Frank Wood and Jakob H. 
Macke},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DL79HYCFFq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4812545, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6918443150215659579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "uni-tuebingen.de;;cs.ubc.ca;uni-tuebingen.de;uni-tuebingen.de", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Tuebingen;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.ubc.ca", "aff_unique_abbr": "Uni T\u00fcbingen;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Germany;Canada" }, { "title": "Unveiling and Harnessing Hidden Attention Sinks: Enhancing Large Language Models without Training through Attention Calibration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34629", "id": "DLTjFFiuUJ", "proceeding": "https://proceedings.mlr.press/v235/yu24l.html", "pdf": "https://openreview.net/pdf?id=DLTjFFiuUJ", "openreview": "https://openreview.net/forum?id=DLTjFFiuUJ", "author_site": "Zhongzhi Yu, Zheng Wang, Yonggan Fu, Shi Huihong, Khalid Shaikh, Yingyan (Celine) Lin", "tldr": "", "abstract": "Attention is a fundamental component behind the remarkable achievements of large language models (LLMs). However, our current understanding of the attention mechanism, especially regarding how attention distributions are established, remains limited. Inspired by recent studies that explore the presence of attention sink in the initial token, which receives disproportionately large attention scores despite their lack of semantic importance, this work delves deeper into this phenomenon. We aim to provide a more profound understanding of the existence of attention sinks within LLMs and to uncover ways to enhance the achievable accuracy of LLMs by directly optimizing the attention distributions, without the need for weight finetuning. Specifically, this work begins with comprehensive visualizations of the attention distributions in LLMs during inference across various inputs and tasks. Based on these visualizations, to the best of our knowledge, we are the first to discover that (1) attention sinks occur not only at the start of sequences but also within later tokens of the input, and (2) not all attention sinks have a positive impact on the achievable accuracy of LLMs. Building upon our findings, we propose a training-free Attention Calibration Technique (ACT) that automatically optimizes the attention distributions on the fly during inference in an input-adaptive manner. Extensive experiments validate that ACT consistently enhances the accuracy of various LLMs across different applications. 
Specifically, ACT achieves an average improvement of up to $7.30\\%$ in accuracy across different datasets when applied to Llama-30B.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhongzhi Yu;Zheng Wang;Yonggan Fu;Huihong Shi;Khalid Shaikh;Yingyan Celine Lin", "authorids": "~Zhongzhi_Yu1;~Zheng_Wang38;~Yonggan_Fu1;~Huihong_Shi1;kshaikh6@gatech.edu;~Yingyan_Celine_Lin1", "gender": "M;M;M;F;;", "homepage": ";https://zkbig.github.io/;https://www.yongganfu.com/;https://shihuihong214.github.io/huihong.shi/;;", "dblp": "198/8338;;244/8166;253/3178;;", "google_scholar": "KjvcaBQAAAAJ;MIBiy2gAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;0000-0002-7845-0154;;", "linkedin": "zhongzhi-yu/;;yonggan-fu-b211831b0;;;", "or_profile": "~Zhongzhi_Yu1;~Zheng_Wang38;~Yonggan_Fu1;~Huihong_Shi1;kshaikh6@gatech.edu;~Yingyan_Celine_Lin1", "aff": "Nvidia Research;Georgia Institute of Technology;Georgia Institute of Technology;Nanjing University;;", "aff_domain": "nvidia.com;gatech.edu;gatech.edu;nju.edu.cn;;", "position": "Research Intern;MS student;PhD student;PhD student;;", "bibtex": "@inproceedings{\nyu2024unveiling,\ntitle={Unveiling and Harnessing Hidden Attention Sinks: Enhancing Large Language Models without Training through Attention Calibration},\nauthor={Zhongzhi Yu and Zheng Wang and Yonggan Fu and Huihong Shi and Khalid Shaikh and Yingyan Celine Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DLTjFFiuUJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4744626, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2471563748126015039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nvidia.com;gatech.edu;gatech.edu;nju.edu.cn;;", "author_num": 6, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "NVIDIA;Georgia Institute of Technology;Nanjing University", "aff_unique_dep": "NVIDIA Research;;", "aff_unique_url": "https://www.nvidia.com/research;https://www.gatech.edu;https://www.nju.edu.cn", "aff_unique_abbr": "NVIDIA;Georgia Tech;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Optimal Batched Linear Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34628", "id": "DM0r4qatjT", "proceeding": "https://proceedings.mlr.press/v235/ren24a.html", "pdf": "https://openreview.net/pdf?id=DM0r4qatjT", "openreview": "https://openreview.net/forum?id=DM0r4qatjT", "author_site": "Xuanfei Ren, Tianyuan Jin, Pan Xu", "tldr": "", "abstract": "We introduce the E$^4$ algorithm for the batched linear bandit problem, incorporating an Explore-Estimate-Eliminate-Exploit framework. With a proper choice of exploration rate, we prove E$^4$ achieves the finite-time minimax optimal regret with only $O(\\log\\log T)$ batches, and the asymptotically optimal regret with only $3$ batches as $T\\rightarrow\\infty$, where $T$ is the time horizon. We further prove a lower bound on the batch complexity of linear contextual bandits, showing that any asymptotically optimal algorithm must require at least $3$ batches in expectation as $T\\rightarrow \\infty$, which indicates E$^4$ achieves the asymptotic optimality in regret and batch complexity simultaneously.
To the best of our knowledge, E$^4$ is the first algorithm for linear bandits that simultaneously achieves the minimax and asymptotic optimality in regret with the corresponding optimal batch complexities. In addition, we show that with another choice of exploration rate, E$^4$ achieves an instance-dependent regret bound requiring at most $O(\\log T)$ batches, and maintains the minimax optimality and asymptotic optimality. We conduct thorough experiments to evaluate our algorithm on randomly generated instances and the challenging *End of Optimism* instances (Lattimore & Szepesvari, 2017), which were shown to be hard to learn for optimism-based algorithms. Empirical results show that E$^4$ consistently outperforms baseline algorithms with respect to regret minimization, batch complexity, and computational efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuanfei Ren;Tianyuan Jin;Pan Xu", "authorids": "~Xuanfei_Ren1;~Tianyuan_Jin1;~Pan_Xu1", "gender": "M;M;M", "homepage": "http://home.ustc.edu.cn/~matchbox/;https://tianyuanjin.github.io/;https://panxulab.github.io/", "dblp": "379/6120;208/2335;11/9718-2", "google_scholar": "av184VoAAAAJ;3e5kmjsAAAAJ;UkYBx6YAAAAJ", "orcid": ";;0000-0002-2559-8622", "linkedin": "xuanfei-ren-803b22273/;;pan-xu-0931a2a6/", "or_profile": "~Xuanfei_Ren1;~Tianyuan_Jin1;~Pan_Xu1", "aff": "University of Science and Technology of China;National University of Singapore;Duke University", "aff_domain": "mail.ustc.edu.cn;nus.edu.sg;duke.edu", "position": "Undergrad student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nren2024optimal,\ntitle={Optimal Batched Linear Bandits},\nauthor={Xuanfei Ren and Tianyuan Jin and Pan Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DM0r4qatjT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2681600, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17766388771619453267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mail.ustc.edu.cn;nus.edu.sg;duke.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Science and Technology of China;National University of Singapore;Duke University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nus.edu.sg;https://www.duke.edu", "aff_unique_abbr": "USTC;NUS;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;Singapore;United States" }, { "title": "Enhancing Sufficient Dimension Reduction via Hellinger Correlation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34627", "id": "DN7uk4gQ7C", "proceeding": "https://proceedings.mlr.press/v235/hong24b.html", "pdf": "https://openreview.net/pdf?id=DN7uk4gQ7C", "openreview": "https://openreview.net/forum?id=DN7uk4gQ7C", "author_site": "Seungbeom Hong, Ilmun Kim, Jun Song", "tldr": "", "abstract": "In this work, we develop a new theory and method for sufficient dimension reduction (SDR) in single-index models, where SDR is a sub-field of supervised dimension reduction based on conditional independence. Our work is primarily motivated by the recent introduction of the Hellinger correlation as a dependency measure.
Utilizing this measure, we have developed a method capable of effectively detecting the dimension reduction subspace, complete with theoretical justification. Through extensive numerical experiments, we demonstrate that our proposed method significantly enhances existing SDR methods and outperforms them. This improvement is largely attributed to our method's deeper modeling of data dependencies and its refinement of existing SDR techniques.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "SeungBeom Hong;Ilmun Kim;Jun Song", "authorids": "~SeungBeom_Hong1;~Ilmun_Kim1;~Jun_Song4", "gender": ";;M", "homepage": ";;http://www.jsong.net", "dblp": ";;", "google_scholar": ";;", "orcid": ";;0000-0002-0382-6735", "linkedin": "%EC%8A%B9%EB%B2%94-%ED%99%8D-229b55269/;;", "or_profile": "~SeungBeom_Hong1;~Ilmun_Kim1;~Jun_Song4", "aff": "Korea University;;Korea University", "aff_domain": "korea.ac.kr;;korea.ac.kr", "position": "MS student;;Associate Professor", "bibtex": "@inproceedings{\nhong2024enhancing,\ntitle={Enhancing Sufficient Dimension Reduction via Hellinger Correlation},\nauthor={SeungBeom Hong and Ilmun Kim and Jun Song},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DN7uk4gQ7C}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 396216, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16651872479979536552&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "korea.ac.kr;;korea.ac.kr", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Korea University", "aff_unique_dep": "", "aff_unique_url": "https://www.korea.ac.kr", "aff_unique_abbr": "KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Characterizing Overfitting in Kernel Ridgeless Regression Through the Eigenspectrum", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34626", "id": "DRBgNQ2N7U", "proceeding": "https://proceedings.mlr.press/v235/cheng24g.html", "pdf": "https://openreview.net/pdf?id=DRBgNQ2N7U", "openreview": "https://openreview.net/forum?id=DRBgNQ2N7U", "author_site": "Tin Sum Cheng, Aurelien Lucchi, Anastasis Kratsios, David Belius", "tldr": "", "abstract": "We derive new bounds for the condition number of kernel matrices, which we then use to enhance existing non-asymptotic test error bounds for kernel ridgeless regression in the over-parameterized regime for a fixed input dimension. For kernels with polynomial spectral decay, we recover the bound from previous work; for exponential decay, our bound is non-trivial and novel. Our conclusion is two-fold: (i) kernel regressors whose eigenspectrum decays polynomially must generalize well, even in the presence of noisy labeled training data; these models exhibit so-called tempered overfitting; (ii) if the eigenspectrum of any kernel ridge regressor decays exponentially, then it generalizes poorly, i.e., it exhibits catastrophic overfitting. This adds to the available characterization of kernel ridge regressors exhibiting benign overfitting as the extremal case where the eigenspectrum of the kernel decays sub-polynomially.
Our analysis combines new random matrix theory (RMT) techniques with recent tools in the kernel ridge regression (KRR) literature.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tin Sum Cheng;Aurelien Lucchi;Anastasis Kratsios;David Belius", "authorids": "~Tin_Sum_Cheng1;~Aurelien_Lucchi1;~Anastasis_Kratsios1;~David_Belius1", "gender": "M;M;Non-Binary;", "homepage": ";http://people.inf.ethz.ch/alucchi/;https://anastasiskratsios.github.io/;https://davidbelius.github.io/", "dblp": ";14/5780;;", "google_scholar": "5wfAh9kAAAAJ;https://scholar.google.ch/citations?user=V1ONSgIAAAAJ;https://scholar.google.ca/citations?user=9D-bHFgAAAAJ;", "orcid": "0000-0002-3000-311X;;0000-0001-6791-3371;0000-0003-3706-043X", "linkedin": "tin-sum-cheng;;anastasiskratsios/;", "or_profile": "~Tin_Sum_Cheng1;~Aurelien_Lucchi1;~Anastasis_Kratsios1;~David_Belius1", "aff": "University of Basel;University of Basel;McMaster University;UniDistance Suisse", "aff_domain": "unibas.ch;unibas.ch;mcmaster.ca;unidistance.ch", "position": "PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ncheng2024characterizing,\ntitle={Characterizing Overfitting in Kernel Ridgeless Regression Through the Eigenspectrum},\nauthor={Tin Sum Cheng and Aurelien Lucchi and Anastasis Kratsios and David Belius},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DRBgNQ2N7U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 735769, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7485339763974550150&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "unibas.ch;unibas.ch;mcmaster.ca;unidistance.ch", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Basel;McMaster University;UniDistance Suisse", "aff_unique_dep": ";;", "aff_unique_url": "https://www.unibas.ch;https://www.mcmaster.ca;https://www.unidistance.ch", "aff_unique_abbr": "UniBas;McMaster;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Switzerland;Canada" }, { "title": "Sparsest Models Elude Pruning: An Expos\u00e9 of Pruning\u2019s Current Capabilities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34625", "id": "DRGgT7SyC7", "proceeding": "https://proceedings.mlr.press/v235/zhang24av.html", "pdf": "https://openreview.net/pdf?id=DRGgT7SyC7", "openreview": "https://openreview.net/forum?id=DRGgT7SyC7", "author_site": "Stephen Zhang, Vardan Papyan", "tldr": "", "abstract": "Pruning has emerged as a promising approach for compressing large-scale models, yet its effectiveness in recovering the sparsest of models has not yet been explored. We conducted an extensive series of 485,838 experiments, applying a range of state-of-the-art pruning algorithms to a synthetic dataset we created, named the Cubist Spiral. Our findings reveal a significant gap in performance compared to ideal sparse networks, which we identified through a novel combinatorial search algorithm. We attribute this performance gap to current pruning algorithms' poor behaviour under overparameterization, their tendency to induce disconnected paths throughout the network, and their propensity to get stuck at suboptimal solutions, even when given the optimal width and initialization. 
This gap is concerning, given the simplicity of the network architectures and datasets used in our study. We hope that our research encourages further investigation into new pruning techniques that strive for true network sparsity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stephen Zhang;Vardan Papyan", "authorids": "~Stephen_Zhang4;~Vardan_Papyan1", "gender": "M;M", "homepage": "https://www.mathematics.utoronto.ca/people/directories/graduate-students/stephen-zhang;https://sites.google.com/view/vardan-papyan", "dblp": ";173/9783", "google_scholar": ";https://scholar.google.co.il/citations?user=VrE-Gd4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Stephen_Zhang4;~Vardan_Papyan1", "aff": "University of Toronto;University of Toronto", "aff_domain": "utoronto.ca;toronto.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024sparsest,\ntitle={Sparsest Models Elude Pruning: An Expos\\'e of Pruning{\\textquoteright}s Current Capabilities},\nauthor={Stephen Zhang and Vardan Papyan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DRGgT7SyC7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9587871, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lR-fdOnp1RcJ:scholar.google.com/&scioq=Sparsest+Models+Elude+Pruning:+An+Expos%C3%A9+of+Pruning%E2%80%99s+Current+Capabilities&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "utoronto.ca;toronto.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Localizing Task Information for Improved Model Merging and Compression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34624", "id": "DWT9uiGjxT", "proceeding": "https://proceedings.mlr.press/v235/wang24k.html", "pdf": "https://openreview.net/pdf?id=DWT9uiGjxT", "openreview": "https://openreview.net/forum?id=DWT9uiGjxT", "author_site": "Ke Wang, Nikolaos Dimitriadis, Guillermo Ortiz-Jimenez, Fran\u00e7ois Fleuret, Pascal Frossard", "tldr": "", "abstract": "Model merging and task arithmetic have emerged as promising scalable approaches to merge multiple single-task checkpoints to one multi-task model, but their applicability is reduced by significant performance loss. Previous works have linked these drops to interference in the weight space and erasure of important task-specific features. Instead, in this work we show that the information required to solve each task is still preserved after merging as different tasks mostly use non-overlapping sets of weights. We propose TALL-masks, a method to identify these task supports given a collection of task vectors and show that one can retrieve >99% of the single task accuracy by applying our masks to the multi-task vector, effectively compressing the individual checkpoints. We study the statistics of intersections among constructed masks and reveal the existence of selfish and catastrophic weights, i.e., parameters that are important exclusively to one task and irrelevant to all tasks but detrimental to multi-task fusion. 
For this reason, we propose Consensus Merging, an algorithm that eliminates such weights and improves the general performance of existing model merging approaches. Our experiments on vision and NLP benchmarks with up to 20 tasks show that Consensus Merging consistently improves existing approaches. Furthermore, our proposed compression scheme reduces storage from 57GB to 8.2GB while retaining 99.7% of original performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ke Wang;Nikolaos Dimitriadis;Guillermo Ortiz-Jimenez;Fran\u00e7ois Fleuret;Pascal Frossard", "authorids": "~Ke_Wang19;~Nikolaos_Dimitriadis1;~Guillermo_Ortiz-Jimenez1;~Fran\u00e7ois_Fleuret2;~Pascal_Frossard1", "gender": "M;;;;", "homepage": "https://wang-kee.github.io/;https://nik-dim.github.io;http://gortizji.github.io;;", "dblp": ";278/8332;222/2737;;", "google_scholar": "wKBORzsAAAAJ;ZG2WrKwAAAAJ;xAsJnG0AAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ke_Wang19;~Nikolaos_Dimitriadis1;~Guillermo_Ortiz-Jimenez1;~Fran\u00e7ois_Fleuret2;~Pascal_Frossard1", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne;Google DeepMind;;", "aff_domain": "epfl.ch;epfl.ch;google.com;;", "position": "PhD student;PhD student;Research Scientist;;", "bibtex": "@inproceedings{\nwang2024localizing,\ntitle={Localizing Task Information for Improved Model Merging and Compression},\nauthor={Ke Wang and Nikolaos Dimitriadis and Guillermo Ortiz-Jimenez and Fran{\\c{c}}ois Fleuret and Pascal Frossard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DWT9uiGjxT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1093796, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16416452296936077605&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "epfl.ch;epfl.ch;google.com;;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "EPFL;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.epfl.ch;https://deepmind.com", "aff_unique_abbr": "EPFL;DeepMind", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM Agents Exponentially Fast", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34623", "id": "DYMj03Gbri", "proceeding": "https://proceedings.mlr.press/v235/gu24e.html", "pdf": "https://openreview.net/pdf?id=DYMj03Gbri", "openreview": "https://openreview.net/forum?id=DYMj03Gbri", "author_site": "Xiangming Gu, Xiaosen Zheng, Tianyu Pang, Chao Du, Qian Liu, Ye Wang, Jing Jiang, Min Lin", "tldr": "", "abstract": "A multimodal large language model (MLLM) agent can receive instructions, capture images, retrieve histories from memory, and decide which tools to use. Nonetheless, red-teaming efforts have revealed that adversarial images/prompts can jailbreak an MLLM and cause unaligned behaviors. In this work, we report an even more severe safety issue in multi-agent environments, referred to as infectious jailbreak. It entails the adversary simply jailbreaking a single agent, and without any further intervention from the adversary, (almost) all agents will become infected exponentially fast and exhibit harmful behaviors.
To validate the feasibility of infectious jailbreak, we simulate multi-agent environments containing up to one million LLaVA-1.5 agents, and employ randomized pair-wise chat as a proof-of-concept instantiation for multi-agent interaction. Our results show that feeding an (infectious) adversarial image into the memory of any randomly chosen agent is sufficient to achieve infectious jailbreak. Finally, we derive a simple principle for determining whether a defense mechanism can provably restrain the spread of infectious jailbreak, but how to design a practical defense that meets this principle remains an open question to investigate.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiangming Gu;Xiaosen Zheng;Tianyu Pang;Chao Du;Qian Liu;Ye Wang;Jing Jiang;Min Lin", "authorids": "~Xiangming_Gu1;~Xiaosen_Zheng1;~Tianyu_Pang1;~Chao_Du1;~Qian_Liu2;~Ye_Wang3;~Jing_Jiang1;~Min_Lin1", "gender": "M;M;M;M;M;M;F;M", "homepage": "https://guxm2021.github.io;https://xszheng2020.github.io;https://p2333.github.io/;https://duchao0726.github.io/;http://siviltaram.github.io/;https://smcnus.comp.nus.edu.sg/;http://www.mysmu.edu/faculty/jingjiang/;https://linmin.me", "dblp": "276/5844;219/6063;202/2550;75/7523;;44/6292-7;68/1974-1;", "google_scholar": "BkxEuIoAAAAJ;https://scholar.google.com/citations?hl=en;wYDbtFsAAAAJ;QOp7xW0AAAAJ;bcbeUo0AAAAJ;https://scholar.google.com.sg/citations?user=CdgLLL8AAAAJ;https://scholar.google.com.sg/citations?user=hVTK2YwAAAAJ;BGONmkIAAAAJ", "orcid": ";;0000-0003-0639-6176;0000-0003-1244-6336;;0000-0002-0123-1260;0000-0002-3035-0074;", "linkedin": "xiangming-gu/;;%E5%A4%A9%E5%AE%87-%E5%BA%9E-b3999017a/;duchao/;;;;min-lin-08a3a422/", "or_profile": "~Xiangming_Gu1;~Xiaosen_Zheng1;~Tianyu_Pang1;~Chao_Du1;~Qian_Liu2;~Ye_Wang3;~Jing_Jiang1;~Min_Lin1", "aff": "National University of Singapore;Sea AI Lab;Sea AI Lab;Sea AI Lab;Tiktok;National University of Singapore;Singapore Management University;Sea AI Lab", "aff_domain": "nus.edu.sg;sea.com;sea.com;sea.com;bytedance.com;nus.edu.sg;smu.edu.sg;sea.com", "position": "PhD student;Research Intern;Senior Research Scientist;Senior Research Scientist;Researcher;Associate Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\ngu2024agent,\ntitle={Agent Smith: A Single Image Can Jailbreak One Million Multimodal {LLM} Agents Exponentially Fast},\nauthor={Xiangming Gu and Xiaosen Zheng and Tianyu Pang and Chao Du and Qian Liu and Ye Wang and Jing Jiang and Min Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DYMj03Gbri}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4365010, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15631248442242117018&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 10, "email": "nus.edu.sg;sea.com;sea.com;sea.com;bytedance.com;nus.edu.sg;smu.edu.sg;sea.com", "author_num": 8, "aff_unique_index": "0;1;1;1;2;0;3;1", "aff_unique_norm": "National University of Singapore;Sea AI Lab;TikTok;Singapore Management University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nus.edu.sg;;https://www.tiktok.com;https://www.smu.edu.sg", "aff_unique_abbr": "NUS;;TikTok;SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0;0", "aff_country_unique": "Singapore;;China" }, { "title": "Graph Distillation with Eigenbasis Matching", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34622", "id": "DYN66IJCI9", "proceeding": "https://proceedings.mlr.press/v235/liu24d.html", "pdf": "https://openreview.net/pdf?id=DYN66IJCI9", "openreview": "https://openreview.net/forum?id=DYN66IJCI9", "author_site": "Yang Liu, Deyu Bo, Chuan Shi", "tldr": "", "abstract": "The increasing amount of graph data places requirements on the efficient training of graph neural networks (GNNs). The emerging graph distillation (GD) tackles this challenge by distilling a small synthetic graph to replace the real large graph, ensuring GNNs trained on real and synthetic graphs exhibit comparable performance. However, existing methods rely on GNN-related information as supervision, including gradients, representations, and trajectories, which have two limitations. First, GNNs can affect the spectrum (*i.e*., eigenvalues) of the real graph, causing *spectrum bias* in the synthetic graph. Second, the variety of GNN architectures leads to the creation of different synthetic graphs, requiring *traversal* to obtain optimal performance. To tackle these issues, we propose Graph Distillation with Eigenbasis Matching (GDEM), which aligns the eigenbasis and node features of real and synthetic graphs. Meanwhile, it directly replicates the spectrum of the real graph and thus prevents the influence of GNNs. Moreover, we design a discrimination constraint to balance the effectiveness and generalization of GDEM. Theoretically, the synthetic graphs distilled by GDEM are restricted spectral approximations of the real graphs. Extensive experiments demonstrate that GDEM outperforms state-of-the-art GD methods with powerful cross-architecture generalization ability and significant distillation efficiency. Our code is available at https://github.com/liuyang-tian/GDEM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Liu;Deyu Bo;Chuan Shi", "authorids": "~Yang_Liu105;~Deyu_Bo1;~Chuan_Shi1", "gender": "F;M;M", "homepage": "https://liuyang-tian.github.io/;https://bdy9527.github.io/;http://www.shichuan.org/", "dblp": "51/3710-348.html;258/0824;64/3041-1", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;m4rsQCAAAAAJ;tUq_v90AAAAJ", "orcid": "0000-0002-6230-0282;0000-0003-2063-8223;0000-0002-3734-0266", "linkedin": ";;", "or_profile": "~Yang_Liu105;~Deyu_Bo1;~Chuan_Shi1", "aff": "Beijing University of Posts and Telecommunications;National University of Singapore;Beijing University of Post and Telecommunication", "aff_domain": "bupt.edu.cn;nus.edu.sg;bupt.edu.cn", "position": "MS student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nliu2024graph,\ntitle={Graph Distillation with Eigenbasis Matching},\nauthor={Yang Liu and Deyu Bo and Chuan Shi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DYN66IJCI9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 791557, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3809495278828593759&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "bupt.edu.cn;nus.edu.sg;bupt.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Beijing University of Posts and Telecommunications;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "http://www.bupt.edu.cn/;https://www.nus.edu.sg", "aff_unique_abbr": "BUPT;NUS", 
"aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Singapore" }, { "title": "Optimal Kernel Choice for Score Function-based Causal Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34621", "id": "DYd4vyyhUu", "proceeding": "https://proceedings.mlr.press/v235/wang24aa.html", "pdf": "https://openreview.net/pdf?id=DYd4vyyhUu", "openreview": "https://openreview.net/forum?id=DYd4vyyhUu", "author_site": "Wenjie Wang, Biwei Huang, Feng Liu, Xinge You, Tongliang Liu, Kun Zhang, Mingming Gong", "tldr": "", "abstract": "Score-based methods have demonstrated their effectiveness in discovering causal relationships by scoring different causal structures based on their goodness of fit to the data. Recently, Huang et al. proposed a generalized score function that can handle general data distributions and causal relationships by modeling the relations in reproducing kernel Hilbert space (RKHS). The selection of an appropriate kernel within this score function is crucial for accurately characterizing causal relationships and ensuring precise causal discovery. However, the current method involves manual heuristic selection of kernel parameters, making the process tedious and less likely to ensure optimality. In this paper, we propose a kernel selection method within the generalized score function that automatically selects the optimal kernel that best fits the data. Specifically, we model the generative process of the variables involved in each step of the causal graph search procedure as a mixture of independent noise variables. Based on this model, we derive an automatic kernel selection method by maximizing the marginal likelihood of the variables involved in each search step. 
We conduct experiments on both synthetic data and real-world benchmarks, and the results demonstrate that our proposed method outperforms heuristic kernel selection methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenjie Wang;Biwei Huang;Feng Liu;Xinge You;Tongliang Liu;Kun Zhang;Mingming Gong", "authorids": "~Wenjie_Wang3;~Biwei_Huang1;~Feng_Liu2;~Xinge_You1;~Tongliang_Liu1;~Kun_Zhang1;~Mingming_Gong1", "gender": "M;F;M;M;M;M;M", "homepage": ";;https://fengliu90.github.io/index.html;http://bmal.hust.edu.cn/info/1005/1091.htm;https://tongliang-liu.github.io/;http://www.andrew.cmu.edu/user/kunz1/;https://mingming-gong.github.io/", "dblp": ";165/3288;77/1318-3;16/1184;150/6667;96/3115-1;98/8479", "google_scholar": "TdnF2vcAAAAJ;;https://scholar.google.com/citations?hl=en;v7bRZX8AAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;RGoypN4AAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ", "orcid": ";;0000-0002-5005-9129;;;;0000-0001-7147-5589", "linkedin": ";;alexfengliu;;;;", "or_profile": "~Wenjie_Wang3;~Biwei_Huang1;~Feng_Liu2;~Xinge_You1;~Tongliang_Liu1;~Kun_Zhang1;~Mingming_Gong1", "aff": "University of Melbourne;University of California, San Diego;University of Melbourne;Huazhong University of Science and Technology;Mohamed bin Zayed University of Artificial Intelligence;Carnegie Mellon University;University of Melbourne", "aff_domain": "unimelb.edu;ucsd.edu;unimelb.edu.au;hust.edu.cn;mbzuai.ac.ae;cmu.edu;unimelb.edu.au", "position": "PhD student;Assistant Professor;Assistant Professor;Full Professor;Affiliated Associate Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024optimal,\ntitle={Optimal Kernel Choice for Score Function-based Causal Discovery},\nauthor={Wenjie Wang and Biwei Huang and Feng Liu and Xinge You and Tongliang Liu and Kun Zhang and Mingming Gong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DYd4vyyhUu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 671866, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2325456776738920831&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "unimelb.edu;ucsd.edu;unimelb.edu.au;hust.edu.cn;mbzuai.ac.ae;cmu.edu;unimelb.edu.au", "author_num": 7, "aff_unique_index": "0;1;0;2;3;4;0", "aff_unique_norm": "University of Melbourne;University of California, San Diego;Huazhong University of Science and Technology;Mohamed bin Zayed University of Artificial Intelligence;Carnegie Mellon University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.unimelb.edu.au;https://www.ucsd.edu;http://www.hust.edu.cn;https://mbzuai.ac.ae;https://www.cmu.edu", "aff_unique_abbr": "UniMelb;UCSD;HUST;MBZUAI;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;0;2;3;1;0", "aff_country_unique": "Australia;United States;China;United Arab Emirates" }, { "title": "Evolving Subnetwork Training for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34620", "id": "DbMm8pmoAP", "proceeding": "https://proceedings.mlr.press/v235/li24k.html", "pdf": "https://openreview.net/pdf?id=DbMm8pmoAP", "openreview": "https://openreview.net/forum?id=DbMm8pmoAP", "author_site": "hanqi li, Lu Chen, Da Ma, Zijian Wu, Su Zhu, Kai Yu", "tldr": "", "abstract": "Large language 
models have ushered in a new era of artificial intelligence research. However, their substantial training costs hinder further development and widespread adoption. In this paper, inspired by the redundancy in the parameters of large language models, we propose a novel training paradigm: Evolving Subnetwork Training (EST). EST samples subnetworks from the layers of the large language model and from commonly used modules within each layer, namely Multi-Head Attention (MHA) and Multi-Layer Perceptron (MLP). By gradually increasing the size of the subnetworks during the training process, EST can reduce the cost of training. We apply EST to train the GPT2 and TinyLlama models, resulting in a 26.7% FLOPs saving for GPT2 and 25.0% for TinyLlama without an increase in loss on the pre-training dataset. Moreover, EST leads to performance improvements in downstream tasks, indicating that it benefits generalization. Additionally, we provide intuitive theoretical studies based on training dynamics and Dropout theory to support the feasibility of EST.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanqi Li;Lu Chen;Da Ma;Zijian Wu;Su Zhu;Kai Yu", "authorids": "~Hanqi_Li3;~Lu_Chen3;~Da_Ma2;~Zijian_Wu5;~Su_Zhu1;~Kai_Yu3", "gender": "M;;M;M;M;M", "homepage": "https://coai-sjtu.github.io;https://x-lance.sjtu.edu.cn/en/members/ma-da;https://sz128.github.io/;https://x-lance.sjtu.edu.cn/~kaiyu/;https://github.com/daqige;https://github.com/wzj423/", "dblp": "69/157-2;;160/8144;197/1322-4;;", "google_scholar": "https://scholar.google.ca/citations?user=Fb3jWaYAAAAJ;;https://scholar.google.ca/citations?user=FqjtxrMAAAAJ;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;0000-0002-7102-9826;;", "linkedin": ";;;;;", "or_profile": "~Lu_Chen3;~Da_Ma2;~Su_Zhu1;~Kai_Yu3;~hanqi_li2;~\u5434\u5b50\u50651", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;AISpeech Co., Ltd.;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;aispeech.com;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Associate Professor;PhD student;Researcher;Full Professor;MS student;Undergrad student", "bibtex": "@inproceedings{\nli2024evolving,\ntitle={Evolving Subnetwork Training for Large Language Models},\nauthor={Hanqi Li and Lu Chen and Da Ma and Zijian Wu and Su Zhu and Kai Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DbMm8pmoAP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 782608, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16935639653828248860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;aispeech.com;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;AISpeech Co., Ltd.", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;", "aff_unique_abbr": "SJTU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "BiE: Bi-Exponent Block Floating-Point for Large Language Models Quantization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34619", "id": "DbyHDYslM7", "proceeding": "https://proceedings.mlr.press/v235/zou24d.html", "pdf":
"https://openreview.net/pdf?id=DbyHDYslM7", "openreview": "https://openreview.net/forum?id=DbyHDYslM7", "author_site": "Lancheng Zou, Wenqian Zhao, Shuo Yin, Chen Bai, Qi Sun, Bei Yu", "tldr": "", "abstract": "Nowadays, Large Language Models (LLMs) mostly possess billions of parameters, bringing significant challenges to hardware platforms. Although quantization is an efficient approach to reduce computation and memory overhead for inference optimization, we stress the challenge that mainstream low-bit quantization approaches still suffer from either various data distribution outliers or a lack of hardware efficiency. We also find that low-bit data format has further potential expressiveness to cover the atypical language data distribution. In this paper, we propose a novel numerical representation, Bi-Exponent Block Floating Point (BiE), and a new quantization flow. BiE quantization shows accuracy superiority and hardware friendliness on various models and benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lancheng Zou;Wenqian Zhao;Shuo Yin;Chen Bai;Qi Sun;Bei Yu", "authorids": "~Lancheng_Zou1;~Wenqian_Zhao2;~Shuo_Yin2;~Chen_Bai1;~Qi_Sun4;~Bei_Yu2", "gender": ";M;M;;M;M", "homepage": "https://lanchengzou.github.io/;;https://sawydust1228.github.io/;https://baichen318.github.io/;http://qisunchn.top/;http://www.cse.cuhk.edu.hk/~byu/index.html", "dblp": "321/2579;;;;05/4187-2;28/4556-1.html", "google_scholar": "BVgV-swAAAAJ;;;eEQsWDIAAAAJ;;tGneTm4AAAAJ", "orcid": "0009-0004-6820-7064;;0000-0003-4927-0194;0000-0002-1742-0090;0000-0001-5153-6698;0000-0001-6406-4810", "linkedin": ";https://www.linkedin.com/public-profile/settings?trk=d_flagship3_profile_self_view_public_profile&lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_self_edit_top_card%3BAR9ogijPQiGqjcriNqmP4Q%3D%3D;;chen-bai-b48b95152/;qi-sun-b655a1146/;yubei/", "or_profile": "~Lancheng_Zou1;~Wenqian_Zhao2;~Shuo_Yin2;~Chen_Bai1;~Qi_Sun4;~Bei_Yu2", "aff": "The Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong;Zhejiang University;Department of Computer Science and Engineering, The Chinese University of Hong Kong", "aff_domain": "cuhk.edu.hk;cse.cuhk.edu.hk;cse.cuhk.edu.hk;cse.cuhk.edu.hk;zju.edu.cn;cse.cuhk.edu.hk", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzou2024bie,\ntitle={BiE: Bi-Exponent Block Floating-Point for Large Language Models Quantization},\nauthor={Lancheng Zou and Wenqian Zhao and Shuo Yin and Chen Bai and Qi Sun and Bei Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DbyHDYslM7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 875970, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1368166953241268896&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cuhk.edu.hk;cse.cuhk.edu.hk;cse.cuhk.edu.hk;cse.cuhk.edu.hk;zju.edu.cn;cse.cuhk.edu.hk", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.zju.edu.cn", 
"aff_unique_abbr": "CUHK;ZJU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34618", "id": "DgLFkAPwuZ", "proceeding": "https://proceedings.mlr.press/v235/yang24ai.html", "pdf": "https://openreview.net/pdf?id=DgLFkAPwuZ", "openreview": "https://openreview.net/forum?id=DgLFkAPwuZ", "author_site": "Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, Bin Cui", "tldr": "", "abstract": "Diffusion models have exhibit exceptional performance in text-to-image generation and editing. However, existing methods often face challenges when handling complex text prompts that involve multiple objects with multiple attributes and relationships. In this paper, we propose a brand new training-free text-to-image generation/editing framework, namely Recaption, Plan and Generate (RPG), harnessing the powerful chain-of-thought reasoning ability of multimodal LLMs to enhance the compositionality of text-to-image diffusion models. Our approach employs the MLLM as a global planner to decompose the process of generating complex images into multiple simpler generation tasks within subregions. We propose complementary regional diffusion to enable region-wise compositional generation. Furthermore, we integrate text-guided image generation and editing within the proposed RPG in a closed-loop fashion, thereby enhancing generalization ability. Extensive experiments demonstrate our RPG outperforms state-of-the-art text-to-image models, including DALL-E 3 and SDXL, particularly in multi-category object composition and text-image semantic alignment. Notably, our RPG framework exhibits wide compatibility with various MLLM architectures and diffusion backbones. 
Our code is available at https://github.com/YangLing0818/RPG-DiffusionMaster", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ling Yang;Zhaochen Yu;Chenlin Meng;Minkai Xu;Stefano Ermon;Bin CUI", "authorids": "~Ling_Yang1;~Zhaochen_Yu2;~Chenlin_Meng1;~Minkai_Xu1;~Stefano_Ermon1;~Bin_CUI2", "gender": "M;M;F;M;M;M", "homepage": "https://yangling0818.github.io/;https://zhaochenyu0201.github.io;https://chenlin9.github.io/;https://minkaixu.com;http://cs.stanford.edu/~ermon/;https://cuibinpku.github.io/index.html", "dblp": "01/24-6.html;;227/2517;257/3355;47/8135;55/5031.html", "google_scholar": "https://scholar.google.com.hk/citations?user=sIKujqAAAAAJ;9RNgZOIAAAAJ;nEFU7wIAAAAJ;https://scholar.google.com/citations?hl=en;;IJAU8KoAAAAJ", "orcid": "0000-0003-1905-8053;;;;;0000-0003-1681-4677", "linkedin": ";;;;;", "or_profile": "~Ling_Yang1;~Zhaochen_Yu2;~Chenlin_Meng1;~Minkai_Xu1;~Stefano_Ermon1;~Bin_CUI2", "aff": "Peking University;Peking University;Stanford University;Stanford University;Stanford University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;stanford.edu;stanford.edu;stanford.edu;pku.edu.cn", "position": "PhD student;Intern;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024mastering,\ntitle={Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal {LLM}s},\nauthor={Ling Yang and Zhaochen Yu and Chenlin Meng and Minkai Xu and Stefano Ermon and Bin CUI},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DgLFkAPwuZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9080374, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10968503748345338905&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;pku.edu.cn;stanford.edu;stanford.edu;stanford.edu;pku.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "Peking University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu", "aff_unique_abbr": "Peking U;Stanford", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Collective Certified Robustness against Graph Injection Attacks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34617", "id": "DhxZVq1ZOo", "proceeding": "https://proceedings.mlr.press/v235/lai24a.html", "pdf": "https://openreview.net/pdf?id=DhxZVq1ZOo", "openreview": "https://openreview.net/forum?id=DhxZVq1ZOo", "author_site": "Yuni Lai, Bailin PAN, kaihuang CHEN, Yancheng Yuan, Kai Zhou", "tldr": "", "abstract": "We investigate certified robustness for GNNs under graph injection attacks. Existing research only provides sample-wise certificates by verifying each node independently, leading to very limited certifying performance. In this paper, we present the first collective certificate, which certifies a set of target nodes simultaneously. To achieve it, we formulate the problem as a binary integer quadratic constrained linear programming (BQCLP). We further develop a customized linearization technique that allows us to relax the BQCLP into linear programming (LP) that can be efficiently solved. 
Through comprehensive experiments, we demonstrate that our collective certification scheme significantly improves certification performance with minimal computational overhead. For instance, by solving the LP within 1 minute on the Citeseer dataset, we achieve a significant increase in the certified ratio from 0.0% to 81.2% when the number of injected nodes is 5% of the graph size. Our paper marks a crucial step towards making provable defenses more practical. Our source code is available at https://github.com/Yuni-Lai/CollectiveLPCert.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuni Lai;Bailin PAN;kaihuang CHEN;Yancheng Yuan;Kai Zhou", "authorids": "~Yuni_Lai1;~Bailin_PAN1;~kaihuang_CHEN1;~Yancheng_Yuan1;~Kai_Zhou2", "gender": "F;;M;;M", "homepage": ";;;;https://www4.comp.polyu.edu.hk/~kaizhou/", "dblp": ";;;;", "google_scholar": "https://scholar.google.com/citations?hl=zh-TW;;;;J2QAuAUAAAAJ", "orcid": "0000-0002-2295-912X;;0000-0003-4868-6331;;", "linkedin": ";;;;", "or_profile": "~Yuni_Lai1;~Bailin_PAN1;~kaihuang_CHEN1;~Yancheng_Yuan1;~Kai_Zhou2", "aff": "Hong Kong Polytechnic University;;Hong Kong Polytechnic University;;Hong Kong Polytechnic University", "aff_domain": "polyu.edu.hk;;connect.polyu.hk;;polyu.edu.hk", "position": "PhD student;;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nlai2024collective,\ntitle={Collective Certified Robustness against Graph Injection Attacks},\nauthor={Yuni Lai and Bailin PAN and kaihuang CHEN and Yancheng Yuan and Kai Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DhxZVq1ZOo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1305323, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14809694692771015680&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "polyu.edu.hk;;connect.polyu.hk;;polyu.edu.hk", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Hong Kong Polytechnic University", "aff_unique_dep": "", "aff_unique_url": "https://www.polyu.edu.hk", "aff_unique_abbr": "PolyU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "GenCO: Generating Diverse Designs with Combinatorial Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34616", "id": "DiyE6OOGBa", "proceeding": "https://proceedings.mlr.press/v235/ferber24a.html", "pdf": "https://openreview.net/pdf?id=DiyE6OOGBa", "openreview": "https://openreview.net/forum?id=DiyE6OOGBa", "author_site": "Aaron Ferber, Arman Zharmagambetov, Taoan Huang, Bistra Dilkina, Yuandong Tian", "tldr": "", "abstract": "Deep generative models like GAN and VAE have shown impressive results in generating unconstrained objects like images. However, many design settings arising in industrial design, material science, computer graphics and more require that the generated objects satisfy hard combinatorial constraints or meet objectives in addition to modeling a data distribution. To address this, we propose GenCO, a generative framework that guarantees constraint satisfaction throughout training by leveraging differentiable combinatorial solvers to enforce feasibility.
GenCO imposes the generative loss on provably feasible solutions rather than intermediate soft solutions, meaning that the deep generative network can focus on ensuring the generated objects match the data distribution without having to also capture feasibility. This shift enables practitioners to enforce hard constraints on the generated outputs during end-to-end training, enabling assessments of their feasibility and introducing additional combinatorial loss components to deep generative training. We demonstrate the effectiveness of our approach on a variety of generative combinatorial tasks, including game level generation, map creation for path planning, and photonic device design, consistently demonstrating its capability to yield diverse, high-quality solutions that verifiably adhere to user-specified combinatorial properties.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron M Ferber;Arman Zharmagambetov;Taoan Huang;Bistra Dilkina;Yuandong Tian", "authorids": "~Aaron_M_Ferber1;~Arman_Zharmagambetov1;~Taoan_Huang2;~Bistra_Dilkina2;~Yuandong_Tian1", "gender": "M;M;M;F;M", "homepage": "https://aaron-ferber.github.io/;https://arman-z.github.io/;;;http://yuandong-tian.com", "dblp": "163/7788;252/5004;241/7690;30/5718;t/YuandongTian", "google_scholar": "TuVq07oAAAAJ;D6QocXMAAAAJ;;1jjyaBYAAAAJ;0mgEF28AAAAJ", "orcid": ";;;0000-0002-6784-473X;0000-0003-4202-4847", "linkedin": "aaron-ferber-64a73980/;;;;yuandongtian", "or_profile": "~Aaron_M_Ferber1;~Arman_Zharmagambetov1;~Taoan_Huang2;~Bistra_Dilkina2;~Yuandong_Tian1", "aff": "Cornell University;Meta AI (FAIR);University of Southern California;University of Southern California;Meta AI (FAIR)", "aff_domain": "cornell.edu;meta.com;usc.edu;usc.edu;meta.com", "position": "Postdoc;Postdoc;PhD student;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nferber2024genco,\ntitle={Gen{CO}: Generating Diverse Designs with Combinatorial Constraints},\nauthor={Aaron M Ferber and Arman Zharmagambetov and Taoan Huang and Bistra Dilkina and Yuandong Tian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DiyE6OOGBa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2487648, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17454785055459201339&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cornell.edu;meta.com;usc.edu;usc.edu;meta.com", "author_num": 5, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "Cornell University;Meta;University of Southern California", "aff_unique_dep": ";Facebook AI Research (FAIR);", "aff_unique_url": "https://www.cornell.edu;https://ai.facebook.com;https://www.usc.edu", "aff_unique_abbr": "Cornell;Meta AI;USC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Low-Rank Bandits via Tight Two-to-Infinity Singular Subspace Recovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34615", "id": "Dk0RBrqiyk", "proceeding": "https://proceedings.mlr.press/v235/jedra24a.html", "pdf": "https://openreview.net/pdf?id=Dk0RBrqiyk", "openreview": "https://openreview.net/forum?id=Dk0RBrqiyk", "author_site": "Yassir Jedra, William R\u00e9veillard, Stefan Stojanovic, Alexandre Proutiere", "tldr": "", "abstract": "We study contextual bandits with 
low-rank structure where, in each round, if the (context, arm) pair $(i,j)\\in [m]\\times [n]$ is selected, the learner observes a noisy sample of the $(i,j)$-th entry of an unknown low-rank reward matrix. Successive contexts are generated randomly in an i.i.d. manner and are revealed to the learner. For such bandits, we present efficient algorithms for policy evaluation, best policy identification and regret minimization. For policy evaluation and best policy identification, we show that our algorithms are nearly minimax optimal. For instance, the number of samples required to return an $\\varepsilon$-optimal policy with probability at least $1-\\delta$ typically scales as $\\frac{m+n}{\\varepsilon^2}\\log(1/\\delta)$. Our regret minimization algorithm enjoys minimax guarantees typically scaling as $r^{5/4}(m+n)^{3/4}\\sqrt{T}$, which improves over existing algorithms. All the proposed algorithms consist of two phases: they first leverage spectral methods to estimate the left and right singular subspaces of the low-rank reward matrix. We show that these estimates enjoy tight error guarantees in the two-to-infinity norm. This in turn allows us to reformulate our problems as a misspecified linear bandit problem with dimension roughly $r(m+n)$ and misspecification controlled by the subspace recovery error, as well as to design the second phase of our algorithms efficiently.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yassir Jedra;William R\u00e9veillard;Stefan Stojanovic;Alexandre Proutiere", "authorids": "~Yassir_Jedra1;~William_R\u00e9veillard1;~Stefan_Stojanovic1;~Alexandre_Proutiere1", "gender": ";M;M;M", "homepage": "https://sites.google.com/view/yassir-jedra/home?authuser=1;;https://www.kth.se/profile/stesto;https://people.kth.se/~alepro/", "dblp": "238/0358;371/4405;315/5080;p/AlexandreProutiere", "google_scholar": "tePNfWQAAAAJ;JoyqAFkAAAAJ;jCkz9ykAAAAJ;g5sya5cAAAAJ", "orcid": ";;;", "linkedin": "yassirjedra/;;;", "or_profile": "~Yassir_Jedra1;~William_R\u00e9veillard1;~Stefan_Stojanovic1;~Alexandre_Proutiere1", "aff": "Massachusetts Institute of Technology;KTH Royal Institute of Technology;KTH Royal Institute of Technology;KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "mit.edu;kth.se;kth.se;kth.se", "position": "Postdoc;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\njedra2024lowrank,\ntitle={Low-Rank Bandits via Tight Two-to-Infinity Singular Subspace Recovery},\nauthor={Yassir Jedra and William R{\\'e}veillard and Stefan Stojanovic and Alexandre Proutiere},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Dk0RBrqiyk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1584116, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8319589538774261941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;kth.se;kth.se;kth.se", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;KTH Royal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.kth.se", "aff_unique_abbr": "MIT;KTH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stockholm", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;Sweden" }, { "title": "Failures Are Fated, But Can Be Faded: Characterizing and Mitigating 
Unwanted Behaviors in Large-Scale Vision and Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34614", "id": "DkqiId4AuR", "proceeding": "https://proceedings.mlr.press/v235/sagar24a.html", "pdf": "https://openreview.net/pdf?id=DkqiId4AuR", "openreview": "https://openreview.net/forum?id=DkqiId4AuR", "author_site": "Som Sagar, Aditya Taparia, Ransalu Senanayake", "tldr": "", "abstract": "In large deep neural networks that seem to perform surprisingly well on many tasks, we also observe a few failures related to accuracy, social biases, and alignment with human values, among others. Therefore, before deploying these models, it is crucial to characterize this failure landscape for engineers to debug and legislative bodies to audit models. Nevertheless, it is infeasible to exhaustively test for all possible combinations of factors that could lead to a model's failure. In this paper, we introduce a post-hoc method that utilizes *deep reinforcement learning* to explore and construct the landscape of failure modes in pre-trained discriminative and generative models. With the aid of limited human feedback, we then demonstrate how to restructure the failure landscape to be more desirable by moving away from the discovered failure modes. We empirically show the effectiveness of the proposed method across common Computer Vision, Natural Language Processing, and Vision-Language tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Som Sagar;Aditya Taparia;Ransalu Senanayake", "authorids": "~Som_Sagar1;~Aditya_Taparia1;~Ransalu_Senanayake1", "gender": "M;M;M", "homepage": ";;http://www.ransalu.com", "dblp": ";;131/7006", "google_scholar": "qWxEX0QAAAAJ;hfU1wQwAAAAJ;mmo0bDIAAAAJ", "orcid": ";;", "linkedin": "som-sagar-221b631ba;aditya-taparia/;ransalu/", "or_profile": "~Som_Sagar1;~Aditya_Taparia1;~Ransalu_Senanayake1", "aff": "Arizona State University;Arizona State University;Arizona State University", "aff_domain": "asu.edu;asu.edu;asu.edu", "position": "PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nsagar2024failures,\ntitle={Failures Are Fated, But Can Be Faded: Characterizing and Mitigating Unwanted Behaviors in Large-Scale Vision and Language Models},\nauthor={Som Sagar and Aditya Taparia and Ransalu Senanayake},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DkqiId4AuR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4458984, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6261471949673726789&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "asu.edu;asu.edu;asu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Model-based Reinforcement Learning for Confounded POMDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34613", "id": "DlR8fWgJRl", "proceeding": "https://proceedings.mlr.press/v235/hong24d.html", "pdf": "https://openreview.net/pdf?id=DlR8fWgJRl", "openreview": "https://openreview.net/forum?id=DlR8fWgJRl", "author_site": "Mao Hong, Zhengling Qi, Yanxun Xu", "tldr": 
"", "abstract": "We propose a model-based offline reinforcement learning (RL) algorithm for confounded partially observable Markov decision processes (POMDPs) under general function approximations and show it is provably efficient under some technical conditions such as the partial coverage imposed on the offline data distribution. Specifically, we first establish a novel model-based identification result for learning the effect of any action on the reward and future transitions in the confounded POMDP. Using this identification result, we then design a nonparametric two-stage estimation procedure to construct an estimator for off-policy evaluation (OPE), which permits general function approximations. Finally, we learn the optimal policy by performing a conservative policy optimization within the confidence regions based on the proposed estimation procedure for OPE. Under some mild conditions, we establish a finite-sample upper bound on the suboptimality of the learned policy in finding the optimal one, which depends on the sample size and the length of horizons polynomially.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mao Hong;Zhengling Qi;Yanxun Xu", "authorids": "~Mao_Hong1;~Zhengling_Qi1;~Yanxun_Xu1", "gender": "M;;F", "homepage": ";https://sites.google.com/view/statsqizl/home?authuser=0;http://www.ams.jhu.edu/~yxu70", "dblp": ";173/0201;", "google_scholar": "GUKNcVUAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "mao-hong-a45624195/;;", "or_profile": "~Mao_Hong1;~Zhengling_Qi1;~Yanxun_Xu1", "aff": "Johns Hopkins University;George Washington University;Johns Hopkins University", "aff_domain": "jh.edu;gwu.edu;jhu.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhong2024modelbased,\ntitle={Model-based Reinforcement Learning for Confounded {POMDP}s},\nauthor={Mao Hong and Zhengling Qi and Yanxun Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DlR8fWgJRl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 621529, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7488706469384807074&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 5, "email": "jh.edu;gwu.edu;jhu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Johns Hopkins University;George Washington University", "aff_unique_dep": ";", "aff_unique_url": "https://www.jhu.edu;https://www.gwu.edu", "aff_unique_abbr": "JHU;GWU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "How Graph Neural Networks Learn: Lessons from Training Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34612", "id": "Dn4B53IcCW", "proceeding": "https://proceedings.mlr.press/v235/yang24ae.html", "pdf": "https://openreview.net/pdf?id=Dn4B53IcCW", "openreview": "https://openreview.net/forum?id=Dn4B53IcCW", "author_site": "Chenxiao Yang, Qitian Wu, David Wipf, Ruoyu Sun, Junchi Yan", "tldr": "", "abstract": "A long-standing goal in deep learning has been to characterize the learning behavior of black-box models in a more interpretable manner. 
For graph neural networks (GNNs), considerable advances have been made in formalizing what functions they can represent, but whether GNNs will learn desired functions during the optimization process remains less clear. To fill this gap, we study their training dynamics in function space. In particular, we find that the optimization of GNNs through gradient descent implicitly leverages the graph structure to update the learned function. This phenomenon is dubbed kernel-graph alignment and has been corroborated both empirically and theoretically. This new analytical framework from the optimization perspective enables interpretable explanations of when and why the learned GNN functions generalize, which are relevant to their limitations on heterophilic graphs. From a practical standpoint, it also provides high-level principles for designing new algorithms. We exemplify this by showing that a simple and efficient non-parametric algorithm, obtained by explicitly using graph structure to update the learned function, can consistently compete with nonlinear GNNs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenxiao Yang;Qitian Wu;David Wipf;Ruoyu Sun;Junchi Yan", "authorids": "~Chenxiao_Yang1;~Qitian_Wu1;~David_Wipf1;~Ruoyu_Sun1;~Junchi_Yan2", "gender": ";;M;;", "homepage": ";;http://www.davidwipf.com/;https://ruoyus.github.io/;", "dblp": ";;81/6421;30/9879-1;", "google_scholar": ";;YJx1WSgAAAAJ;PsfzbCMAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Chenxiao_Yang1;~Qitian_Wu1;~David_Wipf1;~Ruoyu_Sun1;~Junchi_Yan2", "aff": ";;Amazon AI Research Lab;The Chinese University of Hong Kong;", "aff_domain": ";;amazon.com;cuhk.edu.cn;", "position": ";;Principal Research Scientist;Associate Professor;", "bibtex": "@inproceedings{\nyang2024how,\ntitle={How Graph Neural Networks Learn: Lessons from Training Dynamics},\nauthor={Chenxiao Yang and Qitian Wu and David Wipf and Ruoyu Sun and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Dn4B53IcCW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 904511, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4642642196198084325&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";;amazon.com;cuhk.edu.cn;", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;Chinese University of Hong Kong", "aff_unique_dep": "Amazon AI Research Lab;", "aff_unique_url": "https://www.amazon.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Amazon AI;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "Position: Why We Must Rethink Empirical Research in Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34611", "id": "DprrMz24tk", "proceeding": "https://proceedings.mlr.press/v235/herrmann24b.html", "pdf": "https://openreview.net/pdf?id=DprrMz24tk", "openreview": "https://openreview.net/forum?id=DprrMz24tk", "author_site": "Moritz Herrmann, F. Julian D.
Lange, Katharina Eggensperger, Giuseppe Casalicchio, Marcel Wever, Matthias Feurer, David R\u00fcgamer, Eyke H\u00fcllermeier, Anne-Laure Boulesteix, Bernd Bischl", "tldr": "", "abstract": "We warn against a common but incomplete understanding of empirical research in machine learning that leads to non-replicable results, makes findings unreliable, and threatens to undermine progress in the field. To overcome this alarming situation, we call for more awareness of the plurality of ways of gaining knowledge experimentally but also of some epistemic limitations. In particular, we argue most current empirical machine learning research is fashioned as confirmatory research while it should rather be considered exploratory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Moritz Herrmann;F. Julian D. Lange;Katharina Eggensperger;Giuseppe Casalicchio;Marcel Wever;Matthias Feurer;David R\u00fcgamer;Eyke H\u00fcllermeier;Anne-Laure Boulesteix;Bernd Bischl", "authorids": "~Moritz_Herrmann1;~F._Julian_D._Lange1;~Katharina_Eggensperger1;~Giuseppe_Casalicchio1;~Marcel_Wever1;~Matthias_Feurer2;~David_R\u00fcgamer1;~Eyke_H\u00fcllermeier1;~Anne-Laure_Boulesteix1;~Bernd_Bischl1", "gender": "M;;F;M;M;;M;M;F;M", "homepage": "https://www.ibe.med.uni-muenchen.de/mitarbeiter/mitarbeiter/herrmann_moritz/index.html;;https://uni-tuebingen.de/en/research/core-research/cluster-of-excellence-machine-learning/research/research/cluster-research-groups/research-groups/automl-for-science/;https://www.slds.stat.uni-muenchen.de/people/casalicchio/;https://www.marcelwever.de;;https://davidruegamer.github.io/;https://cs.uni-paderborn.de/index.php?id=60202;https://www.en.ibe.med.uni-muenchen.de/mitarbeiter/professoren/boulesteix/index.html;https://www.slds.stat.uni-muenchen.de/", "dblp": ";;152/6150;194/2843;202/9010;;220/5560;h/EykeHullermeier;;48/5326", "google_scholar": ";;ADZN2nwAAAAJ;https://scholar.google.de/citations?user=MHcaZMQAAAAJ;ZaE04WUAAAAJ;;https://scholar.google.de/citations?user=_DYguksAAAAJ;https://scholar.google.de/citations?user=usVJeNN3xFAC;;https://scholar.google.de/citations?user=s34UckkAAAAJ", "orcid": "0000-0002-4893-5812;;;0000-0001-5324-5966;0000-0001-9782-6818;;;0000-0002-9944-4108;;0000-0001-6002-6980", "linkedin": ";;;;;;;;;", "or_profile": "~Moritz_Herrmann1;~F._Julian_D._Lange1;~Katharina_Eggensperger1;~Giuseppe_Casalicchio1;~Marcel_Wever1;~Matthias_Feurer2;~David_R\u00fcgamer1;~Eyke_H\u00fcllermeier1;~Anne-Laure_Boulesteix1;~Bernd_Bischl1", "aff": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Munich, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;;LMU Munich;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;LMU", "aff_domain": "lmu.de;;uni-tuebingen.de;lmu.de;lmu.de;;lmu.de;lmu.de;lmu.de;uni-muenchen.de", "position": "Postdoc;;Postdoc;Postdoc;Postdoc;;Associate Professor;Full Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nherrmann2024position,\ntitle={Position: Why We Must Rethink Empirical Research in Machine Learning},\nauthor={Moritz Herrmann and F. Julian D. 
Lange and Katharina Eggensperger and Giuseppe Casalicchio and Marcel Wever and Matthias Feurer and David R{\\"u}gamer and Eyke H{\\"u}llermeier and Anne-Laure Boulesteix and Bernd Bischl},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DprrMz24tk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 334880, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1374332462686818891&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "lmu.de;;uni-tuebingen.de;lmu.de;lmu.de;;lmu.de;lmu.de;lmu.de;uni-muenchen.de", "author_num": 10, "aff_unique_index": "0;1;0;0;2;0;0;2", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Eberhard Karls University of T\u00fcbingen;Ludwig Maximilian University of Munich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.lmu.de;https://www.uni-tuebingen.de/;https://www.lmu.de", "aff_unique_abbr": "LMU;Uni T\u00fcbingen;LMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";T\u00fcbingen;Munich", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Adaptive Group Personalization for Federated Mutual Transfer Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34610", "id": "DqC9XiI71U", "proceeding": "https://proceedings.mlr.press/v235/xu24u.html", "pdf": "https://openreview.net/pdf?id=DqC9XiI71U", "openreview": "https://openreview.net/forum?id=DqC9XiI71U", "author_site": "Haoqing Xu, Dian Shen, Meng Wang, Beilun Wang", "tldr": "", "abstract": "Mutual transfer learning aims to improve prediction with knowledge from related domains. Recently, federated learning has been applied in this field to address the communication and privacy concerns. However, previous clustered federated learning (CFL) solutions lack a theoretical guarantee of learnability recovery and require time-consuming hyper-parameter tuning, while centralized mutual transfer learning methods lack adaptability to concept drifts. In this paper, we propose the Adaptive Group Personalization method (**AdaGrP**) to overcome these challenges. We adaptively decide the recovery threshold with a nonparametric method, *adaptive threshold correction*, for a tuning-free solution with a relaxed condition. Theoretical results guarantee the perfect learnability recovery with the corrected threshold.
Empirical results show AdaGrP achieves a 16.9% average improvement in learnability structure recovery compared with state-of-the-art CFL baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoqing Xu;Dian Shen;Meng Wang;Beilun Wang", "authorids": "~Haoqing_Xu1;~Dian_Shen1;~Meng_Wang11;~Beilun_Wang1", "gender": "M;M;;M", "homepage": ";https://dianshenseu.github.io/en/;;https://cse.seu.edu.cn/2019/0105/c23024a257533/pagem.htm", "dblp": "298/2770;139/4309;;180/5592", "google_scholar": ";;;", "orcid": "0000-0003-1843-4344;;;0000-0002-2646-1492", "linkedin": ";;;", "or_profile": "~Haoqing_Xu1;~Dian_Shen1;~Meng_Wang11;~Beilun_Wang1", "aff": "Southeast University;Southeast University;;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn;;seu.edu.cn", "position": "MS student;Associate Professor;;Associate Professor", "bibtex": "@inproceedings{\nxu2024adaptive,\ntitle={Adaptive Group Personalization for Federated Mutual Transfer Learning},\nauthor={Haoqing Xu and Dian Shen and Meng Wang and Beilun Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DqC9XiI71U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6841394, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GRNGNc6CgeEJ:scholar.google.com/&scioq=Adaptive+Group+Personalization+for+Federated+Mutual+Transfer+Learning&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "seu.edu.cn;seu.edu.cn;;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Improving Diffusion Models for Inverse Problems Using Optimal Posterior Covariance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34609", "id": "DrE7jVF4VW", "proceeding": "https://proceedings.mlr.press/v235/peng24h.html", "pdf": "https://openreview.net/pdf?id=DrE7jVF4VW", "openreview": "https://openreview.net/forum?id=DrE7jVF4VW", "author_site": "Xinyu Peng, Ziyang Zheng, Wenrui Dai, Nuoqian Xiao, Chenglin Li, Junni Zou, Hongkai Xiong", "tldr": "", "abstract": "Recent diffusion models provide a promising zero-shot solution to noisy linear inverse problems without retraining for specific inverse problems. In this paper, we reveal that recent methods can be uniformly interpreted as employing a Gaussian approximation with hand-crafted isotropic covariance for the intractable denoising posterior to approximate the conditional posterior mean. Inspired by this finding, we propose to improve recent methods by using a more principled covariance determined by maximum likelihood estimation. To achieve posterior covariance optimization without retraining, we provide general plug-and-play solutions based on two approaches specifically designed for leveraging pre-trained models with and without reverse covariance. We further propose a scalable method for learning posterior covariance prediction based on a representation with an orthonormal basis.
Experimental results demonstrate that the proposed methods significantly enhance reconstruction performance without requiring hyperparameter tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyu Peng;Ziyang Zheng;Wenrui Dai;Nuoqian Xiao;Chenglin Li;Junni Zou;Hongkai Xiong", "authorids": "~Xinyu_Peng1;~Ziyang_Zheng2;~Wenrui_Dai1;~Nuoqian_Xiao1;~Chenglin_Li2;~Junni_Zou1;~Hongkai_Xiong1", "gender": "M;M;;;M;F;M", "homepage": "https://github.com/xypeng9903;;;;https://min.sjtu.edu.cn/En/FacultyShow/4?Vid=17;http://www.cs.sjtu.edu.cn/~zou-jn;http://min.sjtu.edu.cn", "dblp": ";;16/5135.html;;;91/4613;21/3569", "google_scholar": ";pcgDcMmDJbwC;Xg8MhyAAAAAJ;;ltW2JMcAAAAJ;https://scholar.google.com/citations?hl=zh-CN;bB16iN4AAAAJ", "orcid": ";0000-0001-9923-8016;;;;;0000-0003-4552-0029", "linkedin": "xinyu-peng-328918246/;;;;;;", "or_profile": "~Xinyu_Peng1;~Ziyang_Zheng2;~Wenrui_Dai1;~Nuoqian_Xiao1;~Chenglin_Li2;~Junni_Zou1;~Hongkai_Xiong1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;PhD student;Associate Professor;;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\npeng2024improving,\ntitle={Improving Diffusion Models for Inverse Problems Using Optimal Posterior Covariance},\nauthor={Xinyu Peng and Ziyang Zheng and Wenrui Dai and Nuoqian Xiao and Chenglin Li and Junni Zou and Hongkai Xiong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DrE7jVF4VW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8311750, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14269481151172449337&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Stability and Generalization for Stochastic Recursive Momentum-based Algorithms for (Strongly-)Convex One to $K$-Level Stochastic Optimizations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34608", "id": "DsVzHj7jcA", "proceeding": "https://proceedings.mlr.press/v235/pan24e.html", "pdf": "https://openreview.net/pdf?id=DsVzHj7jcA", "openreview": "https://openreview.net/forum?id=DsVzHj7jcA", "author_site": "Xiaokang Pan, Xingyu Li, Jin Liu, Tao Sun, Kai Sun, Lixing Chen, Zhe Qu", "tldr": "", "abstract": "STOchastic Recursive Momentum (STORM)-based algorithms have been widely developed to solve one to $K$-level ($K \\geq 3$) stochastic optimization problems. Specifically, they use estimators to mitigate the biased gradient issue and achieve near-optimal convergence results. However, there is relatively little work on understanding their generalization performance, particularly evident during the transition from one to $K$-level optimization contexts. 
This paper provides a comprehensive generalization analysis of three representative STORM-based algorithms: STORM, COVER, and SVMR, for one, two, and $K$-level stochastic optimizations under both convex and strongly convex settings based on algorithmic stability. Firstly, we define stability for $K$-level optimizations and link it to generalization. Then, we detail the stability results for three prominent STORM-based algorithms. Finally, we derive their excess risk bounds by balancing stability results with optimization errors. Our theoretical results provide strong evidence to complete STORM-based algorithms: (1) Each estimator may decrease their stability due to variance with its estimation target. (2) Every additional level might escalate the generalization error, influenced by the stability and the variance between its cumulative stochastic gradient and the true gradient. (3) Increasing the batch size for the initial computation of estimators presents a favorable trade-off, enhancing the generalization performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaokang Pan;Xingyu Li;Jin Liu;Tao Sun;Kai Sun;Lixing Chen;Zhe Qu", "authorids": "~Xiaokang_Pan1;~Xingyu_Li4;~Jin_Liu12;~Tao_Sun7;~Kai_Sun6;~Lixing_Chen1;~Zhe_Qu1", "gender": "M;M;;M;M;M;M", "homepage": "https://scholar.google.com/citations?hl=zh-CN&user=rtzTU6UAAAAJ;https://nikoxing.github.io/;;;;https://icst.sjtu.edu.cn/DirectoryDetail.aspx?id=27;https://zhequ1992.github.io/", "dblp": ";;;74/3590-5;;154/7371;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;stA6f5YAAAAJ;;fPNZpAe5WXIC;https://scholar.google.com.hk/citations?user=buUlnJUAAAAJ;boMMe2YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-5547-9894;;;;0000-0002-1805-0183;0000-0003-2211-2137", "linkedin": ";xy-li-link/;;;;;", "or_profile": "~Xiaokang_Pan1;~Xingyu_Li4;~Jin_Liu12;~Tao_Sun7;~Kai_Sun6;~Lixing_Chen1;~Zhe_Qu1", "aff": "Central South University;Tulane University;;National University of Defense Technology;Xi'an Jiaotong University;Shanghai Jiaotong University;Central South University", "aff_domain": "csu.edu.cn;tulane.edu;;nudt.edu.cn;xjtu.edu.cn;sjtu.edu.cn;csu.edu.cn", "position": "MS student;Postdoc;;Associate Professor;Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\npan2024stability,\ntitle={Stability and Generalization for Stochastic Recursive Momentum-based Algorithms for (Strongly-)Convex One to \\$K\\$-Level Stochastic Optimizations},\nauthor={Xiaokang Pan and Xingyu Li and Jin Liu and Tao Sun and Kai Sun and Lixing Chen and Zhe Qu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DsVzHj7jcA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 753561, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:11vRVpHlSoQJ:scholar.google.com/&scioq=Stability+and+Generalization+for+Stochastic+Recursive+Momentum-based+Algorithms+for+(Strongly-)Convex+One+to+%24K%24-Level+Stochastic+Optimizations&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "csu.edu.cn;tulane.edu;;nudt.edu.cn;xjtu.edu.cn;sjtu.edu.cn;csu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;3;4;0", "aff_unique_norm": "Central South University;Tulane University;National University of Defense Technology;Xi'an Jiao Tong University;Shanghai Jiao Tong University", "aff_unique_dep": ";;;;", 
"aff_unique_url": "https://www.csu.edu.cn;https://www.tulane.edu;http://www.nudt.edu.cn/;https://www.xjtu.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "CSU;Tulane;NUDT;XJTU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Dynamic Evaluation of Large Language Models by Meta Probing Agents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34607", "id": "DwTgy1hXXo", "proceeding": "https://proceedings.mlr.press/v235/zhu24m.html", "pdf": "https://openreview.net/pdf?id=DwTgy1hXXo", "openreview": "https://openreview.net/forum?id=DwTgy1hXXo", "author_site": "Kaijie Zhu, Jindong Wang, Qinlin Zhao, Ruochen Xu, Xing Xie", "tldr": "", "abstract": "Evaluation of large language models (LLMs) has raised great concerns in the community due to the issue of data contamination. Existing work designed evaluation protocols using well-defined algorithms for specific tasks, which cannot be easily extended to diverse scenarios. Moreover, current evaluation benchmarks can only provide the overall benchmark results and cannot support a fine-grained and multifaceted analysis of LLMs' abilities. In this paper, we propose meta probing agents (MPA), a general dynamic evaluation protocol inspired by psychometrics to evaluate LLMs. MPA designs the probing and judging agents to automatically transform an original evaluation problem into a new one following psychometric theory on three basic cognitive abilities: language understanding, problem solving, and domain knowledge. These basic abilities are also dynamically configurable, allowing multifaceted analysis. We conducted extensive evaluations using MPA and found that most LLMs achieve poorer performance, indicating room for improvement. Our multifaceted analysis demonstrated the strong correlation between the basic abilities and an implicit Mattew effect on model size, i.e., larger models possess stronger correlations of the abilities. MPA can also be used as a data augmentation approach to enhance LLMs. 
Code is available at: https://github.com/microsoft/promptbench.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaijie Zhu;Jindong Wang;Qinlin Zhao;Ruochen Xu;Xing Xie", "authorids": "~Kaijie_Zhu1;~Jindong_Wang1;~Qinlin_Zhao1;~Ruochen_Xu2;~Xing_Xie3", "gender": "M;M;M;M;M", "homepage": "https://github.com/Immortalise;;https://xrc10.github.io/;http://research.microsoft.com/en-us/people/xingx/;https://jd92.wang/", "dblp": "56/7058;;188/3515;08/6809-1;19/2969-1", "google_scholar": ";;HTp5S00AAAAJ;5EQfAFIAAAAJ;hBZ_tKsAAAAJ", "orcid": ";;;0000-0002-8608-8482;0000-0002-4833-0880", "linkedin": ";qinlin-zhao-3a51292b2/;ruochenx/;xingx/;jindong-wang/", "or_profile": "~Kaijie_Zhu1;~Qinlin_Zhao1;~Ruochen_Xu2;~Xing_Xie3;~Jindong_Wang4", "aff": "Institute of automation, Chinese Academy of Sciences;University of Science and Technology of China;Microsoft Research;Microsoft Research Asia;Microsoft Research", "aff_domain": "ia.ac.cn;ustc.edu.cn;research.microsoft.com;microsoft.com;microsoft.com", "position": "MS student;Undergrad student;Researcher;Senior Principal Researcher;Researcher", "bibtex": "@inproceedings{\nzhu2024dynamic,\ntitle={Dynamic Evaluation of Large Language Models by Meta Probing Agents},\nauthor={Kaijie Zhu and Jindong Wang and Qinlin Zhao and Ruochen Xu and Xing Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DwTgy1hXXo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 521626, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17204876140727020750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ia.ac.cn;ustc.edu.cn;research.microsoft.com;microsoft.com;microsoft.com", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Chinese Academy of Sciences;University of Science and Technology of China;Microsoft", "aff_unique_dep": "Institute of Automation;;Microsoft Research", "aff_unique_url": "http://www.ia.cas.cn;http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CAS;USTC;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Faster Adaptive Decentralized Learning Algorithms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34606", "id": "Dwc0RwiNI5", "proceeding": "https://proceedings.mlr.press/v235/huang24ah.html", "pdf": "https://openreview.net/pdf?id=Dwc0RwiNI5", "openreview": "https://openreview.net/forum?id=Dwc0RwiNI5", "author_site": "Feihu Huang, jianyu zhao", "tldr": "", "abstract": "Decentralized learning recently has received increasing attention in machine learning due to its advantages in implementation simplicity and system robustness, data privacy. Meanwhile, the adaptive gradient methods show superior performances in many machine learning tasks such as training neural networks. Although some works focus on studying decentralized optimization algorithms with adaptive learning rates, these adaptive decentralized algorithms still suffer from high sample complexity. To fill these gaps, we propose a class of faster adaptive decentralized algorithms (i.e., AdaMDOS and AdaMDOF) for distributed nonconvex stochastic and finite-sum optimization, respectively. Moreover, we provide a solid convergence analysis framework for our methods. 
In particular, we prove that our AdaMDOS obtains a near-optimal sample complexity of $\\tilde{O}(\\epsilon^{-3})$ for finding an $\\epsilon$-stationary solution of nonconvex stochastic optimization. Meanwhile, our AdaMDOF obtains a near-optimal sample complexity of $O(\\sqrt{n}\\epsilon^{-2})$ for finding an $\\epsilon$-stationary solution of nonconvex finite-sum optimization, where $n$ denotes the sample size. To the best of our knowledge, our AdaMDOF algorithm is the first adaptive decentralized algorithm for nonconvex finite-sum optimization. Experimental results demonstrate the efficiency of our algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feihu Huang;Jianyu Zhao", "authorids": "~Feihu_Huang1;~Jianyu_Zhao6", "gender": "M;M", "homepage": ";https://zzzttttyy.github.io/", "dblp": "169/6247;", "google_scholar": "tRQwlHUAAAAJ;", "orcid": "0000-0003-0806-6074;", "linkedin": ";", "or_profile": "~Feihu_Huang1;~Jianyu_Zhao6", "aff": "Nanjing University of Aeronautics and Astronautics;Nanjing University of Aeronautics and Astronautics", "aff_domain": "nuaa.edu.cn;nuaa.edu.cn", "position": "Full Professor;Undergrad student", "bibtex": "@inproceedings{\nhuang2024faster,\ntitle={Faster Adaptive Decentralized Learning Algorithms},\nauthor={Feihu Huang and Jianyu Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Dwc0RwiNI5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1027446, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_r1PxMQBKdIJ:scholar.google.com/&scioq=Faster+Adaptive+Decentralized+Learning+Algorithms&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": "nuaa.edu.cn;nuaa.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics", "aff_unique_dep": "", "aff_unique_url": "http://www.nuaa.edu.cn", "aff_unique_abbr": "NUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Incorporating Information into Shapley Values: Reweighting via a Maximum Entropy Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34605", "id": "DwniHlwcOB", "proceeding": "https://proceedings.mlr.press/v235/biparva24a.html", "pdf": "https://openreview.net/pdf?id=DwniHlwcOB", "openreview": "https://openreview.net/forum?id=DwniHlwcOB", "author_site": "Darya Biparva, Donatello Materassi", "tldr": "", "abstract": "Both the marginal contributions needed for the computation of Shapley values and the graph produced by the Pearl-Verma theorem rely on the choice of an ordering of the variables. For Shapley values, the marginal contributions are averaged over all orderings, while in causal inference methods, the typical approach is to select orderings producing a graph with a minimal number of edges. We reconcile both approaches by reinterpreting them from a maximum entropy perspective. Namely, Shapley values assume no prior knowledge about the orderings and treat them as equally likely, while causal inference approaches apply Occam's razor and consider only orderings producing the simplest explanatory graphs. We find that the blind application of Occam's razor to Shapley values does not produce fully satisfactory explanations.
Hence, we propose two variations of Shapley values based on entropy maximization to appropriately incorporate prior information about the model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Darya Biparva;Donatello Materassi", "authorids": "~Darya_Biparva1;~Donatello_Materassi2", "gender": ";M", "homepage": ";https://ece.umn.edu/directory/materassi-donatello/", "dblp": ";70/1732.html", "google_scholar": ";_bjIGdIAAAAJ", "orcid": ";0000-0002-4736-8377", "linkedin": ";https://www.linkedin.com/donatello-materassi-87b6337", "or_profile": "~Darya_Biparva1;~Donatello_Materassi2", "aff": ";University of Minnesota, Minneapolis", "aff_domain": ";umn.edu", "position": ";Associate Professor", "bibtex": "@inproceedings{\nbiparva2024incorporating,\ntitle={Incorporating Information into Shapley Values: Reweighting via a Maximum Entropy Approach},\nauthor={Darya Biparva and Donatello Materassi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DwniHlwcOB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1329328, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17223631809424868839&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 5, "email": ";umn.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Minnesota", "aff_unique_dep": "", "aff_unique_url": "https://www.minnesota.edu", "aff_unique_abbr": "UMN", "aff_campus_unique_index": "0", "aff_campus_unique": "Minneapolis", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "State-Free Inference of State-Space Models: The *Transfer Function* Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34604", "id": "DwwI9L67B5", "proceeding": "https://proceedings.mlr.press/v235/parnichkun24a.html", "pdf": "https://openreview.net/pdf?id=DwwI9L67B5", "openreview": "https://openreview.net/forum?id=DwwI9L67B5", "author_site": "Rom N. Parnichkun, Stefano Massaroli, Alessandro Moro, Jimmy Smith, Ramin Hasani, Mathias Lechner, Qi An, Christopher Re, Hajime Asama, Stefano Ermon, Taiji Suzuki, Michael Poli, Atsushi Yamashita", "tldr": "", "abstract": "We approach designing a state-space model for deep learning applications through its dual representation, the *transfer function*, and uncover a highly efficient sequence parallel inference algorithm that is *state-free*: unlike other proposed algorithms, state-free inference does not incur any significant memory or computational cost with an increase in state size. We achieve this using properties of the proposed frequency domain transfer function parametrization, which enables direct computation of its corresponding convolutional kernel's spectrum via a single Fast Fourier Transform. Our experimental results across multiple sequence lengths and state sizes illustrate, on average, a 35% training speed improvement over S4 layers -- parametrized in time-domain -- on the Long Range Arena benchmark, while delivering state-of-the-art downstream performances over other attention-free approaches. Moreover, we report improved perplexity in language modeling over a long convolutional Hyena baseline, by simply introducing our transfer function parametrization.
Our code is available at https://github.com/ruke1ire/RTF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rom Parnichkun;Stefano Massaroli;Alessandro Moro;Jimmy T.H. Smith;Ramin Hasani;Mathias Lechner;Qi An;Christopher Re;Hajime Asama;Stefano Ermon;Taiji Suzuki;Michael Poli;Atsushi Yamashita", "authorids": "~Rom_Parnichkun1;~Stefano_Massaroli1;~Alessandro_Moro1;~Jimmy_T.H._Smith1;~Ramin_Hasani1;~Mathias_Lechner1;~Qi_An6;~Christopher_Re1;~Hajime_Asama1;~Stefano_Ermon1;~Taiji_Suzuki1;~Michael_Poli1;~Atsushi_Yamashita1", "gender": ";;M;M;;Unspecified;M;;M;M;M;M;M", "homepage": ";;;https://jimmysmith1919.github.io/;;https://mlech26l.github.io/pages/;;;http://www.robot.t.u-tokyo.ac.jp/asamalab/en/;http://cs.stanford.edu/~ermon/;http://ibis.t.u-tokyo.ac.jp/suzuki/;;http://www.robot.t.u-tokyo.ac.jp", "dblp": ";;;305/3641;;209/9862;;;83/6992.html;47/8135;08/312;;https://dblp.uni-trier.de/pid/05/6868", "google_scholar": ";IwCfl4UAAAAJ;yg0B6WYAAAAJ;GC9Vv1wAAAAJ;;https://scholar.google.at/citations?hl=en;https://scholar.google.co.jp/citations?user=VTO9SRcAAAAJ;;UguEag4AAAAJ;;x8osrBsAAAAJ;RgIBwboAAAAJ;JgfX9ckAAAAJ", "orcid": ";;0000-0001-8711-0330;0000-0003-2016-2480;;;0000-0001-7641-2632;;0000-0002-9482-497X;;;;0000-0003-1280-069X", "linkedin": ";;;jimmy-t-h-smith-1679b122/;;;;;hajime-asama-5b931633/;;;;", "or_profile": "~Rom_Parnichkun1;~Stefano_Massaroli1;~Alessandro_Moro1;~Jimmy_T.H._Smith1;~Ramin_Hasani1;~Mathias_Lechner1;~Qi_An6;~Christopher_Re1;~Hajime_Asama1;~Stefano_Ermon1;~Taiji_Suzuki1;~Michael_Poli1;~Atsushi_Yamashita1", "aff": ";MILA;Chuo University;Stanford University;;;Tokyo University;;;Stanford University;The University of Tokyo;Stanford University;The University of Tokyo", "aff_domain": ";mila.quebec;chuo-u.ac.jp;stanford.edu;;;u-tokyo.ac.jp;;;stanford.edu;tokyo.ac.jp;stanford.edu;u-tokyo.ac.jp", "position": ";Postdoc;Visiting Rsearcher;PhD student;;;Associate Professor;;;Associate Professor;Associate Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nparnichkun2024statefree,\ntitle={State-Free Inference of State-Space Models: The *Transfer Function* Approach},\nauthor={Rom Parnichkun and Stefano Massaroli and Alessandro Moro and Jimmy T.H. 
Smith and Ramin Hasani and Mathias Lechner and Qi An and Christopher Re and Hajime Asama and Stefano Ermon and Taiji Suzuki and Michael Poli and Atsushi Yamashita},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DwwI9L67B5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 908700, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2387403681613403168&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": ";mila.quebec;chuo-u.ac.jp;stanford.edu;;;u-tokyo.ac.jp;;;stanford.edu;tokyo.ac.jp;stanford.edu;u-tokyo.ac.jp", "author_num": 13, "aff_unique_index": "0;1;2;3;2;3;2;3", "aff_unique_norm": "Mila;Chuo University;Stanford University;University of Tokyo", "aff_unique_dep": ";;;", "aff_unique_url": "https://mila.quebec;https://www.chuo-u.ac.jp;https://www.stanford.edu;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "MILA;Chuo U;Stanford;UTokyo", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;2;1;2;1;2;1", "aff_country_unique": "Canada;Japan;United States" }, { "title": "Benign Overfitting in Adversarial Training of Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34603", "id": "DyvhD8J3Wl", "proceeding": "https://proceedings.mlr.press/v235/wang24cn.html", "pdf": "https://openreview.net/pdf?id=DyvhD8J3Wl", "openreview": "https://openreview.net/forum?id=DyvhD8J3Wl", "author_site": "Yunjuan Wang, Kaibo Zhang, Raman Arora", "tldr": "", "abstract": "Benign overfitting is the phenomenon wherein none of the predictors in the hypothesis class can achieve perfect accuracy (i.e., non-realizable or noisy setting), but a model that interpolates the training data still achieves good generalization. A series of recent works aim to understand this phenomenon for regression and classification tasks using linear predictors as well as two-layer neural networks. In this paper, we study such a benign overfitting phenomenon in an adversarial setting. We show that under a distributional assumption, interpolating neural networks found using adversarial training generalize well despite inference-time attacks. Specifically, we provide convergence and generalization guarantees for adversarial training of two-layer networks (with smooth as well as non-smooth activation functions) showing that under moderate $\\ell_2$ norm perturbation budget, the trained model has near-zero robust training loss and near-optimal robust generalization error. 
We support our theoretical findings with an empirical study on synthetic and real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunjuan Wang;Kaibo Zhang;Raman Arora", "authorids": "~Yunjuan_Wang1;~Kaibo_Zhang3;~Raman_Arora1", "gender": "F;M;M", "homepage": "https://yunjuanwang.github.io/;;http://www.cs.jhu.edu/~raman/Home.html", "dblp": "31/560;72/9686;", "google_scholar": "t_VSEEwAAAAJ;;Spe0xdkAAAAJ", "orcid": ";;", "linkedin": "yunjuan-wang-12ab85169/;kaibo-zhang-97b911230/;", "or_profile": "~Yunjuan_Wang1;~Kaibo_Zhang3;~Raman_Arora1", "aff": "Johns Hopkins University;Department of Computer Science, Whiting School of Engineering;Johns Hopkins University", "aff_domain": "jhu.edu;cs.jhu.edu;jhu.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nwang2024benign,\ntitle={Benign Overfitting in Adversarial Training of Neural Networks},\nauthor={Yunjuan Wang and Kaibo Zhang and Raman Arora},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DyvhD8J3Wl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3484718, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=933843474376260915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "jhu.edu;cs.jhu.edu;jhu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Baltimore", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Towards Unified Alignment Between Agents, Humans, and Environment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34602", "id": "DzLna0cFL1", "proceeding": "https://proceedings.mlr.press/v235/yang24p.html", "pdf": "https://openreview.net/pdf?id=DzLna0cFL1", "openreview": "https://openreview.net/forum?id=DzLna0cFL1", "author_site": "Zonghan Yang, an liu, Zijun Liu, Kaiming Liu, Fangzhou Xiong, Yile Wang, Zeyuan Yang, Qingyuan Hu, XinRui Chen, Zhenhe Zhang, Fuwen Luo, Zhicheng Guo, Peng Li, Yang Liu", "tldr": "", "abstract": "The rapid progress of foundation models has led to the prosperity of autonomous agents, which leverage the universal capabilities of foundation models to conduct reasoning, decision-making, and environmental interaction. However, the efficacy of agents remains limited when operating in intricate, realistic environments. In this work, we introduce the principles of **U**nified **A**lignment for **A**gents (**UA**$^2$), which advocate for the simultaneous alignment of agents with human intentions, environmental dynamics, and self-constraints such as the limitation of monetary budgets. From the perspective of **UA**$^2$, we review the current agent research and highlight the neglected factors in existing agent benchmarks and method candidates. We also conduct proof-of-concept studies by introducing realistic features to WebShop, including user profiles demonstrating intentions, personalized reranking reflecting complex environmental dynamics, and runtime cost statistics as self-constraints. We then follow the principles of **UA**$^2$ to propose an initial design of our agent and benchmark its performance with several candidate baselines in the retrofitted WebShop. 
The extensive experimental results further prove the importance of the principles of **UA**$^2$. Our research sheds light on the next steps of autonomous agent research with improved general problem-solving abilities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zonghan Yang;An Liu;Zijun Liu;Kaiming Liu;Fangzhou Xiong;Yile Wang;Zeyuan Yang;Qingyuan Hu;Xinrui Chen;Zhenhe Zhang;Fuwen Luo;Zhicheng Guo;Peng Li;Yang Liu", "authorids": "~Zonghan_Yang1;~An_Liu4;~Zijun_Liu2;~Kaiming_Liu1;~Fangzhou_Xiong1;~Yile_Wang1;~Zeyuan_Yang3;huqy23@mails.tsinghua.edu.cn;cxr21@mails.tsinghua.edu.cn;zhenhe-z21@mails.tsinghua.edu.cn;~Fuwen_Luo1;~Zhicheng_Guo2;~Peng_Li2;~Yang_Liu19", "gender": "M;M;M;M;M;M;M;;;;M;M;M;M", "homepage": "https://minicheshire.github.io/;https://github.com/xxmlala;;https://github.com/KMing-L;;https://ylwangy.github.io/;https://miicheyang.github.io/;;;;;https://zhichengg.github.io/;http://www.lpeng.net/;http://nlp.csai.tsinghua.edu.cn/~ly/", "dblp": "222/7860;;;53/7206.html;;32/1915-1.html;260/6331-2.html;;;;317/1971;233/7924;83/6353-30;51/3710-5", "google_scholar": "rt9HOIUAAAAJ;;vXsVhPcAAAAJ;X9bm8UMAAAAJ;;v1YnW6gAAAAJ;k_qpTh4AAAAJ;;;;AIKlZXcAAAAJ;0V4riFIAAAAJ;hgYzkOQAAAAJ;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ", "orcid": ";;;;0009-0001-0250-5462;;;;;;0009-0001-9183-9383;;0000-0003-1374-5979;0000-0002-3087-242X", "linkedin": ";;%E5%AD%90%E5%90%9B-%E5%88%98-164596263/;;;;;;;;;;;", "or_profile": "~Zonghan_Yang1;~An_Liu4;~Zijun_Liu2;~Kaiming_Liu1;~Fangzhou_Xiong1;~Yile_Wang1;~Zeyuan_Yang3;huqy23@mails.tsinghua.edu.cn;cxr21@mails.tsinghua.edu.cn;zhenhe-z21@mails.tsinghua.edu.cn;~Fuwen_Luo1;~Zhicheng_Guo2;~Peng_Li2;~Yang_Liu19", "aff": "Department of Computer Science and Technology, Tsinghua University;, Tsinghua University;Department of Computer Science and Technology, Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;, Tsinghua University;;;;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "cs.tsinghua.edu.cn;cs.tsinghua.edu.cn;cs.tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;;;;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;MS student;Undergrad student;Undergrad student;Undergrad student;Postdoc;MS student;;;;PhD student;PhD student;Associate Professor;Professor", "bibtex": "@inproceedings{\nyang2024position,\ntitle={Position: Towards Unified Alignment Between Agents, Humans, and Environment},\nauthor={Zonghan Yang and An Liu and Zijun Liu and Kaiming Liu and Fangzhou Xiong and Yile Wang and Zeyuan Yang and Qingyuan Hu and Xinrui Chen and Zhenhe Zhang and Fuwen Luo and Zhicheng Guo and Peng Li and Yang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DzLna0cFL1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3899293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 14, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13452538841426865013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.tsinghua.edu.cn;cs.tsinghua.edu.cn;cs.tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;;;;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 14, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": 
"Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Robust Sparse Estimation for Gaussians with Optimal Error under Huber Contamination", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34601", "id": "E3V5MMwFgd", "proceeding": "https://proceedings.mlr.press/v235/diakonikolas24a.html", "pdf": "https://openreview.net/pdf?id=E3V5MMwFgd", "openreview": "https://openreview.net/forum?id=E3V5MMwFgd", "author_site": "Ilias Diakonikolas, Daniel Kane, Sushrut Karmalkar, Ankit Pensia, Thanasis Pittas", "tldr": "", "abstract": "We study Gaussian sparse estimation tasks in Huber's contamination model with a focus on mean estimation, PCA, and linear regression. For each of these tasks, we give the first sample and computationally efficient robust estimators with optimal error guarantees, within constant factors. All prior efficient algorithms for these tasks incur quantitatively suboptimal error. Concretely, for Gaussian robust $k$-sparse mean estimation on $\\mathbb{R}^d$ with corruption rate $\\epsilon>0$, our algorithm has sample complexity $(k^2/\\epsilon ^2)\\mathrm{polylog}(d/\\epsilon)$, runs in sample polynomial time, and approximates the target mean within $\\ell_2$-error $O(\\epsilon)$. Previous efficient algorithms inherently incur error $\\Omega(\\epsilon \\sqrt{\\log(1/\\epsilon)})$. At the technical level, we develop a novel multidimensional filtering method in the sparse regime that may find other applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ilias Diakonikolas;Daniel Kane;Sushrut Karmalkar;Ankit Pensia;Thanasis Pittas", "authorids": "~Ilias_Diakonikolas1;~Daniel_Kane1;~Sushrut_Karmalkar2;~Ankit_Pensia1;~Thanasis_Pittas1", "gender": "M;M;;M;M", "homepage": "http://www.iliasdiakonikolas.org/;http://cseweb.ucsd.edu/~dakane/;;https://ankitp.net/;https://thanasispittas.github.io/", "dblp": "d/IliasDiakonikolas;52/6817;;213/7640;284/9676", "google_scholar": "Vb3FLmkAAAAJ;https://scholar.google.com.tw/citations?user=DulpV-cAAAAJ;;u1Qs7YIAAAAJ;pkIOtwcAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ilias_Diakonikolas1;~Daniel_Kane1;~Sushrut_Karmalkar2;~Ankit_Pensia1;~Thanasis_Pittas1", "aff": "University of Wisconsin - Madison;University of California, San Diego;;IBM Research;University of Wisconsin, Madison", "aff_domain": "wisc.edu;ucsd.edu;;ibm.com;wisc.edu", "position": "Full Professor;Full Professor;;Postdoc;PhD student", "bibtex": "@inproceedings{\ndiakonikolas2024robust,\ntitle={Robust Sparse Estimation for Gaussians with Optimal Error under Huber Contamination},\nauthor={Ilias Diakonikolas and Daniel Kane and Sushrut Karmalkar and Ankit Pensia and Thanasis Pittas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E3V5MMwFgd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 574469, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gf6Pvj-DKIYJ:scholar.google.com/&scioq=Robust+Sparse+Estimation+for+Gaussians+with+Optimal+Error+under+Huber+Contamination&hl=en&as_sdt=0,33", "gs_version_total": 11, "email": "wisc.edu;ucsd.edu;;ibm.com;wisc.edu", "author_num": 5, "aff_unique_index": "0;1;2;3", 
"aff_unique_norm": "University of Wisconsin-Madison;University of California, San Diego;IBM;University of Wisconsin", "aff_unique_dep": ";;IBM Research;", "aff_unique_url": "https://www.wisc.edu;https://www.ucsd.edu;https://www.ibm.com/research;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UCSD;IBM;UW", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Madison;San Diego;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Recovering Labels from Local Updates in Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34600", "id": "E41gvBG4s6", "proceeding": "https://proceedings.mlr.press/v235/chen24an.html", "pdf": "https://openreview.net/pdf?id=E41gvBG4s6", "openreview": "https://openreview.net/forum?id=E41gvBG4s6", "author_site": "Huancheng Chen, Haris Vikalo", "tldr": "", "abstract": "Gradient inversion (GI) attacks present a threat to the privacy of clients in federated learning (FL) by aiming to enable reconstruction of the clients' data from communicated model updates. A number of such techniques attempts to accelerate data recovery by first reconstructing labels of the samples used in local training. However, existing label extraction methods make strong assumptions that typically do not hold in realistic FL settings. In this paper we present a novel label recovery scheme, Recovering Labels from Local Updates (RLU), which provides near-perfect accuracy when attacking untrained (most vulnerable) models. More significantly, RLU achieves high performance even in realistic real-world settings where the clients in an FL system run multiple local epochs, train on heterogeneous data, and deploy various optimizers to minimize different objective functions. Specifically, RLU estimates labels by solving a least-square problem that emerges from the analysis of the correlation between labels of the data points used in a training round and the resulting update of the output layer. 
The experimental results on several datasets, architectures, and data heterogeneity scenarios demonstrate that the proposed method consistently outperforms existing baselines, and helps improve the quality of the reconstructed images in GI attacks in terms of both PSNR and LPIPS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huancheng Chen;Haris Vikalo", "authorids": "~Huancheng_Chen1;~Haris_Vikalo1", "gender": "M;", "homepage": "https://citychan.github.io/;", "dblp": "302/4540;", "google_scholar": "https://scholar.google.com.tw/citations?hl=zh-TW;", "orcid": ";", "linkedin": ";", "or_profile": "~Huancheng_Chen1;~Haris_Vikalo1", "aff": "University of Texas, Austin;", "aff_domain": "utexas.edu;", "position": "PhD student;", "bibtex": "@inproceedings{\nchen2024recovering,\ntitle={Recovering Labels from Local Updates in Federated Learning},\nauthor={Huancheng Chen and Haris Vikalo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E41gvBG4s6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1541893, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7255329794215193344&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "utexas.edu;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Run-Time Task Composition with Safety Semantics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34599", "id": "E4ItiEU8Iu", "proceeding": "https://proceedings.mlr.press/v235/leahy24a.html", "pdf": "https://openreview.net/pdf?id=E4ItiEU8Iu", "openreview": "https://openreview.net/forum?id=E4ItiEU8Iu", "author_site": "Kevin Leahy, Makai Mann, Zachary Serlin", "tldr": "", "abstract": "Compositionality is a critical aspect of scalable system design. Here, we focus on Boolean composition of learned tasks as opposed to functional or sequential composition. Existing Boolean composition for Reinforcement Learning focuses on reaching a satisfying absorbing state in environments with discrete action spaces, but does not support composable safety (i.e., avoidance) constraints. We provide three contributions: i) introduce two distinct notions of compositional safety semantics; ii) show how to enforce either safety semantics, prove correctness, and analyze the trade-offs between the two safety notions; and iii) extend Boolean composition from discrete action spaces to continuous action spaces.
We demonstrate these techniques using modified versions of value iteration in a grid world, Deep Q-Network (DQN) in a grid world with image observations, and Twin Delayed DDPG (TD3) in a continuous-observation and continuous-action Bullet physics environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kevin Leahy;Makai Mann;Zachary Serlin", "authorids": "~Kevin_Leahy1;~Makai_Mann1;~Zachary_Serlin1", "gender": "M;M;M", "homepage": "https://kjleahy.net;https://makaimann.github.io/;http://www.zacharyserlin.com", "dblp": "156/8189;233/0746.html;", "google_scholar": "pjiUykUAAAAJ;Qo72ORkAAAAJ;S53D8psAAAAJ", "orcid": ";0000-0002-1555-5784;0000-0002-0975-2204", "linkedin": ";makai-mann;zacharyserlin/", "or_profile": "~Kevin_Leahy1;~Makai_Mann1;~Zachary_Serlin1", "aff": "Worcester Polytechnic Institute;MIT Lincoln Laboratory, Massachusetts Institute of Technology;MIT Lincoln Laboratory, Massachusetts Institute of Technology", "aff_domain": "wpi.edu;ll.mit.edu;ll.mit.edu", "position": "Assistant Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nleahy2024runtime,\ntitle={Run-Time Task Composition with Safety Semantics},\nauthor={Kevin Leahy and Makai Mann and Zachary Serlin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E4ItiEU8Iu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1851650, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17235766001498748473&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "wpi.edu;ll.mit.edu;ll.mit.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Worcester Polytechnic Institute;Massachusetts Institute of Technology", "aff_unique_dep": ";Lincoln Laboratory", "aff_unique_url": "https://www.wpi.edu;https://web.mit.edu", "aff_unique_abbr": "WPI;MIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "From Fourier to Neural ODEs: Flow Matching for Modeling Complex Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34598", "id": "E4qjDAdVte", "proceeding": "https://proceedings.mlr.press/v235/li24cn.html", "pdf": "https://openreview.net/pdf?id=E4qjDAdVte", "openreview": "https://openreview.net/forum?id=E4qjDAdVte", "author_site": "Xin Li, Jingdong Zhang, Qunxi Zhu, Chengli Zhao, Xue Zhang, Xiaojun Duan, Wei Lin", "tldr": "", "abstract": "Modeling complex systems using standard neural ordinary differential equations (NODEs) often faces some essential challenges, including high computational costs and susceptibility to local optima. To address these challenges, we propose a simulation-free framework, called Fourier NODEs (FNODEs), that effectively trains NODEs by directly matching the target vector field based on Fourier analysis. Specifically, we employ Fourier analysis to estimate temporal and potential high-order spatial gradients from noisy observational data. We then incorporate the estimated spatial gradients as additional inputs to a neural network. Furthermore, we utilize the estimated temporal gradient as the optimization objective for the output of the neural network.
Later, the trained neural network generates more data points through an ODE solver without participating in the computational graph, facilitating more accurate estimations of gradients based on Fourier analysis. These two steps form a positive feedback loop, enabling accurate dynamics modeling in our framework. Consequently, our approach outperforms state-of-the-art methods in terms of training time, dynamics prediction, and robustness. Finally, we demonstrate the superior performance of our framework using a number of representative complex systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xin Li;Jingdong Zhang;Qunxi Zhu;Chengli Zhao;Xue Zhang;Xiaojun Duan;Wei Lin", "authorids": "~Xin_Li59;~Jingdong_Zhang1;~Qunxi_Zhu1;~Chengli_Zhao1;~Xue_Zhang6;~Xiaojun_Duan1;~Wei_Lin1", "gender": "M;M;M;M;;F;M", "homepage": ";https://scholar.google.com/citations?user=Bjo3nfwAAAAJ&hl=zh-CN;https://www.researchgate.net/profile/Qunxi_Zhu;https://dblp.org/pid/26/7437.html;;;https://faculty.fudan.edu.cn/wlin/zh_CN/", "dblp": ";163/0015-1;219/7742;26/7437.html;;;99/2649", "google_scholar": ";Bjo3nfwAAAAJ;https://scholar.google.co.jp/citations?user=45oFQD4AAAAJ;;;;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-4559-4610;0000-0002-4120-6561;0000-0001-7281-5274;;0000-0002-4560-9835;0000-0003-0386-7163;0000-0002-1863-4306", "linkedin": ";;;;;;", "or_profile": "~Xin_Li59;~Jingdong_Zhang1;~Qunxi_Zhu1;~Chengli_Zhao1;~Xue_Zhang6;~Xiaojun_Duan1;~Wei_Lin1", "aff": "National University of Defense Technology;Fudan University;Fudan University;;;National University of Defense Technology;Fudan University", "aff_domain": "nudt.edu.cn;fudan.edu.cn;fudan.edu.cn;;;nudt.edu.cn;fudan.edu.cn", "position": "PhD student;PhD student;Postdoc;;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024from,\ntitle={From Fourier to Neural {ODE}s: Flow Matching for Modeling Complex Systems},\nauthor={Xin Li and Jingdong Zhang and Qunxi Zhu and Chengli Zhao and Xue Zhang and Xiaojun Duan and Wei Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E4qjDAdVte}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9588446, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17287830580147590187&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "nudt.edu.cn;fudan.edu.cn;fudan.edu.cn;;;nudt.edu.cn;fudan.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "National University of Defense Technology;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "http://www.nudt.edu.cn/;https://www.fudan.edu.cn", "aff_unique_abbr": "NUDT;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Contextual Feature Selection with Conditional Stochastic Gates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34597", "id": "E6Nm3x7acv", "proceeding": "https://proceedings.mlr.press/v235/sristi24a.html", "pdf": "https://openreview.net/pdf?id=E6Nm3x7acv", "openreview": "https://openreview.net/forum?id=E6Nm3x7acv", "author_site": "Ram Dyuthi Sristi, Ofir Lindenbaum, Shira Lifshitz, Maria Lavzin, Jackie Schiller, Gal Mishne, Hadas Benisty", "tldr": "", "abstract": "Feature selection is a crucial tool in machine learning and is widely applied across 
various scientific disciplines. Traditional supervised methods generally identify a universal set of informative features for the entire population. However, feature relevance often varies with context, while the context itself may not directly affect the outcome variable. Here, we propose a novel architecture for contextual feature selection where the subset of selected features is conditioned on the value of *context variables*. Our new approach, Conditional Stochastic Gates (c-STG), models the importance of features using conditional Bernoulli variables whose parameters are predicted based on contextual variables. We introduce a hypernetwork that maps context variables to feature selection parameters to learn the context-dependent gates along with a prediction model. We further present a theoretical analysis of our model, indicating that it can improve performance and flexibility over population-level methods in complex feature selection settings. Finally, we conduct an extensive benchmark using simulated and real-world datasets across multiple domains demonstrating that c-STG can lead to improved feature selection capabilities while enhancing prediction accuracy and interpretability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ram Dyuthi Sristi;Ofir Lindenbaum;Shira Lifshitz;Maria Lavzin;Jackie Schiller;Gal Mishne;Hadas Benisty", "authorids": "~Ram_Dyuthi_Sristi1;~Ofir_Lindenbaum1;shiralif@campus.technion.ac.il;~Maria_Lavzin1;~Jackie_Schiller1;~Gal_Mishne1;~Hadas_Benisty1", "gender": "F;M;;F;F;F;F", "homepage": ";https://www.eng.biu.ac.il/lindeno/;;;https://schillerj.net.technion.ac.il/;http://mishne.ucsd.edu/;https://sites.google.com/view/benistylab/", "dblp": ";142/4140;;186/9339;;125/3214;", "google_scholar": "yKZAiCYAAAAJ;https://scholar.google.co.il/citations?user=jXxk6gcAAAAJ;;;;KrwpdXYAAAAJ;3e19RiAAAAAJ", "orcid": "0000-0002-3234-9413;;;;;0000-0002-5287-3626;0000-0001-6308-2267", "linkedin": ";;;maria-lavzin-465b7a200/;;;hadas-benisty-5101b093/", "or_profile": "~Ram_Dyuthi_Sristi1;~Ofir_Lindenbaum1;shiralif@campus.technion.ac.il;~Maria_Lavzin1;~Jackie_Schiller1;~Gal_Mishne1;~Hadas_Benisty1", "aff": "University of California, San Diego, University of California, San Diego;Bar-Ilan University;;;Technion - Israel Institute of Technology;University of California, San Diego;Technion - Israel Institute of Technology, Technion", "aff_domain": "eng.ucsd.edu;biu.ac.il;;;technion.ac.il;ucsd.edu;technion.il", "position": "PhD student;Assistant Professor;;;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nsristi2024contextual,\ntitle={Contextual Feature Selection with Conditional Stochastic Gates},\nauthor={Ram Dyuthi Sristi and Ofir Lindenbaum and Shira Lifshitz and Maria Lavzin and Jackie Schiller and Gal Mishne and Hadas Benisty},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E6Nm3x7acv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6995584, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7161297814914363062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "eng.ucsd.edu;biu.ac.il;;;technion.ac.il;ucsd.edu;technion.il", "author_num": 7, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "University of California, San Diego;Bar-Ilan University;Technion - Israel Institute of Technology", "aff_unique_dep": ";;", 
"aff_unique_url": "https://www.ucsd.edu;https://www.biu.ac.il;https://www.technion.ac.il/en/", "aff_unique_abbr": "UCSD;BIU;Technion", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "United States;Israel" }, { "title": "Weakly Convex Regularisers for Inverse Problems: Convergence of Critical Points and Primal-Dual Optimisation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34596", "id": "E8FpcUyPuS", "proceeding": "https://proceedings.mlr.press/v235/shumaylov24a.html", "pdf": "https://openreview.net/pdf?id=E8FpcUyPuS", "openreview": "https://openreview.net/forum?id=E8FpcUyPuS", "author_site": "Zakhar Shumaylov, Jeremy Budd, Subhadip Mukherjee, Carola-Bibiane Sch\u00f6nlieb", "tldr": "", "abstract": "Variational regularisation is the primary method for solving inverse problems, and recently there has been considerable work leveraging deeply learned regularisation for enhanced performance. However, few results exist addressing the convergence of such regularisation, particularly within the context of critical points as opposed to global minimisers. In this paper, we present a generalised formulation of convergent regularisation in terms of critical points, and show that this is achieved by a class of weakly convex regularisers. We prove convergence of the primal-dual hybrid gradient method for the associated variational problem, and, given a Kurdyka-\u0141ojasiewicz condition, an $\\mathcal{O}(\\log{k}/k)$ ergodic convergence rate. Finally, applying this theory to learned regularisation, we prove universal approximation for input weakly convex neural networks (IWCNN), and show empirically that IWCNNs can lead to improved performance of learned adversarial regularisers for computed tomography (CT) reconstruction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zakhar Shumaylov;Jeremy Budd;Subhadip Mukherjee;Carola-Bibiane Sch\u00f6nlieb", "authorids": "~Zakhar_Shumaylov1;~Jeremy_Budd1;~Subhadip_Mukherjee1;~Carola-Bibiane_Sch\u00f6nlieb1", "gender": ";M;M;F", "homepage": "https://www.damtp.cam.ac.uk/person/zs334;https://jeremybudd.com;https://sites.google.com/view/subhadip-mukherjee/home;http://www.damtp.cam.ac.uk/research/cia/", "dblp": "272/4288;277/5023;120/7054;07/8184", "google_scholar": "EKV1lrAAAAAJ;;https://scholar.google.se/citations?user=a4UlE_MAAAAJ;nPeOXjwAAAAJ", "orcid": "0000-0001-7087-4393;0000-0003-3771-5328;0000-0002-7957-8758;", "linkedin": "zakshumaylov/;;subhadip-mukherjee-11b925101/;", "or_profile": "~Zakhar_Shumaylov1;~Jeremy_Budd1;~Subhadip_Mukherjee1;~Carola-Bibiane_Sch\u00f6nlieb1", "aff": "Apple;California Institute of Technology;Indian Institute of Technology Kharagpur;University of Cambridge", "aff_domain": "apple.com;caltech.edu;iitkgp.ernet.in;cam.ac.uk", "position": "Intern;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nshumaylov2024weakly,\ntitle={Weakly Convex Regularisers for Inverse Problems: Convergence of Critical Points and Primal-Dual Optimisation},\nauthor={Zakhar Shumaylov and Jeremy Budd and Subhadip Mukherjee and Carola-Bibiane Sch{\\\"o}nlieb},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E8FpcUyPuS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3208637, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=14017836724753659747&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "apple.com;caltech.edu;iitkgp.ernet.in;cam.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Apple;California Institute of Technology;Indian Institute of Technology Kharagpur;University of Cambridge", "aff_unique_dep": "Apple Inc.;;;", "aff_unique_url": "https://www.apple.com;https://www.caltech.edu;https://www.iitkgp.ac.in;https://www.cam.ac.uk", "aff_unique_abbr": "Apple;Caltech;IIT Kharagpur;Cambridge", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Pasadena;Kharagpur;Cambridge", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "United States;India;United Kingdom" }, { "title": "Bipartite Matching in Massive Graphs: A Tight Analysis of EDCS", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34595", "id": "EDEISRmi6X", "proceeding": "https://proceedings.mlr.press/v235/azarmehr24a.html", "pdf": "https://openreview.net/pdf?id=EDEISRmi6X", "openreview": "https://openreview.net/forum?id=EDEISRmi6X", "author_site": "Amir Azarmehr, Soheil Behnezhad, Mohammad Roghani", "tldr": "", "abstract": "Maximum matching is one of the most fundamental combinatorial optimization problems with applications in various contexts such as balanced clustering, data mining, resource allocation, and online advertisement. In many of these applications, the input graph is massive. The sheer size of these inputs makes it impossible to store the whole graph in the memory of a single machine and process it there. Graph sparsification has been an extremely powerful tool to alleviate this problem. In this paper, we study a highly successful and versatile sparsifier for the matching problem: the *edge-degree constrained subgraph (EDCS)* introduced first by Bernstein & Stein 2015 The EDCS has a parameter $\\beta \\geq 2$ which controls the density of the sparsifier. It has been shown through various proofs in the literature that by picking a subgraph with $O(n\\beta)$ edges, the EDCS includes a matching of size at least $2/3-O(1/\\beta)$ times the maximum matching size. As such, by increasing $\\beta$ the approximation ratio of EDCS gets closer and closer to $2/3$. In this paper, we propose a new approach for analyzing the approximation ratio of EDCS. Our analysis is *tight* for any value of $\\beta$. Namely, we pinpoint the precise approximation ratio of EDCS for any sparsity parameter $\\beta$. Our analysis reveals that one does not necessarily need to increase $\\beta$ to improve approximation, as suggested by previous analysis. In particular, the best choice turns out to be $\\beta = 6$, which achieves an approximation ratio of $.677$! 
This is arguably surprising as it is even better than $2/3 \\sim .666$, the bound that was widely believed to be the limit for EDCS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amir Azarmehr;Soheil Behnezhad;Mohammad Roghani", "authorids": "azarmehr.a@northeastern.edu;s.behnezhad@northeastern.edu;~Mohammad_Roghani1", "gender": ";;M", "homepage": ";;https://mohammadroghani.github.io/", "dblp": ";;255/5571.html", "google_scholar": ";;GuLPrAYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "azarmehr.a@northeastern.edu;s.behnezhad@northeastern.edu;~Mohammad_Roghani1", "aff": ";;Stanford University", "aff_domain": ";;stanford.edu", "position": ";;PhD student", "bibtex": "@inproceedings{\nazarmehr2024bipartite,\ntitle={Bipartite Matching in Massive Graphs: A Tight Analysis of {EDCS}},\nauthor={Amir Azarmehr and Soheil Behnezhad and Mohammad Roghani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EDEISRmi6X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 563027, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yCITAIcjz5cJ:scholar.google.com/&scioq=Bipartite+Matching+in+Massive+Graphs:+A+Tight+Analysis+of+EDCS&hl=en&as_sdt=0,14", "gs_version_total": 6, "email": ";;stanford.edu", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Modeling Language Tokens as Functionals of Semantic Fields", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34594", "id": "EEO4Iktfjp", "proceeding": "https://proceedings.mlr.press/v235/pei24c.html", "pdf": "https://openreview.net/pdf?id=EEO4Iktfjp", "openreview": "https://openreview.net/forum?id=EEO4Iktfjp", "author_site": "Zhengqi Pei, Anran Zhang, Shuhui Wang, Qingming Huang", "tldr": "", "abstract": "Recent advances in natural language processing have relied heavily on using Transformer-based language models. However, Transformers often require large parameter sizes and model depth. Existing Transformer-free approaches using state-space models demonstrate superiority over Transformers, yet they still lack a neuro-biologically connection to the human brain. This paper proposes ${\\it LasF}$, representing ${\\bf L}$anguage tokens ${\\bf as}$ ${\\bf F}$unctionals of semantic fields, to simulate the neuronal behaviors for better language modeling. The ${\\it LasF}$ module is equivalent to a nonlinear approximator tailored for sequential data. By replacing the final layers of pre-trained language models with the ${\\it LasF}$ module, we obtain ${\\it LasF}$-based models. Experiments conducted for standard reading comprehension and question-answering tasks demonstrate that the ${\\it LasF}$-based models consistently improve accuracy with fewer parameters. Besides, we use CommonsenseQA's blind test set to evaluate a full-parameter tuned ${\\it LasF}$-based model, which outperforms the prior best ensemble and single models by $0.4\\%$ and $3.1\\%$, respectively. 
Furthermore, our ${\\it LasF}$-only language model trained from scratch outperforms existing parameter-efficient language models on standard datasets such as WikiText103 and PennTreebank.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengqi Pei;Anran Zhang;Shuhui Wang;Qingming Huang", "authorids": "~Zhengqi_Pei1;~Anran_Zhang2;~Shuhui_Wang1;~Qingming_Huang1", "gender": "M;F;M;", "homepage": ";;https://vipl.ict.ac.cn/people/shwang/;https://qmhuang-ucas.github.io/", "dblp": "223/2296;;37/2537;68/4388", "google_scholar": "Qs5zacQAAAAJ;;h-JxBSYAAAAJ;https://scholar.google.com.hk/citations?user=J1vMnRgAAAAJ", "orcid": ";;0000-0002-5931-0527;", "linkedin": ";%E5%AE%89%E7%84%B6-%E5%BC%A0-a901a3276/;;", "or_profile": "~Zhengqi_Pei1;~Anran_Zhang2;~Shuhui_Wang1;~Qingming_Huang2", "aff": "University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_domain": "ucas.ac.cn;ucas.ac.cn;ict.ac.cn;ucas.ac.cn", "position": "MS student;MS student;Full Professor;Full Professor", "bibtex": "@inproceedings{\npei2024modeling,\ntitle={Modeling Language Tokens as Functionals of Semantic Fields},\nauthor={Zhengqi Pei and Anran Zhang and Shuhui Wang and Qingming Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EEO4Iktfjp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 975529, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pwVqhhkwKlwJ:scholar.google.com/&scioq=Modeling+Language+Tokens+as+Functionals+of+Semantic+Fields&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "ucas.ac.cn;ucas.ac.cn;ict.ac.cn;ucas.ac.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Computing Technology", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.ac.cn", "aff_unique_abbr": "UCAS;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Fine-grained Local Sensitivity Analysis of Standard Dot-Product Self-Attention", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34593", "id": "EEinDTdKr1", "proceeding": "https://proceedings.mlr.press/v235/havens24a.html", "pdf": "https://openreview.net/pdf?id=EEinDTdKr1", "openreview": "https://openreview.net/forum?id=EEinDTdKr1", "author_site": "Aaron Havens, Alexandre Araujo, Huan Zhang, Bin Hu", "tldr": "", "abstract": "Self-attention has been widely used in various machine learning models, such as vision transformers. The standard dot-product self-attention is arguably the most popular structure, and there is a growing interest in understanding the mathematical properties of such attention mechanisms. This paper presents a fine-grained local sensitivity analysis of the standard dot-product self-attention, leading to new non-vacuous certified robustness results for vision transformers. Despite the well-known fact that dot-product self-attention is not (globally) Lipschitz, we develop new theoretical analysis of Local Fine-grained Attention Sensitivity (LoFAST) quantifying the effect of input feature perturbations on the attention output. 
Our analysis reveals that the local sensitivity of dot-product self-attention to $\\ell_2$ perturbations can actually be controlled by several key quantities associated with the attention weight matrices and the unperturbed input. We empirically validate our theoretical findings by computing non-vacuous certified $\\ell_2$-robustness for vision transformers on CIFAR-10 and SVHN datasets. The code for LoFAST is available at https://github.com/AaronHavens/LoFAST.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron J Havens;Alexandre Araujo;Huan Zhang;Bin Hu", "authorids": "~Aaron_J_Havens1;~Alexandre_Araujo3;~Huan_Zhang1;~Bin_Hu2", "gender": "M;M;M;M", "homepage": "https://aaronhavens.github.io/;http://huan-zhang.com;;https://alexandrearaujo.com/", "dblp": ";23/1797-1.html;;228/6599", "google_scholar": ";LTa3GzEAAAAJ;;https://scholar.google.fr/citations?user=wsu61VYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Aaron_J_Havens1;~Huan_Zhang1;~Bin_Hu2;~Alexandre_ARAUJO1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;New York University", "aff_domain": "illinois.edu;uiuc.edu;illinois.edu;nyu.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nhavens2024finegrained,\ntitle={Fine-grained Local Sensitivity Analysis of Standard Dot-Product Self-Attention},\nauthor={Aaron J Havens and Alexandre Araujo and Huan Zhang and Bin Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EEinDTdKr1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 848762, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pRnZyZZHykAJ:scholar.google.com/&scioq=Fine-grained+Local+Sensitivity+Analysis+of+Standard+Dot-Product+Self-Attention&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "illinois.edu;uiuc.edu;illinois.edu;nyu.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.nyu.edu", "aff_unique_abbr": "UIUC;NYU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Defense against Model Extraction Attack by Bayesian Active Watermarking", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34592", "id": "EFtNP211X3", "proceeding": "https://proceedings.mlr.press/v235/wang24cb.html", "pdf": "https://openreview.net/pdf?id=EFtNP211X3", "openreview": "https://openreview.net/forum?id=EFtNP211X3", "author_site": "Zhenyi Wang, Yihan Wu, Heng Huang", "tldr": "", "abstract": "Model extraction is to obtain a cloned model that replicates the functionality of a black-box victim model solely through query-based access. Present defense strategies exhibit shortcomings, manifesting as: (1) computational or memory inefficiencies during deployment; or (2) dependence on expensive defensive training methods that mandate the re-training of the victim model; or (3) watermarking-based methods only *passively* detect model theft without actively preventing model extraction. 
To address these limitations, we introduce an innovative Bayesian *active* watermarking technique to fine-tune the victim model and learn the watermark posterior distribution conditioned on input data. The fine-tuning process aims to maximize the log-likelihood on watermarked in-distribution training data for preserving model utility while simultaneously maximizing the change of the model's outputs on watermarked out-of-distribution data, thereby achieving effective defense. During deployment, a watermark is randomly sampled from the estimated watermark posterior. This watermark is then added to the input query, and the victim model returns the prediction based on the watermarked input query to users. This proactive defense approach requires only slight fine-tuning of the victim model without the need for full re-training and demonstrates high efficiency in terms of memory and computation during deployment. Rigorous theoretical analysis and comprehensive experimental results demonstrate the efficacy of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenyi Wang;Yihan Wu;Heng Huang", "authorids": "~Zhenyi_Wang1;~Yihan_Wu1;~Heng_Huang1", "gender": "M;M;M", "homepage": "https://yihwu.github.io/;https://www.cs.umd.edu/~heng/;https://joey-wang123.github.io/", "dblp": ";03/281;10/10222-1", "google_scholar": "cajTg_wAAAAJ;4OqLaDwAAAAJ;F4uLsroAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yihan_Wu1;~Heng_Huang1;~Zhenyi_Wang8", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;cs.umd.edu;umd.edu", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@inproceedings{\nwang2024defense,\ntitle={Defense against Model Extraction Attack by Bayesian Active Watermarking},\nauthor={Zhenyi Wang and Yihan Wu and Heng Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EFtNP211X3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1018154, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13763169773308570757&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 4, "email": "umd.edu;cs.umd.edu;umd.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.umd.edu;https://www.umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Near-Linear Time Approximation Algorithms for k-means with Outliers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34591", "id": "EHjm3sXPFy", "proceeding": "https://proceedings.mlr.press/v235/huang24e.html", "pdf": "https://openreview.net/pdf?id=EHjm3sXPFy", "openreview": "https://openreview.net/forum?id=EHjm3sXPFy", "author_site": "Junyu Huang, Qilong Feng, Ziyun Huang, Jinhui Xu, Jianxin Wang", "tldr": "", "abstract": "The k-means with outliers problem is one of the most extensively studied clustering problems in the field of machine learning, where the goal is to discard up to $z$ outliers and identify a minimum k-means clustering on the remaining data points.
Most previous results for this problem have running time dependent on the aspect ratio $\Delta$ (the ratio between the maximum and the minimum pairwise distances) to achieve fast approximations. To address the issue of aspect ratio dependency on the running time, we propose sampling-based algorithms with almost linear running time in the data size, where a crucial component of our approach is an algorithm called Fast-Sampling. The Fast-Sampling algorithm can find inliers that well approximate the optimal clustering centers without relying on a guess for the optimal clustering costs, where a 4-approximate solution can be obtained in time $O(\frac{ndk\log\log n}{\epsilon^2})$ with $O(k/\epsilon)$ centers opened and $(1+\epsilon)z$ outliers discarded. To reduce the number of centers opened, we propose a center reduction algorithm, where an $O(1/\epsilon)$-approximate solution can be obtained in time $O(\frac{ndk\log \log n}{\epsilon^2} + d\cdot\mathrm{poly}(k, \frac{1}{\epsilon})\log(n\Delta))$ with $(1+\epsilon)z$ outliers discarded and exactly $k$ centers opened. Empirical experiments suggest that our proposed sampling-based algorithms outperform state-of-the-art algorithms for the k-means with outliers problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junyu Huang;Qilong Feng;Ziyun Huang;Jinhui Xu;Jianxin Wang", "authorids": "~Junyu_Huang1;~Qilong_Feng1;~Ziyun_Huang1;~Jinhui_Xu1;~Jianxin_Wang1", "gender": "M;M;M;M;", "homepage": ";;;https://www.cse.buffalo.edu/~jinhui/;https://faculty.csu.edu.cn/wangjianxin1/zh_CN/index/106082/list/", "dblp": "277/9525;75/6154;;24/6437-1.html;75/2669-1.html", "google_scholar": ";;1MPrmtEAAAAJ;https://scholar.google.com/citations?hl=en;7pgY2F0AAAAJ", "orcid": ";;;;0000-0003-1516-0480", "linkedin": ";;;;", "or_profile": "~Junyu_Huang1;~Qilong_Feng1;~Ziyun_Huang1;~Jinhui_Xu1;~Jianxin_Wang1", "aff": "Central South University;Central South University, China;Pennsylvania State University, Erie;University at Buffalo, State University of New York;Central South University", "aff_domain": "csu.edu.cn;csu.edu.cn;psu.edu;buffalo.edu;csu.edu.cn", "position": "PhD student;Full Professor;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024nearlinear,\ntitle={Near-Linear Time Approximation Algorithms for k-means with Outliers},\nauthor={Junyu Huang and Qilong Feng and Ziyun Huang and Jinhui Xu and Jianxin Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EHjm3sXPFy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 463286, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1564717424456196221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "csu.edu.cn;csu.edu.cn;psu.edu;buffalo.edu;csu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Central South University;Pennsylvania State University;University at Buffalo", "aff_unique_dep": ";;", "aff_unique_url": "https://www.csu.edu.cn;https://www.psu.edu;https://www.buffalo.edu", "aff_unique_abbr": "CSU;PSU;UB", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Erie;Buffalo", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "MobileLLM: Optimizing Sub-billion Parameter Language Models for On-Device Use Cases", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34590",
"id": "EIGbXbxcUQ", "proceeding": "https://proceedings.mlr.press/v235/liu24ce.html", "pdf": "https://openreview.net/pdf?id=EIGbXbxcUQ", "openreview": "https://openreview.net/forum?id=EIGbXbxcUQ", "author_site": "Zechun Liu, Changsheng Zhao, Forrest Iandola, Chen Lai, Yuandong Tian, Igor Fedorov, Yunyang Xiong, Ernie Chang, Yangyang Shi, Raghuraman Krishnamoorthi, Liangzhen Lai, Vikas Chandra", "tldr": "", "abstract": "This paper addresses the growing need for efficient large language models (LLMs) on mobile devices, driven by increasing cloud costs and latency concerns. We focus on designing top-quality LLMs with fewer than a billion parameters, a practical choice for mobile deployment. Contrary to prevailing belief emphasizing the pivotal role of data and parameter quantity in determining model quality, our investigation underscores the significance of model architecture for sub-billion scale LLMs. Leveraging deep and thin architectures, coupled with embedding sharing and grouped-query attention mechanisms, we establish a strong baseline network denoted as MobileLLM, which attains a remarkable 2.7%/4.3% accuracy boost over preceding 125M/350M state-of-the-art models. Additionally, we propose an immediate block-wise weight-sharing approach with no increase in model size and only marginal latency overhead. The resultant models, denoted as MobileLLM-LS, demonstrate a further accuracy enhancement of 0.7%/0.8% than MobileLLM 125M/350M. Moreover, MobileLLM model family shows significant improvements compared to previous sub-billion models on chat benchmarks, and demonstrates close correctness to LLaMA-v2 7B in API calling tasks, highlighting the capability of small models for common on-device use cases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zechun Liu;Changsheng Zhao;Forrest Iandola;Chen Lai;Yuandong Tian;Igor Fedorov;Yunyang Xiong;Ernie Chang;Yangyang Shi;Raghuraman Krishnamoorthi;Liangzhen Lai;Vikas Chandra", "authorids": "~Zechun_Liu1;~Changsheng_Zhao2;~Forrest_Iandola1;~Chen_Lai1;~Yuandong_Tian1;~Igor_Fedorov1;~Yunyang_Xiong2;~Ernie_Chang4;~Yangyang_Shi1;~Raghuraman_Krishnamoorthi1;~Liangzhen_Lai3;~Vikas_Chandra2", "gender": ";M;;F;M;M;M;M;M;M;M;M", "homepage": ";;http://forrestiandola.com;;http://yuandong-tian.com;http://ifed-ucsd.github.io/;;https://scholar.google.com/citations?user=FbR5cAMAAAAJ&hl=en;;;;https://v-chandra.github.io/", "dblp": ";148/3002-2;89/10238;;t/YuandongTian;175/1542;140/7645;198/1211.html;;;;57/5163", "google_scholar": ";bXnrlyAAAAAJ;;;0mgEF28AAAAJ;;k5FaRwcAAAAJ;FbR5cAMAAAAJ;https://scholar.google.com/citations?hl=en;F1mr9C0AAAAJ;RydGtvIAAAAJ;p-h_BvcAAAAJ", "orcid": ";0000-0002-1655-9787;;;0000-0003-4202-4847;0000-0002-8204-9515;;;;;;", "linkedin": ";changsheng-zhao/;;chenlai;yuandongtian;;;;;raghuraman-krishnamoorthi-b8670a5/;;vchandra/", "or_profile": "~Zechun_Liu1;~Changsheng_Zhao2;~Forrest_Iandola1;~Chen_Lai1;~Yuandong_Tian1;~Igor_Fedorov1;~Yunyang_Xiong2;~Ernie_Chang4;~Yangyang_Shi1;~Raghuraman_Krishnamoorthi1;~Liangzhen_Lai3;~Vikas_Chandra2", "aff": ";Meta Inc.;Meta;;Meta AI (FAIR);Meta;Meta Facebook;Meta AI;Meta;Meta Facebook;Meta Facebook;Meta", "aff_domain": ";meta.com;meta.com;;meta.com;meta.com;fb.com;meta.com;meta.com;meta.com;meta.com;meta.com", "position": ";Researcher;Researcher;;Research Scientist;research scientist ;Researcher;Research Scientist;Researcher;Researcher;Researcher;Director, AI", "bibtex": "@inproceedings{\nliu2024mobilellm,\ntitle={Mobile{LLM}: Optimizing Sub-billion Parameter Language Models 
for On-Device Use Cases},\nauthor={Zechun Liu and Changsheng Zhao and Forrest Iandola and Chen Lai and Yuandong Tian and Igor Fedorov and Yunyang Xiong and Ernie Chang and Yangyang Shi and Raghuraman Krishnamoorthi and Liangzhen Lai and Vikas Chandra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EIGbXbxcUQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1171931, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10746600429526080063&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";meta.com;meta.com;;meta.com;meta.com;fb.com;meta.com;meta.com;meta.com;meta.com;meta.com", "author_num": 12, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://www.meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Categorical Deep Learning is an Algebraic Theory of All Architectures", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34589", "id": "EIcxV7T0Sy", "proceeding": "https://proceedings.mlr.press/v235/gavranovic24a.html", "pdf": "https://openreview.net/pdf?id=EIcxV7T0Sy", "openreview": "https://openreview.net/forum?id=EIcxV7T0Sy", "author_site": "Bruno Gavranovi\u0107, Paul Lessard, Andrew Dudzik, Tamara von Glehn, Jo\u00e3o Madeira Araujo, Petar Veli\u010dkovi\u0107", "tldr": "", "abstract": "We present our position on the elusive quest for a general-purpose framework for specifying and studying deep learning architectures. Our opinion is that the key attempts made so far lack a coherent bridge between specifying constraints which models must satisfy and specifying their implementations. Focusing on building such a bridge, we propose to apply category theory---precisely, the universal algebra of monads valued in a 2-category of parametric maps---as a single theory elegantly subsuming both of these flavours of neural network design. To defend our position, we show how this theory recovers constraints induced by geometric deep learning, as well as implementations of many architectures drawn from the diverse landscape of neural networks, such as RNNs.
We also illustrate how the theory naturally encodes many standard constructs in computer science and automata theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bruno Gavranovi\u0107;Paul Lessard;Andrew Joseph Dudzik;Tamara von Glehn;Jo\u00e3o Guilherme Madeira Ara\u00fajo;Petar Veli\u010dkovi\u0107", "authorids": "~Bruno_Gavranovi\u01071;paul@symbolica.ai;~Andrew_Joseph_Dudzik1;~Tamara_von_Glehn1;~Jo\u00e3o_Guilherme_Madeira_Ara\u00fajo1;~Petar_Veli\u010dkovi\u01071", "gender": "M;;M;F;M;M", "homepage": "https://www.brunogavranovic.com/;;;;https://www.joaogui1.netlify.app;https://petar-v.com", "dblp": ";;;;;184/4786.html", "google_scholar": "ofP7CgYAAAAJ;;https://scholar.google.com/citations?view_op=list_works;gUWlWJgAAAAJ;;https://scholar.google.co.uk/citations?user=kcTK_FAAAAAJ", "orcid": ";;;;;0000-0002-2820-4692", "linkedin": ";;;;;petarvelickovic", "or_profile": "~Bruno_Gavranovi\u01071;paul@symbolica.ai;~Andrew_Joseph_Dudzik1;~Tamara_von_Glehn1;~Jo\u00e3o_Guilherme_Madeira_Ara\u00fajo1;~Petar_Veli\u010dkovi\u01071", "aff": "Symbolica;;Google DeepMind;Google DeepMind;Google;Google DeepMind", "aff_domain": "symbolica.ai;;deepmind.com;deepmind.com;google.com;google.com", "position": "Principal Researcher;;Researcher;Researcher;Researcher;Senior Staff Research Scientist", "bibtex": "@inproceedings{\ngavranovi{\\'c}2024position,\ntitle={Position: Categorical Deep Learning is an Algebraic Theory of All Architectures},\nauthor={Bruno Gavranovi{\\'c} and Paul Lessard and Andrew Joseph Dudzik and Tamara von Glehn and Jo{\\~a}o Guilherme Madeira Ara{\\'u}jo and Petar Veli{\\v{c}}kovi{\\'c}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EIcxV7T0Sy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 650970, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=485020371351918267&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "symbolica.ai;;deepmind.com;deepmind.com;google.com;google.com", "author_num": 6, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Symbolica;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": ";https://deepmind.com", "aff_unique_abbr": ";DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "1;1;2;1", "aff_country_unique": ";United Kingdom;United States" }, { "title": "Accelerated Algorithms for Constrained Nonconvex-Nonconcave Min-Max Optimization and Comonotone Inclusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34588", "id": "EK7fuAMNoI", "proceeding": "https://proceedings.mlr.press/v235/cai24f.html", "pdf": "https://openreview.net/pdf?id=EK7fuAMNoI", "openreview": "https://openreview.net/forum?id=EK7fuAMNoI", "author_site": "Yang Cai, Argyris Oikonomou, Weiqiang Zheng", "tldr": "", "abstract": "We study constrained comonotone min-max optimization, a structured class of nonconvex-nonconcave min-max optimization problems, and their generalization to comonotone inclusion. In our first contribution, we extend the *Extra Anchored Gradient (EAG)* algorithm, originally proposed by Yoon and Ryu (2021) for unconstrained min-max optimization, to constrained comonotone min-max optimization and comonotone inclusion, achieving an optimal convergence rate of $O\\left(\\frac{1}{T}\\right)$ among all first-order methods. 
Additionally, we prove that the algorithm's iterations converge to a point in the solution set. In our second contribution, we extend the *Fast Extra Gradient (FEG)* algorithm, as developed by Lee and Kim (2021), to constrained comonotone min-max optimization and comonotone inclusion, achieving the same $O\\left(\\frac{1}{T}\\right)$ convergence rate. This rate is applicable to the broadest set of comonotone inclusion problems yet studied in the literature. Our analyses are based on simple potential function arguments, which might be useful for analyzing other accelerated algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Cai;Argyris Oikonomou;Weiqiang Zheng", "authorids": "~Yang_Cai1;~Argyris_Oikonomou1;~Weiqiang_Zheng1", "gender": ";;M", "homepage": ";http://aroikonomou.github.io;https://weiqiang-zheng.com/", "dblp": ";254/2082;277/5088", "google_scholar": ";;YrfhnIwAAAAJ", "orcid": ";0000-0002-6456-0109;", "linkedin": ";;", "or_profile": "~Yang_Cai1;~Argyris_Oikonomou1;~Weiqiang_Zheng1", "aff": ";Meta;Yale University", "aff_domain": ";meta.com;yale.edu", "position": ";Intern;PhD student", "bibtex": "@inproceedings{\ncai2024accelerated,\ntitle={Accelerated Algorithms for Constrained Nonconvex-Nonconcave Min-Max Optimization and Comonotone Inclusion},\nauthor={Yang Cai and Argyris Oikonomou and Weiqiang Zheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EK7fuAMNoI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 547453, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4741682788777883484&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": ";meta.com;yale.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Meta;Yale University", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.yale.edu", "aff_unique_abbr": "Meta;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "FairProof : Confidential and Certifiable Fairness for Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34587", "id": "EKye56rLuv", "proceeding": "https://proceedings.mlr.press/v235/yadav24a.html", "pdf": "https://openreview.net/pdf?id=EKye56rLuv", "openreview": "https://openreview.net/forum?id=EKye56rLuv", "author_site": "Chhavi Yadav, Amrita Roy Chowdhury, Dan Boneh, Kamalika Chaudhuri", "tldr": "", "abstract": "Machine learning models are increasingly used in societal applications, yet legal and privacy concerns demand that they very often be kept confidential. Consequently, there is a growing distrust about the fairness properties of these models in the minds of consumers, who are often at the receiving end of model predictions. To this end, we propose *Fairproof* -- a system that uses Zero-Knowledge Proofs (a cryptographic primitive) to publicly verify the fairness of a model, while maintaining confidentiality. We also propose a fairness certification algorithm for fully-connected neural networks which is befitting to ZKPs and is used in this system. We implement *Fairproof* in Gnark and demonstrate empirically that our system is practically feasible. 
Code is available at https://github.com/infinite-pursuits/FairProof.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chhavi Yadav;Amrita Roy Chowdhury;Dan Boneh;Kamalika Chaudhuri", "authorids": "~Chhavi_Yadav1;~Amrita_Roy_Chowdhury1;~Dan_Boneh1;~Kamalika_Chaudhuri1", "gender": "F;F;M;F", "homepage": "https://www.chhaviyadav.org/;https://sites.google.com/wisc.edu/amrita-roy-chowdhury/;https://crypto.stanford.edu/~dabo;http://cseweb.ucsd.edu/users/kamalika", "dblp": "241/9443;147/6281.html;b/DanBoneh.html;56/6435", "google_scholar": "ykOFUCYAAAAJ;lWWAZ4YAAAAJ;MwLqCs4AAAAJ;I-DJ7EsAAAAJ", "orcid": ";;0000-0003-0820-0421;", "linkedin": "chhavi-yadav-089216b2;;;", "or_profile": "~Chhavi_Yadav1;~Amrita_Roy_Chowdhury1;~Dan_Boneh1;~Kamalika_Chaudhuri1", "aff": "University of California, San Diego;University of Michigan - Ann Arbor;Stanford University;University of California, San Diego", "aff_domain": "ucsd.edu;umich.edu;stanford.edu;ucsd.edu", "position": "PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nyadav2024fairproof,\ntitle={FairProof : Confidential and Certifiable Fairness for Neural Networks},\nauthor={Chhavi Yadav and Amrita Roy Chowdhury and Dan Boneh and Kamalika Chaudhuri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EKye56rLuv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1844058, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6733698552887501449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "ucsd.edu;umich.edu;stanford.edu;ucsd.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, San Diego;University of Michigan;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsd.edu;https://www.umich.edu;https://www.stanford.edu", "aff_unique_abbr": "UCSD;UM;Stanford", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "San Diego;Ann Arbor;Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Topological Neural Networks go Persistent, Equivariant, and Continuous", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34586", "id": "ELFZWG9C7l", "proceeding": "https://proceedings.mlr.press/v235/verma24a.html", "pdf": "https://openreview.net/pdf?id=ELFZWG9C7l", "openreview": "https://openreview.net/forum?id=ELFZWG9C7l", "author_site": "Yogesh Verma, Amauri Souza, Vikas Garg", "tldr": "", "abstract": "Topological Neural Networks (TNNs) incorporate higher-order relational information beyond pairwise interactions, enabling richer representations than Graph Neural Networks (GNNs). Concurrently, topological descriptors based on persistent homology (PH) are being increasingly employed to augment the GNNs. We investigate the benefits of integrating these two paradigms. Specifically, we introduce *TopNets* as a broad framework that subsumes and unifies various methods in the intersection of GNNs/TNNs and PH such as (generalizations of) RePHINE and TOGL. TopNets can also be readily adapted to handle (symmetries in) geometric complexes, extending the scope of TNNs and PH to spatial settings. Theoretically, we show that PH descriptors can provably enhance the expressivity of simplicial message-passing networks. 
Empirically, (continuous and $E(n)$-equivariant extensions of) TopNets achieve strong performance across diverse tasks, including antibody design, molecular dynamics simulation, and drug property prediction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yogesh Verma;Amauri H Souza;Vikas Garg", "authorids": "~Yogesh_Verma1;~Amauri_H_Souza1;~Vikas_Garg2", "gender": "M;M;", "homepage": "https://yoverma.github.io/yoerma.github.io/;http://www.amauriholanda.org;", "dblp": "284/2155;131/3352;", "google_scholar": "9W9u4owAAAAJ;lP0LBI4AAAAJ;", "orcid": ";;", "linkedin": "yogeshverma1998/;;", "or_profile": "~Yogesh_Verma1;~Amauri_H_Souza1;~Vikas_Garg2", "aff": "Aalto University;Federal Institute of Cear\u00e1;", "aff_domain": "aalto.fi;ifce.edu.br;", "position": "PhD student;Associate Professor;", "bibtex": "@inproceedings{\nverma2024topological,\ntitle={Topological Neural Networks go Persistent, Equivariant, and Continuous},\nauthor={Yogesh Verma and Amauri H Souza and Vikas Garg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ELFZWG9C7l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 618263, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4804151466501008580&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "email": "aalto.fi;ifce.edu.br;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Aalto University;Federal Institute of Cear\u00e1", "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;http://www.ifce.edu.br", "aff_unique_abbr": "Aalto;IFCE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Finland;Brazil" }, { "title": "SuDA: Support-based Domain Adaptation for Sim2Real Hinge Joint Tracking with Flexible Sensors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34585", "id": "ENNGAY5uKC", "proceeding": "https://proceedings.mlr.press/v235/jiawei24a.html", "pdf": "https://openreview.net/pdf?id=ENNGAY5uKC", "openreview": "https://openreview.net/forum?id=ENNGAY5uKC", "author_site": "Fang Jiawei, Haishan Song, Chengxu Zuo, xiaoxia gao, Xiaowei Chen, Guo Shihui, Yipeng Qin", "tldr": "", "abstract": "Flexible sensors hold promise for human motion capture (MoCap), offering advantages such as wearability, privacy preservation, and minimal constraints on natural movement. However, existing flexible sensor-based MoCap methods rely on deep learning and necessitate large and diverse labeled datasets for training. These data typically need to be collected in MoCap studios with specialized equipment and substantial manual labor, making them difficult and expensive to obtain at scale. Thanks to the high-linearity of flexible sensors, we address this challenge by proposing a novel Sim2Real solution for hinge joint tracking based on domain adaptation, eliminating the need for labeled data yet achieving comparable accuracy to supervised learning. Our solution relies on a novel Support-based Domain Adaptation method, namely SuDA, which aligns the supports of the predictive functions rather than the instance-dependent distributions between the source and target domains. 
Extensive experimental results demonstrate the effectiveness of our method and its superiority over state-of-the-art distribution-based domain adaptation methods in our task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fang Jiawei;Haishan song;Chengxu Zuo;Xiaoxia Gao;Xiaowei Chen;Shihui Guo;Yipeng Qin", "authorids": "~Fang_Jiawei1;haishansong@stu.xmu.edu.cn;~Chengxu_Zuo1;gaxxia29@gmail.com;wdenxwa@stu.xmu.edu.cn;~Shihui_Guo1;~Yipeng_Qin1", "gender": "M;;M;;;M;", "homepage": ";;;;;http://www.guoshihui.net;https://profiles.cardiff.ac.uk/staff/qiny16", "dblp": "309/5358;;;;;;169/5516", "google_scholar": ";;https://scholar.google.com.hk/citations?user=j5jv_u8AAAAJ;;;https://scholar.google.jp/citations?user=RPAVxiAAAAAJ;ojgWPpgAAAAJ", "orcid": "0000-0003-4745-9305;;0000-0003-2054-2010;;;;0000-0002-1551-9126", "linkedin": ";;;;;;", "or_profile": "~Fang_Jiawei1;haishansong@stu.xmu.edu.cn;~Chengxu_Zuo1;gaxxia29@gmail.com;wdenxwa@stu.xmu.edu.cn;~Shihui_Guo1;~Yipeng_Qin1", "aff": "Xiamen University;;Xiamen University;;;Xiamen University;Cardiff University", "aff_domain": "xmu.edu.cn;;xmu.edu.cn;;;xmu.edu.cn;cardiff.ac.uk", "position": "Undergrad student;;PhD student;;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\njiawei2024suda,\ntitle={Su{DA}: Support-based Domain Adaptation for Sim2Real Hinge Joint Tracking with Flexible Sensors},\nauthor={Fang Jiawei and Haishan song and Chengxu Zuo and Xiaoxia Gao and Xiaowei Chen and Shihui Guo and Yipeng Qin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ENNGAY5uKC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6342736, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16501130938977520040&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "xmu.edu.cn;;xmu.edu.cn;;;xmu.edu.cn;cardiff.ac.uk", "author_num": 7, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Xiamen University;Cardiff University", "aff_unique_dep": ";", "aff_unique_url": "https://www.xmu.edu.cn;https://www.cardiff.ac.uk", "aff_unique_abbr": "XMU;Cardiff", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United Kingdom" }, { "title": "Federated Neuro-Symbolic Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34584", "id": "EQXZqBXeW9", "proceeding": "https://proceedings.mlr.press/v235/xing24a.html", "pdf": "https://openreview.net/pdf?id=EQXZqBXeW9", "openreview": "https://openreview.net/forum?id=EQXZqBXeW9", "author_site": "Pengwei Xing, Songtao Lu, Han Yu", "tldr": "", "abstract": "Neuro-symbolic learning (NSL) models complex symbolic rule patterns into latent variable distributions by neural networks, which reduces rule search space and generates unseen rules to improve downstream task performance. Centralized NSL learning involves directly acquiring data from downstream tasks, which is not feasible for federated learning (FL). To address this limitation, we shift the focus from such a one-to-one interactive neuro-symbolic paradigm to a one-to-many Federated Neuro-Symbolic Learning framework (FedNSL) with latent variables as the FL communication medium.
Built on the basis of our novel reformulation of the NSL theory, FedNSL is capable of identifying and addressing rule distribution heterogeneity through a simple and effective Kullback-Leibler (KL) divergence constraint on rule distribution applicable under the FL setting. It further theoretically adjusts variational expectation maximization (V-EM) to reduce the rule search space across domains. This is the first incorporation of distribution-coupled bilevel optimization into FL. Extensive experiments based on both synthetic and real-world data demonstrate significant advantages of FedNSL compared to five state-of-the-art methods. It outperforms the best baseline by 17% and 29% in terms of unbalanced average training accuracy and unseen average testing accuracy, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengwei Xing;Songtao Lu;Han Yu", "authorids": "~Pengwei_Xing2;~Songtao_Lu1;~Han_Yu1", "gender": ";M;M", "homepage": ";https://songtaogithub.github.io/;https://sites.google.com/site/hanyushomepage/home", "dblp": ";05/2887;35/1096-1", "google_scholar": ";LRsjX7kAAAAJ;https://scholar.google.com.sg/citations?hl=en", "orcid": ";;0000-0001-6893-8650", "linkedin": ";;", "or_profile": "~Pengwei_Xing2;~Songtao_Lu1;~Han_Yu1", "aff": ";IBM Thomas J. Watson Research Center;Nanyang Technological University", "aff_domain": ";ibm.com;ntu.edu.sg", "position": ";Researcher;Associate Professor", "bibtex": "@inproceedings{\nxing2024federated,\ntitle={Federated Neuro-Symbolic Learning},\nauthor={Pengwei Xing and Songtao Lu and Han Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EQXZqBXeW9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2267158, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2682745356965143665&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": ";ibm.com;ntu.edu.sg", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "IBM;Nanyang Technological University", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.ibm.com/research;https://www.ntu.edu.sg", "aff_unique_abbr": "IBM;NTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Yorktown Heights;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Singapore" }, { "title": "Observable Propagation: Uncovering Feature Vectors in Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34583", "id": "ETNx4SekbY", "proceeding": "https://proceedings.mlr.press/v235/dunefsky24a.html", "pdf": "https://openreview.net/pdf?id=ETNx4SekbY", "openreview": "https://openreview.net/forum?id=ETNx4SekbY", "author_site": "Jacob Dunefsky, Arman Cohan", "tldr": "", "abstract": "A key goal of current mechanistic interpretability research in NLP is to find *linear features* (also called \"feature vectors\") for transformers: directions in activation space corresponding to concepts that are used by a given model in its computation. Present state-of-the-art methods for finding linear features require large amounts of labelled data -- both laborious to acquire and computationally expensive to utilize. In this work, we introduce a novel method, called \"observable propagation\" (in short: ObProp), for finding linear features used by transformer language models in computing a given task -- *using almost no data*. 
Our paradigm centers on the concept of \"observables\", linear functionals corresponding to given tasks. We then introduce a mathematical theory for the analysis of feature vectors, including a similarity metric between feature vectors called the *coupling coefficient* which estimates the degree to which one feature's output correlates with another's. We use ObProp to perform extensive qualitative investigations into several tasks, including gendered occupational bias, political party prediction, and programming language detection. Our results suggest that ObProp surpasses traditional approaches for finding feature vectors in the low-data regime, and that ObProp can be used to better understand the mechanisms responsible for bias in large language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jacob Dunefsky;Arman Cohan", "authorids": "~Jacob_Dunefsky1;~Arman_Cohan1", "gender": "M;M", "homepage": "https://jacobdunefsky.github.io;http://www.armancohan.com", "dblp": ";160/1727", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Jacob_Dunefsky1;~Arman_Cohan1", "aff": "Department of Computer Science, Yale University;Allen Institute for Artificial Intelligence", "aff_domain": "cs.yale.edu;allenai.org", "position": "PhD student;Research Scientist", "bibtex": "@inproceedings{\ndunefsky2024observable,\ntitle={Observable Propagation: Uncovering Feature Vectors in Transformers},\nauthor={Jacob Dunefsky and Arman Cohan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ETNx4SekbY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 552283, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9968807830872532017&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cs.yale.edu;allenai.org", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Yale University;Allen Institute for Artificial Intelligence", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.yale.edu;https://allenai.org", "aff_unique_abbr": "Yale;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Geometric Explanation of the Likelihood OOD Detection Paradox", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34582", "id": "EVMzCKLpdD", "proceeding": "https://proceedings.mlr.press/v235/kamkari24a.html", "pdf": "https://openreview.net/pdf?id=EVMzCKLpdD", "openreview": "https://openreview.net/forum?id=EVMzCKLpdD", "author_site": "Hamidreza Kamkari, Brendan Ross, Jesse Cresswell, Anthony Caterini, Rahul G. Krishnan, Gabriel Loaiza-Ganem", "tldr": "", "abstract": "Likelihood-based deep generative models (DGMs) commonly exhibit a puzzling behaviour: when trained on a relatively complex dataset, they assign higher likelihood values to out-of-distribution (OOD) data from simpler sources. Adding to the mystery, OOD samples are never generated by these DGMs despite having higher likelihoods. This two-pronged paradox has yet to be conclusively explained, making likelihood-based OOD detection unreliable. Our primary observation is that high-likelihood regions will not be generated if they contain minimal probability mass. 
We demonstrate how this seeming contradiction of large densities yet low probability mass can occur around data confined to low-dimensional manifolds. We also show that this scenario can be identified through local intrinsic dimension (LID) estimation, and propose a method for OOD detection which pairs the likelihoods and LID estimates obtained from a *pre-trained* DGM. Our method can be applied to normalizing flows and score-based diffusion models, and obtains results which match or surpass state-of-the-art OOD detection benchmarks using the same DGM backbones. Our code is available at our [GitHub repository](https://github.com/layer6ai-labs/dgm_ood_detection).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hamidreza Kamkari;Brendan Leigh Ross;Jesse C. Cresswell;Anthony L. Caterini;Rahul Krishnan;Gabriel Loaiza-Ganem", "authorids": "~Hamidreza_Kamkari1;~Brendan_Leigh_Ross1;~Jesse_C._Cresswell1;~Anthony_L._Caterini1;~Rahul_G_Krishnan1;~Gabriel_Loaiza-Ganem1", "gender": "M;M;M;M;M;", "homepage": "https://hamidrezakmk.github.io/;;;http://www.cs.toronto.edu/~rahulgk/index.html;https://sites.google.com/view/gabriel-loaiza-ganem/about-me;https://jescresswell.github.io/", "dblp": ";295/0098;167/4383;172/0880;238/1617;279/6764", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=TyY1aSYAAAAJ;34sCXQEAAAAJ;ilJgXHkAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?hl=en", "orcid": ";;;;;0000-0002-9284-8804", "linkedin": "hamidreza-kamkari/;brendan-ross;;rahulgk/;;", "or_profile": "~Hamidreza_Kamkari1;~Brendan_Leigh_Ross1;~Anthony_L._Caterini1;~Rahul_G_Krishnan1;~Gabriel_Loaiza-Ganem1;~Jesse_C_Cresswell1", "aff": "Department of Computer Science;Layer 6 AI;Layer6;Department of Computer Science, University of Toronto;Layer 6 AI;Layer 6 AI", "aff_domain": "cs.toronto.edu;layer6.ai;layer6.ai;cs.toronto.edu;layer6.ai;layer6.ai", "position": "MS student;Senior Machine Learning Scientist;Researcher;Assistant Professor;Machine Learning Research Scientist;Staff Machine Learning Scientist", "bibtex": "@inproceedings{\nkamkari2024a,\ntitle={A Geometric Explanation of the Likelihood {OOD} Detection Paradox},\nauthor={Hamidreza Kamkari and Brendan Leigh Ross and Jesse C. Cresswell and Anthony L. 
Caterini and Rahul Krishnan and Gabriel Loaiza-Ganem},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EVMzCKLpdD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6954839, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13743354697846326962&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cs.toronto.edu;layer6.ai;layer6.ai;cs.toronto.edu;layer6.ai;layer6.ai", "author_num": 6, "aff_unique_index": "0;1;2;3;1;1", "aff_unique_norm": "Unknown Institution;Layer 6 AI;Layer6 AI;University of Toronto", "aff_unique_dep": "Department of Computer Science;;;Department of Computer Science", "aff_unique_url": ";https://layer6.ai;https://layer6.ai;https://www.utoronto.ca", "aff_unique_abbr": ";Layer 6 AI;Layer6;U of T", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "1;1;1;1;1", "aff_country_unique": ";Canada" }, { "title": "Light and Optimal Schr\u00f6dinger Bridge Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34581", "id": "EWJn6hfZ4J", "proceeding": "https://proceedings.mlr.press/v235/gushchin24a.html", "pdf": "https://openreview.net/pdf?id=EWJn6hfZ4J", "openreview": "https://openreview.net/forum?id=EWJn6hfZ4J", "author_site": "Nikita Gushchin, Sergei Kholkin, Evgeny Burnaev, Alexander Korotin", "tldr": "", "abstract": "Schr\u00f6dinger Bridges (SB) have recently gained the attention of the ML community as a promising extension of classic diffusion models which is also interconnected to the Entropic Optimal Transport (EOT). Recent solvers for SB exploit the pervasive bridge matching procedures. Such procedures aim to recover a stochastic process transporting the mass between distributions given only a transport plan between them. In particular, given the EOT plan, these procedures can be adapted to solve SB. This fact is heavily exploited by recent works giving rise to matching-based SB solvers. The cornerstone here is recovering the EOT plan: recent works either use heuristic approximations (e.g., the minibatch OT) or establish iterative matching procedures which by design accumulate error during training. We address these limitations and propose a novel procedure to learn SB which we call the **optimal Schr\u00f6dinger bridge matching**. It exploits the optimal parameterization of the diffusion process and provably recovers the SB process **(a)** with a single bridge matching step and **(b)** with an arbitrary transport plan as the input. Furthermore, we show that the optimal bridge matching objective coincides with the recently discovered energy-based modeling (EBM) objectives to learn EOT/SB. Inspired by this observation, we develop a light solver (which we call LightSB-M) to implement optimal matching in practice using the Gaussian mixture parameterization of the adjusted Schr\u00f6dinger potential. 
We experimentally showcase the performance of our solver in a range of practical tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikita Gushchin;Sergei Kholkin;Evgeny Burnaev;Alexander Korotin", "authorids": "~Nikita_Gushchin1;~Sergei_Kholkin1;~Evgeny_Burnaev1;~Alexander_Korotin2", "gender": "M;M;M;M", "homepage": ";http://GitHub.com/skholkin;http://faculty.skoltech.ru/people/evgenyburnaev;https://akorotin.netlify.app", "dblp": "332/1999;367/9264;144/7845;209/9906", "google_scholar": "UaRTbNoAAAAJ;KwhztSMAAAAJ;https://scholar.google.ru/citations?user=pCRdcOwAAAAJ;https://scholar.google.ru/citations?user=1rIIvjAAAAAJ", "orcid": ";;0000-0001-8424-0690;0000-0003-4286-925X", "linkedin": "nikita-gushchin-937522145/;;;", "or_profile": "~Nikita_Gushchin1;~Sergei_Kholkin1;~Evgeny_Burnaev1;~Alexander_Andreevich_Korotin1", "aff": "Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology", "aff_domain": "skoltech.ru;skoltech.ru;skoltech.ru;skoltech.ru", "position": "PhD student;MS student;Full Professor;Head of Research Group", "bibtex": "@inproceedings{\ngushchin2024light,\ntitle={Light and Optimal Schr\\\"odinger Bridge Matching},\nauthor={Nikita Gushchin and Sergei Kholkin and Evgeny Burnaev and Alexander Korotin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EWJn6hfZ4J}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2365174, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11813843596072512770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "skoltech.ru;skoltech.ru;skoltech.ru;skoltech.ru", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Skolkovo Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.skoltech.ru", "aff_unique_abbr": "Skoltech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Russian Federation" }, { "title": "Cell2Sentence: Teaching Large Language Models the Language of Biology", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34580", "id": "EWt5wsEdvc", "proceeding": "https://proceedings.mlr.press/v235/levine24a.html", "pdf": "https://openreview.net/pdf?id=EWt5wsEdvc", "openreview": "https://openreview.net/forum?id=EWt5wsEdvc", "author_site": "Daniel Levine, Syed Rizvi, Sacha L\u00e9vy, Nazreen Pallikkavaliyaveetil MohammedSheriff, David Zhang, Xingyu Chen, SINA GHADERMARZI, Ruiming Wu, Zihe Zheng, Ivan Vrkic, Anna Zhong, Daphne Raskin, Insu Han, Antonio Henrique de Oliveira Fonseca, Josue Ortega Caro, Amin Karbasi, Rahul Dhodapkar, David van Dijk", "tldr": "", "abstract": "We introduce Cell2Sentence (C2S), a novel method to directly adapt large language models to a biological context, specifically single-cell transcriptomics. By transforming gene expression data into \"cell sentences,\" C2S bridges the gap between natural language processing and biology. We demonstrate cell sentences enable the fine-tuning of language models for diverse tasks in biology, including cell generation, complex cell-type annotation, and direct data-driven text generation. 
Our experiments reveal that GPT-2, when fine-tuned with C2S, can generate biologically valid cells based on cell type inputs, and accurately predict cell types from cell sentences. This illustrates that language models, through C2S fine-tuning, can acquire a significant understanding of single-cell biology while maintaining robust text generation capabilities. C2S offers a flexible, accessible framework to integrate natural language processing with transcriptomics, utilizing existing models and libraries for a wide range of biological applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Levine;Syed A Rizvi;Sacha L\u00e9vy;Nazreen Pallikkavaliyaveetil;David Zhang;Xingyu Chen;Sina Ghadermarzi;Ruiming Wu;Zihe Zheng;Ivan Vrkic;Anna Zhong;Daphne Raskin;Insu Han;Antonio Henrique de Oliveira Fonseca;Josue Ortega Caro;Amin Karbasi;Rahul Madhav Dhodapkar;David van Dijk", "authorids": "~Daniel_Levine2;~Syed_A_Rizvi1;~Sacha_L\u00e9vy1;nazreen.pm@yale.edu;david.zhang.ddz5@yale.edu;~Xingyu_Chen11;~Sina_Ghadermarzi1;wuru@seas.upenn.edu;zihe.zheng@yale.edu;~Ivan_Vrkic1;anna.zhong@yale.edu;daphne.raskin@yale.edu;~Insu_Han1;~Antonio_Henrique_de_Oliveira_Fonseca1;~Josue_Ortega_Caro1;~Amin_Karbasi3;~Rahul_Madhav_Dhodapkar1;~David_van_Dijk1", "gender": ";M;;;;;;;;;;;M;M;;;M;M", "homepage": ";https://syedarizvi.com/;https://sachalevy.fr;;;;http://www.sina.page;;;;;;https://insuhan.github.io/;https://ahof1704.github.io/;;;;http://www.vandijklab.org", "dblp": ";;;;;;;;;;;;160/8272;;;;;136/9930", "google_scholar": "2G-O1zQAAAAJ;2rhnnZ4AAAAJ;https://scholar.google.fr/citations?user=1hH9QsAAAAAJ;;;;AcpzHc8AAAAJ;;;;;;0w39xsoAAAAJ;;;;ivfFMbEAAAAJ;fjjZr6UAAAAJ", "orcid": ";0000-0002-7932-9524;;;;;;;;;;;;;;;0000-0002-2014-7515;", "linkedin": ";syed-a-rizvi-01/;;;;;sina-ghadermarzi/;;;;;;;;;;;", "or_profile": "~Daniel_Levine2;~Syed_A_Rizvi1;~Sacha_L\u00e9vy1;nazreen.pm@yale.edu;david.zhang.ddz5@yale.edu;~Xingyu_Chen11;~Sina_Ghadermarzi1;wuru@seas.upenn.edu;zihe.zheng@yale.edu;~Ivan_Vrkic1;anna.zhong@yale.edu;daphne.raskin@yale.edu;~Insu_Han1;~Antonio_Henrique_de_Oliveira_Fonseca1;~Josue_Ortega_Caro1;~Amin_Karbasi3;~Rahul_Madhav_Dhodapkar1;~David_van_Dijk1", "aff": "Yale University;Yale University;;;;;Yale University;;;;;;Korea Advanced Institute of Science & Technology;Yale University;;;;Yale University", "aff_domain": "yale.edu;yale.edu;;;;;yale.edu;;;;;;kaist.ac.kr;yale.edu;;;;yale.edu", "position": "Postdoc;PhD student;;;;;Postdoc;;;;;;Assistant Professor;PhD student;;;;Assistant Professor", "bibtex": "@inproceedings{\nlevine2024cellsentence,\ntitle={Cell2Sentence: Teaching Large Language Models the Language of Biology},\nauthor={Daniel Levine and Syed A Rizvi and Sacha L{\\'e}vy and Nazreen Pallikkavaliyaveetil and David Zhang and Xingyu Chen and Sina Ghadermarzi and Ruiming Wu and Zihe Zheng and Ivan Vrkic and Anna Zhong and Daphne Raskin and Insu Han and Antonio Henrique de Oliveira Fonseca and Josue Ortega Caro and Amin Karbasi and Rahul Madhav Dhodapkar and David van Dijk},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EWt5wsEdvc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4983816, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 18, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12860459433909422728&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": 
"yale.edu;yale.edu;;;;;yale.edu;;;;;;kaist.ac.kr;yale.edu;;;;yale.edu", "author_num": 18, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Yale University;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.kaist.ac.kr", "aff_unique_abbr": "Yale;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;South Korea" }, { "title": "Exploring the Enigma of Neural Dynamics Through A Scattering-Transform Mixer Landscape for Riemannian Manifold", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34579", "id": "EYOo48YGhy", "proceeding": "https://proceedings.mlr.press/v235/dan24a.html", "pdf": "https://openreview.net/pdf?id=EYOo48YGhy", "openreview": "https://openreview.net/forum?id=EYOo48YGhy", "author_site": "Tingting Dan, Ziquan Wei, Won Hwa Kim, Guorong Wu", "tldr": "", "abstract": "The human brain is a complex inter-wired system that emerges spontaneous functional fluctuations. In spite of tremendous success in the experimental neuroscience field, a system-level understanding of how brain anatomy supports various neural activities remains elusive. Capitalizing on the unprecedented amount of neuroimaging data, we present a physics-informed deep model to uncover the coupling mechanism between brain structure and function through the lens of data geometry that is rooted in the widespread wiring topology of connections between distant brain regions. Since deciphering the puzzle of self-organized patterns in functional fluctuations is the gateway to understanding the emergence of cognition and behavior, we devise a geometric deep model to uncover manifold mapping functions that characterize the intrinsic feature representations of evolving functional fluctuations on the Riemannian manifold. In lieu of learning unconstrained mapping functions, we introduce a set of graph-harmonic scattering transforms to impose the brain-wide geometry on top of manifold mapping functions, which allows us to cast the manifold-based deep learning into a reminiscent of *MLP-Mixer* architecture (in computer vision) for Riemannian manifold. 
As a proof-of-concept approach, we explore a neural-manifold perspective to understand the relationship between (static) brain structure and (dynamic) function, challenging the prevailing notion in cognitive neuroscience by proposing that neural activities are essentially excited by brain-wide oscillation waves living on the geometry of human connectomes, instead of being confined to focal areas.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tingting Dan;Ziquan Wei;Won Hwa Kim;Guorong Wu", "authorids": "~Tingting_Dan1;~Ziquan_Wei1;~Won_Hwa_Kim4;~Guorong_Wu1", "gender": "F;M;M;M", "homepage": "https://www.researchgate.net/profile/Tingting_Dan;https://ziquanw.com/;https://www.acmlab.org/;https://wwplato.github.io/", "dblp": "223/8556;206/5669;03/5225-1.html;12/10278", "google_scholar": "FMcmg0gAAAAJ;z1IYb2oAAAAJ;XVsMB2kAAAAJ;aWPSHNwAAAAJ", "orcid": ";0000-0001-6553-4482;0000-0002-0550-6145;", "linkedin": ";weiziquan142857/;;", "or_profile": "~Tingting_Dan1;~Ziquan_Wei1;~Guorong_Wu1;~Won_Hwa_Kim1", "aff": "University of North Carolina at Chapel Hill;University of North Carolina at Chapel Hill;University of North Carolina, Chapel Hill;Pohang University of Science and Technology", "aff_domain": "unc.edu;unc.edu;unc.edu;postech.ac.kr", "position": "Postdoc;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\ndan2024exploring,\ntitle={Exploring the Enigma of Neural Dynamics Through A Scattering-Transform Mixer Landscape for Riemannian Manifold},\nauthor={Tingting Dan and Ziquan Wei and Won Hwa Kim and Guorong Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EYOo48YGhy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5534928, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7312931617982476689&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": "unc.edu;unc.edu;unc.edu;postech.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of North Carolina;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.unc.edu;https://www.postech.ac.kr", "aff_unique_abbr": "UNC;POSTECH", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Chapel Hill;Pohang", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;South Korea" }, { "title": "HALC: Object Hallucination Reduction via Adaptive Focal-Contrast Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34578", "id": "EYvEVbfoDp", "proceeding": "https://proceedings.mlr.press/v235/chen24bi.html", "pdf": "https://openreview.net/pdf?id=EYvEVbfoDp", "openreview": "https://openreview.net/forum?id=EYvEVbfoDp", "author_site": "Zhaorun Chen, Zhuokai Zhao, HONGYIN LUO, Huaxiu Yao, Bo Li, Jiawei Zhou", "tldr": "", "abstract": "While large vision-language models (LVLMs) have demonstrated impressive capabilities in interpreting multi-modal contexts, they invariably suffer from object hallucinations (OH). We introduce HALC, a novel decoding algorithm designed to mitigate OH in LVLMs. HALC leverages distinct fine-grained optimal visual information in vision-language tasks and operates on both local and global contexts simultaneously. 
Specifically, HALC integrates a robust auto-focal grounding mechanism (locally) to correct hallucinated tokens on the fly, and a specialized beam search algorithm (globally) to significantly reduce OH while preserving text generation quality. Additionally, HALC can be integrated into any LVLM as a plug-and-play module without extra training. Extensive experimental studies demonstrate HALC\u2019s effectiveness in reducing OH, outperforming state-of-the-art methods across four benchmarks. Code is released at https://github.com/BillChan226/HALC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhaorun Chen;Zhuokai Zhao;Hongyin Luo;Huaxiu Yao;Bo Li;Jiawei Zhou", "authorids": "~Zhaorun_Chen1;~Zhuokai_Zhao1;~Hongyin_Luo1;~Huaxiu_Yao1;~Bo_Li19;~Jiawei_Zhou1", "gender": "M;M;M;M;F;M", "homepage": "https://billchan226.github.io/;https://zhuokai-zhao.com/;;http://huaxiuyao.mystrikingly.com;http://boli.cs.illinois.edu/;https://joezhouai.com/", "dblp": "302/1064;348/5348;147/4317;197/1635;50/3402-26;126/4991-1", "google_scholar": "UZg5N5UAAAAJ;EGcdEjEAAAAJ;;A20BZnQAAAAJ;K8vJkTcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2668-6587;0000-0001-8201-2977;;;;0000-0001-5590-6270", "linkedin": "zhaorun-chen-1793b6226/;zhuokai-zhao-a9385169/;;huaxiuyao/;;jiawei-zhou/", "or_profile": "~Zhaorun_Chen1;~Zhuokai_Zhao1;~Hongyin_Luo1;~Huaxiu_Yao1;~Bo_Li19;~Jiawei_Zhou1", "aff": "University of Chicago;University of Chicago;Massachusetts Institute of Technology;Department of Computer Science, University of North Carolina at Chapel Hill;University of Illinois, Urbana Champaign;Toyota Technological Institute at Chicago", "aff_domain": "uchicago.edu;uchicago.edu;mit.edu;cs.unc.edu;illinois.edu;ttic.edu", "position": "PhD student;PhD student;Postdoc;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024halc,\ntitle={{HALC}: Object Hallucination Reduction via Adaptive Focal-Contrast Decoding},\nauthor={Zhaorun Chen and Zhuokai Zhao and Hongyin Luo and Huaxiu Yao and Bo Li and Jiawei Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EYvEVbfoDp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3250416, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10423060041875027066&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uchicago.edu;uchicago.edu;mit.edu;cs.unc.edu;illinois.edu;ttic.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;3;4", "aff_unique_norm": "University of Chicago;Massachusetts Institute of Technology;University of North Carolina at Chapel Hill;University of Illinois Urbana-Champaign;Toyota Technological Institute at Chicago", "aff_unique_dep": ";;Department of Computer Science;;", "aff_unique_url": "https://www.uchicago.edu;https://web.mit.edu;https://www.unc.edu;https://illinois.edu;https://www.tti-chicago.org", "aff_unique_abbr": "UChicago;MIT;UNC Chapel Hill;UIUC;TTI Chicago", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Chapel Hill;Urbana-Champaign;Chicago", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Video as the New Language for Real-World Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34577", "id": "EZH4CsKV6O", "proceeding": "https://proceedings.mlr.press/v235/yang24z.html", 
"pdf": "https://openreview.net/pdf?id=EZH4CsKV6O", "openreview": "https://openreview.net/forum?id=EZH4CsKV6O", "author_site": "Sherry Yang, Jacob C Walker, Jack Parker-Holder, Yilun Du, Jake Bruce, Andre Barreto, Pieter Abbeel, Dale Schuurmans", "tldr": "", "abstract": "Both text and video data are abundant on the internet and support large-scale self-supervised learning through next token or frame prediction. However, they have not been equally leveraged: language models have had significant real-world impact, whereas video generation has remained largely limited to media entertainment. Yet video data captures important information about the physical world that is difficult to express in language. To address this gap, we discuss an under-appreciated opportunity to extend video generation to solve tasks in the real world. We observe how, akin to language, video can serve as a unified interface that can absorb internet knowledge and represent diverse tasks. Moreover, we demonstrate how, like language models, video generation can serve as planners, agents, compute engines, and environment simulators through techniques such as in-context learning, planning and reinforcement learning. We identify major impact opportunities in domains such as robotics, self-driving, and science, supported by recent work that demonstrates how such advanced capabilities in video generation are plausibly within reach. Lastly, we identify key challenges in video generation that mitigate progress. Addressing these challenges will enable video generation models to demonstrate unique value alongside language models in a wider array of AI applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sherry Yang;Jacob C Walker;Jack Parker-Holder;Yilun Du;Jake Bruce;Andre Barreto;Pieter Abbeel;Dale Schuurmans", "authorids": "~Sherry_Yang1;~Jacob_C_Walker1;~Jack_Parker-Holder1;~Yilun_Du1;~Jake_Bruce1;~Andre_Barreto1;~Pieter_Abbeel2;~Dale_Schuurmans1", "gender": "F;;M;;M;M;M;", "homepage": "https://sherryy.github.io;;https://jparkerholder.github.io/;https://yilundu.github.io;http://jakebruce.ca;https://sites.google.com/corp/view/andrebarreto/about;https://people.eecs.berkeley.edu/~pabbeel/;", "dblp": ";135/1696;237/9793.html;204/4379;173/6014;72/953;;", "google_scholar": "7c1B_fIAAAAJ;0dR_wD0AAAAJ;;;https://scholar.google.co.uk/citations?user=RGNVBKMAAAAJ;https://scholar.google.co.uk/citations?user=H-xtdV4AAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Sherry_Yang1;~Jacob_C_Walker1;~Jack_Parker-Holder1;~Yilun_Du1;~Jake_Bruce1;~Andre_Barreto1;~Pieter_Abbeel2;~Dale_Schuurmans1", "aff": "University of California, Berkeley;Google;Google DeepMind;Massachusetts Institute of Technology;Google DeepMind;Google DeepMind;Covariant;", "aff_domain": "berkeley.edu;google.com;google.com;mit.edu;deepmind.com;google.com;covariant.ai;", "position": "Student;Research Scientist;Researcher;PhD student;Research Scientist;Research Scientist;Founder;", "bibtex": "@inproceedings{\nyang2024position,\ntitle={Position: Video as the New Language for Real-World Decision Making},\nauthor={Sherry Yang and Jacob C Walker and Jack Parker-Holder and Yilun Du and Jake Bruce and Andre Barreto and Pieter Abbeel and Dale Schuurmans},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EZH4CsKV6O}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3295691, "rating": "", 
"rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15460079559128525844&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "berkeley.edu;google.com;google.com;mit.edu;deepmind.com;google.com;covariant.ai;", "author_num": 8, "aff_unique_index": "0;1;1;2;1;1;3", "aff_unique_norm": "University of California, Berkeley;Google;Massachusetts Institute of Technology;Covariant", "aff_unique_dep": ";Google;;", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com;https://web.mit.edu;", "aff_unique_abbr": "UC Berkeley;Google;MIT;", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;1;0;1;1", "aff_country_unique": "United States;United Kingdom;" }, { "title": "Reducing sequential change detection to sequential estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34576", "id": "EZLsxOgcDg", "proceeding": "https://proceedings.mlr.press/v235/shekhar24a.html", "pdf": "https://openreview.net/pdf?id=EZLsxOgcDg", "openreview": "https://openreview.net/forum?id=EZLsxOgcDg", "author_site": "Shubhanshu Shekhar, Aaditya Ramdas", "tldr": "", "abstract": "We consider the problem of sequential change detection under minimal assumptions on the distribution generating the stream of observations. Formally, our goal is to design a scheme for detecting any changes in a parameter or functional $\\theta$ of the data stream distribution that has small detection delay, but guarantees control on the frequency of false alarms in the absence of changes. We describe a simple reduction from sequential change detection to sequential estimation using confidence sequences (CSs): begin a new level-$(1-\\alpha)$ CS at each time step, and proclaim a change as soon as the intersection of all active CSs becomes empty. We prove that the average run length of our scheme is at least $1/\\alpha$, resulting in a change detection scheme with minimal structural assumptions (thus allowing for possibly dependent observations, and nonparametric distribution classes), but strong guarantees. 
We also describe an interesting parallel with Lorden's reduction from change detection to sequential testing and connections to the recent ''e-detector'' framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shubhanshu Shekhar;Aaditya Ramdas", "authorids": "~Shubhanshu_Shekhar1;~Aaditya_Ramdas2", "gender": "M;M", "homepage": ";http://stat.cmu.edu/~aramdas", "dblp": "207/2181;117/3518", "google_scholar": "x_DHuO0AAAAJ;ZvFaPxUAAAAJ", "orcid": ";0000-0003-0497-311X", "linkedin": ";", "or_profile": "~Shubhanshu_Shekhar1;~Aaditya_Ramdas2", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu", "position": "Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nshekhar2024reducing,\ntitle={Reducing sequential change detection to sequential estimation},\nauthor={Shubhanshu Shekhar and Aaditya Ramdas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EZLsxOgcDg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 389181, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14139977631511778221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "andrew.cmu.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "3D-VLA: A 3D Vision-Language-Action Generative World Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34575", "id": "EZcFK8HupF", "proceeding": "https://proceedings.mlr.press/v235/zhen24a.html", "pdf": "https://openreview.net/pdf?id=EZcFK8HupF", "openreview": "https://openreview.net/forum?id=EZcFK8HupF", "author_site": "Haoyu Zhen, Xiaowen Qiu, Peihao Chen, Jincheng Yang, Xin Yan, Yilun Du, Yining Hong, Chuang Gan", "tldr": "", "abstract": "Recent vision-language-action (VLA) models rely on 2D inputs, lacking integration with the broader realm of the 3D physical world. Furthermore, they perform action prediction by learning a direct mapping from perception to action, neglecting the vast dynamics of the world and the relations between actions and dynamics. In contrast, human beings are endowed with world models that depict imagination about future scenarios to plan action accordingly. To this end, we propose 3D-VLA by introducing a new family of embodied foundation models that seamlessly link 3D perception, reasoning, and action through a generative world model. Specifically, 3D-VLA is built on top of a 3D-based large language model (LLM) and a set of action tokens is introduced to engage with the embodied environment. Furthermore, to inject generation abilities into the model, we train the embodied diffusion models and align them into the LLM for predicting the goal image and point cloud. To train our 3D-VLA, we curate a large-scale 3D embodied instruction dataset by extracting vast 3D-related information from existing robotics datasets. 
Our experiments on held-in datasets demonstrate that 3D-VLA significantly improves the reasoning, multimodality generation and planning capabilities in embodied environments, showcasing its potential in real-world applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyu Zhen;Xiaowen Qiu;Peihao Chen;Jincheng Yang;Xin Yan;Yilun Du;Yining Hong;Chuang Gan", "authorids": "~Haoyu_Zhen1;~Xiaowen_Qiu1;~Peihao_Chen1;~Jincheng_Yang2;~Xin_Yan3;~Yilun_Du1;~Yining_Hong1;~Chuang_Gan1", "gender": "M;M;M;M;M;;F;M", "homepage": "https://haoyuzhen.com;http://None;https://peihaochen.github.io/;https://github.com/Yang-Chincheng;https://cakeyan.github.io/;https://yilundu.github.io;https://evelinehong.github.io;http://people.csail.mit.edu/ganchuang/", "dblp": "353/0317;;249/8975;;71/4884-8;204/4379;245/3655;139/6993", "google_scholar": "_btLQY0AAAAJ;;KkpEXpsAAAAJ;;https://scholar.google.com/citations?hl=en;;PTYxORcAAAAJ;PTeSCbIAAAAJ", "orcid": ";;0000-0002-6847-1621;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Haoyu_Zhen1;~Xiaowen_Qiu1;~Peihao_Chen1;~Jincheng_Yang2;~Xin_Yan3;~Yilun_Du1;~Yining_Hong1;~Chuang_Gan1", "aff": "Shanghai Jiaotong University;University of Massachusetts at Amherst;South China University of Technology;Shanghai Jiaotong University;Wuhan University;Massachusetts Institute of Technology;University of California, Los Angeles;University of Massachusetts at Amherst", "aff_domain": "sjtu.edu.cn;umass.edu;scut.edu.cn;sjtu.edu.cn;whu.edu.cn;mit.edu;cs.ucla.edu;umass.edu", "position": "Undergrad student;MS student;PhD student;Undergrad student;Undergrad student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhen2024dvla,\ntitle={3D-{VLA}: A 3D Vision-Language-Action Generative World Model},\nauthor={Haoyu Zhen and Xiaowen Qiu and Peihao Chen and Jincheng Yang and Xin Yan and Yilun Du and Yining Hong and Chuang Gan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EZcFK8HupF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6302885, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12233849096972941018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "sjtu.edu.cn;umass.edu;scut.edu.cn;sjtu.edu.cn;whu.edu.cn;mit.edu;cs.ucla.edu;umass.edu", "author_num": 8, "aff_unique_index": "0;1;2;0;3;4;5;1", "aff_unique_norm": "Shanghai Jiao Tong University;University of Massachusetts Amherst;South China University of Technology;Wuhan University;Massachusetts Institute of Technology;University of California, Los Angeles", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.umass.edu;https://www.scut.edu.cn;http://www.whu.edu.cn/;https://web.mit.edu;https://www.ucla.edu", "aff_unique_abbr": "SJTU;UMass Amherst;SCUT;WHU;MIT;UCLA", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Amherst;Los Angeles", "aff_country_unique_index": "0;1;0;0;0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Position: The No Free Lunch Theorem, Kolmogorov Complexity, and the Role of Inductive Biases in Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34574", "id": "EaJ7nqJ2Fa", "proceeding": "https://proceedings.mlr.press/v235/goldblum24a.html", "pdf": "https://openreview.net/pdf?id=EaJ7nqJ2Fa", "openreview": "https://openreview.net/forum?id=EaJ7nqJ2Fa", 
"author_site": "Micah Goldblum, Marc Finzi, Keefer Rowan, Andrew Wilson", "tldr": "", "abstract": "No free lunch theorems for supervised learning state that no learner can solve all problems or that all learners achieve exactly the same accuracy on average over a uniform distribution on learning problems. Accordingly, these theorems are often referenced in support of the notion that individual problems require specially tailored inductive biases. While virtually all uniformly sampled datasets have high complexity, real-world problems disproportionately generate low-complexity data, and we argue that neural network models share this same preference, formalized using Kolmogorov complexity. Notably, we show that architectures designed for a particular domain, such as computer vision, can compress datasets on a variety of seemingly unrelated domains. Our experiments show that pre-trained and even randomly initialized language models prefer to generate low-complexity sequences. Whereas no free lunch theorems seemingly indicate that individual problems require specialized learners, we explain how tasks that often require human intervention such as picking an appropriately sized model when labeled data is scarce or plentiful can be automated into a single learning algorithm. These observations justify the trend in deep learning of unifying seemingly disparate problems with an increasingly small set of machine learning models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Micah Goldblum;Marc Anton Finzi;Keefer Rowan;Andrew Gordon Wilson", "authorids": "~Micah_Goldblum1;~Marc_Anton_Finzi1;~Keefer_Rowan1;~Andrew_Gordon_Wilson1", "gender": ";M;M;Not Specified", "homepage": ";https://mfinzi.github.io;https://cims.nyu.edu/~kjr9750/;https://cims.nyu.edu/~andrewgw", "dblp": "241/7231;222/3062;344/5763;65/10453", "google_scholar": "pGDKzuUAAAAJ;ysMAhlwAAAAJ;;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Micah_Goldblum1;~Marc_Anton_Finzi1;~Keefer_Rowan1;~Andrew_Gordon_Wilson1", "aff": "New York University;Carnegie Mellon University;NYU, New York University;New York University", "aff_domain": "nyu.edu;cmu.edu;cims.nyu.edu;nyu.edu", "position": "Postdoc;Postdoc;PhD student;Associate Professor", "bibtex": "@inproceedings{\ngoldblum2024position,\ntitle={Position: The No Free Lunch Theorem, Kolmogorov Complexity, and the Role of Inductive Biases in Machine Learning},\nauthor={Micah Goldblum and Marc Anton Finzi and Keefer Rowan and Andrew Gordon Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EaJ7nqJ2Fa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 514170, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7588377309069657340&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "nyu.edu;cmu.edu;cims.nyu.edu;nyu.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "New York University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.cmu.edu", "aff_unique_abbr": "NYU;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning from Memory: Non-Parametric Memory Augmented Self-Supervised Learning of Visual Features", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34573", "id": "Ed4KgHoKNe", "proceeding": "https://proceedings.mlr.press/v235/silva24c.html", "pdf": "https://openreview.net/pdf?id=Ed4KgHoKNe", "openreview": "https://openreview.net/forum?id=Ed4KgHoKNe", "author_site": "Thalles Silva, Helio Pedrini, Ad\u00edn Ram\u00edrez Rivera", "tldr": "", "abstract": "This paper introduces a novel approach to improving the training stability of self-supervised learning (SSL) methods by leveraging a non-parametric memory of seen concepts. The proposed method involves augmenting a neural network with a memory component to stochastically compare current image views with previously encountered concepts. Additionally, we introduce stochastic memory blocks to regularize training and enforce consistency between image views. We extensively benchmark our method on many vision tasks, such as linear probing, transfer learning, few-shot classification, and image retrieval on many datasets. The experimental results consolidate the effectiveness of the proposed approach in achieving stable SSL training without additional regularizers while learning highly transferable representations and requiring less computing time and resources.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thalles Silva;Helio Pedrini;Ad\u00edn Ram\u00edrez Rivera", "authorids": "~Thalles_Silva1;~Helio_Pedrini1;~Ad\u00edn_Ram\u00edrez_Rivera1", "gender": "M;M;M", "homepage": "https://www.ic.unicamp.br/~helio/;https://www.mn.uio.no/ifi/english/people/aca/adinr/;https://sthalles.github.io/", "dblp": "60/2361.html;85/9834;309/8339", "google_scholar": "https://scholar.google.com.br/citations?user=kc-wB9QAAAAJ;p2aLoZAAAAAJ;", "orcid": "0000-0003-0125-630X;0000-0002-4321-9075;", "linkedin": ";adinramirezrivera/;thalles-silva-32ab08a3/", "or_profile": "~Helio_Pedrini1;~Adin_Ramirez_Rivera1;~Thalles_Santos_Silva1", "aff": "Universidade Estadual de Campinas;University of Oslo;Universidade Estadual de Campinas", "aff_domain": "unicamp.br;uio.no;unicamp.br", "position": "Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nsilva2024learning,\ntitle={Learning from Memory: Non-Parametric Memory Augmented Self-Supervised Learning of Visual Features},\nauthor={Thalles Silva and Helio Pedrini and Ad{\\'\\i}n Ram{\\'\\i}rez Rivera},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ed4KgHoKNe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8274892, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7430276245884583254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "unicamp.br;uio.no;unicamp.br", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Universidade Estadual de Campinas;University of Oslo", "aff_unique_dep": ";", "aff_unique_url": "https://www.unicamp.br;https://www.uio.no", "aff_unique_abbr": "UNICAMP;UiO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Brazil;Norway" }, { "title": "Asymptotics of feature learning in two-layer networks after one gradient-step", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34572", "id": "EdRb84fiJY", "proceeding": "https://proceedings.mlr.press/v235/cui24d.html", "pdf": "https://openreview.net/pdf?id=EdRb84fiJY", "openreview": 
"https://openreview.net/forum?id=EdRb84fiJY", "author_site": "Hugo Cui, Luca Pesce, Yatin Dandi, FLORENT KRZAKALA, Yue Lu, Lenka Zdeborova, Bruno Loureiro", "tldr": "", "abstract": "In this manuscript, we investigate the problem of how two-layer neural networks learn features from data, and improve over the kernel regime, after being trained with a single gradient descent step. Leveraging the insight from (Ba et al., 2022), we model the trained network by a spiked Random Features (sRF) model. Further building on recent progress on Gaussian universality (Dandi et al., 2023), we provide an exact asymptotic description of the generalization error of the sRF in the high-dimensional limit where the number of samples, the width, and the input dimension grow at a proportional rate. The resulting characterization for sRFs also captures closely the learning curves of the original network model. This enables us to understand how adapting to the data is crucial for the network to efficiently learn non-linear functions in the direction of the gradient - where at initialization it can only express linear functions in this regime.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hugo Cui;Luca Pesce;Yatin Dandi;Florent Krzakala;Yue Lu;Lenka Zdeborova;Bruno Loureiro", "authorids": "~Hugo_Cui1;~Luca_Pesce1;~Yatin_Dandi1;~Florent_Krzakala1;~Yue_Lu1;~Lenka_Zdeborova1;~Bruno_Loureiro1", "gender": ";M;M;;M;F;M", "homepage": ";https://lucpoisson.github.io;https://yatindandi.github.io/;http://Krzakala.org;https://lu.seas.harvard.edu;http://artax.karlin.mff.cuni.cz/~zdebl9am/;https://brloureiro.github.io/", "dblp": ";321/1650;255/6032;25/1282;39/6975;27/6064.html;207/1834", "google_scholar": ";praGYvoAAAAJ;UiEzYkMAAAAJ;https://scholar.google.fr/citations?user=3jDeUlMAAAAJ;wc0FCZUAAAAJ;https://scholar.google.fr/citations?user=gkCjy_UAAAAJ;DXl3ir8AAAAJ", "orcid": ";;;0000-0003-2313-2578;;;0000-0002-6327-4688", "linkedin": ";;;;;;bruno-loureiro-43183b14a/", "or_profile": "~Hugo_Cui1;~Luca_Pesce1;~Yatin_Dandi1;~Florent_Krzakala1;~Yue_Lu1;~Lenka_Zdeborova1;~Bruno_Loureiro1", "aff": ";EPFL - EPF Lausanne;EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;School of Engineering and Applied Sciences, Harvard University;Swiss Federal Institute of Technology Lausanne;Ecole Normale Sup\u00e9rieure, Ecole Normale Sup\u00e9rieure de Paris", "aff_domain": ";epfl.ch;epfl.ch;epfl.ch;seas.harvard.edu;epfl.ch;di.ens.fr", "position": ";PhD student;PhD student;Full Professor;Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\ncui2024asymptotics,\ntitle={Asymptotics of feature learning in two-layer networks after one gradient-step},\nauthor={Hugo Cui and Luca Pesce and Yatin Dandi and Florent Krzakala and Yue Lu and Lenka Zdeborova and Bruno Loureiro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EdRb84fiJY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 690095, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3364612793971731564&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";epfl.ch;epfl.ch;epfl.ch;seas.harvard.edu;epfl.ch;di.ens.fr", "author_num": 7, "aff_unique_index": "0;0;1;2;1;3", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne;Harvard University;Ecole Normale Sup\u00e9rieure de Paris", "aff_unique_dep": ";;School of Engineering and 
Applied Sciences;", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.harvard.edu;https://www.ens.psl.eu", "aff_unique_abbr": "EPFL;EPFL;Harvard;ENS Paris", "aff_campus_unique_index": "0;0;0;1;0;2", "aff_campus_unique": "Lausanne;Cambridge;Paris", "aff_country_unique_index": "0;0;0;1;0;2", "aff_country_unique": "Switzerland;United States;France" }, { "title": "Position: Graph Foundation Models Are Already Here", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34571", "id": "Edz0QXKKAo", "proceeding": "https://proceedings.mlr.press/v235/mao24a.html", "pdf": "https://openreview.net/pdf?id=Edz0QXKKAo", "openreview": "https://openreview.net/forum?id=Edz0QXKKAo", "author_site": "Haitao Mao, Zhikai Chen, Wenzhuo Tang, Jianan Zhao, Yao Ma, Tong Zhao, Neil Shah, Mikhail Galkin, Jiliang Tang", "tldr": "", "abstract": "Graph Foundation Models (GFMs) are emerging as a significant research topic in the graph domain, aiming to develop graph models trained on extensive and diverse data to enhance their applicability across various tasks and domains. Developing GFMs presents unique challenges over traditional Graph Neural Networks (GNNs), which are typically trained from scratch for specific tasks on particular datasets. The primary challenge in constructing GFMs lies in effectively leveraging vast and diverse graph data to achieve positive transfer. Drawing inspiration from existing foundation models in the CV and NLP domains, we propose a novel perspective for the GFM development by advocating for a \"graph vocabulary'', in which the basic transferable units underlying graphs encode the invariance on graphs. We ground the graph vocabulary construction from essential aspects including network analysis, expressiveness, and stability. Such a vocabulary perspective can potentially advance the future GFM design in line with the neural scaling laws. 
All relevant resources with GFM design can be found here.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haitao Mao;Zhikai Chen;Wenzhuo Tang;Jianan Zhao;Yao Ma;Tong Zhao;Neil Shah;Mikhail Galkin;Jiliang Tang", "authorids": "~Haitao_Mao1;~Zhikai_Chen3;~Wenzhuo_Tang1;~Jianan_Zhao2;~Yao_Ma3;~Tong_Zhao3;~Neil_Shah2;~Mikhail_Galkin1;~Jiliang_Tang1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "http://currytang.github.io;;https://andyjzhao.github.io/;https://yaoma24.github.io/;https://tzhao.io/;http://nshah.net;https://migalkin.github.io/;https://www.cse.msu.edu/~tangjili/;", "dblp": "92/40;;135/9355-2;212/7871.html;94/6503-3;71/7771;160/8154;64/10812;", "google_scholar": "6hUny38AAAAJ;;https://scholar.google.com/citations?view_op=new_articles;wf9TTOIAAAAJ;05cRc-MAAAAJ;Qut69OgAAAAJ;yfYRbG4AAAAJ;WtzKMWAAAAAJ;3GmlKM4AAAAJ", "orcid": "0009-0009-7305-8629;;0000-0002-9743-7588;;0000-0001-7660-1732;0000-0003-3261-8430;;0000-0001-7125-3898;", "linkedin": ";wenzhuo-tang-66b757207;;;;;;;", "or_profile": "~Zhikai_Chen3;~Wenzhuo_Tang1;~Jianan_Zhao2;~Yao_Ma3;~Tong_Zhao3;~Neil_Shah2;~Mikhail_Galkin1;~Jiliang_Tang1;~Mao_Haitao1", "aff": "Michigan State University;Michigan State University;Universit\u00e9 de Montr\u00e9al;Rensselaer Polytechnic Institute;Snap Inc.;Snap Inc.;Intel;Michigan State University;Michigan State University", "aff_domain": "msu.edu;msu.edu;umontreal.ca;rpi.edu;snap.com;snap.com;intel.com;msu.edu;msu.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Researcher;Research Scientist;Researcher;Full Professor;PhD student", "bibtex": "@inproceedings{\nmao2024position,\ntitle={Position: Graph Foundation Models Are Already Here},\nauthor={Haitao Mao and Zhikai Chen and Wenzhuo Tang and Jianan Zhao and Yao Ma and Tong Zhao and Neil Shah and Mikhail Galkin and Jiliang Tang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Edz0QXKKAo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 382473, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6351192675309565134&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "msu.edu;msu.edu;umontreal.ca;rpi.edu;snap.com;snap.com;intel.com;msu.edu;msu.edu", "author_num": 9, "aff_unique_index": "0;0;1;2;3;3;4;0;0", "aff_unique_norm": "Michigan State University;Universit\u00e9 de Montr\u00e9al;Rensselaer Polytechnic Institute;Snap Inc.;Intel", "aff_unique_dep": ";;;;Intel Corporation", "aff_unique_url": "https://www.msu.edu;https://www.umontreal.ca;https://www.rpi.edu;https://www.snapinc.com;https://www.intel.com", "aff_unique_abbr": "MSU;UdeM;RPI;Snap;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0;0", "aff_country_unique": "United States;Canada" }, { "title": "On the Identifiability of Switching Dynamical Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34570", "id": "Eew3yUQQtE", "proceeding": "https://proceedings.mlr.press/v235/balsells-rodas24a.html", "pdf": "https://openreview.net/pdf?id=Eew3yUQQtE", "openreview": "https://openreview.net/forum?id=Eew3yUQQtE", "author_site": "Carles Balsells-Rodas, Yixin Wang, Yingzhen Li", "tldr": "", "abstract": "The identifiability of latent variable models has received increasing attention due to its relevance in interpretability and out-of-distribution generalisation. 
In this work, we study the identifiability of Switching Dynamical Systems, taking an initial step toward extending identifiability analysis to sequential latent variable models. We first prove the identifiability of Markov Switching Models, which commonly serve as the prior distribution for the continuous latent variables in Switching Dynamical Systems. We present identification conditions for first-order Markov dependency structures, whose transition distribution is parametrised via non-linear Gaussians. We then establish the identifiability of the latent variables and non-linear mappings in Switching Dynamical Systems up to affine transformations, by leveraging identifiability analysis techniques from identifiable deep latent variable models. We finally develop estimation algorithms for identifiable Switching Dynamical Systems. Through empirical studies, we demonstrate the practicality of identifiable Switching Dynamical Systems for segmenting high-dimensional time series such as videos, and showcase the use of identifiable Markov Switching Models for regime-dependent causal discovery in climate data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Carles Balsells-Rodas;Yixin Wang;Yingzhen Li", "authorids": "~Carles_Balsells-Rodas1;~Yixin_Wang1;~Yingzhen_Li1", "gender": ";F;M", "homepage": ";http://yingzhenli.net/home/en/;", "dblp": ";117/9230;", "google_scholar": "gFLW9qcAAAAJ;https://scholar.google.se/citations?hl=en;ZHmqn_AAAAAJ", "orcid": "0000-0002-6617-4842;;", "linkedin": ";;carles-balsells-rodas-a07911150/", "or_profile": "~Yixin_Wang1;~Yingzhen_Li1;~Carles_Balsells_Rodas1", "aff": "University of Michigan - Ann Arbor;Imperial College London;Imperial College London, Imperial College London", "aff_domain": "umich.edu;imperial.ac.uk;imperial.ac.uk", "position": "Assistant Professor;Associate Professor;PhD student", "bibtex": "@inproceedings{\nbalsells-rodas2024on,\ntitle={On the Identifiability of Switching Dynamical Systems},\nauthor={Carles Balsells-Rodas and Yixin Wang and Yingzhen Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Eew3yUQQtE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7695002, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=543694841544533237&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "umich.edu;imperial.ac.uk;imperial.ac.uk", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Michigan;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.umich.edu;https://www.imperial.ac.uk", "aff_unique_abbr": "UM;ICL", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Junk DNA Hypothesis: Pruning Small Pre-Trained Weights $\textit{Irreversibly}$ and $\textit{Monotonically}$ Impairs ``Difficult" Downstream Tasks in LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34569", "id": "EfUrTeuUfy", "proceeding": "https://proceedings.mlr.press/v235/yin24b.html", "pdf": "https://openreview.net/pdf?id=EfUrTeuUfy", "openreview": "https://openreview.net/forum?id=EfUrTeuUfy", "author_site": "Lu Yin, Ajay Jaiswal, Shiwei Liu, Souvik Kundu, Zhangyang \u201cAtlas\u201d Wang", "tldr": "", "abstract": "We present *Junk DNA 
Hypothesis* by adopting a novel *task-centric* angle for the pre-trained weights of large language models (LLMs). It has been believed that weights in LLMs contain significant redundancy, leading to the conception that a considerable chunk of the parameters can be removed by *pruning* without compromising performance. Contrary to this belief, this paper presents a *counter-argument*: small-magnitude weights of pre-trained models encode vital knowledge essential for tackling difficult downstream tasks - manifested as a **monotonic relationship** between the performance drop on downstream tasks and their difficulty, as we prune more pre-trained weights by magnitude. Moreover, we reveal that these seemingly inconsequential weights can result in **irreparable loss** of knowledge and performance degradation in difficult tasks, even when downstream continual training is allowed. Interestingly, our evaluations show that the other popular compression method, namely *quantization*, **fails** to exhibit a similar ``monotonic\" effect and does not as convincingly disentangle this task-difficulty information. To study this formally, we introduce several quantifiable metrics to *gauge the downstream task difficulty*: (a) within the same task category, and (b) across different task categories. Our extensive experiments substantiate the Junk DNA Hypothesis across a diverse range of model sizes, tasks, datasets, and even pruning methods. Code is available at https://github.com/VITA-Group/Junk_DNA_Hypothesis.git.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lu Yin;AJAY KUMAR JAISWAL;Shiwei Liu;Souvik Kundu;Zhangyang Wang", "authorids": "~Lu_Yin1;~AJAY_KUMAR_JAISWAL1;~Shiwei_Liu2;~Souvik_Kundu2;~Zhangyang_Wang1", "gender": ";M;M;M;M", "homepage": "https://luuyin.com/;https://ajay1994.github.io/;https://shiweiliuiiiiiii.github.io/;https://ksouvik52.github.io;https://vita-group.github.io", "dblp": "87/2528-6;30/9707;234/8697-3.html;126/2210;119/4026", "google_scholar": "G4Xe1NkAAAAJ;I783HxYAAAAJ;73IbXtsAAAAJ;https://scholar.google.com/citations?hl=en;pxFyKAIAAAAJ", "orcid": ";;;0000-0002-3533-9405;", "linkedin": ";;;souvik-kundu-64922b50/;", "or_profile": "~Lu_Yin1;~AJAY_KUMAR_JAISWAL1;~Shiwei_Liu2;~Souvik_Kundu2;~Zhangyang_Wang1", "aff": "University of Aberdeen;University of Texas, Austin;University of Oxford;Intel;University of Texas at Austin", "aff_domain": "abdn.ac.uk;utexas.edu;ox.ac.uk;intel.com;utexas.edu", "position": "Assistant Professor;PhD student;Postdoc;Researcher;Associate Professor", "bibtex": "@inproceedings{\nyin2024junk,\ntitle={Junk {DNA} Hypothesis: Pruning Small Pre-Trained Weights \${\textbackslash}textit\{Irreversibly\}\$ and \${\textbackslash}textit\{Monotonically\}\$ Impairs ``Difficult'' Downstream Tasks in {LLM}s},\nauthor={Lu Yin and AJAY KUMAR JAISWAL and Shiwei Liu and Souvik Kundu and Zhangyang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EfUrTeuUfy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 656719, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "abdn.ac.uk;utexas.edu;ox.ac.uk;intel.com;utexas.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "University of Aberdeen;University of Texas at Austin;University of Oxford;Intel", "aff_unique_dep": ";;;Intel Corporation", "aff_unique_url":
"https://www.abdn.ac.uk;https://www.utexas.edu;https://www.ox.ac.uk;https://www.intel.com", "aff_unique_abbr": "Aberdeen;UT Austin;Oxford;Intel", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Surprisingly Strong Performance Prediction with Neural Graph Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34568", "id": "EhPpZV6KLk", "proceeding": "https://proceedings.mlr.press/v235/kadlecova24a.html", "pdf": "https://openreview.net/pdf?id=EhPpZV6KLk", "openreview": "https://openreview.net/forum?id=EhPpZV6KLk", "author_site": "Gabriela Kadlecov\u00e1, Jovita Lukasik, Martin Pil\u00e1t, Petra Vidnerov\u00e1, Mahmoud Safari, Roman Neruda, Frank Hutter", "tldr": "", "abstract": "Performance prediction has been a key part of the neural architecture search (NAS) process, allowing to speed up NAS algorithms by avoiding resource-consuming network training. Although many performance predictors correlate well with ground truth performance, they require training data in the form of trained networks. Recently, zero-cost proxies have been proposed as an efficient method to estimate network performance without any training. However, they are still poorly understood, exhibit biases with network properties, and their performance is limited. Inspired by the drawbacks of zero-cost proxies, we propose neural graph features (GRAF), simple to compute properties of architectural graphs. GRAF offers fast and interpretable performance prediction while outperforming zero-cost proxies and other common encodings. In combination with other zero-cost proxies, GRAF outperforms most existing performance predictors at a fraction of the cost.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gabriela Kadlecov\u00e1;Jovita Lukasik;Martin Pil\u00e1t;Petra Vidnerov\u00e1;Mahmoud Safari;Roman Neruda;Frank Hutter", "authorids": "~Gabriela_Kadlecov\u00e11;~Jovita_Lukasik1;~Martin_Pil\u00e1t1;~Petra_Vidnerov\u00e11;~Mahmoud_Safari1;~Roman_Neruda1;~Frank_Hutter1", "gender": "F;M;F;M;;M;F", "homepage": "https://www.uni-mannheim.de/dws/people/researchers/phd-students/jovita-lukasik/;http://ktiml.mff.cuni.cz/~pilat/en/;http://www.cs.cas.cz/~petra;https://ml.informatik.uni-freiburg.de/profile/safari/;;http://ml.informatik.uni-freiburg.de/~hutter/;https://gabikadlecova.github.io/", "dblp": "255/4833;42/8571.html;;280/3542;41/358;89/5383;282/8436", "google_scholar": "https://scholar.google.de/citations?user=TpsZenwAAAAJ;7bCWbh0AAAAJ;Leez938AAAAJ;https://scholar.google.it/citations?user=ntPjyLwAAAAJ;https://scholar.google.cz/citations?user=bZRKRCMAAAAJ;https://scholar.google.de/citations?user=YUrxwrkAAAAJ;https://scholar.google.cz/citations?user=KGwHzzMAAAAJ", "orcid": ";0000-0003-1239-1566;;;0000-0003-2364-5357;0000-0002-2037-3694;0000-0002-4780-0633", "linkedin": ";;;;;frank-hutter-9190b24b/;gabriela-kadlecova/", "or_profile": "~Jovita_Lukasik1;~Martin_Pil\u00e1t1;~Petra_Vidnerov\u00e11;~Mahmoud_Safari1;~Roman_Neruda1;~Frank_Hutter1;~Gabriela_Suchop\u00e1rov\u00e11", "aff": "Universit\u00e4t Siegen;Charles University;The Czech Academy of Sciences;Universit\u00e4t Freiburg;Faculty of mathematics and physics, Charles University, Prague;Albert-Ludwigs-Universit\u00e4t Freiburg;Institute of Computer Science, The Czech Academy of Sciences", "aff_domain": "uni-siegen.de;mff.cuni.cz;cs.cas.cz;uni-freiburg.de;mff.cuni.cz;uni-freiburg.de;cs.cas.cz", 
"position": "Postdoc;Assistant Professor;Researcher;Postdoc;Lecturer;Full Professor;Researcher", "bibtex": "@inproceedings{\nkadlecov{\\'a}2024surprisingly,\ntitle={Surprisingly Strong Performance Prediction with Neural Graph Features},\nauthor={Gabriela Kadlecov{\\'a} and Jovita Lukasik and Martin Pil{\\'a}t and Petra Vidnerov{\\'a} and Mahmoud Safari and Roman Neruda and Frank Hutter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EhPpZV6KLk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1666187, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15574639271002136489&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "uni-siegen.de;mff.cuni.cz;cs.cas.cz;uni-freiburg.de;mff.cuni.cz;uni-freiburg.de;cs.cas.cz", "author_num": 7, "aff_unique_index": "0;1;2;3;1;4;2", "aff_unique_norm": "University of Siegen;Charles University;Czech Academy of Sciences;University of Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uni-siegen.de;https://www.cuni.cz;https://www.cas.cz;https://www.uni-freiburg.de;https://www.uni-freiburg.de", "aff_unique_abbr": "Uni Siegen;Charles U;CAS;Uni Freiburg;Albert-Ludwigs-Universit\u00e4t", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Prague;Freiburg", "aff_country_unique_index": "0;1;1;0;1;0;1", "aff_country_unique": "Germany;Czech Republic" }, { "title": "Benign Overfitting in Two-Layer ReLU Convolutional Neural Networks for XOR Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34567", "id": "EhU0xBSP4l", "proceeding": "https://proceedings.mlr.press/v235/meng24c.html", "pdf": "https://openreview.net/pdf?id=EhU0xBSP4l", "openreview": "https://openreview.net/forum?id=EhU0xBSP4l", "author_site": "Xuran Meng, Difan Zou, Yuan Cao", "tldr": "", "abstract": "Modern deep learning models are usually highly over-parameterized so that they can overfit the training data. Surprisingly, such overfitting neural networks can usually still achieve high prediction accuracy. To study this ``benign overfitting'' phenomenon, a line of recent works has theoretically studied the learning of linear models and two-layer neural networks. However, most of these analyses are still limited to the very simple learning problems where the Bayes-optimal classifier is linear. In this work, we investigate a class of XOR-type classification tasks with label-flipping noises. We show that, under a certain condition on the sample complexity and signal-to-noise ratio, an over-parameterized ReLU CNN trained by gradient descent can achieve near Bayes-optimal accuracy. Moreover, we also establish a matching lower bound result showing that when the previous condition is not satisfied, the prediction accuracy of the obtained CNN is an absolute constant away from the Bayes-optimal rate. 
Our result demonstrates that CNNs have a remarkable capacity to efficiently learn XOR problems, even in the presence of highly correlated features.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuran Meng;Difan Zou;Yuan Cao", "authorids": "~Xuran_Meng1;~Difan_Zou1;~Yuan_Cao1", "gender": "M;M;M", "homepage": ";https://difanzou.github.io/;https://yuancaohku.github.io/", "dblp": "307/5287;161/8923;", "google_scholar": "WwASl80AAAAJ;Cp4fcTQAAAAJ;-VGnHI4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xuran_Meng1;~Difan_Zou1;~Yuan_Cao1", "aff": "University of Hong Kong;University of Hong Kong;University of Hong Kong", "aff_domain": "hku.hk;hku.hk;hku.hk", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nmeng2024benign,\ntitle={Benign Overfitting in Two-Layer Re{LU} Convolutional Neural Networks for {XOR} Data},\nauthor={Xuran Meng and Difan Zou and Yuan Cao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EhU0xBSP4l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3802174, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2390669850575548429&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 8, "email": "hku.hk;hku.hk;hku.hk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Differentially Private Worst-group Risk Minimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34566", "id": "ElNxZ40tBJ", "proceeding": "https://proceedings.mlr.press/v235/zhou24b.html", "pdf": "https://openreview.net/pdf?id=ElNxZ40tBJ", "openreview": "https://openreview.net/forum?id=ElNxZ40tBJ", "author_site": "Xinyu Zhou, Raef Bassily", "tldr": "", "abstract": "We initiate a systematic study of worst-group risk minimization under $(\\epsilon, \\delta)$-differential privacy (DP). The goal is to privately find a model that approximately minimizes the maximal risk across $p$ sub-populations (groups) with different distributions, where each group distribution is accessed via a sample oracle. We first present a new algorithm that achieves excess worst-group population risk of $\\tilde{O}(\\frac{p\\sqrt{d}}{K\\epsilon} + \\sqrt{\\frac{p}{K}})$, where $K$ is the total number of samples drawn from all groups and $d$ is the problem dimension. Our rate is nearly optimal when each distribution is observed via a fixed-size dataset of size $K/p$. Our result is based on a new stability-based analysis for the generalization error. In particular, we show that $\\Delta$-uniform argument stability implies $\\tilde{O}(\\Delta + \\frac{1}{\\sqrt{n}})$ generalization error w.r.t. the worst-group risk, where $n$ is the number of samples drawn from each sample oracle. Next, we propose an algorithmic framework for worst-group population risk minimization using any DP online convex optimization algorithm as a subroutine. Hence, we give another excess risk bound of $\\tilde{O}\\left( \\sqrt{\\frac{d^{1/2}}{\\epsilon K}} +\\sqrt{\\frac{p}{K\\epsilon^2}} + \\sqrt{\\frac{p}{K}} \\right)$. 
Assuming the typical setting of $\\epsilon=\\Theta(1)$, this bound is more favorable than our first bound in a certain range of $p$ as a function of $K$ and $d$. Finally, we study differentially private worst-group *empirical* risk minimization in the offline setting, where each group distribution is observed by a fixed-size dataset. We present a new algorithm with nearly optimal excess risk of $\\tilde{O}(\\frac{p\\sqrt{d}}{K\\epsilon})$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyu Zhou;Raef Bassily", "authorids": "~Xinyu_Zhou4;~Raef_Bassily2", "gender": "M;M", "homepage": ";https://sites.google.com/view/rbassily", "dblp": ";88/8656", "google_scholar": ";C8qMVQUAAAAJ", "orcid": ";", "linkedin": "xinyu-zhou-226251127/;", "or_profile": "~Xinyu_Zhou4;~RAEF_BASSILY1", "aff": "Ohio State University, Columbus;Google", "aff_domain": "osu.edu;google.com", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nzhou2024differentially,\ntitle={Differentially Private Worst-group Risk Minimization},\nauthor={Xinyu Zhou and Raef Bassily},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ElNxZ40tBJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 372387, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3384056154235589759&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "osu.edu;google.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Ohio State University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.osu.edu;https://www.google.com", "aff_unique_abbr": "OSU;Google", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Columbus;Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Dual Operating Modes of In-Context Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34565", "id": "ElVHUWyL3n", "proceeding": "https://proceedings.mlr.press/v235/lin24l.html", "pdf": "https://openreview.net/pdf?id=ElVHUWyL3n", "openreview": "https://openreview.net/forum?id=ElVHUWyL3n", "author_site": "Ziqian Lin, Kangwook Lee", "tldr": "", "abstract": "In-context learning (ICL) exhibits dual operating modes: ***task learning***, i.e., acquiring a new skill from in-context samples, and ***task retrieval***, i.e., locating and activating a relevant pretrained skill. Recent theoretical work proposes various mathematical models to analyze ICL, but they cannot fully explain the duality. In this work, we analyze a generalized probabilistic model for pretraining data, obtaining a quantitative understanding of the two operating modes of ICL. Leveraging our analysis, we provide the first explanation of an unexplained phenomenon observed with real-world large language models (LLMs). Under some settings, the ICL risk initially increases and then decreases with more in-context examples. Our analysis offers a plausible explanation for this \"early ascent\" phenomenon: a limited number of in-context samples may lead to the retrieval of an incorrect skill, thereby increasing the risk, which will eventually diminish as task learning takes effect with more in-context samples. We also analyze ICL with biased labels, e.g., zero-shot ICL, where in-context examples are assigned random labels, and predict the bounded efficacy of such approaches. 
We corroborate our analysis and predictions with extensive experiments with Transformers and LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziqian Lin;Kangwook Lee", "authorids": "~Ziqian_Lin1;~Kangwook_Lee1", "gender": "M;M", "homepage": "https://myhakureimu.github.io/;http://kangwooklee.com/", "dblp": "245/3453;88/9826-1", "google_scholar": "0nOdbCoAAAAJ;sCEl8r-n5VEC", "orcid": ";", "linkedin": ";", "or_profile": "~Ziqian_Lin1;~Kangwook_Lee1", "aff": "University of Wisconsin - Madison;KRAFTON", "aff_domain": "wisc.edu;krafton.com", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nlin2024dual,\ntitle={Dual Operating Modes of In-Context Learning},\nauthor={Ziqian Lin and Kangwook Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ElVHUWyL3n}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2459896, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15276777566864631005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "wisc.edu;krafton.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;KRAFTON Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.krafton.com", "aff_unique_abbr": "UW-Madison;KRAFTON", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;South Korea" }, { "title": "KnowFormer: Revisiting Transformers for Knowledge Graph Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34564", "id": "EncFNR3hxM", "proceeding": "https://proceedings.mlr.press/v235/liu24au.html", "pdf": "https://openreview.net/pdf?id=EncFNR3hxM", "openreview": "https://openreview.net/forum?id=EncFNR3hxM", "author_site": "Junnan Liu, Qianren Mao, Weifeng Jiang, Jianxin Li", "tldr": "", "abstract": "Knowledge graph reasoning plays a vital role in various applications and has garnered considerable attention. Recently, path-based methods have achieved impressive performance. However, they may face limitations stemming from constraints in message-passing neural networks, such as missing paths and information over-squashing. In this paper, we revisit the application of transformers for knowledge graph reasoning to address the constraints faced by path-based methods and propose a novel method KnowFormer. KnowFormer utilizes a transformer architecture to perform reasoning on knowledge graphs from the message-passing perspective, rather than reasoning by textual information like previous pretrained language model based methods. Specifically, we define the attention computation based on the query prototype of knowledge graph reasoning, facilitating convenient construction and efficient optimization. To incorporate structural information into the self-attention mechanism, we introduce structure-aware modules to calculate query, key, and value respectively. Additionally, we present an efficient attention computation method for better scalability. 
Experimental results demonstrate the superior performance of KnowFormer compared to prominent baseline methods on both transductive and inductive benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junnan Liu;Qianren Mao;Weifeng Jiang;Jianxin Li", "authorids": "~Junnan_Liu1;~Qianren_Mao4;~Weifeng_Jiang2;~Jianxin_Li3", "gender": "M;M;;M", "homepage": "https://github.com/spankeran;;;http://myjianxin.github.io", "dblp": "206/8503;234/5350;211/3336;l/JianxinLi-2.html", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=PnDqlPkAAAAJ;;EY2lqD0AAAAJ", "orcid": ";0000-0003-0780-0628;0009-0008-2314-0299;0000-0001-5152-0055", "linkedin": ";%E4%B9%BE%E4%BB%BB-%E6%AF%9B-574534326/;;", "or_profile": "~Junnan_Liu1;~Qianren_Mao4;~Weifeng_Jiang2;~Jianxin_Li3", "aff": "Beihang University;Beihang University;Nanyang Technological University;Beihang University ", "aff_domain": "buaa.edu.cn;buaa.edu.cn;ntu.edu.sg;buaa.edu.cn", "position": "MS student;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nliu2024knowformer,\ntitle={KnowFormer: Revisiting Transformers for Knowledge Graph Reasoning},\nauthor={Junnan Liu and Qianren Mao and Weifeng Jiang and Jianxin Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EncFNR3hxM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 738662, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8677158030452966708&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "buaa.edu.cn;buaa.edu.cn;ntu.edu.sg;buaa.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Beihang University;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.ntu.edu.sg", "aff_unique_abbr": "BUAA;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;Singapore" }, { "title": "Probabilistic Generating Circuits - Demystified", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34563", "id": "EqFxIbGWRU", "proceeding": "https://proceedings.mlr.press/v235/agarwal24c.html", "pdf": "https://openreview.net/pdf?id=EqFxIbGWRU", "openreview": "https://openreview.net/forum?id=EqFxIbGWRU", "author_site": "Sanyam Agarwal, Markus Bl\u00e4ser", "tldr": "", "abstract": "Zhang et al. (ICML 2021, PMLR 139, pp. 12447\u201312457) introduced probabilistic generating circuits (PGCs) as a probabilistic model to unify probabilistic circuits (PCs) and determinantal point processes (DPPs). At first glance, PGCs store a distribution in a very different way: they compute the probability generating polynomial instead of the probability mass function, and it seems that this is the main reason why PGCs are more powerful than PCs or DPPs. However, PGCs also allow for negative weights, whereas classical PCs assume that all weights are nonnegative. One main insight of this work is that the negative weights are the cause for the power of PGCs and not the different representation. PGCs are PCs in disguise: we show how to transform any PGC on binary variables into a PC with negative weights with only polynomial blowup. PGCs were defined by Zhang et al. only for binary random variables.
As our second main result, we show that there is a good reason for this: we prove that PGCs for categorical variables with larger image size do not support tractable marginalization unless NP=P. On the other hand, we show that we can model categorical variables with larger image size as PCs with negative weights computing set-multilinear polynomials. These allow for tractable marginalization. In this sense, PCs with negative weights strictly subsume PGCs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanyam Agarwal;Markus Bl\u00e4ser", "authorids": "~Sanyam_Agarwal2;~Markus_Bl\u00e4ser1", "gender": "M;M", "homepage": ";https://cc.cs.uni-saarland.de/mblaeser/", "dblp": ";95/6062", "google_scholar": ";AB0CIBAAAAAJ", "orcid": ";0000-0002-1750-9036", "linkedin": "sanyam-agarwal-97326114a;", "or_profile": "~Sanyam_Agarwal2;~Markus_Blaeser1", "aff": "Saarland University, Universit\u00e4t des Saarlandes;Saarland University, Universit\u00e4t des Saarlandes", "aff_domain": "cs.uni-saarland.de;cs.uni-saarland.de", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nagarwal2024probabilistic,\ntitle={Probabilistic Generating Circuits - Demystified},\nauthor={Sanyam Agarwal and Markus Bl{\"a}ser},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EqFxIbGWRU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 393654, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17864837554190106442&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cs.uni-saarland.de;cs.uni-saarland.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Saarland University", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-saarland.de", "aff_unique_abbr": "Saarland U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Binning as a Pretext Task: Improving Self-Supervised Learning in Tabular Domains", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34562", "id": "ErkzxOlOLy", "proceeding": "https://proceedings.mlr.press/v235/lee24v.html", "pdf": "https://openreview.net/pdf?id=ErkzxOlOLy", "openreview": "https://openreview.net/forum?id=ErkzxOlOLy", "author_site": "Kyungeun Lee, Ye Seul Sim, Hye-Seung Cho, Moonjung Eo, Suhee Yoon, Sanghyu Yoon, Woohyung Lim", "tldr": "", "abstract": "The ability of deep networks to learn superior representations hinges on leveraging the proper inductive biases, considering the inherent properties of datasets. In tabular domains, it is critical to effectively handle heterogeneous features (both categorical and numerical) in a unified manner and to grasp irregular functions like piecewise constant functions. To address the challenges in the self-supervised learning framework, we propose a novel pretext task based on the classical binning method. The idea is straightforward: reconstructing the bin indices (either orders or classes) rather than the original values. This pretext task provides the encoder with an inductive bias to capture the irregular dependencies, mapping from continuous inputs to discretized bins, and mitigates the feature heterogeneity by setting all features to have category-type targets.
Our empirical investigations ascertain several advantages of binning: capturing the irregular function, compatibility with encoder architecture and additional modifications, standardizing all features into equal sets, grouping similar values within a feature, and providing ordering information. Comprehensive evaluations across diverse tabular datasets corroborate that our method consistently improves tabular representation learning performance for a wide range of downstream tasks. The code is available at https://github.com/kyungeun-lee/tabularbinning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyungeun Lee;Ye Seul Sim;Hyeseung Cho;Moonjung Eo;Suhee Yoon;Sanghyu Yoon;Woohyung Lim", "authorids": "~Kyungeun_Lee1;~Ye_Seul_Sim1;~Hyeseung_Cho1;~Moonjung_Eo1;~Suhee_Yoon1;~Sanghyu_Yoon1;~Woohyung_Lim1", "gender": "F;F;F;F;F;F;M", "homepage": "https://sites.google.com/view/cvkyungeunlee/;;https://www.lgresearch.ai/ourwork/research?tab=PD;;https://sites.google.com/view/suheeyoon;;", "dblp": "230/3844;377/9202;169/2928.html;274/0874;315/6798;377/9441.html;86/7195", "google_scholar": "ASy-_MEAAAAJ;;;3gt2shwAAAAJ;JMaHBwgAAAAJ;napP2_oAAAAJ;https://scholar.google.co.kr/citations?user=gtvxdcUAAAAJ", "orcid": "0000-0002-1674-7147;0009-0006-5082-5790;0009-0009-4165-7643;0000-0002-0114-8010;0000-0003-3496-6578;0009-0007-6301-6922;0000-0003-0525-9065", "linkedin": ";ye-seul-sim-664320139;;;suheey/?originalSubdomain=kr;hyu0901/;woohyunglim/", "or_profile": "~Kyungeun_Lee1;~Ye_Seul_Sim1;~Hyeseung_Cho1;~Moonjung_Eo1;~Suhee_Yoon1;~Sanghyu_Yoon1;~Woohyung_Lim1", "aff": "LG AI Research;LG AI Research;LG AI Research;LG AI Research;LG AI Research;LG AI Research;LG AI Research", "aff_domain": "lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai", "position": "Researcher;Researcher;Researcher;Researcher;Researcher;Researcher;Vice President", "bibtex": "@inproceedings{\nlee2024binning,\ntitle={Binning as a Pretext Task: Improving Self-Supervised Learning in Tabular Domains},\nauthor={Kyungeun Lee and Ye Seul Sim and Hyeseung Cho and Moonjung Eo and Suhee Yoon and Sanghyu Yoon and Woohyung Lim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ErkzxOlOLy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 890150, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1724767256159466789&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai;lgresearch.ai", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "LG", "aff_unique_dep": "LG AI Research", "aff_unique_url": "https://www.lgresearch.ai", "aff_unique_abbr": "LG AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "MusicRL: Aligning Music Generation to Human Preferences", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34561", "id": "EruV94XRDs", "proceeding": "https://proceedings.mlr.press/v235/cideron24a.html", "pdf": "https://openreview.net/pdf?id=EruV94XRDs", "openreview": "https://openreview.net/forum?id=EruV94XRDs", "author_site": "Geoffrey Cideron, Sertan Girgin, Mauro Verzetti, Damien Vincent, Matej Kastelic, Zal\u00e1n Borsos, Brian McWilliams, Victor
Ungureanu, Olivier Bachem, Olivier Pietquin, Matthieu Geist, L\u00e9onard Hussenot, Neil Zeghidour, Andrea Agostinelli", "tldr": "", "abstract": "We propose MusicRL, the first music generation system finetuned from human feedback. Appreciation of text-to-music models is particularly subjective since the concept of musicality as well as the specific intention behind a caption are user-dependent (e.g. a caption such as \u201cupbeat workout music\u201d can map to a retro guitar solo or a technopop beat). Not only does this make supervised training of such models challenging, but it also calls for integrating continuous human feedback in their post-deployment finetuning. MusicRL is a pretrained autoregressive [MusicLM](https://arxiv.org/abs/2301.11325) model of discrete audio tokens finetuned with reinforcement learning to maximize sequence-level rewards. We design reward functions related specifically to text-adherence and audio quality with the help of selected raters, and use those to finetune MusicLM into MusicRL-R. We deploy MusicLM to users and collect a substantial dataset comprising 300,000 pairwise preferences. Using Reinforcement Learning from Human Feedback (RLHF), we train MusicRL-U, the first text-to-music model that incorporates human feedback at scale. Human evaluations show that both MusicRL-R and MusicRL-U are preferred to the baseline. Ultimately, MusicRL-RU combines the two approaches and results in the best model according to human raters. Ablation studies shed light on the musical attributes influencing human preferences, indicating that text adherence and quality only account for a part of it. This underscores the prevalence of subjectivity in musical appreciation and calls for further involvement of human listeners in the finetuning of music generation models.
Samples can be found at google-research.github.io/seanet/musiclm/rlhf/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Geoffrey Cideron;Sertan Girgin;Mauro Verzetti;Damien Vincent;Matej Kastelic;Zal\u00e1n Borsos;Brian McWilliams;Victor Ungureanu;Olivier Bachem;Olivier Pietquin;Matthieu Geist;Leonard Hussenot;Neil Zeghidour;Andrea Agostinelli", "authorids": "~Geoffrey_Cideron1;~Sertan_Girgin1;~Mauro_Verzetti1;~Damien_Vincent1;~Matej_Kastelic1;~Zal\u00e1n_Borsos1;~Brian_McWilliams2;ungureanu@google.com;~Olivier_Bachem1;~Olivier_Pietquin1;~Matthieu_Geist1;~Leonard_Hussenot1;~Neil_Zeghidour1;~Andrea_Agostinelli1", "gender": "M;;;;;Unspecified;M;;M;M;M;;M;", "homepage": ";;;;;;https://sites.google.com/view/mcbrian/;;http://www.olivierbachem.ch/;http://www.cristal.univ-lille.fr/~pietquin/;;;;", "dblp": ";;;43/9268;;135/7963;;;https://dblp.org/pers/hd/b/Bachem:Olivier;58/6269;38/6508;241/9657;180/2570;", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;;;https://scholar.google.ch/citations?user=IS4VSXAAAAAJ;;https://scholar.google.ch/citations?user=mW9BcgsAAAAJ;8K8-LdwAAAAJ;ectPLEUAAAAJ;nTdWO9MAAAAJ;;NM85zIEAAAAJ", "orcid": ";;0000-0001-9958-0663;;;;;;;;;;;", "linkedin": ";;;;matej-kastelic/;;;;olivier-bachem-10257756/;opietquin/;;;;", "or_profile": "~Geoffrey_Cideron1;~Sertan_Girgin1;~Mauro_Verzetti1;~Damien_Vincent1;~Matej_Kastelic1;~Zal\u00e1n_Borsos1;~Brian_McWilliams2;ungureanu@google.com;~Olivier_Bachem1;~Olivier_Pietquin1;~Matthieu_Geist1;~Leonard_Hussenot1;~Neil_Zeghidour1;~Andrea_Agostinelli1", "aff": "Google;;Google;Google;Research, Google;Google;Google;;Google Brain;Cohere;Google;;Kyutai;Google", "aff_domain": "google.com;;google.com;google.com;research.google.com;google.com;google.com;;google.com;cohere.com;google.com;;kyutai.org;google.com", "position": "Research Engineer;;Software engineer;PhD student;Researcher;Research Software Engineer;Research Scientist;;Research scientist;Director of Research;Researcher;;Researcher;Researcher", "bibtex": "@inproceedings{\ncideron2024musicrl,\ntitle={Music{RL}: Aligning Music Generation to Human Preferences},\nauthor={Geoffrey Cideron and Sertan Girgin and Mauro Verzetti and Damien Vincent and Matej Kastelic and Zal{\'a}n Borsos and Brian McWilliams and Victor Ungureanu and Olivier Bachem and Olivier Pietquin and Matthieu Geist and Leonard Hussenot and Neil Zeghidour and Andrea Agostinelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EruV94XRDs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 632795, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 14, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2430105600455446067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "google.com;;google.com;google.com;research.google.com;google.com;google.com;;google.com;cohere.com;google.com;;kyutai.org;google.com", "author_num": 14, "aff_unique_index": "0;0;0;0;0;0;0;1;0;2;0", "aff_unique_norm": "Google;Cohere;Kyutai", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://cohere.ai;https://kyutai.org", "aff_unique_abbr": "Google;;Kyutai", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;1;0", "aff_country_unique": "United States;France" }, { "title": "A Tensor Decomposition Perspective on Second-order RNNs", "status":
"Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34560", "id": "EsSSDjwFra", "proceeding": "https://proceedings.mlr.press/v235/lizaire24a.html", "pdf": "https://openreview.net/pdf?id=EsSSDjwFra", "openreview": "https://openreview.net/forum?id=EsSSDjwFra", "author_site": "Maude Lizaire, Michael Rizvi-Martel, Marawan Gamal, Guillaume Rabusseau", "tldr": "", "abstract": "Second-order Recurrent Neural Networks (2RNNs) extend RNNs by leveraging second-order interactions for sequence modelling. These models are provably more expressive than their first-order counterparts and have connections to well-studied models from formal language theory. However, their large parameter tensor makes computations intractable. To circumvent this issue, one approach known as MIRNN consists in limiting the type of interactions used by the model. Another is to leverage tensor decomposition to diminish the parameter count. In this work, we study the model resulting from parameterizing 2RNNs using the CP decomposition, which we call CPRNN. Intuitively, the rank of the decomposition should reduce expressivity. We analyze how rank and hidden size affect model capacity and show the relationships between RNNs, 2RNNs, MIRNNs, and CPRNNs based on these parameters. We support these results empirically with experiments on the Penn Treebank dataset which demonstrate that, with a fixed parameter budget, CPRNNs outperforms RNNs, 2RNNs, and MIRNNs with the right choice of rank and hidden size.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maude Lizaire;Michael Rizvi-Martel;Marawan Gamal;Guillaume Rabusseau", "authorids": "~Maude_Lizaire1;~Michael_Rizvi-Martel1;~Marawan_Gamal1;~Guillaume_Rabusseau1", "gender": "F;M;;M", "homepage": ";https://michaelrizvi.github.io/;https://marawangamal.github.io/;https://www-labs.iro.umontreal.ca/~grabus/", "dblp": ";;;143/7327", "google_scholar": ";;;https://scholar.google.fr/citations?user=t2i4V4EAAAAJ", "orcid": ";;;", "linkedin": "maude-lizaire-315a12141;;;", "or_profile": "~Maude_Lizaire1;~Michael_Rizvi-Martel1;~Marawan_Gamal1;~Guillaume_Rabusseau1", "aff": "Universit\u00e9 de Montr\u00e9al;Universit\u00e9 de Montr\u00e9al;MILA - Quebec AI Institute;Universit\u00e9 de Montr\u00e9al", "aff_domain": "umontreal.ca;umontreal.ca;mila.quebec;umontreal.ca", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nlizaire2024a,\ntitle={A Tensor Decomposition Perspective on Second-order {RNN}s},\nauthor={Maude Lizaire and Michael Rizvi-Martel and Marawan Gamal and Guillaume Rabusseau},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EsSSDjwFra}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1769753, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2JReoF7KDBwJ:scholar.google.com/&scioq=A+Tensor+Decomposition+Perspective+on+Second-order+RNNs&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "umontreal.ca;umontreal.ca;mila.quebec;umontreal.ca", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Quebec AI Institute", "aff_unique_dep": ";MILA", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec", "aff_unique_abbr": "UdeM;MILA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": 
"Canada" }, { "title": "Diffusion Rejection Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34559", "id": "EsWJ5wd2ir", "proceeding": "https://proceedings.mlr.press/v235/na24a.html", "pdf": "https://openreview.net/pdf?id=EsWJ5wd2ir", "openreview": "https://openreview.net/forum?id=EsWJ5wd2ir", "author_site": "Byeonghu Na, Yeongmin Kim, Minsang Park, Donghyeok Shin, Wanmo Kang, IL CHUL MOON", "tldr": "", "abstract": "Recent advances in powerful pre-trained diffusion models encourage the development of methods to improve the sampling performance under well-trained diffusion models. This paper introduces Diffusion Rejection Sampling (DiffRS), which uses a rejection sampling scheme that aligns the sampling transition kernels with the true ones at each timestep. The proposed method can be viewed as a mechanism that evaluates the quality of samples at each intermediate timestep and refines them with varying effort depending on the sample. Theoretical analysis shows that DiffRS can achieve a tighter bound on sampling error compared to pre-trained models. Empirical results demonstrate the state-of-the-art performance of DiffRS on the benchmark datasets and the effectiveness of DiffRS for fast diffusion samplers and large-scale text-to-image diffusion models. Our code is available at https://github.com/aailabkaist/DiffRS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Byeonghu Na;Yeongmin Kim;Minsang Park;Donghyeok Shin;Wanmo Kang;Il-chul Moon", "authorids": "~Byeonghu_Na1;~Yeongmin_Kim1;~Minsang_Park1;~Donghyeok_Shin2;~Wanmo_Kang1;~Il-chul_Moon1", "gender": "M;M;M;;M;", "homepage": "https://sites.google.com/view/byeonghu-na;https://sites.google.com/view/yeongmin-space/%ED%99%88;;;https://sites.google.com/site/wanmokang/;", "dblp": "276/5100;;;;;", "google_scholar": "https://scholar.google.co.kr/citations?user=mJoqpmEAAAAJ;SBF13JUAAAAJ;https://scholar.google.co.kr/citations?user=PhyT2gQAAAAJ;;;", "orcid": "0000-0003-3463-2674;;;;;", "linkedin": "byeonghu-na-17942120b/;;;;;", "or_profile": "~Byeonghu_Na1;~Yeongmin_Kim1;~Minsang_Park1;~Donghyeok_Shin2;~Wanmo_Kang1;~Il-chul_Moon1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;;;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;;;", "position": "PhD student;PhD student;MS student;;;", "bibtex": "@inproceedings{\nna2024diffusion,\ntitle={Diffusion Rejection Sampling},\nauthor={Byeonghu Na and Yeongmin Kim and Minsang Park and Donghyeok Shin and Wanmo Kang and Il-chul Moon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EsWJ5wd2ir}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7106304, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7002084906569142189&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;;;", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Practical Hamiltonian Monte Carlo on Riemannian Manifolds via 
Relativity Theory", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34558", "id": "Et8Pk97u4u", "proceeding": "https://proceedings.mlr.press/v235/xu24i.html", "pdf": "https://openreview.net/pdf?id=Et8Pk97u4u", "openreview": "https://openreview.net/forum?id=Et8Pk97u4u", "author_site": "Kai Xu, Hong Ge", "tldr": "", "abstract": "Hamiltonian Monte Carlo (HMC) samples from an unnormalized density by numerically integrating Hamiltonian dynamics. Girolami & Calderhead (2011) extend HMC to Riemannian manifolds, but the resulting method faces integration instability issues for practical usage. While previous works have tackled this challenge by using more robust metric tensors than Fisher's information metric, our work focuses on designing numerically stable Hamiltonian dynamics. To do so, we start with the idea from Lu et al. (2017), which designs momentum distributions to upper-bound the particle speed. Then, we generalize this Lu et al. (2017) method to Riemannian manifolds. In our generalization, the upper bounds of velocity norm become position-dependent, which intrinsically limits step sizes used in high curvature regions and, therefore, significantly reduces numerical errors. We also derive a more tractable algorithm to sample from relativistic momentum distributions without relying on the mean-field assumption.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Xu;Hong Ge", "authorids": "~Kai_Xu4;~Hong_Ge1", "gender": "M;M", "homepage": "https://xuk.ai;", "dblp": ";31/835", "google_scholar": "https://scholar.google.ca/citations?user=kf3C60wAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Kai_Xu4;~Hong_Ge1", "aff": "Amazon;University of Cambridge", "aff_domain": "amazon.com;cam.ac.uk", "position": "Research scientist;Senior Research Fellow", "bibtex": "@inproceedings{\nxu2024practical,\ntitle={Practical Hamiltonian Monte Carlo on Riemannian Manifolds via Relativity Theory},\nauthor={Kai Xu and Hong Ge},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Et8Pk97u4u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 879345, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Sgz_pP773lEJ:scholar.google.com/&scioq=Practical+Hamiltonian+Monte+Carlo+on+Riemannian+Manifolds+via+Relativity+Theory&hl=en&as_sdt=0,23", "gs_version_total": 5, "email": "amazon.com;cam.ac.uk", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;University of Cambridge", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.cam.ac.uk", "aff_unique_abbr": "Amazon;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Feedback Loops With Language Models Drive In-Context Reward Hacking", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34557", "id": "EvHWlYTLWe", "proceeding": "https://proceedings.mlr.press/v235/pan24d.html", "pdf": "https://openreview.net/pdf?id=EvHWlYTLWe", "openreview": "https://openreview.net/forum?id=EvHWlYTLWe", "author_site": "Alexander Pan, Erik Jones, Meena Jagadeesan, Jacob Steinhardt", "tldr": "", "abstract": "Language models influence the external world: they query APIs that read and write to web pages, generate content that 
shapes human behavior, and run system commands as autonomous agents. These interactions form feedback loops: LLM outputs affect the world, which in turn affects subsequent LLM outputs. In this work, we show that feedback loops can cause in-context reward hacking (ICRH), where the LLM at test-time optimizes a (potentially implicit) objective but creates negative side effects in the process. For example, consider an LLM agent deployed to increase Twitter engagement; the LLM may retrieve its previous tweets into the context window and make them more controversial, increasing engagement but also toxicity. We identify and study two processes that lead to ICRH: output-refinement and policy-refinement. For these processes, evaluations on static datasets are insufficient---they miss the feedback effects and thus cannot capture the most harmful behavior. In response, we provide three recommendations for evaluation to capture more instances of ICRH. As AI development accelerates, the effects of feedback loops will proliferate, increasing the need to understand their role in shaping LLM behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Pan;Erik Jones;Meena Jagadeesan;Jacob Steinhardt", "authorids": "~Alexander_Pan1;~Erik_Jones3;~Meena_Jagadeesan1;~Jacob_Steinhardt1", "gender": "M;M;F;", "homepage": "https://aypan17.github.io;http://people.eecs.berkeley.edu/~erjones/;https://mjagadeesan.github.io;", "dblp": "304/3394;264/5304;205/2407;35/10625", "google_scholar": "PaltSA0AAAAJ;_-CU2CsAAAAJ;XW62DrcAAAAJ;", "orcid": ";;;", "linkedin": "alexander-pan-0567a2102/;erik-jones-879239133/;;", "or_profile": "~Alexander_Pan1;~Erik_Jones3;~Meena_Jagadeesan1;~Jacob_Steinhardt1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\npan2024feedback,\ntitle={Feedback Loops With Language Models Drive In-Context Reward Hacking},\nauthor={Alexander Pan and Erik Jones and Meena Jagadeesan and Jacob Steinhardt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EvHWlYTLWe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 649437, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14811123547164988465&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Compute Better Spent: Replacing Dense Layers with Structured Matrices", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34556", "id": "ExHTFXEhc9", "proceeding": "https://proceedings.mlr.press/v235/qiu24f.html", "pdf": "https://openreview.net/pdf?id=ExHTFXEhc9", "openreview": "https://openreview.net/forum?id=ExHTFXEhc9", "author_site": "Shikai Qiu, Andres Potapczynski, Marc Finzi, Micah Goldblum, Andrew Wilson", "tldr": "",
"abstract": "Dense linear layers are the dominant computational bottleneck in foundation models. Identifying more efficient alternatives to dense matrices has enormous potential for building more compute-efficient models, as exemplified by the success of convolutional networks in the image domain. In this work, we systematically explore structured matrices as replacements for dense matrices. We show that different structures often require drastically different initialization scales and learning rates, which are crucial to performance, especially as models scale. Using insights from the Maximal Update Parameterization, we determine the optimal scaling for initialization and learning rates of these unconventional layers. Finally, we measure the scaling laws of different structures to compare how quickly their performance improves with compute. We propose a novel matrix family containing Monarch matrices, the Block Tensor-Train (BTT), which we show performs better than dense matrices for the same compute on multiple tasks. On CIFAR-10/100 with augmentation, BTT achieves exponentially lower training loss than dense when training MLPs and ViTs. BTT matches dense ViT-S/32 performance on ImageNet-1k with 3.8 times less compute and is more efficient than dense for training small GPT-2 language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikai Qiu;Andres Potapczynski;Marc Anton Finzi;Micah Goldblum;Andrew Gordon Wilson", "authorids": "~Shikai_Qiu1;~Andres_Potapczynski3;~Marc_Anton_Finzi1;~Micah_Goldblum1;~Andrew_Gordon_Wilson1", "gender": "M;;M;;Not Specified", "homepage": "https://shikaiqiu.github.io/;https://andpotap.com/;https://mfinzi.github.io;;https://cims.nyu.edu/~andrewgw", "dblp": ";255/7271;222/3062;241/7231;65/10453", "google_scholar": "pK0OAsQAAAAJ;;ysMAhlwAAAAJ;pGDKzuUAAAAJ;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Shikai_Qiu1;~Andres_Potapczynski3;~Marc_Anton_Finzi1;~Micah_Goldblum1;~Andrew_Gordon_Wilson1", "aff": "New York University;New York University;Carnegie Mellon University;New York University;New York University", "aff_domain": "nyu.edu;nyu.edu;cmu.edu;nyu.edu;nyu.edu", "position": "PhD student;PhD student;Postdoc;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nqiu2024compute,\ntitle={Compute Better Spent: Replacing Dense Layers with Structured Matrices},\nauthor={Shikai Qiu and Andres Potapczynski and Marc Anton Finzi and Micah Goldblum and Andrew Gordon Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ExHTFXEhc9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 790937, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12498828059357026504&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": "nyu.edu;nyu.edu;cmu.edu;nyu.edu;nyu.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "New York University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.cmu.edu", "aff_unique_abbr": "NYU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Role of Learning Algorithms in Collective Action", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34555", "id": "Ez3Lckpe4l", "proceeding": "https://proceedings.mlr.press/v235/ben-dov24a.html", "pdf": "https://openreview.net/pdf?id=Ez3Lckpe4l", "openreview": "https://openreview.net/forum?id=Ez3Lckpe4l", "author_site": "Omri Ben-Dov, Jake Fawkes, Samira Samadi, Amartya Sanyal", "tldr": "", "abstract": "Collective action in machine learning is the study of the control that a coordinated group can have over machine learning algorithms. While previous research has concentrated on assessing the impact of collectives against Bayes (sub-)optimal classifiers, this perspective is limited in that it does not account for the choice of learning algorithm. Since classifiers seldom behave like Bayes classifiers and are influenced by the choice of learning algorithms along with their inherent biases, in this work we initiate the study of how the choice of the learning algorithm plays a role in the success of a collective in practical settings. Specifically, we focus on distributionally robust optimization (DRO), popular for improving a worst group error, and on the ubiquitous stochastic gradient descent (SGD), due to its inductive bias for \"simpler\" functions. Our empirical results, supported by a theoretical foundation, show that the effective size and success of the collective are highly dependent on properties of the learning algorithm. This highlights the necessity of taking the learning algorithm into account when studying the impact of collective action in machine learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Omri Ben-Dov;Jake Fawkes;Samira Samadi;Amartya Sanyal", "authorids": "~Omri_Ben-Dov1;~Jake_Fawkes1;~Samira_Samadi1;~Amartya_Sanyal1", "gender": ";M;F;M", "homepage": ";http://csml.stats.ox.ac.uk/people/;http://www.samirasamadi.com;https://amartya18x.github.io", "dblp": "322/9360;;https://dblp.uni-trier.de/pers/hd/s/Samadi:Samira;203/8807", "google_scholar": ";;s8xc2K4AAAAJ;", "orcid": ";;;0000-0002-4190-0449", "linkedin": "omri-ben-dov;;samira-samadi-200662108/;", "or_profile": "~Omri_Ben-Dov1;~Jake_Fawkes1;~Samira_Samadi1;~Amartya_Sanyal1", "aff": "Max Planck Institute for Intelligent Systems, Max-Planck Institute;University of Oxford;Max Planck Institute for Intelligent Systems, Max-Planck Institute;Max-Planck Institute", "aff_domain": "tuebingen.mpg.de;oxford.ac.uk;tuebingen.mpg.de;mpg.de", "position": "PhD student;PhD student;Research Group Leader;Postdoc", "bibtex": "@inproceedings{\nben-dov2024the,\ntitle={The Role of Learning Algorithms in Collective Action},\nauthor={Omri Ben-Dov and Jake Fawkes and Samira Samadi and Amartya Sanyal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ez3Lckpe4l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 837540, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WCb0ejiG-csJ:scholar.google.com/&scioq=The+Role+of+Learning+Algorithms+in+Collective+Action&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "tuebingen.mpg.de;oxford.ac.uk;tuebingen.mpg.de;mpg.de", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Oxford;Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.", "aff_unique_dep": "Intelligent Systems;;", "aff_unique_url": 
"https://www.mpi-is.mpg.de;https://www.ox.ac.uk;https://www.mpg.de", "aff_unique_abbr": "MPI-IS;Oxford;MPG", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Libra: Building Decoupled Vision System on Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34554", "id": "F1drhMjN7s", "proceeding": "https://proceedings.mlr.press/v235/xu24ab.html", "pdf": "https://openreview.net/pdf?id=F1drhMjN7s", "openreview": "https://openreview.net/forum?id=F1drhMjN7s", "author_site": "Yifan Xu, Xiaoshan Yang, Yaguang Song, Changsheng Xu", "tldr": "", "abstract": "In this work, we introduce **Libra**, a prototype model with a decoupled vision system on a large language model (LLM). The decoupled vision system decouples inner-modal modeling and cross-modal interaction, yielding unique visual information modeling and effective cross-modal comprehension. Libra is trained through discrete auto-regressive modeling on both vision and language inputs. Specifically, we incorporate a routed visual expert with a cross-modal bridge module into a pretrained LLM to route the vision and language flows during attention computing to enable different attention patterns in inner-modal modeling and cross-modal interaction scenarios. Experimental results demonstrate that the dedicated design of Libra achieves a strong MLLM baseline that rivals existing works in the image-to-text scenario with merely 50 million training data, providing a new perspective for future multimodal foundation models. Code is available at https://github.com/YifanXu74/Libra.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Xu;Xiaoshan Yang;Yaguang Song;Changsheng Xu", "authorids": "~Yifan_Xu9;~Xiaoshan_Yang2;~Yaguang_Song1;~Changsheng_Xu1", "gender": "M;M;M;M", "homepage": "https://yifanxu74.github.io;https://yangxs.ac.cn;;", "dblp": ";74/9989;249/8313;85/1301", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;KOJULc0AAAAJ;https://scholar.google.com.sg/citations?user=hI9NRDkAAAAJ", "orcid": ";0000-0001-5453-9755;;", "linkedin": ";;;", "or_profile": "~Yifan_Xu9;~Xiaoshan_Yang2;~Yaguang_Song1;~Changsheng_Xu1", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Peng Cheng Laboratory;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;pcl.ac.cn;ia.ac.cn", "position": "PhD student;Associate Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nxu2024libra,\ntitle={Libra: Building Decoupled Vision System on Large Language Models},\nauthor={Yifan Xu and Xiaoshan Yang and Yaguang Song and Changsheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F1drhMjN7s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8341208, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3300122489095517750&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": "ia.ac.cn;ia.ac.cn;pcl.ac.cn;ia.ac.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;Pengcheng Laboratory", "aff_unique_dep": "Institute of Automation;Peng Cheng Laboratory", "aff_unique_url": "http://www.ia.cas.cn;http://www.pcl.ac.cn", "aff_unique_abbr": 
"CAS;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Decoupling Feature Extraction and Classification Layers for Calibrated Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34553", "id": "F2Tegvyqlo", "proceeding": "https://proceedings.mlr.press/v235/jordahn24a.html", "pdf": "https://openreview.net/pdf?id=F2Tegvyqlo", "openreview": "https://openreview.net/forum?id=F2Tegvyqlo", "author_site": "Mikkel Jordahn, Pablo Olmos", "tldr": "", "abstract": "Deep Neural Networks (DNN) have shown great promise in many classification applications, yet are widely known to have poorly calibrated predictions when they are over-parametrized. Improving DNN calibration without comprising on model accuracy is of extreme importance and interest in safety critical applications such as in the health-care sector. In this work, we show that decoupling the training of feature extraction layers and classification layers in over-parametrized DNN architectures such as Wide Residual Networks (WRN) and Vision Transformers (ViT) significantly improves model calibration whilst retaining accuracy, and at a low training cost. In addition, we show that placing a Gaussian prior on the last hidden layer outputs of a DNN, and training the model variationally in the classification training stage, even further improves calibration. We illustrate these methods improve calibration across ViT and WRN architectures for several image classification benchmark datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mikkel Jordahn;Pablo M. Olmos", "authorids": "~Mikkel_Jordahn1;~Pablo_M._Olmos1", "gender": "M;", "homepage": ";http://www.tsc.uc3m.es/~olmos/", "dblp": ";83/8261", "google_scholar": ";pdcdDVoAAAAJ", "orcid": ";", "linkedin": "mikkel-jordahn-0930b8a4/?originalSubdomain=dk;", "or_profile": "~Mikkel_Jordahn1;~Pablo_M._Olmos1", "aff": "Technical University of Denmark;Universidad Carlos III de Madrid", "aff_domain": "dtu.dk;uc3m.es", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\njordahn2024decoupling,\ntitle={Decoupling Feature Extraction and Classification Layers for Calibrated Neural Networks},\nauthor={Mikkel Jordahn and Pablo M. 
Olmos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F2Tegvyqlo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1306072, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7527660973622524977&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "dtu.dk;uc3m.es", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Technical University of Denmark;Universidad Carlos III de Madrid", "aff_unique_dep": ";", "aff_unique_url": "https://www.dtu.dk;https://www.uc3m.es", "aff_unique_abbr": "DTU;UC3M", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Denmark;Spain" }, { "title": "Conformal Validity Guarantees Exist for Any Data Distribution (and How to Find Them)", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34552", "id": "F3936hVwQa", "proceeding": "https://proceedings.mlr.press/v235/prinster24a.html", "pdf": "https://openreview.net/pdf?id=F3936hVwQa", "openreview": "https://openreview.net/forum?id=F3936hVwQa", "author_site": "Drew Prinster, Samuel Stanton, Anqi Liu, Suchi Saria", "tldr": "", "abstract": "As artificial intelligence (AI) / machine learning (ML) gain widespread adoption, practitioners are increasingly seeking means to quantify and control the risk these systems incur. This challenge is especially salient when such systems have autonomy to collect their own data, such as in black-box optimization and active learning, where their actions induce sequential feedback-loop shifts in the data distribution. Conformal prediction is a promising approach to uncertainty and risk quantification, but prior variants' validity guarantees have assumed some form of ``quasi-exchangeability'' on the data distribution, thereby excluding many types of sequential shifts. In this paper we prove that conformal prediction can theoretically be extended to *any* joint data distribution, not just exchangeable or quasi-exchangeable ones. Although the most general case is exceedingly impractical to compute, for concrete practical applications we outline a procedure for deriving specific conformal algorithms for any data distribution, and we use this procedure to derive tractable algorithms for a series of AI/ML-agent-induced covariate shifts. 
We evaluate the proposed algorithms empirically on synthetic black-box optimization and active learning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Drew Prinster;Samuel Don Stanton;Anqi Liu;Suchi Saria", "authorids": "~Drew_Prinster1;~Samuel_Don_Stanton1;~Anqi_Liu2;~Suchi_Saria1", "gender": "M;F;F;M", "homepage": "https://samuelstanton.github.io/;https://anqiliu-ai.github.io/;https://suchisaria.jhu.edu/;https://drewprinster.github.io/", "dblp": "264/1895;;72/2433;324/8626", "google_scholar": "https://scholar.google.com/citations?hl=en;Q8yp6zQAAAAJ;;E3FLv78AAAAJ", "orcid": ";0000-0002-0468-5698;;0000-0003-3607-4493", "linkedin": "samuel-stanton-06004997/;;;", "or_profile": "~Samuel_Don_Stanton1;~Anqi_Liu2;~Suchi_Saria1;~Andrew_Prinster1", "aff": "Genentech;University of Illinois, Chicago;Department of Computer Science, Whiting School of Engineering;Johns Hopkins University", "aff_domain": "gene.com;uic.edu;cs.jhu.edu;johnshopkins.edu", "position": "Researcher;PhD student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nprinster2024conformal,\ntitle={Conformal Validity Guarantees Exist for Any Data Distribution (and How to Find Them)},\nauthor={Drew Prinster and Samuel Don Stanton and Anqi Liu and Suchi Saria},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F3936hVwQa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2437521, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11645648433669148191&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "gene.com;uic.edu;cs.jhu.edu;johnshopkins.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Genentech;University of Illinois at Chicago;Johns Hopkins University", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "https://www.genentech.com;https://www.uic.edu;https://www.jhu.edu", "aff_unique_abbr": "Genentech;UIC;JHU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Baltimore", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Entropy-Reinforced Planning with Large Language Models for Drug Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34551", "id": "F3Ds71Xgo1", "proceeding": "https://proceedings.mlr.press/v235/liu24be.html", "pdf": "https://openreview.net/pdf?id=F3Ds71Xgo1", "openreview": "https://openreview.net/forum?id=F3Ds71Xgo1", "author_site": "Xuefeng Liu, Chih-chan Tien, Peng Ding, Songhao Jiang, Rick Stevens", "tldr": "", "abstract": "The objective of drug discovery is to identify chemical compounds that possess specific pharmaceutical properties toward a binding target. Existing large language models (LLMs) can achieve high token matching scores in terms of likelihood for molecule generation. However, relying solely on LLM decoding often results in the generation of molecules that are either invalid due to a single misused token or suboptimal due to unbalanced exploration and exploitation as a consequence of the LLM\u2019s prior experience. Here we propose ERP, Entropy-Reinforced Planning for Transformer Decoding, which employs an entropy-reinforced planning algorithm to enhance the Transformer decoding process and strike a balance between exploitation and exploration. 
ERP aims to achieve improvements in multiple properties compared to direct sampling from the Transformer. We evaluated ERP on the SARS-CoV-2 virus (3CLPro) and human cancer cell target protein (RTCB) benchmarks and demonstrated that, in both benchmarks, ERP consistently outperforms the current state-of-the-art algorithm by 1-5 percent and the baselines by 5-10 percent. Moreover, such improvement is robust across Transformer models trained with different objectives. Finally, to further illustrate the capabilities of ERP, we tested our algorithm on three code generation benchmarks and outperformed the current state-of-the-art approach as well. Our code is publicly available at: https://github.com/xuefeng-cs/ERP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuefeng Liu;Chih-chan Tien;Peng Ding;Songhao Jiang;Rick L. Stevens", "authorids": "~Xuefeng_Liu2;~Chih-chan_Tien1;~Peng_Ding2;~Songhao_Jiang2;~Rick_L._Stevens1", "gender": ";M;;M;", "homepage": ";;;https://cs.uchicago.edu/people/songhao-jiang/;https://computerscience.uchicago.edu/people/profile/rick-stevens/", "dblp": ";290/1206.html;;;", "google_scholar": ";;;;2oSSsLYAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Xuefeng_Liu2;~Chih-chan_Tien1;~Peng_Ding2;~Songhao_Jiang2;~Rick_L._Stevens1", "aff": ";University of Chicago;;University of Chicago;University of Chicago", "aff_domain": ";uchicago.edu;;uchicago.edu;uchicago.edu", "position": ";PhD student;;PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024entropyreinforced,\ntitle={Entropy-Reinforced Planning with Large Language Models for Drug Discovery},\nauthor={Xuefeng Liu and Chih-chan Tien and Peng Ding and Songhao Jiang and Rick L. Stevens},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F3Ds71Xgo1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1090284, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16271633469693344697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";uchicago.edu;;uchicago.edu;uchicago.edu", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uchicago.edu", "aff_unique_abbr": "UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "How Interpretable Are Interpretable Graph Neural Networks?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34550", "id": "F3G2udCF3Q", "proceeding": "https://proceedings.mlr.press/v235/chen24b.html", "pdf": "https://openreview.net/pdf?id=F3G2udCF3Q", "openreview": "https://openreview.net/forum?id=F3G2udCF3Q", "author_site": "Yongqiang Chen, Yatao Bian, Bo Han, James Cheng", "tldr": "", "abstract": "Interpretable graph neural networks (XGNNs) are widely adopted in various scientific applications involving graph-structured data. Existing XGNNs predominantly adopt the attention-based mechanism to learn edge or node importance for extracting and making predictions with the interpretable subgraph. However, the representational properties and limitations of these methods remain inadequately explored. 
In this work, we present a theoretical framework that formulates interpretable subgraph learning with the multilinear extension of the subgraph distribution, coined as subgraph multilinear extension (SubMT). Extracting the desired interpretable subgraph requires an accurate approximation of SubMT, yet we find that the existing XGNNs can have a huge gap in fitting SubMT. Consequently, the SubMT approximation failure degrades the interpretability of the extracted subgraphs. To mitigate the issue, we design a new XGNN architecture called Graph Multilinear neT (GMT), which is provably more powerful in approximating SubMT. We empirically validate our theoretical findings on a number of graph classification benchmarks. The results demonstrate that GMT outperforms the state-of-the-art by up to 10% in terms of both interpretability and generalizability across 12 regular and geometric graph benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongqiang Chen;Yatao Bian;Bo Han;James Cheng", "authorids": "~Yongqiang_Chen1;~Yatao_Bian1;~Bo_Han1;~James_Cheng2", "gender": ";M;M;M", "homepage": "https://lfhase.win;https://www.cse.cuhk.edu.hk/~jcheng/;https://bhanml.github.io/;https://yataobian.com", "dblp": "76/5774-2;06/4171;241/0472-3;222/2694", "google_scholar": "huQ_Ig8AAAAJ;;nTNjqHwAAAAJ;oZBTlBkAAAAJ", "orcid": ";;;0000-0002-2368-4084", "linkedin": ";;;", "or_profile": "~Yongqiang_Chen1;~James_Cheng2;~bo_han2;~An_Bian1", "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong;The Chinese University of Hong Kong;MBZUAI;Tencent AI Lab", "aff_domain": "cse.cuhk.edu.hk;cuhk.edu.hk;mbzuai.ac.ae;tencent.com", "position": "PhD student;Associate Professor;Researcher;Senior researcher ", "bibtex": "@inproceedings{\nchen2024how,\ntitle={How Interpretable Are Interpretable Graph Neural Networks?},\nauthor={Yongqiang Chen and Yatao Bian and Bo Han and James Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F3G2udCF3Q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8504029, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15697179835854965484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cse.cuhk.edu.hk;cuhk.edu.hk;mbzuai.ac.ae;tencent.com", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Chinese University of Hong Kong;Mohamed bin Zayed University of Artificial Intelligence;Tencent", "aff_unique_dep": "Department of Computer Science and Engineering;;Tencent AI Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.mbzuai.ac.ae;https://ai.tencent.com", "aff_unique_abbr": "CUHK;MBZUAI;Tencent AI Lab", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United Arab Emirates" }, { "title": "Trust Regions for Explanations via Black-Box Probabilistic Certification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34549", "id": "F3RdeyiR5H", "proceeding": "https://proceedings.mlr.press/v235/dhurandhar24a.html", "pdf": "https://openreview.net/pdf?id=F3RdeyiR5H", "openreview": "https://openreview.net/forum?id=F3RdeyiR5H", "author_site": "Amit Dhurandhar, Swagatam Haldar, Dennis Wei, Karthikeyan Ramamurthy", "tldr": "", "abstract": "Given the black box nature of machine learning 
models, a plethora of explainability methods have been developed to decipher the factors behind individual decisions. In this paper, we introduce a novel problem of black box (probabilistic) explanation certification. We ask the question: Given a black box model with only query access, an explanation for an example and a quality metric (viz. fidelity, stability), can we find the largest hypercube (i.e., $\\ell_{\\infty}$ ball) centered at the example such that when the explanation is applied to all examples within the hypercube, (with high probability) a quality criterion is met (viz. fidelity greater than some value)? Being able to efficiently find such a *trust region* has multiple benefits: i) insight into model behavior in a *region*, with a *guarantee*; ii) ascertained *stability* of the explanation; iii) *explanation reuse*, which can save time, energy and money by not having to find explanations for every example; and iv) a possible *meta-metric* to compare explanation methods. Our contributions include formalizing this problem, proposing solutions, providing theoretical guarantees for these solutions that are computable, and experimentally showing their efficacy on synthetic and real data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amit Dhurandhar;Swagatam Haldar;Dennis Wei;Karthikeyan Natesan Ramamurthy", "authorids": "~Amit_Dhurandhar1;~Swagatam_Haldar1;~Dennis_Wei1;~Karthikeyan_Natesan_Ramamurthy1", "gender": "M;M;M;", "homepage": "https://researcher.watson.ibm.com/researcher/view.php?person=us-adhuran;;https://sites.google.com/site/dennislwei/;https://nrkarthikeyan.github.io/", "dblp": "66/3289;304/2070;59/8761;58/7800", "google_scholar": "km9vIPEAAAAJ;0VP-lRoAAAAJ;r4ldy4AAAAAJ;mG8HuhEAAAAJ", "orcid": ";;;0000-0002-6021-5930", "linkedin": ";swagatam-haldar-636a41232/;dennis-wei-4886036b/;", "or_profile": "~Amit_Dhurandhar1;~Swagatam_Haldar1;~Dennis_Wei1;~Karthikeyan_Natesan_Ramamurthy1", "aff": "International Business Machines;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;International Business Machines;International Business Machines", "aff_domain": "ibm.com;uni-tuebingen.de;ibm.com;ibm.com", "position": "Principal Researcher;MS student;Research Staff Member;Research Staff Member", "bibtex": "@inproceedings{\ndhurandhar2024trust,\ntitle={Trust Regions for Explanations via Black-Box Probabilistic Certification},\nauthor={Amit Dhurandhar and Swagatam Haldar and Dennis Wei and Karthikeyan Natesan Ramamurthy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F3RdeyiR5H}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1861724, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18017891945076532316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ibm.com;uni-tuebingen.de;ibm.com;ibm.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "International Business Machines Corporation;Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.uni-tuebingen.de/", "aff_unique_abbr": "IBM;Uni T\u00fcbingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Germany" }, { "title": "An Image is Worth Multiple Words: Discovering Object Level Concepts using Multi-Concept Prompt Learning", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34548", "id": "F3x6uYILgL", "proceeding": "https://proceedings.mlr.press/v235/jin24g.html", "pdf": "https://openreview.net/pdf?id=F3x6uYILgL", "openreview": "https://openreview.net/forum?id=F3x6uYILgL", "author_site": "Chen Jin, Ryutaro Tanno, Amrutha Saseendran, Tom Diethe, Philip Teare", "tldr": "", "abstract": "Textural Inversion, a prompt learning method, learns a singular text embedding for a new \"word\" to represent image style and appearance, allowing it to be integrated into natural language sentences to generate novel synthesised images. However, identifying multiple unknown object-level concepts within one scene remains a complex challenge. While recent methods have resorted to cropping or masking individual images to learn multiple concepts, these techniques often require prior knowledge of new concepts and are labour-intensive. To address this challenge, we introduce *Multi-Concept Prompt Learning (MCPL)*, where multiple unknown \"words\" are simultaneously learned from a single sentence-image pair, without any imagery annotations. To enhance the accuracy of word-concept correlation and refine attention mask boundaries, we propose three regularisation techniques: *Attention Masking*, *Prompts Contrastive Loss*, and *Bind Adjective*. Extensive quantitative comparisons with both real-world categories and biomedical images demonstrate that our method can learn new semantically disentangled concepts. Our approach emphasises learning solely from textual embeddings, using less than 10% of the storage space compared to others. The project page, code, and data are available at [https://astrazeneca.github.io/mcpl.github.io](https://astrazeneca.github.io/mcpl.github.io).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen Jin;Ryutaro Tanno;Amrutha Saseendran;Tom Diethe;Philip Alexander Teare", "authorids": "~Chen_Jin3;~Ryutaro_Tanno1;~Amrutha_Saseendran1;~Tom_Diethe1;~Philip_Alexander_Teare1", "gender": ";M;F;M;M", "homepage": "https://lxasqjc.github.io;https://rt416.github.io/;;http://www.tomdiethe.com;", "dblp": ";187/6071;289/0537;33/1098;", "google_scholar": "https://scholar.google.co.uk/citations?user=4on9TiAAAAAJ;https://scholar.google.co.uk/citations?user=NiEvNoEAAAAJ;k2s42F0AAAAJ;https://scholar.google.co.uk/citations?user=oWGk9c8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2179-6445;;;0000-0002-0776-5407;", "linkedin": "chen-jin-33287593/;;amrutha-saseendran;tomdiethe/;philteare/", "or_profile": "~Chen_Jin3;~Ryutaro_Tanno1;~Amrutha_Saseendran1;~Tom_Diethe1;~Philip_Alexander_Teare1", "aff": "Astrazeneca;Google DeepMind;AstraZeneca;AstraZeneca;AstraZeneca", "aff_domain": "astrazeneca.com;deepmind.com;astrazeneca.com;astrazeneca.com;astrazeneca.com", "position": "Researcher;Researcher;Research Scientist;Principal Researcher;Researcher", "bibtex": "@inproceedings{\njin2024an,\ntitle={An Image is Worth Multiple Words: Discovering Object Level Concepts using Multi-Concept Prompt Learning},\nauthor={Chen Jin and Ryutaro Tanno and Amrutha Saseendran and Tom Diethe and Philip Alexander Teare},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F3x6uYILgL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9290345, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 14, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9555611995418060585&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 6, "email": "astrazeneca.com;deepmind.com;astrazeneca.com;astrazeneca.com;astrazeneca.com", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "AstraZeneca;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.astrazeneca.com;https://deepmind.com", "aff_unique_abbr": "AstraZeneca;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Evaluation of Trajectory Distribution Predictions with Energy Score", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34547", "id": "FCmWhJQ14I", "proceeding": "https://proceedings.mlr.press/v235/shahroudi24a.html", "pdf": "https://openreview.net/pdf?id=FCmWhJQ14I", "openreview": "https://openreview.net/forum?id=FCmWhJQ14I", "author_site": "Novin Shahroudi, Mihkel Lepson, Meelis Kull", "tldr": "", "abstract": "Predicting the future trajectory of surrounding objects is inherently uncertain and vital in the safe and reliable planning of autonomous systems such as in self-driving cars. Although trajectory prediction models have become increasingly sophisticated in dealing with the complexities of spatiotemporal data, the evaluation methods used to assess these models have not kept pace. \"Minimum of N\" is a common family of metrics used to assess the rich outputs of such models. We critically examine the Minimum of N within the proper scoring rules framework to show that it is not strictly proper and demonstrate how that could lead to a misleading assessment of multimodal trajectory predictions. As an alternative, we propose using Energy Score-based evaluation measures, leveraging their proven propriety for a more reliable evaluation of trajectory distribution predictions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Novin Shahroudi;Mihkel Lepson;Meelis Kull", "authorids": "~Novin_Shahroudi1;~Mihkel_Lepson1;~Meelis_Kull1", "gender": "M;M;M", "homepage": ";;https://ml.cs.ut.ee/", "dblp": ";;20/5835", "google_scholar": ";;yJwctG4AAAAJ", "orcid": "0000-0002-3121-7603;0009-0009-3023-7272;0000-0001-9257-595X", "linkedin": "novinshahroudi/;;meeliskull/", "or_profile": "~Novin_Shahroudi1;~Mihkel_Lepson1;~Meelis_Kull1", "aff": "University of Tartu;institute of computer science, University of Tartu;University of Tartu", "aff_domain": "ut.ee;cs.ut.ee;ut.ee", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nshahroudi2024evaluation,\ntitle={Evaluation of Trajectory Distribution Predictions with Energy Score},\nauthor={Novin Shahroudi and Mihkel Lepson and Meelis Kull},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FCmWhJQ14I}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 729031, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15952071981891551443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ut.ee;cs.ut.ee;ut.ee", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Tartu", "aff_unique_dep": "", "aff_unique_url": "https://www.ut.ee", "aff_unique_abbr": "UT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": 
"Estonia" }, { "title": "Bespoke Non-Stationary Solvers for Fast Sampling of Diffusion and Flow Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34546", "id": "FCtO757Onl", "proceeding": "https://proceedings.mlr.press/v235/shaul24a.html", "pdf": "https://openreview.net/pdf?id=FCtO757Onl", "openreview": "https://openreview.net/forum?id=FCtO757Onl", "author_site": "Neta Shaul, Uriel Singer, Ricky T. Q. Chen, Matthew Le, Ali Thabet, Albert Pumarola, Yaron Lipman", "tldr": "", "abstract": "This paper introduces Bespoke Non-Stationary (BNS) Solvers, a solver distillation approach to improve sample efficiency of Diffusion and Flow models. BNS solvers are based on a family of non-stationary solvers that provably subsumes existing numerical ODE solvers and consequently demonstrate considerable improvement in sample approximation (PSNR) over these baselines. Compared to model distillation, BNS solvers benefit from a tiny parameter space ($<$200 parameters), fast optimization (two orders of magnitude faster), maintain diversity of samples, and in contrast to previous solver distillation approaches nearly close the gap from standard distillation methods such as Progressive Distillation in the low-medium NFE regime. For example, BNS solver achieves 45 PSNR / 1.76 FID using 16 NFE in class-conditional ImageNet-64. We experimented with BNS solvers for conditional image generation, text-to-image generation, and text-2-audio generation showing significant improvement in sample approximation (PSNR) in all.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Neta Shaul;Uriel Singer;Ricky T. Q. Chen;Matthew Le;Ali Thabet;Albert Pumarola;Yaron Lipman", "authorids": "~Neta_Shaul1;~Uriel_Singer1;~Ricky_T._Q._Chen1;~Matthew_Le2;~Ali_Thabet1;~Albert_Pumarola2;~Yaron_Lipman1", "gender": "M;;;;M;;", "homepage": ";https://il.linkedin.com/in/urielsinger;;;https://www.alithabet.com/;;", "dblp": ";238/0243;;;161/1812;;", "google_scholar": ";nIEep3cAAAAJ;;;7T0CPEkAAAAJ;;", "orcid": ";0000-0001-8451-8533;;;;;", "linkedin": "neta-shaul-3364aa235/;;;;akthabet/;;", "or_profile": "~Neta_Shaul1;~Uriel_Singer1;~Ricky_T._Q._Chen1;~Matthew_Le2;~Ali_Thabet1;~Albert_Pumarola2;~Yaron_Lipman1", "aff": "Weizmann Institute of Science;Meta AI Research;;;Meta;;", "aff_domain": "weizmann.ac.il;meta.com;;;fb.com;;", "position": "PhD student;Researcher;;;Applied Research Manager;;", "bibtex": "@inproceedings{\nshaul2024bespoke,\ntitle={Bespoke Non-Stationary Solvers for Fast Sampling of Diffusion and Flow Models},\nauthor={Neta Shaul and Uriel Singer and Ricky T. Q. 
Chen and Matthew Le and Ali Thabet and Albert Pumarola and Yaron Lipman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FCtO757Onl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9999504, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3206099308017477939&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "weizmann.ac.il;meta.com;;;fb.com;;", "author_num": 7, "aff_unique_index": "0;1;1", "aff_unique_norm": "Weizmann Institute of Science;Meta", "aff_unique_dep": ";Meta AI Research", "aff_unique_url": "https://www.weizmann.org.il;https://meta.com", "aff_unique_abbr": "Weizmann;Meta AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Israel;United States" }, { "title": "Stochastic Interpolants with Data-Dependent Couplings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34545", "id": "FFILRGD0jG", "proceeding": "https://proceedings.mlr.press/v235/albergo24a.html", "pdf": "https://openreview.net/pdf?id=FFILRGD0jG", "openreview": "https://openreview.net/forum?id=FFILRGD0jG", "author_site": "Michael Albergo, Mark Goldstein, Nicholas Boffi, Rajesh Ranganath, Eric Vanden-Eijnden", "tldr": "", "abstract": "Generative models inspired by dynamical transport of measure -- such as flows and diffusions -- construct a continuous-time map between two probability densities. Conventionally, one of these is the target density, only accessible through samples, while the other is taken as a simple base density that is data-agnostic. In this work, using the framework of stochastic interpolants, we formalize how to *couple* the base and the target densities, whereby samples from the base are computed conditionally given samples from the target in a way that is different from (but does not preclude) incorporating information about class labels or continuous embeddings. This enables us to construct dynamical transport maps that serve as conditional generative models. We show that these transport maps can be learned by solving a simple square loss regression problem analogous to the standard independent setting. We demonstrate the usefulness of constructing dependent couplings in practice through experiments in super-resolution and in-painting. 
The code is available at [https://github.com/interpolants/couplings](https://github.com/interpolants/couplings).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Samuel Albergo;Mark Goldstein;Nicholas Matthew Boffi;Rajesh Ranganath;Eric Vanden-Eijnden", "authorids": "~Michael_Samuel_Albergo1;~Mark_Goldstein1;~Nicholas_Matthew_Boffi1;~Rajesh_Ranganath2;~Eric_Vanden-Eijnden1", "gender": "M;M;M;;M", "homepage": "http://malbergo.me;https://cims.nyu.edu/~mg3479/;https://nmboffi.github.io;;https://wp.nyu.edu/courantinstituteofmathematicalsciences-eve2/", "dblp": ";;;97/7057;88/7927", "google_scholar": "GQyCZ4kAAAAJ;https://scholar.google.fr/citations?hl=en;_jkX2q0AAAAJ;;A5Gx65gAAAAJ", "orcid": "0000-0001-9058-5943;;;;", "linkedin": ";;;;", "or_profile": "~Michael_Samuel_Albergo1;~Mark_Goldstein1;~Nicholas_Matthew_Boffi1;~Rajesh_Ranganath2;~Eric_Vanden-Eijnden1", "aff": "New York University;Google;NYU, New York University;New York University;New York University", "aff_domain": "nyu.edu;google.com;cims.nyu.edu;nyu.edu;nyu.edu", "position": "PhD student;Intern;Instructor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nalbergo2024stochastic,\ntitle={Stochastic Interpolants with Data-Dependent Couplings},\nauthor={Michael Samuel Albergo and Mark Goldstein and Nicholas Matthew Boffi and Rajesh Ranganath and Eric Vanden-Eijnden},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FFILRGD0jG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9662605, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15300546513847598791&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "nyu.edu;google.com;cims.nyu.edu;nyu.edu;nyu.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "New York University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.nyu.edu;https://www.google.com", "aff_unique_abbr": "NYU;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;New York", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Jacobian Regularizer-based Neural Granger Causality", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34544", "id": "FG5hjRBtpm", "proceeding": "https://proceedings.mlr.press/v235/zhou24a.html", "pdf": "https://openreview.net/pdf?id=FG5hjRBtpm", "openreview": "https://openreview.net/forum?id=FG5hjRBtpm", "author_site": "Wanqi Zhou, Shuanghao Bai, Shujian Yu, Qibin Zhao, Badong Chen", "tldr": "", "abstract": "With the advancement of neural networks, diverse methods for neural Granger causality have emerged, which demonstrate proficiency in handling complex data and nonlinear relationships. However, the existing framework of neural Granger causality has several limitations. It requires the construction of separate predictive models for each target variable, and the relationship depends on the sparsity of the weights of the first layer, resulting in challenges in effectively modeling complex relationships between variables as well as unsatisfactory estimation accuracy of Granger causality. Moreover, most of them cannot capture full-time Granger causality. 
To address these drawbacks, we propose a **J**acobian **R**egularizer-based **N**eural **G**ranger **C**ausality (**JRNGC**) approach, a straightforward yet highly effective method for learning multivariate summary Granger causality and full-time Granger causality by constructing a single model for all target variables. Specifically, our method eliminates the sparsity constraints of weights by leveraging an input-output Jacobian matrix regularizer, which can be subsequently represented as the weighted causal matrix in the post-hoc analysis. Extensive experiments show that our proposed approach achieves competitive performance with the state-of-the-art methods for learning summary Granger causality and full-time Granger causality while maintaining lower model complexity and high scalability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanqi Zhou;Shuanghao Bai;Shujian Yu;Qibin Zhao;Badong Chen", "authorids": "~Wanqi_Zhou2;~Shuanghao_Bai1;~Shujian_Yu1;~Qibin_Zhao1;~Badong_Chen1", "gender": "F;M;M;M;M", "homepage": "https://ellezwq.github.io/;https://github.com/BaiShuanghao?tab=repositories;https://sjyucnel.github.io/;https://qibinzhao.github.io;http://gr.xjtu.edu.cn/web/chenbd/home", "dblp": "332/8130;364/7251;154/5763.html;13/1193;95/6450", "google_scholar": "3Q_3PR8AAAAJ;xhd94DIAAAAJ;O8kpnMoAAAAJ;https://scholar.google.co.jp/citations?hl=en;mq6tPX4AAAAJ", "orcid": "0000-0002-4443-8413;0009-0002-6047-0242;;0000-0002-4442-3182;", "linkedin": ";;;;", "or_profile": "~Wanqi_Zhou2;~Shuanghao_Bai1;~Shujian_Yu1;~Qibin_Zhao1;~Badong_Chen1", "aff": "RIKEN;Xi'an Jiaotong University;University of Troms\u00f8;RIKEN;Xi'an Jiaotong University", "aff_domain": "riken.jp;xjtu.edu.cn;uit.no;riken.jp;xjtu.edu.cn", "position": "Intern;PhD student;Guest Associate Professor;Team Leader;Full Professor", "bibtex": "@inproceedings{\nzhou2024jacobian,\ntitle={Jacobian Regularizer-based Neural Granger Causality},\nauthor={Wanqi Zhou and Shuanghao Bai and Shujian Yu and Qibin Zhao and Badong Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FG5hjRBtpm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 872378, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13698946278521521974&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 9, "email": "riken.jp;xjtu.edu.cn;uit.no;riken.jp;xjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "RIKEN;Xi'an Jiao Tong University;University of Troms\u00f8", "aff_unique_dep": ";;", "aff_unique_url": "https://www.riken.jp;https://www.xjtu.edu.cn;https://uit.no", "aff_unique_abbr": "RIKEN;XJTU;UIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "Japan;China;Norway" }, { "title": "Memory-Space Visual Prompting for Efficient Vision-Language Fine-Tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34543", "id": "FHkavpr5Ze", "proceeding": "https://proceedings.mlr.press/v235/jie24a.html", "pdf": "https://openreview.net/pdf?id=FHkavpr5Ze", "openreview": "https://openreview.net/forum?id=FHkavpr5Ze", "author_site": "Shibo Jie, Yehui Tang, Ning Ding, Zhi-Hong Deng, Kai Han, Yunhe Wang", "tldr": "", "abstract": "Current solutions for efficiently constructing large vision-language (VL) models follow a two-step paradigm: projecting 
the output of pre-trained vision encoders to the input space of pre-trained language models as visual prompts; and then transferring the models to downstream VL tasks via end-to-end parameter-efficient fine-tuning (PEFT). However, this paradigm still exhibits inefficiency since it significantly increases the input length of the language models. In this paper, in contrast to integrating visual prompts into inputs, we regard visual prompts as additional knowledge that facilitates language models in addressing tasks associated with visual information. Motivated by the finding that the Feed-Forward Network (FFN) of language models acts as \"key-value memory\", we introduce a novel approach termed memory-space visual prompting (MemVP), wherein visual prompts are concatenated with the weights of FFN for visual knowledge injection. Experimental results across various VL tasks and language models reveal that MemVP significantly reduces the training time and inference latency of the fine-tuned VL models and surpasses the performance of previous PEFT methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shibo Jie;Yehui Tang;Ning Ding;Zhi-Hong Deng;Kai Han;Yunhe Wang", "authorids": "~Shibo_Jie1;~Yehui_Tang1;~Ning_Ding4;~Zhi-Hong_Deng1;~Kai_Han2;~Yunhe_Wang1", "gender": "M;M;M;M;M;M", "homepage": ";;;http://www.cis.pku.edu.cn/jzyg/szdw/dzh.htm;https://iamhankai.github.io;https://www.wangyunhe.site/", "dblp": "318/9497;244/9659;;161/4814-1;51/4757-2;63/8217-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;TkSZQ6gAAAAJ;oXP1heYAAAAJ;https://scholar.google.com.tw/citations?user=tRoAxlsAAAAJ;vThoBVcAAAAJ;https://scholar.google.com.sg/citations?user=isizOkYAAAAJ", "orcid": ";;;0000-0002-0263-8142;0000-0002-9761-2702;0000-0002-0142-509X", "linkedin": ";;;;;", "or_profile": "~Shibo_Jie1;~Yehui_Tang1;~Ning_Ding4;~Zhi-Hong_Deng1;~Kai_Han2;~Yunhe_Wang1", "aff": "Peking University;Huawei Technologies Ltd.;Peking University;Peking University;Huawei Noah's Ark Lab;Huawei Noah's Ark Lab", "aff_domain": "pku.edu.cn;huawei.com;pku.edu.cn;pku.edu.cn;huawei.com;huawei.com", "position": "PhD student;Researcher;PhD student;Full Professor;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\njie2024memoryspace,\ntitle={Memory-Space Visual Prompting for Efficient Vision-Language Fine-Tuning},\nauthor={Shibo Jie and Yehui Tang and Ning Ding and Zhi-Hong Deng and Kai Han and Yunhe Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FHkavpr5Ze}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2577788, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17233945226508260123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;huawei.com;pku.edu.cn;pku.edu.cn;huawei.com;huawei.com", "author_num": 6, "aff_unique_index": "0;1;0;0;1;1", "aff_unique_norm": "Peking University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "http://www.pku.edu.cn;https://www.huawei.com", "aff_unique_abbr": "Peking U;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Faster Maximum Inner Product Search in High Dimensions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34542", "id": "FKkkdyRdsD", "proceeding": 
"https://proceedings.mlr.press/v235/tiwari24a.html", "pdf": "https://openreview.net/pdf?id=FKkkdyRdsD", "openreview": "https://openreview.net/forum?id=FKkkdyRdsD", "author_site": "Mo Tiwari, Ryan Kang, Jaeyong Lee, Donghyun Lee, Chris Piech, Sebastian Thrun, Ilan Shomorony, Martin Zhang", "tldr": "", "abstract": "Maximum Inner Product Search (MIPS) is a ubiquitous task in machine learning applications. Given a query vector and $n$ other vectors in $d$ dimensions, the MIPS problem is to find the atom that has the highest inner product with the query vector. Existing MIPS algorithms scale at least as $O(\\sqrt{d})$ with respect to $d$, which becomes computationally prohibitive in high-dimensional settings. In this work, we present BanditMIPS, a novel randomized algorithm that provably improves the state-of-the-art complexity from $O(\\sqrt{d})$ to $O(1)$ with respect to $d$. We validate the scaling of BanditMIPS and demonstrate that BanditMIPS outperforms prior state-of-the-art MIPS algorithms in sample complexity, wall-clock time, and precision/speedup tradeoff across a variety of experimental settings. Furthermore, we propose a variant of our algorithm, named BanditMIPS-$\\alpha$, which improves upon BanditMIPS by employing non-uniform sampling across coordinates. We also demonstrate the usefulness of BanditMIPS in problems for which MIPS is a subroutine, including Matching Pursuit and Fourier analysis. Finally, we demonstrate that BanditMIPS can be used in conjunction with preprocessing techniques to improve its complexity with respect to $n$. All of our experimental results are reproducible via a 1-line script at github.com/ThrunGroup/BanditMIPS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mo Tiwari;Ryan Kang;Jaeyong Lee;Donghyun Lee;Christopher J Piech;Sebastian Thrun;Ilan Shomorony;Martin Jinye Zhang", "authorids": "~Mo_Tiwari1;~Ryan_Kang1;~Jaeyong_Lee1;~Donghyun_Lee2;~Christopher_J_Piech1;~Sebastian_Thrun1;~Ilan_Shomorony1;~Martin_Jinye_Zhang1", "gender": ";M;M;M;M;M;M;M", "homepage": "http://www.motiwari.com/;;https://kr.linkedin.com/in/jeyong-lee-6a7838190;;;http://robot.cc;http://www.ilanshomorony.com;https://mzhanglab.github.io/", "dblp": "267/5421;https://dblp.org/rec/conf/nips/TiwariKLPSTZ22.html;336/2490;298/4489;35/10987.html;t/SebastianThrun;31/9223;184/9278", "google_scholar": "https://scholar.google.com/citations?hl=en;;;https://scholar.google.com/citations?hl=en;;;fMAg4zEAAAAJ;zjr6n-QAAAAJ", "orcid": ";;;;;;;0000-0003-0006-2466", "linkedin": "motiwari;ryan-kang-554819221/;;donghyun-lee-aa789422a;;sebastian-thrun-59a0b273/;;", "or_profile": "~Mo_Tiwari1;~Ryan_Kang1;~Jaeyong_Lee1;~Donghyun_Lee2;~Christopher_J_Piech1;~Sebastian_Thrun1;~Ilan_Shomorony1;~Martin_J._Zhang1", "aff": "OpenAI;Stanford University;University of Oxford;University College London, University of London;;;University of Illinois, Urbana Champaign;Carnegie Mellon University", "aff_domain": "openai.com;stanford.edu;oxford.ac.uk;ucl.ac.uk;;;illinois.edu;andrew.cmu.edu", "position": "Member of Technical Staff;MS student;Undergrad student;MS student;;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ntiwari2024faster,\ntitle={Faster Maximum Inner Product Search in High Dimensions},\nauthor={Mo Tiwari and Ryan Kang and Jaeyong Lee and Donghyun Lee and Christopher J Piech and Sebastian Thrun and Ilan Shomorony and Martin Jinye Zhang},\nbooktitle={Forty-first International Conference on Machine 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FKkkdyRdsD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1827271, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12912019336735211116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "openai.com;stanford.edu;oxford.ac.uk;ucl.ac.uk;;;illinois.edu;andrew.cmu.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "OpenAI;Stanford University;University of Oxford;University College London;University of Illinois Urbana-Champaign;Carnegie Mellon University", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://openai.com;https://www.stanford.edu;https://www.ox.ac.uk;https://www.ucl.ac.uk;https://illinois.edu;https://www.cmu.edu", "aff_unique_abbr": "OpenAI;Stanford;Oxford;UCL;UIUC;CMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;Urbana-Champaign", "aff_country_unique_index": "0;0;1;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "IIANet: An Intra- and Inter-Modality Attention Network for Audio-Visual Speech Separation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34541", "id": "FM61SQzF3N", "proceeding": "https://proceedings.mlr.press/v235/li24cf.html", "pdf": "https://openreview.net/pdf?id=FM61SQzF3N", "openreview": "https://openreview.net/forum?id=FM61SQzF3N", "author_site": "Kai Li, Runxuan Yang, Fuchun Sun, Xiaolin Hu", "tldr": "", "abstract": "Recent research has made significant progress in designing fusion modules for audio-visual speech separation. However, these modules predominantly focus on multi-modal fusion at a single temporal scale of auditory and visual features without employing selective attention mechanisms, which is in sharp contrast with the brain. To address this, we propose a novel model called intra- and inter-attention network (IIANet), which leverages the attention mechanism for efficient audio-visual feature fusion. IIANet consists of two types of attention blocks: intra-attention (IntraA) and inter-attention (InterA) blocks, where the InterA blocks are distributed at the top, middle and bottom of IIANet. Heavily inspired by the way the human brain selectively focuses on relevant content at various temporal scales, these blocks maintain the ability to learn modality-specific features and enable the extraction of different semantics from audio-visual features. Comprehensive experiments on three standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2) demonstrate the effectiveness of IIANet, outperforming previous state-of-the-art methods while maintaining comparable inference time. 
In particular, the fast version of IIANet (IIANet-fast) has only 7% of CTCNet\u2019s MACs and is 40% faster than CTCNet on CPUs while achieving better separation quality, showing the great potential of the attention mechanism for efficient and effective multimodal fusion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Li;Runxuan Yang;Fuchun Sun;Xiaolin Hu", "authorids": "~Kai_Li15;~Runxuan_Yang1;~Fuchun_Sun1;~Xiaolin_Hu1", "gender": "M;;M;M", "homepage": "https://cslikai.cn;;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;http://www.xlhu.cn/", "dblp": ";;;60/6028-1", "google_scholar": ";;;PksdgoUAAAAJ", "orcid": ";;;0000-0002-4907-7354", "linkedin": "kai-li-0bb2451a4;;;", "or_profile": "~Kai_Li15;~Runxuan_Yang1;~Fuchun_Sun1;~Xiaolin_Hu1", "aff": "Tsinghua University;;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;cs.tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024iianet,\ntitle={{IIAN}et: An Intra- and Inter-Modality Attention Network for Audio-Visual Speech Separation},\nauthor={Kai Li and Runxuan Yang and Fuchun Sun and Xiaolin Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FM61SQzF3N}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2200430, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2500030617904702922&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;;cs.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Ranking-based Client Imitation Selection for Efficient Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34540", "id": "FMEhnS0948", "proceeding": "https://proceedings.mlr.press/v235/tian24d.html", "pdf": "https://openreview.net/pdf?id=FMEhnS0948", "openreview": "https://openreview.net/forum?id=FMEhnS0948", "author_site": "Chunlin Tian, Zhan Shi, Xinpeng Qin, Li Li, Cheng-Zhong Xu", "tldr": "", "abstract": "Federated Learning (FL) enables multiple devices to collaboratively train a shared model while ensuring data privacy. The selection of participating devices in each training round critically affects both the model performance and training efficiency, especially given the vast heterogeneity in training capabilities and data distribution across devices. To deal with these challenges, we introduce a novel device selection solution called FedRank, which is based on an end-to-end, ranking-based model that is pre-trained by imitation learning against state-of-the-art analytical approaches. It not only considers data and system heterogeneity at runtime but also adaptively and efficiently chooses the most suitable clients for model training. Specifically, FedRank views client selection in FL as a ranking problem and employs a pairwise training strategy for the smart selection process. Additionally, an imitation learning-based approach is designed to counteract the cold-start issues often seen in state-of-the-art learning-based approaches. 
Experimental results reveal that FedRank boosts model accuracy by 5.2% to 56.9%, accelerates training convergence by up to $2.01 \\times$, and reduces energy consumption by up to 40.1%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chunlin Tian;Zhan Shi;Xinpengqin;Li Li;Cheng-zhong Xu", "authorids": "~Chunlin_Tian1;~Zhan_Shi3;~Xinpengqin1;~Li_Li10;~Cheng-zhong_Xu1", "gender": "M;M;;M;", "homepage": "https://scholar.google.com/citations?user=2D1fd0QAAAAJ&hl=zh-CN;https://aleczhanshi.github.io/;https://xinpengqin.github.io/;https://www.fst.um.edu.mo/personal/llili/;", "dblp": "194/2903;;https://dblp.org/rec/journals/corr/abs-2405-04122;53/2189-64;", "google_scholar": "2D1fd0QAAAAJ;w2I-wNQAAAAJ;https://scholar.google.com/citations?view_op=new_articles;uLzU3OcAAAAJ;", "orcid": "0009-0009-5220-1609;;0009-0006-2446-4397;0000-0002-2044-8289;", "linkedin": ";;;;", "or_profile": "~Chunlin_Tian1;~Zhan_Shi3;~Xinpengqin1;~Li_Li10;~Cheng-zhong_Xu1", "aff": "University of Macau;;University of Electronic Science and Technology of China;University of Macau;", "aff_domain": "um.edu.mo;;uestc.edu;um.edu.mo;", "position": "PhD student;;Undergrad student;Assistant Professor;", "bibtex": "@inproceedings{\ntian2024rankingbased,\ntitle={Ranking-based Client Imitation Selection for Efficient Federated Learning},\nauthor={Chunlin Tian and Zhan Shi and Xinpengqin and Li Li and Cheng-zhong Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FMEhnS0948}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1254977, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9716837214282965276&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "um.edu.mo;;uestc.edu;um.edu.mo;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Macau;University of Electronic Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.um.edu.mo;https://www.uestc.edu.cn", "aff_unique_abbr": "UM;UESTC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Macau SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "C-RAG: Certified Generation Risks for Retrieval-Augmented Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34539", "id": "FMa4c5NhOe", "proceeding": "https://proceedings.mlr.press/v235/kang24a.html", "pdf": "https://openreview.net/pdf?id=FMa4c5NhOe", "openreview": "https://openreview.net/forum?id=FMa4c5NhOe", "author_site": "Mintong Kang, Nezihe Merve G\u00fcrel, Ning Yu, Dawn Song, Bo Li", "tldr": "", "abstract": "Despite the impressive capabilities of large language models (LLMs) across diverse applications, they still suffer from trustworthiness issues, such as hallucinations and misalignments. Retrieval-augmented language models (RAG) have been proposed to enhance the credibility of generations by grounding them in external knowledge, but the theoretical understanding of their generation risks remains unexplored. In this paper, we answer: 1) whether RAG can indeed lead to low generation risks, 2) how to provide provable guarantees on the generation risks of RAG and vanilla LLMs, and 3) what sufficient conditions enable RAG models to reduce generation risks. We propose C-RAG, the first framework to certify generation risks for RAG models. 
Specifically, we provide conformal risk analysis for RAG models and certify an upper confidence bound of generation risks, which we refer to as conformal generation risk. We also provide theoretical guarantees on conformal generation risks for general bounded risk functions under test distribution shifts. We prove that RAG achieves a lower conformal generation risk than that of a single LLM when the quality of the retrieval model and transformer is non-trivial. Our intensive empirical results demonstrate the soundness and tightness of our conformal generation risk guarantees across four widely-used NLP datasets on four state-of-the-art retrieval models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mintong Kang;Nezihe Merve G\u00fcrel;Ning Yu;Dawn Song;Bo Li", "authorids": "~Mintong_Kang1;~Nezihe_Merve_G\u00fcrel2;~Ning_Yu2;~Dawn_Song1;~Bo_Li19", "gender": "M;Not Specified;;F;F", "homepage": "https://kangmintong.github.io/;https://nezihemervegurel.github.io/;;;http://boli.cs.illinois.edu/", "dblp": "303/0335.html;215/5003;;s/DXSong;50/3402-26", "google_scholar": "oHXw2SAAAAAJ;5yYPHwYAAAAJ;;;K8vJkTcAAAAJ", "orcid": ";;;;", "linkedin": ";nezihemervegurel/;;;", "or_profile": "~Mintong_Kang1;~Nezihe_Merve_G\u00fcrel2;~Ning_Yu2;~Dawn_Song1;~Bo_Li19", "aff": "University of Illinois, Urbana Champaign;Delft University of Technology;;University of California, Berkeley;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;tudelft.nl;;berkeley.edu;illinois.edu", "position": "PhD student;Assistant Professor;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nkang2024crag,\ntitle={C-{RAG}: Certified Generation Risks for Retrieval-Augmented Language Models},\nauthor={Mintong Kang and Nezihe Merve G{\\\"u}rel and Ning Yu and Dawn Song and Bo Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FMa4c5NhOe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8695392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5800676365446233247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "illinois.edu;tudelft.nl;;berkeley.edu;illinois.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Delft University of Technology;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.tudelft.nl;https://www.berkeley.edu", "aff_unique_abbr": "UIUC;TU Delft;UC Berkeley", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Urbana-Champaign;;Berkeley", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Netherlands" }, { "title": "Towards General Neural Surrogate Solvers with Specialized Neural Accelerators", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34538", "id": "FNKnLhLuhY", "proceeding": "https://proceedings.mlr.press/v235/mao24b.html", "pdf": "https://openreview.net/pdf?id=FNKnLhLuhY", "openreview": "https://openreview.net/forum?id=FNKnLhLuhY", "author_site": "Chenkai Mao, Robert Lupoiu, Tianxiang Dai, Mingkun Chen, Jonathan Fan", "tldr": "", "abstract": "Surrogate neural network-based partial differential equation (PDE) solvers have the potential to solve PDEs in an accelerated manner, but they are largely limited to systems featuring fixed domain sizes, geometric layouts, and 
boundary conditions. We propose Specialized Neural Accelerator-Powered Domain Decomposition Methods (SNAP-DDM), a DDM-based approach to PDE solving in which subdomain problems containing arbitrary boundary conditions and geometric parameters are accurately solved using an ensemble of specialized neural operators. We tailor SNAP-DDM to 2D electromagnetics and fluidic flow problems and show how innovations in network architecture and loss function engineering can produce specialized surrogate subdomain solvers with near unity accuracy. We utilize these solvers with standard DDM algorithms to accurately solve freeform electromagnetics and fluids problems featuring a wide range of domain sizes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenkai Mao;Robert Lupoiu;Tianxiang Dai;Mingkun Chen;Jonathan Fan", "authorids": "~Chenkai_Mao1;~Robert_Lupoiu1;~Tianxiang_Dai1;~Mingkun_Chen1;~Jonathan_Fan1", "gender": "M;;M;M;M", "homepage": "https://profiles.stanford.edu/249027;https://web.stanford.edu/~rclupoiu/;;https://scholar.google.com/citations?user=KLmTMv0AAAAJ&hl=en;https://fanlab.stanford.edu", "dblp": "314/9682;;;;", "google_scholar": "owK9BbIAAAAJ;exZV_fIAAAAJ;;;", "orcid": ";;0000-0002-9403-7511;;", "linkedin": "chenkai-mao-7232a5153/;;;;", "or_profile": "~Chenkai_Mao1;~Robert_Lupoiu1;~Tianxiang_Dai1;~Mingkun_Chen1;~Jonathan_Fan1", "aff": "Stanford University;Stanford University;Stanford University;;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;;stanford.edu", "position": "PhD student;PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nmao2024towards,\ntitle={Towards General Neural Surrogate Solvers with Specialized Neural Accelerators},\nauthor={Chenkai Mao and Robert Lupoiu and Tianxiang Dai and Mingkun Chen and Jonathan Fan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FNKnLhLuhY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8703791, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13529798251441169702&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 7, "email": "stanford.edu;stanford.edu;stanford.edu;;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Mean Field Langevin Actor-Critic: Faster Convergence and Global Optimality beyond Lazy Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34537", "id": "FOJE1kRcHG", "proceeding": "https://proceedings.mlr.press/v235/yamamoto24a.html", "pdf": "https://openreview.net/pdf?id=FOJE1kRcHG", "openreview": "https://openreview.net/forum?id=FOJE1kRcHG", "author_site": "Kakei Yamamoto, Kazusato Oko, Zhuoran Yang, Taiji Suzuki", "tldr": "", "abstract": "This work explores the feature learning capabilities of deep reinforcement learning algorithms in the pursuit of optimal policy determination. 
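Returning to the SNAP-DDM entry above: its neural subdomain solvers plug into otherwise standard domain decomposition iterations. A toy alternating Schwarz loop on a 1D Poisson problem, where a dense direct solve stands in for the specialized neural operator (grid size, overlap, and iteration count are arbitrary choices):

```python
import numpy as np

def subdomain_solve(f, left_bc, right_bc, h):
    """Dirichlet solve of -u'' = f on one subdomain (dense tridiagonal solve).
    In a SNAP-DDM-style method this solve is replaced by a neural operator."""
    n = len(f)
    A = (np.diag(np.full(n, 2.0)) - np.diag(np.ones(n - 1), 1)
         - np.diag(np.ones(n - 1), -1)) / h**2
    rhs = f.copy()
    rhs[0] += left_bc / h**2
    rhs[-1] += right_bc / h**2
    return np.linalg.solve(A, rhs)

N, h = 101, 1.0 / 100
x = np.linspace(0, 1, N)
f = np.pi**2 * np.sin(np.pi * x)         # so the true solution is sin(pi x)
u = np.zeros(N)                          # zero boundary values at both ends
mid, ov = N // 2, 10                     # two subdomains with overlap

for _ in range(50):                      # alternating Schwarz iterations
    u[1:mid + ov] = subdomain_solve(f[1:mid + ov], u[0], u[mid + ov], h)
    u[mid - ov:N - 1] = subdomain_solve(f[mid - ov:N - 1], u[mid - ov - 1], u[-1], h)

print("max error:", np.abs(u - np.sin(np.pi * x)).max())
```

The overlap region is what lets boundary information propagate between subdomains; the interesting engineering in the paper is making the per-subdomain solver accurate enough that the outer iteration still converges.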
We particularly examine an over-parameterized neural actor-critic framework within the mean-field regime, where both actor and critic components undergo updates via policy gradient and temporal-difference (TD) learning, respectively. We introduce the *mean-field Langevin TD learning* (MFLTD) method, enhancing mean-field Langevin dynamics with proximal TD updates for critic policy evaluation, and assess its performance against conventional approaches through numerical analysis. Additionally, for actor policy updates, we present the *mean-field Langevin policy gradient* (MFLPG), employing policy gradient techniques augmented by Wasserstein gradient flows for parameter space exploration. Our findings demonstrate that MFLTD accurately identifies the true value function, while MFLPG ensures linear convergence of actor sequences towards the globally optimal policy, considering a Kullback-Leibler divergence regularized framework. Through both particle and time-discretized analyses, we substantiate the linear convergence guarantees of our neural actor-critic algorithms, representing a notable contribution to neural reinforcement learning focusing on *global optimality* and *feature learning*, extending the existing understanding beyond the conventional scope of lazy training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kakei Yamamoto;Kazusato Oko;Zhuoran Yang;Taiji Suzuki", "authorids": "~Kakei_Yamamoto1;~Kazusato_Oko1;~Zhuoran_Yang1;~Taiji_Suzuki1", "gender": "M;M;M;M", "homepage": ";;https://zhuoranyang.github.io/;http://ibis.t.u-tokyo.ac.jp/suzuki/", "dblp": "334/7773;;;08/312", "google_scholar": "https://scholar.google.com/citations?hl=ja;;;x8osrBsAAAAJ", "orcid": "0000-0002-6231-2750;;;", "linkedin": "kakei-yamamoto-bb37461a0/;kazusatooko/;;", "or_profile": "~Kakei_Yamamoto1;~Kazusato_Oko1;~Zhuoran_Yang1;~Taiji_Suzuki1", "aff": "Massachusetts Institute of Technology;The University of Tokyo;Yale University;The University of Tokyo", "aff_domain": "mit.edu;u-tokyo.ac.jp;yale.edu;tokyo.ac.jp", "position": "PhD student;MS student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nyamamoto2024mean,\ntitle={Mean Field Langevin Actor-Critic: Faster Convergence and Global Optimality beyond Lazy Learning},\nauthor={Kakei Yamamoto and Kazusato Oko and Zhuoran Yang and Taiji Suzuki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FOJE1kRcHG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 581956, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9644032660721430589&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "mit.edu;u-tokyo.ac.jp;yale.edu;tokyo.ac.jp", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Massachusetts Institute of Technology;University of Tokyo;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.u-tokyo.ac.jp;https://www.yale.edu", "aff_unique_abbr": "MIT;UTokyo;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;Japan" }, { "title": "How Language Model Hallucinations Can Snowball", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34536", "id": "FPlaQyAGHu", "proceeding": "https://proceedings.mlr.press/v235/zhang24ay.html", "pdf":
"https://openreview.net/pdf?id=FPlaQyAGHu", "openreview": "https://openreview.net/forum?id=FPlaQyAGHu", "author_site": "Muru Zhang, Ofir Press, William Merrill, Alisa Liu, Noah Smith", "tldr": "", "abstract": "A major risk of using language models in practical applications is their tendency to hallucinate incorrect statements. Hallucinations are often attributed to knowledge gaps in LMs, but we show that LMs sometimes produce hallucinations that they can separately recognize as incorrect. To do this, we construct three question-answering datasets where LMs often state an incorrect answer which is followed by an explanation with at least one incorrect claim. Crucially, we find that GPT-3.5, GPT-4, and LLaMA2-70B-chat can identify 67%, 87%, and 94% of these incorrect claims, respectively. We show that this phenomenon doesn't disappear under higher temperatures sampling, beam search, and zero-shot chain-of-thought prompting. These findings reveal that LM hallucinations can snowball: early mistakes by an LM can lead to more mistakes that otherwise would not be made.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Muru Zhang;Ofir Press;William Merrill;Alisa Liu;Noah A. Smith", "authorids": "~Muru_Zhang1;~Ofir_Press1;~William_Merrill1;~Alisa_Liu1;~Noah_A._Smith2", "gender": "M;M;M;F;M", "homepage": "https://nanami18.github.io/;https://ofir.io/about;http://lambdaviking.com;https://alisawuffles.github.io/;https://homes.cs.washington.edu/~nasmith/", "dblp": "325/4648.html;185/0577;19/3512;;90/5204.html", "google_scholar": "OJIXk7wAAAAJ;LeHa8psAAAAJ;CyjChJQAAAAJ;3-lTFAwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;0000-0002-2310-6380", "linkedin": "muruzhang/;;william-merrill-15ab0743/;;", "or_profile": "~Muru_Zhang1;~Ofir_Press1;~William_Merrill1;~Alisa_Liu1;~Noah_Smith1", "aff": "University of Washington;Princeton University;New York University;University of Washington;Allen Institute for Artificial Intelligence", "aff_domain": "cs.washington.edu;princeton.edu;nyu.edu;uw.edu;allenai.org", "position": "MS student;Postdoc;Graduate student;PhD student;Senior Director of NLP Research", "bibtex": "@inproceedings{\nzhang2024how,\ntitle={How Language Model Hallucinations Can Snowball},\nauthor={Muru Zhang and Ofir Press and William Merrill and Alisa Liu and Noah A. 
Smith},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FPlaQyAGHu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 614177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 301, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15134251125885067830&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "cs.washington.edu;princeton.edu;nyu.edu;uw.edu;allenai.org", "author_num": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "University of Washington;Princeton University;New York University;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.washington.edu;https://www.princeton.edu;https://www.nyu.edu;https://allenai.org", "aff_unique_abbr": "UW;Princeton;NYU;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scaling Rectified Flow Transformers for High-Resolution Image Synthesis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34535", "id": "FPnUhsQJ5B", "proceeding": "https://proceedings.mlr.press/v235/esser24a.html", "pdf": "https://openreview.net/pdf?id=FPnUhsQJ5B", "openreview": "https://openreview.net/forum?id=FPnUhsQJ5B", "author_site": "Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, Dustin Podell, Tim Dockhorn, Zion English, Robin Rombach", "tldr": "", "abstract": "Diffusion models create data from noise by inverting the forward paths of data towards noise and have emerged as a powerful generative modeling technique for high-dimensional, perceptual data such as images and videos. Rectified flow is a recent generative model formulation that connects data and noise in a straight line. Despite its better theoretical properties and conceptual simplicity, it is not yet decisively established as standard practice. In this work, we improve existing noise sampling techniques for training rectified flow models by biasing them towards perceptually relevant scales. Through a large-scale study, we demonstrate the superior performance of this approach compared to established diffusion formulations for high-resolution text-to-image synthesis. Additionally, we present a novel transformer-based architecture for text-to-image generation that uses separate weights for the two modalities and enables a bidirectional flow of information between image and text tokens, improving text comprehension, typography, and human preference ratings. We demonstrate that this architecture follows predictable scaling trends and correlates lower validation loss to improved text-to-image synthesis as measured by various metrics and human evaluations. Our largest models outperform state-of-the-art models. 
Stability AI is considering making experimental data, code, and model weights publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Patrick Esser;Sumith Kulal;Andreas Blattmann;Rahim Entezari;Jonas M\u00fcller;Harry Saini;Yam Levi;Dominik Lorenz;Axel Sauer;Frederic Boesel;Dustin Podell;Tim Dockhorn;Zion English;Robin Rombach", "authorids": "~Patrick_Esser1;~Sumith_Kulal1;~Andreas_Blattmann1;~Rahim_Entezari1;~Jonas_M\u00fcller1;~Harry_Saini1;~Yam_Levi1;~Dominik_Lorenz1;~Axel_Sauer1;~Frederic_Boesel1;~Dustin_Podell1;~Tim_Dockhorn1;~Zion_English1;~Robin_Rombach1", "gender": "M;;M;M;M;;M;M;M;M;M;;;M", "homepage": ";https://cs.stanford.edu/~sumith/;;http://rahimentezari.github.io;;;https://www.yamlevitd.com/;;https://axelsauer.com/;;;https://timudk.github.io/;;https://hci.iwr.uni-heidelberg.de/user/1149/biblio", "dblp": "184/1547;180/9770;273/7645;193/7037.html;;;;47/7787;;302/0590;;239/4951;;263/9785", "google_scholar": ";;https://scholar.google.de/citations?user=vud0t5YAAAAJ;CmTeX7kAAAAJ;;;;;https://scholar.google.de/citations?user=ZsDn16sAAAAJ;Pd-iA0oAAAAJ;;EtPn_v4AAAAJ;;ygdQhrIAAAAJ", "orcid": ";;;;;;;;;0009-0006-4773-7554;;;;", "linkedin": ";;andreas-blattmann-479038186;;jonasjmueller/;harrystark/;yam-levi-a2656b77/;;;frederic-boesel/;dustinpodell/;;;", "or_profile": "~Patrick_Esser1;~Sumith_Kulal1;~Andreas_Blattmann1;~Rahim_Entezari1;~Jonas_M\u00fcller1;~Harry_Saini1;~Yam_Levi1;~Dominik_Lorenz1;~Axel_Sauer1;~Frederic_Boesel1;~Dustin_Podell1;~Tim_Dockhorn1;~Zion_English1;~Robin_Rombach1", "aff": "Heidelberg University;Stability AI;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Stability;Stability AI;;;Stability AI;University of Tuebingen;Stability AI;Stability AI;Stability AI;;Stability AI", "aff_domain": "uni-heidelberg.de;stability.ai;lmu.de;stability.ai;stability.ai;;;stability.ai;uni-tuebingen.de;stability.ai;stability.ai;stability.ai;;stabilty.ai", "position": "PhD student;Researcher;PhD student;Researcher;Researcher;;;Researcher;PhD student;Intern;Researcher;Researcher;;Researcher", "bibtex": "@inproceedings{\nesser2024scaling,\ntitle={Scaling Rectified Flow Transformers for High-Resolution Image Synthesis},\nauthor={Patrick Esser and Sumith Kulal and Andreas Blattmann and Rahim Entezari and Jonas M{\\\"u}ller and Harry Saini and Yam Levi and Dominik Lorenz and Axel Sauer and Frederic Boesel and Dustin Podell and Tim Dockhorn and Zion English and Robin Rombach},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FPnUhsQJ5B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9828401, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 14, "gs_citation": 1056, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13770088418573674516&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uni-heidelberg.de;stability.ai;lmu.de;stability.ai;stability.ai;;;stability.ai;uni-tuebingen.de;stability.ai;stability.ai;stability.ai;;stabilty.ai", "author_num": 14, "aff_unique_index": "0;1;2;3;1;1;4;1;1;1;1", "aff_unique_norm": "Heidelberg University;Stability AI;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Stability;University of Tuebingen", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uni-heidelberg.de;https://stability.ai;https://www.lmu.de;;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni Heidelberg;Stability AI;LMU;;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", 
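For the rectified flow entry above, training reduces to regressing a velocity field along straight data-to-noise paths, with timesteps drawn from a distribution biased toward intermediate noise levels. A sketch of batch construction under those assumptions (the logit-normal parameters and tensor shapes are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)

def rectified_flow_batch(x0, logit_mean=0.0, logit_std=1.0):
    """Build one rectified-flow training batch: draw t from a logit-normal
    (biasing training toward mid-path noise levels, the kind of reweighting
    the paper studies; exact parameters here are assumptions), form the
    straight-line interpolant x_t = (1 - t) x0 + t eps, and regress a velocity
    model toward the constant path derivative eps - x0."""
    eps = rng.normal(size=x0.shape)
    t = 1.0 / (1.0 + np.exp(-rng.normal(logit_mean, logit_std, size=(len(x0), 1))))
    x_t = (1.0 - t) * x0 + t * eps
    target = eps - x0                     # d x_t / d t along the straight path
    return x_t, t, target

x0 = rng.normal(size=(8, 16))             # stand-in for encoded images
x_t, t, v_target = rectified_flow_batch(x0)
# a training step would then be:  loss = mean((v_model(x_t, t) - v_target) ** 2)
print(x_t.shape, float(t.min()), float(t.max()))
```

Because the path is a straight line, the regression target is constant in t, which is what makes few-step sampling plausible once the velocity model is accurate.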
"aff_country_unique_index": "0;1;0;1;1;0;1;1;1;1", "aff_country_unique": "Germany;United States;" }, { "title": "FightLadder: A Benchmark for Competitive Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34534", "id": "FQQ4476dT2", "proceeding": "https://proceedings.mlr.press/v235/li24q.html", "pdf": "https://openreview.net/pdf?id=FQQ4476dT2", "openreview": "https://openreview.net/forum?id=FQQ4476dT2", "author_site": "Wenzhe Li, Zihan Ding, Seth Karten, Chi Jin", "tldr": "", "abstract": "Recent advances in reinforcement learning (RL) heavily rely on a variety of well-designed benchmarks, which provide environmental platforms and consistent criteria to evaluate existing and novel algorithms. Specifically, in multi-agent RL (MARL), a plethora of benchmarks based on cooperative games have spurred the development of algorithms that improve the scalability of cooperative multi-agent systems. However, for the competitive setting, a lightweight and open-sourced benchmark with challenging gaming dynamics and visual inputs has not yet been established. In this work, we present FightLadder, a real-time fighting game platform, to empower competitive MARL research. Along with the platform, we provide implementations of state-of-the-art MARL algorithms for competitive games, as well as a set of evaluation metrics to characterize the performance and exploitability of agents. We demonstrate the feasibility of this platform by training a general agent that consistently defeats 12 built-in characters in single-player mode, and expose the difficulty of training a non-exploitable agent without human knowledge and demonstrations in two-player mode. FightLadder provides meticulously designed environments to address critical challenges in competitive MARL research, aiming to catalyze a new era of discovery and advancement in the field. 
Videos and code at https://sites.google.com/view/fightladder/home.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenzhe Li;Zihan Ding;Seth Karten;Chi Jin", "authorids": "~Wenzhe_Li2;~Zihan_Ding1;~Seth_Karten1;~Chi_Jin1", "gender": "M;M;;M", "homepage": "https://wenzhe-li.github.io/;https://quantumiracle.github.io/webpage/;https://sethkarten.ai/;https://sites.google.com/view/cjin/home", "dblp": "12/1866;;304/2558;126/1802-1", "google_scholar": "https://scholar.google.com/citations?hl=en;t5DgPBAAAAAJ;gzyxNfkAAAAJ;GINhGvwAAAAJ", "orcid": ";;0000-0003-2908-4730;", "linkedin": ";;seth-karten/;", "or_profile": "~Wenzhe_Li2;~Zihan_Ding1;~Seth_Karten1;~Chi_Jin1", "aff": "Princeton University;Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nli2024fightladder,\ntitle={FightLadder: A Benchmark for Competitive Multi-Agent Reinforcement Learning},\nauthor={Wenzhe Li and Zihan Ding and Seth Karten and Chi Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FQQ4476dT2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6328217, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6493703940641181091&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "princeton.edu;princeton.edu;princeton.edu;princeton.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "CarbonNovo: Joint Design of Protein Structure and Sequence Using a Unified Energy-based Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34533", "id": "FSxTEvuFa7", "proceeding": "https://proceedings.mlr.press/v235/ren24e.html", "pdf": "https://openreview.net/pdf?id=FSxTEvuFa7", "openreview": "https://openreview.net/forum?id=FSxTEvuFa7", "author_site": "Milong Ren, Tian Zhu, Haicang Zhang", "tldr": "", "abstract": "De novo protein design aims to create novel protein structures and sequences unseen in nature. Recent structure-oriented design methods typically employ a two-stage strategy, where structure design and sequence design modules are trained separately, and the backbone structures and sequences are generated sequentially in inference. While diffusion-based generative models like RFdiffusion show great promise in structure design, they face inherent limitations within the two-stage framework. First, the sequence design module risks overfitting, as the accuracy of the generated structures may not align with that of the crystal structures used for training. Second, the sequence design module lacks interaction with the structure design module to further optimize the generated structures. To address these challenges, we propose CarbonNovo, a unified energy-based model for jointly generating protein structure and sequence. Specifically, we leverage a score-based generative model and Markov Random Fields for describing the energy landscape of protein structure and sequence. 
In CarbonNovo, the structure and sequence design modules communicate at each diffusion step, encouraging the generation of more coherent structure-sequence pairs. Moreover, the unified framework allows for incorporating the protein language models as evolutionary constraints for generated proteins. The rigorous evaluation demonstrates that CarbonNovo outperforms two-stage methods across various metrics, including designability, novelty, sequence plausibility, and Rosetta Energy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Milong Ren;Tian Zhu;Haicang Zhang", "authorids": "~Milong_Ren2;~Tian_Zhu1;~Haicang_Zhang1", "gender": "M;M;M", "homepage": "https://github.com/rabbit-0001/renmilong;https://eurekazhu.github.io;", "dblp": ";;138/0439", "google_scholar": ";rLxdI10AAAAJ;myzZFrYAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Milong_Ren2;~Tian_Zhu1;~Haicang_Zhang1", "aff": " Institute of Computing Technology;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ict.ac.cn;ict.ac.cn", "position": "PhD student;MS student;Associate Professor", "bibtex": "@inproceedings{\nren2024carbonnovo,\ntitle={CarbonNovo: Joint Design of Protein Structure and Sequence Using a Unified Energy-based Model},\nauthor={Milong Ren and Tian Zhu and Haicang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FSxTEvuFa7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4258570, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2794261527584880608&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ict.ac.cn;ict.ac.cn;ict.ac.cn", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Institute of Computing Technology;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Computing Technology", "aff_unique_url": "http://www.ict.ac.cn;http://www.ict.ac.cn", "aff_unique_abbr": ";CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Adaptive Advantage-Guided Policy Regularization for Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34532", "id": "FV3kY9FBW6", "proceeding": "https://proceedings.mlr.press/v235/liu24ai.html", "pdf": "https://openreview.net/pdf?id=FV3kY9FBW6", "openreview": "https://openreview.net/forum?id=FV3kY9FBW6", "author_site": "Tenglong Liu, Yang Li, Yixing Lan, Hao Gao, Wei Pan, Xin Xu", "tldr": "", "abstract": "In offline reinforcement learning, the challenge of out-of-distribution (OOD) is pronounced. To address this, existing methods often constrain the learned policy through policy regularization. However, these methods often suffer from the issue of unnecessary conservativeness, hampering policy improvement. This occurs due to the indiscriminate use of all actions from the behavior policy that generates the offline dataset as constraints. The problem becomes particularly noticeable when the quality of the dataset is suboptimal. Thus, we propose Adaptive Advantage-guided Policy Regularization (A2PR), obtaining high-advantage actions from an augmented behavior policy combined with VAE to guide the learned policy.
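A toy rendering of the advantage-guided idea in the A2PR entry above: constrain the policy only toward actions whose estimated advantage is positive, instead of cloning every behavior action. The names, shapes, and binary weighting below are illustrative, not the paper's exact loss:

```python
import numpy as np

rng = np.random.default_rng(0)

def advantage_weights(q_values, v_values):
    """Binary advantage weights: keep a behavior (or VAE-generated) action as a
    regularization target only when A = Q(s, a) - V(s) > 0, so low-advantage
    dataset actions stop dragging the learned policy toward them."""
    return (q_values - v_values > 0).astype(float)

# Toy batch of 32 transitions with 4-dimensional actions.
q = rng.normal(size=32)
v = rng.normal(size=32)
w = advantage_weights(q, v)
pi_actions = rng.normal(size=(32, 4))      # actions proposed by the policy
data_actions = rng.normal(size=(32, 4))    # actions from the dataset / VAE
reg = np.mean(w[:, None] * (pi_actions - data_actions) ** 2)
print("fraction of actions kept as constraints:", w.mean(), "reg:", reg)
```

The regularizer stays within the data manifold (conservatism against OOD actions) while the advantage gate removes the unnecessary conservativeness the abstract describes.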
A2PR can select high-advantage actions that differ from those present in the dataset, while still effectively maintaining conservatism from OOD actions. This is achieved by harnessing the VAE capacity to generate samples matching the distribution of the data points. We theoretically prove that the improvement of the behavior policy is guaranteed. Besides, it effectively mitigates value overestimation with a bounded performance gap. Empirically, we conduct a series of experiments on the D4RL benchmark, where A2PR demonstrates state-of-the-art performance. Furthermore, experimental results on additional suboptimal mixed datasets reveal that A2PR exhibits superior performance. Code is available at https://github.com/ltlhuuu/A2PR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tenglong Liu;Yang Li;Yixing Lan;Hao Gao;Wei Pan;Xin Xu", "authorids": "~Tenglong_Liu1;~Yang_Li40;~Yixing_Lan2;~Hao_Gao5;~Wei_Pan2;~Xin_Xu1", "gender": "M;M;M;M;M;M", "homepage": "https://github.com/ltlhuuu;https://liyang.page;;http://panweihit.github.io;;", "dblp": ";;;;66/3874-1;302/1436", "google_scholar": ";msAmwaoAAAAJ;;GqryWPsAAAAJ;;", "orcid": ";; 0000-0001-6974-4997;0000-0003-1121-9879;;", "linkedin": ";;;wei-pan-6b558b17/;;", "or_profile": "~Tenglong_Liu1;~Yang_Li40;~Hao_Gao5;~Wei_Pan2;~Xin_Xu1;~YIXING_LAN1", "aff": "National University of Defense Technology;University of Manchester;National University of Defense Technology;University of Manchester;National University of Defense Technology, China;", "aff_domain": "nudt.edu.cn;cs.manchester.ac.uk;nudt.edu.cn;manchester.ac.uk;nudt.edu.cn;", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor;", "bibtex": "@inproceedings{\nliu2024adaptive,\ntitle={Adaptive Advantage-Guided Policy Regularization for Offline Reinforcement Learning},\nauthor={Tenglong Liu and Yang Li and Yixing Lan and Hao Gao and Wei Pan and Xin Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FV3kY9FBW6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5102942, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5016177561001719436&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nudt.edu.cn;cs.manchester.ac.uk;nudt.edu.cn;manchester.ac.uk;nudt.edu.cn;", "author_num": 6, "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "National University of Defense Technology;University of Manchester", "aff_unique_dep": ";", "aff_unique_url": "http://www.nudt.edu.cn/;https://www.manchester.ac.uk", "aff_unique_abbr": "NUDT;UoM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Auditing Private Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34531", "id": "FVmqX0sYz9", "proceeding": "https://proceedings.mlr.press/v235/chadha24a.html", "pdf": "https://openreview.net/pdf?id=FVmqX0sYz9", "openreview": "https://openreview.net/forum?id=FVmqX0sYz9", "author_site": "Karan Chadha, Matthew Jagielski, Nicolas Papernot, Christopher A. Choquette Choo, Milad Nasr", "tldr": "", "abstract": "Differential privacy (DP) offers a theoretical upper bound on the potential privacy leakage of an algorithm, while empirical auditing establishes a practical lower bound. Auditing techniques exist for DP training algorithms. 
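A standard primitive behind audits like the private-prediction one described in this entry: an (eps, delta)-DP guarantee bounds any attacker's true/false-positive trade-off, so measured attack rates yield an empirical lower bound on eps. A minimal sketch (the paper works in terms of Renyi DP; this is the simpler (eps, delta) conversion):

```python
import math

def empirical_eps_lower_bound(tpr, fpr, delta=0.0):
    """Any (eps, delta)-DP mechanism satisfies tpr <= e^eps * fpr + delta for
    every distinguishing attack, so observed attack rates imply
        eps >= ln((tpr - delta) / fpr).
    Rates here would come from repeated runs of a poisoning/query attack."""
    if fpr <= 0.0 or tpr <= delta:
        return 0.0
    return max(0.0, math.log((tpr - delta) / fpr))

# e.g. an attack that flags true members 60% of the time, non-members 5%:
print(empirical_eps_lower_bound(tpr=0.60, fpr=0.05))  # ~ 2.48
```

The gap between this empirical lower bound and the analytical upper bound is exactly what such audits use to judge how tight a privacy analysis is.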
However, machine learning can also be made private at inference. We propose the first framework for auditing private prediction where we instantiate adversaries with varying poisoning and query capabilities. This enables us to study the privacy leakage of four private prediction algorithms: PATE (Papernot et al., 2016), CaPC (Choquette-Choo et al., 2020), PromptPATE (Duan et al., 2023), and Private-kNN (Zhu et al., 2020). To conduct our audit, we introduce novel techniques to empirically evaluate privacy leakage in terms of Renyi DP. Our experiments show that (i) the privacy analysis of private prediction can be improved, (ii) algorithms which are easier to poison lead to much higher privacy leakage, and (iii) the privacy leakage is significantly lower for adversaries without query control than those with full control.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Karan Chadha;Matthew Jagielski;Nicolas Papernot;Christopher A. Choquette-Choo;Milad Nasr", "authorids": "~Karan_Chadha1;~Matthew_Jagielski1;~Nicolas_Papernot1;~Christopher_A._Choquette-Choo1;~Milad_Nasr2", "gender": "M;M;M;;M", "homepage": "https://jagielski.github.io/;https://www.papernot.fr;https://www.christopherchoquette.com;https://people.cs.umass.edu/~milad/;https://web.stanford.edu/~knchadha/", "dblp": "218/5156;162/1405;250/9674;;140/0852", "google_scholar": "_8rw_GMAAAAJ;cGxq0cMAAAAJ;oDE4I64AAAAJ;k6-nvDAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";nicolaspapernot;christopher-choquette-choo/;;", "or_profile": "~Matthew_Jagielski1;~Nicolas_Papernot1;~Christopher_A._Choquette-Choo1;~Milad_Nasr2;~Karan_N_Chadha1", "aff": "Google;Google;Google DeepMind;Google;Stanford University", "aff_domain": "google.com;google.com;google.com;google.com;stanford.edu", "position": "Researcher;Research Scientist;Research Scientist;Researcher;PhD student", "bibtex": "@inproceedings{\nchadha2024auditing,\ntitle={Auditing Private Prediction},\nauthor={Karan Chadha and Matthew Jagielski and Nicolas Papernot and Christopher A.
Choquette-Choo and Milad Nasr},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FVmqX0sYz9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1497756, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7001232892271091676&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "google.com;google.com;google.com;google.com;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Google;Stanford University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.stanford.edu", "aff_unique_abbr": "Google;Stanford", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Mountain View;;Stanford", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "MOMENT: A Family of Open Time-series Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34530", "id": "FVvf69a5rx", "proceeding": "https://proceedings.mlr.press/v235/goswami24a.html", "pdf": "https://openreview.net/pdf?id=FVvf69a5rx", "openreview": "https://openreview.net/forum?id=FVvf69a5rx", "author_site": "Mononito Goswami, Konrad Szafer, Arjun Choudhry, Yifu Cai, Shuo Li, Artur Dubrawski", "tldr": "", "abstract": "We introduce MOMENT, a family of open-source foundation models for general-purpose time series analysis. Pre-training large models on time series data is challenging due to (1) the absence of a large and cohesive public time series repository, and (2) diverse time series characteristics which make multi-dataset training onerous. Additionally, (3) experimental benchmarks to evaluate these models, especially in scenarios with limited resources, time, and supervision, are still in their nascent stages. To address these challenges, we compile a large and diverse collection of public time series, called the Time series Pile, and systematically tackle time series-specific challenges to unlock large-scale multi-dataset pre-training. We then build on recent work to design a benchmark to evaluate time series foundation models on diverse tasks and datasets in limited supervision settings. Experiments on this benchmark demonstrate the effectiveness of our pre-trained models with minimal data and task-specific fine-tuning. Finally, we present several interesting empirical observations about large pre-trained time series models.
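The MOMENT entry lists Hugging Face repository IDs (next sentence). A hedged loading sketch: the generic `transformers` auto-class path with `trust_remote_code` is an assumption here, not something the entry confirms — the model card may prescribe a dedicated pipeline class instead, so check it before relying on this:

```python
# Hedged sketch: loading the released weights from the Hugging Face Hub.
# AutoModel + trust_remote_code is an assumed entry point; consult the
# AutonLab/MOMENT-1-large model card for the intended usage.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "AutonLab/MOMENT-1-large",
    trust_remote_code=True,  # custom time-series architecture, not a stock one
)
print(sum(p.numel() for p in model.parameters()), "parameters")
```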
Pre-trained models (AutonLab/MOMENT-1-large) and Time Series Pile (AutonLab/Timeseries-PILE) are available on [Huggingface](https://huggingface.co/AutonLab).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mononito Goswami;Konrad Szafer;Arjun Choudhry;Yifu Cai;Shuo Li;Artur Dubrawski", "authorids": "~Mononito_Goswami1;~Konrad_Szafer1;~Arjun_Choudhry1;~Yifu_Cai1;~Shuo_Li7;~Artur_Dubrawski2", "gender": "M;;M;M;M;M", "homepage": "https://mononito.com;https://konradszafer.github.io/;;;;https://www.autonlab.org", "dblp": "243/3771;;;;;76/48", "google_scholar": "https://scholar.google.co.in/citations?hl=en;;https://scholar.google.com/citations?view_op=list_works;;-QaDf40AAAAJ;O3gezzcAAAAJ", "orcid": "0000-0002-4117-5558;;0000-0002-3416-6020;;;0000-0002-2372-0831", "linkedin": "https://linkedin.com/in/mononitogoswami/;;;yifu-cai-a401581b2/;shuo-li-bbb2a11b1/;artur-dubrawski-33a2a87/", "or_profile": "~Mononito_Goswami1;~Konrad_Szafer1;~Arjun_Choudhry1;~Yifu_Cai1;~Shuo_Li7;~Artur_Dubrawski2", "aff": "Google;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Amazon;Carnegie Mellon University", "aff_domain": "google.com;cmu.edu;andrew.cmu.edu;cmu.edu;amazon.com;cmu.edu", "position": "Intern;Research Assistant ;MS student;Undergrad student;Intern;Research Professor", "bibtex": "@inproceedings{\ngoswami2024moment,\ntitle={{MOMENT}: A Family of Open Time-series Foundation Models},\nauthor={Mononito Goswami and Konrad Szafer and Arjun Choudhry and Yifu Cai and Shuo Li and Artur Dubrawski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FVvf69a5rx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3445900, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 164, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4329308403403728453&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "google.com;cmu.edu;andrew.cmu.edu;cmu.edu;amazon.com;cmu.edu", "author_num": 6, "aff_unique_index": "0;1;1;1;2;1", "aff_unique_norm": "Google;Carnegie Mellon University;Amazon", "aff_unique_dep": "Google;;Amazon.com, Inc.", "aff_unique_url": "https://www.google.com;https://www.cmu.edu;https://www.amazon.com", "aff_unique_abbr": "Google;CMU;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "To Each (Textual Sequence) Its Own: Improving Memorized-Data Unlearning in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34529", "id": "FWlNA3et6X", "proceeding": "https://proceedings.mlr.press/v235/barbulescu24a.html", "pdf": "https://openreview.net/pdf?id=FWlNA3et6X", "openreview": "https://openreview.net/forum?id=FWlNA3et6X", "author_site": "George-Octavian B\u0103rbulescu, Peter Triantafillou", "tldr": "", "abstract": "LLMs have been found to memorize training textual sequences and regurgitate verbatim said sequences during text generation time. This fact is known to be the cause of privacy and related (e.g., copyright) problems. Unlearning in LLMs then takes the form of devising new algorithms that will properly deal with these side-effects of memorized data, while not hurting the model's utility. 
We offer a fresh perspective towards this goal, namely, that each textual sequence to be forgotten should be treated differently when being unlearned based on its degree of memorization within the LLM. We contribute a new metric for measuring unlearning quality, an adversarial attack showing that SOTA algorithms lacking this perspective fail for privacy, and two new unlearning methods based on Gradient Ascent and Task Arithmetic, respectively. A comprehensive performance evaluation across an extensive suite of NLP tasks then maps the solution space, identifying the best solutions under different model capacities and forget-set sizes, and quantifies the gains of the new approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "George-Octavian B\u0103rbulescu;Peter Triantafillou", "authorids": "~George-Octavian_B\u0103rbulescu1;~Peter_Triantafillou1", "gender": ";", "homepage": ";https://warwick.ac.uk/fac/sci/dcs/people/peter_triantafillou/", "dblp": ";t/PeterTriantafillou", "google_scholar": "hUK-vOMAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~George-Octavian_B\u0103rbulescu1;~Peter_Triantafillou1", "aff": "University of Warwick;University of Warwick", "aff_domain": "warwick.ac.uk;warwick.ac.uk", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nb{\\u{a}}rbulescu2024to,\ntitle={To Each (Textual Sequence) Its Own: Improving Memorized-Data Unlearning in Large Language Models},\nauthor={George-Octavian B{\\u{a}}rbulescu and Peter Triantafillou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FWlNA3et6X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 802690, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18089146872021578365&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "warwick.ac.uk;warwick.ac.uk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Warwick", "aff_unique_dep": "", "aff_unique_url": "https://www.warwick.ac.uk", "aff_unique_abbr": "Warwick", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "3D Geometric Shape Assembly via Efficient Point Cloud Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34528", "id": "FYQIgQWH3d", "proceeding": "https://proceedings.mlr.press/v235/lee24s.html", "pdf": "https://openreview.net/pdf?id=FYQIgQWH3d", "openreview": "https://openreview.net/forum?id=FYQIgQWH3d", "author_site": "Nahyuk Lee, Juhong Min, Junha Lee, Seungwook Kim, Kanghee Lee, Jaesik Park, Minsu Cho", "tldr": "", "abstract": "Learning to assemble geometric shapes into a larger target structure is a pivotal task in various practical applications. In this work, we tackle this problem by establishing local correspondences between point clouds of part shapes in both coarse- and fine-levels. To this end, we introduce Proxy Match Transform (PMT), an approximate high-order feature transform layer that enables reliable matching between mating surfaces of parts while incurring low costs in memory and compute. Building upon PMT, we introduce a new framework, dubbed Proxy Match TransformeR (PMTR), for the geometric assembly task.
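For the memorized-data unlearning entry above, the Gradient Ascent family has a compact core: take optimizer steps that increase the language-modeling loss on the forget set. A generic sketch, assuming a Hugging Face-style model whose forward returns `.loss` when labels are passed; the clipping threshold is arbitrary, and the paper's per-sequence memorization weighting is not modeled here:

```python
import torch

def gradient_ascent_unlearn_step(model, batch, optimizer, max_norm=1.0):
    """One unlearning step: *ascend* the LM loss on sequences to be forgotten
    (implemented as descending the negated loss), with gradient clipping so a
    single step cannot destroy the model's general utility."""
    optimizer.zero_grad()
    loss = model(**batch).loss            # batch must include labels
    (-loss).backward()                    # ascent on the forget-set loss
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    return loss.item()
```

In practice such steps are interleaved with retain-set training or utility checks, since unconstrained ascent degrades the model quickly.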
We evaluate the proposed PMTR on the large-scale 3D geometric shape assembly benchmark dataset of Breaking Bad and demonstrate its superior performance and efficiency compared to state-of-the-art methods. Project page: https://nahyuklee.github.io/pmtr", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nahyuk Lee;Juhong Min;Junha Lee;Seungwook Kim;Kanghee Lee;Jaesik Park;Minsu Cho", "authorids": "~Nahyuk_Lee1;~Juhong_Min1;~Junha_Lee2;~Seungwook_Kim2;~Kanghee_Lee2;~Jaesik_Park3;~Minsu_Cho1", "gender": "M;;M;;;M;M", "homepage": "https://nahyuklee.github.io/;;https://junha-l.github.io;;https://github.com/Kanghee-Lee/;http://jaesik.info;http://cvlab.postech.ac.kr/~mcho/", "dblp": "383/1188;;53/11266;;;00/10336;", "google_scholar": "QQamvI0AAAAJ;;RB7qMm4AAAAJ;;;_3q6KBIAAAAJ;5TyoF5QAAAAJ", "orcid": ";;;;;;", "linkedin": "nahyuk-lee/;;https://linkedin.com/in/junha-l/;;\uac15\ud76c-\uc774-9bb723194;;minsu-cho-062b3750/", "or_profile": "~Nahyuk_Lee1;~Juhong_Min1;~Junha_Lee2;~Seungwook_Kim2;~Kanghee_Lee2;~Jaesik_Park3;~Minsu_Cho1", "aff": "Pohang University of Science and Technology;;Pohang University of Science and Technology;;Seoul National University;Seoul National University;POSTECH", "aff_domain": "postech.ac.kr;;postech.ac.kr;;snu.ac.kr;snu.ac.kr;postech.ac.kr", "position": "MS student;;PhD student;;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nlee2024d,\ntitle={3D Geometric Shape Assembly via Efficient Point Cloud Matching},\nauthor={Nahyuk Lee and Juhong Min and Junha Lee and Seungwook Kim and Kanghee Lee and Jaesik Park and Minsu Cho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FYQIgQWH3d}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4848665, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14156662579522892735&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 11, "email": "postech.ac.kr;;postech.ac.kr;;snu.ac.kr;snu.ac.kr;postech.ac.kr", "author_num": 7, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Pohang University of Science and Technology;Seoul National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.postech.ac.kr;https://www.snu.ac.kr", "aff_unique_abbr": "POSTECH;SNU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34527", "id": "FYvpxyS43U", "proceeding": "https://proceedings.mlr.press/v235/nikdan24a.html", "pdf": "https://openreview.net/pdf?id=FYvpxyS43U", "openreview": "https://openreview.net/forum?id=FYvpxyS43U", "author_site": "Mahdi Nikdan, Soroush Tabesh, Elvir Crn\u010devi\u0107, Dan Alistarh", "tldr": "", "abstract": "We investigate parameter-efficient fine-tuning (PEFT) methods that can provide good accuracy under limited computational and memory budgets in the context of large language models (LLMs). We present a new PEFT method called Robust Adaptation (RoSA) inspired by robust principal component analysis that jointly trains *low-rank* and *highly-sparse* components on top of a set of fixed pretrained weights to efficiently approximate the performance of a full fine-tuning (FFT) solution.
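The adapter shape in the RoSA entry above is easy to sketch: a trainable low-rank product plus a trainable sparse residual on a fixed support, added to a frozen weight. Rank, density, and initialization below are illustrative assumptions, and a real implementation would use sparse GPU kernels rather than a dense mask:

```python
import torch

def rosa_style_adapter(d_out, d_in, rank=8, density=0.01):
    """Low-rank pair (B @ A) plus sparse residual S on a fixed random support.
    B starts at zero so the effective weight initially equals the frozen W."""
    A = (0.01 * torch.randn(rank, d_in)).requires_grad_()
    B = torch.zeros(d_out, rank).requires_grad_()
    mask = (torch.rand(d_out, d_in) < density).float()  # fixed, not trained
    S = torch.zeros(d_out, d_in).requires_grad_()
    return A, B, S, mask

W = torch.randn(64, 64)                     # frozen pretrained weight
A, B, S, mask = rosa_style_adapter(*W.shape)
W_eff = W + B @ A + S * mask                # used in the forward pass
print(W_eff.shape, int(mask.sum()), "sparse trainable entries")
```

The robust-PCA intuition is that fine-tuning deltas decompose well into a smooth low-rank part plus a few large isolated entries, which neither LoRA nor pure sparse tuning captures alone.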
Across a series of challenging generative tasks such as grade-school math and SQL query generation, which require fine-tuning for good performance, we show that RoSA outperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at the same parameter budget, and can even recover the performance of FFT on some tasks. We provide system support for RoSA to complement the training algorithm, specifically in the form of sparse GPU kernels which enable memory- and computationally-efficient training, and show that it is also compatible with low-precision base weights, resulting in the first joint representation combining quantization, low-rank and sparse approximations. Our code is available at https://github.com/IST-DASLab/RoSA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mahdi Nikdan;Soroush Tabesh;Elvir Crn\u010devi\u0107;Dan Alistarh", "authorids": "~Mahdi_Nikdan1;~Soroush_Tabesh1;~Elvir_Crn\u010devi\u01071;~Dan_Alistarh7", "gender": "M;M;;M", "homepage": "https://mnikdan.github.io/;;https://github.com/elvircrn;http://people.csail.mit.edu/alistarh/", "dblp": "298/2929;;;36/3251.html", "google_scholar": "bqVj7DsAAAAJ;J3vHHVEAAAAJ;;https://scholar.google.com.tw/citations?user=75q-6ZQAAAAJ", "orcid": ";;;", "linkedin": ";soroush-tabesh/;elvir-crncevic-99a17960/;", "or_profile": "~Mahdi_Nikdan1;~Soroush_Tabesh1;~Elvir_Crn\u010devi\u01071;~Dan_Alistarh1", "aff": "Google;Institute of Science and Technology Austria;Technische Universit\u00e4t Graz;Institute of Science and Technology", "aff_domain": "google.com;ista.ac.at;tugraz.at;ist.ac.at", "position": "Intern;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nnikdan2024rosa,\ntitle={Ro{SA}: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation},\nauthor={Mahdi Nikdan and Soroush Tabesh and Elvir Crn{\\v{c}}evi{\\'c} and Dan Alistarh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FYvpxyS43U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1346384, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13513398654106834283&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "google.com;ista.ac.at;tugraz.at;ist.ac.at", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Google;Institute of Science and Technology Austria;Technische Universit\u00e4t Graz;Institute of Science and Technology", "aff_unique_dep": "Google;;;", "aff_unique_url": "https://www.google.com;https://www.ist.ac.at;https://www.tugraz.at;", "aff_unique_abbr": "Google;IST Austria;TU Graz;", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Austria;" }, { "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34526", "id": "Ffpg52swvg", "proceeding": "https://proceedings.mlr.press/v235/gu24c.html", "pdf": "https://openreview.net/pdf?id=Ffpg52swvg", "openreview": "https://openreview.net/forum?id=Ffpg52swvg", "author_site": "Alex Gu, Baptiste Roziere, Hugh Leather, Armando Solar-Lezama, Gabriel Synnaeve, Sida Wang", "tldr": "", "abstract": "We present Code Reasoning, Understanding, and eXecution Evaluation, a benchmark consisting of 800 Python functions (3-13 lines). 
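CRUXEval's two tasks are defined in the next sentence; scoring the output-prediction side amounts to executing the function and comparing against the model's prediction. A toy checker, assuming each benchmark function is named `f` (a simplification; the real harness differs):

```python
def check_output_prediction(fn_src, test_input, predicted_output):
    """Score one output-prediction item: run the benchmark function on the
    given input and compare with the model's predicted output."""
    ns = {}
    exec(fn_src, ns)   # caution: only for small, trusted benchmark functions
    return ns["f"](test_input) == predicted_output

src = "def f(s):\n    return s + ''.join(sorted(s))\n"
print(check_output_prediction(src, "ba", "baab"))  # True -> correct prediction
```

Input prediction is graded symmetrically: execute the function on the model's predicted input and check that it reproduces the given output.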
Each function comes with an input-output pair, leading to two natural tasks: input prediction and output prediction. First, we propose a general recipe for generating our execution benchmark by sampling from a model, which can be used for more challenging versions of the benchmark if needed. Second, we evaluate twenty code models on our benchmark and discover that many recent high-scoring models on HumanEval show no improvements on our benchmark. Third, we show that simple CoT and fine-tuning schemes can improve performance on our benchmark but remain far from solving it. The best setup, GPT-4 with chain of thought (CoT), achieves a pass@1 of 75% and 81% on input and output prediction, respectively. In contrast, Code Llama 34B achieves a pass@1 of 50% and 46% on input and output prediction. When it comes to reasoning about code, GPT-4 has a huge edge over other models but still fails consistently on some surprisingly simple Python programs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alex Gu;Baptiste Roziere;Hugh James Leather;Armando Solar-Lezama;Gabriel Synnaeve;Sida Wang", "authorids": "~Alex_Gu1;~Baptiste_Roziere1;~Hugh_James_Leather1;~Armando_Solar-Lezama1;~Gabriel_Synnaeve1;~Sida_Wang2", "gender": "M;;M;M;M;M", "homepage": "https://minimario.github.io/;;https://homepages.inf.ed.ac.uk/hleather/;https://people.csail.mit.edu/asolar/;;https://www.sidaw.xyz", "dblp": "285/4734;;;95/6919;http://dblp.uni-trier.de/pers/hd/s/Synnaeve:Gabriel;153/9609", "google_scholar": "jRQtBp0AAAAJ;CrSf2CQAAAAJ;;https://scholar.google.com.tw/citations?user=8BX3BokAAAAJ;wN9rBkcAAAAJ;XUI4PMEAAAAJ", "orcid": ";;;;;", "linkedin": "alex-gu-8b7664175/;;;;;", "or_profile": "~Alex_Gu1;~Baptiste_Roziere1;~Hugh_James_Leather1;~Armando_Solar-Lezama1;~Gabriel_Synnaeve1;~Sida_Wang2", "aff": "Massachusetts Institute of Technology;Meta AI;Meta Facebook;Massachusetts Institute of Technology;Meta Facebook;Meta Facebook", "aff_domain": "mit.edu;fb.com;fb.com;mit.edu;fb.com;fb.com", "position": "PhD student;Researcher;Researcher;Full Professor;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\ngu2024cruxeval,\ntitle={{CRUXE}val: A Benchmark for Code Reasoning, Understanding and Execution},\nauthor={Alex Gu and Baptiste Roziere and Hugh James Leather and Armando Solar-Lezama and Gabriel Synnaeve and Sida Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ffpg52swvg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1377441, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13397964153763973539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;fb.com;fb.com;mit.edu;fb.com;fb.com", "author_num": 6, "aff_unique_index": "0;1;1;0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Meta", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://web.mit.edu;https://meta.com", "aff_unique_abbr": "MIT;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Intersectional Unfairness Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34525", "id": "FhWH9TQSMh", "proceeding": "https://proceedings.mlr.press/v235/xu24d.html", "pdf": "https://openreview.net/pdf?id=FhWH9TQSMh", "openreview": 
"https://openreview.net/forum?id=FhWH9TQSMh", "author_site": "Gezheng Xu, Qi CHEN, Charles X. Ling, Boyu Wang, Changjian Shui", "tldr": "", "abstract": "AI systems have been shown to produce unfair results for certain subgroups of population, highlighting the need to understand bias on certain sensitive attributes. Current research often falls short, primarily focusing on the subgroups characterized by a single sensitive attribute, while neglecting the nature of intersectional fairness of multiple sensitive attributes. This paper focuses on its one fundamental aspect by discovering diverse high-bias intersectional sensitive attributes. Specifically, we propose a Bias-Guided Generative Network (BGGN). By treating each bias value as a reward, BGGN efficiently generates high-bias intersectional sensitive attributes. Experiments on real-world text and image datasets demonstrate a diverse and efficient discovery of BGGN. To further evaluate the generated unseen but possible unfair intersectional sensitive attributes, we formulate them as prompts and use modern generative AI to produce new text and images. The results of frequently generating biased data provides new insights of discovering potential unfairness in popular modern generative AI systems. **Warning: This paper contains examples that are offensive in nature.**", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gezheng Xu;Qi CHEN;Charles Ling;Boyu Wang;Changjian Shui", "authorids": "~Gezheng_Xu2;~Qi_CHEN6;~Charles_Ling1;~Boyu_Wang3;~Changjian_Shui2", "gender": "F;F;M;M;Not Specified", "homepage": "https://github.com/xugezheng;https://livreq.github.io/;http://cling.csd.uwo.ca/;https://sites.google.com/site/borriewang/;https://cjshui.github.io", "dblp": "293/7645;66/6320-15.html;;41/6565-4.html;215/5461", "google_scholar": ";MqLoSeoAAAAJ;https://scholar.google.co.uk/citations?hl=en;qAZM5KcAAAAJ;r91NXUgAAAAJ", "orcid": "0000-0001-5983-5756;0000-0002-7213-0221;;0000-0002-7413-4162;", "linkedin": ";;;;", "or_profile": "~Gezheng_Xu2;~Qi_CHEN6;~Charles_Ling1;~Boyu_Wang3;~changjian_shui1", "aff": "University of Western Ontario;Laval university;Western University;University of Western Ontario;Vector Institute", "aff_domain": "uwo.ca;ulaval.ca;uwo.ca;uwo.ca;vectorinstitute.ai", "position": "PhD student;PhD student;Professor;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nxu2024intersectional,\ntitle={Intersectional Unfairness Discovery},\nauthor={Gezheng Xu and Qi CHEN and Charles Ling and Boyu Wang and Changjian Shui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FhWH9TQSMh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5169660, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2911248284895548045&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "uwo.ca;ulaval.ca;uwo.ca;uwo.ca;vectorinstitute.ai", "author_num": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "University of Western Ontario;Laval University;Western University;Vector Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uwo.ca;https://www.laval.ca;https://www.uwo.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "UWO;Laval;Western;Vector Institute", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Generative Conditional 
Distributions by Neural (Entropic) Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34524", "id": "FoRqdsN4IA", "proceeding": "https://proceedings.mlr.press/v235/nguyen24h.html", "pdf": "https://openreview.net/pdf?id=FoRqdsN4IA", "openreview": "https://openreview.net/forum?id=FoRqdsN4IA", "author_site": "Bao Nguyen, Binh Nguyen, Trung Hieu Nguyen, Viet Anh Nguyen", "tldr": "", "abstract": "Learning conditional distributions is challenging because the desired outcome is not a single distribution but multiple distributions that correspond to multiple instances of the covariates. We introduce a novel neural entropic optimal transport method designed to effectively learn generative models of conditional distributions, particularly in scenarios characterized by limited sample sizes. Our method relies on the minimax training of two neural networks: a generative network parametrizing the inverse cumulative distribution functions of the conditional distributions and another network parametrizing the conditional Kantorovich potential. To prevent overfitting, we regularize the objective function by penalizing the Lipschitz constant of the network output. Our experiments on real-world datasets show the effectiveness of our algorithm compared to state-of-the-art conditional distribution learning techniques. Our implementation can be found at https://github.com/nguyenngocbaocmt02/GENTLE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bao Nguyen;Binh Nguyen;Hieu Trung Nguyen;Viet Anh Nguyen", "authorids": "~Bao_Nguyen2;~Binh_Nguyen2;~Hieu_Trung_Nguyen2;~Viet_Anh_Nguyen2", "gender": "M;M;M;M", "homepage": "https://www.researchgate.net/profile/Nguyen-Bao-36;https://tbng.github.io/;;http://www.vietanhnguyen.net", "dblp": "299/2996;241/2542;;", "google_scholar": ";6rpHj_YAAAAJ;OlFCFKgAAAAJ;3iyf-EoAAAAJ", "orcid": "0000-0002-6770-2408;;;", "linkedin": "bao-nguyen-0a360b197/;;hieu-nguyen-08774317a/;", "or_profile": "~Bao_Nguyen2;~Binh_Nguyen2;~Hieu_Trung_Nguyen2;~Viet_Anh_Nguyen2", "aff": "VinUniversity;National University of Singapore;The Chinese University of Hong Kong;The Chinese University of Hong Kong", "aff_domain": "vinuni.edu.vn;nus.edu.sg;cuhk.edu.hk;cuhk.edu.hk", "position": "Research Assistant;Research Fellow;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2024generative,\ntitle={Generative Conditional Distributions by Neural (Entropic) Optimal Transport},\nauthor={Bao Nguyen and Binh Nguyen and Hieu Trung Nguyen and Viet Anh Nguyen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FoRqdsN4IA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 551077, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16621129372883152145&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "vinuni.edu.vn;nus.edu.sg;cuhk.edu.hk;cuhk.edu.hk", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "VinUniversity;National University of Singapore;Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://vinuni.edu.vn;https://www.nus.edu.sg;https://www.cuhk.edu.hk", "aff_unique_abbr": "VinUni;NUS;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Vietnam;Singapore;China" }, { "title": "ReGAL: 
Refactoring Programs to Discover Generalizable Abstractions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34523", "id": "FovMAzXUpj", "proceeding": "https://proceedings.mlr.press/v235/stengel-eskin24a.html", "pdf": "https://openreview.net/pdf?id=FovMAzXUpj", "openreview": "https://openreview.net/forum?id=FovMAzXUpj", "author_site": "Elias Stengel-Eskin, Archiki Prasad, Mohit Bansal", "tldr": "", "abstract": "While large language models (LLMs) are increasingly being used for program synthesis, they lack the global view needed to develop useful abstractions; they generally predict programs one at a time, often repeating the same functionality. Generating redundant code from scratch is both inefficient and error-prone. To address this, we propose Refactoring for Generalizable Abstraction Learning (ReGAL), a gradient-free method for learning a library of reusable functions via code refactorization, i.e., restructuring code without changing its execution output. ReGAL learns from a small set of existing programs, iteratively verifying and refining its abstractions via execution. We find that the shared function libraries discovered by ReGAL make programs easier to predict across diverse domains. On five datasets \u2013 LOGO graphics generation, Date reasoning, TextCraft (a Minecraft-based text game), MATH, and TabMWP \u2013 both open-source and proprietary LLMs improve in accuracy when predicting programs with ReGAL functions. For CodeLlama-13B, ReGAL results in absolute accuracy increases of 11.5% on LOGO, 26.1% on date understanding, and 8.1% on TextCraft, outperforming GPT-3.5 in two of three domains. Our analysis reveals that ReGAL\u2019s abstractions encapsulate frequently used subroutines as well as environment dynamics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elias Stengel-Eskin;Archiki Prasad;Mohit Bansal", "authorids": "~Elias_Stengel-Eskin1;~Archiki_Prasad1;~Mohit_Bansal2", "gender": "M;F;M", "homepage": "https://esteng.github.io;https://archiki.github.io/;https://www.cs.unc.edu/~mbansal/", "dblp": "212/6138;264/2812;32/5243.html", "google_scholar": "gr_ZVSQAAAAJ;Svcwv-IAAAAJ;DN8QtscAAAAJ", "orcid": "0000-0002-6689-505X;;", "linkedin": ";archiki-prasad;", "or_profile": "~Elias_Stengel-Eskin1;~Archiki_Prasad1;~Mohit_Bansal2", "aff": "University of North Carolina at Chapel Hill;University of North Carolina, Chapel Hill;University of North Carolina at Chapel Hill", "aff_domain": "cs.unc.edu;unc.edu;unc.edu", "position": "Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nstengel-eskin2024regal,\ntitle={Re{GAL}: Refactoring Programs to Discover Generalizable Abstractions},\nauthor={Elias Stengel-Eskin and Archiki Prasad and Mohit Bansal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FovMAzXUpj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1024775, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17392610968844841878&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "cs.unc.edu;unc.edu;unc.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of North Carolina", "aff_unique_dep": "", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "United States" }, { "title": "Combinatorial Approximations for Cluster Deletion: Simpler, Faster, and Better", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34522", "id": "FpbKoIPHxb", "proceeding": "https://proceedings.mlr.press/v235/balmaseda24a.html", "pdf": "https://openreview.net/pdf?id=FpbKoIPHxb", "openreview": "https://openreview.net/forum?id=FpbKoIPHxb", "author_site": "Vicente Balmaseda, Ying Xu, Yixin Cao, Nate Veldt", "tldr": "", "abstract": "Cluster deletion is an NP-hard graph clustering objective with applications in computational biology and social network analysis, where the goal is to delete a minimum number of edges to partition a graph into cliques. We first provide a tighter analysis of two previous approximation algorithms, improving their approximation guarantees from 4 to 3. Moreover, we show that both algorithms can be derandomized in a surprisingly simple way, by greedily taking a vertex of maximum degree in an auxiliary graph and forming a cluster around it. One of these algorithms relies on solving a linear program. Our final contribution is to design a new and purely combinatorial approach for doing so that is far more scalable in theory and practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vicente Balmaseda;Ying Xu;Yixin Cao;Nate Veldt", "authorids": "~Vicente_Balmaseda1;~Ying_Xu11;~Yixin_Cao5;~Nate_Veldt2", "gender": ";;M;M", "homepage": "https://vibalcam.github.io/;;https://www.comp.polyu.edu.hk/~csycao/;https://veldt.engr.tamu.edu/", "dblp": ";;https://dblp.uni-trier.de/pers/hy/c/Cao_0001:Yixin;180/5829", "google_scholar": "RRYZEfQAAAAJ;;;6is0_9wAAAAJ", "orcid": ";;;", "linkedin": "vicente-balmaseda/;ying-xu-357615270;;", "or_profile": "~Vicente_Balmaseda1;~Ying_Xu11;~Yixin_Cao5;~Nate_Veldt2", "aff": "Texas A&M University - College Station;Hong Kong Polytechnic University;Hong Kong Polytechnic University;Texas A&M", "aff_domain": "tamu.edu;polyu.edu.hk;polyu.edu.hk;tamu.edu", "position": "MS student;Research Assistant;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nbalmaseda2024combinatorial,\ntitle={Combinatorial Approximations for Cluster Deletion: Simpler, Faster, and Better},\nauthor={Vicente Balmaseda and Ying Xu and Yixin Cao and Nate Veldt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FpbKoIPHxb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 557228, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4105362076448691380&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tamu.edu;polyu.edu.hk;polyu.edu.hk;tamu.edu", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Texas A&M University;Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tamu.edu;https://www.polyu.edu.hk", "aff_unique_abbr": "TAMU;PolyU", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "College Station;Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;China" }, { "title": "Accelerating PDE Data Generation via Differential Operator Action in Solution Space", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34521", "id": "Fv9GLw0LkO", "proceeding": "https://proceedings.mlr.press/v235/dong24d.html", "pdf": 
"https://openreview.net/pdf?id=Fv9GLw0LkO", "openreview": "https://openreview.net/forum?id=Fv9GLw0LkO", "author_site": "huanshuo dong, Hong Wang, Haoyang Liu, Jian Luo, Jie Wang", "tldr": "", "abstract": "Recent advancements in data-driven approaches, such as Neural Operator (NO), have demonstrated their effectiveness in reducing the solving time of Partial Differential Equations (PDEs). However, one major challenge faced by these approaches is the requirement for a large amount of high-precision training data, which needs significant computational costs during the generation process. To address this challenge, we propose a novel PDE dataset generation algorithm, namely **Diff**erential **O**perator **A**ction in **S**olution space (**DiffOAS**), which speeds up the data generation process and enhances the precision of the generated data simultaneously. Specifically, DiffOAS obtains a few basic PDE solutions and then combines them to get solutions. It applies differential operators on these solutions, a process we call 'operator action', to efficiently generate precise PDE data points. Theoretical analysis shows that the time complexity of DiffOAS method is one order lower than the existing generation method. Experimental results show that DiffOAS accelerates the generation of large-scale datasets with 10,000 instances by 300 times. Even with just 5% of the generation time, NO trained on the data generated by DiffOAS exhibits comparable performance to that using the existing generation method, which highlights the efficiency of DiffOAS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "huanshuo dong;Hong Wang;Haoyang Liu;Jian Luo;Jie Wang", "authorids": "~huanshuo_dong1;~Hong_Wang14;~Haoyang_Liu2;~Jian_Luo5;~Jie_Wang1", "gender": "M;M;M;;M", "homepage": "https://huanshuodong.github.io;https://wanghong1700.github.io/;https://miralab.ai/people/haoyang-liu/;https://smart-jluo.github.io/;http://staff.ustc.edu.cn/~jwangx", "dblp": ";;53/8773-2.html;;29/5259-5", "google_scholar": ";;;AGvDtzwAAAAJ;OugG4dUAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~huanshuo_dong1;~Hong_Wang14;~Haoyang_Liu2;~Jian_Luo5;~Jie_Wang1", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "mail.ustc.edu.cn;ustc.edu.cn;ustc.edu;ustc.edu.cn;ustc.edu.cn", "position": "Undergrad student;PhD student;MS student;MS student;Full Professor", "bibtex": "@inproceedings{\ndong2024accelerating,\ntitle={Accelerating {PDE} Data Generation via Differential Operator Action in Solution Space},\nauthor={huanshuo dong and Hong Wang and Haoyang Liu and Jian Luo and Jie Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Fv9GLw0LkO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 537949, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5438538052292771618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mail.ustc.edu.cn;ustc.edu.cn;ustc.edu;ustc.edu.cn;ustc.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Vague Prototype-Oriented Diffusion Model for Multi-Class Anomaly Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34520", "id": "FvLd8Gr7xq", "proceeding": "https://proceedings.mlr.press/v235/li24u.html", "pdf": "https://openreview.net/pdf?id=FvLd8Gr7xq", "openreview": "https://openreview.net/forum?id=FvLd8Gr7xq", "author_site": "yuxin li, Yaoxuan Feng, Bo Chen, Wenchao Chen, Yubiao Wang, Xinyue Hu, baolin sun, QuChunhui, Mingyuan Zhou", "tldr": "", "abstract": "Multi-class unsupervised anomaly detection aims to create a unified model for identifying anomalies in objects from multiple classes when only normal data is available. In such a challenging setting, widely used reconstruction-based networks persistently grapple with the \"identical shortcut\" problem, wherein the infiltration of abnormal information from the condition biases the output towards an anomalous distribution. In response to this critical challenge, we introduce a Vague Prototype-Oriented Diffusion Model (VPDM) that extracts only fundamental information from the condition to prevent the occurrence of the \"identical shortcut\" problem from the input layer. This model leverages prototypes that contain only vague information about the target as the initial condition. Subsequently, a novel conditional diffusion model is introduced to incrementally enhance details based on vague conditions. Finally, a Vague Prototype-Oriented Optimal Transport (VPOT) method is proposed to provide more accurate information about conditions. All these components are seamlessly integrated into a unified optimization objective. 
The effectiveness of our approach is demonstrated across diverse datasets, including the MVTec, VisA, and MPDD benchmarks, achieving state-of-the-art results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxin Li;Yaoxuan Feng;Bo Chen;Wenchao Chen;Yubiao Wang;Xinyue Hu;baolin sun;Chunhui Qu;Mingyuan Zhou", "authorids": "~Yuxin_Li3;~Yaoxuan_Feng1;~Bo_Chen1;~Wenchao_Chen1;~Yubiao_Wang1;~Xinyue_Hu1;~baolin_sun1;108062863@qq.com;~Mingyuan_Zhou1", "gender": "M;M;M;M;M;F;M;;M", "homepage": "https://liyuxin321.github.io/YuxinLi.github.io/;https://github.com/luolundashu;http://web.xidian.edu.cn/bchen/en/index.html;https://web.xidian.edu.cn/chenwenchao/;http://blog.alanbiao.top/;https://www.researchgate.net/profile/Xinyue-Hu-46;https://web.xidian.edu.cn/chenwenchao/index.html;;http://mingyuanzhou.github.io", "dblp": ";;89/5615-1;;;;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;;;;;LXwCIisAAAAJ", "orcid": "0000-0002-5935-0684;;0000-0001-5151-9388;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Yuxin_Li3;~Yaoxuan_Feng1;~Bo_Chen1;~Wenchao_Chen1;~Yubiao_Wang1;~Xinyue_Hu1;~baolin_sun1;108062863@qq.com;~Mingyuan_Zhou1", "aff": "Xidian University;Xidian University;Xidian University;Xidian University;Xidian University;Xidian University;;;Google", "aff_domain": "xidian.edu;stu.xidian.edu.cn;xidian.edu.cn;xidian.edu;xidian.edu.cn;stu.xidian.edu.cn;;;google.com", "position": "PhD student;MS student;Full Professor;Associate Professor;Undergrad student;MS student;;;Researcher", "bibtex": "@inproceedings{\nli2024vague,\ntitle={Vague Prototype-Oriented Diffusion Model for Multi-Class Anomaly Detection},\nauthor={Yuxin Li and Yaoxuan Feng and Bo Chen and Wenchao Chen and Yubiao Wang and Xinyue Hu and baolin sun and Chunhui Qu and Mingyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FvLd8Gr7xq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5049111, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3133998347111658307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "xidian.edu;stu.xidian.edu.cn;xidian.edu.cn;xidian.edu;xidian.edu.cn;stu.xidian.edu.cn;;;google.com", "author_num": 9, "aff_unique_index": "0;0;0;0;0;0;1", "aff_unique_norm": "Xidian University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.google.com", "aff_unique_abbr": "Xidian;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "On Discrete Prompt Optimization for Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34519", "id": "Fw4fBE2rqW", "proceeding": "https://proceedings.mlr.press/v235/wang24ar.html", "pdf": "https://openreview.net/pdf?id=Fw4fBE2rqW", "openreview": "https://openreview.net/forum?id=Fw4fBE2rqW", "author_site": "Ruochen Wang, Ting Liu, Cho-Jui Hsieh, Boqing Gong", "tldr": "", "abstract": "This paper introduces the first gradient-based framework for prompt optimization in text-to-image diffusion models. We formulate prompt engineering as a discrete optimization problem over the language space. 
Two major challenges arise in efficiently finding a solution to this problem: (1) Enormous Domain Space: Setting the domain to the entire language space poses significant difficulty for the optimization process. (2) Text Gradient: Efficiently computing the text gradient is challenging, as it requires backpropagating through the inference steps of the diffusion model and a non-differentiable embedding lookup table. Beyond the problem formulation, our main technical contributions lie in solving the above challenges. First, we design a family of dynamically generated compact subspaces composed of only the words most relevant to the user input, substantially restricting the domain space. Second, we introduce \"Shortcut Text Gradient\" --- an effective replacement for the text gradient that can be obtained with constant memory and runtime. Empirical evaluation on prompts collected from diverse sources (DiffusionDB, ChatGPT, COCO) suggests that our method can discover prompts that substantially improve (prompt enhancement) or destroy (adversarial attack) the faithfulness of images generated by the text-to-image diffusion model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruochen Wang;Ting Liu;Cho-Jui Hsieh;Boqing Gong", "authorids": "~Ruochen_Wang2;~Ting_Liu4;~Cho-Jui_Hsieh1;~Boqing_Gong1", "gender": "M;;M;M", "homepage": "https://ruocwang.github.io/;http://tliu.org;http://web.cs.ucla.edu/~chohsieh/index.html;http://boqinggong.info", "dblp": "33/120;52/5150-5;14/2770;29/7457", "google_scholar": "8fXrlRAAAAAJ;4wSfAIQAAAAJ;Wy89g4IAAAAJ;lv9ZeVUAAAAJ", "orcid": ";;;", "linkedin": "ruochen-wang-1699b1113/;;;boqing-gong-46aa5821/", "or_profile": "~Ruochen_Wang2;~Ting_Liu4;~Cho-Jui_Hsieh1;~Boqing_Gong1", "aff": "University of California, Los Angeles;Google DeepMind;University of California, Los Angeles;Google", "aff_domain": "ucla.edu;google.com;ucla.edu;google.com", "position": "PhD student;Researcher;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nwang2024on,\ntitle={On Discrete Prompt Optimization for Diffusion Models},\nauthor={Ruochen Wang and Ting Liu and Cho-Jui Hsieh and Boqing Gong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Fw4fBE2rqW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2306214, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8512157600961112352&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ucla.edu;google.com;ucla.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of California, Los Angeles;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.ucla.edu;https://deepmind.com", "aff_unique_abbr": "UCLA;DeepMind", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Los Angeles;;Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Implicit meta-learning may lead language models to trust more reliable sources", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34518", "id": "Fzp1DRzCIN", "proceeding": "https://proceedings.mlr.press/v235/krasheninnikov24a.html", "pdf": "https://openreview.net/pdf?id=Fzp1DRzCIN", "openreview": "https://openreview.net/forum?id=Fzp1DRzCIN", "author_site": "Dmitrii Krasheninnikov, Egor Krasheninnikov, Bruno Mlodozeniec, 
Tegan Maharaj, David Krueger", "tldr": "", "abstract": "We demonstrate that large language models (LLMs) may learn indicators of document usefulness and modulate their updates accordingly. We introduce random strings (\"tags\") as indicators of usefulness in a synthetic fine-tuning dataset. Fine-tuning on this dataset leads to **implicit meta-learning (IML)**: in further fine-tuning, the model updates to make more use of text that is tagged as useful. We perform a thorough empirical investigation of this phenomenon, finding (among other things) that (i) it occurs in both pretrained LLMs and those trained from scratch, as well as on a vision task, and (ii) larger models and smaller batch sizes tend to give more IML. We also use probing to examine how IML changes the way models store knowledge in their parameters. Finally, we reflect on what our results might imply about the capabilities, risks, and controllability of future AI systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dmitrii Krasheninnikov;Egor Krasheninnikov;Bruno Kacper Mlodozeniec;Tegan Maharaj;David Krueger", "authorids": "~Dmitrii_Krasheninnikov1;~Egor_Krasheninnikov1;~Bruno_Kacper_Mlodozeniec2;~Tegan_Maharaj1;~David_Krueger1", "gender": "M;M;Not Specified;F;M", "homepage": "https://krasheninnikov.github.io/about/;;https://brunokm.github.io;http://teganmaharaj.com;https://mila.umontreal.ca/en/person/david-scott-krueger/", "dblp": ";;241/6874;;142/2741.html", "google_scholar": "BIQflKQAAAAJ;6DiC_yYAAAAJ;kGPBRy8AAAAJ;https://scholar.google.ca/citations?user=XpscC-EAAAAJ;https://scholar.google.ca/citations?user=5Uz70IoAAAAJ", "orcid": ";;;;", "linkedin": ";;bkmlodozeniec/;;", "or_profile": "~Dmitrii_Krasheninnikov1;~Egor_Krasheninnikov1;~Bruno_Kacper_Mlodozeniec2;~Tegan_Maharaj1;~David_Krueger1", "aff": "University of Cambridge;University of Cambridge;University of Cambridge;Ecole Polytechnique de Montreal;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk;cam.ac.uk;polymtl.ca;cam.ac.uk", "position": "PhD student;Research Assistant;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkrasheninnikov2024implicit,\ntitle={Implicit meta-learning may lead language models to trust more reliable sources},\nauthor={Dmitrii Krasheninnikov and Egor Krasheninnikov and Bruno Kacper Mlodozeniec and Tegan Maharaj and David Krueger},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Fzp1DRzCIN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1199092, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18170984631826516737&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cam.ac.uk;cam.ac.uk;cam.ac.uk;polymtl.ca;cam.ac.uk", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Cambridge;Ecole Polytechnique de Montreal", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.polymtl.ca", "aff_unique_abbr": "Cambridge;Polytechnique Montreal", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Cambridge;Montreal", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "Delving into Differentially Private Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34517", "id": "FzyMdAm2fZ", "proceeding": 
"https://proceedings.mlr.press/v235/ding24g.html", "pdf": "https://openreview.net/pdf?id=FzyMdAm2fZ", "openreview": "https://openreview.net/forum?id=FzyMdAm2fZ", "author_site": "Youlong Ding, Xueyang Wu, Yining meng, Yonggang Luo, Hao Wang, Pan Weike", "tldr": "", "abstract": "Deep learning with differential privacy (DP) has garnered significant attention over the past years, leading to the development of numerous methods aimed at enhancing model accuracy and training efficiency. This paper delves into the problem of training Transformer models with differential privacy. Our treatment is modular: the logic is to 'reduce' the problem of training DP Transformer to the more basic problem of training DP vanilla neural nets. The latter is better understood and amenable to many model-agnostic methods. Such 'reduction' is done by first identifying the hardness unique to DP Transformer training: the attention distraction phenomenon and a lack of compatibility with existing techniques for efficient gradient clipping. To deal with these two issues, we propose the Re-Attention Mechanism and Phantom Clipping, respectively. We believe that our work not only casts new light on training DP Transformers but also promotes a modular treatment to advance research in the field of differentially private deep learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youlong Ding;Xueyang Wu;Yining meng;Yonggang Luo;Hao Wang;Weike Pan", "authorids": "~Youlong_Ding1;~Xueyang_Wu1;~Yining_meng1;~Yonggang_Luo1;~Hao_Wang3;~Weike_Pan1", "gender": ";;F;M;;", "homepage": ";http://www.cse.ust.hk/~xwuba;https://github.com/Myn55838;https://github.com/pomodoromjy;;", "dblp": ";https://dblp.uni-trier.de/pid/194/1291-1;;;;", "google_scholar": ";ZySbpIAAAAAJ;;;;", "orcid": ";0000-0001-5419-7273;0009-0001-1751-8573;0009-0000-3973-7606;;", "linkedin": ";;;;;", "or_profile": "~Youlong_Ding1;~Xueyang_Wu1;~Yining_meng1;~Yonggang_Luo1;~Hao_Wang3;~Weike_Pan1", "aff": ";Department of Computer Science and Engineering, The Hong Kong University of Science and Technology;;;;", "aff_domain": ";cse.ust.hk;;;;", "position": ";PhD student;;;;", "bibtex": "@inproceedings{\nding2024delving,\ntitle={Delving into Differentially Private Transformer},\nauthor={Youlong Ding and Xueyang Wu and Yining meng and Yonggang Luo and Hao Wang and Weike Pan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FzyMdAm2fZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1285192, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5025052787081831433&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": ";cse.ust.hk;;;;", "author_num": 6, "aff_unique_index": "0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Learning Pseudo-Contractive Denoisers for Inverse Problems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34516", "id": "G0vZ5ENrJQ", "proceeding": "https://proceedings.mlr.press/v235/wei24b.html", "pdf": "https://openreview.net/pdf?id=G0vZ5ENrJQ", "openreview": 
"https://openreview.net/forum?id=G0vZ5ENrJQ", "author_site": "Deliang Wei, Peng Chen, Fang Li", "tldr": "", "abstract": "Deep denoisers have shown excellent performance in solving inverse problems in signal and image processing. In order to guarantee the convergence, the denoiser needs to satisfy some Lipschitz conditions like non-expansiveness. However, enforcing such constraints inevitably compromises recovery performance. This paper introduces a novel training strategy that enforces a weaker constraint on the deep denoiser called pseudo-contractiveness. By studying the spectrum of the Jacobian matrix, relationships between different denoiser assumptions are revealed. Effective algorithms based on gradient descent and Ishikawa process are derived, and further assumptions of strict pseudo-contractiveness yield efficient algorithms using half-quadratic splitting and forward-backward splitting. The proposed algorithms theoretically converge strongly to a fixed point. A training strategy based on holomorphic transformation and functional calculi is proposed to enforce the pseudo-contractive denoiser assumption. Extensive experiments demonstrate superior performance of the pseudo-contractive denoiser compared to related denoisers. The proposed methods are competitive in terms of visual effects and quantitative values.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deliang Wei;Peng Chen;Fang Li", "authorids": "~Deliang_Wei2;~Peng_Chen18;~Fang_Li4", "gender": "M;M;F", "homepage": "https://github.com/FizzzFizzz;;http://math.ecnu.edu.cn/~fli", "dblp": ";;55/2162-4.html", "google_scholar": "vmzZ1qsAAAAJ;https://scholar.google.com.hk/citations?hl=en;2_8WX68AAAAJ", "orcid": ";0009-0001-8466-6445;0000-0001-6804-2651", "linkedin": ";;", "or_profile": "~Deliang_Wei2;~Peng_Chen18;~Fang_Li4", "aff": "East China Normal University;East China Normal University;East China Normal University", "aff_domain": "stu.ecnu.edu.cn;ecnu.edu.cn;ecnu.edu.cn", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nwei2024learning,\ntitle={Learning Pseudo-Contractive Denoisers for Inverse Problems},\nauthor={Deliang Wei and Peng Chen and Fang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G0vZ5ENrJQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7648334, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17725192920866271295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stu.ecnu.edu.cn;ecnu.edu.cn;ecnu.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "East China Normal University", "aff_unique_dep": "", "aff_unique_url": "http://www.ecnu.edu.cn", "aff_unique_abbr": "ECNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "ByMI: Byzantine Machine Identification with False Discovery Rate Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34515", "id": "G0z4bCNmkG", "proceeding": "https://proceedings.mlr.press/v235/qian24b.html", "pdf": "https://openreview.net/pdf?id=G0z4bCNmkG", "openreview": "https://openreview.net/forum?id=G0z4bCNmkG", "author_site": "Chengde Qian, Mengyuan Wang, Haojie Ren, Changliang Zou", "tldr": "", "abstract": "Various robust estimation methods or algorithms have been proposed to 
hedge against Byzantine failures in distributed learning. However, there is a lack of systematic approaches to provide theoretical guarantees of significance in detecting those Byzantine machines. In this paper, we develop a general detection procedure, ByMI, via error rate control to address this issue, which is applicable to many robust learning problems. The key idea is to apply the sample-splitting strategy on each worker machine to construct a score statistic integrated with a general robust estimation and then to utilize the symmetry property of those scores to derive a data-driven threshold. The proposed method is dimension insensitive and p-value free with the help of the symmetry property and can achieve false discovery rate control under mild conditions. Numerical experiments on both synthetic and real data validate the theoretical results and demonstrate the effectiveness of our proposed method on Byzantine machine identification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengde Qian;Mengyuan Wang;Haojie Ren;Changliang Zou", "authorids": "~Chengde_Qian1;~Mengyuan_Wang2;~Haojie_Ren1;~Changliang_Zou2", "gender": "M;F;F;M", "homepage": "https://www.qstat.site/;;https://sites.google.com/view/haojieren;http://web.stat.nankai.edu.cn/chlzou/", "dblp": "242/9719;;;", "google_scholar": ";;qfd5nS8AAAAJ;LPwSdmwAAAAJ", "orcid": ";0009-0006-2406-5150;;", "linkedin": ";;;", "or_profile": "~Chengde_Qian1;~Mengyuan_Wang2;~Haojie_Ren1;~Changliang_Zou2", "aff": "Nankai University;;Shanghai Jiaotong University;Nankai University", "aff_domain": "nankai.edu.cn;;sjtu.edu.cn;nankai.edu.cn", "position": "PhD student;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nqian2024bymi,\ntitle={By{MI}: Byzantine Machine Identification with False Discovery Rate Control},\nauthor={Chengde Qian and Mengyuan Wang and Haojie Ren and Changliang Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G0z4bCNmkG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 897789, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1706116435953168216&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 6, "email": "nankai.edu.cn;;sjtu.edu.cn;nankai.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Nankai University;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "http://www.nankai.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "NKU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "GFlowNet Training by Policy Gradients", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34514", "id": "G1igwiBBUj", "proceeding": "https://proceedings.mlr.press/v235/niu24c.html", "pdf": "https://openreview.net/pdf?id=G1igwiBBUj", "openreview": "https://openreview.net/forum?id=G1igwiBBUj", "author_site": "Puhua Niu, Shili Wu, Mingzhou Fan, Xiaoning Qian", "tldr": "", "abstract": "Generative Flow Networks (GFlowNets) have been shown to be effective at generating combinatorial objects with desired properties. We here propose a new GFlowNet training framework, with policy-dependent rewards, that bridges maintaining the flow balance of GFlowNets with optimizing the expected accumulated reward in traditional Reinforcement Learning (RL). 
This enables the derivation of new policy-based GFlowNet training methods, in contrast to existing ones resembling value-based RL. It is known that the design of backward policies in GFlowNet training affects efficiency. We further develop a coupled training strategy that jointly solves GFlowNet forward policy training and backward policy design. A performance analysis with a theoretical guarantee is provided for our policy-based GFlowNet training. Experiments on both simulated and real-world datasets verify that our policy-based strategies provide advanced RL perspectives for robust gradient estimation to improve GFlowNet performance. Our code is available at: [github.com/niupuhua1234/GFN-PG](https://github.com/niupuhua1234/GFN-PG).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Puhua Niu;Shili Wu;Mingzhou Fan;Xiaoning Qian", "authorids": "~Puhua_Niu2;~Shili_Wu1;~Mingzhou_Fan1;~Xiaoning_Qian2", "gender": "M;M;M;M", "homepage": ";;https://www.google.com/;https://www.ece.tamu.edu/~xqian", "dblp": "353/4360;158/7503;294/0813;62/4504", "google_scholar": "LWOKY7oAAAAJ;;;dXGlddgAAAAJ", "orcid": "0000-0002-5127-1690;0000-0002-0936-7349;;0000-0002-4347-2476", "linkedin": ";shili-wu-a8051396/;;", "or_profile": "~Puhua_Niu2;~Shili_Wu1;~Mingzhou_Fan1;~Xiaoning_Qian2", "aff": "Texas A&M;Texas A&M University - College Station;Texas A&M;Texas A&M", "aff_domain": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "position": "Texas A&M University;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nniu2024gflownet,\ntitle={{GF}lowNet Training by Policy Gradients},\nauthor={Puhua Niu and Shili Wu and Mingzhou Fan and Xiaoning Qian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G1igwiBBUj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2827758, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14916572601665811036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Minimum Norm Interpolation Meets The Local Theory of Banach Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34513", "id": "G4b32bKnBy", "proceeding": "https://proceedings.mlr.press/v235/kur24a.html", "pdf": "https://openreview.net/pdf?id=G4b32bKnBy", "openreview": "https://openreview.net/forum?id=G4b32bKnBy", "author_site": "Gil Kur, Pedro Abdalla, Pierre Bizeul, Fanny Yang", "tldr": "", "abstract": "Minimum-norm interpolators have recently gained attention primarily as an analyzable model to shed light on the double descent phenomenon observed for neural networks. The majority of the work has focused on analyzing interpolators in Hilbert spaces, where typically an effectively low-rank structure of the feature covariance prevents a large bias. More recently, tight vanishing bounds have also been shown for isotropic high-dimensional data for $\ell_p$-spaces with $p\in[1,2)$, leveraging sparse structure of the ground truth. 
However, these proofs are tailored to specific settings and hard to generalize. This paper takes a first step towards establishing a general framework that connects generalization properties of the interpolators to well-known concepts from high-dimensional geometry, specifically, from the local theory of Banach spaces. In particular, we show that under $2$-uniform convexity, the bias of the minimal norm solution is bounded by the Gaussian complexity of the class. We then prove a ``reverse'' Efron-Stein lower bound on the expected conditional variance of the minimal norm solution under cotype $2$. Finally, we prove that this bound is sharp for $\\ell_p$-linear regression under sub-Gaussian covariates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gil Kur;Pedro Abdalla;Pierre Bizeul;Fanny Yang", "authorids": "~Gil_Kur2;~Pedro_Abdalla1;~Pierre_Bizeul1;~Fanny_Yang1", "gender": "M;M;M;", "homepage": ";https://people.math.ethz.ch/~pabdalla/;;http://www.fanny-yang.de", "dblp": "236/4833;;;126/4852", "google_scholar": "yDkAhccAAAAJ;;JIj41s0AAAAJ;BfDKicQAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Gil_Kur2;~Pedro_Abdalla1;~Pierre_Bizeul1;~Fanny_Yang1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;ETHZ - ETH Zurich;Technion - Israel Institute of Technology, Technion;Swiss Federal Institute of Technology", "aff_domain": "inf.ethz.ch;ethz.ch;campus.technion;ethz.ch", "position": "Postdoc;PhD student;Postdoc;Professor", "bibtex": "@inproceedings{\nkur2024minimum,\ntitle={Minimum Norm Interpolation Meets The Local Theory of Banach Spaces},\nauthor={Gil Kur and Pedro Abdalla and Pierre Bizeul and Fanny Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G4b32bKnBy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 538740, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5854044649449131658&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "inf.ethz.ch;ethz.ch;campus.technion;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "ETH Zurich;Technion - Israel Institute of Technology;Swiss Federal Institute of Technology", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.ethz.ch;https://www.technion.ac.il;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;Technion;ETH Zurich", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Switzerland;Israel" }, { "title": "SCoRe: Submodular Combinatorial Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34512", "id": "G8zDeKOp0R", "proceeding": "https://proceedings.mlr.press/v235/majee24a.html", "pdf": "https://openreview.net/pdf?id=G8zDeKOp0R", "openreview": "https://openreview.net/forum?id=G8zDeKOp0R", "author_site": "Anay Majee, Suraj Kothawade, Krishnateja Killamsetty, Rishabh Iyer", "tldr": "", "abstract": "In this paper we introduce the **SCoRe** (**S**ubmodular **Co**mbinatorial **Re**presentation Learning) framework, a novel approach in representation learning that addresses inter-class bias and intra-class variance. SCoRe provides a new combinatorial viewpoint to representation learning, by introducing a family of loss functions based on set-based submodular information measures. 
We develop two novel combinatorial formulations for loss functions, using the *Total Information* and *Total Correlation*, that naturally minimize intra-class variance and inter-class bias. Several commonly used metric/contrastive learning loss functions like supervised contrastive loss, orthogonal projection loss, and N-pairs loss, are all instances of SCoRe, thereby underlining the versatility and applicability of SCoRe in a broad spectrum of learning scenarios. Novel objectives in SCoRe naturally model class-imbalance with up to 7.6% improvement in classification on CIFAR-10-LT, CIFAR-100-LT, MedMNIST, 2.1% on ImageNet-LT, and 19.4% in object detection on IDD and LVIS (v1.0), demonstrating its effectiveness over existing approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anay Majee;Suraj Nandkishor Kothawade;Krishnateja Killamsetty;Rishabh K Iyer", "authorids": "~Anay_Majee1;~Suraj_Nandkishor_Kothawade2;~Krishnateja_Killamsetty1;~Rishabh_K_Iyer2", "gender": "M;M;M;M", "homepage": "https://amajee11us.github.io/;http://surajk.me;https://krishnatejakillamsetty.me;https://www.rishiyer.com", "dblp": "284/3256;220/3896;https://dblp.uni-trier.de/pid/273/3972;37/10544.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;cHDE-2YAAAAJ;l_XxJ1kAAAAJ", "orcid": "0000-0003-0189-8310;;;", "linkedin": "anaymajee/;suraj-kothawade-6835b5a9/;krishnateja-killamsetty/;rishabh-iyer-36893717/", "or_profile": "~Anay_Majee1;~Suraj_Nandkishor_Kothawade2;~Krishnateja_Killamsetty1;~Rishabh_K_Iyer2", "aff": "The University of Texas at Dallas;Google;International Business Machines;Microsoft", "aff_domain": "utdallas.edu;google.com;ibm.com;microsoft.com", "position": "PhD student;Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\nmajee2024score,\ntitle={{SC}oRe: Submodular Combinatorial Representation Learning},\nauthor={Anay Majee and Suraj Nandkishor Kothawade and Krishnateja Killamsetty and Rishabh K Iyer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G8zDeKOp0R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5046091, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7524011213587440013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "utdallas.edu;google.com;ibm.com;microsoft.com", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Texas at Dallas;Google;International Business Machines Corporation;Microsoft", "aff_unique_dep": ";Google;;Microsoft Corporation", "aff_unique_url": "https://www.utdallas.edu;https://www.google.com;https://www.ibm.com;https://www.microsoft.com", "aff_unique_abbr": "UT Dallas;Google;IBM;Microsoft", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Dallas;Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Differentiable Weightless Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34511", "id": "GBxflz0qdX", "proceeding": "https://proceedings.mlr.press/v235/bacellar24a.html", "pdf": "https://openreview.net/pdf?id=GBxflz0qdX", "openreview": "https://openreview.net/forum?id=GBxflz0qdX", "author_site": "Alan Bacellar, Zachary Susskind, Mauricio Breternitz Jr, Eugene John, Lizy John, Priscila Lima, Felipe Fran\u00e7a", "tldr": "", "abstract": "We introduce the 
Differentiable Weightless Neural Network (DWN), a model based on interconnected lookup tables. Training of DWNs is enabled by a novel Extended Finite Difference technique for approximate differentiation of binary values. We propose Learnable Mapping, Learnable Reduction, and Spectral Regularization to further improve the accuracy and efficiency of these models. We evaluate DWNs in three edge computing contexts: (1) an FPGA-based hardware accelerator, where they demonstrate superior latency, throughput, energy efficiency, and model area compared to state-of-the-art solutions, (2) a low-power microcontroller, where they achieve preferable accuracy to XGBoost while subject to stringent memory constraints, and (3) ultra-low-cost chips, where they consistently outperform small models in both accuracy and projected hardware area. DWNs also compare favorably against leading approaches for tabular datasets, with higher average rank. Overall, our work positions DWNs as a pioneering solution for edge-compatible high-throughput neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alan Tendler Leibel Bacellar;Zachary Susskind;Mauricio Breternitz Jr;Eugene John;Lizy Kurian John;Priscila Machado Vieira Lima;Felipe M.G. Fran\u00e7a", "authorids": "~Alan_Tendler_Leibel_Bacellar1;~Zachary_Susskind1;~Mauricio_Breternitz_Jr1;eugene.john@utsa.edu;~Lizy_Kurian_John1;~Priscila_Machado_Vieira_Lima1;~Felipe_M.G._Fran\u00e7a1", "gender": "M;M;M;;F;Not Specified;M", "homepage": "https://www.researchgate.net/profile/Alan-Bacellar-2;https://zsknd.com/;;;https://www.ece.utexas.edu/people/faculty/lizy-john;;https://www.cienciavitae.pt/portal/6D1D-CD3F-CB14", "dblp": "283/4823;277/7756.html;b/MauricioBreternitz.html;;j/LizyKurianJohn.html;75/6119.html;f/FelipeMaiaGalvaoFranca.html", "google_scholar": "HfYBXYYAAAAJ;https://scholar.google.com/citations?hl=en;pnmGA74AAAAJ;;YPu9rWUAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-3346-7665;0000-0002-7244-6285;0000-0003-1752-6255;;0000-0002-8747-5214;0000-0002-8515-9904;0000-0002-8980-6208", "linkedin": "alanbacellar/;zsusskind/;;;;;", "or_profile": "~Alan_Tendler_Leibel_Bacellar1;~Zachary_Susskind1;~Mauricio_Breternitz_Jr1;eugene.john@utsa.edu;~Lizy_Kurian_John1;~Priscila_Machado_Vieira_Lima1;~Felipe_M.G._Fran\u00e7a1", "aff": "University of Texas at Austin;University of Texas at Austin;ISCTE - Instituto Universit\u00e1rio de Lisboa;;University of Texas at Austin;Universidade Federal do Rio de Janeiro;Instituto de Telecomunica\u00e7\u00f5es, Portugal", "aff_domain": "utexas.edu;utexas.edu;iscte.pt;;utexas.edu;ufrj.br;it.pt", "position": "PhD student;PhD student;Principal Researcher;;Full Professor;Lecturer;Researcher", "bibtex": "@inproceedings{\nbacellar2024differentiable,\ntitle={Differentiable Weightless Neural Networks},\nauthor={Alan Tendler Leibel Bacellar and Zachary Susskind and Mauricio Breternitz Jr and Eugene John and Lizy Kurian John and Priscila Machado Vieira Lima and Felipe M.G. 
Fran{\\c{c}}a},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GBxflz0qdX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1699665, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11610810132843230837&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": "utexas.edu;utexas.edu;iscte.pt;;utexas.edu;ufrj.br;it.pt", "author_num": 7, "aff_unique_index": "0;0;1;0;2;3", "aff_unique_norm": "University of Texas at Austin;Instituto Universit\u00e1rio de Lisboa;Universidade Federal do Rio de Janeiro;Instituto de Telecomunica\u00e7\u00f5es", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.utexas.edu;https://www.iscte-iul.pt;https://www.ufrj.br;https://www.it.pt", "aff_unique_abbr": "UT Austin;ISCTE;UFRJ;", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Austin;;Rio de Janeiro", "aff_country_unique_index": "0;0;1;0;2;1", "aff_country_unique": "United States;Portugal;Brazil" }, { "title": "DsDm: Model-Aware Dataset Selection with Datamodels", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34510", "id": "GC8HkKeH8s", "proceeding": "https://proceedings.mlr.press/v235/engstrom24a.html", "pdf": "https://openreview.net/pdf?id=GC8HkKeH8s", "openreview": "https://openreview.net/forum?id=GC8HkKeH8s", "author_site": "Logan Engstrom", "tldr": "", "abstract": "When selecting data for training large-scale models, standard practice is to filter for examples that match human notions of data quality. Such filtering yields qualitatively clean datapoints that intuitively should improve model behavior. However, in practice the opposite can often happen: we find that selecting according to similarity with \"high quality\" data sources may not increase (and can even hurt) performance compared to randomly selecting data. To develop better methods for selecting data, we start by framing dataset selection as an optimization problem that we can directly solve for: given target tasks, a learning algorithm, and candidate data, select the subset that maximizes model performance. This framework thus avoids handpicked notions of data quality, and instead models explicitly how the learning process uses train datapoints to predict on the target tasks. Our resulting method greatly improves language model (LM) performance on both pre-specified tasks and previously unseen tasks. 
Specifically, choosing target tasks representative of standard LM problems and evaluating on diverse held-out benchmarks, our selected datasets provide a 2x compute multiplier over baseline methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Logan Engstrom;Axel Feldmann;Aleksander Madry", "authorids": "~Logan_Engstrom1;~Axel_Feldmann1;~Aleksander_Madry1", "gender": "M;M;M", "homepage": ";https://feldmann.nyc;https://people.csail.mit.edu/madry/", "dblp": "207/7298;260/5858;67/2454", "google_scholar": ";QkpYowMAAAAJ;SupjsEUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Logan_Engstrom1;~Axel_Feldmann1;~Aleksander_Madry1", "aff": "Massachusetts Institute of Technology;Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science;Massachusetts Institute of Technology", "aff_domain": "mit.edu;csail.mit.edu;mit.edu", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nengstrom2024dsdm,\ntitle={DsDm: Model-Aware Dataset Selection with Datamodels},\nauthor={Logan Engstrom and Axel Feldmann and Aleksander Madry},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GC8HkKeH8s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2970361, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1606079137446150554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mit.edu;csail.mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Mechanistic Design and Scaling of Hybrid Architectures", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34509", "id": "GDp7Gyd9nf", "proceeding": "https://proceedings.mlr.press/v235/poli24a.html", "pdf": "https://openreview.net/pdf?id=GDp7Gyd9nf", "openreview": "https://openreview.net/forum?id=GDp7Gyd9nf", "author_site": "Michael Poli, Armin Thomas, Eric Nguyen, Pragaash Ponnusamy, Bj\u00f6rn Deiseroth, Kristian Kersting, Taiji Suzuki, Brian Hie, Stefano Ermon, Christopher Re, Ce Zhang, Stefano Massaroli", "tldr": "", "abstract": "The development of deep learning architectures is a resource-demanding process, due to a vast design space, long prototyping times, and high compute costs associated with at-scale model training and evaluation. We set out to simplify this process by grounding it in an end-to-end mechanistic architecture design (MAD) pipeline, encompassing small-scale capability unit tests predictive of scaling laws. Through a suite of synthetic token manipulation tasks such as compression and recall, designed to probe capabilities, we identify and test new hybrid architectures constructed from a variety of computational primitives. We experimentally validate the resulting architectures via an extensive compute-optimal and a new state-optimal scaling law analysis, training over 500 language models between 70M to 7B parameters. Surprisingly, we find MAD synthetics to correlate with compute-optimal perplexity, enabling accurate evaluation of new architectures via isolated proxy tasks. 
The new architectures found via MAD, based on simple ideas such as hybridization and sparsity, outperform state-of-the-art Transformer, convolutional, and recurrent architectures (Transformer++, Hyena, Mamba) in scaling, both at compute-optimal budgets and in overtrained regimes. Overall, these results provide evidence that performance on curated synthetic tasks can be predictive of scaling laws, and that an optimal architecture should leverage specialized layers via a hybrid topology.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Poli;Armin W Thomas;Eric Nguyen;Pragaash Ponnusamy;Bj\u00f6rn Deiseroth;Kristian Kersting;Taiji Suzuki;Brian Hie;Stefano Ermon;Christopher Re;Ce Zhang;Stefano Massaroli", "authorids": "~Michael_Poli1;~Armin_W_Thomas1;~Eric_Nguyen1;~Pragaash_Ponnusamy1;~Bj\u00f6rn_Deiseroth1;~Kristian_Kersting1;~Taiji_Suzuki1;brianhie@stanford.edu;~Stefano_Ermon1;~Christopher_Re1;~Ce_Zhang1;~Stefano_Massaroli1", "gender": "M;Non-Binary;M;M;;M;M;;M;;;", "homepage": ";;http://erictnguyen.com;;;http://www.ml.informatik.tu-darmstadt.de/;http://ibis.t.u-tokyo.ac.jp/suzuki/;;http://cs.stanford.edu/~ermon/;;;", "dblp": ";228/8292;;252/5544;;40/3793;08/312;;47/8135;;97/919;", "google_scholar": "RgIBwboAAAAJ;awtZJwkAAAAJ;66TLwGUAAAAJ;GD8bROUAAAAJ;;QY-earAAAAAJ;x8osrBsAAAAJ;;;;;IwCfl4UAAAAJ", "orcid": ";0000-0002-9947-5705;;0000-0002-3790-5757;;0000-0002-2873-9152;;;;;;", "linkedin": ";;nguyenstanford/;;;;;;;;;", "or_profile": "~Michael_Poli1;~Armin_W_Thomas1;~Eric_Nguyen1;~Pragaash_Ponnusamy1;~Bj\u00f6rn_Deiseroth1;~Kristian_Kersting1;~Taiji_Suzuki1;brianhie@stanford.edu;~Stefano_Ermon1;~Christopher_Re1;~Ce_Zhang1;~Stefano_Massaroli1", "aff": "Stanford University;Stanford University;Stanford University;Together AI;;TU Darmstadt;The University of Tokyo;;Stanford University;;University of Chicago;MILA", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;together.ai;;tu-darmstadt.de;tokyo.ac.jp;;stanford.edu;;uchicago.edu;mila.quebec", "position": "PhD student;Postdoc;PhD student;Principal Researcher;;Full Professor;Associate Professor;;Associate Professor;;Associate Professor;Postdoc", "bibtex": "@inproceedings{\npoli2024mechanistic,\ntitle={Mechanistic Design and Scaling of Hybrid Architectures},\nauthor={Michael Poli and Armin W Thomas and Eric Nguyen and Pragaash Ponnusamy and Bj{\\\"o}rn Deiseroth and Kristian Kersting and Taiji Suzuki and Brian Hie and Stefano Ermon and Christopher Re and Ce Zhang and Stefano Massaroli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GDp7Gyd9nf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1262597, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4921087558244575544&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "stanford.edu;stanford.edu;stanford.edu;together.ai;;tu-darmstadt.de;tokyo.ac.jp;;stanford.edu;;uchicago.edu;mila.quebec", "author_num": 12, "aff_unique_index": "0;0;0;1;2;3;0;4;5", "aff_unique_norm": "Stanford University;Together AI;Technische Universit\u00e4t Darmstadt;University of Tokyo;University of Chicago;Mila", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.stanford.edu;https://www.together.ai;https://www.tu-darmstadt.de;https://www.u-tokyo.ac.jp;https://www.uchicago.edu;https://mila.quebec", "aff_unique_abbr": "Stanford;Together AI;TU Darmstadt;UTokyo;UChicago;MILA", 
"aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Stanford;;Darmstadt", "aff_country_unique_index": "0;0;0;0;1;2;0;0;3", "aff_country_unique": "United States;Germany;Japan;Canada" }, { "title": "StableMask: Refining Causal Masking in Decoder-only Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34508", "id": "GFfWzAReAc", "proceeding": "https://proceedings.mlr.press/v235/yin24a.html", "pdf": "https://openreview.net/pdf?id=GFfWzAReAc", "openreview": "https://openreview.net/forum?id=GFfWzAReAc", "author_site": "Qingyu Yin, Xuzheng He, Xiang Zhuang, Yu Zhao, Jianhua Yao, Xiaoyu Shen, Qiang Zhang", "tldr": "", "abstract": "The decoder-only Transformer architecture with causal masking and relative position encoding (RPE) has become the de facto choice in language modeling. Despite its exceptional performance across various tasks, we have identified two limitations: First, it prevents all attended tokens from having zero weights during the softmax stage, even if the current embedding has sufficient self-contained information. This compels the model to assign disproportional excessive attention to specific tokens. Second, RPE-based Transformers are not universal approximators due to their limited capacity at encoding absolute positional information, which limits their application in position-critical tasks. In this work, we propose StableMask: a parameter-free method to address both limitations by refining the causal mask. It introduces pseudo-attention values to balance attention distributions and encodes absolute positional information via a progressively decreasing mask ratio. StableMask's effectiveness is validated both theoretically and empirically, showing significant enhancements in language models with parameter sizes ranging from 71M to 1.4B across diverse datasets and encoding methods. 
We further show that it supports integration with existing optimization techniques, making it easily usable in practical applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qingyu Yin;Xuzheng He;Xiang Zhuang;Yu Zhao;Jianhua Yao;Xiaoyu Shen;Qiang Zhang", "authorids": "~Qingyu_Yin4;~Xuzheng_He1;~Xiang_Zhuang1;~Yu_Zhao8;~Jianhua_Yao3;~Xiaoyu_Shen1;~Qiang_Zhang6", "gender": "M;M;;M;M;M;", "homepage": "https://mikastars39.notion.site;https://github.com/cyclekiller;;;;https://eit-nlp.github.io/;https://qiangairesearcher.github.io", "dblp": ";;;57/2056-9;;;72/3527-26", "google_scholar": "BRsj8FAAAAAJ;;;7XOW0wcAAAAJ;https://scholar.google.com/citations?hl=en;BWfPrE4AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;;0000-0002-0217-2469;", "linkedin": ";;;;;;", "or_profile": "~Qingyu_Yin4;~Xuzheng_He1;~Xiang_Zhuang1;~Yu_Zhao8;~Jianhua_Yao3;~Xiaoyu_Shen1;~Qiang_Zhang6", "aff": "Zhejiang University;;;Tencent AI Lab;Tencent AI Lab;Amazon;Zhejiang University", "aff_domain": "zju.edu.cn;;;tencent.com;tencent.com;amazon.com;zju.edu.cn", "position": "Undergrad student;;;Researcher;Principal Researcher;machine learning scientist;Principal Researcher", "bibtex": "@inproceedings{\nyin2024stablemask,\ntitle={StableMask: Refining Causal Masking in Decoder-only Transformer},\nauthor={Qingyu Yin and Xuzheng He and Xiang Zhuang and Yu Zhao and Jianhua Yao and Xiaoyu Shen and Qiang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GFfWzAReAc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1081151, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15925856041299997147&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;;;tencent.com;tencent.com;amazon.com;zju.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Zhejiang University;Tencent;Amazon", "aff_unique_dep": ";Tencent AI Lab;Amazon.com, Inc.", "aff_unique_url": "https://www.zju.edu.cn;https://ai.tencent.com;https://www.amazon.com", "aff_unique_abbr": "ZJU;Tencent AI Lab;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "No-Regret Reinforcement Learning in Smooth MDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34507", "id": "GGnYDXZC1B", "proceeding": "https://proceedings.mlr.press/v235/maran24a.html", "pdf": "https://openreview.net/pdf?id=GGnYDXZC1B", "openreview": "https://openreview.net/forum?id=GGnYDXZC1B", "author_site": "Davide Maran, Alberto Maria Metelli, Matteo Papini, Marcello Restelli", "tldr": "", "abstract": "Obtaining no-regret guarantees for reinforcement learning (RL) in the case of problems with continuous state and/or action spaces is still one of the major open challenges in the field. Recently, a variety of solutions have been proposed, but apart from very specific settings, the general problem remains unsolved. In this paper, we introduce a novel structural assumption on the Markov decision processes (MDPs), namely $\\nu$-smoothness, which generalizes most of the settings proposed so far (e.g., linear MDPs and Lipschitz MDPs). To face this challenging scenario, we propose two algorithms for regret minimization in $\\nu$-smooth MDPs. 
Both algorithms build upon the idea of constructing an MDP representation through an orthogonal feature map based on Legendre polynomials. The first algorithm, Legendre-Eleanor, achieves the no-regret property under weaker assumptions but is computationally inefficient, whereas the second one, Legendre-LSVI, runs in polynomial time, although for a smaller class of problems. After analyzing their regret properties, we compare our results with state-of-the-art ones from RL theory, showing that our algorithms achieve the best guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Davide Maran;Alberto Maria Metelli;Matteo Papini;Marcello Restelli", "authorids": "~Davide_Maran1;~Alberto_Maria_Metelli2;~Matteo_Papini1;~Marcello_Restelli1", "gender": "M;M;M;M", "homepage": "https://davidezfc.github.io/;https://albertometelli.github.io/;https://t3p.github.io/;http://home.deib.polimi.it/restelli/", "dblp": "320/3835;209/4941;209/4897;64/1011", "google_scholar": "https://scholar.google.it/citations?user=a8i0X8oAAAAJ;R31IsPwAAAAJ;https://scholar.google.it/citations?user=A2WxZlsAAAAJ;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ", "orcid": ";0000-0002-3424-5212;0000-0002-3807-3171;0000-0002-6322-1076", "linkedin": "davide-maran/;;matteo-papini/;", "or_profile": "~Davide_Maran1;~Alberto_Maria_Metelli2;~Matteo_Papini1;~Marcello_Restelli1", "aff": "Polytechnic Institute of Milan;Politecnico di Milano;Polytechnic Institute of Milan;Politecnico di Milano", "aff_domain": "polimi.it;polimi.it;polimi.it;polimi.it", "position": "PhD student;Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nmaran2024noregret,\ntitle={No-Regret Reinforcement Learning in Smooth {MDP}s},\nauthor={Davide Maran and Alberto Maria Metelli and Matteo Papini and Marcello Restelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GGnYDXZC1B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 527428, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17711875521578572749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "polimi.it;polimi.it;polimi.it;polimi.it", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Polytechnic Institute of Milan;Politecnico di Milano", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it/;https://www.polimi.it", "aff_unique_abbr": "Politecnico di Milano;Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Hybrid$^2$ Neural ODE Causal Modeling and an Application to Glycemic Response", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34506", "id": "GHZVjmaGQM", "proceeding": "https://proceedings.mlr.press/v235/zou24b.html", "pdf": "https://openreview.net/pdf?id=GHZVjmaGQM", "openreview": "https://openreview.net/forum?id=GHZVjmaGQM", "author_site": "Junyi Zou, Matthew Levine, Dessi Zaharieva, Ramesh Johari, Emily Fox", "tldr": "", "abstract": "Hybrid models composing mechanistic ODE-based dynamics with flexible and expressive neural network components have grown rapidly in popularity, especially in scientific domains where such ODE-based modeling offers important interpretability and validated causal grounding (e.g., for counterfactual reasoning). 
The incorporation of mechanistic models also provides inductive bias lacking in standard black-box modeling approaches, which is critical when learning from small datasets or partially observed, complex systems. Unfortunately, as the hybrid models become more flexible, the causal grounding provided by the mechanistic model can quickly be lost. We address this problem by leveraging another common source of domain knowledge: *ranking* of treatment effects for a set of interventions, even if the precise treatment effect is unknown. We encode this information in a *causal loss* that we combine with the standard predictive loss to arrive at a *hybrid loss* that biases our learning towards causally valid hybrid models. We demonstrate our ability to achieve a win-win, attaining state-of-the-art predictive performance *and* causal validity, in the challenging task of modeling glucose dynamics post-exercise in individuals with type 1 diabetes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bob Junyi Zou;Matthew E Levine;Dessi P. Zaharieva;Ramesh Johari;Emily Fox", "authorids": "~Bob_Junyi_Zou1;~Matthew_E_Levine1;~Dessi_P._Zaharieva1;~Ramesh_Johari1;~Emily_Fox2", "gender": "M;F;M;F;M", "homepage": "https://mattlevine.netlify.app;;;https://emilybfox.su.domains/;", "dblp": ";;80/1071;68/1212;", "google_scholar": "QUs08XEAAAAJ;;;OO-2710AAAAJ;", "orcid": "0000-0002-5627-3169;0000-0002-9374-8469;;;0000-0003-2463-5994", "linkedin": ";;;;", "or_profile": "~Matthew_E_Levine1;~Dessi_P._Zaharieva1;~Ramesh_Johari1;~Emily_Fox2;~Junyi_Zou1", "aff": "California Institute of Technology;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "caltech.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Instructor;Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nzou2024hybrid,\ntitle={Hybrid\\${\\textasciicircum}2\\$ Neural {ODE} Causal Modeling and an Application to Glycemic Response},\nauthor={Bob Junyi Zou and Matthew E Levine and Dessi P. 
Zaharieva and Ramesh Johari and Emily Fox},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GHZVjmaGQM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2452054, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11122988060157720816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "caltech.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "California Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.stanford.edu", "aff_unique_abbr": "Caltech;Stanford", "aff_campus_unique_index": "0;1;1;1;1", "aff_campus_unique": "Pasadena;Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "From Inverse Optimization to Feasibility to ERM", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34505", "id": "GJzqRKOdRi", "proceeding": "https://proceedings.mlr.press/v235/mishra24a.html", "pdf": "https://openreview.net/pdf?id=GJzqRKOdRi", "openreview": "https://openreview.net/forum?id=GJzqRKOdRi", "author_site": "Saurabh Mishra, Anant Raj, Sharan Vaswani", "tldr": "", "abstract": "Inverse optimization involves inferring unknown parameters of an optimization problem from known solutions and is widely used in fields such as transportation, power systems, and healthcare. We study the *contextual inverse optimization setting* that utilizes additional contextual information to better predict the unknown problem parameters. We focus on contextual inverse linear programming (CILP) addressing the challenges posed by the non-differentiable nature of LPs. For a linear prediction model, we reduce CILP to a convex feasibility problem allowing the use of standard algorithms such as alternating projections. The resulting algorithm for CILP is equipped with theoretical convergence guarantees without additional assumptions such as degeneracy or interpolation. Next, we reduce CILP to empirical risk minimization (ERM) on a smooth, convex loss that satisfies the Polyak-Lojasiewicz condition. This reduction enables the use of scalable first-order optimization methods to solve large non-convex problems while maintaining theoretical guarantees in the convex setting. Subsequently, we use the reduction to ERM to quantify the generalization performance of the proposed algorithm on previously unseen instances. 
Finally, we experimentally validate our approach on synthetic and real-world problems and demonstrate improved performance compared to existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saurabh kumar Mishra;Anant Raj;Sharan Vaswani", "authorids": "~Saurabh_kumar_Mishra1;~Anant_Raj2;~Sharan_Vaswani1", "gender": "M;;M", "homepage": ";;http://vaswanis.github.io", "dblp": "34/8757;;136/5916", "google_scholar": "https://scholar.google.ca/citations?user=RGx6DxAAAAAJ;;https://scholar.google.ca/citations?user=bDb2zWwAAAAJ", "orcid": ";;", "linkedin": ";;sharan-vaswani-05b8ab35/", "or_profile": "~Saurabh_kumar_Mishra1;~Anant_Raj2;~Sharan_Vaswani1", "aff": "Simon Fraser University;;Simon Fraser University", "aff_domain": "sfu.ca;;sfu.ca", "position": "MS student;;Assistant Professor", "bibtex": "@inproceedings{\nmishra2024from,\ntitle={From Inverse Optimization to Feasibility to {ERM}},\nauthor={Saurabh kumar Mishra and Anant Raj and Sharan Vaswani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GJzqRKOdRi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8728065, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6201641328720062482&as_sdt=805&sciodt=0,3&hl=en", "gs_version_total": 4, "email": "sfu.ca;;sfu.ca", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Simon Fraser University", "aff_unique_dep": "", "aff_unique_url": "https://www.sfu.ca", "aff_unique_abbr": "SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Active Statistical Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34504", "id": "GKMcCtWC7H", "proceeding": "https://proceedings.mlr.press/v235/zrnic24a.html", "pdf": "https://openreview.net/pdf?id=GKMcCtWC7H", "openreview": "https://openreview.net/forum?id=GKMcCtWC7H", "author_site": "Tijana Zrnic, Emmanuel J Candes", "tldr": "", "abstract": "Inspired by the concept of active learning, we propose active inference---a methodology for statistical inference with machine-learning-assisted data collection. Assuming a budget on the number of labels that can be collected, the methodology uses a machine learning model to identify which data points would be most beneficial to label, thus effectively utilizing the budget. It operates on a simple yet powerful intuition: prioritize the collection of labels for data points where the model exhibits uncertainty, and rely on the model's predictions where it is confident. Active inference constructs valid confidence intervals and hypothesis tests while leveraging any black-box machine learning model and handling any data distribution. The key point is that it achieves the same level of accuracy with far fewer samples than existing baselines relying on non-adaptively-collected data. This means that for the same number of collected samples, active inference enables smaller confidence intervals and more powerful tests. 
We evaluate active inference on datasets from public opinion research, census analysis, and proteomics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tijana Zrnic;Emmanuel Candes", "authorids": "~Tijana_Zrnic1;~Emmanuel_Candes1", "gender": "F;", "homepage": "https://tijana-zrnic.github.io;http://statweb.stanford.edu/~candes/", "dblp": "188/4437;", "google_scholar": ";nRQi4O8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Tijana_Zrnic1;~Emmanuel_Candes1", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nzrnic2024active,\ntitle={Active Statistical Inference},\nauthor={Tijana Zrnic and Emmanuel Candes},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GKMcCtWC7H}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 606215, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10558845239975632301&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "In-Context Unlearning: Language Models as Few-Shot Unlearners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34503", "id": "GKcwle8XC9", "proceeding": "https://proceedings.mlr.press/v235/pawelczyk24a.html", "pdf": "https://openreview.net/pdf?id=GKcwle8XC9", "openreview": "https://openreview.net/forum?id=GKcwle8XC9", "author_site": "Martin Pawelczyk, Seth Neel, Himabindu Lakkaraju", "tldr": "", "abstract": "Machine unlearning, the study of efficiently removing the impact of specific training instances on a model, has garnered increased attention in recent years due to regulatory guidelines such as the Right to be Forgotten. Achieving precise unlearning typically involves fully retraining the model and is computationally infeasible in case of very large models such as Large Language Models (LLMs). To this end, recent work has proposed several algorithms which approximate the removal of training data without retraining the model. These algorithms crucially rely on access to the model parameters in order to update them, an assumption that may not hold in practice due to computational constraints or having only query access to the LLMs. In this work, we propose a new class of unlearning methods for LLMs called ``In-Context Unlearning.'' This method unlearns instances from the model by simply providing specific kinds of inputs in context, without the need to update model parameters. To unlearn specific training instances, we present these instances to the LLMs at inference time along with labels that differ from their ground truth. 
Our experimental results demonstrate that in-context unlearning performs on par with, or in some cases outperforms, other state-of-the-art methods that require access to model parameters, effectively removing the influence of specific instances on the model while preserving test accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Martin Pawelczyk;Seth Neel;Himabindu Lakkaraju", "authorids": "~Martin_Pawelczyk1;~Seth_Neel2;~Himabindu_Lakkaraju1", "gender": "M;F;M", "homepage": "https://sites.google.com/view/martinpawelczyk/;http://web.stanford.edu/~himalv;https://sethneel.com", "dblp": "251/3229;68/9376;188/6406", "google_scholar": "oYAf_hgAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Martin_Pawelczyk1;~Hima_Lakkaraju1;~Seth_Neel1", "aff": "Harvard University;Harvard University;Harvard University", "aff_domain": "harvard.edu;harvard.edu;harvard.edu", "position": "Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\npawelczyk2024incontext,\ntitle={In-Context Unlearning: Language Models as Few-Shot Unlearners},\nauthor={Martin Pawelczyk and Seth Neel and Himabindu Lakkaraju},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GKcwle8XC9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3346207, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5161391496565915695&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "harvard.edu;harvard.edu;harvard.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "QuRating: Selecting High-Quality Data for Training Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34502", "id": "GLGYYqPwjy", "proceeding": "https://proceedings.mlr.press/v235/wettig24a.html", "pdf": "https://openreview.net/pdf?id=GLGYYqPwjy", "openreview": "https://openreview.net/forum?id=GLGYYqPwjy", "author_site": "Alexander Wettig, Aatmik Gupta, Saumya Malik, Danqi Chen", "tldr": "", "abstract": "Selecting high-quality pre-training data is important for creating capable language models, but existing methods rely on simple heuristics. We introduce QuRating, a method for selecting pre-training data that can capture human intuitions about data quality. In this paper, we investigate four qualities - writing style, required expertise, facts & trivia, and educational value - and find that LLMs are able to discern these qualities, especially when making pairwise judgments of texts. We train a QuRater model to learn scalar ratings from pairwise judgments, and use it to annotate a 260B training corpus with quality ratings for each of the four criteria. In our experiments, we select 30B tokens according to the different quality ratings and train 1.3B-parameter language models on the selected data. We find that it is important to balance quality and diversity. When we sample using quality ratings as logits over documents, our models obtain lower perplexity and stronger in-context learning performance than baselines. 
Our best model is based on educational value and performs similarly to a model trained with uniform sampling for 50% more steps. Beyond data selection, we use the quality ratings to construct a training curriculum which improves performance without changing the training dataset. We extensively analyze the quality ratings and discuss their characteristics, biases, and wider implications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Wettig;Aatmik Gupta;Saumya Malik;Danqi Chen", "authorids": "~Alexander_Wettig1;~Aatmik_Gupta1;~Saumya_Malik1;~Danqi_Chen1", "gender": ";M;F;F", "homepage": "https://www.cs.princeton.edu/~awettig/;https://www.linkedin.com/in/aatmik/;https://www.linkedin.com/in/saumya-malik-983a11229/;https://www.cs.princeton.edu/~danqic/", "dblp": "302/0235;;;87/7949", "google_scholar": "N_jSE08AAAAJ;;;sVR8ktkAAAAJ", "orcid": ";;;", "linkedin": "alexander-wettig/;;;", "or_profile": "~Alexander_Wettig1;~Aatmik_Gupta1;~Saumya_Malik1;~Danqi_Chen1", "aff": "Allen Institute for Artificial Intelligence;Princeton University;Princeton University;Princeton University", "aff_domain": "allenai.org;princeton.edu;princeton.edu;cs.princeton.edu", "position": "Intern;MS student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nwettig2024qurating,\ntitle={QuRating: Selecting High-Quality Data for Training Language Models},\nauthor={Alexander Wettig and Aatmik Gupta and Saumya Malik and Danqi Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GLGYYqPwjy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6791987271302816754&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "allenai.org;princeton.edu;princeton.edu;cs.princeton.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Allen Institute for Artificial Intelligence;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://allenai.org;https://www.princeton.edu", "aff_unique_abbr": "AI2;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neurodegenerative Brain Network Classification via Adaptive Diffusion with Temporal Regularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34501", "id": "GTnn6bNE3j", "proceeding": "https://proceedings.mlr.press/v235/cho24f.html", "pdf": "https://openreview.net/pdf?id=GTnn6bNE3j", "openreview": "https://openreview.net/forum?id=GTnn6bNE3j", "author_site": "Hyuna Cho, Jaeyoon Sim, Guorong Wu, Won Hwa Kim", "tldr": "", "abstract": "Analysis of neurodegenerative diseases on brain connectomes is important in facilitating early diagnosis and predicting its onset. However, investigation of the progressive and irreversible dynamics of these diseases remains underexplored in cross-sectional studies as its diagnostic groups are considered independent. Also, as in many real-world graphs, brain networks exhibit intricate structures with both homophily and heterophily. To address these challenges, we propose Adaptive Graph diffusion network with Temporal regularization (AGT). 
AGT introduces node-wise convolution to adaptively capture low-frequency (i.e., homophily) and high-frequency (i.e., heterophily) characteristics within an optimally tailored range for each node. Moreover, AGT captures sequential variations within progressive diagnostic groups with a novel temporal regularization, considering the relative feature distance between the groups in the latent space. As a result, our proposed model yields interpretable results at both the node level and the group level. The superiority of our method is validated on two neurodegenerative disease benchmarks for graph classification: the Alzheimer\u2019s Disease Neuroimaging Initiative (ADNI) and Parkinson\u2019s Progression Markers Initiative (PPMI) datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyuna Cho;Jaeyoon Sim;Guorong Wu;Won Hwa Kim", "authorids": "~Hyuna_Cho1;~Jaeyoon_Sim1;~Guorong_Wu1;~Won_Hwa_Kim4", "gender": "F;M;M;M", "homepage": "https://sites.google.com/view/hyunacho;https://mip.postech.ac.kr/;https://www.acmlab.org/;https://wwplato.github.io/", "dblp": "302/4777;;03/5225-1.html;12/10278", "google_scholar": ";XheFLZgAAAAJ;XVsMB2kAAAAJ;aWPSHNwAAAAJ", "orcid": ";;0000-0002-0550-6145;", "linkedin": ";jaeyoon-sim-86a4aa185/;;", "or_profile": "~Hyuna_Cho1;~Jaeyoon_Sim1;~Guorong_Wu1;~Won_Hwa_Kim1", "aff": "Pohang University of Science and Technology;Pohang University of Science and Technology;University of North Carolina, Chapel Hill;Pohang University of Science and Technology", "aff_domain": "postech.ac.kr;postech.ac.kr;unc.edu;postech.ac.kr", "position": "PhD student;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\ncho2024neurodegenerative,\ntitle={Neurodegenerative Brain Network Classification via Adaptive Diffusion with Temporal Regularization},\nauthor={Hyuna Cho and Jaeyoon Sim and Guorong Wu and Won Hwa Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GTnn6bNE3j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3258557, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13185840543943156140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "postech.ac.kr;postech.ac.kr;unc.edu;postech.ac.kr", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Pohang University of Science and Technology;University of North Carolina", "aff_unique_dep": ";", "aff_unique_url": "https://www.postech.ac.kr;https://www.unc.edu", "aff_unique_abbr": "POSTECH;UNC", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Pohang;Chapel Hill", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "Learning to Scale Logits for Temperature-Conditional GFlowNets", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34500", "id": "GUEsK9xJny", "proceeding": "https://proceedings.mlr.press/v235/kim24s.html", "pdf": "https://openreview.net/pdf?id=GUEsK9xJny", "openreview": "https://openreview.net/forum?id=GUEsK9xJny", "author_site": "Minsu Kim, Joohwan Ko, Taeyoung Yun, Dinghuai Zhang, Ling Pan, Woo Chang Kim, Jinkyoo Park, Emmanuel Bengio, Yoshua Bengio", "tldr": "", "abstract": "GFlowNets are probabilistic models that sequentially generate compositional structures through a stochastic policy. 
Among GFlowNets, temperature-conditional GFlowNets can introduce temperature-based controllability for exploration and exploitation. We propose *Logit-scaling GFlowNets* (Logit-GFN), a novel architectural design that greatly accelerates the training of temperature-conditional GFlowNets. It is based on the idea that previously proposed approaches introduced numerical challenges in deep network training, since different temperatures may give rise to very different gradient profiles as well as magnitudes of the policy's logits. We find that the challenge is greatly reduced if a learned function of the temperature is used to scale the policy's logits directly. Also, Logit-GFN equips GFlowNets with better generalization capabilities in offline learning and better mode discovery capabilities in online learning, which is empirically verified on various biological and chemical tasks. Our code is available at https://github.com/dbsxodud-11/logit-gfn", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minsu Kim;Joohwan Ko;Taeyoung Yun;Dinghuai Zhang;Ling Pan;Woo Chang Kim;Jinkyoo Park;Emmanuel Bengio;Yoshua Bengio", "authorids": "~Minsu_Kim2;~Joohwan_Ko2;~Taeyoung_Yun1;~Dinghuai_Zhang1;~Ling_Pan1;~Woo_Chang_Kim1;~Jinkyoo_Park1;~Emmanuel_Bengio1;~Yoshua_Bengio1", "gender": "M;M;M;;F;M;M;M;M", "homepage": "https://minsuukim.github.io/;https://joohwanko.com/;https://dbsxodud-11.github.io;;https://ling-pan.github.io/;http://felab.kaist.ac.kr/;http://silab.kaist.ac.kr/;http://folinoid.com;http://yoshuabengio.org", "dblp": ";358/5976;358/5797.html;;199/9303/;128/5936;156/7535;137/8040;56/953", "google_scholar": "https://scholar.google.ca/citations?user=VvyLuhAAAAAJ;;_51PhLQAAAAJ;;qZ_zlacAAAAJ;7NmBs1kAAAAJ;sH2a0nkAAAAJ;https://scholar.google.ca/citations?user=yVtSOt8AAAAJ;kukA0LcAAAAJ", "orcid": ";;0009-0001-4602-6367;;;0000-0001-8385-9598;0000-0003-2620-1479;;", "linkedin": ";;;;;;;;yoshuabengio/?originalSubdomain=ca", "or_profile": "~Minsu_Kim2;~Joohwan_Ko2;~Taeyoung_Yun1;~Dinghuai_Zhang1;~Ling_Pan1;~Woo_Chang_Kim1;~Jinkyoo_Park1;~Emmanuel_Bengio1;~Yoshua_Bengio1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;;Montreal Institute for Learning Algorithms (MILA);Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Valence Labs powered by recursion;University of Montreal", "aff_domain": "kaist.ac.kr;kaist.edu;kaist.ac.kr;;mila.umontreal.ca;kaist.ac.kr;kaist.ac.kr;valencelabs.com;umontreal.ca", "position": "PhD student;MS student;MS student;;Postdoc;Full Professor;Associate Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nkim2024learning,\ntitle={Learning to Scale Logits for Temperature-Conditional {GF}lowNets},\nauthor={Minsu Kim and Joohwan Ko and Taeyoung Yun and Dinghuai Zhang and Ling Pan and Woo Chang Kim and Jinkyoo Park and Emmanuel Bengio and Yoshua Bengio},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GUEsK9xJny}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8666455, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3163806682678880598&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": 
"kaist.ac.kr;kaist.edu;kaist.ac.kr;;mila.umontreal.ca;kaist.ac.kr;kaist.ac.kr;valencelabs.com;umontreal.ca", "author_num": 9, "aff_unique_index": "0;0;0;1;0;0;2;3", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Montreal Institute for Learning Algorithms;Valence Labs;University of Montreal", "aff_unique_dep": ";Artificial Intelligence;;", "aff_unique_url": "https://www.kaist.ac.kr;https://mila.quebec;;https://wwwumontreal.ca", "aff_unique_abbr": "KAIST;MILA;;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;1;0;0;1", "aff_country_unique": "South Korea;Canada;" }, { "title": "Retrieval Across Any Domains via Large-scale Pre-trained Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34499", "id": "GVmvBNxB73", "proceeding": "https://proceedings.mlr.press/v235/yan24h.html", "pdf": "https://openreview.net/pdf?id=GVmvBNxB73", "openreview": "https://openreview.net/forum?id=GVmvBNxB73", "author_site": "Jiexi Yan, Zhihui Yin, Chenghao Xu, Cheng Deng, Heng Huang", "tldr": "", "abstract": "In order to enhance the generalization ability towards unseen domains, universal cross-domain image retrieval methods require a training dataset encompassing diverse domains, which is costly to assemble. Given this constraint, we introduce a novel problem of data-free adaptive cross-domain retrieval, eliminating the need for real images during training. Towards this goal, we propose a novel Text-driven Knowledge Integration (TKI) method, which exclusively utilizes a pre-trained vision-language model to implement an ``aggregation after expansion\" training strategy. Specifically, we extract diverse implicit domain-specific information through a set of learnable domain word vectors. Subsequently, a domain-agnostic universal projection, equipped with a non-Euclidean multi-layer perceptron, can be optimized using these assorted text descriptions through the text-proxied domain aggregation. Leveraging the cross-modal transferability phenomenon of the shared latent space, we can integrate the trained domain-agnostic universal projection with the pre-trained visual encoder to extract the features of the input image for the following retrieval during testing. 
Extensive experimental results on several benchmark datasets demonstrate the superiority of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiexi Yan;Zhihui Yin;Chenghao Xu;Cheng Deng;Heng Huang", "authorids": "~Jiexi_Yan2;~Zhihui_Yin1;~Chenghao_Xu1;~Cheng_Deng2;~Heng_Huang1", "gender": "M;;M;M;M", "homepage": "https://jiexiyan.github.io/;https://github.com/xuexiyin;https://shydyl.github.io/;https://www.cs.umd.edu/~heng/;http://see.xidian.edu.cn/faculty/chdeng/", "dblp": "218/7132;;;03/281;", "google_scholar": "e3X2Z3IAAAAJ;;j86HvqcAAAAJ;4OqLaDwAAAAJ;OROjmc8AAAAJ", "orcid": "0000-0002-2544-3057;;0000-0001-5888-0504;;0000-0003-2620-3247", "linkedin": ";;;;", "or_profile": "~Jiexi_Yan2;~Zhihui_Yin1;~Chenghao_Xu1;~Heng_Huang1;~Cheng_Deng1", "aff": "Xidian University;Xidian University;Xi'an University of Electronic Science and Technology;Department of Computer Science, University of Maryland, College Park;Xidian University", "aff_domain": "xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;cs.umd.edu;xidian.edu.cn", "position": "Lecturer;MS student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyan2024retrieval,\ntitle={Retrieval Across Any Domains via Large-scale Pre-trained Model},\nauthor={Jiexi Yan and Zhihui Yin and Chenghao Xu and Cheng Deng and Heng Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GVmvBNxB73}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7642698, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2tSGgNQB3n0J:scholar.google.com/&scioq=Retrieval+Across+Any+Domains+via+Large-scale+Pre-trained+Model&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;cs.umd.edu;xidian.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Xidian University;Xi'an University of Electronic Science and Technology;University of Maryland, College Park", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "http://www.xidian.edu.cn/;http://www.xidian.edu.cn/;https://www.umd.edu", "aff_unique_abbr": "Xidian;Xidian University;UMD", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Xi'an;College Park", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Can Implicit Bias Imply Adversarial Robustness?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34498", "id": "GYGkt2M8ee", "proceeding": "https://proceedings.mlr.press/v235/min24a.html", "pdf": "https://openreview.net/pdf?id=GYGkt2M8ee", "openreview": "https://openreview.net/forum?id=GYGkt2M8ee", "author_site": "Hancheng Min, Rene Vidal", "tldr": "", "abstract": "The implicit bias of gradient-based training algorithms has been considered mostly beneficial as it leads to trained networks that often generalize well. However, Frei et al. (2023) show that such implicit bias can harm adversarial robustness. Specifically, they show that if the data consists of clusters with small inter-cluster correlation, a shallow (two-layer) ReLU network trained by gradient flow generalizes well, but it is not robust to adversarial attacks of small radius. Moreover, this phenomenon occurs despite the existence of a much more robust classifier that can be explicitly constructed from a shallow network. 
In this paper, we extend recent analyses of neuron alignment to show that a shallow network with a polynomial ReLU activation (pReLU) trained by gradient flow not only generalizes well but is also robust to adversarial attacks. Our results highlight the importance of the interplay between data structure and architecture design in the implicit bias and robustness of trained networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hancheng Min;Rene Vidal", "authorids": "~Hancheng_Min1;~Rene_Vidal1", "gender": "M;", "homepage": "https://hanchmin.github.io/;http://www.vision.jhu.edu", "dblp": "226/6324;v/ReneVidal", "google_scholar": "XgQjPZIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";rene-vidal-74844928/", "or_profile": "~Hancheng_Min1;~Rene_Vidal1", "aff": "University of Pennsylvania;Amazon", "aff_domain": "seas.upenn.edu;amazon.com", "position": "Postdoc;Principal Researcher", "bibtex": "@inproceedings{\nmin2024can,\ntitle={Can Implicit Bias Imply Adversarial Robustness?},\nauthor={Hancheng Min and Rene Vidal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GYGkt2M8ee}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4572508, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18131525920415707933&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "seas.upenn.edu;amazon.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Pennsylvania;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.upenn.edu;https://www.amazon.com", "aff_unique_abbr": "UPenn;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Can Mamba Learn How To Learn? A Comparative Study on In-Context Learning Tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34497", "id": "GbFluKMmtE", "proceeding": "https://proceedings.mlr.press/v235/park24j.html", "pdf": "https://openreview.net/pdf?id=GbFluKMmtE", "openreview": "https://openreview.net/forum?id=GbFluKMmtE", "author_site": "Jong Ho Park, Jaden Park, Zheyang Xiong, Nayoung Lee, Jaewoong Cho, Samet Oymak, Kangwook Lee, Dimitris Papailiopoulos", "tldr": "", "abstract": "State-space models (SSMs), such as Mamba (Gu & Dao, 2023), have been proposed as alternatives to Transformer networks in language modeling, incorporating gating, convolutions, and input-dependent token selection to mitigate the quadratic cost of multi-head attention. Although SSMs exhibit competitive performance, their in-context learning (ICL) capabilities, a remarkable emergent property of modern language models that enables task execution without parameter optimization, remain less explored compared to Transformers. In this study, we evaluate the ICL performance of SSMs, focusing on Mamba, against Transformer models across various tasks. Our results show that SSMs perform comparably to Transformers in standard regression ICL tasks, while outperforming them in tasks like sparse parity learning. However, SSMs fall short in tasks involving non-standard retrieval functionality. To address these limitations, we introduce a hybrid model, MambaFormer, that combines Mamba with attention blocks, surpassing individual models in tasks where they struggle independently. 
Our findings suggest that hybrid architectures offer promising avenues for enhancing ICL in language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jongho Park;Jaeseung Park;Zheyang Xiong;Nayoung Lee;Jaewoong Cho;Samet Oymak;Kangwook Lee;Dimitris Papailiopoulos", "authorids": "~Jongho_Park2;~Jaeseung_Park1;~Zheyang_Xiong1;~Nayoung_Lee1;~Jaewoong_Cho1;~Samet_Oymak2;~Kangwook_Lee1;~Dimitris_Papailiopoulos1", "gender": "M;;;;;;M;M", "homepage": "http://jerryjonghopark.github.io;;;;https://sites.google.com/view/jaewoongcho;;http://kangwooklee.com/;http://papail.io", "dblp": "03/1871-4.html;;;;184/3848;;88/9826-1;", "google_scholar": "e9cfXjQAAAAJ;;;;;;sCEl8r-n5VEC;hYi6i9sAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Jongho_Park2;~Jaeseung_Park1;~Zheyang_Xiong1;~Nayoung_Lee1;~Jaewoong_Cho1;~Samet_Oymak2;~Kangwook_Lee1;~Dimitris_Papailiopoulos1", "aff": "Krafton Inc.;;;;KRAFTON;;KRAFTON;University of Wisconsin - Madison", "aff_domain": "krafton.com;;;;krafton.com;;krafton.com;wisc.edu", "position": "Researcher;;;;Researcher;;Researcher;Associate Professor", "bibtex": "@inproceedings{\npark2024can,\ntitle={Can Mamba Learn How To Learn? A Comparative Study on In-Context Learning Tasks},\nauthor={Jongho Park and Jaeseung Park and Zheyang Xiong and Nayoung Lee and Jaewoong Cho and Samet Oymak and Kangwook Lee and Dimitris Papailiopoulos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GbFluKMmtE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1024705, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5302237790189716252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "krafton.com;;;;krafton.com;;krafton.com;wisc.edu", "author_num": 8, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "KRAFTON Inc.;University of Wisconsin-Madison", "aff_unique_dep": ";", "aff_unique_url": "https://www.krafton.com;https://www.wisc.edu", "aff_unique_abbr": "Krafton;UW-Madison", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Constrained Reinforcement Learning Under Model Mismatch", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34496", "id": "GcW9pg4P9x", "proceeding": "https://proceedings.mlr.press/v235/sun24d.html", "pdf": "https://openreview.net/pdf?id=GcW9pg4P9x", "openreview": "https://openreview.net/forum?id=GcW9pg4P9x", "author_site": "Zhongchang Sun, Sihong He, Fei Miao, Shaofeng Zou", "tldr": "", "abstract": "Existing studies on constrained reinforcement learning (RL) may obtain a well-performing policy in the training environment. However, when deployed in a real environment, it may easily violate constraints that were originally satisfied during training because there might be model mismatch between the training and real environments. To address this challenge, we formulate the problem as constrained RL under model uncertainty, where the goal is to learn a policy that optimizes the reward and at the same time satisfies the constraint under model mismatch. 
We develop a Robust Constrained Policy Optimization (RCPO) algorithm, which is the first algorithm that applies to large/continuous state spaces and has theoretical guarantees on worst-case reward improvement and constraint violation at each iteration during training. We show the effectiveness of our algorithm on a set of RL tasks with constraints.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhongchang Sun;Sihong He;Fei Miao;Shaofeng Zou", "authorids": "~Zhongchang_Sun1;~Sihong_He1;~Fei_Miao1;~Shaofeng_Zou1", "gender": ";F;F;", "homepage": ";https://sihonghe.com/;http://www.feimiao.org;", "dblp": ";237/6086;143/6002;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;-zSd9V0AAAAJ;fH2YF6YAAAAJ;", "orcid": ";;0000-0003-0066-4379;", "linkedin": ";;fei-miao-76964727/;", "or_profile": "~Zhongchang_Sun1;~Sihong_He1;~Fei_Miao1;~Shaofeng_Zou1", "aff": "State University of New York at Buffalo;University of Connecticut;University of Connecticut;", "aff_domain": "buffalo.edu;uconn.edu;uconn.edu;", "position": "PhD student;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nsun2024constrained,\ntitle={Constrained Reinforcement Learning Under Model Mismatch},\nauthor={Zhongchang Sun and Sihong He and Fei Miao and Shaofeng Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GcW9pg4P9x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1027995, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=426049728133720286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "buffalo.edu;uconn.edu;uconn.edu;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "State University of New York at Buffalo;University of Connecticut", "aff_unique_dep": ";", "aff_unique_url": "https://www.buffalo.edu;https://www.uconn.edu", "aff_unique_abbr": "SUNY Buffalo;UConn", "aff_campus_unique_index": "0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "LangCell: Language-Cell Pre-training for Cell Identity Understanding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34495", "id": "GcZjpKA37R", "proceeding": "https://proceedings.mlr.press/v235/zhao24u.html", "pdf": "https://openreview.net/pdf?id=GcZjpKA37R", "openreview": "https://openreview.net/forum?id=GcZjpKA37R", "author_site": "Suyuan Zhao, Jiahuan Zhang, Yushuai Wu, YIZHEN LUO, Zaiqing Nie", "tldr": "", "abstract": "Cell identity encompasses various semantic aspects of a cell, including cell type, pathway information, disease information, and more, which are essential for biologists to gain insights into its biological characteristics. Understanding cell identity from the transcriptomic data, such as annotating cell types, has become an important task in bioinformatics. As these semantic aspects are determined by human experts, it is impossible for AI models to effectively carry out cell identity understanding tasks without the supervision signals provided by single-cell and label pairs. The single-cell pre-trained language models (PLMs) currently used for this task are trained only on a single modality, transcriptomics data, and lack an understanding of cell identity knowledge. 
As a result, they have to be fine-tuned for downstream tasks and struggle when lacking labeled data with the desired semantic labels. To address this issue, we propose an innovative solution by constructing a unified representation of single-cell data and natural language during the pre-training phase, allowing the model to directly incorporate insights related to cell identity. More specifically, we introduce **LangCell**, the first **Lang**uage-**Cell** pre-training framework. LangCell utilizes texts enriched with cell identity information to gain a profound comprehension of cross-modal knowledge. Results from experiments conducted on different benchmarks show that LangCell is the only single-cell PLM that can work effectively in zero-shot cell identity understanding scenarios, and also significantly outperforms existing models in few-shot and fine-tuning cell identity understanding scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Suyuan Zhao;Jiahuan Zhang;Yushuai Wu;YIZHEN LUO;Zaiqing Nie", "authorids": "~Suyuan_Zhao1;~Jiahuan_Zhang1;~Yushuai_Wu1;~YIZHEN_LUO1;~Zaiqing_Nie2", "gender": "M;F;M;M;M", "homepage": "https://github.com/toycat-I;;https://github.com/wuys13;https://air.tsinghua.edu.cn/airtd/yjs.htm;https://air.tsinghua.edu.cn/en/info/1046/1192.htm", "dblp": "348/9782;;;286/8497;n/ZaiqingNie", "google_scholar": "YjWpJGUAAAAJ;;;;", "orcid": ";0000-0001-7722-6022;;;0000-0002-1134-2343", "linkedin": ";;;;", "or_profile": "~Suyuan_Zhao1;~Jiahuan_Zhang1;~Yushuai_Wu1;~YIZHEN_LUO1;~Zaiqing_Nie2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Computer Science and Technology, Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Postdoc;Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhao2024langcell,\ntitle={LangCell: Language-Cell Pre-training for Cell Identity Understanding},\nauthor={Suyuan Zhao and Jiahuan Zhang and Yushuai Wu and YIZHEN LUO and Zaiqing Nie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GcZjpKA37R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3620523, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6990365007182481925&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Doubly Recursive Stochastic Compositional Gradient Descent Method for Federated Multi-Level Compositional Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34494", "id": "GentO2E4ID", "proceeding": "https://proceedings.mlr.press/v235/gao24a.html", "pdf": "https://openreview.net/pdf?id=GentO2E4ID", "openreview": "https://openreview.net/forum?id=GentO2E4ID", "tldr": "", "abstract": "Federated compositional optimization has been actively studied in the past few years. 
However, existing methods mainly focus on the two-level compositional optimization problem and cannot be directly applied to its multi-level counterparts. Moreover, the convergence rate of existing federated two-level compositional optimization learning algorithms fails to achieve linear speedup with respect to the number of workers under heterogeneous settings. After identifying the reason for this failure, we develop a novel federated stochastic multi-level compositional optimization algorithm by introducing a new Jacobian-vector product estimator. This innovation mitigates both the heterogeneity issue and the communication efficiency issue simultaneously. We then theoretically prove that our algorithm can achieve the level-independent and linear speedup convergence rate for nonconvex problems. To our knowledge, this is the first time that a federated learning algorithm can achieve such a favorable convergence rate for multi-level compositional problems. Moreover, experimental results confirm the efficacy of our algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongchang Gao", "authorids": "~Hongchang_Gao3", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\ngao2024a,\ntitle={A Doubly Recursive Stochastic Compositional Gradient Descent Method for Federated Multi-Level Compositional Optimization},\nauthor={Hongchang Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GentO2E4ID}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 781638, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:khD5Y4QUGMAJ:scholar.google.com/&scioq=A+Doubly+Recursive+Stochastic+Compositional+Gradient+Descent+Method+for+Federated+Multi-Level+Compositional+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "", "author_num": 1 }, { "title": "Incorporating probabilistic domain knowledge into deep multiple instance learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34493", "id": "GfNyqrwECJ", "proceeding": "https://proceedings.mlr.press/v235/hajj24a.html", "pdf": "https://openreview.net/pdf?id=GfNyqrwECJ", "openreview": "https://openreview.net/forum?id=GfNyqrwECJ", "author_site": "Ghadi S. Al Hajj, Aliaksandr Hubin, Chakravarthi Kanduri, Milena Pavlovi\u0107, Knut Rand, Michael Widrich, Anne Solberg, Victor Greiff, Johan Pensar, G\u00fcnter Klambauer, Geir Kjetil Sandve", "tldr": "", "abstract": "Deep learning methods, including deep multiple instance learning methods, have been criticized for their limited ability to incorporate domain knowledge. A reason that knowledge incorporation is challenging in deep learning is that the models usually lack a mapping between their model components and the entities of the domain, making it a non-trivial task to incorporate probabilistic prior information. In this work, we show that such a mapping between domain entities and model components can be defined for a multiple instance learning setting and propose a framework, DeeMILIP, that encompasses multiple strategies to exploit this mapping for prior knowledge incorporation. We motivate and formalize these strategies from a probabilistic perspective. 
Experiments on an immune-based diagnostics case show that our proposed strategies allow us to learn generalizable models even in settings with weak signals, limited dataset size, and limited compute.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ghadi S. Al Hajj;Aliaksandr Hubin;Chakravarthi Kanduri;Milena Pavlovic;Knut Dagestad Rand;Michael Widrich;Anne Schistad Solberg;Victor Greiff;Johan Pensar;G\u00fcnter Klambauer;Geir Kjetil Sandve", "authorids": "~Ghadi_S._Al_Hajj1;~Aliaksandr_Hubin1;skanduri@ifi.uio.no;~Milena_Pavlovic1;knutdr@math.uio.no;~Michael_Widrich2;~Anne_Schistad_Solberg1;~Victor_Greiff1;~Johan_Pensar2;~G\u00fcnter_Klambauer1;~Geir_Kjetil_Sandve1", "gender": "M;M;;;;;F;M;M;M;M", "homepage": ";https://www.mn.uio.no/math/english/people/aca/aliaksah/;;;;;;https://greifflab.org/;https://www.mn.uio.no/math/english/people/aca/johanpen/;http://www.bioinf.jku.at/people/klambauer/;https://sandvelab.org/", "dblp": ";223/7583;;;;222/2772;;;136/5949;119/4499;", "google_scholar": "SQSuebQAAAAJ;Lx-G8ckAAAAJ;;j6pLL-oAAAAJ;;eMcniXgAAAAJ;;;https://scholar.google.fi/citations?user=MbG9mP4AAAAJ;https://scholar.google.at/citations?user=rb2AvxIAAAAJ;https://scholar.google.no/citations?user=L1lXnHAAAAAJ", "orcid": "0000-0003-1639-1424;;;0000-0002-2484-3868;;0000-0002-5721-0135;0000-0002-6149-971X;;;0000-0003-2861-5552;0000-0002-4959-1409", "linkedin": "ghadi-s-al-hajj/;;;;;mwidrich/;;;;;geir-kjetil-sandve-07164961/", "or_profile": "~Ghadi_S._Al_Hajj1;~Aliaksandr_Hubin1;skanduri@ifi.uio.no;~Milena_Pavlovic1;knutdr@math.uio.no;~Michael_Widrich2;~Anne_Schistad_Solberg1;~Victor_Greiff1;~Johan_Pensar2;~G\u00fcnter_Klambauer1;~Geir_Kjetil_Sandve1", "aff": "University of Oslo;OUC;;University of Oslo;;Freenome Holdings, Inc. ;University of Oslo;University of Oslo;University of Oslo;Johannes Kepler Universit\u00e4t Linz;University of Oslo", "aff_domain": "uio.no;hiof.no;;uio.no;;freenome.com;uio.no;uio.no;uio.no;jku.at;uio.no", "position": "PhD student;Associate Professor;;Postdoc;;Researcher;Full Professor;Associate Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhajj2024incorporating,\ntitle={Incorporating probabilistic domain knowledge into deep multiple instance learning},\nauthor={Ghadi S. 
Al Hajj and Aliaksandr Hubin and Chakravarthi Kanduri and Milena Pavlovic and Knut Dagestad Rand and Michael Widrich and Anne Schistad Solberg and Victor Greiff and Johan Pensar and G{\\\"u}nter Klambauer and Geir Kjetil Sandve},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GfNyqrwECJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4427635, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "uio.no;hiof.no;;uio.no;;freenome.com;uio.no;uio.no;uio.no;jku.at;uio.no", "author_num": 11, "aff_unique_index": "0;1;0;2;0;0;0;3;0", "aff_unique_norm": "University of Oslo;Ocean University of China;Freenome Holdings, Inc.;Johannes Kepler University Linz", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uio.no;https://www.ouc.edu.cn;;https://www.jku.at", "aff_unique_abbr": "UiO;OUC;;JKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Linz", "aff_country_unique_index": "0;1;0;2;0;0;0;3;0", "aff_country_unique": "Norway;China;United States;Austria" }, { "title": "Networked Inequality: Preferential Attachment Bias in Graph Neural Network Link Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34492", "id": "GhPFmTJNfj", "proceeding": "https://proceedings.mlr.press/v235/subramonian24a.html", "pdf": "https://openreview.net/pdf?id=GhPFmTJNfj", "openreview": "https://openreview.net/forum?id=GhPFmTJNfj", "author_site": "Arjun Subramonian, Levent Sagun, Yizhou Sun", "tldr": "", "abstract": "Graph neural network (GNN) link prediction is increasingly deployed in citation, collaboration, and online social networks to recommend academic literature, collaborators, and friends. While prior research has investigated the dyadic fairness of GNN link prediction, the within-group (e.g., queer women) fairness and \"rich get richer\" dynamics of link prediction remain underexplored. However, these aspects have significant consequences for degree and power imbalances in networks. In this paper, we shed light on how degree bias in networks affects Graph Convolutional Network (GCN) link prediction. In particular, we theoretically uncover that GCNs with a symmetric normalized graph filter have a within-group preferential attachment bias. We validate our theoretical analysis on real-world citation, collaboration, and online social networks. We further bridge GCN's preferential attachment bias with unfairness in link prediction and propose a new within-group fairness metric. This metric quantifies disparities in link prediction scores within social groups, towards combating the amplification of degree and power disparities. 
Finally, we propose a simple training-time strategy to alleviate within-group unfairness, and we show that it is effective on citation, social, and credit networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arjun Subramonian;Levent Sagun;Yizhou Sun", "authorids": "~Arjun_Subramonian1;~Levent_Sagun1;~Yizhou_Sun1", "gender": "Agender;Non-Binary;F", "homepage": "http://arjunsubramonian.github.io/;http://cims.nyu.edu/~sagun/;http://web.cs.ucla.edu/~yzsun/", "dblp": "282/0168.html;155/9866;37/3868", "google_scholar": "MrdlDhoAAAAJ;-iPZaBcAAAAJ;https://scholar.google.com.tw/citations?user=TQgOjK0AAAAJ", "orcid": "0000-0002-0415-3800;0000-0001-5403-4124;", "linkedin": "arjuns22/;;", "or_profile": "~Arjun_Subramonian1;~Levent_Sagun1;~Yizhou_Sun1", "aff": "University of California, Los Angeles;Meta;University of California, Los Angeles", "aff_domain": "ucla.edu;meta.com;ucla.edu", "position": "PhD student;Research scientist;Associate Professor", "bibtex": "@inproceedings{\nsubramonian2024networked,\ntitle={Networked Inequality: Preferential Attachment Bias in Graph Neural Network Link Prediction},\nauthor={Arjun Subramonian and Levent Sagun and Yizhou Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GhPFmTJNfj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3558672, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15580880506040045795&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ucla.edu;meta.com;ucla.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Los Angeles;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.ucla.edu;https://meta.com", "aff_unique_abbr": "UCLA;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Uncertainty for Gradient Aggregation in Multi-Task Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34491", "id": "GiHo83ozsF", "proceeding": "https://proceedings.mlr.press/v235/achituve24a.html", "pdf": "https://openreview.net/pdf?id=GiHo83ozsF", "openreview": "https://openreview.net/forum?id=GiHo83ozsF", "author_site": "Idan Achituve, Idit Diamant, Arnon Netzer, Gal Chechik, Ethan Fetaya", "tldr": "", "abstract": "As machine learning becomes more prominent there is a growing demand to perform several inference tasks in parallel. Multi-task learning (MTL) addresses this challenge by learning a single model that solves several tasks simultaneously and efficiently. Often optimizing MTL models entails first computing the gradient of the loss for each task, and then aggregating all the gradients to obtain a combined update direction. However, common methods following this approach do not consider an important aspect, the sensitivity in the dimensions of the gradients. Some dimensions may be more lenient for changes while others may be more restrictive. Here, we introduce a novel gradient aggregation procedure using Bayesian inference. We place a probability distribution over the task-specific parameters, which in turn induce a *distribution* over the gradients of the tasks. 
This valuable information allows us to quantify the uncertainty associated with each of the gradients' dimensions which is factored in when aggregating them. We empirically demonstrate the benefits of our approach in a variety of datasets, achieving state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Idan Achituve;Idit Diamant;Arnon Netzer;Gal Chechik;Ethan Fetaya", "authorids": "~Idan_Achituve1;~Idit_Diamant1;~Arnon_Netzer1;~Gal_Chechik1;~Ethan_Fetaya1", "gender": ";;M;;M", "homepage": "https://idanachituve.github.io/;;;https://chechiklab.biu.ac.il/~gal/;http://www.cs.toronto.edu/~ethanf/", "dblp": "254/8524;;35/10503;c/GalChechik;01/10046", "google_scholar": "UQIBiUcAAAAJ;;uAzZQ0gAAAAJ;Wk2gAZUAAAAJ;zLuqh-0AAAAJ", "orcid": ";;0009-0000-5339-9439;0000-0001-9164-5303;0000-0003-3125-1665", "linkedin": "idanachituve;;;;", "or_profile": "~Idan_Achituve1;~Idit_Diamant1;~Arnon_Netzer1;~Gal_Chechik1;~Ethan_Fetaya1", "aff": "Bar Ilan University;;Sony;NVIDIA;Bar Ilan University", "aff_domain": "biu.ac.il;;sony.com;nvidia.com;biu.ac.il", "position": "PhD student;;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nachituve2024bayesian,\ntitle={Bayesian Uncertainty for Gradient Aggregation in Multi-Task Learning},\nauthor={Idan Achituve and Idit Diamant and Arnon Netzer and Gal Chechik and Ethan Fetaya},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GiHo83ozsF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1002712, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8390054797772653798&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "biu.ac.il;;sony.com;nvidia.com;biu.ac.il", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Bar-Ilan University;Sony Corporation;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "https://www.biu.ac.il;https://www.sony.com;https://www.nvidia.com", "aff_unique_abbr": "BIU;Sony;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Israel;Japan;United States" }, { "title": "Reducing Balancing Error for Causal Inference via Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34490", "id": "GktjBAGgo4", "proceeding": "https://proceedings.mlr.press/v235/yan24i.html", "pdf": "https://openreview.net/pdf?id=GktjBAGgo4", "openreview": "https://openreview.net/forum?id=GktjBAGgo4", "author_site": "Yuguang Yan, Hao Zhou, Zeqin Yang, Weilin Chen, Ruichu Cai, Zhifeng Hao", "tldr": "", "abstract": "Most studies on causal inference tackle the issue of confounding bias by reducing the distribution shift between the control and treated groups. However, it remains an open question to adopt an appropriate metric for distribution shift in practice. In this paper, we define a generic balancing error on reweighted samples to characterize the confounding bias, and study the connection between the balancing error and the Wasserstein discrepancy derived from the theory of optimal transport. We not only regard the Wasserstein discrepancy as the metric of distribution shift, but also explore the association between the balancing error and the underlying cost function involved in the Wasserstein discrepancy. 
Motivated by this, we propose to reduce the balancing error under the framework of optimal transport with learnable marginal distributions and the cost function, which is implemented by jointly learning weights and representations associated with factual outcomes. The experiments on both synthetic and real-world datasets demonstrate the effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuguang Yan;Hao Zhou;Zeqin Yang;Weilin Chen;Ruichu Cai;Zhifeng Hao", "authorids": "~Yuguang_Yan1;~Hao_Zhou23;~Zeqin_Yang1;~Weilin_Chen1;~Ruichu_Cai1;~Zhifeng_Hao5", "gender": "M;M;M;M;M;M", "homepage": ";;;;https://ruichucai.github.io/;https://www.scholat.com/zfhao", "dblp": "154/0064;;;;09/6889;", "google_scholar": ";;https://scholar.google.com/citations?hl=zh-CN;KVvl1vgAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";0009-0007-3546-4699;0009-0004-3343-4195;;;", "linkedin": ";;;;;", "or_profile": "~Yuguang_Yan1;~Hao_Zhou23;~Zeqin_Yang1;~Weilin_Chen1;~Ruichu_Cai1;~Zhifeng_Hao5", "aff": "Guangdong University of Technology;Guangdong University of Technology;Guangdong University of Technology;University of Cambridge;Guangdong University of Technology;", "aff_domain": "gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;cam.ac.uk;gdut.edu.cn;", "position": "Associate Professor;MS student;MS student;Visiting Student;Full Professor;", "bibtex": "@inproceedings{\nyan2024reducing,\ntitle={Reducing Balancing Error for Causal Inference via Optimal Transport},\nauthor={Yuguang Yan and Hao Zhou and Zeqin Yang and Weilin Chen and Ruichu Cai and Zhifeng Hao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GktjBAGgo4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 627745, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3584829821722728016&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 4, "email": "gdut.edu.cn;gdut.edu.cn;gdut.edu.cn;cam.ac.uk;gdut.edu.cn;", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Guangdong University of Technology;University of Cambridge", "aff_unique_dep": ";", "aff_unique_url": "http://www.gdut.edu.cn;https://www.cam.ac.uk", "aff_unique_abbr": "GDUT;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Learning in Feature Spaces via Coupled Covariances: Asymmetric Kernel SVD and Nystr\u00f6m method", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34489", "id": "Gp0xZDmrA2", "proceeding": "https://proceedings.mlr.press/v235/tao24b.html", "pdf": "https://openreview.net/pdf?id=Gp0xZDmrA2", "openreview": "https://openreview.net/forum?id=Gp0xZDmrA2", "author_site": "Qinghua Tao, Francesco Tonin, Alex Lambert, Yingyi Chen, Panagiotis Patrinos, Johan Suykens", "tldr": "", "abstract": "In contrast with Mercer kernel-based approaches as used e.g. in Kernel Principal Component Analysis (KPCA), it was previously shown that Singular Value Decomposition (SVD) inherently relates to asymmetric kernels and Asymmetric Kernel Singular Value Decomposition (KSVD) has been proposed. 
However, the existing formulation of KSVD cannot work with infinite-dimensional feature mappings, its variational objective can be unbounded, and it needs further numerical evaluation and exploration for machine learning. In this work, i) we introduce a new asymmetric learning paradigm based on the coupled covariance eigenproblem (CCE) through covariance operators, allowing infinite-dimensional feature maps. The solution to CCE is ultimately obtained from the SVD of the induced asymmetric kernel matrix, providing links to KSVD. ii) Starting from the integral equations corresponding to a pair of coupled adjoint eigenfunctions, we formalize the asymmetric Nystr\u00f6m method through a finite sample approximation to speed up training. iii) We provide the first empirical evaluations verifying the practical utility and benefits of KSVD and compare with methods resorting to symmetrization or linear SVD across multiple tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qinghua Tao;Francesco Tonin;Alex Lambert;Yingyi Chen;Panagiotis Patrinos;Johan Suykens", "authorids": "~Qinghua_Tao1;~Francesco_Tonin1;~Alex_Lambert1;~Yingyi_Chen3;~Panagiotis_Patrinos1;~Johan_Suykens1", "gender": "F;;;F;M;M", "homepage": "https://qinghua-tao.github.io/;https://taralloc.github.io/;https://allambert.github.io/;;https://homes.esat.kuleuven.be/~ppatrino/index.html;https://www.kuleuven.be/wieiswie/nl/person/00015385", "dblp": "182/9643.html;279/6777;177/4546;09/9441;55/896;61/3224", "google_scholar": "_dZHZD8AAAAJ;;https://scholar.google.fr/citations?user=iK4XH44AAAAJ;5b2jAVUAAAAJ;Qiwt2t8AAAAJ;https://scholar.google.be/citations?user=WtBmh0UAAAAJ", "orcid": "0000-0001-9705-7748;0000-0002-5644-0086;;0000-0002-5571-9050;0000-0003-4824-7697;0000-0002-8846-6352", "linkedin": ";;;;;", "or_profile": "~Qinghua_Tao1;~Francesco_Tonin1;~Alex_Lambert1;~Yingyi_Chen3;~Panagiotis_Patrinos1;~Johan_Suykens1", "aff": "(ESAT) Department of Electrical Engineering, KU Leuven, Belgium, KU Leuven;EPFL - EPF Lausanne;KU Leuven;Department of Electrical Engineering, KU Leuven, Belgium;;KU Leuven", "aff_domain": "esat.kuleuven.be;epfl.ch;kuleuven.be;esat.kuleuven.be;;kuleuven.be", "position": "Postdoc;Postdoc;Postdoc;PhD student;;Full Professor", "bibtex": "@inproceedings{\ntao2024learning,\ntitle={Learning in Feature Spaces via Coupled Covariances: Asymmetric Kernel {SVD} and Nystr\\\"om method},\nauthor={Qinghua Tao and Francesco Tonin and Alex Lambert and Yingyi Chen and Panagiotis Patrinos and Johan Suykens},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Gp0xZDmrA2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 636700, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9506513163792588048&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "esat.kuleuven.be;epfl.ch;kuleuven.be;esat.kuleuven.be;;kuleuven.be", "author_num": 6, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "KU Leuven;EPFL;Katholieke Universiteit Leuven", "aff_unique_dep": "Department of Electrical Engineering;;", "aff_unique_url": "https://www.kuleuven.be;https://www.epfl.ch;https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven;EPFL;KU Leuven", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Belgium;Switzerland" }, { "title": "Iterative Regularized Policy 
Optimization with Imperfect Demonstrations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34488", "id": "Gp5F6qzwGK", "proceeding": "https://proceedings.mlr.press/v235/xudong24a.html", "pdf": "https://openreview.net/pdf?id=Gp5F6qzwGK", "openreview": "https://openreview.net/forum?id=Gp5F6qzwGK", "author_site": "Xudong Gong, Feng Dawei, Kele Xu, Yuanzhao Zhai, Chengkang Yao, Weijia Wang, Bo Ding, Huaimin Wang", "tldr": "", "abstract": "Imitation learning heavily relies on the quality of provided demonstrations. In scenarios where demonstrations are imperfect and rare, a prevalent approach for refining policies is through online fine-tuning with reinforcement learning, in which a Kullback\u2013Leibler (KL) regularization is often employed to stabilize the learning process. However, our investigation reveals that, on the one hand, imperfect demonstrations can bias the online learning process, while on the other hand, the KL regularization further constrains the improvement of online policy exploration. To address the above issues, we propose Iterative Regularized Policy Optimization (IRPO), a framework that involves iterative offline imitation learning and online reinforcement exploration. Specifically, the policy learned online serves as the demonstrator for successive learning iterations, with demonstration boosting to consistently enhance the quality of demonstrations. Experimental validations conducted across widely used benchmarks and a novel fixed-wing UAV control task consistently demonstrate the effectiveness of IRPO in improving both the demonstration quality and the policy performance. Our code is available at https://github.com/GongXudong/IRPO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gong Xudong;Feng Dawei;Kele Xu;Yuanzhao Zhai;Chengkang Yao;Weijia Wang;Bo Ding;Huaimin Wang", "authorids": "~Gong_Xudong1;~Feng_Dawei1;~Kele_Xu2;~Yuanzhao_Zhai1;yaochengkang@126.com;weijia.hust@gmail.com;~Bo_Ding1;~Huaimin_Wang1", "gender": "M;;;;;;M;M", "homepage": "https://github.com/GongXudong;;;;;;;", "dblp": "119/3474;;;;;;;02/661", "google_scholar": ";;;;;;XS0voEAAAAAJ;", "orcid": "0000-0002-2253-2927;;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Gong_Xudong1;~Feng_Dawei1;~Kele_Xu2;~Yuanzhao_Zhai1;yaochengkang@126.com;weijia.hust@gmail.com;~Bo_Ding1;~Huaimin_Wang1", "aff": "National University of Defense Technology;;;;;;National University of Defense Technology;National University of Defense Technology", "aff_domain": "nudt.edu.cn;;;;;;nudt.edu.cn;nudt.edu.cn", "position": "PhD student;;;;;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nxudong2024iterative,\ntitle={Iterative Regularized Policy Optimization with Imperfect Demonstrations},\nauthor={Gong Xudong and Feng Dawei and Kele Xu and Yuanzhao Zhai and Chengkang Yao and Weijia Wang and Bo Ding and Huaimin Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Gp5F6qzwGK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7011245, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14672731808904631353&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "nudt.edu.cn;;;;;;nudt.edu.cn;nudt.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Defense Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.nudt.edu.cn/", 
"aff_unique_abbr": "NUDT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Rich-Observation Reinforcement Learning with Continuous Latent Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34487", "id": "Gq1ajaKhBC", "proceeding": "https://proceedings.mlr.press/v235/song24i.html", "pdf": "https://openreview.net/pdf?id=Gq1ajaKhBC", "openreview": "https://openreview.net/forum?id=Gq1ajaKhBC", "author_site": "Yuda Song, Lili Wu, Dylan Foster, Akshay Krishnamurthy", "tldr": "", "abstract": "Sample-efficiency and reliability remain major bottlenecks toward wide adoption of reinforcement learning algorithms in continuous settings with high-dimensional perceptual inputs. Toward addressing these challenges, we introduce a new theoretical framework, **RichCLD** (\u201cRich-Observation RL with Continuous Latent Dynamics\u201d), in which the agent performs control based on high-dimensional observations, but the environment is governed by low-dimensional latent states and Lipschitz continuous dynamics. Our main contribution is a new algorithm for this setting that is provably statistically and computationally efficient. The core of our algorithm is a new representation learning objective; we show that prior representation learning schemes tailored to discrete dynamics do not naturally extend to the continuous setting. Our new objective is amenable to practical implementation, and empirically, we find that it compares favorably to prior schemes in a standard evaluation protocol. We further provide several insights into the statistical complexity of the **RichCLD** framework, in particular proving that certain notions of Lipschitzness that admit sample-efficient learning in the absence of rich observations are insufficient in the rich-observation setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuda Song;Lili Wu;Dylan J Foster;Akshay Krishnamurthy", "authorids": "~Yuda_Song2;~Lili_Wu1;~Dylan_J_Foster1;~Akshay_Krishnamurthy1", "gender": "M;;;M", "homepage": "https://yudasong.github.io/;;http://dylanfoster.net;https://www.cics.umass.edu/~akshay/", "dblp": "250/4880-1;91/1716;167/4271;85/8024", "google_scholar": "0QDCG8IAAAAJ;x8fnPxAAAAAJ;RqwU8xsAAAAJ;https://scholar.google.com.tw/citations?user=K0kaNvkAAAAJ", "orcid": ";;;", "linkedin": ";lili-wu-71456674;;", "or_profile": "~Yuda_Song2;~Lili_Wu1;~Dylan_J_Foster1;~Akshay_Krishnamurthy1", "aff": "Carnegie Mellon University;Microsoft Research NYC;Microsoft Research;Microsoft Research", "aff_domain": "andrew.cmu.edu;microsoft.com;microsoft.com;research.microsoft.com", "position": "PhD student;Data and applied scientist;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\nsong2024richobservation,\ntitle={Rich-Observation Reinforcement Learning with Continuous Latent Dynamics},\nauthor={Yuda Song and Lili Wu and Dylan J Foster and Akshay Krishnamurthy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Gq1ajaKhBC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2816026, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16485842097334250006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "andrew.cmu.edu;microsoft.com;microsoft.com;research.microsoft.com", "author_num": 4, 
"aff_unique_index": "0;1;1;1", "aff_unique_norm": "Carnegie Mellon University;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-york-city", "aff_unique_abbr": "CMU;MSR NYC", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Co-Training under Weak Dependence via Stream-Based Active Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34486", "id": "GqWy1wZKeE", "proceeding": "https://proceedings.mlr.press/v235/diakonikolas24b.html", "pdf": "https://openreview.net/pdf?id=GqWy1wZKeE", "openreview": "https://openreview.net/forum?id=GqWy1wZKeE", "author_site": "Ilias Diakonikolas, Mingchen Ma, Lisheng Ren, Christos Tzamos", "tldr": "", "abstract": "Co-training is a classical semi-supervised learning method which only requires a small number of labeled examples for learning, under reasonable assumptions. Despite extensive literature on the topic, very few hypothesis classes are known to be provably efficiently learnable via co-training, even under very strong distributional assumptions. In this work, we study the co-training problem in the stream-based active learning model. We show that a range of natural concept classes are efficiently learnable via co-training, in terms of both label efficiency and computational efficiency. We provide an efficient reduction of co-training under the standard assumption of weak dependence, in the stream-based active model, to online classification. As a corollary, we obtain efficient co-training algorithms with error independent label complexity for every concept class class efficiently learnable in the mistake bound online model. Our framework also gives co-training algorithms with label complexity $\\tilde{O}(d\\log (1/\\epsilon))$ for any concept class with VC dimension $d$, though in general this reduction is not computationally efficient. 
Finally, using additional ideas from online learning, we design the first efficient co-training algorithms with label complexity $\\tilde{O}(d^2\\log (1/\\epsilon))$ for several concept classes, including unions of intervals and homogeneous halfspaces.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ilias Diakonikolas;Mingchen Ma;Lisheng Ren;Christos Tzamos", "authorids": "~Ilias_Diakonikolas1;~Mingchen_Ma1;~Lisheng_Ren1;~Christos_Tzamos1", "gender": "M;;;", "homepage": "http://www.iliasdiakonikolas.org/;https://mmingchen.github.io/;https://www.wisc.edu/directories/person/?q=Lisheng%20Ren&email=lren29%40wisc.edu&savedQuery=Lisheng%20Ren&returnPath=%2Fdirectories%2F;https://tzamos.com", "dblp": "d/IliasDiakonikolas;270/6320;93/495;79/8819", "google_scholar": "Vb3FLmkAAAAJ;w84UnLsAAAAJ;;wB01auEAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ilias_Diakonikolas1;~Mingchen_Ma1;~Lisheng_Ren1;~Christos_Tzamos1", "aff": "University of Wisconsin - Madison;University of Wisconsin - Madison;University of Wisconsin - Madison;University of Wisconsin, Madison", "aff_domain": "wisc.edu;wisc.edu;wisc.edu;wisc.edu", "position": "Full Professor;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ndiakonikolas2024fast,\ntitle={Fast Co-Training under Weak Dependence via Stream-Based Active Learning},\nauthor={Ilias Diakonikolas and Mingchen Ma and Lisheng Ren and Christos Tzamos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GqWy1wZKeE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 451057, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5364391427664973095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "wisc.edu;wisc.edu;wisc.edu;wisc.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UW", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Indirectly Parameterized Concrete Autoencoders", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34485", "id": "GqsRKEhelH", "proceeding": "https://proceedings.mlr.press/v235/nilsson24b.html", "pdf": "https://openreview.net/pdf?id=GqsRKEhelH", "openreview": "https://openreview.net/forum?id=GqsRKEhelH", "author_site": "Alfred Nilsson, Klas Wijk, Sai bharath chandra Gutha, Erik Englesson, Alexandra Hotti, Carlo Saccardi, Oskar Kviman, Jens Lagergren, Ricardo Vinuesa, Hossein Azizpour", "tldr": "", "abstract": "Feature selection is a crucial task in settings where data is high-dimensional or acquiring the full set of features is costly. Recent developments in neural network-based embedded feature selection show promising results across a wide range of applications. Concrete Autoencoders (CAEs), considered state-of-the-art in embedded feature selection, may struggle to achieve stable joint optimization, hurting their training time and generalization. In this work, we identify that this instability is correlated with the CAE learning duplicate selections. To remedy this, we propose a simple and effective improvement: Indirectly Parameterized CAEs (IP-CAEs). 
IP-CAEs learn an embedding and a mapping from it to the Gumbel-Softmax distributions' parameters. Despite being simple to implement, IP-CAE exhibits significant and consistent improvements over CAE in both generalization and training time across several datasets for reconstruction and classification. Unlike CAE, IP-CAE effectively leverages non-linear relationships and does not require retraining the jointly optimized decoder. Furthermore, our approach is, in principle, generalizable to Gumbel-Softmax distributions beyond feature selection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alfred Nilsson;Klas Wijk;Sai bharath chandra Gutha;Erik Englesson;Alexandra Hotti;Carlo Saccardi;Oskar Kviman;Jens Lagergren;Ricardo Vinuesa Motilva;Hossein Azizpour", "authorids": "~Alfred_Nilsson1;~Klas_Wijk1;~Sai_bharath_chandra_Gutha1;~Erik_Englesson1;~Alexandra_Hotti1;~Carlo_Saccardi1;~Oskar_Kviman1;~Jens_Lagergren1;~Ricardo_Vinuesa_Motilva1;~Hossein_Azizpour2", "gender": "M;M;M;M;F;;M;M;M;M", "homepage": ";https://klaswijk.github.io/;;https://www.kth.se/profile/engless/;;;https://okviman.github.io/;https://lagergrenlab.org/;https://www.vinuesalab.com/;http://www.csc.kth.se/~azizpour/", "dblp": ";;263/3527.html;243/3256;;;248/5749;86/3552;;119/1327", "google_scholar": "https://scholar.google.ca/citations?user=IeJIQ5gAAAAJ;5OjrKLoAAAAJ;fS2-mGEAAAAJ;qR-xwSQAAAAJ;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=sv;;https://scholar.google.se/citations?user=t6CRgJsAAAAJ", "orcid": ";0009-0006-5400-8704;0000-0002-9337-564X;;;;;;;0000-0001-5211-6388", "linkedin": "alfred-nilsson-f/;klas-wijk/;guthasaibharathchandra/;;alexandra-hotti-784b45106/;carlo-saccardi-a82191227/;;;;hossein-azizpour-578b4ab1", "or_profile": "~Alfred_Nilsson1;~Klas_Wijk1;~Sai_bharath_chandra_Gutha1;~Erik_Englesson1;~Alexandra_Hotti1;~Carlo_Saccardi1;~Oskar_Kviman1;~Jens_Lagergren1;~Ricardo_Vinuesa_Motilva1;~Hossein_Azizpour1", "aff": ";KTH Royal Institute of Technology;KTH Royal Institute of Technology;KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology, Stockholm, Sweden;;KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology;KTH Royal Institute of Technology", "aff_domain": ";kth.se;kth.se;kth.se;kth.se;;kth.se;kth.se;kth.se;kth.se", "position": ";PhD student;PhD student;PhD student;PhD student;;PhD student;Full Professor;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nnilsson2024indirectly,\ntitle={Indirectly Parameterized Concrete Autoencoders},\nauthor={Alfred Nilsson and Klas Wijk and Sai bharath chandra Gutha and Erik Englesson and Alexandra Hotti and Carlo Saccardi and Oskar Kviman and Jens Lagergren and Ricardo Vinuesa Motilva and Hossein Azizpour},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GqsRKEhelH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1016353, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9273510897722608507&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";kth.se;kth.se;kth.se;kth.se;;kth.se;kth.se;kth.se;kth.se", "author_num": 10, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", 
"aff_unique_abbr": "KTH", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Stockholm", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "Sweden" }, { "title": "Efficient Mixture Learning in Black-Box Variational Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34484", "id": "Grrydzui3A", "proceeding": "https://proceedings.mlr.press/v235/hotti24a.html", "pdf": "https://openreview.net/pdf?id=Grrydzui3A", "openreview": "https://openreview.net/forum?id=Grrydzui3A", "author_site": "Alexandra Hotti, Oskar Kviman, Ricky Mol\u00e9n, V\u00edctor Elvira, Jens Lagergren", "tldr": "", "abstract": "Mixture variational distributions in black box variational inference (BBVI) have demonstrated impressive results in challenging density estimation tasks. However, currently scaling the number of mixture components can lead to a linear increase in the number of learnable parameters and a quadratic increase in inference time due to the evaluation of the evidence lower bound (ELBO). Our two key contributions address these limitations. First, we introduce the novel Multiple Importance Sampling Variational Autoencoder (MISVAE), which amortizes the mapping from input to mixture-parameter space using one-hot encodings. Fortunately, with MISVAE, each additional mixture component incurs a negligible increase in network parameters. Second, we construct two new estimators of the ELBO for mixtures in BBVI, enabling a tremendous reduction in inference time with marginal or even improved impact on performance. Collectively, our contributions enable scalability to hundreds of mixture components and provide superior estimation performance in shorter time, with fewer network parameters compared to previous Mixture VAEs. Experimenting with MISVAE, we achieve astonishing, SOTA results on MNIST. 
Furthermore, we empirically validate our estimators in other BBVI settings, including Bayesian phylogenetic inference, where we improve inference times for the SOTA mixture model on eight data sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexandra Hotti;Oskar Kviman;Ricky Mol\u00e9n;V\u00edctor Elvira;Jens Lagergren", "authorids": "~Alexandra_Hotti1;~Oskar_Kviman1;~Ricky_Mol\u00e9n1;~V\u00edctor_Elvira1;~Jens_Lagergren1", "gender": "F;M;M;;M", "homepage": ";https://okviman.github.io/;https://www.kth.se/profile/rickym/;https://victorelvira.github.io/;https://lagergrenlab.org/", "dblp": ";248/5749;;;86/3552", "google_scholar": ";https://scholar.google.com/citations?hl=en;;0V_-F5sAAAAJ;https://scholar.google.com/citations?hl=sv", "orcid": ";;;;", "linkedin": "alexandra-hotti-784b45106/;;https://linkedin.com/in/ricky-molen-1a3044b3;;", "or_profile": "~Alexandra_Hotti1;~Oskar_Kviman1;~Ricky_Mol\u00e9n1;~V\u00edctor_Elvira1;~Jens_Lagergren1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology, Stockholm, Sweden;KTH Royal Institute of Technology;University of Edinburgh;KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se;kth.se;kth.se;ed.ac.uk;kth.se", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nhotti2024efficient,\ntitle={Efficient Mixture Learning in Black-Box Variational Inference},\nauthor={Alexandra Hotti and Oskar Kviman and Ricky Mol{\\'e}n and V{\\'\\i}ctor Elvira and Jens Lagergren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Grrydzui3A}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1030455, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13918744913446479324&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "kth.se;kth.se;kth.se;ed.ac.uk;kth.se", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "KTH Royal Institute of Technology;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.kth.se;https://www.ed.ac.uk", "aff_unique_abbr": "KTH;Edinburgh", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Sweden;United Kingdom" }, { "title": "Representation Surgery: Theory and Practice of Affine Steering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34483", "id": "GwA4go0Mw4", "proceeding": "https://proceedings.mlr.press/v235/singh24d.html", "pdf": "https://openreview.net/pdf?id=GwA4go0Mw4", "openreview": "https://openreview.net/forum?id=GwA4go0Mw4", "author_site": "Shashwat Singh, Shauli Ravfogel, Jonathan Herzig, Roee Aharoni, Ryan Cotterell, Ponnurangam Kumaraguru", "tldr": "", "abstract": "Language models often exhibit undesirable behavior, e.g., generating toxic or gender-biased text. In the case of neural language models, an encoding of the undesirable behavior is often present in the model's representations. Thus, one natural (and common) approach to prevent the model from exhibiting undesirable behavior is to steer the model's representations in a manner that reduces the probability of it generating undesirable text. 
This paper investigates the formal and empirical properties of steering functions, i.e., transformations of the neural language model's representations that alter its behavior. First, we derive two affine steering functions that are optimal, in the least-squares sense, under different constraints. Our theory provides justification for existing approaches and offers a novel, improved steering approach. Second, we offer a series of experiments that demonstrate the empirical effectiveness of the methods in mitigating bias and reducing toxic generation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shashwat Singh;Shauli Ravfogel;Jonathan Herzig;Roee Aharoni;Ryan Cotterell;Ponnurangam Kumaraguru", "authorids": "~Shashwat_Singh1;~Shauli_Ravfogel1;~Jonathan_Herzig2;~Roee_Aharoni1;~Ryan_Cotterell1;~Ponnurangam_Kumaraguru3", "gender": "M;M;M;M;;Not Specified", "homepage": ";https://github.com/Shaul1321;https://jonathanherzig.github.io/;http://www.roeeaharoni.com;https://precog.iiit.ac.in/;https://rycolab.io/", "dblp": ";227/2231;133/3687.html;148/9506;97/5147.html;146/4361.html", "google_scholar": ";;https://scholar.google.co.il/citations?view_op=list_works;https://scholar.google.co.il/citations?user=wV0mHWgAAAAJ;MfzQyP8AAAAJ;DexOqtoAAAAJ", "orcid": ";;;;;", "linkedin": "shashwat-singh-bb6872111/;;;roeeaharoni;ponguru/;", "or_profile": "~Shashwat_Singh1;~Shauli_Ravfogel1;~Jonathan_Herzig2;~Roee_Aharoni1;~Ponnurangam_Kumaraguru3;~Ryan_D_Cotterell1", "aff": "International Institute of Information Technology Hyderabad;Bar-Ilan University;Research, Google;Google;International Institute of Information Technology Hyderabad ;Swiss Federal Institute of Technology", "aff_domain": "iiit.ac.in;biu.ac.il;research.google.com;google.com;iiit.ac.in;ethz.ch", "position": "Undergrad student;PhD student;Researcher;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nsingh2024representation,\ntitle={Representation Surgery: Theory and Practice of Affine Steering},\nauthor={Shashwat Singh and Shauli Ravfogel and Jonathan Herzig and Roee Aharoni and Ryan Cotterell and Ponnurangam Kumaraguru},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GwA4go0Mw4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3059482, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3307982894893395716&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "iiit.ac.in;biu.ac.il;research.google.com;google.com;iiit.ac.in;ethz.ch", "author_num": 6, "aff_unique_index": "0;1;2;2;0;3", "aff_unique_norm": "International Institute of Information Technology;Bar-Ilan University;Google;Swiss Federal Institute of Technology", "aff_unique_dep": ";;Google Research;", "aff_unique_url": "https://www.iiit.ac.in;https://www.biu.ac.il;https://research.google;https://www.ethz.ch", "aff_unique_abbr": "IIIT Hyderabad;BIU;Google;ETH Zurich", "aff_campus_unique_index": "0;2;2;0", "aff_campus_unique": "Hyderabad;;Mountain View", "aff_country_unique_index": "0;1;2;2;0;3", "aff_country_unique": "India;Israel;United States;Switzerland" }, { "title": "EMC$^2$: Efficient MCMC Negative Sampling for Contrastive Learning with Global Convergence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34482", "id": "GxOFM3f5Vm", "proceeding": "https://proceedings.mlr.press/v235/yau24a.html", "pdf": 
"https://openreview.net/pdf?id=GxOFM3f5Vm", "openreview": "https://openreview.net/forum?id=GxOFM3f5Vm", "author_site": "Chung-Yiu Yau, Hoi To Wai, Parameswaran Raman, Soumajyoti Sarkar, Mingyi Hong", "tldr": "", "abstract": "A key challenge in contrastive learning is to generate negative samples from a large sample set to contrast with positive samples, for learning better encoding of the data. These negative samples often follow a softmax distribution which are dynamically updated during the training process. However, sampling from this distribution is non-trivial due to the high computational costs in computing the partition function. In this paper, we propose an $\\underline{\\text{E}}$fficient $\\underline{\\text{M}}$arkov $\\underline{\\text{C}}$hain Monte Carlo negative sampling method for $\\underline{\\text{C}}$ontrastive learning (EMC$^2$). We follow the global contrastive learning loss as introduced in SogCLR, and propose EMC$^2$ which utilizes an adaptive Metropolis-Hastings subroutine to generate hardness-aware negative samples in an online fashion during the optimization. We prove that EMC$^2$ finds an $\\mathcal{O}(1/\\sqrt{T})$-stationary point of the global contrastive loss in $T$ iterations. Compared to prior works, EMC$^2$ is the first algorithm that exhibits global convergence (to stationarity) regardless of the choice of batch size while exhibiting low computation and memory cost. Numerical experiments validate that EMC$^2$ is effective with small batch training and achieves comparable or better performance than baseline algorithms. We report the results for pre-training image encoders on STL-10 and Imagenet-100.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chung-Yiu Yau;Hoi To Wai;Parameswaran Raman;Soumajyoti Sarkar;Mingyi Hong", "authorids": "~Chung-Yiu_Yau1;~Hoi_To_Wai1;~Parameswaran_Raman1;~Soumajyoti_Sarkar1;~Mingyi_Hong1", "gender": "M;M;M;M;M", "homepage": "https://oscaryau525.github.io/;http://www1.se.cuhk.edu.hk/~htwai/;https://paramsraman.github.io/;https://soumajyoti.github.io;http://people.ece.umn.edu/~mhong/mingyi.html", "dblp": "308/7285;29/9875;142/2573;171/7121;57/8053", "google_scholar": "https://scholar.google.com/citations?hl=zh-TW;https://scholar.google.com.hk/citations?user=5-J7LeMAAAAJ;amJUMFEAAAAJ;DGN-VVUAAAAJ;qRnP-p0AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Chung-Yiu_Yau1;~Hoi_To_Wai1;~Parameswaran_Raman1;~Soumajyoti_Sarkar1;~Mingyi_Hong1", "aff": "Amazon Web Services;The Chinese University of Hong Kong;Amazon;Amazon;University of Minnesota, Minneapolis", "aff_domain": "amazon.com;cuhk.edu.hk;amazon.com;amazon.com;umn.edu", "position": "Intern;Assistant Professor;Applied Scientist;ML Researcher;Associate Professor", "bibtex": "@inproceedings{\nyau2024emc,\ntitle={{EMC}\\${\\textasciicircum}2\\$: Efficient {MCMC} Negative Sampling for Contrastive Learning with Global Convergence},\nauthor={Chung-Yiu Yau and Hoi To Wai and Parameswaran Raman and Soumajyoti Sarkar and Mingyi Hong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GxOFM3f5Vm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 689887, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C2T0U8T4xYYJ:scholar.google.com/&scioq=EMC%24%5E2%24:+Efficient+MCMC+Negative+Sampling+for+Contrastive+Learning+with+Global+Convergence&hl=en&as_sdt=0,33", 
"gs_version_total": 11, "email": "amazon.com;cuhk.edu.hk;amazon.com;amazon.com;umn.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Amazon;Chinese University of Hong Kong;University of Minnesota", "aff_unique_dep": "Amazon Web Services;;", "aff_unique_url": "https://aws.amazon.com;https://www.cuhk.edu.hk;https://www.minnesota.edu", "aff_unique_abbr": "AWS;CUHK;UMN", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Minneapolis", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Robustness of Nonlinear Representation Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34481", "id": "GyV33H5Uuk", "proceeding": "https://proceedings.mlr.press/v235/buchholz24a.html", "pdf": "https://openreview.net/pdf?id=GyV33H5Uuk", "openreview": "https://openreview.net/forum?id=GyV33H5Uuk", "author_site": "Simon Buchholz, Bernhard Sch\u00f6lkopf", "tldr": "", "abstract": "We study the problem of unsupervised representation learning in slightly misspecified settings, and thus formalize the study of robustness of nonlinear representation learning. We focus on the case where the mixing is close to a local isometry in a suitable distance and show based on existing rigidity results that the mixing can be identified up to linear transformations and small errors. In a second step, we investigate Independent Component Analysis (ICA) with observations generated according to $x=f(s)=As+h(s)$ where $A$ is an invertible mixing matrix and $h$ a small perturbation. We show that we can approximately recover the matrix $A$ and the independent components. Together, these two results show approximate identifiability of nonlinear ICA with almost isometric mixing functions. 
Those results are a step towards identifiability results for unsupervised representation learning for real-world data that do not follow restrictive model classes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simon Buchholz;Bernhard Sch\u00f6lkopf", "authorids": "~Simon_Buchholz1;~Bernhard_Sch\u00f6lkopf1", "gender": ";", "homepage": "https://www.is.mpg.de/person/sbuchholz;", "dblp": "207/9068;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Simon_Buchholz1;~Bernhard_Sch\u00f6lkopf1", "aff": "Max-Planck Institute;", "aff_domain": "mpg.de;", "position": "Postdoc;", "bibtex": "@inproceedings{\nbuchholz2024robustness,\ntitle={Robustness of Nonlinear Representation Learning},\nauthor={Simon Buchholz and Bernhard Sch{\\\"o}lkopf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GyV33H5Uuk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 634578, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18214890903077725707&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "mpg.de;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.", "aff_unique_dep": "", "aff_unique_url": "https://www.mpg.de", "aff_unique_abbr": "MPG", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Out of the Ordinary: Spectrally Adapting Regression for Covariate Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34480", "id": "H3bATm4mKn", "proceeding": "https://proceedings.mlr.press/v235/eyre24a.html", "pdf": "https://openreview.net/pdf?id=H3bATm4mKn", "openreview": "https://openreview.net/forum?id=H3bATm4mKn", "author_site": "Benjamin Eyre, Elliot Creager, David Madras, Vardan Papyan, Richard Zemel", "tldr": "", "abstract": "Designing deep neural network classifiers that perform robustly on distributions differing from the available training data is an active area of machine learning research. However, out-of-distribution generalization for regression---the analogous problem for modeling continuous targets---remains relatively unexplored. To tackle this problem, we return to first principles and analyze how the closed-form solution for Ordinary Least Squares (OLS) regression is sensitive to covariate shift. We characterize the out-of-distribution risk of the OLS model in terms of the eigenspectrum decomposition of the source and target data. We then use this insight to propose a method called Spectral Adapted Regressor (SpAR) for adapting the weights of the last layer of a pre-trained neural regression model to perform better on input data originating from a different distribution. 
We demonstrate how this lightweight spectral adaptation procedure can improve out-of-distribution performance for synthetic and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin Eyre;Elliot Creager;David Madras;Vardan Papyan;Richard Zemel", "authorids": "~Benjamin_Eyre1;~Elliot_Creager1;~David_Madras1;~Vardan_Papyan1;~Richard_Zemel1", "gender": "M;M;M;M;M", "homepage": ";https://ecreager.github.io/;http://www.cs.toronto.edu/~madras/;https://sites.google.com/view/vardan-papyan;http://www.cs.columbia.edu/~zemel", "dblp": ";182/2055;188/6211;173/9783;16/6366", "google_scholar": "https://scholar.google.ca/citations?user=Ww1QOOkAAAAJ;boebIUcAAAAJ;MgnNDpkAAAAJ;https://scholar.google.co.il/citations?user=VrE-Gd4AAAAJ;https://scholar.google.ca/citations?user=iBeDoRAAAAAJ", "orcid": ";0009-0004-7122-3866;;;", "linkedin": ";;;;", "or_profile": "~Benjamin_Eyre1;~Elliot_Creager1;~David_Madras1;~Vardan_Papyan1;~Richard_Zemel1", "aff": "Google;University of Waterloo;Google;University of Toronto;Department of Computer Science, University of Toronto", "aff_domain": "google.com;uwaterloo.ca;google.com;toronto.edu;cs.toronto.edu", "position": "Intern;Assistant Professor;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\neyre2024out,\ntitle={Out of the Ordinary: Spectrally Adapting Regression for Covariate Shift},\nauthor={Benjamin Eyre and Elliot Creager and David Madras and Vardan Papyan and Richard Zemel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=H3bATm4mKn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 919747, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=711905381529530377&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "google.com;uwaterloo.ca;google.com;toronto.edu;cs.toronto.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;2", "aff_unique_norm": "Google;University of Waterloo;University of Toronto", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://uwaterloo.ca;https://www.utoronto.ca", "aff_unique_abbr": "Google;UW;U of T", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Mountain View;;Toronto", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Stealthy Imitation: Reward-guided Environment-free Policy Stealing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34479", "id": "H5FDHzrWe2", "proceeding": "https://proceedings.mlr.press/v235/zhuang24a.html", "pdf": "https://openreview.net/pdf?id=H5FDHzrWe2", "openreview": "https://openreview.net/forum?id=H5FDHzrWe2", "author_site": "Zhixiong Zhuang, Maria-Irina Nicolae, Mario Fritz", "tldr": "", "abstract": "Deep reinforcement learning policies, which are integral to modern control systems, represent valuable intellectual property. The development of these policies demands considerable resources, such as domain expertise, simulation fidelity, and real-world validation. These policies are potentially vulnerable to model stealing attacks, which aim to replicate their functionality using only black-box access. In this paper, we propose Stealthy Imitation, the first attack designed to steal policies without access to the environment or knowledge of the input range. This setup has not been considered by previous model stealing methods. 
Lacking access to the victim's input state distribution, Stealthy Imitation fits a reward model that allows it to approximate this distribution. We show that the victim policy is harder to imitate when the distribution of the attack queries matches that of the victim. We evaluate our approach across diverse, high-dimensional control tasks and consistently outperform prior data-free approaches adapted for policy stealing. Lastly, we propose a countermeasure that significantly diminishes the effectiveness of the attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhixiong Zhuang;Maria-Irina Nicolae;Mario Fritz", "authorids": "~Zhixiong_Zhuang1;~Maria-Irina_Nicolae1;~Mario_Fritz1", "gender": "M;F;M", "homepage": "https://de.linkedin.com/in/zhixiong-zhuang-7b18121b0;https://ririnicolae.github.io/;https://cispa.saarland/group/fritz/", "dblp": ";156/0167.html;", "google_scholar": ";kNOsX30AAAAJ;https://scholar.google.de/citations?user=4V1nNm4AAAAJ", "orcid": ";0009-0002-2758-7481;", "linkedin": ";irina-nicolae-a2251638;", "or_profile": "~Zhixiong_Zhuang1;~Maria-Irina_Nicolae1;~Mario_Fritz1", "aff": "Robert Bosch GmbH;Robert Bosch GmbH;Saarland University", "aff_domain": "bosch.com;bosch.com;uni-saarland.de", "position": "PhD student;Research scientist;Full Professor", "bibtex": "@inproceedings{\nzhuang2024stealthy,\ntitle={Stealthy Imitation: Reward-guided Environment-free Policy Stealing},\nauthor={Zhixiong Zhuang and Maria-Irina Nicolae and Mario Fritz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=H5FDHzrWe2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7355179, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12495121733455247305&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "bosch.com;bosch.com;uni-saarland.de", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Robert Bosch GmbH;Saarland University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bosch.com;https://www.uni-saarland.de", "aff_unique_abbr": "Bosch;UdS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "On the Trajectory Regularity of ODE-based Diffusion Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34478", "id": "H86WzfH5N1", "proceeding": "https://proceedings.mlr.press/v235/chen24bm.html", "pdf": "https://openreview.net/pdf?id=H86WzfH5N1", "openreview": "https://openreview.net/forum?id=H86WzfH5N1", "author_site": "Defang Chen, Zhenyu Zhou, Can Wang, Chunhua Shen, Siwei Lyu", "tldr": "", "abstract": "Diffusion-based generative models use stochastic differential equations (SDEs) and their equivalent ordinary differential equations (ODEs) to establish a smooth connection between a complex data distribution and a tractable prior distribution. In this paper, we identify several intriguing trajectory properties in the ODE-based sampling process of diffusion models. We characterize an implicit denoising trajectory and discuss its vital role in forming the coupled sampling trajectory with a strong shape regularity, regardless of the generated content. We also describe a dynamic programming-based scheme to make the time schedule in sampling better fit the underlying trajectory structure. 
This simple strategy requires minimal modification to any given ODE-based numerical solvers and incurs negligible computational cost, while delivering superior performance in image generation, especially in $5\\sim 10$ function evaluations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Defang Chen;Zhenyu Zhou;Can Wang;Chunhua Shen;Siwei Lyu", "authorids": "~Defang_Chen1;~Zhenyu_Zhou6;~Can_Wang5;~Chunhua_Shen2;~Siwei_Lyu1", "gender": "M;;M;;M", "homepage": "https://www.researchgate.net/profile/Defang-Chen-3;;https://person.zju.edu.cn/en/wangcan;;https://www.cse.buffalo.edu/~siweilyu", "dblp": "236/4507-1;;71/4716-1;;51/4482", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.fr/citations?user=C63q3HoAAAAJ;;wefAEM4AAAAJ", "orcid": "0000-0003-0833-7401;;0000-0002-5890-4307;;0000-0002-0992-685X", "linkedin": "defang-chen-805b34165/;;;;siwei-lyu-0806022/", "or_profile": "~Defang_Chen1;~Zhenyu_Zhou6;~Can_Wang5;~Chunhua_Shen2;~Siwei_Lyu1", "aff": "Zhejiang University;;Zhejiang University;;State University of New York, Buffalo", "aff_domain": "zju.edu.cn;;zju.edu.cn;;buffalo.edu", "position": "PhD student;;Full Professor;;Full Professor", "bibtex": "@inproceedings{\nchen2024on,\ntitle={On the Trajectory Regularity of {ODE}-based Diffusion Sampling},\nauthor={Defang Chen and Zhenyu Zhou and Can Wang and Chunhua Shen and Siwei Lyu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=H86WzfH5N1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7694604, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13914225782240715928&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "zju.edu.cn;;zju.edu.cn;;buffalo.edu", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Zhejiang University;State University of New York at Buffalo", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.buffalo.edu", "aff_unique_abbr": "ZJU;SUNY Buffalo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Don't be so Negative! Score-based Generative Modeling with Oracle-assisted Guidance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34477", "id": "H8pMSJwRD5", "proceeding": "https://proceedings.mlr.press/v235/naderiparizi24a.html", "pdf": "https://openreview.net/pdf?id=H8pMSJwRD5", "openreview": "https://openreview.net/forum?id=H8pMSJwRD5", "author_site": "Saeid Naderiparizi, Xiaoxuan Liang, Setareh Cohan, Berend Zwartsenberg, Frank Wood", "tldr": "", "abstract": "Score-based diffusion models are a powerful class of generative models, widely utilized across diverse domains. Despite significant advancements in large-scale tasks such as text-to-image generation, their application to constrained domains has received considerably less attention. This work addresses model learning in a setting where, in addition to the training dataset, there further exists side-information in the form of an oracle that can label samples as being outside the support of the true data generating distribution. Specifically we develop a new denoising diffusion probabilistic modeling methodology, Gen-neG, that leverages this additional side-information. 
Gen-neG builds on classifier guidance in diffusion models to guide the generation process towards the positive support region indicated by the oracle. We empirically establish the utility of Gen-neG in applications including collision avoidance in self-driving simulators and safety-guarded human motion generation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saeid Naderiparizi;Xiaoxuan Liang;Setareh Cohan;Berend Zwartsenberg;Frank Wood", "authorids": "~Saeid_Naderiparizi1;~Xiaoxuan_Liang2;~Setareh_Cohan1;~Berend_Zwartsenberg1;~Frank_Wood2", "gender": "M;F;F;M;M", "homepage": "https://www.cs.ubc.ca/~saeidnp/;;https://www.cs.ubc.ca/~setarehc/;https://bzwartsenberg.github.io/;http://www.robots.ox.ac.uk/~fwood/", "dblp": "244/9611;;;;44/4750", "google_scholar": "Ubt0dYYAAAAJ;;P73-vsoAAAAJ;;d4yNzXIAAAAJ", "orcid": ";;0009-0008-6381-7698;;", "linkedin": "saeidnp;xiaoxuan-liang-4451a4171/;setarehcohan/;;frank-wood-43529114?trk=hp-identity-name", "or_profile": "~Saeid_Naderiparizi1;~Xiaoxuan_Liang2;~Setareh_Cohan1;~Berend_Zwartsenberg1;~Frank_Wood2", "aff": "University of British Columbia;University of British Columbia;University of British Columbia;Inverted AI;University of British Columbia", "aff_domain": "ubc.ca;cs.ubc.ca;ubc.ca;inverted.ai;cs.ubc.ca", "position": "PhD student;PhD student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nnaderiparizi2024dont,\ntitle={Don't be so Negative! Score-based Generative Modeling with Oracle-assisted Guidance},\nauthor={Saeid Naderiparizi and Xiaoxuan Liang and Setareh Cohan and Berend Zwartsenberg and Frank Wood},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=H8pMSJwRD5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7607336, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14553952500122075679&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "ubc.ca;cs.ubc.ca;ubc.ca;inverted.ai;cs.ubc.ca", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of British Columbia;Inverted AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.ubc.ca;https://www.inverted.ai", "aff_unique_abbr": "UBC;Inverted AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Canada;United States" }, { "title": "Position: Towards Implicit Prompt For Text-To-Image Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34476", "id": "H9fNj8ivTy", "proceeding": "https://proceedings.mlr.press/v235/yang24o.html", "pdf": "https://openreview.net/pdf?id=H9fNj8ivTy", "openreview": "https://openreview.net/forum?id=H9fNj8ivTy", "author_site": "Yue Yang, Yuqi Lin, Hong Liu, WENQI SHAO, Runjian Chen, Hailong Shang, Yu Wang, Yu Qiao, Kaipeng Zhang, Ping Luo", "tldr": "", "abstract": "Recent text-to-image (T2I) models have had great success, and many benchmarks have been proposed to evaluate their performance and safety. However, they only consider explicit prompts while neglecting implicit prompts (hint at a target without explicitly mentioning it). These prompts may get rid of safety constraints and pose potential threats to the applications of these models. This position paper highlights the current state of T2I models toward implicit prompts. 
We present a benchmark named ImplicitBench and conduct an investigation on the performance and impacts of implicit prompts with popular T2I models. Specifically, we design and collect more than 2,000 implicit prompts of three aspects: General Symbols, Celebrity Privacy, and Not-Safe-For-Work (NSFW) Issues, and evaluate six well-known T2I models' capabilities under these implicit prompts. Experiment results show that (1) T2I models are able to accurately create various target symbols indicated by implicit prompts; (2) Implicit prompts bring potential risks of privacy leakage for T2I models. (3) Constraints of NSFW in most of the evaluated T2I models can be bypassed with implicit prompts. We call for increased attention to the potential and risks of implicit prompts in the T2I community and further investigation into the capabilities and impacts of implicit prompts, advocating for a balanced approach that harnesses their benefits while mitigating their risks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Yang;Yuqi Lin;Hong Liu;Wenqi Shao;Runjian Chen;Hailong Shang;Yu Wang;Yu Qiao;Kaipeng Zhang;Ping Luo", "authorids": "~Yue_Yang6;~Yuqi_Lin1;~Hong_Liu9;~Wenqi_Shao2;~Runjian_Chen1;shl22@mails.tsinghua.edu.cn;~Yu_Wang40;~Yu_Qiao1;~Kaipeng_Zhang1;~Ping_Luo2", "gender": ";M;Non-Binary;M;M;;M;;M;", "homepage": ";http://wiki.zjulearning.org.cn/wiki/User:Linyuqi;https://lynnhongliu.github.io/hliu/;https://wqshao126.github.io/;https://runjian-chen.github.io;;https://mediabrain.sjtu.edu.cn/yuwang/;;http://kpzhang93.github.io/;", "dblp": ";117/7752;29/5010-9;227/3122;257/4647;;02/5889-27.html;;179/2126;", "google_scholar": ";5-jDh48AAAAJ;BC7N2dYAAAAJ;Bs9mrwwAAAAJ;_USUMdAAAAAJ;;;;4OqZBmYAAAAJ;", "orcid": ";;0000-0001-5318-6388;;0000-0003-0519-496X;;0000-0001-9500-081X;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Yue_Yang6;~Yuqi_Lin1;~Hong_Liu9;~Wenqi_Shao2;~Runjian_Chen1;shl22@mails.tsinghua.edu.cn;~Yu_Wang40;~Yu_Qiao1;~Kaipeng_Zhang1;~Ping_Luo2", "aff": ";Zhejiang University;Osaka University, Tokyo Institute of Technology;Shanghai AI Laboratory;University of Hong Kong;;Shanghai Jiaotong University;;Shanghai AI Laboratory;", "aff_domain": ";zju.edu.cn;osaka-u.ac.jp;pjlab.org.cn;hku.hk;;sjtu.edu.cn;;pjlab.org.cn;", "position": ";PhD student;Assistant Professor;Researcher;PhD student;;Associate Professor;;Researcher;", "bibtex": "@inproceedings{\nyang2024position,\ntitle={Position: Towards Implicit Prompt For Text-To-Image Models},\nauthor={Yue Yang and Yuqi Lin and Hong Liu and Wenqi Shao and Runjian Chen and Hailong Shang and Yu Wang and Yu Qiao and Kaipeng Zhang and Ping Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=H9fNj8ivTy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3656721, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10962962245459023272&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";zju.edu.cn;osaka-u.ac.jp;pjlab.org.cn;hku.hk;;sjtu.edu.cn;;pjlab.org.cn;", "author_num": 10, "aff_unique_index": "0;1;2;3;4;2", "aff_unique_norm": "Zhejiang University;Osaka University;Shanghai AI Laboratory;University of Hong Kong;Shanghai Jiao Tong University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.osaka-u.ac.jp;https://www.shanghai-ai-lab.com;https://www.hku.hk;https://www.sjtu.edu.cn", "aff_unique_abbr": 
"ZJU;Osaka U;SAIL;HKU;SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "China;Japan" }, { "title": "Adversarially Robust Hypothesis Transfer Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34475", "id": "HCDMiaT0Pf", "proceeding": "https://proceedings.mlr.press/v235/wang24d.html", "pdf": "https://openreview.net/pdf?id=HCDMiaT0Pf", "openreview": "https://openreview.net/forum?id=HCDMiaT0Pf", "author_site": "Yunjuan Wang, Raman Arora", "tldr": "", "abstract": "In this work, we explore Hypothesis Transfer Learning (HTL) under adversarial attacks. In this setting, a learner has access to a training dataset of size $n$ from an underlying distribution $\\mathcal{D}$ and a set of auxiliary hypotheses. These auxiliary hypotheses, which can be viewed as prior information originating either from expert knowledge or as pre-trained foundation models, are employed as an initialization for the learning process. Our goal is to develop an adversarially robust model for $\\mathcal{D}$. We begin by examining an adversarial variant of the regularized empirical risk minimization learning rule that we term A-RERM. Assuming a non-negative smooth loss function with a strongly convex regularizer, we establish a bound on the robust generalization error of the hypothesis returned by A-RERM in terms of the robust empirical loss and the quality of the initialization. If the initialization is good, i.e., there exists a weighted combination of auxiliary hypotheses with a small robust population loss, the bound exhibits a fast rate of $\\mathcal{O}(1/n)$. Otherwise, we get the standard rate of $\\mathcal{O}(1/\\sqrt{n})$. Additionally, we provide a bound on the robust excess risk which is similar in nature, albeit with a slightly worse rate. We also consider solving the problem using a practical variant, namely proximal stochastic adversarial training, and present a bound that depends on the initialization. 
This bound has the same dependence on the sample size as the A-RERM bound, except for an additional term that depends on the size of the adversarial perturbation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunjuan Wang;Raman Arora", "authorids": "~Yunjuan_Wang1;~Raman_Arora1", "gender": "F;M", "homepage": "https://yunjuanwang.github.io/;http://www.cs.jhu.edu/~raman/Home.html", "dblp": "31/560;", "google_scholar": "t_VSEEwAAAAJ;Spe0xdkAAAAJ", "orcid": ";", "linkedin": "yunjuan-wang-12ab85169/;", "or_profile": "~Yunjuan_Wang1;~Raman_Arora1", "aff": "Johns Hopkins University;Johns Hopkins University", "aff_domain": "jhu.edu;jhu.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nwang2024adversarially,\ntitle={Adversarially Robust Hypothesis Transfer Learning},\nauthor={Yunjuan Wang and Raman Arora},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HCDMiaT0Pf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 477790, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6486869026600216614&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 4, "email": "jhu.edu;jhu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neuro-Symbolic Temporal Point Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34474", "id": "HDrXBr26UI", "proceeding": "https://proceedings.mlr.press/v235/yang24ag.html", "pdf": "https://openreview.net/pdf?id=HDrXBr26UI", "openreview": "https://openreview.net/forum?id=HDrXBr26UI", "author_site": "Yang Yang, Chao Yang, Boyang Li, Yinghao Fu, Shuang Li", "tldr": "", "abstract": "Our goal is to $\\textit{efficiently}$ discover a compact set of temporal logic rules to explain irregular events of interest. We introduce a neural-symbolic rule induction framework within the temporal point process model. The negative log-likelihood is the loss that guides the learning, where the explanatory logic rules and their weights are learned end-to-end in a $\\textit{differentiable}$ way. Specifically, predicates and logic rules are represented as $\\textit{vector embeddings}$, where the predicate embeddings are fixed and the rule embeddings are trained via gradient descent to obtain the most appropriate compositional representations of the predicate embeddings. To make the rule learning process more efficient and flexible, we adopt a $\\textit{sequential covering algorithm}$, which progressively adds rules to the model and removes the event sequences that have been explained until all event sequences have been covered. All the discovered rules are then fed back to the model for a final rule embedding and weight refinement. 
Our approach showcases notable efficiency and accuracy across synthetic and real datasets, surpassing state-of-the-art baselines in efficiency by a wide margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Yang;Chao Yang;Boyang Li;Yinghao Fu;Shuang Li", "authorids": "~Yang_Yang56;~Chao_Yang9;~Boyang_Li4;~Yinghao_Fu1;~Shuang_Li3", "gender": "M;M;M;M;F", "homepage": "https://www.linkedin.cn/incareer/in/ACoAAENYKa0ByOPUp5MsyVPXTbQkJiv80QkfF94;https://github.com/yangchaoforthree;;https://github.com/EddieFua;https://shuangli01.github.io", "dblp": ";;;379/6317;43/6294-2", "google_scholar": ";;;8vBEgIYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0009-0005-7928-7300;;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAAENYKa0ByOPUp5MsyVPXTbQkJiv80QkfF94;;;;", "or_profile": "~Yang_Yang56;~Chao_Yang9;~Boyang_Li4;~Yinghao_Fu1;~Shuang_Li3", "aff": "The Chinese University of Hong Kong,Shenzhen;The Chinese University of Hong Kong, Shenzhen;The Chinese University of Hong Kong;City University of Hong Kong;The Chinese University of Hong Kong (Shenzhen)", "aff_domain": "cuhk.edu.cn;cuhk.edu.cn;cuhk.edu.cn;cityu.edu.hk;cuhk.edu.cn", "position": "MS student;PhD student;MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyang2024neurosymbolic,\ntitle={Neuro-Symbolic Temporal Point Processes},\nauthor={Yang Yang and Chao Yang and Boyang Li and Yinghao Fu and Shuang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HDrXBr26UI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 553923, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9184309308112814066&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "cuhk.edu.cn;cuhk.edu.cn;cuhk.edu.cn;cityu.edu.hk;cuhk.edu.cn", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;Chinese University of Hong Kong;City University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.cuhk.edu.cn;https://www.cityu.edu.hk", "aff_unique_abbr": "CUHK;CUHK;CityU", "aff_campus_unique_index": "0;0;1;1;0", "aff_campus_unique": "Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Receptive Fields As Experts in Convolutional Neural Architectures", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34473", "id": "HGSIpeNNfM", "proceeding": "https://proceedings.mlr.press/v235/lian24b.html", "pdf": "https://openreview.net/pdf?id=HGSIpeNNfM", "openreview": "https://openreview.net/forum?id=HGSIpeNNfM", "author_site": "Dongze Lian, Weihao Yu, Xinchao Wang", "tldr": "", "abstract": "The size of spatial receptive fields, from the early 3$\\times$3 convolutions in VGGNet to the recent 7$\\times$7 convolutions in ConvNeXt, has always played a critical role in architecture design. In this paper, we propose a Mixture of Receptive Fields (MoRF) instead of using a single receptive field. MoRF contains the combinations of multiple receptive fields with different sizes, e.g., convolutions with different kernel sizes, which can be regarded as experts. Such an approach serves two functions: one is to select the appropriate receptive field according to the input, and the other is to expand the network capacity. 
Furthermore, we introduce two types of routing mechanisms, hard routing and soft routing, to automatically select the appropriate receptive field experts. In the inference stage, the selected receptive field experts are merged via re-parameterization to maintain a similar inference speed compared to the single receptive field. To demonstrate the effectiveness of MoRF, we integrate the MoRF concept into multiple architectures, e.g., ResNet and ConvNeXt. Extensive experiments show that our approach outperforms the baselines in image classification, object detection, and segmentation tasks without significantly increasing the inference time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongze Lian;Weihao Yu;Xinchao Wang", "authorids": "~Dongze_Lian1;~Weihao_Yu2;~Xinchao_Wang1", "gender": "M;;M", "homepage": "https://dongzelian.com/;http://whyu.me;https://sites.google.com/site/sitexinchaowang/", "dblp": "211/7697;222/7846-1.html;", "google_scholar": "q-C8LqsAAAAJ;LYxjt1QAAAAJ;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dongze_Lian1;~Weihao_Yu2;~Xinchao_WANG3", "aff": "National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;u.nus.edu;nus.edu", "position": "Postdoc;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlian2024receptive,\ntitle={Receptive Fields As Experts in Convolutional Neural Architectures},\nauthor={Dongze Lian and Weihao Yu and Xinchao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HGSIpeNNfM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 729239, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:D__H8N5BqAgJ:scholar.google.com/&scioq=Receptive+Fields+As+Experts+in+Convolutional+Neural+Architectures&hl=en&as_sdt=0,48", "gs_version_total": 5, "email": "nus.edu.sg;u.nus.edu;nus.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Bayesian Design Principles for Offline-to-Online Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34472", "id": "HLHQxMydFk", "proceeding": "https://proceedings.mlr.press/v235/hu24p.html", "pdf": "https://openreview.net/pdf?id=HLHQxMydFk", "openreview": "https://openreview.net/forum?id=HLHQxMydFk", "author_site": "Hao Hu, yiqin yang, Jianing Ye, Chengjie Wu, Ziqing Mai, Yujing Hu, Tangjie Lv, Changjie Fan, Qianchuan Zhao, Chongjie Zhang", "tldr": "", "abstract": "Offline reinforcement learning (RL) is crucial for real-world applications where exploration can be costly or unsafe. However, offline learned policies are often suboptimal, and further online fine-tuning is required. In this paper, we tackle the fundamental dilemma of offline-to-online fine-tuning: if the agent remains pessimistic, it may fail to learn a better policy, while if it becomes optimistic directly, performance may suffer from a sudden drop. We show that Bayesian design principles are crucial in solving such a dilemma. 
Instead of adopting optimistic or pessimistic policies, the agent should act in a way that matches its belief in optimal policies. Such a probability-matching agent can avoid a sudden performance drop while still being guaranteed to find the optimal policy. Based on our theoretical findings, we introduce a novel algorithm that outperforms existing methods on various benchmarks, demonstrating the efficacy of our approach. Overall, the proposed approach provides a new perspective on offline-to-online RL that has the potential to enable more effective learning from offline data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Hu;Yiqin Yang;Jianing Ye;Chengjie Wu;Ziqing Mai;Yujing Hu;Tangjie Lv;Changjie Fan;Qianchuan Zhao;Chongjie Zhang", "authorids": "~Hao_Hu3;~Yiqin_Yang1;~Jianing_Ye1;~Chengjie_Wu1;~Ziqing_Mai1;~Yujing_Hu2;~Tangjie_Lv1;~Changjie_Fan1;~Qianchuan_Zhao1;~Chongjie_Zhang1", "gender": "M;M;M;M;F;;M;M;M;", "homepage": "https://mousehu.github.io;https://www.researchgate.net/profile/Yiqin-Yang-2;https://heavycrab.github.io/;;https://github.com/ZiqingMai;;;;;", "dblp": "67/6924-6;180/7725;287/5070;70/6141;;https://dblp.uni-trier.de/pid/160/1923.html;;71/882;82/3427;29/6693", "google_scholar": "https://scholar.google.com/citations?hl=en;aHTi5IEAAAAJ;Sc9duQQAAAAJ;fXL69VsAAAAJ;;IR5WY-wAAAAJ;EIuWpJcAAAAJ;;;LjxqXycAAAAJ", "orcid": ";;;;;;0000-0001-9858-809X;0000-0001-5420-0516;0000-0002-7952-5621;", "linkedin": "hao-hu-tsinghua;;;;;;;;;", "or_profile": "~Hao_Hu3;~Yiqin_Yang1;~Jianing_Ye1;~Chengjie_Wu1;~Ziqing_Mai1;~Yujing_Hu2;~Tangjie_Lv1;~Changjie_Fan1;~Qianchuan_Zhao1;~Chongjie_Zhang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;NetEase, Inc.;NetEase, Inc.;Netease, Fuxi AI Lab;Tsinghua University;Washington University, Saint Louis", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;corp.netease.com;netease.com;corp.netease.com;tsinghua.edu.cn;wustl.edu", "position": "PhD student;PhD student;MS student;PhD student;Undergrad student;Researcher;Researcher;Principal Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nhu2024bayesian,\ntitle={Bayesian Design Principles for Offline-to-Online Reinforcement Learning},\nauthor={Hao Hu and Yiqin Yang and Jianing Ye and Chengjie Wu and Ziqing Mai and Yujing Hu and Tangjie Lv and Changjie Fan and Qianchuan Zhao and Chongjie Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HLHQxMydFk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6834713, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14071553127057759452&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;corp.netease.com;netease.com;corp.netease.com;tsinghua.edu.cn;wustl.edu", "author_num": 10, "aff_unique_index": "0;0;0;0;0;1;1;2;0;3", "aff_unique_norm": "Tsinghua University;NetEase, Inc.;Netease;Washington University in St. 
Louis", "aff_unique_dep": ";;Fuxi AI Lab;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.163.com;https://www.netease.com;https://wustl.edu", "aff_unique_abbr": "THU;NetEase;Netease;WUSTL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saint Louis", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "EiG-Search: Generating Edge-Induced Subgraphs for GNN Explanation in Linear Time", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34471", "id": "HO0g6cHVZx", "proceeding": "https://proceedings.mlr.press/v235/lu24g.html", "pdf": "https://openreview.net/pdf?id=HO0g6cHVZx", "openreview": "https://openreview.net/forum?id=HO0g6cHVZx", "author_site": "Shengyao Lu, Bang Liu, Keith Mills, Jiao He, Di Niu", "tldr": "", "abstract": "Understanding and explaining the predictions of Graph Neural Networks (GNNs), is crucial for enhancing their safety and trustworthiness. Subgraph-level explanations are gaining attention for their intuitive appeal. However, most existing subgraph-level explainers face efficiency challenges in explaining GNNs due to complex search processes. The key challenge is to find a balance between intuitiveness and efficiency while ensuring transparency. Additionally, these explainers usually induce subgraphs by nodes, which may introduce less-intuitive disconnected nodes in the subgraph-level explanations or omit many important subgraph structures. In this paper, we reveal that inducing subgraph explanations by edges is more comprehensive than other subgraph inducing techniques. We also emphasize the need of determining the subgraph explanation size for each data instance, as different data instances may involve different important substructures. Building upon these considerations, we introduce a training-free approach, named EiG-Search. We employ an efficient linear-time search algorithm over the edge-induced subgraphs, where the edges are ranked by an enhanced gradient-based importance. We conduct extensive experiments on a total of seven datasets, demonstrating its superior performance and efficiency both quantitatively and qualitatively over the leading baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengyao Lu;Bang Liu;Keith G. Mills;Jiao He;Di Niu", "authorids": "~Shengyao_Lu1;~Bang_Liu1;~Keith_G._Mills1;~Jiao_He1;~Di_Niu1", "gender": "F;M;M;M;M", "homepage": "https://sluxsr.github.io/;http://www-labs.iro.umontreal.ca/~liubang/;https://kgmills.github.io/;https://github.com/JonHe878;https://www.ualberta.ca/~dniu", "dblp": "320/4184;;299/5864;;82/4953", "google_scholar": "https://scholar.google.ca/citations?user=MSsab9EAAAAJ;lmfAnP4AAAAJ;CBOD_ngAAAAJ;;https://scholar.google.ca/citations?user=3kC5OogAAAAJ", "orcid": ";0000-0002-9483-8984;0000-0001-6054-1798;;0000-0002-5250-7327", "linkedin": ";bang-liu-12b66789/?originalSubdomain=ca;kgmills/;;", "or_profile": "~Shengyao_Lu1;~Bang_Liu1;~Keith_G._Mills1;~Jiao_He1;~Di_Niu1", "aff": "University of Alberta;University of Montreal;Huawei Technologies Ltd.;huawei;University of Alberta", "aff_domain": "ualberta.ca;umontreal.ca;huawei.com;huawei.com;ualberta.ca", "position": "PhD student;Assistant Professor;Research Intern;Chief engineer;Full Professor", "bibtex": "@inproceedings{\nlu2024eigsearch,\ntitle={EiG-Search: Generating Edge-Induced Subgraphs for {GNN} Explanation in Linear Time},\nauthor={Shengyao Lu and Bang Liu and Keith G. 
Mills and Jiao He and Di Niu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HO0g6cHVZx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1476661, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2111463078294151753&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "ualberta.ca;umontreal.ca;huawei.com;huawei.com;ualberta.ca", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Alberta;University of Montreal;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "https://www.ualberta.ca;https://wwwumontreal.ca;https://www.huawei.com", "aff_unique_abbr": "UAlberta;UM;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "Canada;China" }, { "title": "Relational DNN Verification With Cross Executional Bound Refinement", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34470", "id": "HOG80Yk4Gw", "proceeding": "https://proceedings.mlr.press/v235/banerjee24a.html", "pdf": "https://openreview.net/pdf?id=HOG80Yk4Gw", "openreview": "https://openreview.net/forum?id=HOG80Yk4Gw", "author_site": "Debangshu Banerjee, Gagandeep Singh", "tldr": "", "abstract": "We focus on verifying relational properties defined over deep neural networks (DNNs) such as robustness against universal adversarial perturbations (UAP), certified worst-case hamming distance for binary string classifications, etc. Precise verification of these properties requires reasoning about multiple executions of the same DNN. However, most of the existing works in DNN verification only handle properties defined over single executions and as a result, are imprecise for relational properties. Though few recent works for relational DNN verification, capture linear dependencies between the inputs of multiple executions, they do not leverage dependencies between the outputs of hidden layers producing imprecise results. 
We develop a scalable relational verifier RACoon that utilizes cross-execution dependencies at all layers of the DNN gaining substantial precision over SOTA baselines on a wide range of datasets, networks, and relational properties.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Debangshu Banerjee;Gagandeep Singh", "authorids": "~Debangshu_Banerjee2;~Gagandeep_Singh1", "gender": "M;M", "homepage": "https://debangshu-banerjee.github.io/;https://ggndpsngh.github.io/", "dblp": "268/6756;64/3747-1", "google_scholar": "G5dhKqAAAAAJ;https://scholar.google.ch/citations?user=m4b2ruEAAAAJ", "orcid": "0009-0001-0163-9717;0000-0002-9299-2961", "linkedin": "debangshu-banerjee/;gagandeep-singh-1bb01b49/", "or_profile": "~Debangshu_Banerjee2;~Gagandeep_Singh1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "uiuc.edu;illinois.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbanerjee2024relational,\ntitle={Relational {DNN} Verification With Cross Executional Bound Refinement},\nauthor={Debangshu Banerjee and Gagandeep Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HOG80Yk4Gw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9405361, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1608797113276717775&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uiuc.edu;illinois.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Towards Understanding Inductive Bias in Transformers: A View From Infinity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34469", "id": "HOMXUneCTR", "proceeding": "https://proceedings.mlr.press/v235/lavie24a.html", "pdf": "https://openreview.net/pdf?id=HOMXUneCTR", "openreview": "https://openreview.net/forum?id=HOMXUneCTR", "author_site": "Itay Lavie, Guy Gur-Ari, Zohar Ringel", "tldr": "", "abstract": "We study inductive bias in Transformers in the infinitely over-parameterized Gaussian process limit and argue transformers tend to be biased towards more permutation symmetric functions in sequence space. We show that the representation theory of the symmetric group can be used to give quantitative analytical predictions when the dataset is symmetric to permutations between tokens. We present a simplified transformer block and solve the model at the limit, including accurate predictions for the learning curves and network outputs. We show that in common setups, one can derive tight bounds in the form of a scaling law for the learnability as a function of the context length. 
Finally, we argue WikiText dataset, does indeed possess a degree of permutation symmetry.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Itay Lavie;Guy Gur-Ari;Zohar Ringel", "authorids": "~Itay_Lavie1;~Guy_Gur-Ari1;~Zohar_Ringel1", "gender": ";M;M", "homepage": ";;http://old.phys.huji.ac.il/~zohar.ringel/", "dblp": ";;", "google_scholar": ";mx8P4QUAAAAJ;https://scholar.google.co.il/citations?user=8-8VIDgAAAAJ", "orcid": "0000-0003-4454-6454;;", "linkedin": ";;", "or_profile": "~Itay_Lavie1;~Guy_Gur-Ari1;~Zohar_Ringel1", "aff": "International Business Machines;Google;Hebrew University of Jerusalem, Israel", "aff_domain": "ibm.com;google.com;huji.ac.il", "position": "Intern;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nlavie2024towards,\ntitle={Towards Understanding Inductive Bias in Transformers: A View From Infinity},\nauthor={Itay Lavie and Guy Gur-Ari and Zohar Ringel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HOMXUneCTR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2369462, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6653486110618736270&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ibm.com;google.com;huji.ac.il", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "International Business Machines Corporation;Google;Hebrew University of Jerusalem", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.ibm.com;https://www.google.com;https://www.huji.ac.il", "aff_unique_abbr": "IBM;Google;HUJI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Israel" }, { "title": "Orthogonal Bootstrap: Efficient Simulation of Input Uncertainty", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34468", "id": "HOoVTsPPn7", "proceeding": "https://proceedings.mlr.press/v235/liu24c.html", "pdf": "https://openreview.net/pdf?id=HOoVTsPPn7", "openreview": "https://openreview.net/forum?id=HOoVTsPPn7", "author_site": "Kaizhao Liu, Jose Blanchet, Lexing Ying, Yiping Lu", "tldr": "", "abstract": "Bootstrap is a popular methodology for simulating input uncertainty. However, it can be computationally expensive when the number of samples is large. We propose a new approach called **Orthogonal Bootstrap** that reduces the number of required Monte Carlo replications. We decomposes the target being simulated into two parts: the *non-orthogonal part* which has a closed-form result known as Infinitesimal Jackknife and the *orthogonal part* which is easier to be simulated. 
We theoretically and numerically show that Orthogonal Bootstrap significantly reduces the computational cost of Bootstrap while improving empirical accuracy and maintaining the same width of the constructed interval.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaizhao Liu;Jose Blanchet;Lexing Ying;Yiping Lu", "authorids": "~Kaizhao_Liu1;~Jose_Blanchet1;~Lexing_Ying1;~Yiping_Lu1", "gender": "M;M;;M", "homepage": "https://drzfct.github.io/;https://web.stanford.edu/~jblanche/;http://web.stanford.edu/~lexing;https://2prime.github.io/", "dblp": ";75/5093.html;68/3945;93/683-1", "google_scholar": ";https://scholar.google.co.in/citations?user=O24CcQQAAAAJ;OwA3zyMAAAAJ;NmhvVBgAAAAJ", "orcid": ";;;", "linkedin": ";jose-blanchet;;", "or_profile": "~Kaizhao_Liu1;~Jose_Blanchet1;~Lexing_Ying1;~Yiping_Lu1", "aff": "Peking University;Stanford University;Stanford University;New York University", "aff_domain": "pku.edu.cn;stanford.edu;stanford.edu;nyu.edu", "position": "Undergrad student;Professor;Professor;Instructor", "bibtex": "@inproceedings{\nliu2024orthogonal,\ntitle={Orthogonal Bootstrap: Efficient Simulation of Input Uncertainty},\nauthor={Kaizhao Liu and Jose Blanchet and Lexing Ying and Yiping Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HOoVTsPPn7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1495270, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QQANFjFQlK4J:scholar.google.com/&scioq=Orthogonal+Bootstrap:+Efficient+Simulation+of+Input+Uncertainty&hl=en&as_sdt=0,23", "gs_version_total": 9, "email": "pku.edu.cn;stanford.edu;stanford.edu;nyu.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Peking University;Stanford University;New York University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;https://www.nyu.edu", "aff_unique_abbr": "Peking U;Stanford;NYU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Learning with Adaptive Resource Allocation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34467", "id": "HPLzSCOecY", "proceeding": "https://proceedings.mlr.press/v235/wang24cj.html", "pdf": "https://openreview.net/pdf?id=HPLzSCOecY", "openreview": "https://openreview.net/forum?id=HPLzSCOecY", "author_site": "Jing Wang, Miao Yu, Peng Zhao, Zhi-Hua Zhou", "tldr": "", "abstract": "The study of machine learning under limited resources has gathered increasing attention, considering improving the learning efficiency and effectiveness with budgeted resources. However, previous efforts mainly focus on *single* learning task, and a common resource-limited scenario is less explored: to handle *multiple* time-constrained learning tasks concurrently with budgeted computational resources. In this paper, we point out that this is a very challenging task because it demands the learner to be concerned about not only the progress of the learning tasks but also the coordinative allocation of computational resources. 
We present the *Learning with Adaptive Resource Allocation* (LARA) approach, which comprises an efficient online estimator for learning progress prediction, an adaptive search method for computational resource allocation, and a balancing strategy for alleviating prediction-allocation compounding errors. Empirical studies validate the effectiveness of our proposed approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jing Wang;Miao Yu;Peng Zhao;Zhi-Hua Zhou", "authorids": "~Jing_Wang32;~Miao_Yu7;~Peng_Zhao1;~Zhi-Hua_Zhou2", "gender": "M;;;", "homepage": "http://www.lamda.nju.edu.cn/wangjing/;https://github.com/huajv;;", "dblp": "02/736;;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jing_Wang32;~Miao_Yu7;~Peng_Zhao1;~Zhi-Hua_Zhou2", "aff": "Nanjing University;Tongji University;;", "aff_domain": "nju.edu.cn;tongji.edu.cn;;", "position": "PhD student;Undergrad student;;", "bibtex": "@inproceedings{\nwang2024learning,\ntitle={Learning with Adaptive Resource Allocation},\nauthor={Jing Wang and Miao Yu and Peng Zhao and Zhi-Hua Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HPLzSCOecY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 926546, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1172334908342628913&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;tongji.edu.cn;;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Nanjing University;Tongji University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.tongji.edu.cn", "aff_unique_abbr": "Nanjing U;Tongji", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Stochastic Q-learning for Large Discrete Action Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34466", "id": "HPQaMmABgK", "proceeding": "https://proceedings.mlr.press/v235/fourati24a.html", "pdf": "https://openreview.net/pdf?id=HPQaMmABgK", "openreview": "https://openreview.net/forum?id=HPQaMmABgK", "author_site": "Fares Fourati, Vaneet Aggarwal, Mohamed-Slim Alouini", "tldr": "", "abstract": "In complex environments with large discrete action spaces, effective decision-making is critical in reinforcement learning (RL). Despite the widespread use of value-based RL approaches like Q-learning, they come with a computational burden, necessitating the maximization of a value function over all actions in each iteration. This burden becomes particularly challenging when addressing large-scale problems and using deep neural networks as function approximators. In this paper, we present stochastic value-based RL approaches which, in each iteration, as opposed to optimizing over the entire set of $n$ actions, only consider a variable stochastic set of a sublinear number of actions, possibly as small as $\\mathcal{O}(\\log(n))$. The presented stochastic value-based RL methods include, among others, Stochastic Q-learning, StochDQN, and StochDDQN, all of which integrate this stochastic approach for both value-function updates and action selection. The theoretical convergence of Stochastic Q-learning is established, while an analysis of stochastic maximization is provided. 
Moreover, through empirical validation, we illustrate that the various proposed approaches outperform the baseline methods across diverse environments, including different control problems, achieving near-optimal average returns in significantly reduced time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fares Fourati;Vaneet Aggarwal;Mohamed-Slim Alouini", "authorids": "~Fares_Fourati1;~Vaneet_Aggarwal1;~Mohamed-Slim_Alouini1", "gender": "M;M;M", "homepage": "https://fouratifares.github.io/website/;;https://cemse.kaust.edu.sa/ctl/people/person/mohamed-slim-alouini", "dblp": "275/3371;91/6560;64/6304", "google_scholar": "FAmOUOIAAAAJ;;", "orcid": "0000-0002-6913-7035;;", "linkedin": "fares-fourati-96641914a/?originalSubdomain=tn;;", "or_profile": "~Fares_Fourati1;~Vaneet_Aggarwal1;~Mohamed-Slim_Alouini1", "aff": "King Abdullah University of Science and Technology;Purdue University;King Abdullah University of Science and Technology", "aff_domain": "kaust.edu.sa;purdue.edu;kaust.edu.sa", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfourati2024stochastic,\ntitle={Stochastic Q-learning for Large Discrete Action Spaces},\nauthor={Fares Fourati and Vaneet Aggarwal and Mohamed-Slim Alouini},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HPQaMmABgK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5525877, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13898403520674982728&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "kaust.edu.sa;purdue.edu;kaust.edu.sa", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Purdue University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kast.kau.edu.sa;https://www.purdue.edu", "aff_unique_abbr": "KAUST;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Saudi Arabia;United States" }, { "title": "LCA-on-the-Line: Benchmarking Out of Distribution Generalization with Class Taxonomies", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34465", "id": "HPXRzM9BYZ", "proceeding": "https://proceedings.mlr.press/v235/shi24c.html", "pdf": "https://openreview.net/pdf?id=HPXRzM9BYZ", "openreview": "https://openreview.net/forum?id=HPXRzM9BYZ", "author_site": "Jia Shi, Gautam Rajendrakumar Gare, Jinjin Tian, Siqi Chai, Zhiqiu Lin, Arun Balajee Vasudevan, Di Feng, Francesco Ferroni, Shu Kong", "tldr": "", "abstract": "We tackle the challenge of predicting models' Out-of-Distribution (OOD) performance using in-distribution (ID) measurements without requiring OOD data. Existing evaluations with ``Effective robustness'', which use ID accuracy as an indicator of OOD accuracy, encounter limitations when models are trained with diverse supervision and distributions, such as class labels (*Vision Models, VMs, on ImageNet*) and textual descriptions (*Visual-Language Models, VLMs, on LAION*). VLMs often generalize better to OOD data than VMs despite having similar or lower ID performance. To improve the prediction of models' OOD performance from ID measurements, we introduce the *Lowest Common Ancestor (LCA)-on-the-Line* framework. 
This approach revisits the established concept of LCA distance, which measures the hierarchical distance between labels and predictions within a predefined class hierarchy, such as WordNet. We assess 75 models using ImageNet as the ID dataset and five significantly shifted OOD variants, uncovering a strong linear correlation between ID LCA distance and OOD top-1 accuracy. Our method provides a compelling alternative for understanding why VLMs tend to generalize better. Additionally, we propose a technique to construct a taxonomic hierarchy on any dataset using $K$-means clustering, demonstrating that LCA distance is robust to the constructed taxonomic hierarchy. Moreover, we demonstrate that aligning model predictions with class taxonomies, through soft labels or prompt engineering, can enhance model generalization. Open source code in our [Project Page](https://elvishelvis.github.io/papers/lca/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jia Shi;Gautam Rajendrakumar Gare;Jinjin Tian;Siqi Chai;Zhiqiu Lin;Arun Balajee Vasudevan;Di Feng;Francesco Ferroni;Shu Kong", "authorids": "~Jia_Shi2;~Gautam_Rajendrakumar_Gare1;~Jinjin_Tian1;~Siqi_Chai1;~Zhiqiu_Lin1;~Arun_Balajee_Vasudevan1;~Di_Feng1;~Francesco_Ferroni1;~Shu_Kong1", "gender": "M;M;F;;M;M;M;M;M", "homepage": "https://www.linkedin.com/in/elvishelvisshi/;https://ggare-cmu.github.io/;https://jinjint.github.io/;;https://linzhiqiu.github.io;;;;https://aimerykong.github.io/", "dblp": ";274/1621;;;230/4394;147/3992;;146/9495;26/11141", "google_scholar": "asHobe0AAAAJ;https://scholar.google.co.in/citations?hl=en;pRaC18YAAAAJ;;https://scholar.google.com/citations?hl=en;8cioAJoAAAAJ;RPLJzcwAAAAJ;;sm9FdLoAAAAJ", "orcid": ";0000-0002-1689-9626;;;;;;;0000-0002-1362-5937", "linkedin": "elvishelvisshi/;gautam-gare/;;siqi-chai/;zhiqiu-lin-b49ba7126/;;;;aimerykong/", "or_profile": "~Jia_Shi2;~Gautam_Rajendrakumar_Gare1;~Jinjin_Tian1;~Siqi_Chai1;~Zhiqiu_Lin1;~Arun_Balajee_Vasudevan1;~Di_Feng1;~Francesco_Ferroni1;~Shu_Kong1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Amazon;;Carnegie Mellon University;Carnegie Mellon University;;NVIDIA;Texas A&M University - College Station", "aff_domain": "cmu.edu;cmu.edu;amazon.com;;cmu.edu;cmu.edu;;nvidia.com;tamu.edu", "position": "MS student;PhD student;Researcher;;PhD student;Postdoc;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nshi2024lcaontheline,\ntitle={{LCA}-on-the-Line: Benchmarking Out of Distribution Generalization with Class Taxonomies},\nauthor={Jia Shi and Gautam Rajendrakumar Gare and Jinjin Tian and Siqi Chai and Zhiqiu Lin and Arun Balajee Vasudevan and Di Feng and Francesco Ferroni and Shu Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HPXRzM9BYZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8427155, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14865788217805581136&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cmu.edu;cmu.edu;amazon.com;;cmu.edu;cmu.edu;;nvidia.com;tamu.edu", "author_num": 9, "aff_unique_index": "0;0;1;0;0;2;3", "aff_unique_norm": "Carnegie Mellon University;Amazon;NVIDIA;Texas A&M University", "aff_unique_dep": ";Amazon.com, Inc.;NVIDIA Corporation;", "aff_unique_url": "https://www.cmu.edu;https://www.amazon.com;https://www.nvidia.com;https://www.tamu.edu", "aff_unique_abbr": 
"CMU;Amazon;NVIDIA;TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarial Robustness Limits via Scaling-Law and Human-Alignment Studies", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34464", "id": "HQtTg1try7", "proceeding": "https://proceedings.mlr.press/v235/bartoldson24a.html", "pdf": "https://openreview.net/pdf?id=HQtTg1try7", "openreview": "https://openreview.net/forum?id=HQtTg1try7", "author_site": "Brian Bartoldson, James Diffenderfer, Konstantinos Parasyris, Bhavya Kailkhura", "tldr": "", "abstract": "This paper revisits the simple, long-studied, yet still unsolved problem of making image classifiers robust to imperceptible perturbations. Taking CIFAR10 as an example, SOTA clean accuracy is about $100$%, but SOTA robustness to $\\ell_{\\infty}$-norm bounded perturbations barely exceeds $70$%. To understand this gap, we analyze how model size, dataset size, and synthetic data quality affect robustness by developing the first scaling laws for adversarial training. Our scaling laws reveal inefficiencies in prior art and provide actionable feedback to advance the field. For instance, we discovered that SOTA methods diverge notably from compute-optimal setups, using excess compute for their level of robustness. Leveraging a compute-efficient setup, we surpass the prior SOTA with $20$% ($70$%) fewer training (inference) FLOPs. We trained various compute-efficient models, with our best achieving $74$% AutoAttack accuracy ($+3$% gain). However, our scaling laws also predict robustness slowly grows then plateaus at $90$%: dwarfing our new SOTA by scaling is impractical, and perfect robustness is impossible. To better understand this predicted limit, we carry out a small-scale human evaluation on the AutoAttack data that fools our top-performing model. Concerningly, we estimate that human performance also plateaus near $90$%, which we show to be attributable to $\\ell_{\\infty}$-constrained attacks' generation of invalid images not consistent with their original labels. Having characterized limiting roadblocks, we outline promising paths for future research.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Brian R. Bartoldson;James Diffenderfer;Konstantinos Parasyris;Bhavya Kailkhura", "authorids": "~Brian_R._Bartoldson1;~James_Diffenderfer1;~Konstantinos_Parasyris2;~Bhavya_Kailkhura1", "gender": ";M;M;M", "homepage": ";;https://people.llnl.gov/kailkhura1;https://brianbartoldson.wordpress.com/", "dblp": "188/4110;151/4093;132/8938;220/5475", "google_scholar": "nRr24_QAAAAJ;BVW8btMAAAAJ;SQpJmOgAAAAJ;YdiZoJgAAAAJ", "orcid": ";0000-0002-8258-9693;;", "linkedin": ";koparasy/;;", "or_profile": "~James_Diffenderfer1;~Konstantinos_Parasyris2;~Bhavya_Kailkhura1;~Brian_R_Bartoldson1", "aff": "Lawrence Livermore National Labs;Lawrence Livermore National Labs;Lawrence Livermore National Laboratory;Lawrence Livermore National Labs", "aff_domain": "llnl.gov;llnl.gov;llnl.gov;llnl.gov", "position": "Researcher;Computer Scientists;Research Staff;Researcher", "bibtex": "@inproceedings{\nbartoldson2024adversarial,\ntitle={Adversarial Robustness Limits via Scaling-Law and Human-Alignment Studies},\nauthor={Brian R. 
Bartoldson and James Diffenderfer and Konstantinos Parasyris and Bhavya Kailkhura},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HQtTg1try7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2124437, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9131551624538858930&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "llnl.gov;llnl.gov;llnl.gov;llnl.gov", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Lawrence Livermore National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.llnl.gov", "aff_unique_abbr": "LLNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Field Guide for Pacing Budget and ROS Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34463", "id": "HTMFUKAm8B", "proceeding": "https://proceedings.mlr.press/v235/balseiro24a.html", "pdf": "https://openreview.net/pdf?id=HTMFUKAm8B", "openreview": "https://openreview.net/forum?id=HTMFUKAm8B", "author_site": "Santiago Balseiro, Kshipra Bhawalkar, Zhe Feng, Haihao Lu, Vahab Mirrokni, Balasubramanian Sivan, Di Wang", "tldr": "", "abstract": "Budget pacing is a popular service that has been offered by major internet advertising platforms since their inception. In the past few years, autobidding products that provide real-time bidding as a service to advertisers have seen a prominent rise in adoption. A popular autobidding stategy is value maximization subject to return-on-spend (ROS) constraints. For historical or business reasons, the systems that govern these two services, namely budget pacing and ROS pacing, are not necessarily always a single unified and coordinated entity that optimizes a global objective subject to both constraints. The purpose of this work is to theoretically and empirically compare algorithms with different degrees of coordination between these two pacing systems. In particular, we compare (a) a fully-decoupled sequential algorithm; (b) a minimally-coupled min-pacing algorithm; (c) a fully-coupled dual-based algorithm. Our main contribution is to theoretically analyze the min-pacing algorithm and show that it attains similar guarantees to the fully-coupled canonical dual-based algorithm. On the other hand, we show that the sequential algorithm, even though appealing by virtue of being fully decoupled, could badly violate the constraints. We validate our theoretical findings empirically by showing that the min-pacing algorithm performs almost as well as the canonical dual-based algorithm on a semi-synthetic dataset that was generated from a large online advertising platform's auction data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Santiago R. 
Balseiro;Kshipra Bhawalkar;Zhe Feng;Haihao Lu;Vahab Mirrokni;Balasubramanian Sivan;Di Wang", "authorids": "~Santiago_R._Balseiro1;~Kshipra_Bhawalkar1;~Zhe_Feng3;~Haihao_Lu2;~Vahab_Mirrokni2;~Balasubramanian_Sivan1;~Di_Wang4", "gender": ";;M;Not Specified;M;M;", "homepage": ";https://cs.stanford.edu/people/kshipra/;https://scholar.harvard.edu/zfeng/home;https://faculty.chicagobooth.edu/haihao-lu;https://people.csail.mit.edu/mirrokni/Welcome.html;http://pages.cs.wisc.edu/~balu2901/;", "dblp": "84/8821;46/8419.html;36/1508-4;;m/VahabSMirrokni;13/1446;", "google_scholar": ";ZZesXHYAAAAJ;MKbTrgIAAAAJ;;opbZfw0AAAAJ;gnlvP_sAAAAJ;", "orcid": ";0009-0000-1375-8054;;;;;", "linkedin": ";;;;;balasubramanian-sivan-74304418;", "or_profile": "~Santiago_R._Balseiro1;~Kshipra_Bhawalkar1;~Zhe_Feng3;~Haihao_Lu2;~Vahab_Mirrokni2;~Balasubramanian_Sivan1;~Di_Wang4", "aff": "Columbia University;Google;Google;University of Chicago;Google Research;Google;", "aff_domain": "columbia.edu;google.com;google.com;uchicago.edu;google.com;google.com;", "position": "Associate Professor;Researcher;Researcher;Assistant Professor;VP, Google Fellow;Research Scientist;", "bibtex": "@inproceedings{\nbalseiro2024a,\ntitle={A Field Guide for Pacing Budget and {ROS} Constraints},\nauthor={Santiago R. Balseiro and Kshipra Bhawalkar and Zhe Feng and Haihao Lu and Vahab Mirrokni and Balasubramanian Sivan and Di Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HTMFUKAm8B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1604826, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3390069234659385949&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "columbia.edu;google.com;google.com;uchicago.edu;google.com;google.com;", "author_num": 7, "aff_unique_index": "0;1;1;2;1;1", "aff_unique_norm": "Columbia University;Google;University of Chicago", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.columbia.edu;https://www.google.com;https://www.uchicago.edu", "aff_unique_abbr": "Columbia;Google;UChicago", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Weisfeiler-Leman at the margin: When more expressivity matters", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34462", "id": "HTNgNt8CTJ", "proceeding": "https://proceedings.mlr.press/v235/franks24a.html", "pdf": "https://openreview.net/pdf?id=HTNgNt8CTJ", "openreview": "https://openreview.net/forum?id=HTNgNt8CTJ", "author_site": "Billy Franks, Christopher Morris, Ameya Velingker, Floris Geerts", "tldr": "", "abstract": "The Weisfeiler--Leman algorithm (1-WL) is a well-studied heuristic for the graph isomorphism problem. Recently, the algorithm has played a prominent role in understanding the expressive power of message-passing graph neural networks (MPNNs) and has proven effective as a graph kernel. Despite its success, the 1-WL faces challenges in distinguishing non-isomorphic graphs, leading to the development of more expressive MPNN and kernel architectures. However, the relationship between enhanced expressivity and improved generalization performance remains unclear. Here, we show that an architecture's expressivity offers limited insights into its generalization performance when viewed through graph isomorphism. 
Moreover, we focus on augmenting 1-WL and MPNNs with subgraph information and employ classical margin theory to investigate the conditions under which an architecture's increased expressivity aligns with improved generalization performance. In addition, we introduce variations of expressive 1-WL-based kernel and MPNN architectures with provable generalization properties. Our empirical study confirms the validity of our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Billy Joe Franks;Christopher Morris;Ameya Velingker;Floris Geerts", "authorids": "~Billy_Joe_Franks1;~Christopher_Morris1;~Ameya_Velingker1;~Floris_Geerts1", "gender": "M;M;M;M", "homepage": "https://ml.informatik.uni-kl.de/people/billy-franks.html;http://christophermorris.info;http://www.ameyavelingker.com;https://www.uantwerpen.be/en/staff/floris-geerts/", "dblp": "266/8018;156/7303;117/3666.html;g/FlorisGeerts.html", "google_scholar": "Nh7XOKsAAAAJ;;6dFFudUAAAAJ;SGay8u4AAAAJ", "orcid": "0000-0002-6031-7785;;;0000-0002-8967-2473", "linkedin": ";;ameya-velingker-5811b711;florisgeerts/", "or_profile": "~Billy_Joe_Franks1;~Christopher_Morris1;~Ameya_Velingker1;~Floris_Geerts1", "aff": "TU Kaiserslautern;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Google;University of Antwerp", "aff_domain": "uni-kl.de;rwth-aachen.de;google.com;uantwerp.be", "position": "PhD student;Assistant Professor;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nfranks2024weisfeilerleman,\ntitle={Weisfeiler-Leman at the margin: When more expressivity matters},\nauthor={Billy Joe Franks and Christopher Morris and Ameya Velingker and Floris Geerts},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HTNgNt8CTJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 803711, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2305652103563175534&as_sdt=5,30&sciodt=0,30&hl=en", "gs_version_total": 8, "email": "uni-kl.de;rwth-aachen.de;google.com;uantwerp.be", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Technische Universit\u00e4t Kaiserslautern;RWTH Aachen University;Google;University of Antwerp", "aff_unique_dep": ";;Google;", "aff_unique_url": "https://www.tu-kl.de;https://www.rwth-aachen.de;https://www.google.com;https://www.uantwerp.be", "aff_unique_abbr": "TU Kaiserslautern;RWTH;Google;UA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Aachen;Mountain View", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "Germany;United States;Belgium" }, { "title": "Differentiable Distributionally Robust Optimization Layers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34461", "id": "HUJK9dFOW6", "proceeding": "https://proceedings.mlr.press/v235/ma24j.html", "pdf": "https://openreview.net/pdf?id=HUJK9dFOW6", "openreview": "https://openreview.net/forum?id=HUJK9dFOW6", "author_site": "Xutao Ma, Chao Ning, WenLi Du", "tldr": "", "abstract": "In recent years, there has been a growing research interest in decision-focused learning, which embeds optimization problems as a layer in learning pipelines and demonstrates superior performance compared to the prediction-focused approach. 
However, for distributionally robust optimization (DRO), a popular paradigm for decision-making under uncertainty, it is still unknown how to embed it as a layer, i.e., how to differentiate decisions with respect to an ambiguity set. In this paper, we develop such differentiable DRO layers for generic mixed-integer DRO problems with parameterized second-order conic ambiguity sets and discuss their extension to Wasserstein ambiguity sets. To differentiate the mixed-integer decisions, we propose a novel dual-view methodology by handling continuous and discrete parts of decisions via different principles. Specifically, we construct a differentiable energy-based surrogate to implement the dual-view methodology and use importance sampling to estimate its gradient. We further prove that such a surrogate enjoys asymptotic convergence under regularization. As an application of the proposed differentiable DRO layers, we develop a novel decision-focused learning pipeline for contextual distributionally robust decision-making tasks and compare it with the prediction-focused approach in experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xutao Ma;Chao Ning;WenLi Du", "authorids": "~Xutao_Ma1;~Chao_Ning2;~WenLi_Du2", "gender": "M;;F", "homepage": ";https://automation.sjtu.edu.cn/NC;", "dblp": ";56/8815-2;82/3485", "google_scholar": "https://scholar.google.com.hk/citations?user=OjpkbIoAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0009-0001-7876-8576;;0000-0002-2676-6341", "linkedin": ";;", "or_profile": "~Xutao_Ma1;~Chao_Ning2;~WenLi_Du2", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;East China University of Science and Technology", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;ecust.edu.cn", "position": "MS student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nma2024differentiable,\ntitle={Differentiable Distributionally Robust Optimization Layers},\nauthor={Xutao Ma and Chao Ning and WenLi Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HUJK9dFOW6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 940862, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18147342709695083353&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "sjtu.edu.cn;sjtu.edu.cn;ecust.edu.cn", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;East China University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.ecust.edu.cn", "aff_unique_abbr": "SJTU;ECUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Causal Inference from Competing Treatments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34460", "id": "HZ6lrZzB02", "proceeding": "https://proceedings.mlr.press/v235/stoica24a.html", "pdf": "https://openreview.net/pdf?id=HZ6lrZzB02", "openreview": "https://openreview.net/forum?id=HZ6lrZzB02", "author_site": "Ana-Andreea Stoica, Vivian Y. Nastl, Moritz Hardt", "tldr": "", "abstract": "Many applications of RCTs involve the presence of multiple treatment administrators---from field experiments to online advertising---that compete for the subjects' attention. 
In the face of competition, estimating a causal effect becomes difficult, as the position at which a subject sees a treatment influences their response, and thus the treatment effect. In this paper, we build a game-theoretic model of agents who wish to estimate causal effects in the presence of competition, through a bidding system and a utility function that minimizes estimation error. Our main technical result establishes an approximation with a tractable objective that maximizes the sample value obtained through strategically allocating budget on subjects. This allows us to find an equilibrium in our model: we show that the tractable objective has a pure Nash equilibrium, and that any Nash equilibrium is an approximate equilibrium for our general objective that minimizes estimation error under broad conditions. Conceptually, our work successfully combines elements from causal inference and game theory to shed light on the equilibrium behavior of experimentation under competition.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ana-Andreea Stoica;Vivian Yvonne Nastl;Moritz Hardt", "authorids": "~Ana-Andreea_Stoica1;~Vivian_Yvonne_Nastl1;~Moritz_Hardt1", "gender": ";F;Not Specified", "homepage": ";https://sf.is.mpg.de/person/vnastl;http://mrtz.org/", "dblp": ";;26/4683", "google_scholar": ";;adnTgaAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ana-Andreea_Stoica1;~Vivian_Yvonne_Nastl1;~Moritz_Hardt1", "aff": ";ETHZ - ETH Zurich;Max-Planck-Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": ";ethz.ch;is.mpg.de", "position": ";PhD student;Principal Researcher", "bibtex": "@inproceedings{\nstoica2024causal,\ntitle={Causal Inference from Competing Treatments},\nauthor={Ana-Andreea Stoica and Vivian Yvonne Nastl and Moritz Hardt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HZ6lrZzB02}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 762545, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A5QHsqK6boUJ:scholar.google.com/&scioq=Causal+Inference+from+Competing+Treatments&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": ";ethz.ch;is.mpg.de", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "ETH Zurich;Max-Planck-Institute for Intelligent Systems", "aff_unique_dep": ";Intelligent Systems", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ETHZ;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "Optimal Recurrent Network Topologies for Dynamical Systems Reconstruction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34459", "id": "HZyOz9VEg4", "proceeding": "https://proceedings.mlr.press/v235/hemmer24a.html", "pdf": "https://openreview.net/pdf?id=HZyOz9VEg4", "openreview": "https://openreview.net/forum?id=HZyOz9VEg4", "author_site": "Christoph J\u00fcrgen Hemmer, Manuel Brenner, Florian Hess, Daniel Durstewitz", "tldr": "", "abstract": "In dynamical systems reconstruction (DSR) we seek to infer from time series measurements a generative model of the underlying dynamical process. This is a prime objective in any scientific discipline, where we are particularly interested in parsimonious models with a low parameter load. 
A common strategy here is parameter pruning, removing all parameters with small weights. However, we find that this strategy does not work for DSR, where even low-magnitude parameters can contribute considerably to the system dynamics. On the other hand, it is well known that many natural systems which generate complex dynamics, like the brain or ecological networks, have a sparse topology with comparatively few links. Inspired by this, we show that *geometric pruning*, where, in contrast to magnitude-based pruning, weights with a low contribution to an attractor's geometrical structure are removed, indeed manages to reduce parameter load substantially without significantly hampering DSR quality. We further find that the networks resulting from geometric pruning have a specific type of topology, and that this topology, and not the magnitude of weights, is what is most crucial to performance. We provide an algorithm that automatically generates such topologies which can be used as priors for generative modeling of dynamical systems by RNNs, and compare it to other well-studied topologies like small-world or scale-free networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christoph J\u00fcrgen Hemmer;Manuel Brenner;Florian Hess;Daniel Durstewitz", "authorids": "~Christoph_J\u00fcrgen_Hemmer1;~Manuel_Brenner1;~Florian_Hess1;~Daniel_Durstewitz1", "gender": "M;M;M;", "homepage": ";;https://www.zi-mannheim.de/forschung/abteilungen-ags-institute/theoret-neurowissenschaften/infos-theor-neurowiss.html;https://durstewitzlab.github.io", "dblp": "380/2407;323/8935;;98/2120", "google_scholar": "https://scholar.google.de/citations?user=6ksJaUwAAAAJ;HCUeyg8AAAAJ;nOZM-1AAAAAJ;https://scholar.google.de/citations?user=2bcbKU0AAAAJ", "orcid": ";;;0000-0002-9340-3786", "linkedin": "christoph-hemmer-b0a077166/;manuel-brenner-772261191/;;", "or_profile": "~Christoph_J\u00fcrgen_Hemmer1;~Manuel_Brenner1;~Florian_Hess1;~Daniel_Durstewitz1", "aff": "Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University;Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University", "aff_domain": "uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de", "position": "MS student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nhemmer2024optimal,\ntitle={Optimal Recurrent Network Topologies for Dynamical Systems Reconstruction},\nauthor={Christoph J{\\"u}rgen Hemmer and Manuel Brenner and Florian Hess and Daniel Durstewitz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HZyOz9VEg4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4513696, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10122597030726166677&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-heidelberg.de/;https://www.uni-heidelberg.de", "aff_unique_abbr": "Uni Heidelberg;Uni Heidelberg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Towards Unified Multi-granularity Text Detection with Interactive Attention", "status": 
"Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34458", "id": "HaBVzgSdM7", "proceeding": "https://proceedings.mlr.press/v235/wan24i.html", "pdf": "https://openreview.net/pdf?id=HaBVzgSdM7", "openreview": "https://openreview.net/forum?id=HaBVzgSdM7", "author_site": "Xingyu Wan, Chengquan Zhang, Pengyuan Lyu, Sen Fan, Zihan Ni, Kun Yao, Errui Ding, Jingdong Wang", "tldr": "", "abstract": "Existing OCR engines or document image analysis systems typically rely on training separate models for text detection in varying scenarios and granularities, leading to significant computational complexity and resource demands. In this paper, we introduce \"Detect Any Text\" (DAT), an advanced paradigm that seamlessly unifies scene text detection, layout analysis, and document page detection into a cohesive, end-to-end model. This design enables DAT to efficiently manage text instances at different granularities, including *word*, *line*, *paragraph* and *page*. A pivotal innovation in DAT is the across-granularity interactive attention module, which significantly enhances the representation learning of text instances at varying granularities by correlating structural information across different text queries. As a result, it enables the model to achieve mutually beneficial detection performances across multiple text granularities. Additionally, a prompt-based segmentation module refines detection outcomes for texts of arbitrary curvature and complex layouts, thereby improving DAT's accuracy and expanding its real-world applicability. Experimental results demonstrate that DAT achieves state-of-the-art performances across a variety of text-related benchmarks, including multi-oriented/arbitrarily-shaped scene text detection, document layout analysis and page detection tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingyu Wan;Chengquan Zhang;Pengyuan Lyu;Sen Fan;Zihan Ni;Kun Yao;Errui Ding;Jingdong Wang", "authorids": "~Xingyu_Wan2;~Chengquan_Zhang2;~Pengyuan_Lyu1;~Sen_Fan1;~Zihan_Ni1;~Kun_Yao1;~Errui_Ding2;~Jingdong_Wang1", "gender": "M;M;M;M;M;M;M;F", "homepage": ";;;https://github.com/kk12333;;https://jingdongwang2017.github.io/;;https://github.com/nizihan", "dblp": ";168/4701;;03/6550;180/5531;49/3441;;187/9123", "google_scholar": "koZQ_NgAAAAJ;whvv9NgAAAAJ;;;1wzEtxcAAAAJ;z5SPCmgAAAAJ;;", "orcid": "0000-0001-8254-5773;;;0000-0001-7155-4076;;0000-0002-4888-4445;;", "linkedin": ";;sen-fan-08814229a;;;;%E6%98%9F%E5%AE%87-%E4%B8%87-8687331a3/;", "or_profile": "~Chengquan_Zhang2;~Pengyuan_Lyu1;~Sen_Fan1;~Kun_Yao1;~Errui_Ding2;~Jingdong_Wang1;~Wan_Xingyu1;~Ni_zihan1", "aff": "Baidu;Baidu;Baidu;Baidu;Baidu;Baidu;Baidu;Baidu", "aff_domain": "baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com", "position": "Staff Software Engineer;Researcher;Researcher;Manager;Director;Chief Scientist for Computer Vision;Researcher;Researcher", "bibtex": "@inproceedings{\nwan2024towards,\ntitle={Towards Unified Multi-granularity Text Detection with Interactive Attention},\nauthor={Xingyu Wan and Chengquan Zhang and Pengyuan Lyu and Sen Fan and Zihan Ni and Kun Yao and Errui Ding and Jingdong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HaBVzgSdM7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7044961, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=13637652480841064697&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": "baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com;baidu.com", "author_num": 8, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Baidu", "aff_unique_dep": "Baidu, Inc.", "aff_unique_url": "https://www.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Deep Regression Representation Learning with Topology", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34457", "id": "HbdeEGVfEN", "proceeding": "https://proceedings.mlr.press/v235/zhang24z.html", "pdf": "https://openreview.net/pdf?id=HbdeEGVfEN", "openreview": "https://openreview.net/forum?id=HbdeEGVfEN", "author_site": "Shihao Zhang, Kenji Kawaguchi, Angela Yao", "tldr": "", "abstract": "Most works studying representation learning focus only on classification and neglect regression. Yet, the learning objectives and, therefore, the representation topologies of the two tasks are fundamentally different: classification targets class separation, leading to disconnected representations, whereas regression requires ordinality with respect to the target, leading to continuous representations. We thus wonder how the effectiveness of a regression representation is influenced by its topology, with evaluation based on the Information Bottleneck (IB) principle. The IB principle is an important framework that provides principles for learning effective representations. We establish two connections between it and the topology of regression representations. The first connection reveals that a lower intrinsic dimension of the feature space implies a reduced complexity of the representation $Z$. This complexity can be quantified as the conditional entropy of $Z$ on the target $Y$, and serves as an upper bound on the generalization error. The second connection suggests a feature space that is topologically similar to the target space will better align with the IB principle. Based on these two connections, we introduce PH-Reg, a regularizer specific to regression that matches the intrinsic dimension and topology of the feature space with the target space. Experiments on synthetic and real-world regression tasks demonstrate the benefits of PH-Reg. 
Code: https://github.com/needylove/PH-Reg.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shihao Zhang;Kenji Kawaguchi;Angela Yao", "authorids": "~Shihao_Zhang1;~Kenji_Kawaguchi1;~Angela_Yao1", "gender": ";;", "homepage": "https://needylove.github.io/;https://ml.comp.nus.edu.sg/#members;http://www.angelayao.com", "dblp": ";;64/8484", "google_scholar": "SqS4w6gAAAAJ;aLl3rYoAAAAJ;https://scholar.google.ch/citations?user=-LJCZMMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Shihao_Zhang1;~Kenji_Kawaguchi1;~Angela_Yao1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu;nus.edu.sg", "position": "PhD student;Presidential Young Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2024deep,\ntitle={Deep Regression Representation Learning with Topology},\nauthor={Shihao Zhang and Kenji Kawaguchi and Angela Yao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HbdeEGVfEN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3868263, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2348977612195873885&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "u.nus.edu;nus.edu;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "AMPA: Adaptive Mixed Precision Allocation for Low-Bit Integer Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34456", "id": "HfxFasUfbN", "proceeding": "https://proceedings.mlr.press/v235/ding24b.html", "pdf": "https://openreview.net/pdf?id=HfxFasUfbN", "openreview": "https://openreview.net/forum?id=HfxFasUfbN", "author_site": "Li Ding, Wen Fei, Yuyang Huang, Shuangrui Ding, Wenrui Dai, Chenglin Li, Junni Zou, Hongkai Xiong", "tldr": "", "abstract": "Low-bit integer training emerges as a promising approach to mitigate the heavy burden during network training by quantizing the weights, activations, and gradients. However, existing methods cannot well achieve mixed-precision quantization for low-bit training and are commonly limited to INT8 precision. In this paper, we propose a novel low-bit integer training framework that, for the first time, achieves adaptive mixed-precision allocation (AMPA) for weights, activations, and gradients, and pushes the boundaries to a precision level below INT8. We develop a novel magnitude-based sensitivity measurement with regard to the quantization losses of weight, activation, and gradient quantization and the average gradient magnitudes, which is theoretically demonstrated to be an upper bound on the influence of quantization. We further design a layer-wise precision update strategy based on observations of the quantization losses and their effects on model performance in low-bit training. 
Extensive experiments on different backbones and datasets show that, compared to INT8 quantization, the proposed method can achieve more than 38% BitOPs reduction with a tolerable loss below 2% in image classification, image segmentation, and language modeling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Li Ding;Wen Fei;Yuyang Huang;Shuangrui Ding;Wenrui Dai;Chenglin Li;Junni Zou;Hongkai Xiong", "authorids": "~Li_Ding5;~Wen_Fei1;~Yuyang_Huang3;~Shuangrui_Ding1;~Wenrui_Dai1;~Chenglin_Li2;~Junni_Zou1;~Hongkai_Xiong1", "gender": "M;M;M;M;;M;F;M", "homepage": "https://min.sjtu.edu.cn/;;https://github.com/huangyuyang114;https://mark12ding.github.io;;https://min.sjtu.edu.cn/En/FacultyShow/4?Vid=17;http://www.cs.sjtu.edu.cn/~zou-jn;http://min.sjtu.edu.cn", "dblp": ";276/7132;;267/1780;16/5135.html;;91/4613;21/3569", "google_scholar": ";;;RZOIVhYAAAAJ;Xg8MhyAAAAAJ;ltW2JMcAAAAJ;https://scholar.google.com/citations?hl=zh-CN;bB16iN4AAAAJ", "orcid": ";0000-0002-1682-4480;;;;;;0000-0003-4552-0029", "linkedin": ";;;;;;;", "or_profile": "~Li_Ding5;~Wen_Fei1;~Yuyang_Huang3;~Shuangrui_Ding1;~Wenrui_Dai1;~Chenglin_Li2;~Junni_Zou1;~Hongkai_Xiong1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;The Chinese University of Hong Kong;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;ie.cuhk.edu.hk;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "MS student;PhD student;PhD student;PhD student;Associate Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nding2024ampa,\ntitle={{AMPA}: Adaptive Mixed Precision Allocation for Low-Bit Integer Training},\nauthor={Li Ding and Wen Fei and Yuyang Huang and Shuangrui Ding and Wenrui Dai and Chenglin Li and Junni Zou and Hongkai Xiong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HfxFasUfbN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 915934, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10703874499033261677&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;ie.cuhk.edu.hk;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;1;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "SJTU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Disentangled Continual Graph Neural Architecture Search with Invariant Modular Supernet", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34455", "id": "Hg7C5YYifi", "proceeding": "https://proceedings.mlr.press/v235/zhang24bm.html", "pdf": "https://openreview.net/pdf?id=Hg7C5YYifi", "openreview": "https://openreview.net/forum?id=Hg7C5YYifi", "author_site": "Zeyang Zhang, Xin Wang, Yijian Qin, Hong Chen, Ziwei Zhang, Xu Chu, Wenwu Zhu", "tldr": "", "abstract": "The existing graph neural architecture search (GNAS) methods assume that the graph tasks are static during the search process, ignoring the ubiquitous 
scenarios where sequential graph tasks come in a continual fashion. Moreover, existing GNAS works resort to entangled graph factors during the architecture search process, resulting in catastrophic forgetting problems. In this paper, we study the problem of continual graph neural architecture search that is expected to continually search the architecture to learn new graph tasks without forgetting the past, which remains largely unexplored in the literature. However, this problem poses the challenge of architecture conflicts, i.e., the optimal architecture for the new graph task may suffer performance deterioration and thus be sub-optimal for past tasks. To address the challenge, we propose a novel Disentangled Continual Graph Neural Architecture Search with Invariant Modularization (GASIM) method, which is able to continually search the optimal architectures without forgetting past knowledge. Specifically, we first design a modular graph architecture super-network incorporating multiple modules to enable searching architectures with factor expertise. Second, we propose a factor-based task-module router that discovers the latent graph factors and routes the incoming task to the most suitable architecture module to alleviate the forgetting problem induced by architecture conflicts. Finally, we propose an invariant architecture search mechanism to capture the shared knowledge among tasks. Extensive experiments on real-world datasets demonstrate that the proposed method achieves state-of-the-art performance against baselines in continual graph neural architecture search.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zeyang Zhang;Xin Wang;Yijian Qin;Hong Chen;Ziwei Zhang;Xu Chu;Wenwu Zhu", "authorids": "~Zeyang_Zhang1;~Xin_Wang17;~Yijian_Qin2;~Hong_Chen9;~Ziwei_Zhang1;~Xu_Chu1;~Wenwu_Zhu1", "gender": ";M;M;M;;;M", "homepage": "https://zzythu.com;http://mn.cs.tsinghua.edu.cn/xinwang/;http://www.cs.tsinghua.edu.cn/;https://forchchch.github.io/;;;http://media.cs.tsinghua.edu.cn/en/zww", "dblp": "236/0242;10/5630-19;290/1902;52/4150-11;;;97/6308-1.html", "google_scholar": "w_njVcAAAAAJ;YPOBHYUAAAAJ;bSKCQwkAAAAJ;;;;https://scholar.google.com.tw/citations?user=7t2jzpgAAAAJ", "orcid": "0000-0003-1329-1313;0000-0002-0351-2939;;0000-0002-0943-2286;;;0000-0003-2236-9290", "linkedin": "zeyang-zhang-a7a039159;;;;;;", "or_profile": "~Zeyang_Zhang1;~Xin_Wang17;~Yijian_Qin2;~Hong_Chen9;~Ziwei_Zhang1;~Xu_Chu1;~Wenwu_Zhu1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;;;tsinghua.edu.cn", "position": "PhD student;Associate Professor;PhD student;PhD student;;;Full Professor", "bibtex": "@inproceedings{\nzhang2024disentangled,\ntitle={Disentangled Continual Graph Neural Architecture Search with Invariant Modular Supernet},\nauthor={Zeyang Zhang and Xin Wang and Yijian Qin and Hong Chen and Ziwei Zhang and Xu Chu and Wenwu Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Hg7C5YYifi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 528169, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7867457009561682908&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": 
"tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;;;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "MMPareto: Boosting Multimodal Learning with Innocent Unimodal Assistance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34454", "id": "Hh8pUBfxXh", "proceeding": "https://proceedings.mlr.press/v235/wei24d.html", "pdf": "https://openreview.net/pdf?id=Hh8pUBfxXh", "openreview": "https://openreview.net/forum?id=Hh8pUBfxXh", "author_site": "Yake Wei, Di Hu", "tldr": "", "abstract": "Multimodal learning methods with targeted unimodal learning objectives have exhibited their superior efficacy in alleviating the imbalanced multimodal learning problem. However, in this paper, we identify the previously ignored gradient conflict between multimodal and unimodal learning objectives, potentially misleading the unimodal encoder optimization. To well diminish these conflicts, we observe the discrepancy between multimodal loss and unimodal loss, where both gradient magnitude and covariance of the easier-to-learn multimodal loss are smaller than the unimodal one. With this property, we analyze Pareto integration under our multimodal scenario and propose MMPareto algorithm, which could ensure a final gradient with direction that is common to all learning objectives and enhanced magnitude to improve generalization, providing innocent unimodal assistance. Finally, experiments across multiple types of modalities and frameworks with dense cross-modal interaction indicate our superior and extendable method performance. Our method is also expected to facilitate multi-task cases with a clear discrepancy in task difficulty, demonstrating its ideal scalability. 
The source code and dataset are available at https://github.com/GeWu-Lab/MMPareto_ICML2024.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yake Wei;Di Hu", "authorids": "~Yake_Wei1;~Di_Hu1", "gender": "F;M", "homepage": "https://echo0409.github.io/;https://dtaoo.github.io/", "dblp": "275/7048;49/8496-1", "google_scholar": "https://scholar.google.com.hk/citations?user=i9mWGA0AAAAJ;https://scholar.google.com.hk/citations?user=F7bvTOEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yake_Wei1;~Di_Hu1", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nwei2024mmpareto,\ntitle={{MMP}areto: Boosting Multimodal Learning with Innocent Unimodal Assistance},\nauthor={Yake Wei and Di Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Hh8pUBfxXh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1055692, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11509406295661408097&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ruc.edu.cn;ruc.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Distribution Alignment Optimization through Neural Collapse for Long-tailed Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34453", "id": "Hjwx3H6Vci", "proceeding": "https://proceedings.mlr.press/v235/gao24s.html", "pdf": "https://openreview.net/pdf?id=Hjwx3H6Vci", "openreview": "https://openreview.net/forum?id=Hjwx3H6Vci", "author_site": "Jintong Gao, He Zhao, Dandan Guo, Hongyuan Zha", "tldr": "", "abstract": "A well-trained deep neural network on balanced datasets usually exhibits the Neural Collapse (NC) phenomenon, which is an informative indicator of the model achieving good performance. However, NC is usually hard to achieve for a model trained on long-tailed datasets, leading to deteriorated performance on test data. This work aims to induce the NC phenomenon in imbalanced learning from the perspective of distribution matching. By enforcing the distribution of last-layer representations to align with the ideal distribution of the ETF structure, we develop a Distribution Alignment Optimization (DisA) loss, a plug-and-play method that can be combined with most existing long-tailed methods; we further instantiate it to the cases of a fixed classifier and a learnable classifier. Extensive experiments show the effectiveness of DisA, providing a promising solution to the imbalanced issue. 
Our code is available at DisA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jintong Gao;He Zhao;Dan dan Guo;Hongyuan Zha", "authorids": "~Jintong_Gao2;~He_Zhao1;~Dan_dan_Guo1;~Hongyuan_Zha1", "gender": "F;;F;", "homepage": "https://jintonggao.github.io/gaojt.github.io/;;https://github.com/Dan123dan;", "dblp": "369/7719;;121/1618;z/HongyuanZha", "google_scholar": ";;https://scholar.google.com.hk/citations?user=QLOY4JkAAAAJ;n1DQMIsAAAAJ", "orcid": "0000-0002-6832-2618;;;", "linkedin": ";;;", "or_profile": "~Jintong_Gao2;~He_Zhao1;~Dan_dan_Guo1;~Hongyuan_Zha1", "aff": "Jilin University;;Jilin University;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "jlu.edu.cn;;jlu.edu.cn;cuhk.edu.cn", "position": "PhD student;;Lecturer;Full Professor", "bibtex": "@inproceedings{\ngao2024distribution,\ntitle={Distribution Alignment Optimization through Neural Collapse for Long-tailed Classification},\nauthor={Jintong Gao and He Zhao and Dan dan Guo and Hongyuan Zha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Hjwx3H6Vci}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6612306, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5640545657362981528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "jlu.edu.cn;;jlu.edu.cn;cuhk.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Jilin University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "http://www.jlu.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "JLU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Benchmarking and Building Long-Context Retrieval Models with LoCo and M2-BERT", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34452", "id": "HkCRgoGtt6", "proceeding": "https://proceedings.mlr.press/v235/saad-falcon24a.html", "pdf": "https://openreview.net/pdf?id=HkCRgoGtt6", "openreview": "https://openreview.net/forum?id=HkCRgoGtt6", "author_site": "Jon Saad-Falcon, Daniel Y Fu, Simran Arora, Neel Guha, Christopher Re", "tldr": "", "abstract": "Retrieval pipelines are an integral component of many machine learning systems. However, they perform poorly in domains where documents are long (e.g., 10K tokens or more) and where identifying the relevant document requires synthesizing information across the entire text. Developing long-context retrieval encoders suitable for these domains raises three challenges: (1) how to evaluate long-context retrieval performance, (2) how to pretrain a base language model to represent both short contexts (corresponding to queries) and long contexts (corresponding to documents), and (3) how to finetune this model for retrieval under the batch size limitations imposed by GPU memory constraints. To address these challenges, we first introduce LoCoV1, a 12 task benchmark constructed to measure long-context retrieval where chunking is not possible or not effective. We next present the M2-BERT retrieval encoder, an 80M parameter state-space encoder model built from the Monarch Mixer architecture, capable of scaling to documents up to 32K tokens long. 
We describe a pretraining data mixture which allows this encoder to process both short and long context sequences, and a finetuning approach that adapts this base model to retrieval with only single-sample batches. Finally, we validate the M2-BERT retrieval encoder on LoCoV1, finding that it outperforms competitive Transformer-based models by at least 22.2 points, despite containing 90\u00d7 fewer parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jon Saad-Falcon;Daniel Y Fu;Simran Arora;Neel Guha;Christopher Re", "authorids": "~Jon_Saad-Falcon1;~Daniel_Y_Fu1;~Simran_Arora1;~Neel_Guha1;~Christopher_Re1", "gender": "M;;;M;", "homepage": "https://jonsaadfalcon.com/;;https://scholar.google.com/citations?user=rGRsWH8AAAAJ&hl=en;http://neelguha.com;", "dblp": "267/2373.html;;243/2342;130/0311;", "google_scholar": "zCVmjboAAAAJ;;;YI5N4HQAAAAJ;", "orcid": ";;;;", "linkedin": "jonsaadfalcon/;;;;", "or_profile": "~Jon_Saad-Falcon1;~Daniel_Y_Fu1;~Simran_Arora1;~Neel_Guha1;~Christopher_Re1", "aff": "Computer Science Department, Stanford University;;The Wharton School, University of Pennsylvania;Computer Science Department, Stanford University;", "aff_domain": "cs.stanford.edu;;wharton.upenn.edu;cs.stanford.edu;", "position": "PhD student;;Undergrad student;PhD student;", "bibtex": "@inproceedings{\nsaad-falcon2024benchmarking,\ntitle={Benchmarking and Building Long-Context Retrieval Models with LoCo and M2-{BERT}},\nauthor={Jon Saad-Falcon and Daniel Y Fu and Simran Arora and Neel Guha and Christopher Re},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HkCRgoGtt6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3215564, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10745995119653812806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.stanford.edu;;wharton.upenn.edu;cs.stanford.edu;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Stanford University;University of Pennsylvania", "aff_unique_dep": "Computer Science Department;The Wharton School", "aff_unique_url": "https://www.stanford.edu;https://www.wharton.upenn.edu", "aff_unique_abbr": "Stanford;UPenn Wharton", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Causal Dynamics Models in Object-Oriented Environments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34451", "id": "HkWxjpUV0S", "proceeding": "https://proceedings.mlr.press/v235/yu24j.html", "pdf": "https://openreview.net/pdf?id=HkWxjpUV0S", "openreview": "https://openreview.net/forum?id=HkWxjpUV0S", "author_site": "Zhongwei Yu, Jingqing Ruan, Dengpeng Xing", "tldr": "", "abstract": "Causal dynamics models (CDMs) have demonstrated significant potential in addressing various challenges in reinforcement learning. To learn CDMs, recent studies have performed causal discovery to capture the causal dependencies among environmental variables. However, the learning of CDMs is still confined to small-scale environments due to computational complexity and sample efficiency constraints. This paper aims to extend CDMs to large-scale object-oriented environments, which consist of a multitude of objects classified into different categories. 
We introduce the Object-Oriented CDM (OOCDM) that shares causalities and parameters among objects belonging to the same class. Furthermore, we propose a learning method for OOCDM that enables it to adapt to a varying number of objects. Experiments on large-scale tasks indicate that OOCDM outperforms existing CDMs in terms of causal discovery, prediction accuracy, generalization, and computational efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhongwei Yu;Jingqing Ruan;Dengpeng Xing", "authorids": "~Zhongwei_Yu1;~Jingqing_Ruan1;~Dengpeng_Xing1", "gender": "M;F;M", "homepage": ";https://github.com/Amanda-1997/;https://people.ucas.edu.cn/~xingdengpeng?language=en", "dblp": "96/4996;304/3544;85/8134", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;", "orcid": "0000-0003-3372-2256;0000-0002-4857-9053;", "linkedin": ";;", "or_profile": "~Zhongwei_Yu1;~Jingqing_Ruan1;~Dengpeng_Xing1", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;ia.ac.cn", "position": "MS student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nyu2024learning,\ntitle={Learning Causal Dynamics Models in Object-Oriented Environments},\nauthor={Zhongwei Yu and Jingqing Ruan and Dengpeng Xing},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HkWxjpUV0S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2228536, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17002781624394995765&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "ia.ac.cn;ia.ac.cn;ia.ac.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation", "aff_unique_url": "http://www.ia.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Stationary Latent Weight Inference for Unreliable Observations from Online Test-Time Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34450", "id": "HmKMpJXH67", "proceeding": "https://proceedings.mlr.press/v235/lee24b.html", "pdf": "https://openreview.net/pdf?id=HmKMpJXH67", "openreview": "https://openreview.net/forum?id=HmKMpJXH67", "author_site": "Jae-Hong Lee, Joon Hyuk Chang", "tldr": "", "abstract": "In the rapidly evolving field of online test-time adaptation (OTTA), effectively managing distribution shifts is a pivotal concern. State-of-the-art OTTA methodologies often face limitations such as inadequate integration of target domain information, leading to significant issues like catastrophic forgetting and a lack of adaptability in dynamically changing environments. In this paper, we introduce a stationary latent weight inference (SLWI) framework, a novel approach to overcome these challenges. The proposed SLWI uniquely incorporates Bayesian filtering to continually track and update the target model weights along with the source model weight in online settings, thereby ensuring that the adapted model remains responsive to ongoing changes in the target domain. 
The proposed framework has the peculiar property of identifying and backtracking nonlinear weights that exhibit local non-stationarity, thereby mitigating error propagation, a common pitfall of previous approaches. By integrating and refining information from both source and target domains, SLWI presents a robust solution to the persistent issue of domain adaptation in OTTA, significantly improving existing methodologies. The efficacy of SLWI is demonstrated through various experimental setups, showcasing its superior performance in diverse distribution shift scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jae-Hong Lee;Joon-Hyuk Chang", "authorids": "~Jae-Hong_Lee1;~Joon-Hyuk_Chang1", "gender": "M;M", "homepage": "https://github.com/j-pong/;http://asmllab.hanyang.ac.kr", "dblp": "62/4284;22/4361", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Jae-Hong_Lee1;~Joon-Hyuk_Chang1", "aff": "Hanyang University;Hanyang University", "aff_domain": "hanyang.ac.kr;hanyang.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nlee2024stationary,\ntitle={Stationary Latent Weight Inference for Unreliable Observations from Online Test-Time Adaptation},\nauthor={Jae-Hong Lee and Joon-Hyuk Chang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HmKMpJXH67}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 961030, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10711914968577265235&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "hanyang.ac.kr;hanyang.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Hanyang University", "aff_unique_dep": "", "aff_unique_url": "https://www.hanyang.ac.kr", "aff_unique_abbr": "HYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Causal Discovery with Fewer Conditional Independence Tests", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34449", "id": "HpT19AKddu", "proceeding": "https://proceedings.mlr.press/v235/shiragur24a.html", "pdf": "https://openreview.net/pdf?id=HpT19AKddu", "openreview": "https://openreview.net/forum?id=HpT19AKddu", "author_site": "Kirankumar Shiragur, Jiaqi Zhang, Caroline Uhler", "tldr": "", "abstract": "Many questions in science center around the fundamental problem of understanding causal relationships. However, most constraint-based causal discovery algorithms, including the well-celebrated PC algorithm, often incur an _exponential_ number of conditional independence (CI) tests, posing limitations in various applications. Addressing this, our work focuses on characterizing what can be learned about the underlying causal graph with a reduced number of CI tests. We show that it is possible to learn a coarser representation of the hidden causal graph with a _polynomial_ number of tests. This coarser representation, named Causal Consistent Partition Graph (CCPG), comprises a partition of the vertices and a directed graph defined over its components. CCPG satisfies consistency of orientations and additional constraints which favor finer partitions. Furthermore, it reduces to the underlying causal graph when the causal graph is identifiable. 
As a consequence, our results offer the first efficient algorithm for recovering the true causal graph with a polynomial number of tests, in special cases where the causal graph is fully identifiable through observational data and potentially additional interventions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kirankumar Shiragur;Jiaqi Zhang;Caroline Uhler", "authorids": "~Kirankumar_Shiragur1;~Jiaqi_Zhang2;~Caroline_Uhler1", "gender": "M;F;F", "homepage": "https://sites.google.com/view/kiran-shiragur;;https://www.carolineuhler.com/", "dblp": ";;66/10813", "google_scholar": ";;https://scholar.google.com.tw/citations?user=dIJFcaoAAAAJ", "orcid": ";0000-0001-9039-6843;", "linkedin": ";vicky-jiaqi-zhang-34b490180/;", "or_profile": "~Kirankumar_Shiragur1;~Jiaqi_Zhang2;~Caroline_Uhler1", "aff": "Microsoft Research;Apple;Electrical Engineering & Computer Science, Massachusetts Institute of Technology", "aff_domain": "microsoft.com;apple.com;eecs.mit.edu", "position": "Researcher;Intern;Associate Professor", "bibtex": "@inproceedings{\nshiragur2024causal,\ntitle={Causal Discovery with Fewer Conditional Independence Tests},\nauthor={Kirankumar Shiragur and Jiaqi Zhang and Caroline Uhler},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HpT19AKddu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 826878, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18370707643060294170&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "microsoft.com;apple.com;eecs.mit.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;Apple;Massachusetts Institute of Technology", "aff_unique_dep": "Microsoft Research;Apple Inc.;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.apple.com;https://web.mit.edu", "aff_unique_abbr": "MSR;Apple;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Predictive Performance Comparison of Decision Policies Under Confounding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34448", "id": "HrzQZXzrN2", "proceeding": "https://proceedings.mlr.press/v235/guerdan24a.html", "pdf": "https://openreview.net/pdf?id=HrzQZXzrN2", "openreview": "https://openreview.net/forum?id=HrzQZXzrN2", "author_site": "Luke Guerdan, Amanda Coston, Ken Holstein, Steven Wu", "tldr": "", "abstract": "Predictive models are often introduced to decision-making tasks under the rationale that they improve performance over an existing decision-making policy. However, it is challenging to compare predictive performance against an existing decision-making policy that is generally under-specified and dependent on unobservable factors. These sources of uncertainty are often addressed in practice by making strong assumptions about the data-generating mechanism. In this work, we propose a method to compare the predictive performance of decision policies under a variety of modern identification approaches from the causal inference and off-policy evaluation literatures (e.g., instrumental variable, marginal sensitivity model, proximal variable). Key to our method is the insight that there are regions of uncertainty that we can safely ignore in the policy comparison. 
We develop a practical approach for finite-sample estimation of regret intervals under no assumptions on the parametric form of the status quo policy. We verify our framework theoretically and via synthetic data experiments. We conclude with a real-world application using our framework to support a pre-deployment evaluation of a proposed modification to a healthcare enrollment policy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luke Guerdan;Amanda Lee Coston;Ken Holstein;Steven Wu", "authorids": "~Luke_Guerdan1;~Amanda_Lee_Coston1;~Ken_Holstein1;~Steven_Wu1", "gender": "M;;;M", "homepage": "https://lukeguerdan.com;http://amandacoston.com;http://kenholstein.com/;https://zstevenwu.com/", "dblp": "241/3611;;176/0446;137/8350", "google_scholar": "XPrjbvoAAAAJ;8U7d-_MAAAAJ;ziP-50wAAAAJ;MbF6rTEAAAAJ", "orcid": ";;;", "linkedin": ";;;zstevenwu/", "or_profile": "~Luke_Guerdan1;~Amanda_Lee_Coston1;~Ken_Holstein1;~Zhiwei_Steven_Wu1", "aff": "School of Computer Science, Carnegie Mellon University;Microsoft;School of Computer Science, Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;microsoft.com;cs.cmu.edu;cmu.edu", "position": "PhD student;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nguerdan2024predictive,\ntitle={Predictive Performance Comparison of Decision Policies Under Confounding},\nauthor={Luke Guerdan and Amanda Lee Coston and Ken Holstein and Steven Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HrzQZXzrN2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 967362, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4641768916400315175&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.cmu.edu;microsoft.com;cs.cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;Microsoft", "aff_unique_dep": "School of Computer Science;Microsoft Corporation", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com", "aff_unique_abbr": "CMU;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Single-Model Attribution of Generative Models Through Final-Layer Inversion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34447", "id": "Hs9GcILuZN", "proceeding": "https://proceedings.mlr.press/v235/laszkiewicz24a.html", "pdf": "https://openreview.net/pdf?id=Hs9GcILuZN", "openreview": "https://openreview.net/forum?id=Hs9GcILuZN", "author_site": "Mike Laszkiewicz, Jonas Ricker, Johannes Lederer, Asja Fischer", "tldr": "", "abstract": "Recent breakthroughs in generative modeling have sparked interest in practical single-model attribution. Such methods predict whether a sample was generated by a specific generator or not, for instance, to prove intellectual property theft. However, previous works are either limited to the closed-world setting or require undesirable changes to the generative model. We address these shortcomings by, first, viewing single-model attribution through the lens of anomaly detection. Arising from this change of perspective, we propose FLIPAD, a new approach for single-model attribution in the open-world setting based on final-layer inversion and anomaly detection. 
We show that the utilized final-layer inversion can be reduced to a convex lasso optimization problem, making our approach theoretically sound and computationally efficient. The theoretical findings are accompanied by an experimental study demonstrating the effectiveness of our approach and its flexibility across various domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mike Laszkiewicz;Jonas Ricker;Johannes Lederer;Asja Fischer", "authorids": "~Mike_Laszkiewicz1;~Jonas_Ricker1;~Johannes_Lederer1;~Asja_Fischer1", "gender": "M;;;F", "homepage": ";;;", "dblp": "https://dblp.uni-trier.de/pid/264/5914.html;;;76/8485", "google_scholar": ";;;FyZbyIUAAAAJ", "orcid": ";;;0000-0002-1916-7033", "linkedin": ";;;", "or_profile": "~Mike_Laszkiewicz1;~Jonas_Ricker1;~Johannes_Lederer1;~Asja_Fischer1", "aff": "Ruhr-Universit\u00e4t Bochum;;;Ruhr-Universit\u00e4t Bochum", "aff_domain": "rub.de;;;ruhr-uni-bochum.de", "position": "PhD student;;;Full Professor", "bibtex": "@inproceedings{\nlaszkiewicz2024singlemodel,\ntitle={Single-Model Attribution of Generative Models Through Final-Layer Inversion},\nauthor={Mike Laszkiewicz and Jonas Ricker and Johannes Lederer and Asja Fischer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Hs9GcILuZN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4232171, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5447196590658365972&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 2, "email": "rub.de;;;ruhr-uni-bochum.de", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Ruhr-Universit\u00e4t Bochum", "aff_unique_dep": "", "aff_unique_url": "https://www.ruhr-uni-bochum.de", "aff_unique_abbr": "RUB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "The Emergence of Reproducibility and Consistency in Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34446", "id": "HsliOqZkc0", "proceeding": "https://proceedings.mlr.press/v235/zhang24cn.html", "pdf": "https://openreview.net/pdf?id=HsliOqZkc0", "openreview": "https://openreview.net/forum?id=HsliOqZkc0", "author_site": "Huijie Zhang, Jinfan Zhou, Yifu Lu, Minzhe Guo, Peng Wang, Liyue Shen, Qing Qu", "tldr": "", "abstract": "In this work, we investigate an intriguing and prevalent phenomenon of diffusion models which we term \"consistent model reproducibility\": given the same starting noise input and a deterministic sampler, different diffusion models often yield remarkably similar outputs. We confirm this phenomenon through comprehensive experiments, implying that different diffusion models consistently reach the same data distribution and score function regardless of diffusion model frameworks, model architectures, or training procedures. More strikingly, our further investigation implies that diffusion models are learning *distinct distributions* influenced by the training data size. This is evident in two distinct training regimes: (i) \"memorization regime,\" where the diffusion model overfits to the training data distribution, and (ii) \"generalization regime,\" where the model learns the underlying data distribution. 
Our study also finds that this valuable property generalizes to many variants of diffusion models, including those for conditional generation and solving inverse problems. Lastly, we discuss how our findings connect to existing research and highlight the practical implications of our discoveries.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huijie Zhang;Jinfan Zhou;Yifu Lu;Minzhe Guo;Peng Wang;Liyue Shen;Qing Qu", "authorids": "~Huijie_Zhang2;~Jinfan_Zhou2;~Yifu_Lu1;~Minzhe_Guo1;~Peng_Wang23;~Liyue_Shen1;~Qing_Qu2", "gender": "M;M;;M;M;F;M", "homepage": "https://www.huijiezh.com/;;;;https://peng8wang.github.io/;https://liyueshen.engin.umich.edu/;https://qingqu.engin.umich.edu/", "dblp": ";;;;95/4442-98;159/2036;127/6874-1", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;ybsmKpsAAAAJ;;baF3HKUAAAAJ;Ho4qk9wAAAAJ;JfblW3MAAAAJ", "orcid": ";0000-0002-5853-1731;;;0000-0002-6799-0745;0000-0001-5942-3196;0000-0001-9136-558X", "linkedin": ";;yifu-lu-3547b321b;minzhe-guo/;;;qing-q-1a0b9746/", "or_profile": "~Huijie_Zhang2;~Jinfan_Zhou2;~Yifu_Lu1;~Minzhe_Guo1;~Peng_Wang23;~Liyue_Shen1;~Qing_Qu2", "aff": "University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan", "aff_domain": "umich.edu;umich.edu;umich.edu;umich.edu;umich.edu;umich.edu;umich.edu", "position": "PhD student;MS student;Undergrad student;Undergrad student;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024the,\ntitle={The Emergence of Reproducibility and Consistency in Diffusion Models},\nauthor={Huijie Zhang and Jinfan Zhou and Yifu Lu and Minzhe Guo and Peng Wang and Liyue Shen and Qing Qu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HsliOqZkc0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10114906, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8316373501545972369&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "umich.edu;umich.edu;umich.edu;umich.edu;umich.edu;umich.edu;umich.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Eureka-Moments in Transformers: Multi-Step Tasks Reveal Softmax Induced Optimization Problems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34445", "id": "HssOwuZiaB", "proceeding": "https://proceedings.mlr.press/v235/hoffmann24a.html", "pdf": "https://openreview.net/pdf?id=HssOwuZiaB", "openreview": "https://openreview.net/forum?id=HssOwuZiaB", "author_site": "David T. Hoffmann, Simon Schrodi, Jelena Bratuli\u0107, Nadine Behrmann, Volker Fischer, Thomas Brox", "tldr": "", "abstract": "In this work, we study rapid improvements of the training loss in transformers when confronted with multi-step decision tasks. We found that transformers struggle to learn the intermediate task, and both training and validation loss saturate for hundreds of epochs. 
When transformers finally learn the intermediate task, they do so rapidly and unexpectedly. We call these abrupt improvements Eureka-moments, since the transformer appears to suddenly learn a previously incomprehensible concept. We designed synthetic tasks to study the problem in detail, but the leaps in performance can also be observed for language modeling and in-context learning (ICL). We suspect that these abrupt transitions are caused by the multi-step nature of these tasks. Indeed, we find connections and show that ways to improve on the synthetic multi-step tasks can be used to improve the training of language modeling and ICL. Using the synthetic data, we trace the problem back to the Softmax function in the self-attention block of transformers and show ways to alleviate the problem. These fixes reduce the required number of training steps, increase the likelihood of learning the intermediate task, lead to higher final accuracy, and make training more robust to hyper-parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David T Hoffmann;Simon Schrodi;Jelena Bratuli\u0107;Nadine Behrmann;Volker Fischer;Thomas Brox", "authorids": "~David_T_Hoffmann1;~Simon_Schrodi1;~Jelena_Bratuli\u01071;~Nadine_Behrmann1;~Volker_Fischer1;~Thomas_Brox1", "gender": ";M;F;;M;M", "homepage": ";https://lmb.informatik.uni-freiburg.de/people/schrodi/;https://lmb.informatik.uni-freiburg.de/people/bratulic/index.html;;;https://lmb.informatik.uni-freiburg.de/people/brox/index.en.html", "dblp": ";289/1328;;;84/4102-3;97/4586", "google_scholar": ";https://scholar.google.de/citations?user=yC-y0PEAAAAJ;https://scholar.google.hr/citations?user=Y7GCFsAAAAAJ;;https://scholar.google.de/citations?hl=de;https://scholar.google.com/citations?hl=de", "orcid": ";0009-0003-7006-953X;;;0000-0001-5437-4030;0000-0002-6282-8861", "linkedin": ";simon-schrodi-7b55161bb/;jelena-bratulic/;nadine-behrmann;;", "or_profile": "~David_T_Hoffmann1;~Simon_Schrodi1;~Jelena_Bratuli\u01071;~Nadine_Behrmann1;~Volker_Fischer1;~Thomas_Brox1", "aff": ";University of Freiburg, Albert-Ludwigs-Universit\u00e4t Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg;Robert Bosch GmbH, Bosch;Bosch Center for Artificial Intelligence;University of Freiburg", "aff_domain": ";cs.uni-freiburg.de;uni-freiburg.de;de.bosch.com;bosch.com;uni-freiburg.de", "position": ";PhD student;PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nhoffmann2024eurekamoments,\ntitle={Eureka-Moments in Transformers: Multi-Step Tasks Reveal Softmax Induced Optimization Problems},\nauthor={David T Hoffmann and Simon Schrodi and Jelena Bratuli{\\'c} and Nadine Behrmann and Volker Fischer and Thomas Brox},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HssOwuZiaB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7543420, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5148688068900508481&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": ";cs.uni-freiburg.de;uni-freiburg.de;de.bosch.com;bosch.com;uni-freiburg.de", "author_num": 6, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg;Robert Bosch GmbH;Bosch Center for Artificial Intelligence", "aff_unique_dep": ";;;Center for Artificial Intelligence", "aff_unique_url": 
"https://www.uni-freiburg.de;https://www.uni-freiburg.de;https://www.bosch.com;https://www.bosch-ai.com", "aff_unique_abbr": "UoF;Albert-Ludwigs-Universit\u00e4t;Bosch;BCAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Freiburg;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Dr. Strategy: Model-Based Generalist Agents with Strategic Dreaming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34444", "id": "HsseRq2FAx", "proceeding": "https://proceedings.mlr.press/v235/hamed24a.html", "pdf": "https://openreview.net/pdf?id=HsseRq2FAx", "openreview": "https://openreview.net/forum?id=HsseRq2FAx", "author_site": "Hany Hamed, Subin Kim, Dongyeong Kim, Jaesik Yoon, Sungjin Ahn", "tldr": "", "abstract": "Model-based reinforcement learning (MBRL) has been a primary approach to ameliorating the sample efficiency issue as well as to make a generalist agent. However, there has not been much effort toward enhancing the strategy of dreaming itself. Therefore, it is a question *whether and how an agent can ``*dream better*''* in a more structured and strategic way. In this paper, inspired by the observation from cognitive science suggesting that humans use a spatial divide-and-conquer strategy in planning, we propose a new MBRL agent, called **Dr. Strategy**, which is equipped with a novel **Dr**eaming **Strategy**. The proposed agent realizes a version of divide-and-conquer-like strategy in dreaming. This is achieved by learning a set of latent landmarks and then utilizing these to learn a landmark-conditioned highway policy. With the highway policy, the agent can first learn in the dream to move to a landmark, and from there it tackles the exploration and achievement task in a more focused way. In experiments, we show that the proposed model outperforms prior pixel-based MBRL methods in various visually complex and partially observable navigation tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hany Hamed;Subin Kim;Dongyeong Kim;Jaesik Yoon;Sungjin Ahn", "authorids": "~Hany_Hamed1;~Subin_Kim4;~Dongyeong_Kim1;~Jaesik_Yoon1;~Sungjin_Ahn1", "gender": "M;F;M;M;", "homepage": "https://hany606.github.io/;https://ksb21st.github.io/profile/;;https://jaesikyoon.com;", "dblp": ";;;158/9715;", "google_scholar": "J5ogYwsAAAAJ;;W2on_8IAAAAJ;qboyyIAAAAAJ;", "orcid": "0000-0002-6788-0917;;;;", "linkedin": "hany-hamed-elanwar/;;;jaesik-yoon-809726123/;", "or_profile": "~Hany_Hamed1;~Subin_Kim4;~Dongyeong_Kim1;~Jaesik_Yoon1;~Sungjin_Ahn1", "aff": "Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology;SAP Labs Korea;", "aff_domain": "kaist.edu;;kaist.ac.kr;sap.com;", "position": "MS student;;Undergrad student;Researcher;", "bibtex": "@inproceedings{\nhamed2024dr,\ntitle={Dr. 
Strategy: Model-Based Generalist Agents with Strategic Dreaming},\nauthor={Hany Hamed and Subin Kim and Dongyeong Kim and Jaesik Yoon and Sungjin Ahn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HsseRq2FAx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8851727, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16651565162244839343&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "kaist.edu;;kaist.ac.kr;sap.com;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;SAP Labs", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://labs.sap/", "aff_unique_abbr": "KAIST;SAP Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34443", "id": "Ht20wtgaty", "proceeding": "https://proceedings.mlr.press/v235/dahan24a.html", "pdf": "https://openreview.net/pdf?id=Ht20wtgaty", "openreview": "https://openreview.net/forum?id=Ht20wtgaty", "author_site": "Tehila Dahan, Kfir Levy", "tldr": "", "abstract": "In this paper, we investigate the challenging framework of Byzantine-robust training in distributed machine learning (ML) systems, focusing on enhancing both efficiency and practicality. As distributed ML systems become integral for complex ML tasks, ensuring resilience against Byzantine failures\u2014where workers may contribute incorrect updates due to malice or error\u2014gains paramount importance. Our first contribution is the introduction of the Centered Trimmed Meta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline aggregators to optimal performance levels, while requiring low computational demands. Additionally, we propose harnessing a recently developed gradient estimation technique based on a double-momentum strategy within the Byzantine context. Our paper highlights its theoretical and practical advantages for Byzantine-robust training, especially in simplifying the tuning process and reducing the reliance on numerous hyperparameters. 
The effectiveness of this technique is supported by theoretical insights within the stochastic convex optimization (SCO) framework and corroborated by empirical evidence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tehila Dahan;Kfir Yehuda Levy", "authorids": "~Tehila_Dahan1;~Kfir_Yehuda_Levy1", "gender": "F;M", "homepage": ";http://kfiryehud.wixsite.com/kfir-y-levy", "dblp": "378/2189;83/11388", "google_scholar": ";", "orcid": ";", "linkedin": "tehila-dahan-b86481178/;", "or_profile": "~Tehila_Dahan1;~Kfir_Yehuda_Levy1", "aff": "Technion - Israel Institute of Technology, Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;technion.ac.il", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\ndahan2024fault,\ntitle={Fault Tolerant {ML}: Efficient Meta-Aggregation and Synchronous Training},\nauthor={Tehila Dahan and Kfir Yehuda Levy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ht20wtgaty}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4289902, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3898060547753842975&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "technion.ac.il;technion.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "CurBench: Curriculum Learning Benchmark", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34442", "id": "Htw0bSgjXE", "proceeding": "https://proceedings.mlr.press/v235/zhou24o.html", "pdf": "https://openreview.net/pdf?id=Htw0bSgjXE", "openreview": "https://openreview.net/forum?id=Htw0bSgjXE", "author_site": "Yuwei Zhou, Zirui Pan, Xin Wang, Hong Chen, Haoyang Li, Yanwen Huang, Zhixiao Xiong, Fangzhou Xiong, Peiyang Xu, Shengnan liu, Wenwu Zhu", "tldr": "", "abstract": "Curriculum learning is a training paradigm where machine learning models are trained in a meaningful order, inspired by the way humans learn curricula. Due to its capability to improve model generalization and convergence, curriculum learning has gained considerable attention and has been widely applied to various research domains. Nevertheless, as new curriculum learning methods continue to emerge, it remains an open issue to benchmark them fairly. Therefore, we develop CurBench, the first benchmark that supports systematic evaluations for curriculum learning. Specifically, it consists of 15 datasets spanning 3 research domains: computer vision, natural language processing, and graph machine learning, along with 3 settings: standard, noise, and imbalance. To facilitate a comprehensive comparison, we establish the evaluation from 2 dimensions: performance and complexity. CurBench also provides a unified toolkit that plugs automatic curricula into general machine learning processes, enabling the implementation of 15 core curriculum learning methods. On the basis of this benchmark, we conduct comparative experiments and make empirical analyses of existing methods. 
CurBench is open-source and publicly available at https://github.com/THUMNLab/CurBench.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuwei Zhou;Zirui Pan;Xin Wang;Hong Chen;Haoyang Li;Yanwen Huang;Zhixiao Xiong;Fangzhou Xiong;Peiyang Xu;Shengnan liu;Wenwu Zhu", "authorids": "~Yuwei_Zhou1;~Zirui_Pan1;~Xin_Wang17;~Hong_Chen9;~Haoyang_Li1;~Yanwen_Huang1;~Zhixiao_Xiong1;~Fangzhou_Xiong1;~Peiyang_Xu2;~Shengnan_liu3;~Wenwu_Zhu1", "gender": "M;;M;M;M;F;M;M;M;M;M", "homepage": ";https://github.com/pzrain;http://mn.cs.tsinghua.edu.cn/xinwang/;https://forchchch.github.io/;https://haoyang.li;;https://xiong-zx.github.io/;;https://github.com/liusn21;http://media.cs.tsinghua.edu.cn/en/zww;https://xupy2003.github.io/", "dblp": "124/2955;331/1564;10/5630-19;52/4150-11;118/0004-1.html;;;;;97/6308-1.html;384/4287", "google_scholar": "Ed748H0AAAAJ;IKkQ9GoAAAAJ;YPOBHYUAAAAJ;;86RE16gAAAAJ;;;;;https://scholar.google.com.tw/citations?user=7t2jzpgAAAAJ;9rPOyVsAAAAJ", "orcid": "0000-0001-9582-7331;0000-0002-6795-0620;0000-0002-0351-2939;0000-0002-0943-2286;0000-0003-3544-5563;0000-0002-1214-6387;0009-0007-2905-0481;0009-0001-0250-5462;;0000-0003-2236-9290;", "linkedin": ";;;;;;;;;;", "or_profile": "~Yuwei_Zhou1;~Zirui_Pan1;~Xin_Wang17;~Hong_Chen9;~Haoyang_Li1;~Yanwen_Huang1;~Zhixiao_Xiong1;~Fangzhou_Xiong1;~Shengnan_liu3;~Wenwu_Zhu1;~PEIYANG_XU1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Cornell University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mails.tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;med.cornell.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Undergrad student;Associate Professor;PhD student;Postdoc;MS student;Undergrad student;Undergrad student;Undergrad student;Full Professor;Undergrad student", "bibtex": "@inproceedings{\nzhou2024curbench,\ntitle={CurBench: Curriculum Learning Benchmark},\nauthor={Yuwei Zhou and Zirui Pan and Xin Wang and Hong Chen and Haoyang Li and Yanwen Huang and Zhixiao Xiong and Fangzhou Xiong and Peiyang Xu and Shengnan liu and Wenwu Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Htw0bSgjXE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 800183, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12702274737012337449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;mails.tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;med.cornell.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 11, "aff_unique_index": "0;0;0;0;1;0;0;0;0;0;0", "aff_unique_norm": "Tsinghua University;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.cornell.edu", "aff_unique_abbr": "THU;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34441", "id": "HusShERjlc", "proceeding": 
"https://proceedings.mlr.press/v235/xu24ae.html", "pdf": "https://openreview.net/pdf?id=HusShERjlc", "openreview": "https://openreview.net/forum?id=HusShERjlc", "author_site": "Chenhui Xu, Fuxun Yu, Zirui Xu, Nathan Inkawhich, Xiang Chen", "tldr": "", "abstract": "Recent research works demonstrate that one of the significant factors for the model Out-of-Distirbution detection performance is the scale of the OOD feature representation field. Consequently, model ensemble emerges as a trending method to expand this feature representation field leveraging expected model diversity. However, by proposing novel qualitative and quantitative model ensemble evaluation methods (i.e., Loss Basin/Barrier Visualization and Self-Coupling Index), we reveal that the previous ensemble methods incorporate affine-transformable weights with limited variability and fail to provide desired feature representation diversity. Therefore, we escalate the traditional model ensemble dimensions (different weight initialization, data holdout, etc.) into distinct supervision tasks, which we name as Multi-Comprehension (MC) Ensemble. MC Ensemble leverages various training tasks to form different comprehensions of the data and labels, resulting in the extension of the feature representation field. In experiments, we demonstrate the superior performance of the MC Ensemble strategy in the OOD detection task compared to both the naive Deep Ensemble method and the standalone model of comparable size.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenhui Xu;Fuxun Yu;Zirui Xu;Nathan Inkawhich;Xiang Chen", "authorids": "~Chenhui_Xu1;~Fuxun_Yu1;~Zirui_Xu1;~Nathan_Inkawhich1;~Xiang_Chen1", "gender": "M;M;M;;M", "homepage": "https://minihuihui.github.io;https://scholar.google.com/citations?user=t8vayXEAAAAJ&hl=en;https://sites.google.com/view/ziruixu/home;;https://if-lab-pku.github.io/", "dblp": "174/1805;215/4440.html;;230/7843;64/3062-10.html", "google_scholar": "SjjiXpYAAAAJ;t8vayXEAAAAJ;CTDArowAAAAJ;NZh50oIAAAAJ;QEdR90AAAAAJ", "orcid": "0009-0003-7517-5796;0000-0002-4880-6658;;;0000-0003-2790-976X", "linkedin": ";;;;", "or_profile": "~Chenhui_Xu1;~Fuxun_Yu1;~Zirui_Xu1;~Nathan_Inkawhich1;~Xiang_Chen1", "aff": "George Mason University;Microsoft;CVS Health;Air Force Research Laboratory;Peking University", "aff_domain": "gmu.edu;microsoft.com;cvshealth.com;us.af.mil;pku.edu.cn", "position": "PhD student;Principal Researcher;Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nxu2024outofdistribution,\ntitle={Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble},\nauthor={Chenhui Xu and Fuxun Yu and Zirui Xu and Nathan Inkawhich and Xiang Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HusShERjlc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1283051, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2458050124045881430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "gmu.edu;microsoft.com;cvshealth.com;us.af.mil;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "George Mason University;Microsoft;CVS Health;Air Force Research Laboratory;Peking University", "aff_unique_dep": ";Microsoft Corporation;;;", "aff_unique_url": 
"https://www.gmu.edu;https://www.microsoft.com;https://www.cvshealth.com;https://www.afrl.af.mil/;http://www.pku.edu.cn", "aff_unique_abbr": "GMU;Microsoft;CVS;AFRL;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "LLark: A Multimodal Instruction-Following Language Model for Music", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34440", "id": "HvwOtYzHBX", "proceeding": "https://proceedings.mlr.press/v235/gardner24a.html", "pdf": "https://openreview.net/pdf?id=HvwOtYzHBX", "openreview": "https://openreview.net/forum?id=HvwOtYzHBX", "author_site": "Joshua Gardner, Simon Durand, Daniel Stoller, Rachel Bittner", "tldr": "", "abstract": "Music has a unique and complex structure which is challenging for both expert humans and existing AI systems to understand, and presents unique challenges relative to other forms of audio. We present LLark, an instruction-tuned multimodal model for *music* understanding. We detail our process for dataset creation, which involves augmenting the annotations of diverse open-source music datasets and converting them to a unified instruction-tuning format. We propose a multimodal architecture for LLark, integrating a pretrained generative model for music with a pretrained language model. In evaluations on three types of tasks (music understanding, captioning, reasoning), we show that LLark matches or outperforms existing baselines in music understanding, and that humans show a high degree of agreement with its responses in captioning and reasoning tasks. LLark is trained entirely from open-source music data and models, and we make our training code available along with the release of this paper. 
Additional results and audio examples are at https://bit.ly/llark, and our source code is available at https://github.com/spotify-research/llark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joshua P Gardner;Simon Durand;Daniel Stoller;Rachel M Bittner", "authorids": "~Joshua_P_Gardner1;~Simon_Durand1;~Daniel_Stoller1;~Rachel_M_Bittner1", "gender": ";M;;", "homepage": ";https://scholar.google.com/citations?user=N2oBCKkAAAAJ&hl=en&oi=ao;;", "dblp": ";;;", "google_scholar": ";;;pXn1kQEAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Joshua_P_Gardner1;~Simon_Durand1;~Daniel_Stoller1;~Rachel_M_Bittner1", "aff": ";;Queen Mary University London;Spotify", "aff_domain": ";;qmul.ac.uk;spotify.com", "position": ";;PhD student;Researcher", "bibtex": "@inproceedings{\ngardner2024llark,\ntitle={{LL}ark: A Multimodal Instruction-Following Language Model for Music},\nauthor={Joshua P Gardner and Simon Durand and Daniel Stoller and Rachel M Bittner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HvwOtYzHBX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1411229, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4179493381987280930&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";;qmul.ac.uk;spotify.com", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Queen Mary University of London;Spotify", "aff_unique_dep": ";", "aff_unique_url": "https://www.qmul.ac.uk;https://www.spotify.com", "aff_unique_abbr": "QMUL;Spotify", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Sweden" }, { "title": "Planning, Fast and Slow: Online Reinforcement Learning with Action-Free Offline Data via Multiscale Planners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34439", "id": "HwVZbPbMjw", "proceeding": "https://proceedings.mlr.press/v235/wu24j.html", "pdf": "https://openreview.net/pdf?id=HwVZbPbMjw", "openreview": "https://openreview.net/forum?id=HwVZbPbMjw", "author_site": "Chengjie Wu, Hao Hu, yiqin yang, Ning Zhang, Chongjie Zhang", "tldr": "", "abstract": "The surge in volumes of video data offers unprecedented opportunities for advancing reinforcement learning (RL). This growth has motivated the development of passive RL, seeking to convert passive observations into actionable insights. This paper explores the prerequisites and mechanisms through which passive data can be utilized to improve online RL. We show that, in identifiable dynamics, where action impact can be distinguished from stochasticity, learning on passive data is statistically beneficial. Building upon the theoretical insights, we propose a novel algorithm named Multiscale State-Centric Planners (MSCP) that leverages two planners at distinct scales to offer guidance across varying levels of abstraction. The algorithm's fast planner targets immediate objectives, while the slow planner focuses on achieving longer-term goals. Notably, the fast planner incorporates pessimistic regularization to address the distributional shift between offline and online data. MSCP effectively handles the practical challenges involving imperfect pretraining and limited dataset coverage. 
Our empirical evaluations across multiple benchmarks demonstrate that MSCP significantly outperforms existing approaches, underscoring its proficiency in addressing complex, long-horizon tasks through the strategic use of passive data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengjie Wu;Hao Hu;Yiqin Yang;Ning Zhang;Chongjie Zhang", "authorids": "~Chengjie_Wu1;~Hao_Hu3;~Yiqin_Yang1;~Ning_Zhang2;~Chongjie_Zhang1", "gender": "M;M;M;;", "homepage": ";https://mousehu.github.io;https://www.researchgate.net/profile/Yiqin-Yang-2;https://cybersecurity.seas.wustl.edu/ning/index.html;", "dblp": "70/6141;67/6924-6;180/7725;;29/6693", "google_scholar": "fXL69VsAAAAJ;https://scholar.google.com/citations?hl=en;aHTi5IEAAAAJ;;LjxqXycAAAAJ", "orcid": ";;;;", "linkedin": ";hao-hu-tsinghua;;;", "or_profile": "~Chengjie_Wu1;~Hao_Hu3;~Yiqin_Yang1;~Ning_Zhang2;~Chongjie_Zhang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Washington University, Saint Louis;Washington University, Saint Louis", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;wustl.edu;wustl.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwu2024planning,\ntitle={Planning, Fast and Slow: Online Reinforcement Learning with Action-Free Offline Data via Multiscale Planners},\nauthor={Chengjie Wu and Hao Hu and Yiqin Yang and Ning Zhang and Chongjie Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HwVZbPbMjw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2899359, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ICWmP6vZOygJ:scholar.google.com/&scioq=Planning,+Fast+and+Slow:+Online+Reinforcement+Learning+with+Action-Free+Offline+Data+via+Multiscale+Planners&hl=en&as_sdt=0,48", "gs_version_total": 4, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;wustl.edu;wustl.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "Tsinghua University;Washington University in St. Louis", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://wustl.edu", "aff_unique_abbr": "THU;WUSTL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Saint Louis", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Understanding the Learning Dynamics of Alignment with Human Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34438", "id": "Hy88Jp0kQT", "proceeding": "https://proceedings.mlr.press/v235/im24a.html", "pdf": "https://openreview.net/pdf?id=Hy88Jp0kQT", "openreview": "https://openreview.net/forum?id=Hy88Jp0kQT", "author_site": "Shawn Im, Sharon Li", "tldr": "", "abstract": "Aligning large language models (LLMs) with human intentions has become a critical task for safely deploying models in real-world systems. While existing alignment approaches have seen empirical success, theoretically understanding how these methods affect model behavior remains an open question. Our work provides an initial attempt to theoretically analyze the learning dynamics of human preference alignment. We formally show how the distribution of preference datasets influences the rate of model updates and provide rigorous guarantees on the training accuracy. 
Our theory also reveals an intricate phenomenon where the optimization is prone to prioritizing certain behaviors with higher preference distinguishability. We empirically validate our findings on contemporary LLMs and alignment tasks, reinforcing our theoretical insights and shedding light on considerations for future alignment approaches. Disclaimer: This paper contains potentially offensive text; reader discretion is advised.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shawn Im;Yixuan Li", "authorids": "~Shawn_Im1;~Yixuan_Li1", "gender": "M;F", "homepage": "https://shawn-im.github.io/;http://pages.cs.wisc.edu/~sharonli/", "dblp": ";144/6087-1", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";liyixuan", "or_profile": "~Shawn_Im1;~Yixuan_Li1", "aff": "Department of Computer Science, University of Wisconsin - Madison;Cornell University", "aff_domain": "cs.wisc.edu;cornell.edu", "position": "PhD student;Graduate Student", "bibtex": "@inproceedings{\nim2024understanding,\ntitle={Understanding the Learning Dynamics of Alignment with Human Feedback},\nauthor={Shawn Im and Yixuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Hy88Jp0kQT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1188205, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4599194884868282055&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.wisc.edu;cornell.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;Cornell University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.wisc.edu;https://www.cornell.edu", "aff_unique_abbr": "UW-Madison;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Finding NEM-U: Explaining unsupervised representation learning through neural network generated explanation masks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34437", "id": "Hzpt1Gws9g", "proceeding": "https://proceedings.mlr.press/v235/moller24a.html", "pdf": "https://openreview.net/pdf?id=Hzpt1Gws9g", "openreview": "https://openreview.net/forum?id=Hzpt1Gws9g", "author_site": "Bj\u00f8rn Leth M\u00f8ller, Christian Igel, Kristoffer Wickstr\u00f8m, Jon Sporring, Robert Jenssen, Bulat Ibragimov", "tldr": "", "abstract": "Unsupervised representation learning has become an important ingredient of today's deep learning systems. However, only a few methods exist that explain a learned vector embedding in the sense of providing information about which parts of an input are the most important for its representation. These methods generate the explanation for a given input after the model has been evaluated and tend either to produce inaccurate explanations or to be slow, which limits their practical use. To address these limitations, we introduce the Neural Explanation Masks (NEM) framework, which turns a fixed representation model into a self-explaining model by augmenting it with a masking network. This network provides occlusion-based explanations in parallel to computing the representations during inference. 
We present an instance of this framework, the NEM-U (NEM using U-net structure) architecture, which leverages similarities between segmentation and occlusion-based masks. Our experiments show that NEM-U generates explanations faster and with lower complexity compared to the current state-of-the-art while maintaining high accuracy as measured by locality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bj\u00f8rn Leth M\u00f8ller;Christian Igel;Kristoffer Knutsen Wickstr\u00f8m;Jon Sporring;Robert Jenssen;Bulat Ibragimov", "authorids": "~Bj\u00f8rn_Leth_M\u00f8ller1;~Christian_Igel1;~Kristoffer_Knutsen_Wickstr\u00f8m1;~Jon_Sporring1;~Robert_Jenssen1;~Bulat_Ibragimov3", "gender": "M;M;M;M;M;M", "homepage": ";https://christian-igel.github.io/;https://wickstrom.github.io;https://sporring.github.io/;https://uit.no/ansatte/robert.jenssen;", "dblp": "384/4291;38/6146;224/0211.html;33/5273.html;45/5813;", "google_scholar": ";https://scholar.google.dk/citations?user=d-jF4zIAAAAJ;https://scholar.google.no/citations?hl=no;COP1HUwAAAAJ;HiviXjIAAAAJ;https://scholar.google.ru/citations?hl=ru", "orcid": "0009-0008-3007-4007;0000-0003-2868-0856;0000-0003-1395-7154;0000-0003-1261-6702;0000-0002-7496-8474;", "linkedin": ";christianigel/;;jon-sporring-b085254/;robert-jenssen-10b79318/?originalSubdomain=no;", "or_profile": "~Bj\u00f8rn_Leth_M\u00f8ller1;~Christian_Igel1;~Kristoffer_Knutsen_Wickstr\u00f8m1;~Jon_Sporring1;~Robert_Jenssen1;~Bulat_Ibragimov3", "aff": "University of Copenhagen;University of Copenhagen;University of Troms\u00f8;University of Copenhagen;UiT The Arctic University of Norway;University of Copenhagen", "aff_domain": "diku.dk;ku.dk;uit.no;di.ku.dk;uit.no;ku.dk", "position": "PhD student;Full Professor;Associate Professor;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nm{\\o}ller2024finding,\ntitle={Finding {NEM}-U: Explaining unsupervised representation learning through neural network generated explanation masks},\nauthor={Bj{\\o}rn Leth M{\\o}ller and Christian Igel and Kristoffer Knutsen Wickstr{\\o}m and Jon Sporring and Robert Jenssen and Bulat Ibragimov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Hzpt1Gws9g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7564004, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17681239173132086740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "email": "diku.dk;ku.dk;uit.no;di.ku.dk;uit.no;ku.dk", "author_num": 6, "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "University of Copenhagen;University of Troms\u00f8;Arctic University of Norway", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ku.dk;https://uit.no;https://www.uit.no", "aff_unique_abbr": "UCPH;UIT;UiT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;0", "aff_country_unique": "Denmark;Norway" }, { "title": "Swallowing the Bitter Pill: Simplified Scalable Conformer Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34436", "id": "I44Em5D5xy", "proceeding": "https://proceedings.mlr.press/v235/wang24q.html", "pdf": "https://openreview.net/pdf?id=I44Em5D5xy", "openreview": "https://openreview.net/forum?id=I44Em5D5xy", "author_site": "Yuyang Wang, Ahmed Elhag, Navdeep Jaitly, Joshua M Susskind, Miguel Angel Bautista Martin", 
"tldr": "", "abstract": "We present a novel way to predict molecular conformers through a simple formulation that sidesteps many of the heuristics of prior works and achieves state of the art results by using the advantages of scale. By training a diffusion generative model directly on 3D atomic positions without making assumptions about the explicit structure of molecules (e.g. modeling torsional angles) we are able to radically simplify structure learning, and make it trivial to scale up the model sizes. This model, called Molecular Conformer Fields (MCF), works by parameterizing conformer structures as functions that map elements from a molecular graph directly to their 3D location in space. This formulation allows us to boil down the essence of structure prediction to learning a distribution over functions. Experimental results show that scaling up the model capacity leads to large gains in generalization performance without enforcing inductive biases like rotational equivariance. MCF represents an advance in extending diffusion models to handle complex scientific problems in a conceptually simple, scalable and effective manner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuyang Wang;Ahmed A. A. Elhag;Navdeep Jaitly;Joshua M. Susskind;Miguel \u00c1ngel Bautista", "authorids": "~Yuyang_Wang3;~Ahmed_A._A._Elhag1;~Navdeep_Jaitly1;~Joshua_M._Susskind1;~Miguel_\u00c1ngel_Bautista1", "gender": ";M;M;M;M", "homepage": "https://yuyangw.github.io/;;http://www.cs.toronto.edu/~ndjaitly/;http://www.apple.com;", "dblp": "43/8355-5;288/1987.html;04/6137;132/7797;38/10085", "google_scholar": "6eWGKEsAAAAJ;v0MsHOMAAAAJ;kjMNMLkAAAAJ;Sv2TGqsAAAAJ;ZrRs-qoAAAAJ", "orcid": "0000-0003-0723-6246;;;;", "linkedin": ";ahmed-a-a-elhag-7305441a3/;;joshua-susskind-8ab2ab5/;", "or_profile": "~Yuyang_Wang3;~Ahmed_A._A._Elhag1;~Navdeep_Jaitly1;~Joshua_M._Susskind1;~Miguel_\u00c1ngel_Bautista1", "aff": "Apple;Department of Computer Science, University of Oxford;Apple;Apple;Apple", "aff_domain": "apple.com;cs.ox.ac.uk;apple.com;apple.com;apple.com", "position": "Research Scientist;PhD student;Principal Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\nwang2024swallowing,\ntitle={Swallowing the Bitter Pill: Simplified Scalable Conformer Generation},\nauthor={Yuyang Wang and Ahmed A. A. Elhag and Navdeep Jaitly and Joshua M. 
Susskind and Miguel {\\'A}ngel Bautista},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=I44Em5D5xy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10150750, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13856154344064672845&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "apple.com;cs.ox.ac.uk;apple.com;apple.com;apple.com", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Apple;University of Oxford", "aff_unique_dep": "Apple Inc.;Department of Computer Science", "aff_unique_url": "https://www.apple.com;https://www.ox.ac.uk", "aff_unique_abbr": "Apple;Oxford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "How Do Nonlinear Transformers Learn and Generalize in In-Context Learning?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34435", "id": "I4HTPws9P6", "proceeding": "https://proceedings.mlr.press/v235/li24bn.html", "pdf": "https://openreview.net/pdf?id=I4HTPws9P6", "openreview": "https://openreview.net/forum?id=I4HTPws9P6", "author_site": "Hongkang Li, Meng Wang, Songtao Lu, Xiaodong Cui, Pin-Yu Chen", "tldr": "", "abstract": "Transformer-based large language models have displayed impressive in-context learning (ICL) capabilities, where a pre-trained model can handle new tasks without fine-tuning by simply augmenting the query with some input-output examples from that task. Despite the empirical success, the mechanics of how to train a Transformer to achieve ICL and the corresponding ICL capacity are mostly elusive due to the technical challenges of analyzing the nonconvex training problems resulting from the nonlinear self-attention and nonlinear activation in Transformers. To the best of our knowledge, this paper provides the first theoretical analysis of the training dynamics of Transformers with nonlinear self-attention and nonlinear MLP, together with the ICL generalization capability of the resulting model. Focusing on a group of binary classification tasks, we train Transformers using data from a subset of these tasks and quantify the impact of various factors on the ICL generalization performance on the remaining unseen tasks with and without data distribution shifts. We also analyze how different components in the learned Transformers contribute to the ICL performance. Furthermore, we provide the first theoretical analysis of how model pruning affects ICL performance and prove that proper magnitude-based pruning can have a minimal impact on ICL while reducing inference costs. 
These theoretical findings are justified through numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongkang Li;Meng Wang;Songtao Lu;Xiaodong Cui;Pin-Yu Chen", "authorids": "~Hongkang_Li1;~Meng_Wang4;~Songtao_Lu1;~Xiaodong_Cui1;~Pin-Yu_Chen1", "gender": ";F;M;M;M", "homepage": "https://lohek330.github.io/lihongkang.github.io/;https://www.ecse.rpi.edu/~wang/index.html;https://songtaogithub.github.io/;http://researcher.watson.ibm.com/researcher/view.php?person=us-cuix;http://www.pinyuchen.com", "dblp": "318/8643;93/6765-3;05/2887;;39/8969", "google_scholar": "https://scholar.google.com.hk/citations?user=DVlDPjMAAAAJ;;LRsjX7kAAAAJ;wzNVJQsAAAAJ;jxwlCUUAAAAJ", "orcid": ";;;;0000-0003-1039-8369", "linkedin": "hongkang-li-b7a341173/;;;;pin-yu-chen-940062a2", "or_profile": "~Hongkang_Li1;~Meng_Wang4;~Songtao_Lu1;~Xiaodong_Cui1;~Pin-Yu_Chen1", "aff": "Rensselaer Polytechnic Institute;Rensselaer Polytechnic Institute;IBM Thomas J. Watson Research Center;IBM T. J. Watson Research Center;International Business Machines", "aff_domain": "rpi.edu;rpi.edu;ibm.com;us.ibm.com;ibm.com", "position": "PhD student;Associate Professor;Researcher;Principal Research Staff Member;Principal Researcher", "bibtex": "@inproceedings{\nli2024how,\ntitle={How Do Nonlinear Transformers Learn and Generalize in In-Context Learning?},\nauthor={Hongkang Li and Meng Wang and Songtao Lu and Xiaodong Cui and Pin-Yu Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=I4HTPws9P6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1200729, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12845573105136695551&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "rpi.edu;rpi.edu;ibm.com;us.ibm.com;ibm.com", "author_num": 5, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "Rensselaer Polytechnic Institute;IBM;International Business Machines Corporation", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.rpi.edu;https://www.ibm.com/research;https://www.ibm.com", "aff_unique_abbr": "RPI;IBM;IBM", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Yorktown Heights;T. J. Watson", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Activation-Descent Regularization for Input Optimization of ReLU Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34434", "id": "IArWwIim8M", "proceeding": "https://proceedings.mlr.press/v235/yu24c.html", "pdf": "https://openreview.net/pdf?id=IArWwIim8M", "openreview": "https://openreview.net/forum?id=IArWwIim8M", "author_site": "Hongzhan Yu, Sicun Gao", "tldr": "", "abstract": "We present a new approach for input optimization of ReLU networks that explicitly takes into account the effect of changes in activation patterns. We analyze local optimization steps in both the input space and the space of activation patterns to propose methods with superior local descent properties. To accomplish this, we convert the discrete space of activation patterns into differentiable representations and propose regularization terms that improve each descent step. 
Our experiments demonstrate the effectiveness of the proposed input-optimization methods for improving the state-of-the-art in various areas, such as adversarial learning, generative modeling, and reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongzhan Yu;Sicun Gao", "authorids": "~Hongzhan_Yu1;~Sicun_Gao1", "gender": ";M", "homepage": ";", "dblp": ";22/8296", "google_scholar": ";", "orcid": ";", "linkedin": "hongzhan-yu/;", "or_profile": "~Hongzhan_Yu1;~Sicun_Gao1", "aff": "University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nyu2024activationdescent,\ntitle={Activation-Descent Regularization for Input Optimization of Re{LU} Networks},\nauthor={Hongzhan Yu and Sicun Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IArWwIim8M}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4193039, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2983105524099271475&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ucsd.edu;ucsd.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "MultiMax: Sparse and Multi-Modal Attention Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34433", "id": "IC9UZ8lm25", "proceeding": "https://proceedings.mlr.press/v235/zhou24g.html", "pdf": "https://openreview.net/pdf?id=IC9UZ8lm25", "openreview": "https://openreview.net/forum?id=IC9UZ8lm25", "author_site": "Yuxuan Zhou, Mario Fritz, Margret Keuper", "tldr": "", "abstract": "SoftMax is a ubiquitous ingredient of modern machine learning algorithms. It maps an input vector onto a probability simplex and reweights the input by concentrating the probability mass at large entries. Yet, because SoftMax is a smooth approximation to the Argmax function, a significant amount of probability mass is distributed to other, residual entries, leading to poor interpretability and noise. Although sparsity can be achieved by a family of SoftMax variants, they often require an alternative loss function and do not preserve multimodality. We show that this trade-off between multi-modality and sparsity limits the expressivity of SoftMax as well as its variants. We provide a solution to this tension between objectives by proposing a piece-wise differentiable function, termed MultiMax, which adaptively modulates the output distribution according to the input entry range. 
Through comprehensive analysis and evaluation, we show that MultiMax successfully produces a distribution that suppresses irrelevant entries while preserving multi-modality, with benefits in image classification, language modeling, and machine translation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxuan Zhou;Mario Fritz;Margret Keuper", "authorids": "~Yuxuan_Zhou2;~Mario_Fritz1;~Margret_Keuper1", "gender": "M;M;F", "homepage": ";https://cispa.saarland/group/fritz/;https://www.vc.informatik.uni-siegen.de/en/keuper-margret", "dblp": "172/9870-4.html;;95/7589", "google_scholar": "ooVdh_kAAAAJ;https://scholar.google.de/citations?user=4V1nNm4AAAAJ;https://scholar.google.de/citations?user=KMqMQAcAAAAJ", "orcid": ";;0000-0002-8437-7993", "linkedin": ";;", "or_profile": "~Yuxuan_Zhou2;~Mario_Fritz1;~Margret_Keuper1", "aff": "Universit\u00e4t Mannheim;Saarland University;Max Planck Institute for Informatics", "aff_domain": "uni-mannheim.de;uni-saarland.de;mpi-inf.mpg", "position": "PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nzhou2024multimax,\ntitle={MultiMax: Sparse and Multi-Modal Attention Learning},\nauthor={Yuxuan Zhou and Mario Fritz and Margret Keuper},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IC9UZ8lm25}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2449597, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13550455754211090116&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "uni-mannheim.de;uni-saarland.de;mpi-inf.mpg", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Mannheim;Saarland University;Max Planck Institute for Informatics", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-mannheim.de;https://www.uni-saarland.de;https://mpi-inf.mpg.de", "aff_unique_abbr": "UM;UdS;MPII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Graph Adversarial Diffusion Convolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34432", "id": "ICvWruTEDH", "proceeding": "https://proceedings.mlr.press/v235/liu24h.html", "pdf": "https://openreview.net/pdf?id=ICvWruTEDH", "openreview": "https://openreview.net/forum?id=ICvWruTEDH", "author_site": "Songtao Liu, Jinghui Chen, Tianfan Fu, Lu Lin, Marinka Zitnik, Dinghao Wu", "tldr": "", "abstract": "This paper introduces a min-max optimization formulation for the Graph Signal Denoising (GSD) problem. In this formulation, we first maximize the second term of GSD by introducing perturbations to the graph structure based on Laplacian distance and then minimize the overall loss of the GSD. By solving the min-max optimization problem, we derive a new variant of the Graph Diffusion Convolution (GDC) architecture, called Graph Adversarial Diffusion Convolution (GADC). GADC differs from GDC by incorporating an additional term that enhances robustness against adversarial attacks on the graph structure and noise in node features. Moreover, GADC improves the performance of GDC on heterophilic graphs. Extensive experiments demonstrate the effectiveness of GADC across various datasets. 
Code is available at https://github.com/SongtaoLiu0823/GADC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songtao Liu;Jinghui Chen;Tianfan Fu;Lu Lin;Marinka Zitnik;Dinghao Wu", "authorids": "~Songtao_Liu2;~Jinghui_Chen1;~Tianfan_Fu1;~Lu_Lin2;~Marinka_Zitnik1;~Dinghao_Wu1", "gender": "M;M;M;F;;", "homepage": "https://songtaoliu0823.github.io/;https://jinghuichen.github.io/;https://futianfan.github.io/;https://louise-lulin.github.io;https://zitniklab.hms.harvard.edu;", "dblp": ";67/5633;;86/2209-1;53/11277.html;", "google_scholar": "https://scholar.google.com.tw/citations?hl=zh-CN;mKia7Y4AAAAJ;KPQ49w4AAAAJ;8N04pBgAAAAJ;YtUDgPIAAAAJ;", "orcid": ";;;0000-0002-2539-3352;;", "linkedin": ";;;lulin92/;;", "or_profile": "~Songtao_Liu2;~Jinghui_Chen1;~Tianfan_Fu1;~Lu_Lin2;~Marinka_Zitnik1;~Dinghao_Wu1", "aff": "Peking University;Pennsylvania State University;Rensselaer Polytechnic Institute;Pennsylvania State University;Harvard University;", "aff_domain": "pku.edu.cn;psu.edu;rpi.edu;psu.edu;harvard.edu;", "position": "Intern;Assistant Professor;Assistant Professor;Assistant Professor;Associate Professor;", "bibtex": "@inproceedings{\nliu2024graph,\ntitle={Graph Adversarial Diffusion Convolution},\nauthor={Songtao Liu and Jinghui Chen and Tianfan Fu and Lu Lin and Marinka Zitnik and Dinghao Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ICvWruTEDH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 465293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=906742669870716595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "pku.edu.cn;psu.edu;rpi.edu;psu.edu;harvard.edu;", "author_num": 6, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "Peking University;Pennsylvania State University;Rensselaer Polytechnic Institute;Harvard University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.psu.edu;https://www.rpi.edu;https://www.harvard.edu", "aff_unique_abbr": "Peking U;PSU;RPI;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Neural Networks Learn Statistics of Increasing Complexity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34431", "id": "IGdpKP0N6w", "proceeding": "https://proceedings.mlr.press/v235/belrose24a.html", "pdf": "https://openreview.net/pdf?id=IGdpKP0N6w", "openreview": "https://openreview.net/forum?id=IGdpKP0N6w", "author_site": "Nora Belrose, Quintin Pope, Lucia Quirke, Alex Mallen, Xiaoli Fern", "tldr": "", "abstract": "The _distributional simplicity bias_ (DSB) posits that neural networks learn low-order moments of the data distribution first, before moving on to higher-order correlations. In this work, we present compelling new evidence for the DSB by showing that networks automatically learn to perform well on maximum-entropy distributions whose low-order statistics match those of the training set early in training, then lose this ability later. We also extend the DSB to discrete domains by proving an equivalence between token $n$-gram frequencies and the moments of embedding vectors, and by finding empirical evidence for the bias in LLMs. 
Finally we use optimal transport methods to surgically edit the low-order statistics of one class to match those of another, and show that early-training networks treat the edited samples as if they were drawn from the target class. Code is available at https://github.com/EleutherAI/features-across-time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nora Belrose;Quintin Pope;Lucia Quirke;Alex Troy Mallen;Xiaoli Fern", "authorids": "~Nora_Belrose1;~Quintin_Pope1;~Lucia_Quirke1;~Alex_Troy_Mallen1;~Xiaoli_Fern1", "gender": "F;M;;M;F", "homepage": "https://twitter.com/norabelrose;;https://github.com/luciaquirke;;http://web.engr.orst.edu/~xfern/", "dblp": "332/2248;304/7870.html;;294/8452;http://dblp.uni-trier.de/pers/hd/f/Fern:Xiaoli_Z=", "google_scholar": "p_oBc64AAAAJ;https://scholar.google.com/citations?hl=en;;EZe6n8EAAAAJ;rnDD_oEAAAAJ", "orcid": ";0009-0002-6014-9643;;;", "linkedin": ";quintin-pope;lucia-quirke/;alex-mallen-815b01176/;", "or_profile": "~Nora_Belrose1;~Quintin_Pope1;~Lucia_Quirke1;~Alex_Troy_Mallen1;~Xiaoli_Fern1", "aff": "EleutherAI;Oregon State University;;Redwood Research;Oregon State University", "aff_domain": "eleuther.ai;oregonstate.edu;;rdwrs.com;oregonstate.edu", "position": "Researcher;PhD student;;Researcher;Associate Professor", "bibtex": "@inproceedings{\nbelrose2024neural,\ntitle={Neural Networks Learn Statistics of Increasing Complexity},\nauthor={Nora Belrose and Quintin Pope and Lucia Quirke and Alex Troy Mallen and Xiaoli Fern},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IGdpKP0N6w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1904308, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5568799983092346925&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "eleuther.ai;oregonstate.edu;;rdwrs.com;oregonstate.edu", "author_num": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "EleutherAI;Oregon State University;Redwood Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.eleuther.ai;https://oregonstate.edu;https://www.redwoodresearch.org", "aff_unique_abbr": "EleutherAI;OSU;Redwood Research", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Superpoint Gaussian Splatting for Real-Time High-Fidelity Dynamic Scene Reconstruction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34430", "id": "INb8xV1xmf", "proceeding": "https://proceedings.mlr.press/v235/wan24f.html", "pdf": "https://openreview.net/pdf?id=INb8xV1xmf", "openreview": "https://openreview.net/forum?id=INb8xV1xmf", "author_site": "Diwen Wan, Ruijie Lu, Gang Zeng", "tldr": "", "abstract": "Rendering novel view images in dynamic scenes is a crucial yet challenging task. Current methods mainly utilize NeRF-based methods to represent the static scene and an additional time-variant MLP to model scene deformations, resulting in relatively low rendering quality as well as slow inference speed. To tackle these challenges, we propose a novel framework named Superpoint Gaussian Splatting (SP-GS). Specifically, our framework first employs explicit 3D Gaussians to reconstruct the scene and then clusters Gaussians with similar properties (e.g., rotation, translation, and location) into superpoints. 
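The clustering step just described for SP-GS can be pictured in a few lines; plain k-means over toy position/rotation features is our stand-in here for the paper's actual procedure.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
positions = rng.random((10_000, 3))       # toy per-Gaussian centers
rotations = rng.random((10_000, 4))       # toy per-Gaussian quaternions

# Group Gaussians with similar properties into "superpoints": a dynamic scene
# then needs only one predicted transform per superpoint and per frame, which
# every member Gaussian inherits.
features = np.concatenate([positions, rotations], axis=1)
superpoint_ids = KMeans(n_clusters=200, n_init=10, random_state=0).fit_predict(features)
print(np.bincount(superpoint_ids).shape)  # 200 superpoints
```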
Empowered by these superpoints, our method manages to extend 3D Gaussian splatting to dynamic scenes with only a slight increase in computational expense. Apart from achieving state-of-the-art visual quality and real-time rendering under high resolutions, the superpoint representation provides a stronger manipulation capability. Extensive experiments demonstrate the practicality and effectiveness of our approach on both synthetic and real-world datasets. Please see our project page at https://dnvtmf.github.io/SP_GS.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Diwen Wan;Ruijie Lu;Gang Zeng", "authorids": "~Diwen_Wan1;~Ruijie_Lu1;~Gang_Zeng1", "gender": "M;;M", "homepage": ";https://jason-aplp.github.io/Ruijie-Lu/;https://www.cis.pku.edu.cn/info/1177/1378.htm", "dblp": "227/6394;125/9394;", "google_scholar": "gWWaiWYAAAAJ;wxo8_VYAAAAJ;RuHyY6gAAAAJ", "orcid": "0000-0002-3640-0511;;", "linkedin": ";Ruijie122/;", "or_profile": "~Diwen_Wan1;~Ruijie_Lu1;~Gang_Zeng1", "aff": "Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;PhD student;Researcher", "bibtex": "@inproceedings{\nwan2024superpoint,\ntitle={Superpoint Gaussian Splatting for Real-Time High-Fidelity Dynamic Scene Reconstruction},\nauthor={Diwen Wan and Ruijie Lu and Gang Zeng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=INb8xV1xmf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8086075, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7815274056919490214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Quality-Diversity Actor-Critic: Learning High-Performing and Diverse Behaviors via Value and Successor Features Critics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34429", "id": "ISG3l8nXrI", "proceeding": "https://proceedings.mlr.press/v235/grillotti24a.html", "pdf": "https://openreview.net/pdf?id=ISG3l8nXrI", "openreview": "https://openreview.net/forum?id=ISG3l8nXrI", "author_site": "Luca Grillotti, Maxence Faldor, Borja G. Le\u00f3n, Antoine Cully", "tldr": "", "abstract": "A key aspect of intelligence is the ability to demonstrate a broad spectrum of behaviors for adapting to unexpected situations. Over the past decade, advancements in deep reinforcement learning have led to groundbreaking achievements to solve complex continuous control tasks. However, most approaches return only one solution specialized for a specific problem. We introduce Quality-Diversity Actor-Critic (QDAC), an off-policy actor-critic deep reinforcement learning algorithm that leverages a value function critic and a successor features critic to learn high-performing and diverse behaviors. In this framework, the actor optimizes an objective that seamlessly unifies both critics using constrained optimization to (1) maximize return, while (2) executing diverse skills. 
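A minimal PyTorch reading of the constrained objective just stated for QDAC, with hypothetical names (`q_value`, `skill_error`, `lmbda`) and a plain Lagrangian relaxation standing in for the paper's exact formulation.

```python
import torch

def qdac_actor_loss(q_value: torch.Tensor, skill_error: torch.Tensor,
                    lmbda: torch.Tensor, epsilon: float = 0.1) -> torch.Tensor:
    # Lagrangian of: maximize E[q_value]  s.t.  E[skill_error] <= epsilon,
    # i.e. (1) maximize return while (2) executing the commanded skill.
    return -(q_value - lmbda * (skill_error - epsilon)).mean()

q = torch.randn(32)     # return estimates from the value critic
err = torch.rand(32)    # skill-execution error from the successor-features critic
loss = qdac_actor_loss(q, err, lmbda=torch.tensor(1.0))
print(loss.item())
```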
Compared with other Quality-Diversity methods, QDAC achieves significantly higher performance and more diverse behaviors on six challenging continuous control locomotion tasks. We also demonstrate that we can harness the learned skills to adapt better than other baselines to five perturbed environments. Finally, qualitative analyses showcase a range of remarkable behaviors: [adaptive-intelligent-robotics.github.io/QDAC](https://adaptive-intelligent-robotics.github.io/QDAC/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Grillotti;Maxence Faldor;Borja G. Le\u00f3n;Antoine Cully", "authorids": "~Luca_Grillotti1;~Maxence_Faldor1;~Borja_G._Le\u00f3n1;~Antoine_Cully1", "gender": "M;M;M;M", "homepage": "https://luca.grillotti.com;https://maxencefaldor.github.io;https://www.doc.ic.ac.uk/~bg19/;", "dblp": ";342/2945;259/1299;https://dblp.org/pers/c/Cully:Antoine.html", "google_scholar": ";s36pCYsAAAAJ;https://scholar.google.es/citations?user=sJiadiMAAAAJ;rZtJlPQAAAAJ", "orcid": ";0000-0003-4743-9494;;", "linkedin": ";maxencefaldor/;borja-gonzalez-leon/;", "or_profile": "~Luca_Grillotti1;~Maxence_Faldor1;~Borja_G._Le\u00f3n1;~Antoine_Cully1", "aff": "Imperial College London;Imperial College London;Imperial College London;Imperial College London", "aff_domain": "imperial.ac.uk;imperial.ac.uk;imperial.ac.uk;imperial.ac.uk", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ngrillotti2024qualitydiversity,\ntitle={Quality-Diversity Actor-Critic: Learning High-Performing and Diverse Behaviors via Value and Successor Features Critics},\nauthor={Luca Grillotti and Maxence Faldor and Borja G. Le{\\'o}n and Antoine Cully},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ISG3l8nXrI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9930298, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14909147380044764550&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "imperial.ac.uk;imperial.ac.uk;imperial.ac.uk;imperial.ac.uk", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Hieros: Hierarchical Imagination on Structured State Space Sequence World Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34428", "id": "IUBhvyJ9Sr", "proceeding": "https://proceedings.mlr.press/v235/mattes24a.html", "pdf": "https://openreview.net/pdf?id=IUBhvyJ9Sr", "openreview": "https://openreview.net/forum?id=IUBhvyJ9Sr", "author_site": "Paul Mattes, Rainer Schlosser, Ralf Herbrich", "tldr": "", "abstract": "One of the biggest challenges to modern deep reinforcement learning (DRL) algorithms is sample efficiency. Many approaches learn a world model in order to train an agent entirely in imagination, eliminating the need for direct environment interaction during training. However, these methods often suffer from either a lack of imagination accuracy, exploration capabilities, or runtime efficiency. 
We propose HIEROS, a hierarchical policy that learns time abstracted world representations and imagines trajectories at multiple time scales in latent space. HIEROS uses an S5 layer-based world model, which predicts next world states in parallel during training and iteratively during environment interaction. Due to the special properties of S5 layers, our method can train in parallel and predict next world states iteratively during imagination. This allows for more efficient training than RNN-based world models and more efficient imagination than Transformer-based world models. We show that our approach outperforms the state of the art in terms of mean and median normalized human score on the Atari 100k benchmark, and that our proposed world model is able to predict complex dynamics very accurately. We also show that HIEROS displays superior exploration capabilities compared to existing approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paul Mattes;Rainer Schlosser;Ralf Herbrich", "authorids": "~Paul_Mattes1;~Rainer_Schlosser1;~Ralf_Herbrich1", "gender": "M;;M", "homepage": "https://github.com/Snagnar;https://hpi.de/herbrich/people/postdocs/dr-rainer-schlosser.html;https://herbrich.me", "dblp": ";129/4931;h/RalfHerbrich", "google_scholar": ";https://scholar.google.de/citations?user=A5TrKKcAAAAJ;RuvHkikAAAAJ", "orcid": ";0000-0002-6627-4026;", "linkedin": "paul-mattes-1a3455228/;;ralf-herbrich-28a8324/", "or_profile": "~Paul_Mattes1;~Rainer_Schlosser1;~Ralf_Herbrich1", "aff": ";Hasso Plattner Institute;Hasso Plattner Institute", "aff_domain": ";hpi.de;hpi.de", "position": ";Researcher;Full Professor", "bibtex": "@inproceedings{\nmattes2024hieros,\ntitle={Hieros: Hierarchical Imagination on Structured State Space Sequence World Models},\nauthor={Paul Mattes and Rainer Schlosser and Ralf Herbrich},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IUBhvyJ9Sr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6165586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16594461642192045397&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";hpi.de;hpi.de", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Hasso Plattner Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.hpi.de", "aff_unique_abbr": "HPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Fool Your (Vision and) Language Model with Embarrassingly Simple Permutations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34427", "id": "IUijgjJgWO", "proceeding": "https://proceedings.mlr.press/v235/zong24b.html", "pdf": "https://openreview.net/pdf?id=IUijgjJgWO", "openreview": "https://openreview.net/forum?id=IUijgjJgWO", "author_site": "Yongshuo Zong, Tingyang Yu, Ruchika Chavhan, Bingchen Zhao, Timothy Hospedales", "tldr": "", "abstract": "Large language and vision-language models are rapidly being deployed in practice thanks to their impressive capabilities in instruction following, in-context learning, and so on. This raises an urgent need to carefully analyse their robustness so that stakeholders can understand if and when such models are trustworthy enough to be relied upon in any given application. 
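Stepping back to HIEROS: the parallel-training/iterative-inference duality it attributes to S5 layers is a property of linear state-space recurrences, sketched here with a toy scalar SSM (not the actual S5 layer).

```python
import numpy as np

A, B, C = 0.9, 1.0, 0.5                     # toy scalar SSM parameters

def run_training_sequence(u: np.ndarray) -> np.ndarray:
    """Whole-sequence mode; a real S5 layer evaluates this linear recurrence
    with a parallel associative scan rather than the loop shown here."""
    x, ys = 0.0, []
    for u_t in u:
        x = A * x + B * u_t                 # x_t = A x_{t-1} + B u_t
        ys.append(C * x)                    # y_t = C x_t
    return np.asarray(ys)

def step(x: float, u_t: float) -> tuple[float, float]:
    """One-step mode used during environment interaction / imagination."""
    x = A * x + B * u_t
    return x, C * x
```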
In this paper, we highlight a specific vulnerability in popular models, namely permutation sensitivity in multiple-choice question answering (MCQA). Specifically, we show empirically that popular models are vulnerable to adversarial permutation in answer sets for multiple-choice prompting, which is surprising as models should ideally be as invariant to prompt permutation as humans are. These vulnerabilities persist across various model sizes, and exist in very recent language and vision-language models. Code to reproduce all experiments is provided in supplementary materials.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongshuo Zong;Tingyang Yu;Ruchika Chavhan;Bingchen Zhao;Timothy Hospedales", "authorids": "~Yongshuo_Zong1;~Tingyang_Yu1;~Ruchika_Chavhan1;~Bingchen_Zhao1;~Timothy_Hospedales1", "gender": ";F;M;M;F", "homepage": "https://ys-zong.github.io/;https://ruchikachavhan.github.io/;http://bzhao.me/;http://homepages.inf.ed.ac.uk/thospeda/;https://yistyu.github.io/", "dblp": ";;120/3602;32/3545;", "google_scholar": "38-dM-MAAAAJ;vWDTlWoAAAAJ;lEcqFJEAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;1Cw8oZ4AAAAJ", "orcid": ";;;0000-0003-4867-7486;", "linkedin": ";ruchika-chavhan-5b5520159/?originalSubdomain=in;;timothyhospedales/;", "or_profile": "~Yongshuo_Zong1;~Ruchika_Chavhan1;~Bingchen_Zhao1;~Timothy_Hospedales1;~Yist_Tingyang_YU1", "aff": "University of Edinburgh;University of Edinburgh, University of Edinburgh;University of Edinburgh, University of Edinburgh;Samsung AI Research Centre;EPFL - EPF Lausanne", "aff_domain": "ed.ac.uk;ed.ac.uk;ed.ac.uk;samsung.com;epfl.ch", "position": "PhD student;PhD student;PhD student;Principal Researcher;PhD student", "bibtex": "@inproceedings{\nzong2024fool,\ntitle={Fool Your (Vision and) Language Model with Embarrassingly Simple Permutations},\nauthor={Yongshuo Zong and Tingyang Yu and Ruchika Chavhan and Bingchen Zhao and Timothy Hospedales},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IUijgjJgWO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 528252, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15919863494483071307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "ed.ac.uk;ed.ac.uk;ed.ac.uk;samsung.com;epfl.ch", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Edinburgh;Samsung;EPFL", "aff_unique_dep": ";AI Research;", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/researchers/samsung-ai-research-centre/;https://www.epfl.ch", "aff_unique_abbr": "Edinburgh;SARC;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;0;1;2", "aff_country_unique": "United Kingdom;South Korea;Switzerland" }, { "title": "Quantum Positional Encodings for Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34426", "id": "IW45Dr1Kxi", "proceeding": "https://proceedings.mlr.press/v235/thabet24a.html", "pdf": "https://openreview.net/pdf?id=IW45Dr1Kxi", "openreview": "https://openreview.net/forum?id=IW45Dr1Kxi", "author_site": "Slimane Thabet, Mehdi Djellabi, Igor Sokolov, Sachin Kasture, Louis-Paul Henry, Loic Henriet", "tldr": "", "abstract": "In this work, we propose novel families of positional encodings tailored to graph neural networks obtained with 
quantum computers. These encodings leverage the long-range correlations inherent in quantum systems that arise from mapping the topology of a graph onto interactions between qubits in a quantum computer. Our inspiration stems from the recent advancements in quantum processing units, which offer computational capabilities beyond the reach of classical hardware. We prove that some of these quantum features are theoretically more expressive for certain graphs than the commonly used relative random walk probabilities. Empirically, we show that the performance of state-of-the-art models can be improved on standard benchmarks and large-scale datasets by computing tractable versions of quantum features. Our findings highlight the potential of leveraging quantum computing capabilities to enhance the performance of transformers in handling graph data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Slimane Thabet;Mehdi Djellabi;Igor Olegovich Sokolov;Sachin Kasture;Louis-Paul Henry;Loic Henriet", "authorids": "~Slimane_Thabet1;~Mehdi_Djellabi1;~Igor_Olegovich_Sokolov1;~Sachin_Kasture1;~Louis-Paul_Henry1;~Loic_Henriet1", "gender": "M;M;M;M;Not Specified;", "homepage": ";;;;;", "dblp": ";209/8835.html;;;;", "google_scholar": "eGUJjGkAAAAJ;tsvMxysAAAAJ;https://scholar.google.com/citations?hl=en;;riQQjssAAAAJ;https://scholar.google.com/scholar?hl=en", "orcid": ";;0000-0002-0022-5686;0000-0003-2244-7551;;", "linkedin": ";djellabi-mehdi-363759207/;;;;", "or_profile": "~Slimane_Thabet1;~Mehdi_Djellabi1;~Igor_Olegovich_Sokolov1;~Sachin_Kasture1;~Louis-Paul_Henry1;~Loic_Henriet1", "aff": "Pasqal;Pasqal;;;PASQAL;", "aff_domain": "pasqal.com;pasqal.fr;;;pasqal.com;", "position": "Researcher;Researcher;;;Researcher;", "bibtex": "@inproceedings{\nthabet2024quantum,\ntitle={Quantum Positional Encodings for Graph Neural Networks},\nauthor={Slimane Thabet and Mehdi Djellabi and Igor Olegovich Sokolov and Sachin Kasture and Louis-Paul Henry and Loic Henriet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IW45Dr1Kxi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1505835, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16882137317642868759&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pasqal.com;pasqal.fr;;;pasqal.com;", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pasqal", "aff_unique_dep": "", "aff_unique_url": "https://www.pasqal.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Deeper or Wider: A Perspective from Optimal Generalization Error with Sobolev Loss", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34425", "id": "IWi6iLZeRG", "proceeding": "https://proceedings.mlr.press/v235/yang24j.html", "pdf": "https://openreview.net/pdf?id=IWi6iLZeRG", "openreview": "https://openreview.net/forum?id=IWi6iLZeRG", "author_site": "Yahong Yang, Juncai He", "tldr": "", "abstract": "Constructing the architecture of a neural network is a challenging pursuit for the machine learning community, and the dilemma of whether to go deeper or wider remains a persistent question. 
This paper explores a comparison between deeper neural networks (DeNNs) with a flexible number of layers and wider neural networks (WeNNs) with limited hidden layers, focusing on their optimal generalization error in Sobolev losses. Analytical investigations reveal that the architecture of a neural network can be significantly influenced by various factors, including the number of sample points, parameters within the neural networks, and the regularity of the loss function. Specifically, a higher number of parameters tends to favor WeNNs, while an increased number of sample points and greater regularity in the loss function lean towards the adoption of DeNNs. We ultimately apply this theory to address partial differential equations using deep Ritz and physics-informed neural network (PINN) methods, guiding the design of neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yahong Yang;Juncai He", "authorids": "~Yahong_Yang1;~Juncai_He1", "gender": "M;M", "homepage": ";https://juncaihe.github.io", "dblp": ";223/4286", "google_scholar": ";CG5GBW0AAAAJ", "orcid": "0000-0002-9721-2362;", "linkedin": ";", "or_profile": "~Yahong_Yang1;~Juncai_He1", "aff": "Pennsylvania State University;King Abdullah University of Science and Technology", "aff_domain": "psu.edu;kaust.edu.sa", "position": "Postdoc;Researcher", "bibtex": "@inproceedings{\nyang2024deeper,\ntitle={Deeper or Wider: A Perspective from Optimal Generalization Error with Sobolev Loss},\nauthor={Yahong Yang and Juncai He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IWi6iLZeRG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 624859, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6759307831893527504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "psu.edu;kaust.edu.sa", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;King Abdullah University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.kaust.edu.sa", "aff_unique_abbr": "PSU;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Saudi Arabia" }, { "title": "A General Framework for Sequential Decision-Making under Adaptivity Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34424", "id": "IYI61L7SPk", "proceeding": "https://proceedings.mlr.press/v235/xiong24d.html", "pdf": "https://openreview.net/pdf?id=IYI61L7SPk", "openreview": "https://openreview.net/forum?id=IYI61L7SPk", "author_site": "Nuoya Xiong, Zhaoran Wang, Zhuoran Yang", "tldr": "", "abstract": "We take the first step in studying general sequential decision-making under two adaptivity constraints: rare policy switch and batch learning. First, we provide a general class called the Eluder Condition class, which includes a wide range of reinforcement learning classes. Then, for the rare policy switch constraint, we provide a generic algorithm to achieve a $\\widetilde{\\mathcal{O}}(\\log K)$ switching cost with a $\\widetilde{\\mathcal{O}}(\\sqrt{K})$ regret on the EC class. 
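For intuition on the logarithmic switching cost above: a classic device in low-switching-cost RL (shown here as a generic illustration, not as the paper's algorithm) is to re-estimate the policy only when the amount of collected data doubles, which caps the number of switches at roughly log2 of the number of episodes.

```python
def switch_schedule(num_episodes: int) -> list[int]:
    """Episodes at which the policy is re-estimated under a doubling rule."""
    switches, nxt = [], 1
    while nxt <= num_episodes:
        switches.append(nxt)   # update the policy here
        nxt *= 2               # then wait until the data has doubled again
    return switches

print(switch_schedule(1000))   # [1, 2, 4, ..., 512]: ~log2(K) switches for K episodes
```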
For the batch learning constraint, we provide an algorithm that achieves a $\\widetilde{\\mathcal{O}}(\\sqrt{K}+K/B)$ regret with the number of batches $B$. This paper is the first work considering rare policy switch and batch learning under general function classes, which covers nearly all the models studied in the previous works such as tabular MDP (Bai et al. 2019, Zhang et al. 2020), linear MDP (Wang et al. 2021, Gao et al. 2021), low eluder dimension MDP (Kong et al., 2021; Velegkas et al., 2022), generalized linear function approximation (Qiao et al. 2023), and also some new classes such as the low $D_\\Delta$-type Bellman eluder dimension problem, linear mixture MDP, kernelized nonlinear regulator and undercomplete partially observed Markov decision process (POMDP).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nuoya Xiong;Zhaoran Wang;Zhuoran Yang", "authorids": "~Nuoya_Xiong1;~Zhaoran_Wang1;~Zhuoran_Yang1", "gender": "M;Not Specified;M", "homepage": "https://xiongny.github.io/index.html;https://zhaoranwang.github.io/;https://zhuoranyang.github.io/", "dblp": "322/6141;117/2756;", "google_scholar": "K7Q4GWQAAAAJ;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Nuoya_Xiong1;~Zhaoran_Wang1;~Zhuoran_Yang1", "aff": "Tsinghua University;Northwestern University;Yale University", "aff_domain": "tsinghua.edu.cn;northwestern.edu;yale.edu", "position": "Undergrad student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nxiong2024a,\ntitle={A General Framework for Sequential Decision-Making under Adaptivity Constraints},\nauthor={Nuoya Xiong and Zhaoran Wang and Zhuoran Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IYI61L7SPk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 661662, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1460402605793513851&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;northwestern.edu;yale.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Tsinghua University;Northwestern University;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.northwestern.edu;https://www.yale.edu", "aff_unique_abbr": "THU;NU;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Implicit Representations for Constrained Image Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34423", "id": "IaV6AgrTUp", "proceeding": "https://proceedings.mlr.press/v235/schneider24a.html", "pdf": "https://openreview.net/pdf?id=IaV6AgrTUp", "openreview": "https://openreview.net/forum?id=IaV6AgrTUp", "author_site": "Jan Philipp Schneider, Mishal Fatima, Jovita Lukasik, Andreas Kolb, Margret Keuper, Michael Moeller", "tldr": "", "abstract": "Implicit representations allow one to use a parametric function that maps (spatial) coordinates to the value that is traditionally stored in each pixel, e.g. RGB values, instead of a discrete grid. This has recently proven quite advantageous as an internal representation for images or scenes for deep learning models. Yet, its potential to ensure certain properties of the solution has not yet been fully explored. 
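A minimal coordinate network in the sense just defined (the architecture is our toy choice, not the paper's): a small MLP maps a pixel coordinate to the likeliness of belonging to the foreground, and querying it on a grid rasterizes a mask at any resolution.

```python
import torch
import torch.nn as nn

class ImplicitMask(nn.Module):
    def __init__(self, hidden: int = 64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, coords: torch.Tensor) -> torch.Tensor:
        # (..., 2) pixel coordinates -> (..., 1) foreground likeliness in (0, 1)
        return torch.sigmoid(self.net(coords))

# Query on a 64x64 grid to recover a discrete mask from the continuous field.
ys, xs = torch.meshgrid(torch.linspace(0, 1, 64),
                        torch.linspace(0, 1, 64), indexing="ij")
mask = ImplicitMask()(torch.stack([xs, ys], dim=-1)).squeeze(-1)  # (64, 64)
```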
In this work, we demonstrate that implicit representations are a powerful tool for enforcing a variety of different geometric constraints in image segmentation. While convexity, star-shape, path-connectedness, periodicity, or symmetry of the (spatial or space-time) region to be segmented are very challenging to enforce for pixel-wise discretizations, a suitable parametrization of an implicit representation, mapping spatial or spatio-temporal coordinates to the likeliness of a pixel belonging to the fore- or background, allows to **provably** ensure such constraints. Several numerical examples demonstrate that challenging segmentation scenarios can benefit from the inclusion of application-specific constraints, e.g. when occlusions prevent a faithful segmentation with classical approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan Philipp Schneider;Mishal Fatima;Jovita Lukasik;Andreas Kolb;Margret Keuper;Michael Moeller", "authorids": "~Jan_Philipp_Schneider1;~Mishal_Fatima1;~Jovita_Lukasik1;~Andreas_Kolb1;~Margret_Keuper1;~Michael_Moeller1", "gender": "M;;F;M;F;M", "homepage": "https://github.com/jp-schneider;;https://www.uni-mannheim.de/dws/people/researchers/phd-students/jovita-lukasik/;https://www.cg.informatik.uni-siegen.de/;https://www.vc.informatik.uni-siegen.de/en/keuper-margret;http://vsa.informatik.uni-siegen.de", "dblp": ";;255/4833;76/311.html;95/7589;08/5840-1", "google_scholar": "nOOs33gAAAAJ;;https://scholar.google.de/citations?user=TpsZenwAAAAJ;https://scholar.google.de/citations?user=T_Fwt_oAAAAJ;https://scholar.google.de/citations?user=KMqMQAcAAAAJ;https://scholar.google.de/citations?user=sxzdAGUAAAAJ", "orcid": ";;;0000-0003-4753-7801;0000-0002-8437-7993;", "linkedin": "jan-philipp-schneider;;;andreas-kolb-7818a610/;;", "or_profile": "~Jan_Philipp_Schneider1;~Mishal_Fatima1;~Jovita_Lukasik1;~Andreas_Kolb1;~Margret_Keuper1;~Michael_Moeller1", "aff": "Princeton University;;Universit\u00e4t Siegen;Universit\u00e4t Siegen;Max Planck Institute for Informatics;University of Siegen", "aff_domain": "princeton.edu;;uni-siegen.de;uni-siegen.de;mpi-inf.mpg;uni-siegen.de", "position": "Researcher;;Postdoc;Full Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nschneider2024implicit,\ntitle={Implicit Representations for Constrained Image Segmentation},\nauthor={Jan Philipp Schneider and Mishal Fatima and Jovita Lukasik and Andreas Kolb and Margret Keuper and Michael Moeller},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IaV6AgrTUp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7956671, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11808183861706473611&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "princeton.edu;;uni-siegen.de;uni-siegen.de;mpi-inf.mpg;uni-siegen.de", "author_num": 6, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Princeton University;University of Siegen;Max Planck Institute for Informatics", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.uni-siegen.de;https://mpi-inf.mpg.de", "aff_unique_abbr": "Princeton;Uni Siegen;MPII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Germany" }, { "title": "Proteus: Exploring Protein Structure Generation for Enhanced Designability and 
Efficiency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34422", "id": "IckJCzsGVS", "proceeding": "https://proceedings.mlr.press/v235/wang24bi.html", "pdf": "https://openreview.net/pdf?id=IckJCzsGVS", "openreview": "https://openreview.net/forum?id=IckJCzsGVS", "author_site": "chentong wang, Yannan Qu, Zhangzhi Peng, Yukai Wang, Hongli Zhu, dachuan chen, Longxing Cao", "tldr": "", "abstract": "Diffusion-based generative models have been successfully employed to create proteins with novel structures and functions. However, the construction of such models typically depends on large, pre-trained structure prediction networks, like RFdiffusion. In contrast, alternative models that are trained from scratch, such as FrameDiff, still fall short in performance. In this context, we introduce Proteus, an innovative deep diffusion network that incorporates graph-based triangle methods and a multi-track interaction network, eliminating the dependency on structure prediction pre-training with superior efficiency. We have validated our model's performance on de novo protein backbone generation through comprehensive in silico evaluations and experimental characterizations, which demonstrate a remarkable success rate. These promising results underscore Proteus's ability to generate highly designable protein backbones efficiently. This capability, achieved without reliance on pre-training techniques, has the potential to significantly advance the field of protein design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chentong Wang;Yannan Qu;Zhangzhi Peng;Yukai Wang;Hongli Zhu;Dachuan Chen;Longxing Cao", "authorids": "~Chentong_Wang1;~Yannan_Qu1;~Zhangzhi_Peng1;~Yukai_Wang1;zhuhongli@westlake.edu.cn;chendachuan@westlake.edu.cn;caolongxing@westlake.edu.cn", "gender": "M;M;M;M;;;", "homepage": "https://github.com/Wangchentong;https://github.com/hsmkxyj/quyannan.github.io;https://pengzhangzhi.github.io/home;;;;", "dblp": ";;323/8025;;;;", "google_scholar": ";;https://scholar.google.co.uk/citations?user=bZwJ9oUAAAAJ;;;;", "orcid": ";0000-0001-5398-286X;;0009-0006-7859-463X;;;", "linkedin": ";;;;;;", "or_profile": "~Chentong_Wang1;~Yannan_Qu1;~Zhangzhi_Peng1;~Yukai_Wang1;zhuhongli@westlake.edu.cn;chendachuan@westlake.edu.cn;caolongxing@westlake.edu.cn", "aff": "Westlake University;Westlake University;Duke University;Westlake University;;;", "aff_domain": "westlake.edu.cn;westlake.edu.cn;duke.edu;westlake.edu.cn;;;", "position": "PhD student;PhD student;PhD student;PhD student;;;", "bibtex": "@inproceedings{\nwang2024proteus,\ntitle={Proteus: Exploring Protein Structure Generation for Enhanced Designability and Efficiency},\nauthor={Chentong Wang and Yannan Qu and Zhangzhi Peng and Yukai Wang and Hongli Zhu and Dachuan Chen and Longxing Cao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IckJCzsGVS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8547972, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8025432824950481687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "westlake.edu.cn;westlake.edu.cn;duke.edu;westlake.edu.cn;;;", "author_num": 7, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Westlake University;Duke University", "aff_unique_dep": ";", "aff_unique_url": "https://www.westlake.edu.cn;https://www.duke.edu", 
"aff_unique_abbr": "WU;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "A Neural-Guided Dynamic Symbolic Network for Exploring Mathematical Expressions from Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34421", "id": "IejxxE9DO2", "proceeding": "https://proceedings.mlr.press/v235/li24ap.html", "pdf": "https://openreview.net/pdf?id=IejxxE9DO2", "openreview": "https://openreview.net/forum?id=IejxxE9DO2", "author_site": "Wenqiang Li, Weijun Li, Lina Yu, Min Wu, Linjun Sun, Jingyi Liu, Yanjie Li, Shu Wei, Deng Yusong, Meilan Hao", "tldr": "", "abstract": "Symbolic regression (SR) is a powerful technique for discovering the underlying mathematical expressions from observed data. Inspired by the success of deep learning, recent deep generative SR methods have shown promising results. However, these methods face difficulties in processing high-dimensional problems and learning constants due to the large search space, and they don't scale well to unseen problems. In this work, we propose DySymNet, a novel neural-guided **Dy**namic **Sym**bolic **Net**work for SR. Instead of searching for expressions within a large search space, we explore symbolic networks with various structures, guided by reinforcement learning, and optimize them to identify expressions that better-fitting the data. Based on extensive numerical experiments on low-dimensional public standard benchmarks and the well-known SRBench with more variables, DySymNet shows clear superiority over several representative baseline models. Open source code is available at https://github.com/AILWQ/DySymNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenqiang Li;Weijun Li;Lina Yu;Min Wu;Linjun Sun;Jingyi Liu;Yanjie Li;Shu Wei;Deng Yusong;Meilan Hao", "authorids": "~Wenqiang_Li2;~Weijun_Li1;~Lina_Yu1;~Min_Wu5;~Linjun_Sun1;~Jingyi_Liu2;~Yanjie_Li4;~Shu_Wei1;~Deng_Yusong1;~Meilan_Hao1", "gender": "M;M;F;M;M;F;;;M;", "homepage": "https://github.com/AILWQ;;;http://lab.semi.ac.cn/ailab/;;;;;http://none.com;", "dblp": ";;;16/0;;;;;;", "google_scholar": "b-MGt8gAAAAJ;HrzfypUAAAAJ;I8Uc918AAAAJ;wvvyr8UAAAAJ;50WznDAAAAAJ;;;;;", "orcid": "0000-0003-3286-7445;0000-0001-9668-2883;;0000-0001-9475-3975;0000-0002-9287-9467;0000-0002-9710-5006;;0009-0009-8553-9665;;", "linkedin": ";;;;;;;;;", "or_profile": "~Wenqiang_Li2;~Weijun_Li1;~Lina_Yu1;~Min_Wu5;~Linjun_Sun1;~Jingyi_Liu2;~Yanjie_Li4;~Shu_Wei1;~Deng_Yusong1;~Meilan_Hao1", "aff": "University of Chinese Academy of Sciences;Institute of Semiconductors Chinese Academy of Sciences;Institute of Semiconductors, Chinese Academy of Sciences;Institute of Semiconductors, Chinese Academy of Sciences;Institute of Semiconductors, Chinese Academy of Sciences;University of Chinese Academy of Sciences;;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;", "aff_domain": "ucas.ac.cn;ucac.ac.cn;semi.ac.cn;semi.ac.cn;semi.ac.cn;ucas.edu;;ucas.edu.cn;ucas.ac.cn;", "position": "MS student;Full Professor;Associate Professor;Assistant Professor;Postdoc;PhD student;;PhD student;MS student;", "bibtex": "@inproceedings{\nli2024a,\ntitle={A Neural-Guided Dynamic Symbolic Network for Exploring Mathematical Expressions from Data},\nauthor={Wenqiang Li and Weijun Li and Lina Yu and Min Wu and Linjun Sun and Jingyi Liu and Yanjie Li and Shu Wei and Deng Yusong and Meilan Hao},\nbooktitle={Forty-first International 
Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IejxxE9DO2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 837297, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17522697127847063929&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "ucas.ac.cn;ucac.ac.cn;semi.ac.cn;semi.ac.cn;semi.ac.cn;ucas.edu;;ucas.edu.cn;ucas.ac.cn;", "author_num": 10, "aff_unique_index": "0;1;1;1;1;0;0;0", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Semiconductors", "aff_unique_url": "http://www.ucas.ac.cn;http://www.semi.ac.cn", "aff_unique_abbr": "UCAS;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Training-Free Long-Context Scaling of Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34420", "id": "If4xW9vF7U", "proceeding": "https://proceedings.mlr.press/v235/an24b.html", "pdf": "https://openreview.net/pdf?id=If4xW9vF7U", "openreview": "https://openreview.net/forum?id=If4xW9vF7U", "author_site": "Chenxin An, Fei Huang, Jun Zhang, Shansan Gong, Xipeng Qiu, Chang Zhou, Lingpeng Kong", "tldr": "", "abstract": "The ability of Large Language Models (LLMs) to process and generate coherent text is markedly weakened when the number of input tokens exceeds their pretraining length. Given the expensive overhead of finetuning large-scale models with longer sequences, we propose a training-free approach named Dual Chunk Attention (DCA), which enables Llama2 70B to support context windows of up to 100k tokens. By decomposing the attention computation for long sequences into chunk-based modules, DCA manages to effectively capture the relative positional information of tokens within the same chunk (Intra-Chunk) and across distinct chunks (Inter-Chunk), as well as integrates seamlessly with Flash Attention. In addition to its impressive extrapolation capability, DCA achieves performance on practical long-context tasks that is comparable to or even better than that of models built through continual training. 
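A toy rendering of the intra-/inter-chunk split described for DCA (our simplified indexing for illustration only; the authors' released implementation, linked just below, is authoritative): relative query-key distances are separated by whether both tokens share a chunk, and cross-chunk indices are capped so nothing exceeds the pretraining window.

```python
import numpy as np

def split_relative_positions(seq_len: int, chunk: int):
    pos = np.arange(seq_len)
    dist = pos[:, None] - pos[None, :]             # the causal part is dist >= 0
    same_chunk = (pos[:, None] // chunk) == (pos[None, :] // chunk)
    intra = np.where(same_chunk, dist, 0)          # small indices, reused per chunk
    inter = np.where(~same_chunk, np.minimum(dist, chunk - 1), 0)  # capped index
    return intra, inter

intra, inter = split_relative_positions(seq_len=8, chunk=4)
print(intra.max(), inter.max())                    # both stay below the chunk size
```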
All code and data used in this work are released at https://github.com/HKUNLP/ChunkLlama.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenxin An;Fei Huang;Jun Zhang;Shansan Gong;Xipeng Qiu;Chang Zhou;Lingpeng Kong", "authorids": "~Chenxin_An1;~Fei_Huang3;~Jun_Zhang27;~Shansan_Gong1;~Xipeng_Qiu1;~Chang_Zhou2;~Lingpeng_Kong1", "gender": "M;M;M;F;M;M;M", "homepage": "https://chenxinan-fdu.github.io/;;;https://summmeer.github.io/;https://xpqiu.github.io/;;https://ikekonglp.github.io/", "dblp": "289/7002;h/FeiHuang-5;;320/4745;69/1395;;144/7656", "google_scholar": "fY69CxIAAAAJ;https://scholar.google.com.hk/citations?user=7udAEzMAAAAJ;;F86VNoMAAAAJ;Pq4Yp_kAAAAJ;QeSoG3sAAAAJ;f1hBi5wAAAAJ", "orcid": ";;0000-0002-3152-5091;0000-0001-5028-2323;0000-0001-7163-5247;;", "linkedin": ";;;;;;", "or_profile": "~Chenxin_An1;~Fei_Huang3;~Jun_Zhang27;~Shansan_Gong1;~Xipeng_Qiu1;~Chang_Zhou2;~Lingpeng_Kong1", "aff": "University of Hong Kong;Alibaba Group;ByteDance;University of Hong Kong;Fudan University;Alibaba Group;Department of Computer Science, The University of Hong Kong", "aff_domain": "hku.hk;alibaba-inc.com;bytedance.com;hku.hk;fudan.edu.cn;alibaba-inc.com;cs.hku.hk", "position": "PhD student;Researcher;Researcher;PhD student;Full Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nan2024trainingfree,\ntitle={Training-Free Long-Context Scaling of Large Language Models},\nauthor={Chenxin An and Fei Huang and Jun Zhang and Shansan Gong and Xipeng Qiu and Chang Zhou and Lingpeng Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=If4xW9vF7U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1253975, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8117823954647570095&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "hku.hk;alibaba-inc.com;bytedance.com;hku.hk;fudan.edu.cn;alibaba-inc.com;cs.hku.hk", "author_num": 7, "aff_unique_index": "0;1;2;0;3;1;0", "aff_unique_norm": "University of Hong Kong;Alibaba Group;ByteDance;Fudan University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;https://www.alibaba.com;https://www.bytedance.com;https://www.fudan.edu.cn", "aff_unique_abbr": "HKU;Alibaba;ByteDance;Fudan", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Listwise Reward Estimation for Offline Preference-based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34419", "id": "If6Q9OYfoJ", "proceeding": "https://proceedings.mlr.press/v235/choi24b.html", "pdf": "https://openreview.net/pdf?id=If6Q9OYfoJ", "openreview": "https://openreview.net/forum?id=If6Q9OYfoJ", "author_site": "Heewoong Choi, Sangwon Jung, Hongjoon Ahn, Taesup Moon", "tldr": "", "abstract": "In Reinforcement Learning (RL), designing precise reward functions remains to be a challenge, particularly when aligning with human intent. Preference-based RL (PbRL) was introduced to address this problem by learning reward models from human feedback. However, existing PbRL methods have limitations as they often overlook the *second-order* preference that indicates the relative strength of preference. 
In this paper, we propose Listwise Reward Estimation (LiRE), a novel approach for offline PbRL that leverages second-order preference information by constructing a Ranked List of Trajectories (RLT), which can be efficiently built by using the same ternary feedback type as traditional methods. To validate the effectiveness of LiRE, we propose a new offline PbRL dataset that objectively reflects the effect of the estimated rewards. Our extensive experiments on the dataset demonstrate the superiority of LiRE, *i.e.,* outperforming state-of-the-art baselines even with modest feedback budgets and enjoying robustness with respect to the number of feedbacks and feedback noise. Our code is available at https://github.com/chwoong/LiRE", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Heewoong Choi;Sangwon Jung;Hongjoon Ahn;Taesup Moon", "authorids": "~Heewoong_Choi1;~Sangwon_Jung1;~Hongjoon_Ahn2;~Taesup_Moon1", "gender": ";M;M;", "homepage": "https://sites.google.com/view/chwoong/;https://successful-humor-4db.notion.site/Sangwon-Jung-70109a49767a470092a6ee0d02c78313;https://sites.google.com/view/hongjoon-ahn/;https://mindlab-snu.github.io/people/pi/", "dblp": ";236/3698;236/5812;05/4084", "google_scholar": ";WdC_a5IAAAAJ;uYaCitcAAAAJ;lQlioBoAAAAJ", "orcid": ";;;0000-0002-9257-6503", "linkedin": ";;;", "or_profile": "~Heewoong_Choi1;~Sangwon_Jung1;~Hongjoon_Ahn2;~Taesup_Moon1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "MS student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nchoi2024listwise,\ntitle={Listwise Reward Estimation for Offline Preference-based Reinforcement Learning},\nauthor={Heewoong Choi and Sangwon Jung and Hongjoon Ahn and Taesup Moon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=If6Q9OYfoJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 913926, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12125401158406486249&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Careful with that Scalpel: Improving Gradient Surgery with an EMA", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34418", "id": "IgwtflILyj", "proceeding": "https://proceedings.mlr.press/v235/hsieh24a.html", "pdf": "https://openreview.net/pdf?id=IgwtflILyj", "openreview": "https://openreview.net/forum?id=IgwtflILyj", "author_site": "Yu-Guan Hsieh, James Thornton, Eugene Ndiaye, Michal Klein, Marco Cuturi, Pierre Ablin", "tldr": "", "abstract": "Beyond minimizing a single training loss, many deep learning estimation pipelines rely on an auxiliary objective to quantify and encourage desirable properties of the model (e.g. performance on another dataset, robustness, agreement with a prior). 
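Returning to LiRE above: a Ranked List of Trajectories can be grown with the same ternary feedback used by pairwise methods. This insertion sketch is our reading of that construction, with `compare` a stand-in labeler returning -1/0/+1, placing each new trajectory by binary search over the current preference levels.

```python
def insert_into_rlt(rlt: list[list], traj, compare) -> None:
    """rlt is a list of groups, ordered from least to most preferred."""
    lo, hi = 0, len(rlt)
    while lo < hi:
        mid = (lo + hi) // 2
        c = compare(traj, rlt[mid][0])   # ternary human feedback: -1 / 0 / +1
        if c == 0:                       # equally preferred: join this group
            rlt[mid].append(traj)
            return
        lo, hi = (mid + 1, hi) if c > 0 else (lo, mid)
    rlt.insert(lo, [traj])               # open a new preference level

rlt: list[list] = []
for t in [3, 1, 2, 2]:                   # toy "trajectories" ranked by their value
    insert_into_rlt(rlt, t, lambda a, b: (a > b) - (a < b))
print(rlt)                               # [[1], [2, 2], [3]]
```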
Although the simplest approach to incorporating an auxiliary loss is to sum it with the training loss as a regularizer, recent works have shown that one can improve performance by blending the gradients beyond a simple sum; this is known as *gradient surgery*. We cast the problem as a constrained minimization problem where the auxiliary objective is minimized among the set of minimizers of the training loss. To solve this bilevel problem, we follow a parameter update direction that combines the training loss gradient and the orthogonal projection of the auxiliary gradient to the training gradient. In a setting where gradients come from mini-batches, we explain how, using a moving average of the training loss gradients, we can carefully maintain this critical orthogonality property. We demonstrate that our method, Bloop, can lead to much better performances on NLP and vision experiments than other gradient surgery methods without EMA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu-Guan Hsieh;James Thornton;Eugene Ndiaye;Michal Klein;marco cuturi;Pierre Ablin", "authorids": "~Yu-Guan_Hsieh1;~James_Thornton1;~Eugene_Ndiaye1;~Michal_Klein1;~marco_cuturi2;~Pierre_Ablin2", "gender": "M;;;M;M;M", "homepage": "https://www.cyber-meow.com/;https://jtt94.github.io/;;https://github.com/michalk8;http://marcocuturi.net;https://pierreablin.com/", "dblp": "228/6772;;;332/4607;85/5102;174/0980.html", "google_scholar": "I9lAMpEAAAAJ;oFZHOwgAAAAJ;;zByzdzcAAAAJ;https://scholar.google.fr/citations?user=kQEydDMAAAAJ;1ZsunaYAAAAJ", "orcid": ";;;0000-0002-2433-6380;;", "linkedin": ";;;michal-klein-148697165/;;", "or_profile": "~Yu-Guan_Hsieh1;~James_Thornton1;~Eugene_Ndiaye1;~Michal_Klein1;~marco_cuturi2;~Pierre_Ablin2", "aff": "Apple;Apple;;Apple;Ensae ParisTech;Apple", "aff_domain": "apple.com;apple.com;;apple.com;ensae.fr;apple.com", "position": "Postdoc;Researcher;;Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nhsieh2024careful,\ntitle={Careful with that Scalpel: Improving Gradient Surgery with an {EMA}},\nauthor={Yu-Guan Hsieh and James Thornton and Eugene Ndiaye and Michal Klein and marco cuturi and Pierre Ablin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IgwtflILyj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1960589, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:K6PjwE3-aZkJ:scholar.google.com/&scioq=Careful+with+that+Scalpel:+Improving+Gradient+Surgery+with+an+EMA&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "apple.com;apple.com;;apple.com;ensae.fr;apple.com", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Apple;ENSAE ParisTech", "aff_unique_dep": "Apple Inc.;", "aff_unique_url": "https://www.apple.com;https://www.ensae.fr", "aff_unique_abbr": "Apple;Ensae", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;France" }, { "title": "Pessimism Meets Risk: Risk-Sensitive Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34417", "id": "InUUQkExsw", "proceeding": "https://proceedings.mlr.press/v235/zhang24aq.html", "pdf": "https://openreview.net/pdf?id=InUUQkExsw", "openreview": "https://openreview.net/forum?id=InUUQkExsw", "author_site": "Dake Zhang, Boxiang Lyu, Shuang Qiu, 
Mladen Kolar, Tong Zhang", "tldr": "", "abstract": "We study risk-sensitive reinforcement learning (RL), a crucial field due to its ability to enhance decision-making in scenarios where it is essential to manage uncertainty and minimize potential adverse outcomes. Particularly, our work focuses on applying the entropic risk measure to RL problems. While existing literature primarily investigates the online setting, there remains a large gap in understanding how to efficiently derive a near-optimal policy based on this risk measure using only a pre-collected dataset. We center on the linear Markov Decision Process (MDP) setting, a well-regarded theoretical framework that has yet to be examined from a risk-sensitive standpoint. In response, we introduce two provably sample-efficient algorithms. We begin by presenting a risk-sensitive pessimistic value iteration algorithm, offering a tight analysis by leveraging the structure of the risk-sensitive performance measure. To further improve the obtained bounds, we propose another pessimistic algorithm that utilizes variance information and reference-advantage decomposition, effectively improving both the dependence on the space dimension $d$ and the risk-sensitivity factor. To the best of our knowledge, we obtain the first provably efficient risk-sensitive offline RL algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dake Zhang;Boxiang Lyu;Shuang Qiu;mladen kolar;Tong Zhang", "authorids": "~Dake_Zhang1;~Boxiang_Lyu1;~Shuang_Qiu2;~mladen_kolar1;~Tong_Zhang2", "gender": "M;M;M;;M", "homepage": ";;https://shq-ml.github.io/;https://mkolar.coffeejunkies.org/;http://tongzhang-ml.org", "dblp": ";312/6810;;08/7068;07/4227-1", "google_scholar": "ySAxFuEAAAAJ;Cwavd4EAAAAJ;-Z7fY00AAAAJ;https://scholar.google.com/citations?hl=en;LurWtuYAAAAJ", "orcid": ";;;;0000-0002-5511-2558", "linkedin": "dake-zhang-19t;boxiang-lyu-3206159b/;;;", "or_profile": "~Dake_Zhang1;~Boxiang_Lyu1;~Shuang_Qiu2;~mladen_kolar1;~Tong_Zhang2", "aff": "University of Chicago;Booth School of Business, University of Chicago;;University of Chicago;UIUC", "aff_domain": "uchicago.edu;chicagobooth.edu;;uchicago.edu;illinois.edu", "position": "PhD student;PhD student;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024pessimism,\ntitle={Pessimism Meets Risk: Risk-Sensitive Offline Reinforcement Learning},\nauthor={Dake Zhang and Boxiang Lyu and Shuang Qiu and mladen kolar and Tong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=InUUQkExsw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 621928, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4862526928213980465&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uchicago.edu;chicagobooth.edu;;uchicago.edu;illinois.edu", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Chicago;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.uchicago.edu;https://www.illinois.edu", "aff_unique_abbr": "UChicago;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bagged Deep Image Prior for Recovering Images in the Presence of Speckle Noise", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34416", "id": "IoUOhnCmlX", "proceeding": "https://proceedings.mlr.press/v235/chen24q.html", "pdf": "https://openreview.net/pdf?id=IoUOhnCmlX", "openreview": "https://openreview.net/forum?id=IoUOhnCmlX", "author_site": "Xi Chen, Zhewen Hou, Christopher Metzler, Arian Maleki, Shirin Jalali", "tldr": "", "abstract": "We investigate both the theoretical and algorithmic aspects of likelihood-based methods for recovering a complex-valued signal from multiple sets of measurements, referred to as looks, affected by speckle (multiplicative) noise. Our theoretical contributions include establishing the first existing theoretical upper bound on the Mean Squared Error (MSE) of the maximum likelihood estimator under the deep image prior hypothesis. Our theoretical results capture the dependence of MSE upon the number of parameters in the deep image prior, the number of looks, the signal dimension, and the number of measurements per look. On the algorithmic side, we introduce the concept of bagged Deep Image Priors (Bagged-DIP) and integrate them with projected gradient descent. Furthermore, we show how employing Newton-Schulz algorithm for calculating matrix inverses within the iterations of PGD reduces the computational complexity of the algorithm. We will show that this method achieves the state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xi Chen;Zhewen Hou;Christopher Metzler;Arian Maleki;Shirin Jalali", "authorids": "~Xi_Chen33;~Zhewen_Hou1;~Christopher_Metzler1;~Arian_Maleki1;~Shirin_Jalali1", "gender": ";M;M;M;F", "homepage": "https://xichen-97.github.io/;https://github.com/zwhou99;https://www.cs.umd.edu/~metzler/;https://sites.google.com/site/malekiarian/;https://sites.google.com/site/shirinjalali/", "dblp": ";;147/4828;27/2939;99/5024", "google_scholar": "VEvx_30AAAAJ;;on7GFpYAAAAJ;jUt50EcAAAAJ;", "orcid": "0000-0002-5116-7500;;;;", "linkedin": ";;;;", "or_profile": "~Xi_Chen33;~Zhewen_Hou1;~Christopher_Metzler1;~Arian_Maleki1;~Shirin_Jalali1", "aff": "Rutgers University, New Brunswick;Columbia University;University of Maryland, College Park;;Rutgers University", "aff_domain": "rutgers.edu;columbia.edu;umd.edu;;rutgers.edu", "position": "PhD student;PhD student;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nchen2024bagged,\ntitle={Bagged Deep Image Prior for Recovering Images in the Presence of Speckle Noise},\nauthor={Xi Chen and Zhewen Hou and Christopher Metzler and Arian Maleki and Shirin Jalali},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IoUOhnCmlX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3027525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9240568220697487546&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "rutgers.edu;columbia.edu;umd.edu;;rutgers.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Rutgers University;Columbia University;University of Maryland", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rutgers.edu;https://www.columbia.edu;https://www/umd.edu", "aff_unique_abbr": "Rutgers;Columbia;UMD", "aff_campus_unique_index": "0;2", "aff_campus_unique": "New Brunswick;;College Park", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning to Continually Learn with the 
Bayesian Principle", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34415", "id": "IpPnmhjw30", "proceeding": "https://proceedings.mlr.press/v235/lee24j.html", "pdf": "https://openreview.net/pdf?id=IpPnmhjw30", "openreview": "https://openreview.net/forum?id=IpPnmhjw30", "author_site": "Soochan Lee, Hyeonseong Jeon, Jaehyeon Son, Gunhee Kim", "tldr": "", "abstract": "In the present era of deep learning, continual learning research is mainly focused on mitigating forgetting when training a neural network with stochastic gradient descent on a non-stationary stream of data. On the other hand, in the more classical literature of statistical machine learning, many models have sequential Bayesian update rules that yield the same learning outcome as the batch training, i.e., they are completely immune to catastrophic forgetting. However, they are often overly simple to model complex real-world data. In this work, we adopt the meta-learning paradigm to combine the strong representational power of neural networks and simple statistical models' robustness to forgetting. In our novel meta-continual learning framework, continual learning takes place only in statistical models via ideal sequential Bayesian update rules, while neural networks are meta-learned to bridge the raw data and the statistical models. Since the neural networks remain fixed during continual learning, they are protected from catastrophic forgetting. This approach not only achieves significantly improved performance but also exhibits excellent scalability. Since our approach is domain-agnostic and model-agnostic, it can be applied to a wide range of problems and easily integrated with existing model architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Soochan Lee;Hyeonseong Jeon;Jaehyeon Son;Gunhee Kim", "authorids": "~Soochan_Lee1;~Hyeonseong_Jeon2;~Jaehyeon_Son1;~Gunhee_Kim1", "gender": "M;M;M;M", "homepage": "https://soochanlee.com;;https://jaehyeon-son.github.io/;http://vision.snu.ac.kr/gunhee/", "dblp": "230/1398;;359/3097.html;45/115", "google_scholar": "8O3MKJkAAAAJ;;q7SrBsgAAAAJ;https://scholar.google.co.kr/citations?user=CiSdOV0AAAAJ", "orcid": "0000-0002-1425-9262;;0009-0004-2726-1144;0000-0002-9543-7453", "linkedin": ";hs-jeon;jaehyeon-son-a626202b3/;", "or_profile": "~Soochan_Lee1;~Hyeonseong_Jeon2;~Jaehyeon_Son1;~Gunhee_Kim1", "aff": "Seoul National University;Seoul National University, Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;cse.snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;Undergrad student;Researcher;Full Professor", "bibtex": "@inproceedings{\nlee2024learning,\ntitle={Learning to Continually Learn with the Bayesian Principle},\nauthor={Soochan Lee and Hyeonseong Jeon and Jaehyeon Son and Gunhee Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IpPnmhjw30}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6998503, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9748569084233685887&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;cse.snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Seoul", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Reducing Fine-Tuning Memory Overhead by Approximate and Memory-Sharing Backpropagation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34414", "id": "IpSKpOY2EH", "proceeding": "https://proceedings.mlr.press/v235/yang24u.html", "pdf": "https://openreview.net/pdf?id=IpSKpOY2EH", "openreview": "https://openreview.net/forum?id=IpSKpOY2EH", "author_site": "Yuchen Yang, Yingdong Shi, Cheems Wang, Xiantong Zhen, Yuxuan Shi, Jun Xu", "tldr": "", "abstract": "Fine-tuning pretrained large models to downstream tasks is an important problem, which however suffers from huge memory overhead due to large-scale parameters. This work strives to reduce memory overhead in fine-tuning from perspectives of activation function and layer normalization. To this end, we propose the Approximate Backpropagation (Approx-BP) theory, which provides the theoretical feasibility of decoupling the forward and backward passes. We apply our Approx-BP theory to backpropagation training and derive memory-efficient alternatives of GELU and SiLU activation functions, which use derivative functions of ReLUs in the backward pass while keeping their forward pass unchanged. In addition, we introduce a Memory-Sharing Backpropagation strategy, which enables the activation memory to be shared by two adjacent layers, thereby removing activation memory usage redundancy. Our method neither induces extra computation nor reduces training efficiency. We conduct extensive experiments with pretrained vision and language models, and the results demonstrate that our proposal can reduce up to $\\sim$$30\\%$ of the peak memory usage. 
Our code is released at [github](https://github.com/yyyyychen/LowMemoryBP).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Yang;Yingdong Shi;Cheems Wang;Xiantong Zhen;Yuxuan Shi;Jun Xu", "authorids": "~Yuchen_Yang11;shiyd2023@shanghaitech.edu.cn;~Cheems_Wang1;~Xiantong_Zhen1;syxpop@outlook.com;~Jun_Xu3", "gender": ";;;M;;M", "homepage": ";;;;;https://csjunxu.github.io/", "dblp": ";;;78/10651;;", "google_scholar": ";;;https://scholar.google.ca/citations?user=DnBb3e0AAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2316-4820;;;;;", "linkedin": ";;;;;", "or_profile": "~Yuchen_Yang11;shiyd2023@shanghaitech.edu.cn;~Cheems_Wang1;~Xiantong_Zhen1;syxpop@outlook.com;~Jun_Xu3", "aff": "Nankai University;;;United Imaging Healthcare, Co., Ltd.;;Nankai University", "aff_domain": "nankai.edu.cn;;;cri-united-imaging.com;;nankai.edu.cn", "position": "MS student;;;Principal Researcher;;Associate Professor", "bibtex": "@inproceedings{\nyang2024reducing,\ntitle={Reducing Fine-Tuning Memory Overhead by Approximate and Memory-Sharing Backpropagation},\nauthor={Yuchen Yang and Yingdong Shi and Cheems Wang and Xiantong Zhen and Yuxuan Shi and Jun Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IpSKpOY2EH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 780954, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6943052980199447252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "nankai.edu.cn;;;cri-united-imaging.com;;nankai.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "Nankai University;United Imaging Healthcare", "aff_unique_dep": ";", "aff_unique_url": "http://www.nankai.edu.cn;https://www.united-imaging.com", "aff_unique_abbr": "NKU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Binary Decomposition: A Problem Transformation Perspective for Open-Set Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34413", "id": "Irkcamqg4d", "proceeding": "https://proceedings.mlr.press/v235/hang24a.html", "pdf": "https://openreview.net/pdf?id=Irkcamqg4d", "openreview": "https://openreview.net/forum?id=Irkcamqg4d", "author_site": "Jun-Yi Hang, Min-Ling Zhang", "tldr": "", "abstract": "Semi-supervised learning (SSL) is a classical machine learning paradigm dealing with labeled and unlabeled data. However, it often suffers performance degradation in real-world open-set scenarios, where unlabeled data contains outliers from novel categories that do not appear in labeled data. Existing studies commonly tackle this challenging open-set SSL problem with detect-and-filter strategy, which attempts to purify unlabeled data by detecting and filtering outliers. In this paper, we propose a novel binary decomposition strategy, which refrains from error-prone procedure of outlier detection by directly transforming the original open-set SSL problem into a number of standard binary SSL problems. Accordingly, a concise yet effective approach named BDMatch is presented. BDMatch confronts two attendant issues brought by binary decomposition, i.e. class-imbalance and representation-compromise, with adaptive logit adjustment and label-specific feature learning respectively. 
Comprehensive experiments on diversified benchmarks clearly validate the superiority of BDMatch as well as the effectiveness of our binary decomposition strategy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun-Yi Hang;Min-Ling Zhang", "authorids": "~Jun-Yi_Hang1;~Min-Ling_Zhang2", "gender": "M;M", "homepage": ";http://palm.seu.edu.cn/zhangml/", "dblp": "https://dblp.uni-trier.de/pid/299/4577;84/271.html", "google_scholar": "https://scholar.google.com.hk/citations?user=s-4VLP0AAAAJ;uFHCIM0AAAAJ", "orcid": ";0000-0003-1880-5918", "linkedin": ";", "or_profile": "~Jun-Yi_Hang1;~Min-Ling_Zhang2", "aff": "Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nhang2024binary,\ntitle={Binary Decomposition: A Problem Transformation Perspective for Open-Set Semi-Supervised Learning},\nauthor={Jun-Yi Hang and Min-Ling Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Irkcamqg4d}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5497966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14425431323692679051&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "seu.edu.cn;seu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Correlation-Induced Label Prior for Semi-Supervised Multi-Label Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34412", "id": "IuvpVcGUOB", "proceeding": "https://proceedings.mlr.press/v235/liu24bt.html", "pdf": "https://openreview.net/pdf?id=IuvpVcGUOB", "openreview": "https://openreview.net/forum?id=IuvpVcGUOB", "author_site": "Biao Liu, Ning Xu, Xiangyu Fang, Xin Geng", "tldr": "", "abstract": "Semi-supervised multi-label learning (SSMLL) aims to address the challenge of limited labeled data availability in multi-label learning (MLL) by leveraging unlabeled data to improve the model's performance. Due to the difficulty of reliably estimating the label correlation from minimal multi-labeled data, previous SSMLL methods fail to unleash the power of the correlation among multiple labels to improve the performance of the predictive model in SSMLL. To deal with this problem, we propose a novel SSMLL method named PCLP where the correlation-induced label prior is inferred to enhance the pseudo-labeling instead of crudely estimating the correlation among labels. Specifically, we construct the correlated label prior probability distribution using a structural causal model (SCM), constraining the correlations of generated pseudo-labels to conform to the prior, which can be integrated into a variational label enhancement framework optimized by both labeled and unlabeled instances in a unified manner. Theoretically, we demonstrate the accuracy of the generated pseudo-labels and guarantee the learning consistency of the proposed method. 
Comprehensive experiments on several benchmark datasets have validated the superiority of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Biao Liu;Ning Xu;Xiangyu Fang;Xin Geng", "authorids": "~Biao_Liu1;~Ning_Xu5;~Xiangyu_Fang1;~Xin_Geng1", "gender": "M;M;;M", "homepage": "http://palm.seu.edu.cn/homepage/liubiao/demo/demo/index.html;http://palm.seu.edu.cn/xuning/;https://palm.seu.edu.cn/homepage/fangxiangyu/index.html;http://palm.seu.edu.cn/xgeng/index.htm", "dblp": ";04/5856-9;;", "google_scholar": ";;;ZOCxkIcAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Biao_Liu1;~Ning_Xu5;~Xiangyu_Fang1;~Xin_Geng1", "aff": "Southeast University;Southeast University;Southeast University;Southeast University, China", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "PhD student;Associate Professor;MS student;Professor", "bibtex": "@inproceedings{\nliu2024correlationinduced,\ntitle={Correlation-Induced Label Prior for Semi-Supervised Multi-Label Learning},\nauthor={Biao Liu and Ning Xu and Xiangyu Fang and Xin Geng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IuvpVcGUOB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 418982, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ulc7rUgNZKAJ:scholar.google.com/&scioq=Correlation-Induced+Label+Prior+for+Semi-Supervised+Multi-Label+Learning&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Riemannian Preconditioned LoRA for Fine-Tuning Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34411", "id": "IwqE4QqBew", "proceeding": "https://proceedings.mlr.press/v235/zhang24ax.html", "pdf": "https://openreview.net/pdf?id=IwqE4QqBew", "openreview": "https://openreview.net/forum?id=IwqE4QqBew", "author_site": "Fangzhao Zhang, Mert Pilanci", "tldr": "", "abstract": "Low-Rank Adaptation (LoRA) has emerged as a popular parameter-efficient fine-tuning (PEFT) method, which freezes pretrained model weights and updates an additive low-rank trainable matrix. In this work, we study the enhancement of LoRA training by introducing an $r\\times r$ preconditioner in each gradient step, where $r$ is the LoRA rank. We theoretically verify that the proposed preconditioner stabilizes feature learning with LoRA under the infinite-width NN setting. Empirically, the implementation of this new preconditioner requires only a small change to existing optimizer code and incurs minuscule storage and runtime overhead. Our experimental results with both large language models and text-to-image diffusion models show that with this new preconditioner, the convergence and reliability of SGD and AdamW can be significantly enhanced. Moreover, the training process becomes much more robust to hyperparameter choices such as the learning rate. 
The new preconditioner can be derived from a novel Riemannian metric in the low-rank matrix field.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fangzhao Zhang;Mert Pilanci", "authorids": "~Fangzhao_Zhang1;~Mert_Pilanci3", "gender": ";M", "homepage": ";https://stanford.edu/~pilanci/", "dblp": ";45/8056", "google_scholar": ";aSAS-aAAAAAJ", "orcid": ";", "linkedin": ";mert-pilanci-ba615743/", "or_profile": "~Fangzhao_Zhang1;~Mert_Pilanci3", "aff": ";Stanford University", "aff_domain": ";stanford.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nzhang2024riemannian,\ntitle={Riemannian Preconditioned Lo{RA} for Fine-Tuning Foundation Models},\nauthor={Fangzhao Zhang and Mert Pilanci},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IwqE4QqBew}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9731624, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2061565311568501016&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";stanford.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Effects of Exponential Gaussian Distribution on (Double Sampling) Randomized Smoothing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34410", "id": "IxZ4xaHSYG", "proceeding": "https://proceedings.mlr.press/v235/shu24a.html", "pdf": "https://openreview.net/pdf?id=IxZ4xaHSYG", "openreview": "https://openreview.net/forum?id=IxZ4xaHSYG", "author_site": "Youwei Shu, Xi Xiao, Derui Wang, Yuxin Cao, Siji Chen, Minhui Xue, Linyi Li, Bo Li", "tldr": "", "abstract": "Randomized Smoothing (RS) is currently a scalable certified defense method providing robustness certification against adversarial examples. Although significant progress has been achieved in providing defenses against $\\ell_p$ adversaries, the interaction between the smoothing distribution and the robustness certification remains vague. In this work, we comprehensively study the effect of two families of distributions, named Exponential Standard Gaussian (ESG) and Exponential General Gaussian (EGG) distributions, on Randomized Smoothing and Double Sampling Randomized Smoothing (DSRS). We derive an analytic formula for ESG's certified radius, which converges to the original formula of RS as the dimension $d$ increases. Additionally, we prove that EGG can provide tighter constant factors than DSRS in providing $\\Omega(\\sqrt{d})$ lower bounds of $\\ell_2$ certified radius, and thus further addresses the curse of dimensionality in RS. Our experiments on real-world datasets confirm our theoretical analysis of the ESG distributions: they provide almost the same certification under different exponents $\\eta$ for both RS and DSRS. In addition, EGG brings a significant improvement to the DSRS certification, but the mechanism can differ when the classifier properties are different. 
Compared to the original DSRS, the increase in certified accuracy provided by EGG is prominent, up to 6.4% on ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youwei Shu;Xi Xiao;Derui Wang;Yuxin Cao;Siji Chen;Jason Xue;Linyi Li;Bo Li", "authorids": "~Youwei_Shu1;~Xi_Xiao1;~Derui_Wang1;~Yuxin_Cao1;~Siji_Chen2;~Jason_Xue1;~Linyi_Li1;~Bo_Li19", "gender": "M;M;;;M;;M;F", "homepage": "https://github.com/tdano1;https://www.sigs.tsinghua.edu.cn/xx_en/main.htm;;;https://github.com/LostDriver;;http://linyil.com;http://boli.cs.illinois.edu/", "dblp": ";;;151/7989;;;99/4340-1.html;50/3402-26", "google_scholar": ";;;https://scholar.google.com/citations?hl=en;;;-b0sk-YAAAAJ;K8vJkTcAAAAJ", "orcid": ";;;0009-0002-5766-0846;;;;", "linkedin": ";;;;;;;", "or_profile": "~Youwei_Shu1;~Xi_Xiao1;~Derui_Wang1;~Yuxin_Cao1;~Siji_Chen2;~Jason_Xue1;~Linyi_Li1;~Bo_Li19", "aff": "Tsinghua University;Shenzhen International Graduate School, Tsinghua University;;Tsinghua University;;;Simon Fraser University;University of Illinois, Urbana Champaign", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;;mails.tsinghua.edu.cn;;;sfu.ca;illinois.edu", "position": "MS student;Associate Professor;;MS student;;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nshu2024effects,\ntitle={Effects of Exponential Gaussian Distribution on (Double Sampling) Randomized Smoothing},\nauthor={Youwei Shu and Xi Xiao and Derui Wang and Yuxin Cao and Siji Chen and Jason Xue and Linyi Li and Bo Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IxZ4xaHSYG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8530296, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16264357449114557162&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;tsinghua.edu.cn;;mails.tsinghua.edu.cn;;;sfu.ca;illinois.edu", "author_num": 8, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Tsinghua University;Simon Fraser University;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sfu.ca;https://illinois.edu", "aff_unique_abbr": "THU;SFU;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Shenzhen;Urbana-Champaign", "aff_country_unique_index": "0;0;0;1;2", "aff_country_unique": "China;Canada;United States" }, { "title": "Total Variation Floodgate for Variable Importance Inference in Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34409", "id": "IyeXM58vIC", "proceeding": "https://proceedings.mlr.press/v235/wang24ad.html", "pdf": "https://openreview.net/pdf?id=IyeXM58vIC", "openreview": "https://openreview.net/forum?id=IyeXM58vIC", "author_site": "Wenshuo Wang, Lucas Janson, Lihua Lei, Aaditya Ramdas", "tldr": "", "abstract": "Inferring variable importance is the key goal of many scientific studies, where researchers seek to learn the effect of a feature $X$ on the outcome $Y$ in the presence of confounding variables $Z$. Focusing on classification problems, we define the expected total variation (ETV), which is an intuitive and deterministic measure of variable importance that does not rely on any model assumption. We then introduce algorithms for statistical inference on the ETV under design-based/model-X assumptions. 
We name our method Total Variation Floodgate in reference to its shared high-level structure with the Floodgate method of Zhang & Janson (2020). The algorithms we introduce can leverage any user-specified regression function and produce asymptotic lower confidence bounds for the ETV. We show the effectiveness of our algorithms with simulations and a case study in conjoint analysis on the US general election.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenshuo Wang;Lucas Janson;Lihua Lei;Aaditya Ramdas", "authorids": "~Wenshuo_Wang3;~Lucas_Janson2;~Lihua_Lei2;~Aaditya_Ramdas2", "gender": "M;;M;M", "homepage": "https://wenshuow.github.io/;http://lucasjanson.fas.harvard.edu/;https://lihualei71.github.io/;http://stat.cmu.edu/~aramdas", "dblp": "166/3792;131/6726;;117/3518", "google_scholar": "sej1cJcAAAAJ;Njlo7WAAAAAJ;https://scholar.google.co.uk/citations?user=-lKb3XwAAAAJ;ZvFaPxUAAAAJ", "orcid": "0000-0002-9896-0688;;;0000-0003-0497-311X", "linkedin": "wenshuo-wang1997/;;;", "or_profile": "~Wenshuo_Wang3;~Lucas_Janson2;~Lihua_Lei2;~Aaditya_Ramdas2", "aff": "Meta;Harvard University;;Carnegie Mellon University", "aff_domain": "meta.com;harvard.edu;;cmu.edu", "position": "Researcher;Associate Professor;;Assistant Professor", "bibtex": "@inproceedings{\nwang2024total,\ntitle={Total Variation Floodgate for Variable Importance Inference in Classification},\nauthor={Wenshuo Wang and Lucas Janson and Lihua Lei and Aaditya Ramdas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IyeXM58vIC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 432137, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16805140589758669093&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "meta.com;harvard.edu;;cmu.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Meta;Harvard University;Carnegie Mellon University", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.harvard.edu;https://www.cmu.edu", "aff_unique_abbr": "Meta;Harvard;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Privacy with Non-Private Pre-Processing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34408", "id": "IzqpUC34Jg", "proceeding": "https://proceedings.mlr.press/v235/hu24m.html", "pdf": "https://openreview.net/pdf?id=IzqpUC34Jg", "openreview": "https://openreview.net/forum?id=IzqpUC34Jg", "author_site": "Yaxi Hu, Amartya Sanyal, Bernhard Sch\u00f6lkopf", "tldr": "", "abstract": "When analyzing Differentially Private (DP) machine learning pipelines, the potential privacy cost of data-dependent pre-processing is frequently overlooked in privacy accounting. In this work, we propose a general framework to evaluate the additional privacy cost incurred by non-private data-dependent pre-processing algorithms. Our framework establishes upper bounds on the overall privacy guarantees by utilising two new technical notions: a variant of DP termed Smooth DP and the bounded sensitivity of the pre-processing algorithms. 
In addition to the generic framework, we provide explicit overall privacy guarantees for multiple data-dependent pre-processing algorithms, such as data imputation, quantization, deduplication, standard scaling and PCA, when used in combination with several DP algorithms. Notably, this framework is also simple to implement, allowing direct integration into existing DP pipelines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaxi Hu;Amartya Sanyal;Bernhard Sch\u00f6lkopf", "authorids": "~Yaxi_Hu1;~Amartya_Sanyal1;~Bernhard_Sch\u00f6lkopf1", "gender": "F;M;", "homepage": ";https://amartya18x.github.io;", "dblp": "322/1093.html;203/8807;", "google_scholar": ";;", "orcid": ";0000-0002-4190-0449;", "linkedin": "yaxi-hu-8910b5233/;;", "or_profile": "~Yaxi_Hu1;~Amartya_Sanyal1;~Bernhard_Sch\u00f6lkopf1", "aff": "Max Planck Institute for Intelligent Systems;Max-Planck Institute;", "aff_domain": "is.tuebingen.mpg.de;mpg.de;", "position": "PhD student;Postdoc;", "bibtex": "@inproceedings{\nhu2024provable,\ntitle={Provable Privacy with Non-Private Pre-Processing},\nauthor={Yaxi Hu and Amartya Sanyal and Bernhard Sch{\\\"o}lkopf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IzqpUC34Jg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 747602, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1924695727022353783&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "is.tuebingen.mpg.de;mpg.de;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.", "aff_unique_dep": "Intelligent Systems;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.mpg.de", "aff_unique_abbr": "MPI-IS;MPG", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Byzantine-Robust Federated Learning: Impact of Client Subsampling and Local Updates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34407", "id": "Izv7gBnap3", "proceeding": "https://proceedings.mlr.press/v235/allouah24a.html", "pdf": "https://openreview.net/pdf?id=Izv7gBnap3", "openreview": "https://openreview.net/forum?id=Izv7gBnap3", "author_site": "Youssef Allouah, Sadegh Farhadkhani, Rachid Guerraoui, Nirupam Gupta, Rafael Pinot, Geovani Rizk, Sasha Voitovych", "tldr": "", "abstract": "The possibility of adversarial (a.k.a., Byzantine) clients makes federated learning (FL) prone to arbitrary manipulation. The natural approach to robustify FL against adversarial clients is to replace the simple averaging operation at the server in the standard $\\mathsf{FedAvg}$ algorithm by a robust averaging rule. While a significant amount of work has been devoted to studying the convergence of federated robust averaging (which we denote by $\\mathsf{FedRo}$), prior work has largely ignored the impact of client subsampling and local steps, two fundamental FL characteristics. While client subsampling increases the effective fraction of Byzantine clients, local steps increase the drift between the local updates computed by honest (i.e., non-Byzantine) clients. Consequently, a careless deployment of $\\mathsf{FedRo}$ could yield poor performance. 
We validate this observation by presenting an in-depth analysis of $\mathsf{FedRo}$ that tightly characterizes the impact of client subsampling and local steps. Specifically, we present a sufficient condition on client subsampling for nearly-optimal convergence of $\mathsf{FedRo}$ (for smooth non-convex loss). Also, we show that the rate of improvement in learning accuracy diminishes with respect to the number of clients subsampled, as soon as the sample size exceeds a threshold value. Interestingly, we also observe that under a careful choice of step-sizes, the learning error due to Byzantine clients decreases with the number of local steps. We validate our theory by experiments on the FEMNIST and CIFAR-$10$ image classification tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youssef Allouah;Sadegh Farhadkhani;Rachid Guerraoui;Nirupam Gupta;Rafael Pinot;Geovani Rizk;Sasha Voitovych", "authorids": "~Youssef_Allouah1;~Sadegh_Farhadkhani1;~Rachid_Guerraoui1;~Nirupam_Gupta1;~Rafael_Pinot1;~Geovani_Rizk1;~Sasha_Voitovych1", "gender": "M;M;M;;;M;M", "homepage": "https://youssefallouah.com/;https://sadeghfarhadkhani.github.io/;https://lpdwww.epfl.ch/rachid/;;;;https://scholar.google.com/citations?user=YPoNM2gAAAAJ&hl=en&authuser=1&oi=ao", "dblp": "312/3936;281/6141;g/RachidGuerraoui;;;259/2889;", "google_scholar": "kVZu88cAAAAJ;X4axFjgAAAAJ;;;;;", "orcid": "0000-0003-1048-7548;;;;;;", "linkedin": ";;;;;;", "or_profile": "~Youssef_Allouah1;~Sadegh_Farhadkhani1;~Rachid_Guerraoui1;~Nirupam_Gupta1;~Rafael_Pinot1;~Geovani_Rizk1;~Sasha_Voitovych1", "aff": "Stanford University;EPFL;;;;EPFL - EPF Lausanne;Massachusetts Institute of Technology", "aff_domain": "stanford.edu;epfl.ch;;;;epfl.ch;mit.edu", "position": "Visiting student researcher;PhD student;;;;Postdoc;PhD student", "bibtex": "@inproceedings{\nallouah2024byzantinerobust,\ntitle={Byzantine-Robust Federated Learning: Impact of Client Subsampling and Local Updates},\nauthor={Youssef Allouah and Sadegh Farhadkhani and Rachid Guerraoui and Nirupam Gupta and Rafael Pinot and Geovani Rizk and Sasha Voitovych},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Izv7gBnap3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 652824, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12001960607375646143&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "stanford.edu;epfl.ch;;;;epfl.ch;mit.edu", "author_num": 7, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Stanford University;EPFL;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.epfl.ch;https://web.mit.edu", "aff_unique_abbr": "Stanford;EPFL;MIT", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Lausanne", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Neural operators meet conjugate gradients: The FCG-NO method for efficient PDE solving", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34406", "id": "J0ty1o7nCj", "proceeding": "https://proceedings.mlr.press/v235/rudikov24a.html", "pdf": "https://openreview.net/pdf?id=J0ty1o7nCj", "openreview": "https://openreview.net/forum?id=J0ty1o7nCj", "author_site": "Alexander Rudikov, Fanaskov Vladimir, Ekaterina Muravleva, Yuri Laevsky, Ivan Oseledets", 
"tldr": "", "abstract": "Deep learning solvers for partial differential equations typically have limited accuracy. We propose to overcome this problem by using them as preconditioners. More specifically, we apply discretization-invariant neural operators to learn preconditioners for the flexible conjugate gradient method (FCG). The architecture, paired with a novel loss function and training scheme, allows for learning efficient preconditioners that can be used across different resolutions. On the theoretical side, FCG theory allows us to safely use nonlinear preconditioners that can be applied in $O(N)$ operations without constraining the form of the preconditioner matrix. To justify learning scheme components (the loss function and the way training data is collected), we perform several ablation studies. Numerical results indicate that our approach compares favorably with classical preconditioners and allows preconditioners learned at lower resolutions to be reused on higher-resolution data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Rudikov;Vladimir Fanaskov;Ekaterina Muravleva;Yuri M. Laevsky;Ivan Oseledets", "authorids": "~Alexander_Rudikov1;~Vladimir_Fanaskov2;~Ekaterina_Muravleva1;~Yuri_M._Laevsky1;~Ivan_Oseledets1", "gender": ";F;M;M;M", "homepage": ";https://new.skoltech.ru/en/laboratories/ai-driven-modelling;https://icmmg.nsc.ru/ru/content/employees/laevskiy-yuriy-mironovich;http://oseledets.github.io;", "dblp": "338/9862.html;;;56/7175;", "google_scholar": "51rbId8AAAAJ;X6GQc34AAAAJ;;https://scholar.google.ru/citations?user=5kMqBQEAAAAJ;iK5gdo8AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Alexander_Rudikov1;~Ekaterina_Muravleva1;~Yuri_M._Laevsky1;~Ivan_Oseledets1;~Fanaskov_Vladimir1", "aff": "Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;Institute of Computational Mathematics and Mathematical Geophysics SB RAS;Institute of Numerical Mathematics;Skolkovo institute of science and technology", "aff_domain": "skoltech.ru;skoltech.ru;icmmg.nsc.ru;inm.ras.ru;skoltech.ru", "position": "Researcher;Assistant Professor;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nrudikov2024neural,\ntitle={Neural operators meet conjugate gradients: The {FCG}-{NO} method for efficient {PDE} solving},\nauthor={Alexander Rudikov and Vladimir Fanaskov and Ekaterina Muravleva and Yuri M. 
Laevsky and Ivan Oseledets},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J0ty1o7nCj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 591247, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16171931274241866637&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "skoltech.ru;skoltech.ru;icmmg.nsc.ru;inm.ras.ru;skoltech.ru", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Skolkovo Institute of Science and Technology;Siberian Branch of the Russian Academy of Sciences;Institute of Numerical Mathematics", "aff_unique_dep": ";Institute of Computational Mathematics and Mathematical Geophysics;", "aff_unique_url": "https://www.skoltech.ru;http://www.icmmb.sbras.ru;", "aff_unique_abbr": "Skoltech;SB RAS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Russian Federation;" }, { "title": "Accelerated Policy Gradient for s-rectangular Robust MDPs with Large State Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34405", "id": "J16WEPdqhJ", "proceeding": "https://proceedings.mlr.press/v235/chen24s.html", "pdf": "https://openreview.net/pdf?id=J16WEPdqhJ", "openreview": "https://openreview.net/forum?id=J16WEPdqhJ", "author_site": "Ziyi Chen, Heng Huang", "tldr": "", "abstract": "The robust Markov decision process (robust MDP) is an important machine learning framework for learning a reliable policy that is robust to environmental perturbation. Despite the empirical success and popularity of policy gradient methods, existing policy gradient methods require iteration complexity of at least $\\mathcal{O}(\\epsilon^{-4})$ to converge to the global optimal solution of s-rectangular robust MDPs with $\\epsilon$-accuracy, and are limited to the deterministic setting, with access to exact gradients and small state spaces, which is impractical in many applications. In this work, we propose an accelerated policy gradient algorithm with iteration complexity $\\mathcal{O}(\\epsilon^{-3}\\ln\\epsilon^{-1})$ in the deterministic setting using entropy regularization. Furthermore, we extend this algorithm to the stochastic setting, with access to only stochastic gradients and large state spaces, achieving sample complexity $\\mathcal{O}(\\epsilon^{-7}\\ln\\epsilon^{-1})$. 
Moreover, our algorithms are also the first scalable policy gradient methods for entropy-regularized robust MDPs, which constitute an important but underexplored machine learning framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyi Chen;Heng Huang", "authorids": "~Ziyi_Chen2;~Heng_Huang1", "gender": "M;M", "homepage": ";https://www.cs.umd.edu/~heng/", "dblp": "37/1439-2;03/281", "google_scholar": "zjSBVOIAAAAJ;4OqLaDwAAAAJ", "orcid": ";", "linkedin": "ziyi-chen-84616184/;", "or_profile": "~Ziyi_Chen2;~Heng_Huang1", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park", "aff_domain": "umd.edu;cs.umd.edu", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nchen2024accelerated,\ntitle={Accelerated Policy Gradient for s-rectangular Robust {MDP}s with Large State Spaces},\nauthor={Ziyi Chen and Heng Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J16WEPdqhJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 688215, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bkdDwPaDq04J:scholar.google.com/&scioq=Accelerated+Policy+Gradient+for+s-rectangular+Robust+MDPs+with+Large+State+Spaces&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "umd.edu;cs.umd.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.umd.edu;https://www.umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "PANDA: Expanded Width-Aware Message Passing Beyond Rewiring", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34404", "id": "J1NIXxiDbu", "proceeding": "https://proceedings.mlr.press/v235/choi24f.html", "pdf": "https://openreview.net/pdf?id=J1NIXxiDbu", "openreview": "https://openreview.net/forum?id=J1NIXxiDbu", "author_site": "Jeongwhan Choi, Sumin Parksumin, Hyowon Wi, Sung-Bae Cho, Noseong Park", "tldr": "", "abstract": "Recent research in the field of graph neural networks (GNNs) has identified a critical issue known as \"over-squashing,\" resulting from the bottleneck phenomenon in graph structures, which impedes the propagation of long-range information. Prior works have proposed a variety of graph rewiring concepts that aim at optimizing the spatial or spectral properties of graphs to promote signal propagation. However, such approaches inevitably deteriorate the original graph topology, which may lead to a distortion of information flow. To address this, we introduce an ex**pand**ed width-**a**ware (**PANDA**) message passing, a new message passing paradigm where nodes with high centrality, a potential source of over-squashing, are selectively expanded in width to encapsulate the growing influx of signals from distant nodes. 
Experimental results show that our method outperforms existing rewiring methods, suggesting that selectively expanding the hidden state of nodes can be a compelling alternative to graph rewiring for addressing over-squashing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeongwhan Choi;Sumin Park;Hyowon Wi;Sung-Bae Cho;Noseong Park", "authorids": "~Jeongwhan_Choi1;~Sumin_Park2;~Hyowon_Wi1;~Sung-Bae_Cho1;~Noseong_Park1", "gender": "M;F;;M;", "homepage": "https://www.jeongwhanchoi.com;https://suminizz.github.io/;;http://sclab.yonsei.ac.kr;", "dblp": "39/11215-2;;332/6458;88/2576;", "google_scholar": "3MNElkYAAAAJ;;https://scholar.google.com/citations?view_op=list_works;px5LGgMAAAAJ;", "orcid": "0000-0002-6530-2662;;;0000-0002-0185-1769;", "linkedin": "jeongwhanchoi/;;;;", "or_profile": "~Jeongwhan_Choi1;~Sumin_Park2;~Hyowon_Wi1;~Sung-Bae_Cho1;~Noseong_Park1", "aff": "Yonsei University;Yonsei University;Yonsei University;Yonsei University;", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;", "position": "PhD student;Intern;MS student;Full Professor;", "bibtex": "@inproceedings{\nchoi2024panda,\ntitle={{PANDA}: Expanded Width-Aware Message Passing Beyond Rewiring},\nauthor={Jeongwhan Choi and Sumin Park and Hyowon Wi and Sung-Bae Cho and Noseong Park},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J1NIXxiDbu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1642742, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9545105419262716085&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Learning Label Shift Correction for Test-Agnostic Long-Tailed Recognition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34403", "id": "J3xYTh6xtL", "proceeding": "https://proceedings.mlr.press/v235/wei24g.html", "pdf": "https://openreview.net/pdf?id=J3xYTh6xtL", "openreview": "https://openreview.net/forum?id=J3xYTh6xtL", "author_site": "Tong Wei, Zhen Mao, Zi-Hao Zhou, Yuanyu Wan, Min-Ling Zhang", "tldr": "", "abstract": "Long-tail learning primarily focuses on mitigating the label distribution shift between long-tailed training data and uniformly distributed test data. However, in real-world applications, we often encounter a more intricate challenge where the test label distribution is agnostic. To address this problem, we first theoretically establish the substantial potential for reducing the generalization error if we can precisely estimate the test label distribution. Motivated by the theoretical insight, we introduce a simple yet effective solution called label shift correction (LSC). LSC estimates the test label distribution within the proposed framework of generalized black box shift estimation, and adjusts the predictions from a pre-trained model to align with the test distribution. Theoretical analyses confirm that accurate estimation of the test label distribution can effectively reduce the generalization error. 
Extensive experimental results demonstrate that our method significantly outperforms previous state-of-the-art approaches, especially when confronted with non-uniform test label distributions. Notably, the proposed method is general and complements existing long-tail learning approaches, consistently improving their performance. The source code is available at https://github.com/Stomach-ache/label-shift-correction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tong Wei;Zhen Mao;Zi-Hao Zhou;Yuanyu Wan;Min-Ling Zhang", "authorids": "~Tong_Wei1;~Zhen_Mao2;~Zi-Hao_Zhou1;~Yuanyu_Wan1;~Min-Ling_Zhang2", "gender": "M;;M;M;", "homepage": "https://palm.seu.edu.cn/weit/;https://icanflyhigh.github.io;https://yuanyuwan.github.io/;http://palm.seu.edu.cn/zhangml/;https://github.com/zhouzihao11", "dblp": "49/933-1;;221/3499;84/271.html;", "google_scholar": "EFCZuW4AAAAJ;;CEymMc8AAAAJ;uFHCIM0AAAAJ;", "orcid": "0000-0002-2766-8209;;;0000-0003-1880-5918;", "linkedin": ";;;;", "or_profile": "~Tong_Wei1;~Zhen_Mao2;~Yuanyu_Wan1;~Min-Ling_Zhang2;~Zhou_Zihao1", "aff": "Southeast University;Southeast University;Zhejiang University;Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn;zju.edu.cn;seu.edu.cn;seu.edu.cn", "position": "Associate Professor;MS student;Researcher;Full Professor;MS student", "bibtex": "@inproceedings{\nwei2024learning,\ntitle={Learning Label Shift Correction for Test-Agnostic Long-Tailed Recognition},\nauthor={Tong Wei and Zhen Mao and Zi-Hao Zhou and Yuanyu Wan and Min-Ling Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J3xYTh6xtL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2555740, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13080086498775196290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "seu.edu.cn;seu.edu.cn;zju.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Southeast University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.seu.edu.cn/;https://www.zju.edu.cn", "aff_unique_abbr": "SEU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Clustered Federated Learning via Gradient-based Partitioning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34402", "id": "J4HJUF70qm", "proceeding": "https://proceedings.mlr.press/v235/kim24p.html", "pdf": "https://openreview.net/pdf?id=J4HJUF70qm", "openreview": "https://openreview.net/forum?id=J4HJUF70qm", "author_site": "Heasung Kim, Hyeji Kim, Gustavo De Veciana", "tldr": "", "abstract": "Clustered Federated Learning (CFL) is a promising distributed learning framework that addresses data heterogeneity issues across multiple clients by grouping clients and providing a shared generalized model for each group. However, under privacy-preserving federated learning protocols where there is no direct sharing of clients' local datasets, existing approaches often fail to find optimal client groupings, resulting in sub-optimal performance. In this paper, we propose a novel CFL algorithm that achieves robust clustering and learning performance. 
Conceptually, our algorithm groups clients that exhibit similarity in their model updates by periodically accumulating and clustering the gradients that clients compute for various models. The proposed algorithm is shown to achieve a near-optimal error rate for stochastic convergence to optimal models under mild conditions. We present a detailed analysis of the algorithm along with an evaluation on several CFL benchmarks demonstrating that it outperforms existing approaches in terms of convergence speed, clustering accuracy, and task performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Heasung Kim;Hyeji Kim;Gustavo De Veciana", "authorids": "~Heasung_Kim1;~Hyeji_Kim1;~Gustavo_De_Veciana2", "gender": "M;;M", "homepage": ";;https://www.ece.utexas.edu/~gustavo", "dblp": "236/2891;;v/GustavodeVeciana", "google_scholar": ";;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-3860-8962;;", "linkedin": "heasung-kim-994513180/;;", "or_profile": "~Heasung_Kim1;~Hyeji_Kim1;~Gustavo_De_Veciana2", "aff": "InterDigital Communications;;University of Texas, Austin", "aff_domain": "interdigital.com;;utexas.edu", "position": "Intern;;Full Professor", "bibtex": "@inproceedings{\nkim2024clustered,\ntitle={Clustered Federated Learning via Gradient-based Partitioning},\nauthor={Heasung Kim and Hyeji Kim and Gustavo De Veciana},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J4HJUF70qm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6121997, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2957362806426862445&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "interdigital.com;;utexas.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "InterDigital Communications;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.interdigital.com;https://www.utexas.edu", "aff_unique_abbr": ";UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Efficient Value Iteration for s-rectangular Robust Markov Decision Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34401", "id": "J4LTDgwAZq", "proceeding": "https://proceedings.mlr.press/v235/kumar24b.html", "pdf": "https://openreview.net/pdf?id=J4LTDgwAZq", "openreview": "https://openreview.net/forum?id=J4LTDgwAZq", "author_site": "Navdeep Kumar, Kaixin Wang, Kfir Levy, Shie Mannor", "tldr": "", "abstract": "We focus on s-rectangular robust Markov decision processes (MDPs), which capture interconnected uncertainties across different actions within each state. This framework is more general compared to sa-rectangular robust MDPs, where uncertainties in each action are independent. However, the introduced interdependence significantly amplifies the complexity of the problem. Existing methods either have slow performance guarantees or are inapplicable to even moderately large state spaces. In this work, we derive optimal robust Bellman operators in explicit forms. This leads to robust value iteration methods with significantly faster time complexities than existing approaches, which can be used in large state spaces. 
Further, our findings reveal that the optimal policies demonstrate a novel threshold behavior, selectively favoring a limited set of actions based on their respective advantage functions. Additionally, our study uncovers a noteworthy connection between the robustness of a policy and the variance in its value function, highlighting that policies with lower variance exhibit greater resilience.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Navdeep Kumar;Kaixin Wang;Kfir Yehuda Levy;Shie Mannor", "authorids": "~Navdeep_Kumar1;~Kaixin_Wang1;~Kfir_Yehuda_Levy1;~Shie_Mannor2", "gender": "M;M;M;M", "homepage": ";https://kaixin96.github.io;http://kfiryehud.wixsite.com/kfir-y-levy;https://shie.net.technion.ac.il", "dblp": ";;83/11388;20/1669", "google_scholar": ";https://scholar.google.com.sg/citations?hl=en;;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ", "orcid": ";0000-0001-8237-9285;;", "linkedin": "navdeepsjb/;;;", "or_profile": "~Navdeep_Kumar1;~Kaixin_Wang1;~Kfir_Yehuda_Levy1;~Shie_Mannor2", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "campus.technion.ac.il;campus.technion.ac.il;technion.ac.il;technion.il", "position": "PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nkumar2024efficient,\ntitle={Efficient Value Iteration for s-rectangular Robust Markov Decision Processes},\nauthor={Navdeep Kumar and Kaixin Wang and Kfir Yehuda Levy and Shie Mannor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J4LTDgwAZq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2014647, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16309007112277892995&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "campus.technion.ac.il;campus.technion.ac.il;technion.ac.il;technion.il", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Revisiting the Role of Language Priors in Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34400", "id": "J5VB1h3Aed", "proceeding": "https://proceedings.mlr.press/v235/lin24c.html", "pdf": "https://openreview.net/pdf?id=J5VB1h3Aed", "openreview": "https://openreview.net/forum?id=J5VB1h3Aed", "author_site": "Zhiqiu Lin, Xinyue Chen, Deepak Pathak, Pengchuan Zhang, Deva Ramanan", "tldr": "", "abstract": "Vision-language models (VLMs) are impactful in part because they can be applied to a variety of visual understanding tasks in a zero-shot fashion, without any fine-tuning. We study $\\textit{generative VLMs}$ that are trained for next-word generation given an image. We explore their zero-shot performance on the illustrative task of image-text retrieval across nine popular vision-language benchmarks. 
Our first observation is that they can be repurposed for discriminative tasks (such as image-text retrieval) by simply computing the match score of generating a particular text string given an image. We call this probabilistic score the Visual Generative Pre-Training Score (VisualGPTScore). While the VisualGPTScore produces near-perfect accuracy on some retrieval benchmarks, it yields poor accuracy on others. We analyze this behavior through a probabilistic lens, pointing out that some benchmarks inadvertently capture unnatural language distributions by creating adversarial but unlikely text captions. In fact, we demonstrate that even a \"blind\" language model that ignores any image evidence can sometimes outperform all prior art, reminiscent of similar challenges faced by the visual-question answering (VQA) community many years ago. We derive a probabilistic post-processing scheme that controls for the amount of linguistic bias in generative VLMs at test time without having to retrain or fine-tune the model. We show that the VisualGPTScore, when appropriately debiased, is a strong zero-shot baseline for vision-language understanding, oftentimes producing state-of-the-art accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiqiu Lin;Xinyue Chen;Deepak Pathak;Pengchuan Zhang;Deva Ramanan", "authorids": "~Zhiqiu_Lin1;~Xinyue_Chen5;~Deepak_Pathak1;~Pengchuan_Zhang1;~Deva_Ramanan1", "gender": "M;F;M;M;M", "homepage": "https://linzhiqiu.github.io;;https://www.cs.cmu.edu/~dpathak/;https://pzzhang.github.io/pzzhang/;https://www.cs.cmu.edu/~deva/", "dblp": "230/4394;;155/9860;;49/488", "google_scholar": "https://scholar.google.com/citations?hl=en;u1LwS0UAAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ;3VZ_E64AAAAJ;9B8PoXUAAAAJ", "orcid": ";;;;", "linkedin": "zhiqiu-lin-b49ba7126/;xinyue-chen-073a4114b/;pathak22/;;", "or_profile": "~Zhiqiu_Lin1;~Xinyue_Chen5;~Deepak_Pathak1;~Pengchuan_Zhang1;~Deva_Ramanan1", "aff": "Carnegie Mellon University;ByteDance Inc.;Carnegie Mellon University;;School of Computer Science, Carnegie Mellon University", "aff_domain": "cmu.edu;bytedance.com;cmu.edu;;cs.cmu.edu", "position": "PhD student;Researcher;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nlin2024revisiting,\ntitle={Revisiting the Role of Language Priors in Vision-Language Models},\nauthor={Zhiqiu Lin and Xinyue Chen and Deepak Pathak and Pengchuan Zhang and Deva Ramanan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J5VB1h3Aed}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8303534, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11731537112498758874&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cmu.edu;bytedance.com;cmu.edu;;cs.cmu.edu", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;ByteDance", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.bytedance.com", "aff_unique_abbr": "CMU;ByteDance", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Learning Mixtures of Gaussian Processes through Random Projection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34399", "id": "J5Yg7HMy39", "proceeding": 
"https://proceedings.mlr.press/v235/akeweje24a.html", "pdf": "https://openreview.net/pdf?id=J5Yg7HMy39", "openreview": "https://openreview.net/forum?id=J5Yg7HMy39", "author_site": "Emmanuel Akeweje, Mimi Zhang", "tldr": "", "abstract": "We propose an ensemble clustering framework to uncover latent cluster labels in functional data generated from a Gaussian process mixture. Our method exploits the fact that the projection coefficients of the functional data onto any given projection function follow a univariate Gaussian mixture model (GMM). By conducting multiple one-dimensional projections and learning a univariate GMM for each, we create an ensemble of GMMs. Each GMM serves as a base clustering, and applying ensemble clustering yields a consensus clustering. Our approach significantly reduces computational complexity compared to state-of-the-art methods, and we provide theoretical guarantees on the identifiability and learnability of Gaussian process mixtures. Extensive experiments on synthetic and real datasets confirm the superiority of our method over existing techniques.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emmanuel Akeweje;Mimi Zhang", "authorids": "~Emmanuel_Akeweje1;~Mimi_Zhang2", "gender": "M;F", "homepage": ";https://www.tcd.ie/scss/people/academic-staff/zhangm3/", "dblp": ";37/2847", "google_scholar": ";https://scholar.google.co.uk/citations?user=6NC-XagAAAAJ", "orcid": "0000-0002-1513-623X;0000-0002-3807-297X", "linkedin": ";", "or_profile": "~Emmanuel_Akeweje1;~Mimi_Zhang2", "aff": "University of Dublin, Trinity College;University of Dublin, Trinity College", "aff_domain": "tcd.ie;tcd.ie", "position": "Researcher;Assistant Professor", "bibtex": "@inproceedings{\nakeweje2024learning,\ntitle={Learning Mixtures of Gaussian Processes through Random Projection},\nauthor={Emmanuel Akeweje and Mimi Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J5Yg7HMy39}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8143629, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SrCOInhZHXEJ:scholar.google.com/&scioq=Learning+Mixtures+of+Gaussian+Processes+through+Random+Projection&hl=en&as_sdt=0,48", "gs_version_total": 4, "email": "tcd.ie;tcd.ie", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Trinity College Dublin", "aff_unique_dep": "", "aff_unique_url": "https://www.tcd.ie", "aff_unique_abbr": "TCD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Ireland" }, { "title": "Feature Attribution with Necessity and Sufficiency via Dual-stage Perturbation Test for Causal Explanation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34398", "id": "J6prHJsIlf", "proceeding": "https://proceedings.mlr.press/v235/chen24d.html", "pdf": "https://openreview.net/pdf?id=J6prHJsIlf", "openreview": "https://openreview.net/forum?id=J6prHJsIlf", "author_site": "Xuexin Chen, Ruichu Cai, Zhengting Huang, Yuxuan Zhu, Julien Horwood, Zhifeng Hao, Zijian Li, Jose Miguel Hernandez-Lobato", "tldr": "", "abstract": "We investigate the problem of explainability for machine learning models, focusing on Feature Attribution Methods (FAMs) that evaluate feature importance through perturbation tests. 
Despite their utility, FAMs struggle to distinguish the contributions of different features when their prediction changes are similar after perturbation. To enhance FAMs' discriminative power, we introduce Feature Attribution with Necessity and Sufficiency (FANS), which finds a neighborhood of the input such that perturbing samples within this neighborhood has a high Probability of Necessity and Sufficiency (PNS) of causing the change in predictions, and uses this PNS as the importance of the feature. Specifically, FANS computes this PNS via a heuristic strategy for estimating the neighborhood and a perturbation test involving two stages (factual and interventional) for counterfactual reasoning. To generate counterfactual samples, we use a resampling-based approach on the observed samples to approximate the required conditional distribution. We demonstrate that FANS outperforms existing attribution methods on six benchmarks. The source code is available at https://github.com/DMIRLAB-Group/FANS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuexin Chen;Ruichu Cai;ZhengTingHuang;Yuxuan Zhu;Julien Horwood;Zhifeng Hao;Zijian Li;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "~Xuexin_Chen1;~Ruichu_Cai1;~ZhengTingHuang1;~Yuxuan_Zhu2;~Julien_Horwood1;~Zhifeng_Hao4;~Zijian_Li1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", "gender": "M;M;F;M;;M;M;", "homepage": ";https://ruichucai.github.io/;https://www.zhihu.com/people/71827c3f6e8a7a83e7f7cb23f651e4ad;;;https://www.stu.edu.cn/xxgk/dzld1/hzf.htm;;http://jmhl.org", "dblp": "226/9631.html;09/6889;;146/0939-1.html;;;27/10487;40/6058", "google_scholar": ";https://scholar.google.com/citations?hl=en;;iqf_6DYAAAAJ;https://scholar.google.ca/citations?user=1q-mRKAAAAAJ;ZF3gp9wAAAAJ;j3ilESoAAAAJ;BEBccCQAAAAJ", "orcid": ";;;0000-0003-1831-9688;;;;0000-0001-7610-949X", "linkedin": ";;;yuxuan-zhu-62602334b/;;;;", "or_profile": "~Xuexin_Chen1;~Ruichu_Cai1;~ZhengTingHuang1;~Yuxuan_Zhu2;~Julien_Horwood1;~Zhifeng_Hao4;~Zijian_Li1;~Jose_Miguel_Hernandez_Lobato1", "aff": "Guangdong University of Technology;Guangdong University of Technology;;Guangdong University of Technology;University of Cambridge;Shantou University;Mohamed bin Zayed University of Artificial Intelligence;University of Cambridge", "aff_domain": "gdut.edu.cn;gdut.edu.cn;;gdut.edu.cn;cam.ac.uk;stu.edu.cn;mbzuai.ac.ae;cam.ac.uk", "position": "PhD student;Full Professor;;MS student;PhD student;Full Professor;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nchen2024feature,\ntitle={Feature Attribution with Necessity and Sufficiency via Dual-stage Perturbation Test for Causal Explanation},\nauthor={Xuexin Chen and Ruichu Cai and ZhengTingHuang and Yuxuan Zhu and Julien Horwood and Zhifeng Hao and Zijian Li and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J6prHJsIlf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 758725, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12089191532860362751&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "gdut.edu.cn;gdut.edu.cn;;gdut.edu.cn;cam.ac.uk;stu.edu.cn;mbzuai.ac.ae;cam.ac.uk", "author_num": 8, "aff_unique_index": "0;0;0;1;2;3;1", "aff_unique_norm": "Guangdong University of Technology;University of Cambridge;Shantou University;Mohamed bin Zayed University of
Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.gdut.edu.cn;https://www.cam.ac.uk;https://www.stu.edu.cn;https://mbzuai.ac.ae", "aff_unique_abbr": "GDUT;Cambridge;STU;MBZUAI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1;0;2;1", "aff_country_unique": "China;United Kingdom;United Arab Emirates" }, { "title": "Improving Sharpness-Aware Minimization by Lookahead", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34397", "id": "J9YKDvqr65", "proceeding": "https://proceedings.mlr.press/v235/yu24q.html", "pdf": "https://openreview.net/pdf?id=J9YKDvqr65", "openreview": "https://openreview.net/forum?id=J9YKDvqr65", "author_site": "Runsheng Yu, Youzhi Zhang, James Kwok", "tldr": "", "abstract": "Sharpness-Aware Minimization (SAM), which performs gradient descent on adversarially perturbed weights, can improve generalization by identifying flatter minima. However, recent studies have shown that SAM may suffer from convergence instability and oscillate around saddle points, resulting in slow convergence and inferior performance. To address this problem, we propose the use of a lookahead mechanism to gather more information about the landscape by looking further ahead, and thus find a better trajectory to converge. By examining the nature of SAM, we simplify the extrapolation procedure, resulting in a more efficient algorithm. Theoretical results show that the proposed method converges to a stationary point and is less prone to saddle points. Experiments on standard benchmark datasets also verify that the proposed method outperforms the SOTAs, and converge more effectively to flat minima.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Runsheng Yu;Youzhi Zhang;James Kwok", "authorids": "~Runsheng_Yu2;~Youzhi_Zhang2;~James_Kwok1", "gender": "Not Specified;;", "homepage": "https://www.linkedin.com/in/runsheng-yu-560696127/;https://youzhi333.github.io/index.html;", "dblp": "210/2646.html?q=runsheng%20yu;131/9490-1;", "google_scholar": ";i2j5DmwAAAAJ;", "orcid": "0000-0003-0053-1234;0000-0002-2984-734X;", "linkedin": ";;", "or_profile": "~Runsheng_Yu2;~Youzhi_Zhang2;~James_Kwok1", "aff": "Hong Kong University of Science and Technology;Centre for Artificial Intelligence and Robotics, Hong Kong Institute of Science & Innovation, Chinese Academy of Sciences;", "aff_domain": "ust.hk;cair-cas.org.hk;", "position": "PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nyu2024improving,\ntitle={Improving Sharpness-Aware Minimization by Lookahead},\nauthor={Runsheng Yu and Youzhi Zhang and James Kwok},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=J9YKDvqr65}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3110021, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GR-_SdZuvSsJ:scholar.google.com/&scioq=Improving+Sharpness-Aware+Minimization+by+Lookahead&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "ust.hk;cair-cas.org.hk;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Hong Kong Institute of Science & Innovation, Chinese Academy of Sciences", "aff_unique_dep": ";Centre for Artificial Intelligence and Robotics", "aff_unique_url": "https://www.ust.hk;", "aff_unique_abbr": "HKUST;", 
"aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Understanding Inter-Concept Relationships in Concept-Based Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34396", "id": "JA6ThxAmth", "proceeding": "https://proceedings.mlr.press/v235/raman24a.html", "pdf": "https://openreview.net/pdf?id=JA6ThxAmth", "openreview": "https://openreview.net/forum?id=JA6ThxAmth", "author_site": "Naveen Raman, Mateo Espinosa Zarlenga, Mateja Jamnik", "tldr": "", "abstract": "Concept-based explainability methods provide insight into deep learning systems by constructing explanations using human-understandable concepts. While the literature on human reasoning demonstrates that we exploit relationships between concepts when solving tasks, it is unclear whether concept-based methods incorporate the rich structure of inter-concept relationships. We analyse the concept representations learnt by concept-based models to understand whether these models correctly capture inter-concept relationships. First, we empirically demonstrate that state-of-the-art concept-based models produce representations that lack stability and robustness, and such methods fail to capture inter-concept relationships. Then, we develop a novel algorithm which leverages inter-concept relationships to improve concept intervention accuracy, demonstrating how correctly capturing inter-concept relationships can improve downstream tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Naveen Janaki Raman;Mateo Espinosa Zarlenga;Mateja Jamnik", "authorids": "~Naveen_Janaki_Raman1;~Mateo_Espinosa_Zarlenga1;~Mateja_Jamnik1", "gender": ";M;F", "homepage": "https://naveenraman.com;https://mateoespinosa.github.io/;http://www.cl.cam.ac.uk/~mj201", "dblp": "220/3385;307/3045.html;41/1392", "google_scholar": "vHALcwMAAAAJ;4ikoEiMAAAAJ;d5QiyJkAAAAJ", "orcid": ";;0000-0003-2772-2532", "linkedin": "naveen-raman/;mateoespinosa/;", "or_profile": "~Naveen_Janaki_Raman1;~Mateo_Espinosa_Zarlenga1;~Mateja_Jamnik1", "aff": "Carnegie Mellon University;University of Cambridge;University of Cambridge", "aff_domain": "cmu.edu;cam.ac.uk;cam.ac.uk", "position": "PhD student;PhD student;Professor in Artificial Intelligence", "bibtex": "@inproceedings{\nraman2024understanding,\ntitle={Understanding Inter-Concept Relationships in Concept-Based Models},\nauthor={Naveen Janaki Raman and Mateo Espinosa Zarlenga and Mateja Jamnik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JA6ThxAmth}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1777169, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9129520207181570171&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "cmu.edu;cam.ac.uk;cam.ac.uk", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Carnegie Mellon University;University of Cambridge", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.cam.ac.uk", "aff_unique_abbr": "CMU;Cambridge", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Mimicking Better by Matching the Approximate Action Distribution", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34395", "id": "JAfIDm7NED", "proceeding": "https://proceedings.mlr.press/v235/candido-ramos24a.html", "pdf": "https://openreview.net/pdf?id=JAfIDm7NED", "openreview": "https://openreview.net/forum?id=JAfIDm7NED", "author_site": "Joao A. Candido Ramos, Lionel Blond\u00e9, Naoya Takeishi, Alexandros Kalousis", "tldr": "", "abstract": "In this paper, we introduce MAAD, a novel, sample-efficient on-policy algorithm for Imitation Learning from Observations. MAAD utilizes a surrogate reward signal, which can be derived from various sources such as adversarial games, trajectory matching objectives, or optimal transport criteria. To compensate for the non-availability of expert actions, we rely on an inverse dynamics model that infers plausible actions distribution given the expert\u2019s state-state transitions; we regularize the imitator\u2019s policy by aligning it to the inferred action distribution. MAAD leads to significantly improved sample efficiency and stability. We demonstrate its effectiveness in a number of MuJoCo environments, both int the OpenAI Gym and the DeepMind Control Suite. We show that it requires considerable fewer interactions to achieve expert performance, outperforming current state-of-the-art on-policy methods. Remarkably, MAAD often stands out as the sole method capable of attaining expert performance levels, underscoring its simplicity and efficacy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joao Candido Ramos;Lionel Blond\u00e9;Naoya Takeishi;Alexandros Kalousis", "authorids": "~Joao_Candido_Ramos1;~Lionel_Blond\u00e91;~Naoya_Takeishi1;~Alexandros_Kalousis1", "gender": "M;M;;M", "homepage": "http://dmml.ch/joao-candido-ramos/;;https://ntake.jp/;http://dmml.ch/alexandros-kalousis/", "dblp": "295/8628;;143/0393;68/6004", "google_scholar": "5dke1C0AAAAJ;H-_PEWcAAAAJ;https://scholar.google.co.jp/citations?user=rqF9bAsAAAAJ;uVkn9UEAAAAJ", "orcid": "0000-0001-9693-469X;;0000-0003-0111-2269;", "linkedin": ";;;", "or_profile": "~Joao_Candido_Ramos1;~Lionel_Blond\u00e91;~Naoya_Takeishi1;~Alexandros_Kalousis1", "aff": "HES-SO : UAS Western Switzerland;Geneva School of Business Administration, HES-SO University of Applied Sciences of Western Switzerland;;The University of Tokyo;University of Applied Sciences Western Switzerland", "aff_domain": "hes-so.ch;hesge.ch;u-tokyo.ac.jp;hesge.ch", "position": "PhD student;Postdoc;Lecturer;Full Professor", "bibtex": "@inproceedings{\nramos2024mimicking,\ntitle={Mimicking Better by Matching the Approximate Action Distribution},\nauthor={Joao Candido Ramos and Lionel Blond{\\'e} and Naoya Takeishi and Alexandros Kalousis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JAfIDm7NED}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4900845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3590963060457250507&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "hes-so.ch;hesge.ch;u-tokyo.ac.jp;hesge.ch", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Applied Sciences Western Switzerland;HES-SO University of Applied Sciences of Western Switzerland;University of Tokyo", "aff_unique_dep": ";School of Business Administration;", "aff_unique_url": "https://www.hes-so.ch;https://www.hes-so.ch/en;https://www.u-tokyo.ac.jp", "aff_unique_abbr": 
"HES-SO;HES-SO;UTokyo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Geneva", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Switzerland;Japan" }, { "title": "Quantum Algorithm for Online Exp-concave Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34394", "id": "JApt4Ty89Y", "proceeding": "https://proceedings.mlr.press/v235/he24g.html", "pdf": "https://openreview.net/pdf?id=JApt4Ty89Y", "openreview": "https://openreview.net/forum?id=JApt4Ty89Y", "author_site": "Jianhao He, Chengchang Liu, Xutong Liu, Lvzhou Li, John C.S. Lui", "tldr": "", "abstract": "We explore whether quantum advantages can be found for the zeroth-order feedback online exp-concave optimization problem, which is also known as bandit exp-concave optimization with multi-point feedback. We present quantum online quasi-Newton methods to tackle the problem and show that there exists quantum advantages for such problems. Our method approximates the Hessian by quantum estimated inexact gradient and can achieve $O(n\\log T)$ regret with $O(1)$ queries at each round, where $n$ is the dimension of the decision set and $T$ is the total decision rounds. Such regret improves the optimal classical algorithm by a factor of $T^{2/3}$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianhao He;Chengchang Liu;Xutong Liu;Lvzhou Li;John C.S. Lui", "authorids": "~Jianhao_He1;~Chengchang_Liu1;~Xutong_Liu1;lilvzh@mail.sysu.edu.cn;~John_C.S._Lui2", "gender": "M;;M;;M", "homepage": ";https://7ccliu.github.io;https://xutongliu.me/;;http://www.cse.cuhk.edu.hk/~cslui/Index.html", "dblp": "271/4370;291/5180;70/3372-2;;l/JohnCSLui", "google_scholar": "fvdQ0agAAAAJ;jmrbA5wAAAAJ;KNfY6BIAAAAJ;;https://scholar.google.com.tw/citations?user=7LVjQ7MAAAAJ", "orcid": "0000-0002-3201-0137;0009-0003-6552-4892;0000-0002-8628-5873;;0000-0001-7466-0384", "linkedin": ";;;;", "or_profile": "~Jianhao_He1;~Chengchang_Liu1;~Xutong_Liu1;lilvzh@mail.sysu.edu.cn;~John_C.S._Lui2", "aff": "Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong;The Chinese University of Hong Kong;;The Chinese University of Hong Kong", "aff_domain": "cuhk.hk;cse.cuhk.edu.hk;cuhk.edu.hk;;cse.cuhk.edu.hk", "position": "Postdoc;PhD student;Postdoc;;Full Professor", "bibtex": "@inproceedings{\nhe2024quantum,\ntitle={Quantum Algorithm for Online Exp-concave Optimization},\nauthor={Jianhao He and Chengchang Liu and Xutong Liu and Lvzhou Li and John C.S. 
Lui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JApt4Ty89Y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 372135, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11934134891458818717&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cuhk.hk;cse.cuhk.edu.hk;cuhk.edu.hk;;cse.cuhk.edu.hk", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Towards Understanding the Word Sensitivity of Attention Layers: A Study via Random Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34393", "id": "JBaPBPrn93", "proceeding": "https://proceedings.mlr.press/v235/bombari24b.html", "pdf": "https://openreview.net/pdf?id=JBaPBPrn93", "openreview": "https://openreview.net/forum?id=JBaPBPrn93", "author_site": "Simone Bombari, Marco Mondelli", "tldr": "", "abstract": "Understanding the reasons behind the exceptional success of transformers requires a better analysis of why attention layers are suitable for NLP tasks. In particular, such tasks require predictive models to capture contextual meaning which often depends on one or few words, even if the sentence is long. Our work studies this key property, dubbed _word sensitivity_ (WS), in the prototypical setting of random features. We show that attention layers enjoy high WS, namely, there exists a vector in the space of embeddings that largely perturbs the random attention features map. The argument critically exploits the role of the softmax in the attention layer, highlighting its benefit compared to other activations (e.g., ReLU). In contrast, the WS of standard random features is of order $1/\\sqrt{n}$, $n$ being the number of words in the textual sample, and thus it decays with the length of the context. We then translate these results on the word sensitivity into generalization bounds: due to their low WS, random features provably cannot learn to distinguish between two sentences that differ only in a single word; in contrast, due to their high WS, random attention features have higher generalization capabilities. 
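The random-features-versus-attention contrast drawn in "Towards Understanding the Word Sensitivity of Attention Layers" above is easy to probe numerically: perturb a single word embedding and compare the relative change of the two feature maps. The toy setup below is our own simplification (dimensions, scalings, and the attention parameterization are illustrative, not the paper's construction).

```python
# Toy word-sensitivity probe: perturb one of n word embeddings.
import numpy as np

rng = np.random.default_rng(0)
n, d = 64, 32                                   # words per sample, embedding dim
X = rng.normal(size=(n, d)) / np.sqrt(d)
W = rng.normal(size=(d, d))                     # shared random weights

def random_features(X):                         # standard ReLU random features
    return np.maximum(X @ W, 0.0).ravel()

def attention_features(X):                      # softmax(X W X^T) X
    scores = X @ W @ X.T
    A = np.exp(scores - scores.max(axis=1, keepdims=True))
    A /= A.sum(axis=1, keepdims=True)
    return (A @ X).ravel()

Xp = X.copy()
Xp[0] += 5.0 * rng.normal(size=d) / np.sqrt(d)  # perturb a single word

for name, phi in [("random features", random_features),
                  ("attention features", attention_features)]:
    rel = np.linalg.norm(phi(Xp) - phi(X)) / np.linalg.norm(phi(X))
    print(f"{name}: relative change {rel:.3f}")
```

Note the abstract's claim concerns the worst-case perturbation, so a random perturbation like this one only lower-bounds the word sensitivity; the $1/\sqrt{n}$ decay for plain random features still becomes visible as $n$ grows.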
We validate our theoretical results with experimental evidence over the BERT-Base word embeddings of the IMDb review dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simone Bombari;Marco Mondelli", "authorids": "~Simone_Bombari1;~Marco_Mondelli1", "gender": "Not Specified;M", "homepage": "https://simone-bombari.github.io/;http://marcomondelli.com", "dblp": "317/4969;120/7089", "google_scholar": ";BHdSb5AAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Simone_Bombari1;~Marco_Mondelli1", "aff": "Institute of Science and Technology;Institute of Science and Technology", "aff_domain": "ist.ac.at;ist.ac.at", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbombari2024towards,\ntitle={Towards Understanding the Word Sensitivity of Attention Layers: A Study via Random Features},\nauthor={Simone Bombari and Marco Mondelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JBaPBPrn93}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1049617, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4697740765811106744&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ist.ac.at;ist.ac.at", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "title": "Coarse-to-Fine Highlighting: Reducing Knowledge Hallucination in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34392", "id": "JCG0KTPVYy", "proceeding": "https://proceedings.mlr.press/v235/lv24c.html", "pdf": "https://openreview.net/pdf?id=JCG0KTPVYy", "openreview": "https://openreview.net/forum?id=JCG0KTPVYy", "author_site": "Qitan Lv, Jie Wang, Hanzhu Chen, Bin Li, Yongdong Zhang, Feng Wu", "tldr": "", "abstract": "Generation of plausible but incorrect factual information, often termed hallucination, has attracted significant research interest. Retrieval-augmented language model (RALM)---which enhances models with up-to-date knowledge---emerges as a promising method to reduce hallucination. However, existing RALMs may instead exacerbate hallucination when retrieving lengthy contexts. To address this challenge, we propose COFT, a novel **CO**arse-to-**F**ine highligh**T**ing method to focus on different granularity-level key texts, thereby avoiding getting lost in lengthy contexts. Specifically, COFT consists of three components: *recaller*, *scorer*, and *selector*. First, *recaller* applies a knowledge graph to extract potential key entities in a given context. Second, *scorer* measures the importance of each entity by calculating its contextual weight. Finally, *selector* selects high contextual weight entities with a dynamic threshold algorithm and highlights the corresponding paragraphs, sentences, or words in a coarse-to-fine manner. Extensive experiments on the knowledge hallucination benchmark demonstrate the effectiveness of COFT, leading to a performance gain of over 30% in the F1 score metric.
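To fix the data flow of COFT's three components just described, here is a toy scorer/selector; the knowledge-graph recaller, the real contextual-weight computation, and the paper's dynamic threshold algorithm are all replaced by deliberately naive stand-ins of our own.

```python
# Toy selector: keep entities whose (given) contextual weight clears a
# dynamic threshold, then highlight their mentions in the context.
def select_key_entities(entity_weights, ratio=0.5):
    """entity_weights: dict entity -> contextual weight (stand-in scorer output).
    Stand-in dynamic threshold: a fraction of the maximum weight."""
    if not entity_weights:
        return []
    threshold = ratio * max(entity_weights.values())
    return sorted((e for e, w in entity_weights.items() if w >= threshold),
                  key=entity_weights.get, reverse=True)

def highlight(context, entities, marker="**"):
    """Fine-grained highlighting: mark selected entity mentions in place."""
    for e in entities:
        context = context.replace(e, f"{marker}{e}{marker}")
    return context

weights = {"Marie Curie": 0.9, "Warsaw": 0.6, "physics": 0.2}
print(highlight("Marie Curie was born in Warsaw.", select_key_entities(weights)))
```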
Moreover, COFT also exhibits remarkable versatility across various long-form tasks, such as reading comprehension and question answering.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qitan Lv;Jie Wang;Hanzhu Chen;Bin Li;Yongdong Zhang;Feng Wu", "authorids": "~Qitan_Lv1;~Jie_Wang1;~Hanzhu_Chen1;~Bin_Li8;~Yongdong_Zhang2;~Feng_Wu1", "gender": "M;M;;M;M;M", "homepage": "https://scholar.google.com/citations?hl=zh-CN&user=7yDqr3oAAAAJ;http://staff.ustc.edu.cn/~jwangx;;http://staff.ustc.edu.cn/~binli;https://imcc.ustc.edu.cn/_upload/tpl/0d/13/3347/template3347/zhangyongdong.html;", "dblp": "357/3270.html;29/5259-5;;89/6764-25;z/YongdongZhang;25/3972-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;OugG4dUAAAAJ;;;https://scholar.google.com.hk/citations?user=hxGs4ukAAAAJ;5bInRDEAAAAJ", "orcid": ";;;0000-0002-2332-3959;0000-0003-0066-3448;", "linkedin": ";;;;;", "or_profile": "~Qitan_Lv1;~Jie_Wang1;~Hanzhu_Chen1;~Bin_Li8;~Yongdong_Zhang2;~Feng_Wu1", "aff": "University of Science and Technology of China;University of Science and Technology of China;;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;Full Professor;;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlv2024coarsetofine,\ntitle={Coarse-to-Fine Highlighting: Reducing Knowledge Hallucination in Large Language Models},\nauthor={Qitan Lv and Jie Wang and Hanzhu Chen and Bin Li and Yongdong Zhang and Feng Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JCG0KTPVYy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2518302, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6909341133348297095&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;ustc.edu.cn;;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Federated Optimization with Doubly Regularized Drift Correction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34391", "id": "JD03zxWZzs", "proceeding": "https://proceedings.mlr.press/v235/jiang24e.html", "pdf": "https://openreview.net/pdf?id=JD03zxWZzs", "openreview": "https://openreview.net/forum?id=JD03zxWZzs", "author_site": "Xiaowen Jiang, Anton Rodomanov, Sebastian Stich", "tldr": "", "abstract": "Federated learning is a distributed optimization paradigm that allows training machine learning models across decentralized devices while keeping the data localized. The standard method, FedAvg, suffers from client drift which can hamper performance and increase communication costs over centralized methods. Previous works proposed various strategies to mitigate drift, yet none have shown consistently improved communication-computation trade-offs over vanilla gradient descent across all standard function classes. In this work, we revisit DANE, an established method in distributed optimization. 
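Since DANE anchors this abstract's development, it may help to recall its local subproblem (Shamir et al., 2014), stated here in its simplest unit-step form: each client $i$ solves a gradient-corrected, proximally regularized local objective,

```latex
% DANE local step at round t, client i; \mu controls the proximal term
x_i^{t+1} \approx \operatorname*{arg\,min}_{x}\; f_i(x)
  - \big\langle \nabla f_i(x^{t}) - \nabla f(x^{t}),\, x \big\rangle
  + \frac{\mu}{2}\,\lVert x - x^{t} \rVert^{2}
```

The inner-product term swaps the client-specific gradient for the global one at $x^t$, which is precisely a drift correction; per the abstract, DANE+ relaxes the exactness of this inner solve and the aggregation rule, and FedRed alters the regularization (the "doubly regularized" correction of the title).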
We show that (i) DANE can achieve the desired communication reduction under Hessian similarity constraints. Furthermore, (ii) we present an extension, DANE+, which supports arbitrary inexact local solvers and has more freedom to choose how to aggregate the local updates. We propose (iii) a novel method, FedRed, which has improved local computational complexity and retains the same communication complexity compared to DANE/DANE+. This is achieved by doubly regularized drift correction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaowen Jiang;Anton Rodomanov;Sebastian U Stich", "authorids": "~Xiaowen_Jiang1;~Anton_Rodomanov1;~Sebastian_U_Stich1", "gender": "M;;M", "homepage": ";;https://www.sstich.ch", "dblp": "192/3782-3;153/5453;04/10549", "google_scholar": "https://scholar.google.com/citations?hl=en;u95GRZQAAAAJ;https://scholar.google.ch/citations?user=8l-mDfQAAAAJ", "orcid": ";;", "linkedin": "xiaowen-jiang-65570b222/;;", "or_profile": "~Xiaowen_Jiang1;~Anton_Rodomanov1;~Sebastian_U_Stich1", "aff": "CISPA Helmholtz Center for Information Security;CISPA;CISPA Helmholtz Center for Information Security", "aff_domain": "cispa.de;cispa.de;cispa.de", "position": "PhD student;Postdoc;Tenure Track Faculty", "bibtex": "@inproceedings{\njiang2024federated,\ntitle={Federated Optimization with Doubly Regularized Drift Correction},\nauthor={Xiaowen Jiang and Anton Rodomanov and Sebastian U Stich},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JD03zxWZzs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1386841, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8782389476570394449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cispa.de;cispa.de;cispa.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "CISPA Helmholtz Center for Information Security", "aff_unique_dep": "", "aff_unique_url": "https://www.cispa.de/", "aff_unique_abbr": "CISPA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "MFTN: A Multi-scale Feature Transfer Network Based on IMatchFormer for Hyperspectral Image Super-Resolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34390", "id": "JGL39NaARS", "proceeding": "https://proceedings.mlr.press/v235/huang24s.html", "pdf": "https://openreview.net/pdf?id=JGL39NaARS", "openreview": "https://openreview.net/forum?id=JGL39NaARS", "author_site": "Shuying Huang, Mingyang Ren, Yong Yang, Xiaozheng Wang, Yingzhi Wei", "tldr": "", "abstract": "Hyperspectral image super-resolution (HISR) aims to fuse a low-resolution hyperspectral image (LR-HSI) with a high-resolution multispectral image (HR-MSI) to obtain a high-resolution hyperspectral image (HR-HSI). Due to some existing HISR methods ignoring the significant feature difference between LR-HSI and HR-MSI, the reconstructed HR-HSI typically exhibits spectral distortion and blurring of spatial texture. To solve this issue, we propose a multi-scale feature transfer network (MFTN) for HISR. Firstly, three multi-scale feature extractors are constructed to extract features of different scales from the input images. 
Then, a multi-scale feature transfer module (MFTM) consisting of three improved feature matching Transformers (IMatchFormers) is designed to learn the detail features of different scales from HR-MSI by establishing the cross-model feature correlation between LR-HSI and degraded HR-MSI. Finally, a multi-scale dynamic aggregation module (MDAM) containing three spectral-aware aggregation modules (SAAMs) is constructed to reconstruct the final HR-HSI by gradually aggregating features of different scales. Extensive experimental results on three commonly used datasets demonstrate that the proposed model achieves better performance compared to state-of-the-art (SOTA) methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuying Huang;Mingyang Ren;Yong Yang;Xiaozheng Wang;Yingzhi Wei", "authorids": "~Shuying_Huang2;renmingyang_9@163.com;~Yong_Yang5;xiaozhengwang95@gmail.com;~Yingzhi_Wei1", "gender": "F;;M;;M", "homepage": "https://cs.tiangong.edu.cn/2017/0321/c1889a25245/page.htm;;https://xxxy.tiangong.edu.cn/2021/0830/c3266a69877/page.htm;;https://github.com/HandsomeCcat", "dblp": "04/925;;11/357-1;;", "google_scholar": ";;;;", "orcid": "0000-0002-7845-2221;;;;", "linkedin": ";;;;", "or_profile": "~Shuying_Huang2;renmingyang_9@163.com;~Yong_Yang5;xiaozhengwang95@gmail.com;~Yingzhi_Wei1", "aff": "Tiangong University;;Tiangong University;;tiangong university", "aff_domain": "tiangong.edu.cn;;tiangong.edu.cn;;tiangong.edu.cn", "position": "Full Professor;;Full Professor;;Undergrad student", "bibtex": "@inproceedings{\nhuang2024mftn,\ntitle={{MFTN}: A Multi-scale Feature Transfer Network Based on {IM}atchFormer for Hyperspectral Image Super-Resolution},\nauthor={Shuying Huang and Mingyang Ren and Yong Yang and Xiaozheng Wang and Yingzhi Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JGL39NaARS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1415332, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FNiu5adfqQEJ:scholar.google.com/&scioq=MFTN:+A+Multi-scale+Feature+Transfer+Network+Based+on+IMatchFormer+for+Hyperspectral+Image+Super-Resolution&hl=en&as_sdt=0,14", "gs_version_total": 5, "email": "tiangong.edu.cn;;tiangong.edu.cn;;tiangong.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tiangong University", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Concentration Inequalities for General Functions of Heavy-Tailed Random Variables", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34389", "id": "JHRvP84SQ5", "proceeding": "https://proceedings.mlr.press/v235/li24au.html", "pdf": "https://openreview.net/pdf?id=JHRvP84SQ5", "openreview": "https://openreview.net/forum?id=JHRvP84SQ5", "author_site": "Shaojie Li, Yong Liu", "tldr": "", "abstract": "Concentration inequalities play an essential role in the study of machine learning and high dimensional statistics. In this paper, we obtain unbounded analogues of the popular bounded difference inequality for functions of independent random variables with heavy-tailed distributions. The main results provide a general framework applicable to all heavy-tailed distributions with finite variance.
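For orientation, the classical bounded difference (McDiarmid) inequality being generalized here states that if replacing any single coordinate changes $f$ by at most $c_i$, then

```latex
% Bounded difference (McDiarmid) inequality -- the bounded baseline
\Pr\big( \lvert f(X_1,\dots,X_n) - \mathbb{E} f \rvert \ge t \big)
  \le 2\exp\!\left( \frac{-2t^{2}}{\sum_{i=1}^{n} c_i^{2}} \right)
```

The paper's unbounded analogues drop the uniform bounds $c_i$ in favor of moment conditions, which is what lets the statement survive for sub-exponential, sub-Weibull, and polynomially decaying tails with finite variance.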
To illustrate the strength of our results, we present applications to sub-exponential tails, sub-Weibull tails, and heavier polynomially decaying tails. Applied to some standard problems in statistical learning theory (vector valued concentration, Rademacher complexity, and algorithmic stability), we show that these inequalities allow an extension of existing results to heavy-tailed distributions up to finite variance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaojie Li;Yong Liu", "authorids": "~Shaojie_Li2;~Yong_Liu7", "gender": "M;M", "homepage": ";https://iie-liuyong.github.io", "dblp": ";29/4867-18", "google_scholar": ";vVhmzbAAAAAJ", "orcid": ";0000-0002-6739-621X", "linkedin": ";", "or_profile": "~Shaojie_Li2;~Yong_Liu7", "aff": "Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nli2024concentration,\ntitle={Concentration Inequalities for General Functions of Heavy-Tailed Random Variables},\nauthor={Shaojie Li and Yong Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JHRvP84SQ5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 354565, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wBzTjEuoOroJ:scholar.google.com/&scioq=Concentration+Inequalities+for+General+Functions+of+Heavy-Tailed+Random+Variables&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "ruc.edu.cn;ruc.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Do Large Language Models Perform the Way People Expect? Measuring the Human Generalization Function", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34388", "id": "JIWtKcR78C", "proceeding": "https://proceedings.mlr.press/v235/vafa24a.html", "pdf": "https://openreview.net/pdf?id=JIWtKcR78C", "openreview": "https://openreview.net/forum?id=JIWtKcR78C", "author_site": "Keyon Vafa, Ashesh Rambachan, Sendhil Mullainathan", "tldr": "", "abstract": "What makes large language models (LLMs) impressive is also what makes them hard to evaluate: their diversity of uses. To evaluate these models, we must understand the purposes they will be used for. We consider a setting where these deployment decisions are made by people, and in particular, people's beliefs about where an LLM will perform well. We model such beliefs as the consequence of a human generalization function: having seen what an LLM gets right or wrong, people generalize to where else it might succeed. We collect a dataset of 19K examples of how humans make generalizations across 79 tasks from the MMLU and BIG-Bench benchmarks. We show that the human generalization function can be predicted using NLP methods: people have consistent structured ways to generalize. We then evaluate LLM alignment with the human generalization function. Our results show that -- especially for cases where the cost of mistakes is high -- more capable models (e.g. 
GPT-4) can do worse on the instances people choose to use them for, exactly because they are not aligned with the human generalization function.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Keyon Vafa;Ashesh Rambachan;Sendhil Mullainathan", "authorids": "~Keyon_Vafa1;~Ashesh_Rambachan1;~Sendhil_Mullainathan2", "gender": "M;M;M", "homepage": "http://www.keyonvafa.com;https://asheshrambachan.github.io/;https://www.chicagobooth.edu/faculty/directory/m/sendhil-mullainathan", "dblp": ";249/2625;25/169", "google_scholar": ";https://scholar.google.com/citations?hl=en;oExfyEkAAAAJ", "orcid": ";;", "linkedin": ";ashesh-rambachan/;", "or_profile": "~Keyon_Vafa1;~Ashesh_Rambachan1;~Sendhil_Mullainathan2", "aff": "Columbia University;Massachusetts Institute of Technology;University of Chicago", "aff_domain": "columbia.edu;mit.edu;uchicago.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nvafa2024do,\ntitle={Do Large Language Models Perform the Way People Expect? Measuring the Human Generalization Function},\nauthor={Keyon Vafa and Ashesh Rambachan and Sendhil Mullainathan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JIWtKcR78C}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 776845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9515454514712984348&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "columbia.edu;mit.edu;uchicago.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Columbia University;Massachusetts Institute of Technology;University of Chicago", "aff_unique_dep": ";;", "aff_unique_url": "https://www.columbia.edu;https://web.mit.edu;https://www.uchicago.edu", "aff_unique_abbr": "Columbia;MIT;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Box Facets and Cut Facets of Lifted Multicut Polytopes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34387", "id": "JJSj8UXqd4", "proceeding": "https://proceedings.mlr.press/v235/naumann24a.html", "pdf": "https://openreview.net/pdf?id=JJSj8UXqd4", "openreview": "https://openreview.net/forum?id=JJSj8UXqd4", "author_site": "Lucas Fabian Naumann, Jannik Irmai, Shengxian Zhao, Bjoern Andres", "tldr": "", "abstract": "The lifted multicut problem has diverse applications in the field of computer vision. Exact algorithms based on linear programming require an understanding of lifted multicut polytopes. Despite recent progress, two fundamental questions about these polytopes have remained open: Which lower box inequalities define facets, and which cut inequalities define facets? In this article, we answer the first question by establishing conditions that are necessary, sufficient and efficiently decidable. Toward the second question, we show that deciding facet-definingness of cut inequalities is NP-hard. 
This completes the analysis of canonical facets of lifted multicut polytopes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lucas Fabian Naumann;Jannik Irmai;Shengxian Zhao;Bjoern Andres", "authorids": "~Lucas_Fabian_Naumann1;~Jannik_Irmai1;~Shengxian_Zhao1;~Bjoern_Andres6", "gender": ";M;M;", "homepage": ";https://mlcv.inf.tu-dresden.de/group-irmai-jannik.html;https://mlcv.inf.tu-dresden.de/group-zhao-shengxian.html;", "dblp": ";;;", "google_scholar": ";7Wd_H6cAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Lucas_Fabian_Naumann1;~Jannik_Irmai1;~Shengxian_Zhao1;~Bjoern_Andres6", "aff": ";Technische Universit\u00e4t Dresden;Technische Universit\u00e4t Dresden;", "aff_domain": ";tu-dresden.de;tu-dresden.de;", "position": ";PhD student;PhD student;", "bibtex": "@inproceedings{\nnaumann2024box,\ntitle={Box Facets and Cut Facets of Lifted Multicut Polytopes},\nauthor={Lucas Fabian Naumann and Jannik Irmai and Shengxian Zhao and Bjoern Andres},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JJSj8UXqd4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 378155, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a-xDbQ2VTwgJ:scholar.google.com/&scioq=Box+Facets+and+Cut+Facets+of+Lifted+Multicut+Polytopes&hl=en&as_sdt=0,48", "gs_version_total": 5, "email": ";tu-dresden.de;tu-dresden.de;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Dresden", "aff_unique_dep": "", "aff_unique_url": "https://tu-dresden.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Stable Differentiable Causal Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34386", "id": "JJZBZW28Gn", "proceeding": "https://proceedings.mlr.press/v235/nazaret24a.html", "pdf": "https://openreview.net/pdf?id=JJZBZW28Gn", "openreview": "https://openreview.net/forum?id=JJZBZW28Gn", "author_site": "Achille Nazaret, Justin Hong, Elham Azizi, David Blei", "tldr": "", "abstract": "Inferring causal relationships as directed acyclic graphs (DAGs) is an important but challenging problem. Differentiable Causal Discovery (DCD) is a promising approach to this problem, framing the search as a continuous optimization. But existing DCD methods are numerically unstable, with poor performance beyond tens of variables. In this paper, we propose Stable Differentiable Causal Discovery (SDCD), a new method that improves previous DCD methods in two ways: (1) It employs an alternative constraint for acyclicity; this constraint is more stable, both theoretically and empirically, and fast to compute. (2) It uses a training procedure tailored for sparse causal graphs, which are common in real-world scenarios. We first derive SDCD and prove its stability and correctness. We then evaluate it with both observational and interventional data and in both small-scale and large-scale settings. 
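For context on point (1) of the SDCD abstract: the acyclicity constraint most differentiable causal discovery methods penalize is the NOTEARS trace-of-matrix-exponential below, which is exactly the kind of numerically unstable term the abstract refers to. SDCD's own alternative constraint is defined in the paper; the power-iteration spectral penalty in this sketch is only our illustration of a cheaper, more stable style of penalty, not SDCD's definition.

```python
# Differentiable acyclicity penalties over a weighted adjacency matrix W.
import torch

def notears_constraint(W):
    """h(W) = tr(exp(W*W)) - d, zero iff the weighted graph is acyclic.
    The matrix exponential is O(d^3) and can overflow for dense W."""
    d = W.shape[0]
    return torch.trace(torch.matrix_exp(W * W)) - d

def spectral_penalty(W, iters=20):
    """Approximate spectral radius of W*W by power iteration: cheap per step
    and numerically tame; it vanishes for acyclic (nilpotent) weighted graphs."""
    A = W * W
    v = torch.ones(A.shape[0], 1)
    for _ in range(iters):
        v = A @ v
        v = v / (v.norm() + 1e-12)
    return (A @ v).norm()
```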
We find that SDCD outperforms existing methods in convergence speed and accuracy, and can scale to thousands of variables.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Achille Nazaret;Justin Hong;Elham Azizi;David Blei", "authorids": "~Achille_Nazaret1;~Justin_Hong1;~Elham_Azizi1;~David_Blei2", "gender": "Not Specified;M;F;M", "homepage": ";;http://azizilab.com;http://www.cs.columbia.edu/~blei/", "dblp": "241/4984;;;86/1910", "google_scholar": "iDExfWQAAAAJ;MB9_v3cAAAAJ;3LD7KeIAAAAJ;https://scholar.google.com.tw/citations?user=8OYE6iEAAAAJ", "orcid": ";;0000-0001-5059-6971;", "linkedin": ";;;", "or_profile": "~Achille_Nazaret1;~Justin_Hong1;~Elham_Azizi1;~David_Blei2", "aff": "Columbia University;Columbia University;Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;columbia.edu;columbia.edu", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nnazaret2024stable,\ntitle={Stable Differentiable Causal Discovery},\nauthor={Achille Nazaret and Justin Hong and Elham Azizi and David Blei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JJZBZW28Gn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1456317, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2331167405506160223&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "columbia.edu;columbia.edu;columbia.edu;columbia.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Prodigy: An Expeditiously Adaptive Parameter-Free Learner", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34385", "id": "JJpOssn0uP", "proceeding": "https://proceedings.mlr.press/v235/mishchenko24a.html", "pdf": "https://openreview.net/pdf?id=JJpOssn0uP", "openreview": "https://openreview.net/forum?id=JJpOssn0uP", "author_site": "Konstantin Mishchenko, Aaron Defazio", "tldr": "", "abstract": "We consider the problem of estimating the learning rate in adaptive methods, such as AdaGrad and Adam. We propose Prodigy, an algorithm that provably estimates the distance to the solution $D$, which is needed to set the learning rate optimally. At its core, Prodigy is a modification of the D-Adaptation method for learning-rate-free learning. It improves upon the convergence rate of D-Adaptation by a factor of $\\mathcal{O}(\\sqrt{\\log(D/d_0)})$, where $d_0$ is the initial estimate of $D$. We test Prodigy on 12 common logistic-regression benchmark datasets, VGG11 and ResNet-50 training on CIFAR10, ViT training on Imagenet, LSTM training on IWSLT14, DLRM training on Criteo dataset, VarNet on Knee MRI dataset, as well as RoBERTa and GPT transformer training on BookWiki. 
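The method above ships as a small PyPI package (`prodigyopt`), so trying it is a one-line swap of the optimizer. The snippet follows the package's published interface as of this writing (treat the exact signature as subject to change), with `lr` left at 1.0 because Prodigy estimates the effective step size itself.

```python
# pip install prodigyopt
import torch
from prodigyopt import Prodigy

model = torch.nn.Linear(10, 1)
# lr stays at 1.0: the optimizer adapts the step size via its running
# estimate of the distance-to-solution D; weight_decay and d0 are optional.
optimizer = Prodigy(model.parameters(), lr=1.0)

x, y = torch.randn(32, 10), torch.randn(32, 1)
for _ in range(100):
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()
```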
Our experimental results show that our approach consistently outperforms D-Adaptation and reaches test accuracy values close to that of hand-tuned Adam.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Konstantin Mishchenko;Aaron Defazio", "authorids": "~Konstantin_Mishchenko1;~Aaron_Defazio1", "gender": ";M", "homepage": "https://konstmish.com/;https://www.aarondefazio.com/", "dblp": "222/9853;116/2969", "google_scholar": "Z8Y8nhQAAAAJ;KEzJsdkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Konstantin_Mishchenko1;~Aaron_Defazio1", "aff": "Samsung;Meta", "aff_domain": "samsung.com;meta.com", "position": "Researcher;Research Scientist", "bibtex": "@inproceedings{\nmishchenko2024prodigy,\ntitle={Prodigy: An Expeditiously Adaptive Parameter-Free Learner},\nauthor={Konstantin Mishchenko and Aaron Defazio},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JJpOssn0uP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1993593, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10885259152387899313&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "samsung.com;meta.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Samsung;Meta", "aff_unique_dep": "Samsung;Meta Platforms, Inc.", "aff_unique_url": "https://www.samsung.com;https://meta.com", "aff_unique_abbr": "Samsung;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Improved Stability and Generalization Guarantees of the Decentralized SGD Algorithm", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34384", "id": "JKPhWzp7Oi", "proceeding": "https://proceedings.mlr.press/v235/le-bars24a.html", "pdf": "https://openreview.net/pdf?id=JKPhWzp7Oi", "openreview": "https://openreview.net/forum?id=JKPhWzp7Oi", "author_site": "Batiste Le Bars, Aur\u00e9lien Bellet, Marc Tommasi, Kevin Scaman, Giovanni Neglia", "tldr": "", "abstract": "This paper presents a new generalization error analysis for Decentralized Stochastic Gradient Descent (D-SGD) based on algorithmic stability. The obtained results overhaul a series of recent works that suggested an increased instability due to decentralization and a detrimental impact of poorly-connected communication graphs on generalization. On the contrary, we show, for convex, strongly convex and non-convex functions, that D-SGD can always recover generalization bounds analogous to those of classical SGD, suggesting that the choice of graph does not matter. We then argue that this result is coming from a worst-case analysis, and we provide a refined optimization-dependent generalization bound for general convex functions. 
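The D-SGD iteration whose stability is analyzed in this abstract is worth having in front of you: each node gossip-averages with its neighbors through a mixing matrix and takes a local stochastic gradient step. The one-step sketch below is the standard form; `grad_fn` and the gossip-then-gradient ordering are illustrative choices (variants differ on the ordering).

```python
# One D-SGD step over all nodes.
import numpy as np

def dsgd_step(X, W, grad_fn, lr):
    """X: (n_nodes, dim) local iterates.
    W: (n_nodes, n_nodes) doubly stochastic mixing matrix supported on the
    communication graph (W[i, j] > 0 only if i and j are neighbors).
    grad_fn(i, x): stochastic gradient of node i's local objective at x."""
    G = np.stack([grad_fn(i, X[i]) for i in range(X.shape[0])])
    return W @ X - lr * G   # gossip averaging + local SGD step
```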
This new bound reveals that the choice of graph can in fact improve the worst-case bound in certain regimes, and that surprisingly, a poorly-connected graph can even be beneficial for generalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Batiste Le bars;Aur\u00e9lien Bellet;Marc Tommasi;Kevin Scaman;Giovanni Neglia", "authorids": "~Batiste_Le_bars1;~Aur\u00e9lien_Bellet1;~Marc_Tommasi1;~Kevin_Scaman1;~Giovanni_Neglia1", "gender": "M;;M;M;", "homepage": "https://batistelb.github.io/;http://researchers.lille.inria.fr/abellet/;https://www.cristal.univ-lille.fr/en/profil/tommasi/;https://scaman.wordpress.com/;http://www-sop.inria.fr/members/Giovanni.Neglia/", "dblp": "236/4921;61/8017;t/MarcTommasi;149/2625;65/3868", "google_scholar": "https://scholar.google.fr/citations?user=A-4CZ8UAAAAJ;https://scholar.google.fr/citations?user=j8svx3IAAAAJ;https://scholar.google.fr/citations?user=IRyM3b8AAAAJ;uiR63a8AAAAJ;https://scholar.google.fr/citations?user=ajJxXnEAAAAJ", "orcid": ";0000-0003-3440-1251;;;", "linkedin": ";;;;", "or_profile": "~Batiste_Le_bars1;~Aur\u00e9lien_Bellet1;~Marc_Tommasi1;~Kevin_Scaman1;~Giovanni_Neglia1", "aff": "INRIA;INRIA;INRIA;INRIA;Inria", "aff_domain": "inria.fr;inria.fr;inria.fr;inria.fr;inria.fr", "position": "Postdoc;Tenured researcher;Researcher;Reseacher;Researcher", "bibtex": "@inproceedings{\nbars2024improved,\ntitle={Improved Stability and Generalization Guarantees of the Decentralized {SGD} Algorithm},\nauthor={Batiste Le bars and Aur{\\'e}lien Bellet and Marc Tommasi and Kevin Scaman and Giovanni Neglia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JKPhWzp7Oi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 498985, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=860796720962097463&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "email": "inria.fr;inria.fr;inria.fr;inria.fr;inria.fr", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "France" }, { "title": "Feasibility Consistent Representation Learning for Safe Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34383", "id": "JNHK11bAGl", "proceeding": "https://proceedings.mlr.press/v235/cen24b.html", "pdf": "https://openreview.net/pdf?id=JNHK11bAGl", "openreview": "https://openreview.net/forum?id=JNHK11bAGl", "author_site": "Zhepeng Cen, Yihang Yao, Zuxin Liu, Ding Zhao", "tldr": "", "abstract": "In the field of safe reinforcement learning (RL), finding a balance between satisfying safety constraints and optimizing reward performance presents a significant challenge. A key obstacle in this endeavor is the estimation of safety constraints, which is typically more difficult than estimating a reward metric due to the sparse nature of the constraint signals. To address this issue, we introduce a novel framework named Feasibility Consistent Safe Reinforcement Learning (FCSRL). This framework combines representation learning with feasibility-oriented objectives to identify and extract safety-related information from the raw state for safe RL. 
Leveraging self-supervised learning techniques and a more learnable safety metric, our approach enhances policy learning and constraint estimation. Empirical evaluations across a range of vector-state and image-based tasks demonstrate that our method is capable of learning a better safety-aware embedding and achieving superior performance than previous representation learning baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhepeng Cen;Yihang Yao;Zuxin Liu;Ding Zhao", "authorids": "~Zhepeng_Cen1;~Yihang_Yao1;~Zuxin_Liu1;~Ding_Zhao1", "gender": "M;;M;", "homepage": "https://czp16.github.io/;https://yihangyao.github.io/;https://www.zuxin.me;https://safeai-lab.github.io", "dblp": "254/6182;305/7045.html;227/3137;", "google_scholar": "M-X3Q-UAAAAJ;EPduTdwAAAAJ;5ApCTCoAAAAJ;z7tPc9IAAAAJ", "orcid": ";;0000-0001-7412-5074;", "linkedin": ";yihang-yao-3a7658249/;zuxin-liu/;", "or_profile": "~Zhepeng_Cen1;~Yihang_Yao1;~Zuxin_Liu1;~Ding_Zhao1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Salesforce AI Research;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;salesforce.com;cmu.edu", "position": "PhD student;PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\ncen2024feasibility,\ntitle={Feasibility Consistent Representation Learning for Safe Reinforcement Learning},\nauthor={Zhepeng Cen and Yihang Yao and Zuxin Liu and Ding Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JNHK11bAGl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5289838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4956376864127636913&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "andrew.cmu.edu;cmu.edu;salesforce.com;cmu.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;Salesforce", "aff_unique_dep": ";Salesforce AI Research", "aff_unique_url": "https://www.cmu.edu;https://www.salesforce.com", "aff_unique_abbr": "CMU;Salesforce AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Measuring Stochastic Data Complexity with Boltzmann Influence Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34382", "id": "JNN6QHhLHB", "proceeding": "https://proceedings.mlr.press/v235/ng24b.html", "pdf": "https://openreview.net/pdf?id=JNN6QHhLHB", "openreview": "https://openreview.net/forum?id=JNN6QHhLHB", "author_site": "Nathan Ng, Roger Grosse, Marzyeh Ghassemi", "tldr": "", "abstract": "Estimating the uncertainty of a model\u2019s prediction on a test point is a crucial part of ensuring reliability and calibration under distribution shifts. A minimum description length approach to this problem uses the predictive normalized maximum likelihood (pNML) distribution, which considers every possible label for a data point, and decreases confidence in a prediction if other labels are also consistent with the model and training data. In this work we propose IF-COMP, a scalable and efficient approximation of the pNML distribution that linearizes the model with a temperature-scaled Boltzmann influence function. IF-COMP can be used to produce well-calibrated predictions on test points as well as measure complexity in both labelled and unlabelled settings.
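For readers meeting pNML here for the first time: the distribution IF-COMP approximates has the standard minimum description length form, where $\hat{\theta}(x, y)$ denotes the model refit as if the test point $x$ carried label $y$:

```latex
% pNML prediction and its log-normalizer (the complexity of x)
p_{\mathrm{pNML}}(y \mid x)
  = \frac{p_{\hat{\theta}(x,y)}(y \mid x)}{\sum_{y'} p_{\hat{\theta}(x,y')}(y' \mid x)},
\qquad
\Gamma(x) = \log \sum_{y'} p_{\hat{\theta}(x,y')}(y' \mid x)
```

Confidence drops exactly when many alternative labels can be made consistent with the model and training data (large $\Gamma$). Computed naively this requires one refit per candidate label per test point, which is the cost that IF-COMP's linearized, temperature-scaled Boltzmann influence-function approximation is designed to avoid.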
We experimentally validate IF-COMP on uncertainty calibration, mislabel detection, and OOD detection tasks, where it consistently matches or beats strong baseline methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nathan Hoyen Ng;Roger Baker Grosse;Marzyeh Ghassemi", "authorids": "~Nathan_Hoyen_Ng1;~Roger_Baker_Grosse1;~Marzyeh_Ghassemi2", "gender": "M;M;F", "homepage": ";http://www.cs.toronto.edu/~rgrosse/;https://www.healthyml.org/", "dblp": "195/5521;26/7058;145/6563", "google_scholar": "psuwztYAAAAJ;xgQd1qgAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Nathan_Hoyen_Ng1;~Roger_Baker_Grosse1;~Marzyeh_Ghassemi2", "aff": "University of Toronto;Vector Institute;Massachusetts Institute of Technology", "aff_domain": "utoronto.ca;vectorinstitute.ai;mit.edu", "position": "PhD student;Faculty Member;Assistant Professor", "bibtex": "@inproceedings{\nng2024measuring,\ntitle={Measuring Stochastic Data Complexity with Boltzmann Influence Functions},\nauthor={Nathan Hoyen Ng and Roger Baker Grosse and Marzyeh Ghassemi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JNN6QHhLHB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 490278, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16485750539530209232&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "utoronto.ca;vectorinstitute.ai;mit.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Toronto;Vector Institute;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://web.mit.edu", "aff_unique_abbr": "U of T;Vector Institute;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "title": "Differentially Private Post-Processing for Fair Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34381", "id": "JNeeRjKbuH", "proceeding": "https://proceedings.mlr.press/v235/xian24b.html", "pdf": "https://openreview.net/pdf?id=JNeeRjKbuH", "openreview": "https://openreview.net/forum?id=JNeeRjKbuH", "author_site": "Ruicheng Xian, Qiaobo Li, Gautam Kamath, Han Zhao", "tldr": "", "abstract": "This paper describes a differentially private post-processing algorithm for learning fair regressors satisfying statistical parity, addressing privacy concerns of machine learning models trained on sensitive data, as well as fairness concerns of their potential to propagate historical biases. Our algorithm can be applied to post-process any given regressor to improve fairness by remapping its outputs. It consists of three steps: first, the output distributions are estimated privately via histogram density estimation and the Laplace mechanism, then their Wasserstein barycenter is computed, and the optimal transports to the barycenter are used for post-processing to satisfy fairness. 
We analyze the sample complexity of our algorithm and provide a fairness guarantee, revealing a trade-off between the statistical bias and variance induced by the choice of the number of bins in the histogram, in which using fewer bins always favors fairness at the expense of error.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruicheng Xian;Qiaobo Li;Gautam Kamath;Han Zhao", "authorids": "~Ruicheng_Xian1;~Qiaobo_Li1;~Gautam_Kamath1;~Han_Zhao1", "gender": "M;M;M;M", "homepage": "https://rxian.github.io;;http://www.gautamkamath.com/;https://hanzhaoml.github.io/", "dblp": "243/3086.html;;73/11140;03/3520-2", "google_scholar": "Nmk26z4AAAAJ;;MK6zHkYAAAAJ;x942ipYAAAAJ", "orcid": ";;;0000-0002-8579-1600", "linkedin": ";qiaobo-li-581815251/;;", "or_profile": "~Ruicheng_Xian1;~Qiaobo_Li1;~Gautam_Kamath1;~Han_Zhao1", "aff": "University of Illinois Urbana-Champaign;Department of Computer Science, University of Illinois at Urbana-Champaign;University of Waterloo;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;cs.illinois.edu;uwaterloo.ca;illinois.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nxian2024differentially,\ntitle={Differentially Private Post-Processing for Fair Regression},\nauthor={Ruicheng Xian and Qiaobo Li and Gautam Kamath and Han Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JNeeRjKbuH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 514917, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14739004700732481189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "illinois.edu;cs.illinois.edu;uwaterloo.ca;illinois.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Waterloo", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://uwaterloo.ca", "aff_unique_abbr": "UIUC;UW", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Information-Directed Pessimism for Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34380", "id": "JOKOsJHSao", "proceeding": "https://proceedings.mlr.press/v235/koppel24a.html", "pdf": "https://openreview.net/pdf?id=JOKOsJHSao", "openreview": "https://openreview.net/forum?id=JOKOsJHSao", "author_site": "Alec Koppel, Sujay Bhatt, Jiacheng Guo, Joe Eappen, Mengdi Wang, Sumitra Ganesh", "tldr": "", "abstract": "Policy optimization from batch data, i.e., offline reinforcement learning (RL), is important when collecting data from a current policy is not possible. This setting incurs distribution mismatch between batch training data and trajectories from the current policy. Pessimistic offsets estimate mismatch using concentration bounds, which possess strong theoretical guarantees and simplicity of implementation. Mismatch may be conservative in sparse data regions and less so otherwise, which can result in pessimistic methods under-performing their no-penalty variants in practice.
We derive a new pessimistic penalty as the distance between the data and the true distribution, using an evaluable one-sample test known as the Stein Discrepancy that requires minimal smoothness conditions and, notably, allows a mixture family representation of the distribution over next states. This entity forms a quantifier of information in offline data, which justifies calling this approach *information-directed pessimism* (IDP) for offline RL. We further establish that this new penalty based on the discrete Stein discrepancy yields practical gains in performance while generalizing the regret of prior art to multimodal distributions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alec Koppel;Sujay Bhatt;Jiacheng Guo;Joe Eappen;Mengdi Wang;Sumitra Ganesh", "authorids": "~Alec_Koppel1;~Sujay_Bhatt1;~Jiacheng_Guo1;~Joe_Eappen2;~Mengdi_Wang1;~Sumitra_Ganesh1", "gender": "M;M;;M;F;F", "homepage": "http://koppel.netlify.app/;;http://;https://jeappen.github.io/;http://mwang.princeton.edu;", "dblp": "149/0076;;;267/5377;;98/463.html", "google_scholar": "8ClxyjIAAAAJ;;;98R6dEQAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-2447-2873;;;0000-0001-9386-5545;;", "linkedin": "alec-koppel-9860b697/;sujay-bhatt;;jeappen/;;sumitra-ganesh-0379853", "or_profile": "~Alec_Koppel1;~Sujay_Bhatt1;~Jiacheng_Guo1;~Joe_Eappen2;~Mengdi_Wang1;~Sumitra_Ganesh1", "aff": "J.P. Morgan Chase;JP Morgan AI Research;Princeton University;Purdue University;Princeton University;J.P. Morgan Chase", "aff_domain": "jpmorgan.com;jpmchase.com;princeton.edu;purdue.edu;princeton.edu;jpmorgan.com", "position": "Research Team Lead;Researcher;PhD student;PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nkoppel2024informationdirected,\ntitle={Information-Directed Pessimism for Offline Reinforcement Learning},\nauthor={Alec Koppel and Sujay Bhatt and Jiacheng Guo and Joe Eappen and Mengdi Wang and Sumitra Ganesh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JOKOsJHSao}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6477357, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15304556854254652112&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "jpmorgan.com;jpmchase.com;princeton.edu;purdue.edu;princeton.edu;jpmorgan.com", "author_num": 6, "aff_unique_index": "0;0;1;2;1;0", "aff_unique_norm": "JPMorgan Chase & Co.;Princeton University;Purdue University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jpmorganchase.com;https://www.princeton.edu;https://www.purdue.edu", "aff_unique_abbr": "JPM;Princeton;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Neural Logic Machines via Failure Reflection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34379", "id": "JObct1zyTb", "proceeding": "https://proceedings.mlr.press/v235/li24f.html", "pdf": "https://openreview.net/pdf?id=JObct1zyTb", "openreview": "https://openreview.net/forum?id=JObct1zyTb", "author_site": "Zhiming Li, Yushi Cao, Yan Zheng, Xu Liu, Bozhi Wu, Li Tianlin, Xiufeng Xu, Junzhe Jiang, Yon Shin Teo, Shang-Wei Lin, Yang Liu", "tldr": "", "abstract": "Reasoning is a fundamental ability towards artificial general intelligence (AGI).
Fueled by the success of deep learning, neural logic machines (NLMs) have introduced novel neural-symbolic structures and demonstrated strong performance and generalization on reasoning and decision-making tasks. However, the original training approaches of NLMs are still far from perfect: the models repeat similar mistakes during the training process, which leads to sub-optimal performance. To mitigate this issue, we present a novel framework named Failure Reflection Guided Regularizer (FRGR). FRGR first dynamically identifies and summarizes the root cause if the model repeats similar mistakes during training. Then it penalizes the model if it makes similar mistakes in future training iterations. In this way, the model is expected to avoid repeating errors of similar root causes and converge faster to a better-performing optimum. Experimental results on multiple relational reasoning and decision-making tasks demonstrate the effectiveness of FRGR in improving performance, generalization, training efficiency, and data efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiming Li;Yushi Cao;YAN ZHENG;Xu Liu;Bozhi Wu;Tianlin Li;Xiufeng Xu;Junzhe Jiang;Yon Shin Teo;Shang-Wei Lin;Yang Liu", "authorids": "~Zhiming_Li1;~Yushi_Cao1;~YAN_ZHENG1;~Xu_Liu9;~Bozhi_Wu1;~Tianlin_Li2;~Xiufeng_Xu1;~Junzhe_Jiang2;~Yon_Shin_Teo1;~Shang-Wei_Lin1;~Yang_Liu36", "gender": ";M;;M;;M;M;M;M;M;", "homepage": ";https://yanzzzzz.github.io;;https://scholar.google.com/citations?user=lro9s00AAAAJ&hl=zh-CN;;;;;https://shangweilin.github.io/;https://personal.ntu.edu.sg/yangliu/;https://scholar.google.com/citations?user=ZyhmKvQAAAAJ&hl=en", "dblp": "274/2297;10/2381-2;93/3167-14;;137/8830;;;;55/4730-1.html;51/3710-3;", "google_scholar": "y8SqtE4AAAAJ;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;JTzLTycAAAAJ;lro9s00AAAAJ;XB6CydwAAAAJ;;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.sg/citations?hl=en;ZyhmKvQAAAAJ", "orcid": ";;0000-0003-2708-0584;;;0000-0003-2564-6660;;;0000-0002-9726-3434;0000-0001-7300-9215;", "linkedin": ";;liuxu-187825160/;;;;https://linkedin.com/in/junzhejiang;yon-shin-teo-246bb0b8;shang-wei-lin-7a94091b/;;", "or_profile": "~Yushi_Cao1;~YAN_ZHENG1;~Xu_Liu9;~Bozhi_Wu1;~Tianlin_Li2;~Xiufeng_Xu1;~Junzhe_Jiang2;~Yon_Shin_Teo1;~Shang-Wei_Lin1;~Yang_Liu36;~Bob_Zhiming_Li1", "aff": "Nanyang Technological University;Tianjin University, China;National University of Singapore;Nanyang Technological University;Nanyang Technological University;Nanyang Technological University;Hong Kong Polytechnic University;Continental Automotive;Nanyang Technological University (NTU);Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;tju.edu.cn;nus.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;polyu.edu.hk;continental-corporation.com;ccds.ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Associate Professor;PhD student;PhD student;PhD student;PhD student;PhD student;Principal Researcher;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nli2024improving,\ntitle={Improving Neural Logic Machines via Failure Reflection},\nauthor={Zhiming Li and Yushi Cao and YAN ZHENG and Xu Liu and Bozhi Wu and Tianlin Li and Xiufeng Xu and Junzhe Jiang and Yon Shin Teo and Shang-Wei Lin and Yang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JObct1zyTb}\n}", "github": "", "project": "", "reviewers": "",
"pdf_size": 7406392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13541442010560630038&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ntu.edu.sg;tju.edu.cn;nus.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;polyu.edu.hk;continental-corporation.com;ccds.ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "author_num": 11, "aff_unique_index": "0;1;2;0;0;0;3;4;0;0;0", "aff_unique_norm": "Nanyang Technological University;Tianjin University;National University of Singapore;Hong Kong Polytechnic University;Continental AG", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ntu.edu.sg;http://www.tju.edu.cn;https://www.nus.edu.sg;https://www.polyu.edu.hk;https://www.continental-automotive.com", "aff_unique_abbr": "NTU;TJU;NUS;PolyU;Continental", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0;0;1;2;0;0;0", "aff_country_unique": "Singapore;China;Germany" }, { "title": "Prototypical Transformer As Unified Motion Learners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34378", "id": "JOrLz5d7OW", "proceeding": "https://proceedings.mlr.press/v235/han24d.html", "pdf": "https://openreview.net/pdf?id=JOrLz5d7OW", "openreview": "https://openreview.net/forum?id=JOrLz5d7OW", "author_site": "Cheng Han, Yawen Lu, Guohao Sun, James Liang, Zhiwen Cao, Qifan Wang, Qiang Guan, Sohail Dianat, Raghuveer Rao, Tong Geng, ZHIQIANG TAO, Dongfang Liu", "tldr": "", "abstract": "In this work, we introduce the Prototypical Transformer (ProtoFormer), a general and unified framework that approaches various motion tasks from a prototype perspective. ProtoFormer seamlessly integrates prototype learning with Transformer by thoughtfully considering motion dynamics, introducing two innovative designs. First, Cross-Attention Prototyping discovers prototypes based on signature motion patterns, providing transparency in understanding motion scenes. Second, Latent Synchronization guides feature representation learning via prototypes, effectively mitigating the problem of motion uncertainty. Empirical results demonstrate that our approach achieves competitive performance on popular motion tasks such as optical flow and scene depth. 
Furthermore, it exhibits generality across various downstream tasks, including object tracking and video stabilization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cheng Han;Yawen Lu;Guohao Sun;James Chenhao Liang;Zhiwen Cao;Qifan Wang;Qiang Guan;Sohail Dianat;Raghuveer Rao;Tong Geng;ZHIQIANG TAO;Dongfang Liu", "authorids": "~Cheng_Han1;~Yawen_Lu2;~Guohao_Sun1;~James_Chenhao_Liang1;~Zhiwen_Cao1;~Qifan_Wang2;~Qiang_Guan1;~Sohail_Dianat1;~Raghuveer_Rao1;~Tong_Geng1;~ZHIQIANG_TAO2;~Dongfang_Liu1", "gender": "M;M;M;M;M;M;M;;M;;;M", "homepage": "https://chenghan111.github.io/;https://guohaosun.com;https://jamesliang819.github.io/;https://va.tech.purdue.edu/ivil/index.html;https://wqfcr.github.io/;https://www.cs.kent.edu/~qguan/;https://www.rit.edu/engineering/directory/sadeee-sohail-dianat;;https://tonytgeng.com;http://ztao.cc/;https://www.rit.edu/directory/dxleec-dongfang-liu;", "dblp": "53/6096-1.html;;323/3403;;33/8610;20/1255;;;188/5531;135/5229.html;;254/8061", "google_scholar": "VgkEKZwAAAAJ;tf2GWowAAAAJ;cR8m4CcAAAAJ;;LrSyLosAAAAJ;kpPoy4gAAAAJ;https://scholar.google.com/scholar?hl=en;;1B_nk28AAAAJ;sEKglOkAAAAJ;uICY0vEAAAAJ;", "orcid": "0000-0002-8145-3436;0009-0002-0935-6196;;;0000-0002-7570-5756;0000-0002-3804-8945;;;0000-0002-3644-2922;;;0000-0003-2129-133X", "linkedin": "chenghan-87129219a/;guohaosun;;;;qiang-guan-51534128/;;raghuveer-rao-3a99815/;;;;", "or_profile": "~Cheng_Han1;~Guohao_Sun1;~James_Chenhao_Liang1;~Zhiwen_Cao1;~Qifan_Wang2;~Qiang_Guan1;~Sohail_Dianat1;~Raghuveer_Rao1;~Tong_Geng1;~ZHIQIANG_TAO2;~Dongfang_Liu1;~Raymond_Lu1", "aff": "Rochester Institute of Technology;Rochester Institute of Technology;Rochester Institute of Technology;Adobe Systems;Meta AI;Kent State University;Rochester Institute of Technology;DEVCOM Army Research Laboratory;University of Rochester;Rochester Institute of Technology;Rochester Institute of Technology;Purdue University", "aff_domain": "rit.edu;rit.edu;rit.edu;adobe.com;fb.com;kent.edu;rit.edu;army.mil;rochester.edu;rit.edu;rit.edu;purdue.edu", "position": "PhD student;PhD student;PhD student;Researcher;Principal Researcher;Associate Professor;Full Professor;Researcher;Assistant Professor;Assistant Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nhan2024prototypical,\ntitle={Prototypical Transformer As Unified Motion Learners},\nauthor={Cheng Han and Yawen Lu and Guohao Sun and James Chenhao Liang and Zhiwen Cao and Qifan Wang and Qiang Guan and Sohail Dianat and Raghuveer Rao and Tong Geng and ZHIQIANG TAO and Dongfang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JOrLz5d7OW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3893015, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18075180692124670639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "rit.edu;rit.edu;rit.edu;adobe.com;fb.com;kent.edu;rit.edu;army.mil;rochester.edu;rit.edu;rit.edu;purdue.edu", "author_num": 12, "aff_unique_index": "0;0;0;1;2;3;0;4;5;0;0;6", "aff_unique_norm": "Rochester Institute of Technology;Adobe;Meta;Kent State University;United States Army Research Laboratory;University of Rochester;Purdue University", "aff_unique_dep": ";Adobe Systems Incorporated;Meta AI;;Army Research Laboratory;;", "aff_unique_url": 
"https://www.rit.edu;https://www.adobe.com;https://meta.com;https://www.kent.edu;https://www.arl.army.mil;https://www.rochester.edu;https://www.purdue.edu", "aff_unique_abbr": "RIT;Adobe;Meta;KSU;ARL;U of R;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bifurcated Attention for Single-Context Large-Batch Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34377", "id": "JPNBFWQ9H2", "proceeding": "https://proceedings.mlr.press/v235/athiwaratkun24a.html", "pdf": "https://openreview.net/pdf?id=JPNBFWQ9H2", "openreview": "https://openreview.net/forum?id=JPNBFWQ9H2", "author_site": "Ben Athiwaratkun, Sujan Kumar Gonugondla, Sanjay Krishna Gouda, Haifeng Qian, Hantian Ding, Qing Sun, Jun Wang, Jiacheng Guo, Liangfu Chen, parminder bhatia, Ramesh M Nallapati, Sudipta Sengupta, Bing Xiang", "tldr": "", "abstract": "In our study, we present bifurcated attention, a method developed for language model inference in single-context batch sampling contexts. This approach aims to reduce redundant memory IO costs, a significant factor in latency for high batch sizes and long context lengths. Bifurcated attention achieves this by dividing the attention mechanism during incremental decoding into two distinct GEMM operations, focusing on the KV cache from prefill and the decoding process. This method ensures precise computation and maintains the usual computational load (FLOPs) of standard attention mechanisms, but with reduced memory IO. Bifurcated attention is also compatible with multi-query attention mechanism known for reduced memory IO for KV cache, further enabling higher batch size and context length. 
The resulting efficiency leads to lower latency, improving suitability for real-time applications, e.g., enabling massively-parallel answer generation without substantially increasing latency, enhancing performance when integrated with post-processing techniques such as reranking.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ben Athiwaratkun;Sujan Kumar Gonugondla;Sanjay Krishna Gouda;Haifeng Qian;Hantian Ding;Qing Sun;Jun Wang;Jiacheng Guo;Liangfu Chen;Parminder Bhatia;Ramesh Nallapati;Sudipta Sengupta;Bing Xiang", "authorids": "~Ben_Athiwaratkun1;~Sujan_Kumar_Gonugondla1;~Sanjay_Krishna_Gouda1;~Haifeng_Qian1;~Hantian_Ding1;~Qing_Sun2;~Jun_Wang32;~Jiacheng_Guo4;~Liangfu_Chen1;~Parminder_Bhatia1;~Ramesh_Nallapati1;~Sudipta_Sengupta1;~Bing_Xiang2", "gender": "M;;M;M;M;F;M;M;M;M;M;M;", "homepage": "https://benathi.github.io;https://gsujankumar.github.io;;https://sites.google.com/view/haifengqian;;https://computing.ece.vt.edu/~sunqing/;https://rich-junwang.github.io/;;https://liangfu.org/;;;https://people.csail.mit.edu/sudipta/;", "dblp": "166/1659;166/6408.html;;61/6767;242/8095;https://dblp.uni-trier.de/pers/hd/s/Sun:Qing;125/8189-122;;;168/8615;59/4797;88/4889;", "google_scholar": "KZpZTTQAAAAJ;F_ud9E4AAAAJ;_zJ8IOEAAAAJ;https://scholar.google.com/citations?hl=en;nEuMO58AAAAJ;sSlAO5sAAAAJ;ct92MO4AAAAJ;;E7-jvs0AAAAJ;;;h8M0U0oAAAAJ;A6yjdJAAAAAJ", "orcid": ";0000-0003-4743-6461;;0000-0002-7189-6903;;;;;;;;;", "linkedin": ";sujan-kumar-gonugondla-ab6787142/;;haifengqian;;;;jiacheng-guo;;;;sudiptasengupta;", "or_profile": "~Ben_Athiwaratkun1;~Sujan_Kumar_Gonugondla1;~Sanjay_Krishna_Gouda1;~Haifeng_Qian1;~Hantian_Ding1;~Qing_Sun2;~Jun_Wang32;~Jiacheng_Guo4;~Liangfu_Chen1;~Parminder_Bhatia1;~Ramesh_Nallapati1;~Sudipta_Sengupta1;~Bing_Xiang2", "aff": "Amazon;Amazon;Amazon;Amazon;Amazon;Amazon;Amazon;;;GEHC;Amazon Web Services;Amazon AWS;Goldman Sachs", "aff_domain": "amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;;;gehealthcare.com;amazon.com;amazon.com;gs.com", "position": "AI Scientist;Researcher;Researcher;Senior Applied Scientist;Researcher;Researcher;Applied Scientist;;;Principal Researcher;Senior Principal Scientist;Vice President & Distinguished Scientist;Managing Director", "bibtex": "@inproceedings{\nathiwaratkun2024bifurcated,\ntitle={Bifurcated Attention for Single-Context Large-Batch Sampling},\nauthor={Ben Athiwaratkun and Sujan Kumar Gonugondla and Sanjay Krishna Gouda and Haifeng Qian and Hantian Ding and Qing Sun and Jun Wang and Jiacheng Guo and Liangfu Chen and Parminder Bhatia and Ramesh Nallapati and Sudipta Sengupta and Bing Xiang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JPNBFWQ9H2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2126860, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10718465505611225533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;;;gehealthcare.com;amazon.com;amazon.com;gs.com", "author_num": 13, "aff_unique_index": "0;0;0;0;0;0;0;1;0;0;2", "aff_unique_norm": "Amazon;General Electric Healthcare;Goldman Sachs", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.gehealthcare.com;https://www.goldmansachs.com", "aff_unique_abbr": "Amazon;GEHC;GS", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Reward Model Learning vs. Direct Policy Optimization: A Comparative Analysis of Learning from Human Preferences", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34376", "id": "JQlEUfzhuA", "proceeding": "https://proceedings.mlr.press/v235/nika24a.html", "pdf": "https://openreview.net/pdf?id=JQlEUfzhuA", "openreview": "https://openreview.net/forum?id=JQlEUfzhuA", "author_site": "Andi Nika, Debmalya Mandal, Parameswaran Kamalaruban, Georgios Tzannetos, Goran Radanovic, Adish Singla", "tldr": "", "abstract": "In this paper, we take a step towards a deeper understanding of learning from human preferences by systematically comparing the paradigm of reinforcement learning from human feedback (RLHF) with the recently proposed paradigm of direct preference optimization (DPO). We focus our attention on the class of loglinear policy parametrization and linear reward functions. In order to compare the two paradigms, we first derive minimax statistical bounds on the suboptimality gap induced by both RLHF and DPO, assuming access to an oracle that exactly solves the optimization problems. We provide a detailed discussion on the relative comparison between the two paradigms, simultaneously taking into account the sample size, policy and reward class dimensions, and the regularization temperature. Moreover, we extend our analysis to the approximate optimization setting and derive exponentially decaying convergence rates for both RLHF and DPO. Next, we analyze the setting where the ground-truth reward is not realizable and find that, while RLHF incurs a constant additional error, DPO retains its asymptotically decaying gap by just tuning the temperature accordingly. Finally, we extend our comparison to the Markov decision process setting, where we generalize our results with exact optimization. To the best of our knowledge, we are the first to provide such a comparative analysis for RLHF and DPO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andi Nika;Debmalya Mandal;Parameswaran Kamalaruban;Georgios Tzannetos;Goran Radanovic;Adish Singla", "authorids": "~Andi_Nika1;~Debmalya_Mandal2;~Parameswaran_Kamalaruban2;~Georgios_Tzannetos1;~Goran_Radanovic1;~Adish_Singla2", "gender": "M;M;M;M;;", "homepage": "https://andinika.github.io/;https://debmandal.github.io;https://markovkernel.net/;https://georgetzannetos.github.io/;;https://machineteaching.mpi-sws.org/adishsingla.html", "dblp": "268/2761;151/3685;164/7413;345/8576;133/1771;58/657", "google_scholar": "oTIFCrEAAAAJ;OquWQpEAAAAJ;0ioRCikAAAAJ;E_EE9gUAAAAJ;KBG_JlAAAAAJ;kXz2seUAAAAJ", "orcid": ";;;;;", "linkedin": ";;;gtzannetos;;", "or_profile": "~Andi_Nika1;~Debmalya_Mandal2;~Parameswaran_Kamalaruban2;~Georgios_Tzannetos1;~Goran_Radanovic1;~Adish_Kumar_Singla1", "aff": "MPI-SWS;University of Warwick;Featurespace;MPI-SWS;MPI-SWS;Max Planck Institute for Software Systems (MPI-SWS)", "aff_domain": "mpi-sws.org;warwick.ac.uk;featurespace.co.uk;mpi-sws.org;mpi-sws.org;mpi-sws.org", "position": "PhD student;Assistant Professor;Researcher;PhD student;Research group leader;Researcher", "bibtex": "@inproceedings{\nnika2024reward,\ntitle={Reward Model Learning vs. 
Direct Policy Optimization: A Comparative Analysis of Learning from Human Preferences},\nauthor={Andi Nika and Debmalya Mandal and Parameswaran Kamalaruban and Georgios Tzannetos and Goran Radanovic and Adish Singla},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JQlEUfzhuA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 520273, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5121664158073849791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "mpi-sws.org;warwick.ac.uk;featurespace.co.uk;mpi-sws.org;mpi-sws.org;mpi-sws.org", "author_num": 6, "aff_unique_index": "0;1;2;0;0;0", "aff_unique_norm": "Max Planck Institute for Software Systems;University of Warwick;Featurespace Ltd.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.mpi-sws.org;https://www.warwick.ac.uk;https://www.featurespace.co.uk", "aff_unique_abbr": "MPI-SWS;Warwick;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Image Clustering with External Guidance", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34375", "id": "JSYN891WnB", "proceeding": "https://proceedings.mlr.press/v235/li24aa.html", "pdf": "https://openreview.net/pdf?id=JSYN891WnB", "openreview": "https://openreview.net/forum?id=JSYN891WnB", "author_site": "Yunfan Li, Peng Hu, Dezhong Peng, Jiancheng Lv, Jianping Fan, Xi Peng", "tldr": "", "abstract": "The core of clustering lies in incorporating prior knowledge to construct supervision signals. From classic k-means based on data compactness to recent contrastive clustering guided by self-supervision, the evolution of clustering methods intrinsically corresponds to the progression of supervision signals. At present, substantial efforts have been devoted to mining internal supervision signals from data. Nevertheless, the abundant external knowledge such as semantic descriptions, which naturally conduces to clustering, is regrettably overlooked. In this work, we propose leveraging external knowledge as a new supervision signal to guide clustering. To implement and validate our idea, we design an externally guided clustering method (Text-Aided Clustering, TAC), which leverages the textual semantics of WordNet to facilitate image clustering. Specifically, TAC first selects and retrieves WordNet nouns that best distinguish images to enhance the feature discriminability. Then, TAC collaborates text and image modalities by mutually distilling cross-modal neighborhood information. Experiments demonstrate that TAC achieves state-of-the-art performance on five widely used and three more challenging image clustering benchmarks, including the full ImageNet-1K dataset. 
The code can be accessed at https://github.com/XLearning-SCU/2024-ICML-TAC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunfan Li;Peng Hu;Dezhong Peng;Jiancheng Lv;Jianping Fan;Xi Peng", "authorids": "~Yunfan_Li1;~Peng_Hu2;~Dezhong_Peng1;~Jiancheng_Lv2;~Jianping_Fan4;~Xi_Peng3", "gender": "M;M;M;M;M;M", "homepage": "https://yunfan-li.github.io/;https://penghu-cs.github.io/;https://cs.scu.edu.cn/info/1249/10284.htm;https://cs.scu.edu.cn/info/1303/13767.htm;;http://www.pengxi.me", "dblp": "80/1874-3;11/6278-2;;;69/2360.html;18/931-1", "google_scholar": "JmXIt5oAAAAJ;gvESkwYAAAAJ;0gupif8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;;bw9FOHAAAAAJ", "orcid": ";0000-0003-3868-3997;;;;", "linkedin": ";;;;;", "or_profile": "~Yunfan_Li1;~Peng_Hu2;~Dezhong_Peng1;~Jiancheng_Lv2;~Jianping_Fan4;~Xi_Peng2", "aff": "Sichuan University;Sichuan University;Sichuan University;Sichuan University;Northwest University;Sichuan University", "aff_domain": "scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn;nwu.edu.cn;scu.edu.cn", "position": "PhD student;Associate Professor;Full Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024image,\ntitle={Image Clustering with External Guidance},\nauthor={Yunfan Li and Peng Hu and Dezhong Peng and Jiancheng Lv and Jianping Fan and Xi Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JSYN891WnB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2253150, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2080584870901806301&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn;nwu.edu.cn;scu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Sichuan University;Northwest University", "aff_unique_dep": ";", "aff_unique_url": "https://www.scu.edu.cn;https://www.nwu.edu.cn", "aff_unique_abbr": "SCU;NWU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Identifiability Matters: Revealing the Hidden Recoverable Condition in Unbiased Learning to Rank", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34374", "id": "JU3xHh1vWw", "proceeding": "https://proceedings.mlr.press/v235/chen24z.html", "pdf": "https://openreview.net/pdf?id=JU3xHh1vWw", "openreview": "https://openreview.net/forum?id=JU3xHh1vWw", "author_site": "Mouxiang Chen, Chenghao Liu, Zemin Liu, Zhuo Li, Jianling Sun", "tldr": "", "abstract": "Unbiased Learning to Rank (ULTR) aims to train unbiased ranking models from biased click logs, by explicitly modeling a generation process for user behavior and fitting click data based on the examination hypothesis. Previous research found empirically that the true latent relevance is mostly recoverable through click fitting. However, we demonstrate that this is not always achievable, resulting in a significant reduction in ranking performance. This research investigates the conditions under which relevance can be recovered from click data from first principles. We initially characterize a ranking model as identifiable if it can recover the true relevance up to a scaling transformation, a criterion sufficient for the pairwise ranking objective.
Subsequently, we investigate an equivalent condition for identifiability, articulated as a graph connectivity test problem: the recovery of relevance is feasible if and only if the identifiability graph (IG), derived from the underlying structure of the dataset, is connected. The presence of a disconnected IG may lead to degenerate cases and suboptimal ranking performance. To tackle this challenge, we introduce two methods, namely node intervention and node merging, designed to modify the dataset and restore the connectivity of the IG. Empirical results derived from a simulated dataset and two real-world LTR benchmark datasets not only validate our proposed theory, but also demonstrate the effectiveness of our methods in alleviating data bias when the relevance model is unidentifiable.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mouxiang Chen;Chenghao Liu;Zemin Liu;Zhuo Li;Jianling Sun", "authorids": "~Mouxiang_Chen1;~Chenghao_Liu1;~Zemin_Liu1;~Zhuo_Li11;~Jianling_Sun2", "gender": ";M;M;M;", "homepage": "https://me.keytoix.vip;;https://zemin-liu.github.io/;;", "dblp": "297/0365;;17/964.html;;", "google_scholar": "5F0n6-4AAAAJ;https://scholar.google.com/citations?hl=en;IxHO1nkAAAAJ;;", "orcid": "0000-0002-8341-1467;;0000-0001-6262-9435;0000-0001-9381-7359;", "linkedin": ";chenghao-liu-40a62a56/;;zhuo-li-87891a22/;", "or_profile": "~Mouxiang_Chen1;~Chenghao_Liu1;~Zemin_Liu1;~Zhuo_Li11;~Jianling_Sun2", "aff": "Zhejiang University;Salesforce AI Research;National University of Singapore;Zhejiang University;", "aff_domain": "zju.edu.cn;salesforce.com;nus.edu;zju.edu.cn;", "position": "PhD student;Researcher;Postdoc;Researcher;", "bibtex": "@inproceedings{\nchen2024identifiability,\ntitle={Identifiability Matters: Revealing the Hidden Recoverable Condition in Unbiased Learning to Rank},\nauthor={Mouxiang Chen and Chenghao Liu and Zemin Liu and Zhuo Li and Jianling Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JU3xHh1vWw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 907312, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6290617749707121515&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;salesforce.com;nus.edu;zju.edu.cn;", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Zhejiang University;Salesforce;National University of Singapore", "aff_unique_dep": ";Salesforce AI Research;", "aff_unique_url": "https://www.zju.edu.cn;https://www.salesforce.com;https://www.nus.edu.sg", "aff_unique_abbr": "ZJU;Salesforce AI;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "China;United States;Singapore" }, { "title": "Learning Cognitive Maps from Transformer Representations for Efficient Planning in Partially Observed Environments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34373", "id": "JUa5XNXuoT", "proceeding": "https://proceedings.mlr.press/v235/dedieu24a.html", "pdf": "https://openreview.net/pdf?id=JUa5XNXuoT", "openreview": "https://openreview.net/forum?id=JUa5XNXuoT", "author_site": "Antoine Dedieu, Wolfgang Lehrach, Guangyao Zhou, Dileep George, Miguel Lazaro-Gredilla", "tldr": "", "abstract": "Despite their stellar performance on a wide range of tasks, including in-context tasks only revealed during inference, 
vanilla transformers and variants trained for next-token prediction (a) do not learn an explicit world model of their environment that can be flexibly queried and (b) cannot be used for planning or navigation. In this paper, we consider partially observed environments (POEs), where an agent receives perceptually aliased observations as it navigates, which makes path planning hard. We introduce a transformer with (multiple) discrete bottleneck(s), TDB, whose latent codes learn a compressed representation of the history of observations and actions. After training a TDB to predict the future observation(s) given the history, we extract interpretable cognitive maps of the environment from its active bottleneck(s) indices. These maps are then paired with an external solver to solve (constrained) path planning problems. First, we show that a TDB trained on POEs (a) retains the near-perfect predictive performance of a vanilla transformer or an LSTM while (b) solving shortest path problems exponentially faster. Second, a TDB extracts interpretable representations from text datasets, while reaching higher in-context accuracy than vanilla sequence models. Finally, in new POEs, a TDB (a) reaches near-perfect in-context accuracy, (b) learns accurate in-context cognitive maps, and (c) solves in-context path planning problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antoine Dedieu;Wolfgang Lehrach;Guangyao Zhou;Dileep George;Miguel Lazaro-Gredilla", "authorids": "~Antoine_Dedieu1;~Wolfgang_Lehrach1;~Guangyao_Zhou1;~Dileep_George1;~Miguel_Lazaro-Gredilla1", "gender": "M;M;M;;M", "homepage": "https://antoine-dedieu.github.io;;https://stanniszhou.github.io;;", "dblp": "217/3589.html;190/7782;;;77/4660", "google_scholar": "Hgoc3FUAAAAJ;;RW94MCIAAAAJ;;SFjDQk8AAAAJ", "orcid": ";;;;", "linkedin": ";;;;miguel-lazaro-g/", "or_profile": "~Antoine_Dedieu1;~Wolfgang_Lehrach1;~Guangyao_Zhou1;~Dileep_George1;~Miguel_Lazaro-Gredilla1", "aff": "Google DeepMind;Google Deepmind;Google DeepMind;Vicarious AI;Google Deepmind", "aff_domain": "deepmind.com;deepmind.com;google.com;vicarious.com;google.com", "position": "Researcher;Researcher;Research Scientist;Co-founder;Research Scientist", "bibtex": "@inproceedings{\ndedieu2024learning,\ntitle={Learning Cognitive Maps from Transformer Representations for Efficient Planning in Partially Observed Environments},\nauthor={Antoine Dedieu and Wolfgang Lehrach and Guangyao Zhou and Dileep George and Miguel Lazaro-Gredilla},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JUa5XNXuoT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4607203, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17081312974917671718&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "deepmind.com;deepmind.com;google.com;vicarious.com;google.com", "author_num": 5, "aff_unique_index": "0;1;0;2;1", "aff_unique_norm": "Google;DeepMind;Vicarious AI", "aff_unique_dep": "Google DeepMind;DeepMind;", "aff_unique_url": "https://deepmind.com;https://deepmind.com;https://www.vicarious.com", "aff_unique_abbr": "DeepMind;DeepMind;Vicarious AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Safe and Robust Subgame Exploitation in Imperfect Information Games", "status":
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34372", "id": "JV84NVo1em", "proceeding": "https://proceedings.mlr.press/v235/ge24b.html", "pdf": "https://openreview.net/pdf?id=JV84NVo1em", "openreview": "https://openreview.net/forum?id=JV84NVo1em", "author_site": "Zhenxing Ge, Zheng Xu, Tianyu Ding, Linjian Meng, Bo An, Wenbin Li, Yang Gao", "tldr": "", "abstract": "Opponent exploitation is an important task for players to exploit the weaknesses of others in games. Existing approaches mainly focus on balancing between exploitation and exploitability but are often vulnerable to modeling errors and deceptive adversaries. To address this problem, our paper offers a novel perspective on the safety of opponent exploitation, named Adaptation Safety. This concept leverages the insight that strategies, even those not explicitly aimed at opponent exploitation, may inherently be exploitable due to computational complexities, rendering traditional safety overly rigorous. In contrast, adaptation safety requires that the strategy should not be more exploitable than it would be in scenarios where opponent exploitation is not considered. Building on such adaptation safety, we further propose an Opponent eXploitation Search (OX-Search) framework by incorporating real-time search techniques for efficient online opponent exploitation. Moreover, we provide theoretical analyses to show the adaptation safety and robust exploitation of OX-Search, even with inaccurate opponent models. Empirical evaluations in popular poker games demonstrate OX-Search's superiority in both exploitability and exploitation compared to previous methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenxing Ge;Zheng Xu;Tianyu Ding;Linjian Meng;Bo An;Wenbin Li;Yang Gao", "authorids": "~Zhenxing_Ge1;~Zheng_Xu3;~Tianyu_Ding1;~Linjian_Meng1;~Bo_An2;~Wenbin_Li5;~Yang_Gao3", "gender": "M;;M;M;M;M;M", "homepage": "http://cs.nju.edu.cn/rl;https://xuzheng.space/;;https://personal.ntu.edu.sg/boan/;https://cs.nju.edu.cn/liwenbin/;https://cs.nju.edu.cn/gaoyang/;https://www.tianyuding.com", "dblp": ";;352/8649;42/6178-1.html;27/1736-6.html;89/4402-1;134/4796", "google_scholar": ";;;PEEpuNwAAAAJ;K-kC4yYAAAAJ;https://scholar.google.com.tw/citations?user=CJwLwzQAAAAJ;Qi7zTOcAAAAJ", "orcid": ";0009-0007-3486-6864;0000-0003-4616-760X;0000-0002-7064-7438;;;0000-0001-8445-4330", "linkedin": ";zheng-xu-259140193/;;;;;tianyuding/", "or_profile": "~Zhenxing_Ge1;~Zheng_Xu3;~Linjian_Meng1;~Bo_An2;~Wenbin_Li5;~Yang_Gao3;~Tianyu_DING2", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanyang Technological University;Nanjing University;Nanjing University;Microsoft", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;ntu.edu.sg;nju.edu.cn;nju.edu.cn;microsoft.com", "position": "PhD student;MS student;PhD student;Full Professor;Assistant Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nge2024safe,\ntitle={Safe and Robust Subgame Exploitation in Imperfect Information Games},\nauthor={Zhenxing Ge and Zheng Xu and Tianyu Ding and Linjian Meng and Bo An and Wenbin Li and Yang Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JV84NVo1em}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 688719, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=5439073009334386269&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;ntu.edu.sg;nju.edu.cn;nju.edu.cn;microsoft.com", "author_num": 7, "aff_unique_index": "0;0;0;1;0;0;2", "aff_unique_norm": "Nanjing University;Nanyang Technological University;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.nju.edu.cn;https://www.ntu.edu.sg;https://www.microsoft.com", "aff_unique_abbr": "Nanjing U;NTU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;2", "aff_country_unique": "China;Singapore;United States" }, { "title": "Estimating the Permanent by Nesting Importance Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34371", "id": "JVORowD4MD", "proceeding": "https://proceedings.mlr.press/v235/harviainen24a.html", "pdf": "https://openreview.net/pdf?id=JVORowD4MD", "openreview": "https://openreview.net/forum?id=JVORowD4MD", "author_site": "Juha Harviainen, Mikko Koivisto", "tldr": "", "abstract": "Sequential importance sampling (SIS) is one of the prominent methods for estimating high-dimensional integrals. For example, it is empirically the most efficient method known for estimating the permanent of nonnegative matrices, a notorious problem with numerous applications in computer science, statistics, and other fields. Unfortunately, SIS typically fails to provide accuracy guarantees due to difficulties in bounding the variance of the importance weights; for estimating the permanent with accuracy guarantees, the most efficient practical methods known are based on rejection sampling. Taking the best of both worlds, we give a variant of SIS, in which sampling is proportional to the upper bound used in rejection sampling. We show that this method is provably more efficient than its rejection sampling counterpart, particularly in high accuracy regimes. 
On estimating the permanent, we empirically obtain up to two orders-of-magnitude speedups over a state-of-the-art rejection sampling method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Juha Harviainen;Mikko Koivisto", "authorids": "~Juha_Harviainen1;~Mikko_Koivisto1", "gender": "M;M", "homepage": "https://juhaharviainen.com/;", "dblp": "258/7754;k/MikkoKoivisto", "google_scholar": "ftsJV7kAAAAJ;", "orcid": "0000-0002-4581-840X;", "linkedin": ";", "or_profile": "~Juha_Harviainen1;~Mikko_Koivisto1", "aff": "University of Helsinki;University of Helsinki", "aff_domain": "helsinki.fi;helsinki.fi", "position": "PhD student;Professor", "bibtex": "@inproceedings{\nharviainen2024estimating,\ntitle={Estimating the Permanent by Nesting Importance Sampling},\nauthor={Juha Harviainen and Mikko Koivisto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JVORowD4MD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 411370, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1dVtLAAAtxYJ:scholar.google.com/&scioq=Estimating+the+Permanent+by+Nesting+Importance+Sampling&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "helsinki.fi;helsinki.fi", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Helsinki", "aff_unique_dep": "", "aff_unique_url": "https://www.helsinki.fi", "aff_unique_abbr": "UH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Finland" }, { "title": "Towards AutoAI: Optimizing a Machine Learning System with Black-box and Differentiable Components", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34370", "id": "JVhUR8q27o", "proceeding": "https://proceedings.mlr.press/v235/chen24m.html", "pdf": "https://openreview.net/pdf?id=JVhUR8q27o", "openreview": "https://openreview.net/forum?id=JVhUR8q27o", "author_site": "Zhiliang Chen, Chuan-Sheng Foo, Bryan Kian Hsiang Low", "tldr": "", "abstract": "*Machine learning* (ML) models in the real world typically do not exist in isolation. They are usually part of a complex system (e.g., healthcare systems, self-driving cars) containing multiple ML and *black-box* components. The problem of optimizing such systems, which we refer to as *automated AI* (AutoAI), requires us to *jointly* train all ML components together and presents a significant challenge because the number of system parameters is extremely high and the system has no analytical form. To circumvent this, we introduce a novel algorithm called A-BAD-BO which uses each ML component's local loss as an auxiliary indicator for system performance. A-BAD-BO uses *Bayesian optimization* (BO) to optimize the local loss configuration of a system in a smaller dimensional space and exploits the differentiable structure of ML components to recover optimal system parameters from the optimized configuration. We show A-BAD-BO converges to optimal system parameters by showing that it is *asymptotically no regret*. We use A-BAD-BO to optimize several synthetic and real-world complex systems, including a prompt engineering pipeline for *large language models* containing millions of system parameters. 
Our results demonstrate that A-BAD-BO yields better system optimality than gradient-driven baselines and is more sample-efficient than pure BO algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiliang Chen;Chuan-Sheng Foo;Bryan Kian Hsiang Low", "authorids": "~Zhiliang_Chen1;~Chuan-Sheng_Foo1;~Bryan_Kian_Hsiang_Low1", "gender": "M;M;M", "homepage": ";http://ai.stanford.edu/~csfoo;http://www.comp.nus.edu.sg/~lowkh", "dblp": ";73/1823;97/4877", "google_scholar": "Cw7LTkkAAAAJ;AgbeqGkAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";0000-0002-4748-5792;", "linkedin": ";;", "or_profile": "~Zhiliang_Chen1;~Chuan-Sheng_Foo1;~Bryan_Kian_Hsiang_Low1", "aff": "National University of Singapore;Institute for Infocomm Research, A*STAR;National University of Singapore", "aff_domain": "nus.edu.sg;i2r.a-star.edu.sg;nus.edu.sg", "position": "PhD student;Principal Scientist;Associate Professor", "bibtex": "@inproceedings{\nchen2024towards,\ntitle={Towards Auto{AI}: Optimizing a Machine Learning System with Black-box and Differentiable Components},\nauthor={Zhiliang Chen and Chuan-Sheng Foo and Bryan Kian Hsiang Low},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JVhUR8q27o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2925586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YE16LCNSY1YJ:scholar.google.com/&scioq=Towards+AutoAI:+Optimizing+a+Machine+Learning+System+with+Black-box+and+Differentiable+Components&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "nus.edu.sg;i2r.a-star.edu.sg;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;Institute for Infocomm Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "NUS;I2R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Stochastic Optimization with Arbitrary Recurrent Data Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34369", "id": "JYcbgiSh0L", "proceeding": "https://proceedings.mlr.press/v235/powell24a.html", "pdf": "https://openreview.net/pdf?id=JYcbgiSh0L", "openreview": "https://openreview.net/forum?id=JYcbgiSh0L", "author_site": "William Powell, Hanbaek Lyu", "tldr": "", "abstract": "For obtaining optimal first-order convergence guarantees for stochastic optimization, it is necessary to use a recurrent data sampling algorithm that samples every data point with sufficient frequency. Most commonly used data sampling algorithms (e.g., i.i.d., MCMC, random reshuffling) are indeed recurrent under mild assumptions. In this work, we show that for a particular class of stochastic optimization algorithms, we do not need any further property (e.g., independence, exponential mixing, and reshuffling) beyond recurrence in data sampling to guarantee optimal rate of first-order convergence. Namely, using regularized versions of Minimization by Incremental Surrogate Optimization (MISO), we show that for non-convex and possibly non-smooth objective functions with constraints, the expected optimality gap converges at an optimal rate $O(n^{-1/2})$ under general recurrent sampling schemes. 
Furthermore, the implied constant depends explicitly on the 'speed of recurrence', measured by the expected amount of time to visit a data point, either averaged ('target time') or supremized ('hitting time') over the starting locations. We discuss applications of our general framework to decentralized optimization and distributed non-negative matrix factorization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Powell;Hanbaek Lyu", "authorids": "~William_Powell1;~Hanbaek_Lyu1", "gender": "M;", "homepage": ";https://www.hanbaeklyu.com", "dblp": "367/5708;", "google_scholar": ";gDFWvgQAAAAJ", "orcid": ";", "linkedin": "grayson-powell-91917596/;", "or_profile": "~William_Powell1;~Hanbaek_Lyu1", "aff": "University of Wisconsin - Madison;University of Wisconsin, Madison", "aff_domain": "wisc.edu;wisc.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\npowell2024stochastic,\ntitle={Stochastic Optimization with Arbitrary Recurrent Data Sampling},\nauthor={William Powell and Hanbaek Lyu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JYcbgiSh0L}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1003739, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aug6TjtD9bIJ:scholar.google.com/&scioq=Stochastic+Optimization+with+Arbitrary+Recurrent+Data+Sampling&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "wisc.edu;wisc.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Trainable Transformer in Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34368", "id": "JcxlFe2fGC", "proceeding": "https://proceedings.mlr.press/v235/panigrahi24a.html", "pdf": "https://openreview.net/pdf?id=JcxlFe2fGC", "openreview": "https://openreview.net/forum?id=JcxlFe2fGC", "author_site": "Abhishek Panigrahi, Sadhika Malladi, Mengzhou Xia, Sanjeev Arora", "tldr": "", "abstract": "Recent works attribute the capability of in-context learning (ICL) in large pre-trained language models to implicitly simulating and fine-tuning an internal model (e.g., linear or 2-layer MLP) during inference. However, such constructions require large memory overhead, which makes simulation of more sophisticated internal models intractable. In this work, we propose a new efficient construction, Transformer in Transformer (in short, TINT), that allows a transformer to simulate and fine-tune more complex models during inference (e.g., pre-trained language models). In particular, we introduce innovative approximation techniques that allow a TINT model with less than 2 billion parameters to simulate and fine-tune a 125 million parameter transformer model within a single forward pass. TINT accommodates many common transformer variants and its design ideas also improve the efficiency of past instantiations of simple models inside transformers. We conduct end-to-end experiments to validate the internal fine-tuning procedure of TINT on various language modeling and downstream tasks. 
For example, even with a limited one-step budget, we observe TINT for an OPT-125M model improves performance by 4 \u2212 16% absolute on average compared to OPT-125M. These findings suggest that large pre-trained language models are capable of performing intricate subroutines. To facilitate further work, a modular and extensible codebase for TINT is included.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhishek Panigrahi;Sadhika Malladi;Mengzhou Xia;Sanjeev Arora", "authorids": "~Abhishek_Panigrahi1;~Sadhika_Malladi2;~Mengzhou_Xia1;~Sanjeev_Arora1", "gender": "M;F;F;", "homepage": "https://abhishekpanigrahi1996.github.io/;https://www.cs.princeton.edu/~smalladi/;https://xiamengzhou.github.io/;http://www.cs.princeton.edu/~arora/", "dblp": "208/4926;176/9810;241/9329;a/SArora", "google_scholar": "https://scholar.google.co.in/citations?user=oMhp8p8AAAAJ;9HCmTcwAAAAJ;zyJn1IcAAAAJ;RUP4S68AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Abhishek_Panigrahi1;~Sadhika_Malladi2;~Mengzhou_Xia1;~Sanjeev_Arora1", "aff": "Princeton University;Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\npanigrahi2024trainable,\ntitle={Trainable Transformer in Transformer},\nauthor={Abhishek Panigrahi and Sadhika Malladi and Mengzhou Xia and Sanjeev Arora},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JcxlFe2fGC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3561644, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12741922747654223685&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu;princeton.edu;princeton.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "From Coarse to Fine: Enable Comprehensive Graph Self-supervised Learning with Multi-granular Semantic Ensemble", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34367", "id": "JnA9IveEwg", "proceeding": "https://proceedings.mlr.press/v235/wen24e.html", "pdf": "https://openreview.net/pdf?id=JnA9IveEwg", "openreview": "https://openreview.net/forum?id=JnA9IveEwg", "author_site": "Qianlong Wen, Mingxuan Ju, Zhongyu Ouyang, Chuxu Zhang, Yanfang Ye", "tldr": "", "abstract": "Self-supervised learning (SSL) has gained increasing attention in the graph learning community, owing to its capability of enabling powerful models pre-trained on large unlabeled graphs for general purposes, facilitating quick adaptation to specific domains. Though promising, existing graph SSL frameworks often struggle to capture both high-level abstract features and fine-grained features simultaneously, leading to sub-optimal generalization abilities across different downstream tasks.
To bridge this gap, we present Multi-granularity Graph Semantic Ensemble via Knowledge Distillation, namely MGSE, a plug-and-play graph knowledge distillation framework that can be applied to any existing graph SSL framework to enhance its performance by incorporating the concept of multi-granularity. Specifically, MGSE captures multi-granular knowledge by employing multiple student models to learn from a single teacher model, conditioned by probability distributions with different granularities. We apply it to six state-of-the-art graph SSL frameworks and evaluate their performances over multiple graph datasets across different domains. The experimental results show that MGSE can consistently boost the performance of these existing graph SSL frameworks with up to 9.2% improvement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qianlong Wen;Mingxuan Ju;Zhongyu Ouyang;Chuxu Zhang;Yanfang Ye", "authorids": "~Qianlong_Wen1;~Mingxuan_Ju1;~Zhongyu_Ouyang1;~Chuxu_Zhang2;~Yanfang_Ye1", "gender": "M;M;F;;", "homepage": "https://hoytwen.github.io/;https://jumxglhf.github.io;https://zyouyang.github.io/;;http://yes-lab.org/", "dblp": "301/6224;234/2715;326/3910;;", "google_scholar": "cc-uK9gAAAAJ;qNoO67AAAAAJ;ds4NE-gAAAAJ;;egjr888AAAAJ", "orcid": "0000-0003-3812-8395;0009-0008-9054-3856;;;", "linkedin": "qianlong-wen-87550a1a7/;;;;", "or_profile": "~Qianlong_Wen1;~Mingxuan_Ju1;~Zhongyu_Ouyang1;~Chuxu_Zhang2;~Yanfang_Ye1", "aff": "University of Notre Dame;University of Notre Dame;Dartmouth College;;University of Notre Dame", "aff_domain": "nd.edu;nd.edu;dartmouth.edu;;nd.edu", "position": "PhD student;PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nwen2024from,\ntitle={From Coarse to Fine: Enable Comprehensive Graph Self-supervised Learning with Multi-granular Semantic Ensemble},\nauthor={Qianlong Wen and Mingxuan Ju and Zhongyu Ouyang and Chuxu Zhang and Yanfang Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JnA9IveEwg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 723669, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10701124293258581230&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "nd.edu;nd.edu;dartmouth.edu;;nd.edu", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Notre Dame;Dartmouth College", "aff_unique_dep": ";", "aff_unique_url": "https://www.nd.edu;https://www.dartmouth.edu", "aff_unique_abbr": "Notre Dame;Dartmouth", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "FRAPP\u00c9: A Group Fairness Framework for Post-Processing Everything", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34366", "id": "JndWnomyIc", "proceeding": "https://proceedings.mlr.press/v235/tifrea24a.html", "pdf": "https://openreview.net/pdf?id=JndWnomyIc", "openreview": "https://openreview.net/forum?id=JndWnomyIc", "author_site": "Alexandru Tifrea, Preethi Lahoti, Ben Packer, Yoni Halpern, Ahmad Beirami, Flavien Prost", "tldr": "", "abstract": "Despite achieving promising fairness-error trade-offs, in-processing mitigation techniques for group fairness cannot be employed in numerous practical applications with limited computation resources or no access to the training pipeline of the
prediction model. In these situations, post-processing is a viable alternative. However, current methods are tailored to specific problem settings and fairness definitions and hence, are not as broadly applicable as in-processing. In this work, we propose a framework that turns any regularized in-processing method into a post-processing approach. This procedure prescribes a way to obtain post-processing techniques for a much broader range of problem settings than the prior post-processing literature. We show theoretically and through extensive experiments that our framework preserves the good fairness-error trade-offs achieved with in-processing and can improve over the effectiveness of prior post-processing methods. Finally, we demonstrate several advantages of a modular mitigation strategy that disentangles the training of the prediction model from the fairness mitigation, including better performance on tasks with partial group labels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexandru Tifrea;Preethi Lahoti;Ben Packer;Yoni Halpern;Ahmad Beirami;Flavien Prost", "authorids": "~Alexandru_Tifrea1;~Preethi_Lahoti1;~Ben_Packer1;~Yoni_Halpern1;~Ahmad_Beirami1;~Flavien_Prost1", "gender": "M;;M;M;M;", "homepage": ";;;;https://beirami.github.io/;", "dblp": "183/4666;;https://dblp.uni-trier.de/pers/hd/p/Packer:Benjamin;06/10023;41/9367;", "google_scholar": "i7T1FUsAAAAJ;;jzsx52EAAAAJ;sU6x0E0AAAAJ;VuKWbMMAAAAJ;R2EJThQAAAAJ", "orcid": ";;;;;", "linkedin": ";;ben-packer-aa6a613/;;ahmad-beirami-97001962;", "or_profile": "~Alexandru_Tifrea1;~Preethi_Lahoti1;~Ben_Packer1;~Yoni_Halpern1;~Ahmad_Beirami1;~Flavien_Prost1", "aff": "Swiss Federal Institute of Technology;;;Google;Massachusetts Institute of Technology;Google", "aff_domain": "ethz.ch;;;google.com;mit.edu;google.com", "position": "PhD student;;;Researcher;Research Affiliate;Researcher", "bibtex": "@inproceedings{\ntifrea2024frapp,\ntitle={{FRAPP}\\'E: A Group Fairness Framework for Post-Processing Everything},\nauthor={Alexandru Tifrea and Preethi Lahoti and Ben Packer and Yoni Halpern and Ahmad Beirami and Flavien Prost},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JndWnomyIc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1145998, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14489930753113723695&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ethz.ch;;;google.com;mit.edu;google.com", "author_num": 6, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Swiss Federal Institute of Technology;Google;Massachusetts Institute of Technology", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.ethz.ch;https://www.google.com;https://web.mit.edu", "aff_unique_abbr": "ETH Zurich;Google;MIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Switzerland;United States" }, { "title": "A Fixed-Point Approach for Causal Generative Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34365", "id": "JpzIGzru5F", "proceeding": "https://proceedings.mlr.press/v235/scetbon24a.html", "pdf": "https://openreview.net/pdf?id=JpzIGzru5F", "openreview": "https://openreview.net/forum?id=JpzIGzru5F", "author_site": "Meyer Scetbon, Joel Jennings, Agrin Hilmkil, Cheng Zhang, Chao Ma", "tldr": "", 
"abstract": "We propose a novel formalism for describing Structural Causal Models (SCMs) as fixed-point problems on causally ordered variables, eliminating the need for Directed Acyclic Graphs (DAGs), and establish the weakest known conditions for their unique recovery given the topological ordering (TO). Based on this, we design a two-stage causal generative model that first infers in a zero-shot manner a valid TO from observations, and then learns the generative SCM on the ordered variables. To infer TOs, we propose to amortize the learning of TOs on synthetically generated datasets by sequentially predicting the leaves of graphs seen during training. To learn SCMs, we design a transformer-based architecture that exploits a new attention mechanism enabling the modeling of causal structures, and show that this parameterization is consistent with our formalism. Finally, we conduct an extensive evaluation of each method individually, and show that when combined, our model outperforms various baselines on generated out-of-distribution problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meyer Scetbon;Joel Jennings;Agrin Hilmkil;Cheng Zhang;Chao Ma", "authorids": "~Meyer_Scetbon1;~Joel_Jennings1;~Agrin_Hilmkil1;~Cheng_Zhang1;~Chao_Ma2", "gender": "M;;;F;M", "homepage": "https://meyerscetbon.github.io;;;http://cheng-zhang.org;", "dblp": "249/8054;;;82/6384-5;", "google_scholar": ";;;r40iAwIAAAAJ;https://scholar.google.co.uk/citations?user=UWP3kWEAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Meyer_Scetbon1;~Joel_Jennings1;~Agrin_Hilmkil1;~Cheng_Zhang1;~Chao_Ma2", "aff": "Microsoft;;;Microsoft;Microsoft", "aff_domain": "microsoft.com;;;microsoft.com;microsoft.com", "position": "Researcher;;;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nscetbon2024a,\ntitle={A Fixed-Point Approach for Causal Generative Modeling},\nauthor={Meyer Scetbon and Joel Jennings and Agrin Hilmkil and Cheng Zhang and Chao Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JpzIGzru5F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1366640, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14545446690218919572&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "microsoft.com;;;microsoft.com;microsoft.com", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Prometheus: Out-of-distribution Fluid Dynamics Modeling with Disentangled Graph ODE", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34364", "id": "JsPvL6ExK8", "proceeding": "https://proceedings.mlr.press/v235/wu24aa.html", "pdf": "https://openreview.net/pdf?id=JsPvL6ExK8", "openreview": "https://openreview.net/forum?id=JsPvL6ExK8", "author_site": "Hao Wu, Huiyuan Wang, kun wang, Weiyan Wang, ChanganYe, Yangyu Tao, Chong Chen, Xian-Sheng Hua, Xiao Luo", "tldr": "", "abstract": "Fluid dynamics modeling has received extensive attention in the machine learning community. 
Although numerous graph neural network (GNN) approaches have been proposed for this task, the problem of out-of-distribution (OOD) generalization remains underexplored. In this work, we propose Prometheus, a new large-scale dataset that simulates tunnel and pool fires across various environmental conditions, and build an extensive benchmark of 12 baselines, which demonstrates that the OOD generalization performance is far from satisfactory. To tackle this, we introduce a new approach named Disentangled Graph ODE (DGODE), which learns disentangled representations for continuous interacting dynamics modeling. In particular, we utilize a temporal GNN and a frequency network to extract semantics from historical trajectories into node representations and environment representations, respectively. To mitigate the potential distribution shift, we minimize the mutual information between invariant node representations and the discretized environment features using adversarial learning. Then, they are fed into a coupled graph ODE framework, which models the evolution using neighboring nodes and dynamical environmental context. In addition, we improve the stability of the framework by perturbing the environment features to enhance robustness. Extensive experiments validate the effectiveness of DGODE compared with state-of-the-art approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Wu;Huiyuan Wang;Kun Wang;Weiyan Wang;ChanganYe;Yangyu Tao;Chong Chen;Xian-Sheng Hua;Xiao Luo", "authorids": "~Hao_Wu39;~Huiyuan_Wang1;~Kun_Wang15;~Weiyan_Wang1;~ChanganYe1;~Yangyu_Tao2;~Chong_Chen2;~Xian-Sheng_Hua1;~Xiao_Luo3", "gender": "M;M;M;M;M;M;;M;M", "homepage": "https://easylearningscores.github.io/;https://huiyuan-wang.github.io;http://home.ustc.edu.cn/~wk520529/#home;https://weiyan-wang.github.io/;;;;;http://luoxiao12.github.io", "dblp": "111;;;17/2902;https://dblp.org/;47/208.html;;56/5807-1;50/1585-1", "google_scholar": "HdXMhfcAAAAJ;tAmcOpgAAAAJ;UnyqjWQAAAAJ;https://scholar.google.com.hk/citations?hl=en;;;;https://scholar.google.co.uk/citations?user=6G-l4o0AAAAJ;https://scholar.google.com.hk/citations?", "orcid": "0009-0008-4084-1409;0009-0004-8796-7376;0000-0003-0602-169X;0000-0002-4105-0691;;;;;", "linkedin": ";;;;;yangyutao/;;xshua;%E9%9C%84-%E7%BD%97-303548214/", "or_profile": "~Hao_Wu39;~Huiyuan_Wang1;~Kun_Wang15;~Weiyan_Wang1;~ChanganYe1;~Yangyu_Tao2;~Chong_Chen2;~Xian-Sheng_Hua1;~Xiao_Luo3", "aff": "University of Science and Technology of China;University of Pennsylvania;University of Science and Technology of China;Tencent;Tencent MLPD;;;Terminus Group;University of California, Los Angeles", "aff_domain": "ustc.edu.cn;upenn.edu;ustc.edu.cn;tencent.com;tencent.com;;;tslsmart.com;cs.ucla.edu", "position": "MS student;Postdoc;PhD student;Researcher;Researcher;;;Principal Researcher;Postdoc", "bibtex": "@inproceedings{\nwu2024prometheus,\ntitle={Prometheus: Out-of-distribution Fluid Dynamics Modeling with Disentangled Graph {ODE}},\nauthor={Hao Wu and Huiyuan Wang and Kun Wang and Weiyan Wang and ChanganYe and Yangyu Tao and Chong Chen and Xian-Sheng Hua and Xiao Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JsPvL6ExK8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9672027, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 10, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=15750925589668357407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ustc.edu.cn;upenn.edu;ustc.edu.cn;tencent.com;tencent.com;;;tslsmart.com;cs.ucla.edu", "author_num": 9, "aff_unique_index": "0;1;0;2;2;3;4", "aff_unique_norm": "University of Science and Technology of China;University of Pennsylvania;Tencent;Terminus Group;University of California, Los Angeles", "aff_unique_dep": ";;Tencent Holdings Limited;;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.upenn.edu;https://www.tencent.com;;https://www.ucla.edu", "aff_unique_abbr": "USTC;UPenn;Tencent;;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;0;0;1", "aff_country_unique": "China;United States;" }, { "title": "Position: Scaling Simulation is Neither Necessary Nor Sufficient for In-the-Wild Robot Manipulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34363", "id": "Jtjurj7oIJ", "proceeding": "https://proceedings.mlr.press/v235/bharadhwaj24a.html", "pdf": "https://openreview.net/pdf?id=Jtjurj7oIJ", "openreview": "https://openreview.net/forum?id=Jtjurj7oIJ", "tldr": "", "abstract": "In this paper, we develop a structured critique of robotic simulations for real-world manipulation, by arguing that scaling simulators is neither necessary nor sufficient for making progress in general-purpose real-world robotic manipulation agents that are compliant with human preferences. With the ubiquity of robotic simulators, and recent efforts to scale them for diverse tasks, and at the same time the interest in generally capable real-world manipulation systems, we believe it is important to address the limitations of using simulation for real-world manipulation, so that as a community, we can focus our collective resources, energy, and time on approaches that have more principled odds of success. 
We further demonstrate the unique challenges that real-world manipulation presents, and show through examples and arguments why scaling simulation doesn't get us closer to solving these challenges required for diverse real-world deployment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Homanga Bharadhwaj", "authorids": "~Homanga_Bharadhwaj1", "gender": "M", "homepage": "https://homangab.github.io/", "dblp": "223/5842", "google_scholar": "https://scholar.google.ca/citations?user=wwW4HRQAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Homanga_Bharadhwaj1", "aff": "Meta Facebook", "aff_domain": "facebook.com", "position": "Visiting Researcher", "bibtex": "@inproceedings{\nbharadhwaj2024position,\ntitle={Position: Scaling Simulation is Neither Necessary Nor Sufficient for In-the-Wild Robot Manipulation},\nauthor={Homanga Bharadhwaj},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Jtjurj7oIJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2777445, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7997947488894418929&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "facebook.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Uncertainty Estimation by Density Aware Evidential Deep Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34362", "id": "JtkruFHcRK", "proceeding": "https://proceedings.mlr.press/v235/yoon24a.html", "pdf": "https://openreview.net/pdf?id=JtkruFHcRK", "openreview": "https://openreview.net/forum?id=JtkruFHcRK", "author_site": "Taeseong Yoon, Heeyoung Kim", "tldr": "", "abstract": "Evidential deep learning (EDL) has shown remarkable success in uncertainty estimation. However, there is still room for improvement, particularly in out-of-distribution (OOD) detection and classification tasks. The limited OOD detection performance of EDL arises from its inability to reflect the distance between the testing example and training data when quantifying uncertainty, while its limited classification performance stems from its parameterization of the concentration parameters. To address these limitations, we propose a novel method called *Density Aware Evidential Deep Learning (DAEDL)*. DAEDL integrates the feature space density of the testing example with the output of EDL during the prediction stage, while using a novel parameterization that resolves the issues in the conventional parameterization. We prove that DAEDL enjoys a number of favorable theoretical properties. 
DAEDL demonstrates state-of-the-art performance across diverse downstream tasks related to uncertainty estimation and classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taeseong Yoon;Heeyoung Kim", "authorids": "~Taeseong_Yoon1;~Heeyoung_Kim1", "gender": "M;", "homepage": "https://istat.kaist.ac.kr/;", "dblp": ";", "google_scholar": "MSiVD2gAAAAJ;", "orcid": ";", "linkedin": "\ud0dc\uc131-\uc724-3ba757208;", "or_profile": "~Taeseong_Yoon1;~Heeyoung_Kim1", "aff": "Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;", "position": "PhD student;", "bibtex": "@inproceedings{\nyoon2024uncertainty,\ntitle={Uncertainty Estimation by Density Aware Evidential Deep Learning},\nauthor={Taeseong Yoon and Heeyoung Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JtkruFHcRK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4052146, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15948731842503621040&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "kaist.ac.kr;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "Position: Building Guardrails for Large Language Models Requires Systematic Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34361", "id": "JvMLkGF2Ms", "proceeding": "https://proceedings.mlr.press/v235/dong24c.html", "pdf": "https://openreview.net/pdf?id=JvMLkGF2Ms", "openreview": "https://openreview.net/forum?id=JvMLkGF2Ms", "author_site": "Yi DONG, Ronghui Mu, Gaojie Jin, Yi Qi, Jinwei Hu, Xingyu Zhao, Jie Meng, Wenjie Ruan, Xiaowei Huang", "tldr": "", "abstract": "As Large Language Models (LLMs) become more integrated into our daily lives, it is crucial to identify and mitigate their risks, especially when the risks can have profound impacts on human users and societies. Guardrails, which filter the inputs or outputs of LLMs, have emerged as a core safeguarding technology. This position paper takes a deep look at current open-source solutions (Llama Guard, Nvidia NeMo, Guardrails AI), and discusses the challenges and the road towards building more complete solutions. Drawing on robust evidence from previous research, we advocate for a systematic approach to construct guardrails for LLMs, based on comprehensive consideration of diverse contexts across various LLMs applications. 
We propose employing socio-technical methods through collaboration with a multi-disciplinary team to pinpoint precise technical requirements, exploring advanced neural-symbolic implementations to embrace the complexity of the requirements, and developing verification and testing to ensure the utmost quality of the final product.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi DONG;Ronghui Mu;Gaojie Jin;Yi Qi;Jinwei Hu;Xingyu Zhao;Jie Meng;Wenjie Ruan;Xiaowei Huang", "authorids": "~Yi_DONG7;~Ronghui_Mu1;~Gaojie_Jin1;~Yi_Qi3;~Jinwei_Hu1;~Xingyu_Zhao1;j.meng@lboro.ac.uk;~Wenjie_Ruan2;~Xiaowei_Huang1", "gender": "M;;M;M;M;M;;;M", "homepage": "https://sites.google.com/view/yidong;;https://alexkael.github.io/;;;https://www.xzhao.me;;;https://cgi.csc.liv.ac.uk/~xiaowei/", "dblp": "07/2924-2;;276/5476;;;83/504-1;;;60/5414-1.html", "google_scholar": "p3ZEukYAAAAJ;;n_cu7jwAAAAJ;https://scholar.google.com/citations?hl=en;;SzEBdA8AAAAJ;;;https://scholar.google.co.uk/citations?user=X4fLCCIAAAAJ", "orcid": "0000-0003-3047-7777;;;;0009-0008-5261-211X;0000-0002-3474-349X;;;", "linkedin": ";;;;;xingyu-zhao-29877485/;;;", "or_profile": "~Yi_DONG7;~Ronghui_Mu1;~Gaojie_Jin1;~Yi_Qi3;~Jinwei_Hu1;~Xingyu_Zhao1;j.meng@lboro.ac.uk;~Wenjie_Ruan2;~Xiaowei_Huang1", "aff": "University of Southampton;;Chinese Academy of Sciences, Chinese Academy of Sciences;University of Liverpool;University of Liverpool;University of Warwick;;;University of Liverpool", "aff_domain": "soton.ac.uk;;ios.ac.cn;liverpool.ac.uk;liverpool.ac.uk;warwick.ac.uk;;;liverpool.ac.uk", "position": "Researcher;;Associate Professor;PhD student;PhD student;Assistant Professor;;;Full Professor", "bibtex": "@inproceedings{\ndong2024position,\ntitle={Position: Building Guardrails for Large Language Models Requires Systematic Design},\nauthor={Yi DONG and Ronghui Mu and Gaojie Jin and Yi Qi and Jinwei Hu and Xingyu Zhao and Jie Meng and Wenjie Ruan and Xiaowei Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JvMLkGF2Ms}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8042736, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10693140371202084533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "email": "soton.ac.uk;;ios.ac.cn;liverpool.ac.uk;liverpool.ac.uk;warwick.ac.uk;;;liverpool.ac.uk", "author_num": 9, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "University of Southampton;Chinese Academy of Sciences;University of Liverpool;University of Warwick", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.southampton.ac.uk;http://www.cas.cn;https://www.liverpool.ac.uk;https://www.warwick.ac.uk", "aff_unique_abbr": "Southampton;CAS;Liv Uni;Warwick", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United Kingdom;China" }, { "title": "MH-pFLID: Model Heterogeneous personalized Federated Learning via Injection and Distillation for Medical Data Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34360", "id": "Jvh8HM9YEJ", "proceeding": "https://proceedings.mlr.press/v235/xie24h.html", "pdf": "https://openreview.net/pdf?id=Jvh8HM9YEJ", "openreview": "https://openreview.net/forum?id=Jvh8HM9YEJ", "author_site": "Luyuan Xie, Manqing Lin, Tianyu Luan, Cong Li, Yuejian Fang, Qingni Shen, Zhonghai Wu", "tldr": "", 
"abstract": "Federated learning is widely used in medical applications for training global models without needing local data access, but varying computational capabilities and network architectures (system heterogeneity) across clients pose significant challenges in effectively aggregating information from non-independently and identically distributed (non-IID) data (statistic heterogeneity). Current federated learning methods using knowledge distillation require public datasets, raising privacy and data collection issues. Additionally, these datasets require additional local computing and storage resources, which is a burden for medical institutions with limited hardware conditions. In this paper, we introduce a novel federated learning paradigm, named Model Heterogeneous personalized Federated Learning via Injection and Distillation (MH-pFLID). Our framework leverages a lightweight messenger model, eliminating the need for public datasets and reducing the training cost for each client. We also develops receiver and transmitter modules for each client to separate local biases from generalizable information, reducing biased data collection and mitigating client drift. Our experiments on various medical tasks including image classification, image segmentation, and time-series classification, show MH-pFLID outperforms state-of-the-art methods in all these areas and has good generalizability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luyuan Xie;Manqing Lin;Tianyu Luan;Cong Li;Yuejian Fang;Qingni Shen;Zhonghai Wu", "authorids": "~Luyuan_Xie1;~Manqing_Lin1;~Tianyu_Luan1;~Cong_Li5;~Yuejian_Fang1;~Qingni_Shen1;~Zhonghai_Wu1", "gender": "M;F;M;M;M;;M", "homepage": "https://scholar.google.com/citations?user=6DN-P4wAAAAJ&hl=zh-CN;https://linmanqing.github.io;https://tyluann.github.io/;;http://www.ss.pku.edu.cn/index.php/teacherteam/teacherlist/1612-%E6%96%B9%E8%B7%83%E5%9D%9A;https://www.ss.pku.edu.cn/teacherteam/teacherlist/1634-%E6%B2%88%E6%99%B4%E9%9C%93.html;https://www.ss.pku.edu.cn", "dblp": "231/6712;;288/0432;;119/3697;11/325;01/868", "google_scholar": "6DN-P4wAAAAJ;;https://scholar.google.se/citations?user=XNzPzTIAAAAJ;;;Nm1fclcAAAAJ;", "orcid": ";;;0000-0001-6604-0708;;0000-0002-0605-6043;", "linkedin": ";;;;;;", "or_profile": "~Luyuan_Xie1;~Manqing_Lin1;~Tianyu_Luan1;~Cong_Li5;~Yuejian_Fang1;~Qingni_Shen1;~Zhonghai_Wu1", "aff": "Peking University;;State University of New York at Buffalo;Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;;buffalo.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;;PhD student;Postdoc;Associate Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nxie2024mhpflid,\ntitle={{MH}-p{FLID}: Model Heterogeneous personalized Federated Learning via Injection and Distillation for Medical Data Analysis},\nauthor={Luyuan Xie and Manqing Lin and Tianyu Luan and Cong Li and Yuejian Fang and Qingni Shen and Zhonghai Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Jvh8HM9YEJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9425134582488160313&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;;buffalo.edu;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 7, "aff_unique_index": 
"0;1;0;0;0;0", "aff_unique_norm": "Peking University;State University of New York at Buffalo", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.buffalo.edu", "aff_unique_abbr": "Peking U;SUNY Buffalo", "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Visual-Text Cross Alignment: Refining the Similarity Score in Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34359", "id": "JymXv7mkrQ", "proceeding": "https://proceedings.mlr.press/v235/li24ag.html", "pdf": "https://openreview.net/pdf?id=JymXv7mkrQ", "openreview": "https://openreview.net/forum?id=JymXv7mkrQ", "author_site": "Jinhao Li, Haopeng Li, Sarah Erfani, Lei Feng, James Bailey, Feng Liu", "tldr": "", "abstract": "It has recently been discovered that using a pre-trained *vision-language model* (VLM), e.g., CLIP, to align a whole query image with several finer text descriptions generated by a large language model can significantly enhance zero-shot performance. However, in this paper, we empirically find that the finer descriptions tend to align more effectively with *local areas of the query image* rather than the whole image, and then we theoretically validate this finding. Thus, we present a method called *weighted visual-text cross alignment* (WCA). This method begins with a *localized visual prompting* technique, designed to identify local visual areas within the query image. The local visual areas are then *cross-aligned* with the finer descriptions by creating a similarity matrix using the pre-trained VLM. To determine how well a query image aligns with each category, we develop a score function based on the weighted similarities in this matrix. 
Extensive experiments demonstrate that our method significantly improves zero-shot performance across various datasets, achieving results that are even comparable to few-shot learning methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinhao Li;Haopeng Li;Sarah Monazam Erfani;Lei Feng;James Bailey;Feng Liu", "authorids": "~Jinhao_Li2;~Haopeng_Li1;~Sarah_Monazam_Erfani1;~Lei_Feng1;~James_Bailey1;~Feng_Liu2", "gender": "M;M;;M;;M", "homepage": "https://jinhaolee.github.io;https://github.com/HopLee6;https://people.eng.unimelb.edu.au/smonazam/;https://lfeng1995.github.io/;;https://fengliu90.github.io/index.html", "dblp": "309/6695-4;39/7825;136/0170;76/847-6;;77/1318-3", "google_scholar": "https://scholar.google.com.au/citations?user=1mPh9R8AAAAJ;YSg_iL4AAAAJ;https://scholar.google.com.au/citations?user=Jq9ocx4AAAAJ;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0009-0006-9301-5579;0000-0001-8175-5381;;0000-0003-2839-5799;;0000-0002-5005-9129", "linkedin": "jinhao-li/;haopeng-li-b417a826b/;;;;alexfengliu", "or_profile": "~Jinhao_Li2;~Haopeng_Li1;~Sarah_Monazam_Erfani1;~Lei_Feng1;~James_Bailey1;~Feng_Liu2", "aff": "University of Melbourne;University of Melbourne;The University of Melbourne;Singapore University of Technology and Design;;University of Melbourne", "aff_domain": "unimelb.edu;unimelb.edu;unimelb.edu.au;sutd.edu.sg;;unimelb.edu.au", "position": "PhD student;PhD student;Associate Professor;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nli2024visualtext,\ntitle={Visual-Text Cross Alignment: Refining the Similarity Score in Vision-Language Models},\nauthor={Jinhao Li and Haopeng Li and Sarah Monazam Erfani and Lei Feng and James Bailey and Feng Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JymXv7mkrQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7781942, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10377298424563162761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "unimelb.edu;unimelb.edu;unimelb.edu.au;sutd.edu.sg;;unimelb.edu.au", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Melbourne;Singapore University of Technology and Design", "aff_unique_dep": ";", "aff_unique_url": "https://www.unimelb.edu.au;https://www.sutd.edu.sg", "aff_unique_abbr": "UniMelb;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Australia;Singapore" }, { "title": "Robust Multi-Task Learning with Excess Risks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34358", "id": "JzWFmMySpn", "proceeding": "https://proceedings.mlr.press/v235/he24n.html", "pdf": "https://openreview.net/pdf?id=JzWFmMySpn", "openreview": "https://openreview.net/forum?id=JzWFmMySpn", "author_site": "Yifei He, Shiji Zhou, Guojun Zhang, Hyokun Yun, Yi Xu, Belinda Zeng, Trishul Chilimbi, Han Zhao", "tldr": "", "abstract": "Multi-task learning (MTL) considers learning a joint model for multiple tasks by optimizing a convex combination of all task losses. To solve the optimization problem, existing methods use an adaptive weight updating scheme, where task weights are dynamically adjusted based on their respective losses to prioritize difficult tasks. 
However, these algorithms face a great challenge whenever *label noise* is present, in which case excessive weights tend to be assigned to noisy tasks that have relatively large Bayes optimal errors, thereby overshadowing other tasks and causing performance to drop across the board. To overcome this limitation, we propose **M**ulti-**T**ask **L**earning with **Excess** Risks (ExcessMTL), an excess risk-based task balancing method that updates the task weights by their distances to convergence instead. Intuitively, ExcessMTL assigns higher weights to worse-trained tasks that are further from convergence. To estimate the excess risks, we develop an efficient and accurate method with Taylor approximation. Theoretically, we show that our proposed algorithm achieves convergence guarantees and Pareto stationarity. Empirically, we evaluate our algorithm on various MTL benchmarks and demonstrate its superior performance over existing methods in the presence of label noise. Our code is available at https://github.com/yifei-he/ExcessMTL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifei He;Shiji Zhou;Guojun Zhang;Hyokun Yun;Yi Xu;Belinda Zeng;Trishul Chilimbi;Han Zhao", "authorids": "~Yifei_He1;~Shiji_Zhou1;~Guojun_Zhang1;~Hyokun_Yun1;~Yi_Xu10;~Belinda_Zeng1;~Trishul_Chilimbi1;~Han_Zhao1", "gender": ";M;M;M;M;Not Specified;;M", "homepage": "https://yifei-he.github.io/;https://arnoldshijizhou.github.io;https://gordon-guojun-zhang.github.io/;http://bikestra.github.io/;;;;https://hanzhaoml.github.io/", "dblp": ";294/8684;56/4451;45/9671;;;265/6085.html;03/3520-2", "google_scholar": "https://scholar.google.com/citations?hl=en;Do5jf8oAAAAJ;https://scholar.google.ca/citations?user=p8Y0xJEAAAAJ;W4oOmZEAAAAJ;y7BhrpQAAAAJ;;DrNeo_0AAAAJ;x942ipYAAAAJ", "orcid": ";0009-0000-0677-7396;;;;;;0000-0002-8579-1600", "linkedin": ";shiji-zhou-05b766ba/;guojun-zhang-bbb009a4/;hyokun-yun-b4439b7/;yeahgoyixu;belindazeng/;;", "or_profile": "~Yifei_He1;~Shiji_Zhou1;~Guojun_Zhang1;~Hyokun_Yun1;~Yi_Xu10;~Belinda_Zeng1;~Trishul_Chilimbi1;~Han_Zhao1", "aff": "University of Illinois Urbana-Champaign;Tsinghua University;Huawei Technologies Ltd.;Amazon;Amazon;Amazon;Amazon;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;mail.tsinghua.edu.cn;huawei.com;amazon.com;amazon.com;amazon.com;amazon.com;illinois.edu", "position": "PhD student;Postdoc;Researcher;Machine Learning Scientist;Senior Applied Science Manager;Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nhe2024robust,\ntitle={Robust Multi-Task Learning with Excess Risks},\nauthor={Yifei He and Shiji Zhou and Guojun Zhang and Hyokun Yun and Yi Xu and Belinda Zeng and Trishul Chilimbi and Han Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JzWFmMySpn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 555779, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12274055694986933898&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "illinois.edu;mail.tsinghua.edu.cn;huawei.com;amazon.com;amazon.com;amazon.com;amazon.com;illinois.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;3;3;3;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Tsinghua University;Huawei;Amazon", "aff_unique_dep": ";;Huawei Technologies;Amazon.com, Inc.", "aff_unique_url": 
"https://illinois.edu;https://www.tsinghua.edu.cn;https://www.huawei.com;https://www.amazon.com", "aff_unique_abbr": "UIUC;THU;Huawei;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Structure-based drug design by denoising voxel grids", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34357", "id": "K3fEkECWgu", "proceeding": "https://proceedings.mlr.press/v235/pinheiro24a.html", "pdf": "https://openreview.net/pdf?id=K3fEkECWgu", "openreview": "https://openreview.net/forum?id=K3fEkECWgu", "author_site": "Pedro O. Pinheiro, Arian Jamasb, Omar Mahmood, Vishnu Sresht, Saeed Saremi", "tldr": "", "abstract": "We presents VoxBind, a new score-based generative model for 3D molecules conditioned on protein structures. Our approach represents molecules as 3D atomic density grids and leverages a 3D voxel-denoising network for learning and generation. We extend the neural empirical Bayes formalism (Saremi & Hyv\u00e4rinen, 2019) to the conditional setting and generate structure-conditioned molecules with a two-step procedure: (i) sample noisy molecules from the Gaussian-smoothed conditional distribution with underdamped Langevin MCMC using the learned score function and (ii) estimate clean molecules from the noisy samples with single-step denoising. Compared to the current state of the art, our model is simpler to train, significantly faster to sample from, and achieves better results on extensive in silico benchmarks\u2014the generated molecules are more diverse, exhibit fewer steric clashes, and bind with higher affinity to protein pockets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pedro O. Pinheiro;Arian Rokkum Jamasb;Omar Mahmood;Vishnu Sresht;Saeed Saremi", "authorids": "~Pedro_O._Pinheiro1;~Arian_Rokkum_Jamasb1;~Omar_Mahmood1;~Vishnu_Sresht1;~Saeed_Saremi1", "gender": "M;;M;;M", "homepage": ";https://www.jamasb.io;;;https://saeedsaremi.github.io/", "dblp": "223/9937;296/2021;;;128/2619", "google_scholar": "https://scholar.google.ca/citations?user=BU6f7L4AAAAJ;https://scholar.google.co.uk/citations?user=hYm9a-UAAAAJ;;;", "orcid": ";0000-0002-6727-7579;0000-0002-4437-5416;;", "linkedin": ";jamasb/;;;", "or_profile": "~Pedro_O._Pinheiro1;~Arian_Rokkum_Jamasb1;~Omar_Mahmood1;~Vishnu_Sresht1;~Saeed_Saremi1", "aff": "Prescient Design, Genentech;Prescient Design / Roche / Genentech;Genentech;;Genentech", "aff_domain": "gene.com;roche.com;gene.com;;gene.com", "position": "Researcher;Researcher;ML Scientist;;Senior Principal Research Scientist", "bibtex": "@inproceedings{\npinheiro2024structurebased,\ntitle={Structure-based drug design by denoising voxel grids},\nauthor={Pedro O. 
Pinheiro and Arian Rokkum Jamasb and Omar Mahmood and Vishnu Sresht and Saeed Saremi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=K3fEkECWgu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9096571, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8019109323116477901&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "gene.com;roche.com;gene.com;;gene.com", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Genentech;Roche", "aff_unique_dep": "Prescient Design;", "aff_unique_url": "https://www.gene.com;https://www.roche.com", "aff_unique_abbr": "Genentech;Roche", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Improving Gradient-Guided Nested Sampling for Posterior Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34356", "id": "K5h6VAsJaV", "proceeding": "https://proceedings.mlr.press/v235/lemos24a.html", "pdf": "https://openreview.net/pdf?id=K5h6VAsJaV", "openreview": "https://openreview.net/forum?id=K5h6VAsJaV", "author_site": "Pablo Lemos, Nikolay Malkin, Will Handley, Yoshua Bengio, Yashar Hezaveh, Laurence Perreault-Levasseur", "tldr": "", "abstract": "We present a performant, general-purpose gradient-guided nested sampling (GGNS) algorithm, combining the state of the art in differentiable programming, Hamiltonian slice sampling, clustering, mode separation, dynamic nested sampling, and parallelization. This unique combination allows GGNS to scale well with dimensionality and perform competitively on a variety of synthetic and real-world problems. We also show the potential of combining nested sampling with generative flow networks to obtain large amounts of high-quality samples from the posterior distribution. 
This combination leads to faster mode discovery and more accurate estimates of the partition function.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pablo Lemos;Nikolay Malkin;Will Handley;Yoshua Bengio;Yashar Hezaveh;Laurence Perreault-Levasseur", "authorids": "~Pablo_Lemos1;~Nikolay_Malkin1;~Will_Handley1;~Yoshua_Bengio1;~Yashar_Hezaveh1;~Laurence_Perreault-Levasseur1", "gender": "M;;M;M;M;F", "homepage": "https://pablo-lemos.github.io;;https://www.kicc.cam.ac.uk/directory/wh260;http://yoshuabengio.org;https://www.astro.umontreal.ca/~hezaveh/hezaveh/Home.html;", "dblp": "313/2645;;229/4832;56/953;332/6554;", "google_scholar": "AklQTTsAAAAJ;;https://scholar.google.co.uk/citations?user=9Ow4mn0AAAAJ;kukA0LcAAAAJ;4tQoRHoAAAAJ;wVXcNOQAAAAJ", "orcid": "0000-0002-4728-8473;;0000-0002-5866-0445;;0000-0002-8669-5733;", "linkedin": ";;;yoshuabengio/?originalSubdomain=ca;;", "or_profile": "~Pablo_Lemos1;~Nikolay_Malkin1;~Will_Handley1;~Yoshua_Bengio1;~Yashar_Hezaveh1;~Laurence_Perreault-Levasseur1", "aff": "Universit\u00e9 de Montr\u00e9al;;University of Cambridge;University of Montreal;Universit\u00e9 de Montr\u00e9al;Universit\u00e9 de Montr\u00e9al", "aff_domain": "umontreal.ca;;cam.ac.uk;umontreal.ca;umontreal.ca;umontreal.ca", "position": "Postdoc;;Principal Researcher;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nlemos2024improving,\ntitle={Improving Gradient-Guided Nested Sampling for Posterior Inference},\nauthor={Pablo Lemos and Nikolay Malkin and Will Handley and Yoshua Bengio and Yashar Hezaveh and Laurence Perreault-Levasseur},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=K5h6VAsJaV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4478926, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17274674397152848121&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "umontreal.ca;;cam.ac.uk;umontreal.ca;umontreal.ca;umontreal.ca", "author_num": 6, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;University of Cambridge;University of Montreal", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umontreal.ca;https://www.cam.ac.uk;https://wwwumontreal.ca", "aff_unique_abbr": "UdeM;Cambridge;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Canada;United Kingdom" }, { "title": "Active Adaptive Experimental Design for Treatment Effect Estimation with Covariate Choice", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34355", "id": "K6HpbvkrwO", "proceeding": "https://proceedings.mlr.press/v235/kato24a.html", "pdf": "https://openreview.net/pdf?id=K6HpbvkrwO", "openreview": "https://openreview.net/forum?id=K6HpbvkrwO", "author_site": "Masahiro Kato, Oga Akihiro, Wataru Komatsubara, Ryo Inokuchi", "tldr": "", "abstract": "This study designs an adaptive experiment for efficiently estimating *average treatment effects* (ATEs). In each round of our adaptive experiment, an experimenter sequentially samples an experimental unit, assigns a treatment, and observes the corresponding outcome immediately. At the end of the experiment, the experimenter estimates an ATE using the gathered samples. The objective is to estimate the ATE with a smaller asymptotic variance. 
Existing studies have designed experiments that adaptively optimize the propensity score (treatment-assignment probability). As a generalization of such an approach, we propose optimizing the covariate density as well as the propensity score. First, we derive the efficient covariate density and propensity score that minimize the semiparametric efficiency bound and find that optimizing both covariate density and propensity score minimizes the semiparametric efficiency bound more effectively than optimizing only the propensity score. Next, we design an adaptive experiment using the efficient covariate density and propensity score sequentially estimated during the experiment. Lastly, we propose an ATE estimator whose asymptotic variance aligns with the minimized semiparametric efficiency bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Masahiro Kato;Akihiro Oga;Wataru Komatsubara;Ryo Inokuchi", "authorids": "~Masahiro_Kato1;akihiro-oga@fintec.co.jp;wataru-komatsubara@fintec.co.jp;ryo-inokuchi@fintec.co.jp", "gender": "M;;;", "homepage": "https://masakat0.github.io/;;;", "dblp": ";;;", "google_scholar": "https://scholar.google.co.jp/schhp?hl=ja;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Masahiro_Kato1;akihiro-oga@fintec.co.jp;wataru-komatsubara@fintec.co.jp;ryo-inokuchi@fintec.co.jp", "aff": "The University of Tokyo;;;", "aff_domain": "tokyo.ac.jp;;;", "position": "PhD student;;;", "bibtex": "@inproceedings{\nkato2024active,\ntitle={Active Adaptive Experimental Design for Treatment Effect Estimation with Covariate Choice},\nauthor={Masahiro Kato and Akihiro Oga and Wataru Komatsubara and Ryo Inokuchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=K6HpbvkrwO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1905637, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10739530910120690973&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "tokyo.ac.jp;;;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank Modifications", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34354", "id": "K6xxnKN2gm", "proceeding": "https://proceedings.mlr.press/v235/wei24f.html", "pdf": "https://openreview.net/pdf?id=K6xxnKN2gm", "openreview": "https://openreview.net/forum?id=K6xxnKN2gm", "author_site": "Boyi Wei, Kaixuan Huang, Yangsibo Huang, Tinghao Xie, Xiangyu Qi, Mengzhou Xia, Prateek Mittal, Mengdi Wang, Peter Henderson", "tldr": "", "abstract": "Large language models (LLMs) show inherent brittleness in their safety mechanisms, as evidenced by their susceptibility to jailbreaking and even non-malicious fine-tuning. This study explores this brittleness of safety alignment by leveraging pruning and low-rank modifications. We develop methods to identify critical regions that are vital for safety guardrails, and that are disentangled from utility-relevant regions at both the neuron and rank levels. Surprisingly, the isolated regions we find are sparse, comprising about $3$ % at the parameter level and $2.5$ % at the rank level. 
Removing these regions compromises safety without significantly impacting utility, corroborating the inherent brittleness of the model's safety mechanisms. Moreover, we show that LLMs remain vulnerable to low-cost fine-tuning attacks even when modifications to the safety-critical regions are restricted. These findings underscore the urgent need for more robust safety strategies in LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boyi Wei;Kaixuan Huang;Yangsibo Huang;Tinghao Xie;Xiangyu Qi;Mengzhou Xia;Prateek Mittal;Mengdi Wang;Peter Henderson", "authorids": "~Boyi_Wei2;~Kaixuan_Huang1;~Yangsibo_Huang2;~Tinghao_Xie1;~Xiangyu_Qi2;~Mengzhou_Xia1;~Prateek_Mittal1;~Mengdi_Wang1;~Peter_Henderson1", "gender": "M;M;F;M;M;F;;F;M", "homepage": "https://www.boyiwei.com/;https://hackyhuang.github.io/;https://hazelsuko07.github.io/yangsibo/;https://tinghaoxie.com;https://unispac.github.io;https://xiamengzhou.github.io/;http://www.princeton.edu/~pmittal/;http://mwang.princeton.edu;http://www.peterhenderson.co/", "dblp": ";;;307/5298;274/2321;241/9329;;;h/PeterHenderson2", "google_scholar": ";EfxwV6oAAAAJ;NMPUDa0AAAAJ;gFT5XpMAAAAJ;9Za3rmkAAAAJ;zyJn1IcAAAAJ;https://scholar.google.com.tw/citations?user=xTKD8J4AAAAJ;;dy_JBs0AAAAJ", "orcid": ";;;;;;0000-0002-4057-0118;;", "linkedin": ";;;;;;;;phende/", "or_profile": "~Boyi_Wei2;~Kaixuan_Huang1;~Yangsibo_Huang2;~Tinghao_Xie1;~Xiangyu_Qi2;~Mengzhou_Xia1;~Prateek_Mittal1;~Mengdi_Wang1;~Peter_Henderson1", "aff": "Princeton University;Princeton University;Princeton University;Meta Facebook;Princeton University;Princeton University;Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu;meta.com;princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;PhD student;Intern;PhD student;PhD student;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwei2024assessing,\ntitle={Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank Modifications},\nauthor={Boyi Wei and Kaixuan Huang and Yangsibo Huang and Tinghao Xie and Xiangyu Qi and Mengzhou Xia and Prateek Mittal and Mengdi Wang and Peter Henderson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=K6xxnKN2gm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1029900, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6415168707338493884&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu;princeton.edu;meta.com;princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu", "author_num": 9, "aff_unique_index": "0;0;0;1;0;0;0;0;0", "aff_unique_norm": "Princeton University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.princeton.edu;https://meta.com", "aff_unique_abbr": "Princeton;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neighboring Perturbations of Knowledge Editing on Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34353", "id": "K9NTPRvVRI", "proceeding": "https://proceedings.mlr.press/v235/ma24h.html", "pdf": "https://openreview.net/pdf?id=K9NTPRvVRI", "openreview": 
"https://openreview.net/forum?id=K9NTPRvVRI", "author_site": "Jun-Yu Ma, Zhen-Hua Ling, Ningyu Zhang, Jia-Chen Gu", "tldr": "", "abstract": "Despite their exceptional capabilities, large language models (LLMs) are prone to generating unintended text due to false or outdated knowledge. Given the resource-intensive nature of retraining LLMs, there has been a notable increase in the development of knowledge editing. However, current approaches and evaluations rarely explore the perturbation of editing on neighboring knowledge. This paper studies whether updating new knowledge to LLMs perturbs the neighboring knowledge encapsulated within them. Specifically, we seek to figure out whether appending a new answer into an answer list to a factual question leads to catastrophic forgetting of original correct answers in this list, as well as unintentional inclusion of incorrect answers. A metric of additivity is introduced and a benchmark dubbed as Perturbation Evaluation of Appending Knowledge (PEAK) is constructed to evaluate the degree of perturbation to neighboring knowledge when appending new knowledge. Besides, a plug-and-play framework termed Appending via Preservation and Prevention (APP) is proposed to mitigate the neighboring perturbation by maintaining the integrity of the answer list. Experiments demonstrate the effectiveness of APP coupling with four editing methods on three LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun-Yu Ma;Zhen-Hua Ling;Ningyu Zhang;Jia-Chen Gu", "authorids": "~Jun-Yu_Ma1;~Zhen-Hua_Ling1;~Ningyu_Zhang1;~Jia-Chen_Gu1", "gender": "M;M;M;M", "homepage": "http://home.ustc.edu.cn/~mjy1999/;http://staff.ustc.edu.cn/~zhling/;https://person.zju.edu.cn/en/ningyu;https://jasonforjoy.github.io/", "dblp": "315/4046;70/5210;139/4181-1.html;93/3604.html", "google_scholar": "9BbSL1EAAAAJ;f8jRR3EAAAAJ;xQDOPvsAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-1970-0678;", "linkedin": ";;ningyuzhang/;", "or_profile": "~Jun-Yu_Ma1;~Zhen-Hua_Ling1;~Ningyu_Zhang1;~Jia-Chen_Gu1", "aff": "University of Science and Technology of China;University of Science and Technology of China;Zhejiang University;University of California, Los Angeles", "aff_domain": "ustc.edu.cn;ustc.edu.cn;zju.edu.cn;ucla.edu", "position": "PhD student;Professor;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nma2024neighboring,\ntitle={Neighboring Perturbations of Knowledge Editing on Large Language Models},\nauthor={Jun-Yu Ma and Zhen-Hua Ling and Ningyu Zhang and Jia-Chen Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=K9NTPRvVRI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 854595, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11523538194391813874&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;ustc.edu.cn;zju.edu.cn;ucla.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Science and Technology of China;Zhejiang University;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.zju.edu.cn;https://www.ucla.edu", "aff_unique_abbr": "USTC;ZJU;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": 
"Accelerating Convergence of Score-Based Diffusion Models, Provably", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34352", "id": "KB6slOUQP9", "proceeding": "https://proceedings.mlr.press/v235/li24ad.html", "pdf": "https://openreview.net/pdf?id=KB6slOUQP9", "openreview": "https://openreview.net/forum?id=KB6slOUQP9", "author_site": "Gen Li, Yu Huang, Timofey Efimov, Yuting Wei, Yuejie Chi, Yuxin Chen", "tldr": "", "abstract": "Score-based diffusion models, while achieving remarkable empirical performance, often suffer from low sampling speed, due to extensive function evaluations needed during the sampling phase. Despite a flurry of recent activities towards speeding up diffusion generative modeling in practice, theoretical underpinnings for acceleration techniques remain severely limited. In this paper, we design novel training-free algorithms to accelerate popular deterministic (i.e., DDIM) and stochastic (i.e., DDPM) samplers. Our accelerated deterministic sampler converges at a rate $O(\\frac{1}{{T}^2})$ with $T$ the number of steps, improving upon the $O(\\frac{1}{T})$ rate for the DDIM sampler; and our accelerated stochastic sampler converges at a rate $O(\\frac{1}{T})$, outperforming the rate $O(\\frac{1}{\\sqrt{T}})$ for the DDPM sampler. The design of our algorithms leverages insights from higher-order approximation, and shares similar intuitions as popular high-order ODE solvers like the DPM-Solver-2. Our theory accommodates $\\ell_2$-accurate score estimates, and does not require log-concavity or smoothness on the target distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gen Li;Yu Huang;Timofey Efimov;Yuting Wei;Yuejie Chi;Yuxin Chen", "authorids": "~Gen_Li2;~Yu_Huang3;~Timofey_Efimov1;~Yuting_Wei1;~Yuejie_Chi1;~Yuxin_Chen5", "gender": "M;F;Not Specified;F;;M", "homepage": ";https://yuhuang42.org/;;https://yutingwei.github.io/;;https://yuxinchen2020.github.io/", "dblp": "28/538-5.html;39/6301-23;;184/3856;;11/5123-2", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;lYfTddIAAAAJ;fsbXdAYAAAAJ;;RtNVud4AAAAJ", "orcid": "0000-0002-3078-9191;;;;;0000-0001-9256-5815", "linkedin": ";;;;;", "or_profile": "~Gen_Li2;~Yu_Huang3;~Timofey_Efimov1;~Yuting_Wei1;~Yuejie_Chi1;~Yuxin_Chen5", "aff": "The Chinese University of Hong Kong;The Wharton School, University of Pennsylvania;Carnegie Mellon University;The Wharton School, University of Pennsylvania;;University of Pennsylvania", "aff_domain": "cuhk.edu.hk;wharton.upenn.edu;andrew.cmu.edu;wharton.upenn.edu;;upenn.edu", "position": "Assistant Professor;PhD student;PhD student;Assistant Professor;;Associate Professor", "bibtex": "@inproceedings{\nli2024accelerating,\ntitle={Accelerating Convergence of Score-Based Diffusion Models, Provably},\nauthor={Gen Li and Yu Huang and Timofey Efimov and Yuting Wei and Yuejie Chi and Yuxin Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KB6slOUQP9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6778407, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7979275591309708466&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 12, "email": "cuhk.edu.hk;wharton.upenn.edu;andrew.cmu.edu;wharton.upenn.edu;;upenn.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Chinese University of Hong 
Kong;University of Pennsylvania;Carnegie Mellon University", "aff_unique_dep": ";The Wharton School;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.wharton.upenn.edu;https://www.cmu.edu", "aff_unique_abbr": "CUHK;UPenn Wharton;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Shifted Interpolation for Differential Privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34351", "id": "KCVCFsPkrm", "proceeding": "https://proceedings.mlr.press/v235/bok24a.html", "pdf": "https://openreview.net/pdf?id=KCVCFsPkrm", "openreview": "https://openreview.net/forum?id=KCVCFsPkrm", "author_site": "Jinho Bok, Weijie Su, Jason Altschuler", "tldr": "", "abstract": "Noisy gradient descent and its variants are the predominant algorithms for differentially private machine learning. It is a fundamental question to quantify their privacy leakage, yet tight characterizations remain open even in the foundational setting of convex losses. This paper improves over previous analyses by establishing (and refining) the \u201cprivacy amplification by iteration\u201d phenomenon in the unifying framework of $f$-differential privacy---which tightly captures all aspects of the privacy loss and immediately implies tighter privacy accounting in other notions of differential privacy, e.g., $(\\varepsilon,\\delta)$-DP and R\u00e9nyi DP. Our key technical insight is the construction of *shifted interpolated processes* that unravel the popular shifted-divergences argument, enabling generalizations beyond divergence-based relaxations of DP. Notably, this leads to the first *exact* privacy analysis in the foundational setting of strongly convex optimization. Our techniques extend to many settings: convex/strongly convex, constrained/unconstrained, full/cyclic/stochastic batches, and all combinations thereof. As an immediate corollary, we recover the $f$-DP characterization of the exponential mechanism for strongly convex optimization in Gopi et al. 
(2022), and moreover extend this result to more general settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinho Bok;Weijie J Su;Jason Altschuler", "authorids": "~Jinho_Bok1;~Weijie_J_Su1;~Jason_Altschuler1", "gender": ";M;", "homepage": "https://jinhobok.github.io;http://stat.wharton.upenn.edu/~suw/;http://www.mit.edu/~jasonalt/", "dblp": ";228/9127;180/5366", "google_scholar": ";Uhf4nBkAAAAJ;", "orcid": ";;0000-0001-7367-0097", "linkedin": ";;", "or_profile": "~Jinho_Bok1;~Weijie_J_Su1;~Jason_Altschuler1", "aff": "The Wharton School, University of Pennsylvania;University of Pennsylvania;", "aff_domain": "wharton.upenn.edu;upenn.edu;", "position": "PhD student;Associate Professor;", "bibtex": "@inproceedings{\nbok2024shifted,\ntitle={Shifted Interpolation for Differential Privacy},\nauthor={Jinho Bok and Weijie J Su and Jason Altschuler},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KCVCFsPkrm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 816356, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2554494805198463499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "wharton.upenn.edu;upenn.edu;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "The Wharton School", "aff_unique_url": "https://www.wharton.upenn.edu", "aff_unique_abbr": "UPenn Wharton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Leverage Class-Specific Accuracy to Guide Data Generation for Improving Image Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34350", "id": "KHymcy2xxF", "proceeding": "https://proceedings.mlr.press/v235/gala24a.html", "pdf": "https://openreview.net/pdf?id=KHymcy2xxF", "openreview": "https://openreview.net/forum?id=KHymcy2xxF", "author_site": "Jay Gala, Pengtao Xie", "tldr": "", "abstract": "In many image classification applications, the number of labeled training images is limited, which leads to model overfitting. To mitigate the lack of training data, deep generative models have been leveraged to generate synthetic training data. However, existing methods generate data for individual classes based on how much training data they have without considering their actual data needs. To address this limitation, we propose needs-aware image generation, which automatically identifies the different data needs of individual classes based on their classification performance and divides a limited data generation budget among these classes according to their needs. We propose a multi-level optimization-based framework that performs four learning stages in an end-to-end manner. 
Experiments on both imbalanced and balanced classification datasets demonstrate the effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jay Gala;Pengtao Xie", "authorids": "~Jay_Gala1;~Pengtao_Xie3", "gender": "M;M", "homepage": "https://jaygala24.github.io;https://pengtaoxie.github.io/", "dblp": "308/1490;133/1998", "google_scholar": "https://scholar.google.com/citations?hl=en;cnncomYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jay_Gala1;~Pengtao_Xie3", "aff": "AI4Bharat;Carnegie Mellon University", "aff_domain": "ai4bharat.org; ", "position": "Researcher;Graduate Student", "bibtex": "@inproceedings{\ngala2024leverage,\ntitle={Leverage Class-Specific Accuracy to Guide Data Generation for Improving Image Classification},\nauthor={Jay Gala and Pengtao Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KHymcy2xxF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1438283, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:z2-G1PRJrdMJ:scholar.google.com/&scioq=Leverage+Class-Specific+Accuracy+to+Guide+Data+Generation+for+Improving+Image+Classification&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "ai4bharat.org; ", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "AI4Bharat;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": ";https://www.cmu.edu", "aff_unique_abbr": ";CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "India;United States" }, { "title": "DFD: Distilling the Feature Disparity Differently for Detectors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34349", "id": "KI3JKFKciG", "proceeding": "https://proceedings.mlr.press/v235/liu24cd.html", "pdf": "https://openreview.net/pdf?id=KI3JKFKciG", "openreview": "https://openreview.net/forum?id=KI3JKFKciG", "author_site": "Kang Liu, Yingyi Zhang, Jingyun Zhang, Jinmin Li, Jun Wang, ShaoMing Wang, Chun Yuan, Rizen Guo", "tldr": "", "abstract": "Knowledge distillation is a widely adopted model compression technique that has been successfully applied to object detection. In feature distillation, it is common practice for the student model to imitate the feature responses of the teacher model, with the underlying objective of improving its own abilities by reducing the disparity with the teacher. However, it is crucial to recognize that the disparities between the student and teacher are inconsistent, highlighting their varying abilities. In this paper, we explore the inconsistency in the disparity between teacher and student feature maps and analyze their impact on the efficiency of the distillation. We find that regions with varying degrees of difference should be treated separately, with different distillation constraints applied accordingly. We introduce our distillation method called Disparity Feature Distillation (DFD). The core idea behind DFD is to apply different treatments to regions with varying learning difficulties, simultaneously incorporating leniency and strictness. It enables the student to better assimilate the teacher\u2019s knowledge. Through extensive experiments, we demonstrate the effectiveness of our proposed DFD in achieving significant improvements. 
For instance, when applied to detectors based on ResNet50 such as RetinaNet, FasterRCNN, and RepPoints, our method enhances their mAP from 37.4%, 38.4%, 38.6% to 41.7%, 42.4%, 42.7%, respectively. Our approach also demonstrates substantial improvements on YOLO and ViT-based models. The code is available at https://github.com/luckin99/DFD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kang Liu;Yingyi Zhang;Jingyun Zhang;Jinmin Li;Jun Wang;ShaoMing Wang;Chun Yuan;Rizen Guo", "authorids": "~Kang_Liu4;~Yingyi_Zhang2;~Jingyun_Zhang1;~Jinmin_Li1;~Jun_Wang43;~ShaoMing_Wang1;~Chun_Yuan1;~Rizen_Guo1", "gender": "M;F;M;M;M;M;;F", "homepage": ";;https://github.com/THU-Kingmin/;;https://www.githubs.cn/;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;http://grz.qzone.qq.com;https://github.com/naskyzhang", "dblp": ";;;;;;;127/3055", "google_scholar": ";https://scholar.google.com.hk/citations?user=IKW4zlAAAAAJ;-n0TM18AAAAJ;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;;", "orcid": ";;0009-0000-7098-2589;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Kang_Liu4;~Yingyi_Zhang2;~Jinmin_Li1;~Jun_Wang43;~ShaoMing_Wang1;~Chun_Yuan1;~Rizen_Guo1;~Zhang_Jingyun1", "aff": "Tsinghua University;Tencent Youtu Lab;Tsinghua University;Tencent;WeChat Pay Lab33;Tsinghua University;WeChat Pay Lab33;Tencent", "aff_domain": "tsinghua.edu.cn;tencent.com;tsinghua.edu.cn;tencent.com;tencent.com;tsinghua.edu.cn;tencent.com;tencent.com", "position": "MS student;Researcher;MS student;Researcher;Researcher;Full Professor;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nliu2024dfd,\ntitle={{DFD}: Distilling the Feature Disparity Differently for Detectors},\nauthor={Kang Liu and Yingyi Zhang and Jingyun Zhang and Jinmin Li and Jun Wang and ShaoMing Wang and Chun Yuan and Rizen Guo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KI3JKFKciG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7322438, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h8iAEpHc1eoJ:scholar.google.com/&scioq=DFD:+Distilling+the+Feature+Disparity+Differently+for+Detectors&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "tsinghua.edu.cn;tencent.com;tsinghua.edu.cn;tencent.com;tencent.com;tsinghua.edu.cn;tencent.com;tencent.com", "author_num": 8, "aff_unique_index": "0;1;0;1;2;0;2;1", "aff_unique_norm": "Tsinghua University;Tencent;WeChat Pay", "aff_unique_dep": ";Youtu Lab;WeChat Pay Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.tencent.com;https://pay.weixin.qq.com", "aff_unique_abbr": "THU;Tencent;WeChat Pay", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34348", "id": "KJL2b6BthC", "proceeding": "https://proceedings.mlr.press/v235/sel24a.html", "pdf": "https://openreview.net/pdf?id=KJL2b6BthC", "openreview": "https://openreview.net/forum?id=KJL2b6BthC", "author_site": "Bilgehan Sel, Ahmad Al-Tawaha, Vanshaj Khattar, Ruoxi Jia, Ming Jin", "tldr": "", "abstract": "Current literature, aiming to surpass the \"Chain-of-Thought\" approach, often resorts to external modi operandi involving 
halting, modifying, and then resuming the generation process to boost Large Language Models' (LLMs) reasoning capacities. Due to their *myopic perspective*, they escalate the number of query requests, leading to increased costs, memory, and computational overheads. Addressing this, we propose the *Algorithm of Thoughts*---a novel strategy that propels LLMs through algorithmic reasoning pathways. By employing algorithmic examples fully in-context, this overarching view of the whole process exploits the innate recurrence dynamics of LLMs, expanding their idea exploration with merely one or a few queries. Our technique outperforms earlier single-query methods and even more recent multi-query strategies that employ extensive tree search algorithms while using significantly fewer tokens. Intriguingly, our results suggest that instructing an LLM using an algorithm can lead to performance surpassing that of the algorithm itself, hinting at the LLM's inherent ability to weave its intuition into optimized searches. We probe into the underpinnings of our method's efficacy and its nuances in application. The code and related content can be found in: https://algorithm-of-thoughts.github.io", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bilgehan Sel;Ahmad Tawaha;Vanshaj Khattar;Ruoxi Jia;Ming Jin", "authorids": "~Bilgehan_Sel1;~Ahmad_Tawaha1;~Vanshaj_Khattar1;~Ruoxi_Jia1;~Ming_Jin2", "gender": "M;M;;;M", "homepage": "https://www.bilgehansel.com/;;;https://ruoxijia.info/;http://www.jinming.tech/", "dblp": "335/1479;;;147/5355-1;", "google_scholar": "Gf7GHgYAAAAJ;https://scholar.google.com/citations?hl=en;;JCrug-YAAAAJ;YdxdTtkAAAAJ", "orcid": "0000-0001-8701-6539;;;;", "linkedin": ";;;;", "or_profile": "~Bilgehan_Sel1;~Ahmad_Tawaha1;~Vanshaj_Khattar1;~Ruoxi_Jia1;~Ming_Jin2", "aff": "Virginia Polytechnic Institute and State University;Virginia Polytechnic Institute and State University;;Virginia Tech;Virginia Tech", "aff_domain": "vt.edu;vt.edu;;vt.edu;vt.edu", "position": "PhD student;PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nsel2024algorithm,\ntitle={Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language Models},\nauthor={Bilgehan Sel and Ahmad Tawaha and Vanshaj Khattar and Ruoxi Jia and Ming Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KJL2b6BthC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 592913, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2229453902203727345&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "vt.edu;vt.edu;;vt.edu;vt.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Virginia Tech", "aff_unique_dep": "", "aff_unique_url": "https://www.vt.edu", "aff_unique_abbr": "VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Embarrassingly Parallel GFlowNets", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34347", "id": "KJhLpzqNri", "proceeding": "https://proceedings.mlr.press/v235/silva24a.html", "pdf": "https://openreview.net/pdf?id=KJhLpzqNri", "openreview": "https://openreview.net/forum?id=KJhLpzqNri", "author_site": "Tiago Silva, Luiz Carvalho, Amauri Souza, Samuel Kaski, Diego Mesquita", "tldr": "", "abstract": "GFlowNets are 
a promising alternative to MCMC sampling for discrete compositional random variables. Training GFlowNets requires repeated evaluations of the unnormalized target distribution, or reward function. However, for large-scale posterior sampling, this may be prohibitive since it requires traversing the data several times. Moreover, if the data are distributed across clients, employing standard GFlowNets leads to intensive client-server communication. To alleviate both these issues, we propose _embarrassingly parallel_ GFlowNet (EP-GFlowNet). EP-GFlowNet is a provably correct divide-and-conquer method to sample from product distributions of the form $R(\\cdot) \\propto R_1(\\cdot) ... R_N(\\cdot)$ --- e.g., in parallel or federated Bayes, where each $R_n$ is a local posterior defined on a data partition. First, in parallel, we train a local GFlowNet targeting each $R_n$ and send the resulting models to the server. Then, the server learns a global GFlowNet by enforcing our newly proposed _aggregating balance_ condition, requiring a single communication step. Importantly, EP-GFlowNets can also be applied to multi-objective optimization and model reuse. Our experiments illustrate the effectiveness of EP-GFlowNets on multiple tasks, including parallel Bayesian phylogenetics, multi-objective multiset and sequence generation, and federated Bayesian structure learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tiago Silva;Luiz Max Carvalho;Amauri H Souza;Samuel Kaski;Diego Mesquita", "authorids": "~Tiago_Silva4;~Luiz_Max_Carvalho1;~Amauri_H_Souza1;~Samuel_Kaski1;~Diego_Mesquita1", "gender": "M;M;M;M;M", "homepage": "https://github.com/tiagodsilva;https://github.com/maxbiostat;http://www.amauriholanda.org;https://people.aalto.fi/samuel.kaski;https://weakly-informative.github.io", "dblp": ";;131/3352;64/5826;163/4293", "google_scholar": ";y2mxpbcAAAAJ;lP0LBI4AAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;0000-0003-1925-9154;", "linkedin": ";;;samuel-kaski-27790/;", "or_profile": "~Tiago_Silva4;~Luiz_Max_Carvalho1;~Amauri_H_Souza1;~Samuel_Kaski1;~Diego_Mesquita1", "aff": "Escola de Matem\u00e1tica Aplicada;Funda\u00e7\u00e3o Getulio Vargas;Federal Institute of Cear\u00e1;Aalto University;Getulio Vargas Foundation", "aff_domain": "fgv.br;fgv.br;ifce.edu.br;aalto.fi;fgv.br", "position": "PhD student;Lecturer;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nsilva2024embarrassingly,\ntitle={Embarrassingly Parallel {GF}lowNets},\nauthor={Tiago Silva and Luiz Max Carvalho and Amauri H Souza and Samuel Kaski and Diego Mesquita},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KJhLpzqNri}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1093999, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C1w0MjVq3LkJ:scholar.google.com/&scioq=Embarrassingly+Parallel+GFlowNets&hl=en&as_sdt=0,5", "gs_version_total": 11, "email": "fgv.br;fgv.br;ifce.edu.br;aalto.fi;fgv.br", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Escola de Matem\u00e1tica Aplicada;Funda\u00e7\u00e3o Getulio Vargas;Federal Institute of Cear\u00e1;Aalto University;Getulio Vargas Foundation", "aff_unique_dep": "Escola de Matem\u00e1tica Aplicada;;;;", "aff_unique_url": ";https://www.fgv.br;http://www.ifce.edu.br;https://www.aalto.fi;https://fgv.br", 
"aff_unique_abbr": ";FGV;IFCE;Aalto;FGV", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Brazil;Finland" }, { "title": "Fair Resource Allocation in Multi-Task Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34346", "id": "KLmWRMg6nL", "proceeding": "https://proceedings.mlr.press/v235/ban24a.html", "pdf": "https://openreview.net/pdf?id=KLmWRMg6nL", "openreview": "https://openreview.net/forum?id=KLmWRMg6nL", "author_site": "Hao Ban, Kaiyi Ji", "tldr": "", "abstract": "By jointly learning multiple tasks, multi-task learning (MTL) can leverage the shared knowledge across tasks, resulting in improved data efficiency and generalization performance. However, a major challenge in MTL lies in the presence of conflicting gradients, which can hinder the fair optimization of some tasks and subsequently impede MTL's ability to achieve better overall performance. Inspired by fair resource allocation in communication networks, we formulate the optimization of MTL as a utility maximization problem, where the loss decreases across tasks are maximized under different fairness measurements. To address the problem, we propose FairGrad, a novel optimization objective. FairGrad not only enables flexible emphasis on certain tasks but also achieves a theoretical convergence guarantee. Extensive experiments demonstrate that our method can achieve state-of-the-art performance among gradient manipulation methods on a suite of multi-task benchmarks in supervised learning and reinforcement learning. Furthermore, we incorporate the idea of $\\alpha$-fairness into the loss functions of various MTL methods. Extensive empirical studies demonstrate that their performance can be significantly enhanced. 
Code is available at https://github.com/OptMN-Lab/fairgrad.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Ban;Kaiyi Ji", "authorids": "~Hao_Ban1;~Kaiyi_Ji1", "gender": ";M", "homepage": ";https://cse.buffalo.edu/~kaiyiji/", "dblp": ";205/3164", "google_scholar": ";E0A3lSIAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Hao_Ban1;~Kaiyi_Ji1", "aff": ";State University of New York at Buffalo", "aff_domain": ";buffalo.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nban2024fair,\ntitle={Fair Resource Allocation in Multi-Task Learning},\nauthor={Hao Ban and Kaiyi Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KLmWRMg6nL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 524200, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2168901508725928835&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";buffalo.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "State University of New York at Buffalo", "aff_unique_dep": "", "aff_unique_url": "https://www.buffalo.edu", "aff_unique_abbr": "SUNY Buffalo", "aff_campus_unique_index": "0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Efficient Non-stationary Online Learning by Wavelets with Applications to Online Distribution Shift Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34345", "id": "KNedb3bQ4h", "proceeding": "https://proceedings.mlr.press/v235/qian24c.html", "pdf": "https://openreview.net/pdf?id=KNedb3bQ4h", "openreview": "https://openreview.net/forum?id=KNedb3bQ4h", "author_site": "Yu-Yang Qian, Peng Zhao, Yu-Jie Zhang, Masashi Sugiyama, Zhi-Hua Zhou", "tldr": "", "abstract": "Dynamic regret minimization offers a principled way for non-stationary online learning, where the algorithm's performance is evaluated against changing comparators. Prevailing methods often employ a two-layer online ensemble, consisting of a group of base learners with different configurations and a meta learner that combines their outputs. Given the evident computational overhead associated with two-layer algorithms, this paper investigates how to attain optimal dynamic regret *without* deploying a model ensemble. To this end, we introduce the notion of *underlying dynamic regret*, a specific form of the general dynamic regret that can encompass many applications of interest. We show that almost optimal dynamic regret can be obtained using a single-layer model alone. This is achieved by an adaptive restart equipped with wavelet detection, wherein a novel streaming wavelet operator is introduced to online update the wavelet coefficients via a carefully designed binary indexed tree. We apply our method to the *online label shift* adaptation problem, leading to new algorithms with optimal dynamic regret and significantly improved computation/storage efficiency compared to prior arts. 
Extensive experiments validate our proposal.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu-Yang Qian;Peng Zhao;Yu-Jie Zhang;Masashi Sugiyama;Zhi-Hua Zhou", "authorids": "~Yu-Yang_Qian1;~Peng_Zhao1;~Yu-Jie_Zhang1;~Masashi_Sugiyama1;~Zhi-Hua_Zhou2", "gender": ";;M;M;", "homepage": ";;https://yujie-zhang96.github.io/;http://www.ms.k.u-tokyo.ac.jp/sugi/;", "dblp": ";;234/6681;35/1228;", "google_scholar": ";;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;", "orcid": ";;;0000-0001-6658-6743;", "linkedin": ";;;;", "or_profile": "~Yu-Yang_Qian1;~Peng_Zhao1;~Yu-Jie_Zhang1;~Masashi_Sugiyama1;~Zhi-Hua_Zhou2", "aff": ";;The University of Tokyo;The University of Tokyo;", "aff_domain": ";;u-tokyo.ac.jp;u-tokyo.ac.jp;", "position": ";;PhD student;Full Professor;", "bibtex": "@inproceedings{\nqian2024efficient,\ntitle={Efficient Non-stationary Online Learning by Wavelets with Applications to Online Distribution Shift Adaptation},\nauthor={Yu-Yang Qian and Peng Zhao and Yu-Jie Zhang and Masashi Sugiyama and Zhi-Hua Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KNedb3bQ4h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1394293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=672591341978149715&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";;u-tokyo.ac.jp;u-tokyo.ac.jp;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34344", "id": "KOTutrSR2y", "proceeding": "https://proceedings.mlr.press/v235/yu24o.html", "pdf": "https://openreview.net/pdf?id=KOTutrSR2y", "openreview": "https://openreview.net/forum?id=KOTutrSR2y", "author_site": "Weihao Yu, Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Zicheng Liu, Xinchao Wang, Lijuan Wang", "tldr": "", "abstract": "We propose MM-Vet, an evaluation benchmark that examines large multimodal models (LMMs) on complicated multimodal tasks. Recent LMMs have shown various intriguing abilities, such as solving math problems written on the blackboard, reasoning about events and celebrities in news images, and explaining visual jokes. Rapid model advancements pose challenges to evaluation benchmark development. Problems include: (1) How to systematically structure and evaluate the complicated multimodal tasks; (2) How to design evaluation metrics that work well across question and answer types; and (3) How to give model insights beyond a simple performance ranking. To this end, we present MM-Vet, designed based on the insight that the intriguing ability to solve complicated tasks is often achieved by a generalist model being able to integrate different core vision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and examines the 16 integrations of interest derived from the capability combination. For evaluation metrics, we propose an LLM-based evaluator for open-ended outputs. 
The evaluator enables the evaluation across different question types and answer styles, resulting in a unified scoring metric. We evaluate representative LMMs on MM-Vet, providing insights into the capabilities of different LMM system paradigms and models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weihao Yu;Zhengyuan Yang;Linjie Li;Jianfeng Wang;Kevin Lin;Zicheng Liu;Xinchao Wang;Lijuan Wang", "authorids": "~Weihao_Yu2;~Zhengyuan_Yang1;~Linjie_Li1;~Jianfeng_Wang4;~Kevin_Lin3;~Zicheng_Liu1;~Xinchao_Wang1;~Lijuan_Wang1", "gender": ";M;F;M;;M;F;M", "homepage": "http://whyu.me;http://zhengyuan.info/;;;https://sites.google.com/site/kevinlin311tw/;https://sites.google.com/view/zichengliu/home?pli=1;https://www.microsoft.com/en-us/research/people/lijuanw/;https://sites.google.com/site/sitexinchaowang/", "dblp": "222/7846-1.html;163/9713;200/8256;;;l/ZichengLiu;51/2527.html;", "google_scholar": "LYxjt1QAAAAJ;https://scholar.google.com/citations?hl=zh-CN;WR875gYAAAAJ;vJWEw_8AAAAJ;https://scholar.google.com.tw/citations?user=LKSy1kwAAAAJ;bkALdvsAAAAJ;cDcWXuIAAAAJ;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": ";;;;0000-0001-8944-1336;0000-0001-5894-7828;;", "linkedin": ";;;;;;;", "or_profile": "~Weihao_Yu2;~Zhengyuan_Yang1;~Linjie_Li1;~Jianfeng_Wang4;~Kevin_Lin3;~Zicheng_Liu1;~Lijuan_Wang1;~Xinchao_WANG3", "aff": "National University of Singapore;Microsoft;Microsoft;Microsoft;Microsoft;Microsoft;Microsoft;National University of Singapore", "aff_domain": "u.nus.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;nus.edu", "position": "PhD student;Researcher;Researcher;Principal Researcher;Principal Researcher;partner research manager;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nyu2024mmvet,\ntitle={{MM}-Vet: Evaluating Large Multimodal Models for Integrated Capabilities},\nauthor={Weihao Yu and Zhengyuan Yang and Linjie Li and Jianfeng Wang and Kevin Lin and Zicheng Liu and Xinchao Wang and Lijuan Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KOTutrSR2y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9396532, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 666, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5160742062303244685&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "u.nus.edu;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;nus.edu", "author_num": 8, "aff_unique_index": "0;1;1;1;1;1;1;0", "aff_unique_norm": "National University of Singapore;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.nus.edu.sg;https://www.microsoft.com", "aff_unique_abbr": "NUS;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1;1;0", "aff_country_unique": "Singapore;United States" }, { "title": "Optimal Kernel Quantile Learning with Random Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34343", "id": "KOW9ncAiRo", "proceeding": "https://proceedings.mlr.press/v235/wang24r.html", "pdf": "https://openreview.net/pdf?id=KOW9ncAiRo", "openreview": "https://openreview.net/forum?id=KOW9ncAiRo", "author_site": "Caixing Wang, Xingdong Feng", "tldr": "", "abstract": "The random feature (RF) approach is a well-established and efficient tool for scalable kernel methods, but 
existing literature has primarily focused on kernel ridge regression with random features (KRR-RF), which has limitations in handling heterogeneous data with heavy-tailed noise. This paper presents a generalization study of kernel quantile regression with random features (KQR-RF), which accounts for the non-smoothness of the check loss in KQR-RF by introducing a refined error decomposition and establishing a novel connection between KQR-RF and KRR-RF. Our study establishes the capacity-dependent learning rates for KQR-RF under mild conditions on the number of RFs, which are minimax optimal up to some logarithmic factors. Importantly, our theoretical results, utilizing a data-dependent sampling strategy, can be extended to cover the agnostic setting where the target quantile function may not precisely align with the assumed kernel space. By slightly modifying our assumptions, the capacity-dependent error analysis can also be applied to cases with Lipschitz continuous losses, enabling broader applications in the machine learning community. To validate our theoretical findings, simulated experiments and a real data application are conducted.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Caixing Wang;Xingdong Feng", "authorids": "~Caixing_Wang1;~Xingdong_Feng1", "gender": "M;M", "homepage": "https://bb9.sufe.edu.cn/bbcswebdav/users/2011000070/index.htm;http://wangcaixing96.com/", "dblp": ";", "google_scholar": "nQyBQOsAAAAJ;SLEH6XYAAAAJ", "orcid": ";0009-0009-3068-6094", "linkedin": ";", "or_profile": "~Xingdong_Feng1;~Wang_Caixing1", "aff": ";Shanghai University of Finance and Economics", "aff_domain": ";shufe.edu.cn", "position": ";PhD student", "bibtex": "@inproceedings{\nwang2024optimal,\ntitle={Optimal Kernel Quantile Learning with Random Features},\nauthor={Caixing Wang and Xingdong Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KOW9ncAiRo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2047891, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=999753802728702141&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": ";shufe.edu.cn", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Shanghai University of Finance and Economics", "aff_unique_dep": "", "aff_unique_url": "http://www.sufe.edu.cn", "aff_unique_abbr": "SUFE", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Premier-TACO is a Few-Shot Policy Learner: Pretraining Multitask Representation via Temporal Action-Driven Contrastive Loss", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34342", "id": "KSNl7VgeVr", "proceeding": "https://proceedings.mlr.press/v235/zheng24g.html", "pdf": "https://openreview.net/pdf?id=KSNl7VgeVr", "openreview": "https://openreview.net/forum?id=KSNl7VgeVr", "author_site": "Ruijie Zheng, Yongyuan Liang, xiyao wang, shuang ma, Hal Daum\u00e9, Huazhe Xu, John Langford, Praveen Palanisamy, Kalyan Basu, Furong Huang", "tldr": "", "abstract": "We present Premier-TACO, a multitask feature representation learning approach designed to improve few-shot policy learning efficiency in sequential decision-making tasks. 
Premier-TACO leverages a subset of multitask offline datasets for pretraining a general feature representation, which captures critical environmental dynamics and is fine-tuned using minimal expert demonstrations. It advances the temporal action contrastive learning (TACO) objective, known for state-of-the-art results in visual control tasks, by incorporating a novel negative example sampling strategy. This strategy is crucial in significantly boosting TACO\u2019s computational efficiency, making large-scale multitask offline pretraining feasible. Our extensive empirical evaluation in a diverse set of continuous control benchmarks including Deepmind Control Suite, MetaWorld, and LIBERO demonstrates Premier-TACO\u2019s effectiveness in pretraining visual representations, significantly enhancing few-shot imitation learning of novel tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruijie Zheng;Yongyuan Liang;Xiyao Wang;Shuang Ma;Hal Daum\u00e9 III;Huazhe Xu;John Langford;Praveen Palanisamy;Kalyan Shankar Basu;Furong Huang", "authorids": "~Ruijie_Zheng1;~Yongyuan_Liang1;~Xiyao_Wang1;~Shuang_Ma3;~Hal_Daum\u00e9_III1;~Huazhe_Xu1;~John_Langford1;~Praveen_Palanisamy2;~Kalyan_Shankar_Basu1;~Furong_Huang1", "gender": ";F;M;M;M;M;;M;F;F", "homepage": "http://www.ruijiezheng.com;https://cheryyunl.github.io/;;http://hal3.name;http://hxu.rocks;http://hunch.net/~jl;https://praveenp.com;;https://furong-huang.com;https://www.shuangma.me/", "dblp": "294/8474;238/4104;;77/2856.html;164/9006;77/4488;223/4229;;72/8513;98/3906", "google_scholar": ";GQToORIAAAAJ;puVqfbwAAAAJ;PbEw81gAAAAJ;t9HPFawAAAAJ;LFiqVpwAAAAJ;gHmYX8YAAAAJ;;13yyuCcAAAAJ;IHPRZuMAAAAJ", "orcid": ";;;;;;0000-0001-9069-3071;;;", "linkedin": ";https://linkedin.com/in/yongyuan-l-31462a17a;;;;;;kalyan-basu-61959b2/;;", "or_profile": "~Ruijie_Zheng1;~Yongyuan_Liang1;~Xiyao_Wang1;~Hal_Daum\u00e9_III1;~Huazhe_Xu1;~John_Langford1;~Praveen_Palanisamy2;~Kalyan_Shankar_Basu1;~Furong_Huang1;~shuang_ma1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;Microsoft;Tsinghua University;Microsoft;Microsoft;Qualtrics LLC;University of Maryland;Apple", "aff_domain": "cs.umd.edu;umd.edu;umd.edu;microsoft.com;tsinghua.edu.cn;microsoft.com;microsoft.com;qualtrics.com;cs.umd.edu;apple.com", "position": "PhD student;PhD student;PhD student;Senior Principal Researcher;Assistant Professor;Researcher;Researcher;Principal Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nzheng2024premiertaco,\ntitle={Premier-{TACO} is a Few-Shot Policy Learner: Pretraining Multitask Representation via Temporal Action-Driven Contrastive Loss},\nauthor={Ruijie Zheng and Yongyuan Liang and Xiyao Wang and Shuang Ma and Hal Daum{\\'e} III and Huazhe Xu and John Langford and Praveen Palanisamy and Kalyan Shankar Basu and Furong Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KSNl7VgeVr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9628649, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13004499412455352355&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "cs.umd.edu;umd.edu;umd.edu;microsoft.com;tsinghua.edu.cn;microsoft.com;microsoft.com;qualtrics.com;cs.umd.edu;apple.com", "author_num": 10, "aff_unique_index": "0;0;0;1;2;1;1;3;0;4", "aff_unique_norm": 
"University of Maryland;Microsoft;Tsinghua University;Qualtrics;Apple", "aff_unique_dep": ";Microsoft Corporation;;;Apple Inc.", "aff_unique_url": "https://www/umd.edu;https://www.microsoft.com;https://www.tsinghua.edu.cn;https://www.qualtrics.com;https://www.apple.com", "aff_unique_abbr": "UMD;Microsoft;THU;Qualtrics;Apple", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "UPAM: Unified Prompt Attack in Text-to-Image Generation Models Against Both Textual Filters and Visual Checkers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34341", "id": "KU9mn6deDR", "proceeding": "https://proceedings.mlr.press/v235/peng24b.html", "pdf": "https://openreview.net/pdf?id=KU9mn6deDR", "openreview": "https://openreview.net/forum?id=KU9mn6deDR", "author_site": "Duo Peng, Qiuhong Ke, Jun Liu", "tldr": "", "abstract": "Text-to-Image (T2I) models have raised security concerns due to their potential to generate inappropriate or harmful images. In this paper, we propose UPAM, a novel framework that investigates the robustness of T2I models from the attack perspective. Unlike most existing attack methods that focus on deceiving textual defenses, UPAM aims to deceive both textual and visual defenses in T2I models. UPAM enables gradient-based optimization, offering greater effectiveness and efficiency than previous methods. Given that T2I models might not return results due to defense mechanisms, we introduce a Sphere-Probing Learning (SPL) scheme to support gradient optimization even when no results are returned. Additionally, we devise a Semantic-Enhancing Learning (SEL) scheme to finetune UPAM for generating target-aligned images. Our framework also ensures attack stealthiness. 
Extensive experiments demonstrate UPAM's effectiveness and efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Duo Peng;Qiuhong Ke;Jun Liu", "authorids": "~Duo_Peng1;~Qiuhong_Ke6;~Jun_Liu8", "gender": "F;M;M", "homepage": "https://research.monash.edu/en/persons/qiuhong-ke;;", "dblp": "151/3574;95/3736-36;175/3967", "google_scholar": "84qxdhsAAAAJ;Q5Ild8UAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;0000-0003-3281-0772", "linkedin": ";;", "or_profile": "~Qiuhong_Ke6;~Jun_Liu8;~Peng_Duo1", "aff": "Monash University;Singapore University of Technology and Design;Singapore University of Technology and Design", "aff_domain": "monash.edu;sutd.edu.sg;mymail.sutd.edu.sg", "position": "Lecturer;Assistant Professor;PhD student", "bibtex": "@inproceedings{\npeng2024upam,\ntitle={{UPAM}: Unified Prompt Attack in Text-to-Image Generation Models Against Both Textual Filters and Visual Checkers},\nauthor={Duo Peng and Qiuhong Ke and Jun Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KU9mn6deDR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1067971, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4097386654976210162&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "monash.edu;sutd.edu.sg;mymail.sutd.edu.sg", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Monash University;Singapore University of Technology and Design", "aff_unique_dep": ";", "aff_unique_url": "https://www.monash.edu;https://www.sutd.edu.sg", "aff_unique_abbr": "Monash;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Australia;Singapore" }, { "title": "convSeq: Fast and Scalable Method for Detecting Patterns in Spike Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34340", "id": "KVa4i4RR1O", "proceeding": "https://proceedings.mlr.press/v235/koshkin24a.html", "pdf": "https://openreview.net/pdf?id=KVa4i4RR1O", "openreview": "https://openreview.net/forum?id=KVa4i4RR1O", "author_site": "Roman Koshkin, Tomoki Fukai", "tldr": "", "abstract": "Spontaneous neural activity, crucial in memory, learning, and spatial navigation, often manifests itself as repetitive spatiotemporal patterns. Despite their importance, analyzing these patterns in large neural recordings remains challenging due to a lack of efficient and scalable detection methods. Addressing this gap, we introduce *convSeq*, an unsupervised method that employs backpropagation for optimizing spatiotemporal filters that effectively identify these neural patterns. Our method\u2019s performance is validated on various synthetic data and real neural recordings, revealing spike sequences with unprecedented scalability and efficiency. 
Significantly surpassing existing methods in speed, *convSeq* sets a new standard for analyzing spontaneous neural activity, potentially advancing our understanding of information processing in neural circuits.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Roman Koshkin;Tomoki Fukai", "authorids": "~Roman_Koshkin1;~Tomoki_Fukai1", "gender": "M;M", "homepage": "https://roman-koshkin.unit.oist.jp;https://groups.oist.jp/ncbc", "dblp": "351/7581.html;", "google_scholar": "OpQXH3cAAAAJ;https://scholar.google.co.jp/citations?user=iO7jHc4AAAAJ", "orcid": "0000-0002-8739-4545;0000-0001-6977-5638", "linkedin": "romankoshkininterpreter/;", "or_profile": "~Roman_Koshkin1;~Tomoki_Fukai1", "aff": "Okinawa Institute of Science and Technology (OIST);Okinawa Institute of Science and Technology (OIST)", "aff_domain": "oist.jp;oist.jp", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nkoshkin2024convseq,\ntitle={convSeq: Fast and Scalable Method for Detecting Patterns in Spike Data},\nauthor={Roman Koshkin and Tomoki Fukai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KVa4i4RR1O}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2155108, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15633172545454815050&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "oist.jp;oist.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Okinawa Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.oist.jp", "aff_unique_abbr": "OIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "A Tale of Tails: Model Collapse as a Change of Scaling Laws", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34339", "id": "KVvku47shW", "proceeding": "https://proceedings.mlr.press/v235/dohmatob24b.html", "pdf": "https://openreview.net/pdf?id=KVvku47shW", "openreview": "https://openreview.net/forum?id=KVvku47shW", "author_site": "Elvis Dohmatob, Yunzhen Feng, Pu Yang, Francois Charton, Julia Kempe", "tldr": "", "abstract": "As AI model size grows, neural *scaling laws* have become a crucial tool to predict the improvements of large models when increasing capacity and the size of original (human or natural) training data. Yet, the widespread use of popular models means that the ecosystem of online data and text will co-evolve to progressively contain increased amounts of synthesized data. In this paper we ask: *How will the scaling laws change in the inevitable regime where synthetic data makes its way into the training corpus?* Will future models still improve, or be doomed to degenerate up to total *(model) collapse*? We develop a theoretical framework of model collapse through the lens of scaling laws. We discover a wide range of decay phenomena, analyzing loss of scaling, shifted scaling with number of generations, the \"un-learning\" of skills, and grokking when mixing human and synthesized data. 
Our theory is validated by large-scale experiments with a transformer on an arithmetic task and text generation using the large language model Llama2.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elvis Dohmatob;Yunzhen Feng;Pu Yang;Francois Charton;Julia Kempe", "authorids": "~Elvis_Dohmatob1;~Yunzhen_Feng1;~Pu_Yang3;~Francois_Charton1;~Julia_Kempe1", "gender": "M;M;M;M;", "homepage": "http://dohmatob.github.io/;https://fengyzpku.github.io;https://yangpupku.github.io/;;", "dblp": "134/9794;254/4752;;255/5318;", "google_scholar": "https://scholar.google.fr/citations?user=FDWgJY8AAAAJ;QebzOsIAAAAJ;EfZbm40AAAAJ;;", "orcid": ";;0009-0002-5235-8461;;", "linkedin": ";;;fran%C3%A7ois-charton-214187120/;", "or_profile": "~Elvis_Dohmatob1;~Yunzhen_Feng1;~Pu_Yang3;~Francois_Charton1;~Julia_Kempe1", "aff": "Meta Facebook;Meta FAIR;Peking University;Meta Facebook;", "aff_domain": "facebook.com;meta.com;pku.edu.cn;fb.com;", "position": "Researcher;Intern;PhD student;Research Engineer;", "bibtex": "@inproceedings{\ndohmatob2024a,\ntitle={A Tale of Tails: Model Collapse as a Change of Scaling Laws},\nauthor={Elvis Dohmatob and Yunzhen Feng and Pu Yang and Francois Charton and Julia Kempe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KVvku47shW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2256356, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13334690243847575117&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "facebook.com;meta.com;pku.edu.cn;fb.com;", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Meta;Peking University", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;http://www.pku.edu.cn", "aff_unique_abbr": "Meta;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Fundamental Limitations of Alignment in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34338", "id": "KXsUCgn9Ks", "proceeding": "https://proceedings.mlr.press/v235/wolf24a.html", "pdf": "https://openreview.net/pdf?id=KXsUCgn9Ks", "openreview": "https://openreview.net/forum?id=KXsUCgn9Ks", "author_site": "Yotam Wolf, Noam Wies, Oshri Avnery, Yoav Levine, Amnon Shashua", "tldr": "", "abstract": "An important aspect in developing language models that interact with humans is aligning their behavior to be useful and unharmful for their human users. This is usually achieved by tuning the model in a way that enhances desired behaviors and inhibits undesired ones, a process referred to as alignment. In this paper, we propose a theoretical approach called Behavior Expectation Bounds (BEB) which allows us to formally investigate several inherent characteristics and limitations of alignment in large language models. Importantly, we prove that within the limits of this framework, for any behavior that has a finite probability of being exhibited by the model, there exist prompts that can trigger the model into outputting this behavior, with probability that increases with the length of the prompt. This implies that any alignment process that attenuates an undesired behavior but does not remove it altogether, is not safe against adversarial prompting attacks. 
Furthermore, our framework hints at the mechanism by which leading alignment approaches such as reinforcement learning from human feedback make the LLM prone to being prompted into the undesired behaviors. This theoretical result is demonstrated experimentally at large scale by the so-called contemporary \"chatGPT jailbreaks\", where adversarial users trick the LLM into breaking its alignment guardrails by triggering it into acting as a malicious persona. Our results expose fundamental limitations in alignment of LLMs and bring to the forefront the need to devise reliable mechanisms for ensuring AI safety.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yotam Wolf;Noam Wies;Oshri Avnery;Yoav Levine;Amnon Shashua", "authorids": "~Yotam_Wolf1;~Noam_Wies1;~Oshri_Avnery1;~Yoav_Levine1;~Amnon_Shashua1", "gender": "M;M;M;M;M", "homepage": ";;;;http://www.cs.huji.ac.il/~shashua/", "dblp": ";236/6106;;199/1895;47/1492", "google_scholar": "jlPKaIIAAAAJ;https://scholar.google.co.il/citations?user=FxlR8voAAAAJ;;;https://scholar.google.com.tw/citations?user=dwi5wvYAAAAJ", "orcid": ";0000-0002-1337-2298;;;", "linkedin": ";noam-wies-a5ab1663/;oshri-avnery/;;", "or_profile": "~Yotam_Wolf1;~Noam_Wies1;~Oshri_Avnery1;~Yoav_Levine1;~Amnon_Shashua1", "aff": "Hebrew University of Jerusalem;Hebrew University of Jerusalem;Hebrew University of Jerusalem;;Hebrew University, Hebrew University of Jerusalem", "aff_domain": "huji.ac.il;huji.ac.il;huji.ac.il;;cs.huji.ac.il", "position": "PhD student;PhD student;MS student;;Professor", "bibtex": "@inproceedings{\nwolf2024fundamental,\ntitle={Fundamental Limitations of Alignment in Large Language Models},\nauthor={Yotam Wolf and Noam Wies and Oshri Avnery and Yoav Levine and Amnon Shashua},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KXsUCgn9Ks}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1673988, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 208, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10671962345198545886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "huji.ac.il;huji.ac.il;huji.ac.il;;cs.huji.ac.il", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "", "aff_unique_url": "https://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Jerusalem;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Inferring Dynamic Networks from Marginals with Iterative Proportional Fitting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34337", "id": "KYrAZSbEv6", "proceeding": "https://proceedings.mlr.press/v235/chang24b.html", "pdf": "https://openreview.net/pdf?id=KYrAZSbEv6", "openreview": "https://openreview.net/forum?id=KYrAZSbEv6", "author_site": "Serina Chang, Frederic Koehler, Zhaonan Qu, Jure Leskovec, Johan Ugander", "tldr": "", "abstract": "A common network inference problem, arising from real-world data constraints, is how to infer a dynamic network from its time-aggregated adjacency matrix and time-varying marginals (i.e., row and column sums). Prior approaches to this problem have repurposed the classic iterative proportional fitting (IPF) procedure, also known as Sinkhorn\u2019s algorithm, with promising empirical results. 
However, the statistical foundation for using IPF has not been well understood: under what settings does IPF provide principled estimation of a dynamic network from its marginals, and how well does it estimate the network? In this work, we establish such a setting, by identifying a generative network model whose maximum likelihood estimates are recovered by IPF. Our model both reveals implicit assumptions on the use of IPF in such settings and enables new analyses, such as structure-dependent error bounds on IPF\u2019s parameter estimates. When IPF fails to converge on sparse network data, we introduce a principled algorithm that guarantees IPF converges under minimal changes to the network structure. Finally, we conduct experiments with synthetic and real-world data, which demonstrate the practical value of our theoretical and algorithmic contributions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Serina Chang;Frederic Koehler;Zhaonan Qu;Jure Leskovec;Johan Ugander", "authorids": "~Serina_Chang1;~Frederic_Koehler1;~Zhaonan_Qu1;~Jure_Leskovec1;~Johan_Ugander1", "gender": "F;;;;M", "homepage": "https://serinachang5.github.io/;https://frkoehle.github.io/;https://www.researchgate.net/scientific-contributions/Zhaonan-Qu-2155198240;http://cs.stanford.edu/~jure/;http://stanford.edu/~jugander/", "dblp": "204/1095;132/1904;;l/JureLeskovec;13/10542.html", "google_scholar": "CpsZIU0AAAAJ;;;Q_kKkIUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-5411-923X;0000-0001-5655-4086", "linkedin": ";;;leskovec/;", "or_profile": "~Serina_Chang1;~Frederic_Koehler1;~Zhaonan_Qu1;~Jure_Leskovec1;~Johan_Ugander1", "aff": "Stanford University;University of Chicago;Stanford University;Kumo.AI;Yale University", "aff_domain": "stanford.edu;uchicago.edu;stanford.edu;kumo.ai;yale.edu", "position": "PhD student;Assistant Professor;PhD student;Chief Scientist;Visiting Associate Professor", "bibtex": "@inproceedings{\nchang2024inferring,\ntitle={Inferring Dynamic Networks from Marginals with Iterative Proportional Fitting},\nauthor={Serina Chang and Frederic Koehler and Zhaonan Qu and Jure Leskovec and Johan Ugander},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KYrAZSbEv6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1540508, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8740010153424746304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "stanford.edu;uchicago.edu;stanford.edu;kumo.ai;yale.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Stanford University;University of Chicago;Kumo.AI;Yale University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://www.uchicago.edu;https://www.kumo.ai;https://www.yale.edu", "aff_unique_abbr": "Stanford;UChicago;Kumo.AI;Yale", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "MolCRAFT: Structure-Based Drug Design in Continuous Parameter Space", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34336", "id": "KaAQu5rNU1", "proceeding": "https://proceedings.mlr.press/v235/qu24a.html", "pdf": "https://openreview.net/pdf?id=KaAQu5rNU1", "openreview": "https://openreview.net/forum?id=KaAQu5rNU1", "author_site": "Yanru Qu, Keyue 
Qiu, Yuxuan Song, Jingjing Gong, Jiawei Han, Mingyue Zheng, Hao Zhou, Wei-Ying Ma", "tldr": "", "abstract": "Generative models for structure-based drug design (SBDD) have shown promising results in recent years. Existing works mainly focus on how to generate molecules with higher binding affinity, ignoring the feasibility prerequisites for generated 3D poses and resulting in *false positives*. We conduct thorough studies on key factors of ill-conformational problems when applying autoregressive methods and diffusion to SBDD, including mode collapse and hybrid continuous-discrete space. In this paper, we introduce MolCRAFT, the first SBDD model that operates in the continuous parameter space, together with a novel noise reduced sampling strategy. Empirical results show that our model consistently achieves superior performance in binding affinity with more stable 3D structure, demonstrating our ability to accurately model interatomic interactions. To our best knowledge, MolCRAFT is the first to achieve reference-level Vina Scores (-6.59 kcal/mol) with comparable molecular size, outperforming other strong baselines by a wide margin (-0.84 kcal/mol). Code is available at https://github.com/AlgoMole/MolCRAFT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanru Qu;Keyue Qiu;Yuxuan Song;Jingjing Gong;Jiawei Han;Mingyue Zheng;Hao Zhou;Wei-Ying Ma", "authorids": "~Yanru_Qu1;~Keyue_Qiu1;~Yuxuan_Song2;~Jingjing_Gong3;~Jiawei_Han1;~Mingyue_Zheng1;~Hao_Zhou5;~Wei-Ying_Ma2", "gender": "M;;M;M;M;M;M;M", "homepage": "https://yanruqu.com/;;https://yuxuansong.com;;http://hanj.cs.illinois.edu/;https://www.researchgate.net/profile/Mingyue-Zheng;https://zhouh.github.io/;https://air.tsinghua.edu.cn/en/info/1046/1189.htm", "dblp": "180/3336;;;63/8487;h/JiaweiHan.html;;63/778-12;m/WYMa.html", "google_scholar": "W-o1VXEAAAAJ;;xlnZ1OIAAAAJ;MayCLqYAAAAJ;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;vzBQN8EAAAAJ;https://scholar.google.com/citations?hl=zh-CN;SToCbu8AAAAJ", "orcid": ";;;;0000-0002-3629-2696;0000-0002-3323-3092;;", "linkedin": ";;;;;;;wei-ying-ma-16a0171/", "or_profile": "~Yanru_Qu1;~Keyue_Qiu1;~Yuxuan_Song2;~Jingjing_Gong3;~Jiawei_Han1;~Mingyue_Zheng1;~Hao_Zhou5;~Wei-Ying_Ma2", "aff": "University of Illinois, Urbana Champaign;;Tsinghua University;Tsinghua University;University of Illinois at Urbana-Champaign (UIUC);Shanghai Institute of Materia Medica;Tsinghua University;Tsinghua University", "aff_domain": "illinois.edu;;tsinghua.edu.cn;tsinghua.edu.cn;illinois.edu;simm.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;;PhD student;Postdoc;Full Professor;Full Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nqu2024molcraft,\ntitle={Mol{CRAFT}: Structure-Based Drug Design in Continuous Parameter Space},\nauthor={Yanru Qu and Keyue Qiu and Yuxuan Song and Jingjing Gong and Jiawei Han and Mingyue Zheng and Hao Zhou and Wei-Ying Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KaAQu5rNU1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4864522, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7885292706652507703&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "illinois.edu;;tsinghua.edu.cn;tsinghua.edu.cn;illinois.edu;simm.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 8, "aff_unique_index": "0;1;1;0;2;1;1", 
"aff_unique_norm": "University of Illinois Urbana-Champaign;Tsinghua University;Shanghai Institute of Materia Medica", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.tsinghua.edu.cn;http://www.simm.ac.cn", "aff_unique_abbr": "UIUC;THU;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;1;0;1;1;1", "aff_country_unique": "United States;China" }, { "title": "Causally Motivated Personalized Federated Invariant Learning with Shortcut-Averse Information-Theoretic Regularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34335", "id": "Kbd9A4lVoX", "proceeding": "https://proceedings.mlr.press/v235/tang24a.html", "pdf": "https://openreview.net/pdf?id=Kbd9A4lVoX", "openreview": "https://openreview.net/forum?id=Kbd9A4lVoX", "author_site": "Xueyang Tang, Song Guo, Jingcai Guo, Jie ZHANG, Yue Yu", "tldr": "", "abstract": "Exploiting invariant relations and mitigating spurious correlation (a.k.a., shortcut) between representation and target across varied data distributions can tackle the challenging out-of-distribution (OOD) generalization problem. In personalized federated learning (PFL), heterogeneous data distribution across local clients offers the inherent prerequisites to extract the invariant features that maintain invariant relation with target. Nevertheless, personalized features are closely entangled with spurious features in PFL since they exhibit similar variability across different clients, which makes preserving personalization knowledge and eliminating shortcuts two conflicting objectives in PFL. To address the above challenge, we analyse the heterogeneous data generation on local clients through the lens of structured causal model and propose a crucial causal signature which can distinguish personalized features from spurious features with global invariant features as the anchor. Then the causal signature is quantified as an information-theoretic constraint that facilitates the shortcut-averse personalized invariant learning on each client. Theoretical analysis demonstrates our method, FedPIN, can yield a tighter bound on generalization error than the prevalent PFL approaches when train-test distribution shift exists on clients. Moreover, we provide a theoretical guarantee on the convergence rate of FedPIN in this paper. 
The results of extensive experiments show that our method can achieve superior OOD generalization performance compared with the state-of-the-art competitors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xueyang Tang;Song Guo;Jingcai Guo;Jie ZHANG;Yue Yu", "authorids": "~Xueyang_Tang1;~Song_Guo5;~Jingcai_Guo1;~Jie_ZHANG18;~Yue_Yu8", "gender": "M;M;M;F;M", "homepage": ";https://cse.hkust.edu.hk/~songguo/;https://jingcaiguo.github.io/;https://cugzj.github.io/zhangjie.github.io/;http://yuyue.github.io/", "dblp": ";01/267-1;192/7270;84/6889-76;55/2008-1", "google_scholar": "wAGIpRAAAAAJ;https://scholar.google.com/citations?hl=en;YjSHPjcAAAAJ;JRCNlI8AAAAJ;VnqWgEwAAAAJ", "orcid": "0000-0003-4284-9806;;0000-0002-0449-4525;0000-0002-8073-2118;0000-0002-9865-2212", "linkedin": ";;jingcai-guo;;", "or_profile": "~Xueyang_Tang1;~Song_Guo5;~Jingcai_Guo1;~Jie_ZHANG18;~Yue_Yu8", "aff": "The Hong Kong Polytechnic University;Department of Computer Science and Engineering, Hong Kong University of Science and Technology;The Hong Kong Polytechnic University;The Hong Kong Polytechnic University;National University of Defense Technology", "aff_domain": "polyu.edu.hk;cse.ust.hk;polyu.edu.hk;polyu.edu.hk;nudt.edu.cn", "position": "PhD student;Full Professor;Assistant Professor;Postdoc;Associate Professor", "bibtex": "@inproceedings{\ntang2024causally,\ntitle={Causally Motivated Personalized Federated Invariant Learning with Shortcut-Averse Information-Theoretic Regularization},\nauthor={Xueyang Tang and Song Guo and Jingcai Guo and Jie ZHANG and Yue Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Kbd9A4lVoX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2769419, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5719107199511517815&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "polyu.edu.hk;cse.ust.hk;polyu.edu.hk;polyu.edu.hk;nudt.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Hong Kong Polytechnic University;Hong Kong University of Science and Technology;National University of Defense Technology", "aff_unique_dep": ";Department of Computer Science and Engineering;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.ust.hk;http://www.nudt.edu.cn/", "aff_unique_abbr": "PolyU;HKUST;NUDT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning Linear Block Error Correction Codes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34334", "id": "Kf9CqdI8Rb", "proceeding": "https://proceedings.mlr.press/v235/choukroun24a.html", "pdf": "https://openreview.net/pdf?id=Kf9CqdI8Rb", "openreview": "https://openreview.net/forum?id=Kf9CqdI8Rb", "author_site": "Yoni Choukroun, Lior Wolf", "tldr": "", "abstract": "Error correction codes are a crucial part of the physical communication layer, ensuring the reliable transfer of data over noisy channels. The design of optimal linear block codes capable of being efficiently decoded is of major concern, especially for short block lengths. While neural decoders have recently demonstrated their advantage over classical decoding techniques, the neural design of the codes remains a challenge. 
In this work, we propose for the first time a unified encoder-decoder training of binary linear block codes. To this end, we adapt the coding setting to support efficient and differentiable training of the code for end-to-end optimization over the order two Galois field. We also propose a novel Transformer model in which the self-attention masking is performed in a differentiable fashion for the efficient backpropagation of the code gradient. Our results show that (i) the proposed decoder outperforms existing neural decoding on conventional codes, (ii) the suggested framework generates codes that outperform the analogous conventional codes, and (iii) the codes we developed not only excel with our decoder but also show enhanced performance with traditional decoding techniques.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yoni Choukroun;Lior Wolf", "authorids": "~Yoni_Choukroun1;~Lior_Wolf1", "gender": "M;M", "homepage": "https://yonilc.github.io/;http://www.cs.tau.ac.il/~wolf", "dblp": "186/8305;83/4103", "google_scholar": "https://scholar.google.co.il/citations?user=gjo4ebcAAAAJ;UbFrXTsAAAAJ", "orcid": ";0000-0001-5578-8892", "linkedin": ";", "or_profile": "~Yoni_Choukroun1;~Lior_Wolf1", "aff": "Huawei Technologies Ltd.;Tel Aviv University", "aff_domain": "huawei.com;tau.ac.il", "position": "Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nchoukroun2024learning,\ntitle={Learning Linear Block Error Correction Codes},\nauthor={Yoni Choukroun and Lior Wolf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Kf9CqdI8Rb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1717013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5321687676029241387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "huawei.com;tau.ac.il", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Huawei;Tel Aviv University", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.tau.ac.il", "aff_unique_abbr": "Huawei;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Israel" }, { "title": "Hierarchical Novelty Detection via Fine-Grained Evidence Allocation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34333", "id": "KfN76nAcOO", "proceeding": "https://proceedings.mlr.press/v235/pyakurel24a.html", "pdf": "https://openreview.net/pdf?id=KfN76nAcOO", "openreview": "https://openreview.net/forum?id=KfN76nAcOO", "author_site": "Spandan Pyakurel, Qi Yu", "tldr": "", "abstract": "By leveraging a hierarchical structure of known classes, Hierarchical Novelty Detection (HND) offers fine-grained detection results that pair detected novel samples with their closest (known) parent class in the hierarchy. Prior knowledge on the parent class provides valuable insights to better understand these novel samples. However, traditional novelty detection methods try to separate novel samples from all known classes using uncertainty or distance based metrics so they are incapable of locating the closest known parent class. Since the novel class is also part of the hierarchy, the model can more easily get confused between samples from known classes and those from novel ones. 
To achieve effective HND, we propose to augment the known (leaf-level) classes with a set of novel classes, each of which is associated with one parent (i.e., non-leaf) class in the original hierarchy. Such a structure allows us to perform novel fine-grained evidence allocation to differentiate known and novel classes guided by a uniquely designed loss function. Our thorough theoretical analysis shows that fine-grained evidence allocation creates an evidence margin to more precisely separate known and novel classes. Extensive experiments conducted on real-world hierarchical datasets demonstrate the proposed model outperforms the strongest baselines and achieves the best HND performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Spandan Pyakurel;Qi Yu", "authorids": "~Spandan_Pyakurel1;~Qi_Yu1", "gender": "F;M", "homepage": ";https://www.rit.edu/mining/", "dblp": "384/4136.html;58/6957-1", "google_scholar": "1ZSPfBoAAAAJ;L3gWdfEAAAAJ", "orcid": ";0000-0002-0426-5407", "linkedin": ";", "or_profile": "~Spandan_Pyakurel1;~Qi_Yu1", "aff": "Rochester Institute of Technology;Rochester Institute of Technology", "aff_domain": "rit.edu;rit.edu", "position": "PhD student;Professor", "bibtex": "@inproceedings{\npyakurel2024hierarchical,\ntitle={Hierarchical Novelty Detection via Fine-Grained Evidence Allocation},\nauthor={Spandan Pyakurel and Qi Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KfN76nAcOO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1173823, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3c2wuDXRCzQJ:scholar.google.com/&scioq=Hierarchical+Novelty+Detection+via+Fine-Grained+Evidence+Allocation&hl=en&as_sdt=0,48", "gs_version_total": 6, "email": "rit.edu;rit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Rochester Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.rit.edu", "aff_unique_abbr": "RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Exploring the Benefit of Activation Sparsity in Pre-training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34332", "id": "KfXXPCcobh", "proceeding": "https://proceedings.mlr.press/v235/zhang24bq.html", "pdf": "https://openreview.net/pdf?id=KfXXPCcobh", "openreview": "https://openreview.net/forum?id=KfXXPCcobh", "author_site": "Zhengyan Zhang, Chaojun Xiao, Qiujieli Qin, Yankai Lin, Zhiyuan Zeng, Xu Han, Zhiyuan Liu, Ruobing Xie, Maosong Sun, Jie Zhou", "tldr": "", "abstract": "Pre-trained Transformers inherently possess the characteristic of sparse activation, where only a small fraction of the neurons are activated for each token. While sparse activation has been explored through post-training methods, its potential in pre-training remains untapped. In this work, we first study how activation properties change during pre-training. Our examination reveals that Transformers exhibit sparse activation throughout the majority of the pre-training process while the activation correlation keeps evolving as training progresses. Leveraging this observation, we propose Switchable Sparse-Dense Learning (SSD). 
SSD adaptively switches between Mixtures-of-Experts (MoE) based sparse training and conventional dense training during the pre-training process, leveraging the efficiency of sparse training while avoiding the static activation correlation that sparse-only training would impose. Compared to dense training, SSD achieves comparable performance with identical model size and reduces pre-training costs. Moreover, the models trained with SSD can be directly used as MoE models for sparse inference and achieve the same performance as dense models with up to $2\\times$ faster inference speed. Codes are available at https://github.com/thunlp/moefication.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengyan Zhang;Chaojun Xiao;Qiujieli Qin;Yankai Lin;Zhiyuan Zeng;Xu Han;Zhiyuan Liu;Ruobing Xie;Maosong Sun;Jie Zhou", "authorids": "~Zhengyan_Zhang1;~Chaojun_Xiao1;~Qiujieli_Qin1;~Yankai_Lin1;~Zhiyuan_Zeng3;~Xu_Han2;~Zhiyuan_Liu1;~Ruobing_Xie2;~Maosong_Sun1;~Jie_Zhou8", "gender": "M;M;M;M;M;;M;M;M;M", "homepage": ";https://xcjthu.github.io/;https://github.com/qqjl21;https://linyankai.github.io/;https://zhiyuan-zeng.github.io/;;http://nlp.csai.tsinghua.edu.cn/~lzy;http://nlp.csai.tsinghua.edu.cn/~xrb/;https://www.cs.tsinghua.edu.cn/csen/info/1312/4394.htm;", "dblp": ";223/4856;;161/0001.html;;;53/3245-1;178/8590;95/3291-1;00/5012-16", "google_scholar": ";xoC8smYAAAAJ;;https://scholar.google.com.hk/citations?user=j8K1FqEAAAAJ;qLJqCqsAAAAJ;;dT0v5u0AAAAJ;j3OX8KUAAAAJ;https://scholar.google.com.tw/citations?user=zIgT0HMAAAAJ;https://scholar.google.com.hk/citations?user=OijxQCMAAAAJ", "orcid": ";;;0000-0002-9182-8158;;;0000-0002-7709-2543;0000-0003-3170-5647;;0000-0002-5899-5165", "linkedin": ";;;;;;;;;", "or_profile": "~Zhengyan_Zhang1;~Chaojun_Xiao1;~Qiujieli_Qin1;~Yankai_Lin1;~Zhiyuan_Zeng3;~Xu_Han2;~Zhiyuan_Liu1;~Ruobing_Xie2;~Maosong_Sun1;~Jie_Zhou8", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Renmin University of China;Tsinghua University;;Tsinghua University;Tencent;Tsinghua University;WeChat AI, Tencent Inc.", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;ruc.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tencent.com;tsinghua.edu.cn;tencent.com", "position": "PhD student;PhD student;Undergrad student;Assistant Professor;Undergrad student;;Associate Professor;Senior researcher;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nzhang2024exploring,\ntitle={Exploring the Benefit of Activation Sparsity in Pre-training},\nauthor={Zhengyan Zhang and Chaojun Xiao and Qiujieli Qin and Yankai Lin and Zhiyuan Zeng and Xu Han and Zhiyuan Liu and Ruobing Xie and Maosong Sun and Jie Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KfXXPCcobh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1085677, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2529066705369747285&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;ruc.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tencent.com;tsinghua.edu.cn;tencent.com", "author_num": 10, "aff_unique_index": "0;0;0;1;0;0;2;0;2", "aff_unique_norm": "Tsinghua University;Renmin University of China;Tencent", "aff_unique_dep": ";;Tencent Holdings Limited", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ruc.edu.cn;https://www.tencent.com", 
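The sparse-dense switching described in the SSD abstract above lends itself to a compact illustration. Below is a minimal, self-contained sketch that alternates between a dense forward pass and a top-k expert-sparse forward pass in a toy FFN; the router, the phase schedule, and all dimensions are illustrative assumptions, not the authors' implementation.

```python
# Toy sketch of Switchable Sparse-Dense (SSD) training: alternate between a
# dense FFN forward pass and an MoE-style pass that activates only the top-k
# expert blocks. Schedule and router are assumed for illustration.
import torch
import torch.nn as nn

class SwitchableFFN(nn.Module):
    """Toy FFN whose hidden units are grouped into expert blocks."""
    def __init__(self, d_model=64, d_ff=256, n_experts=8, k=2):
        super().__init__()
        assert d_ff % n_experts == 0
        self.up = nn.Linear(d_model, d_ff)
        self.down = nn.Linear(d_ff, d_model)
        self.router = nn.Linear(d_model, n_experts)  # assumed router design
        self.block, self.k = d_ff // n_experts, k

    def forward(self, x, sparse: bool):
        h = torch.relu(self.up(x))                    # (batch, d_ff)
        if sparse:
            # MoE-style pass: keep only the k highest-scoring expert blocks.
            scores = self.router(x)                   # (batch, n_experts)
            idx = scores.topk(self.k, dim=-1).indices
            mask = torch.zeros_like(scores).scatter_(1, idx, 1.0)
            h = h * mask.repeat_interleave(self.block, dim=1)
        return self.down(h)

model = SwitchableFFN()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
for step in range(200):
    x = torch.randn(32, 64)
    # Assumed schedule: dense warm-up, then alternating sparse/dense phases.
    sparse = step >= 50 and (step // 25) % 2 == 0
    loss = ((model(x, sparse) - x) ** 2).mean()       # toy objective
    opt.zero_grad(); loss.backward(); opt.step()
```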
"aff_unique_abbr": "THU;RUC;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "CKGConv: General Graph Convolution with Continuous Kernels", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34331", "id": "KgfGxXbjjE", "proceeding": "https://proceedings.mlr.press/v235/ma24k.html", "pdf": "https://openreview.net/pdf?id=KgfGxXbjjE", "openreview": "https://openreview.net/forum?id=KgfGxXbjjE", "author_site": "Liheng Ma, Soumyasundar Pal, Yitian Zhang, Jiaming Zhou, Yingxue Zhang, Mark Coates", "tldr": "", "abstract": "The existing definitions of graph convolution, either from spatial or spectral perspectives, are inflexible and not unified. Defining a general convolution operator in the graph domain is challenging due to the lack of canonical coordinates, the presence of irregular structures, and the properties of graph symmetries. In this work, we propose a novel and general graph convolution framework by parameterizing the kernels as continuous functions of pseudo-coordinates derived via graph positional encoding. We name this Continuous Kernel Graph Convolution (CKGConv). Theoretically, we demonstrate that CKGConv is flexible and expressive. CKGConv encompasses many existing graph convolutions, and exhibits a stronger expressiveness, as powerful as graph transformers in terms of distinguishing non-isomorphic graphs. Empirically, we show that CKGConv-based Networks outperform existing graph convolutional networks and perform comparably to the best graph transformers across a variety of graph datasets. The code and models are publicly available at https://github.com/networkslab/CKGConv.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liheng Ma;Soumyasundar Pal;Yitian Zhang;Jiaming Zhou;Yingxue Zhang;Mark Coates", "authorids": "~Liheng_Ma1;~Soumyasundar_Pal1;~Yitian_Zhang2;~Jiaming_Zhou3;~Yingxue_Zhang1;~Mark_Coates1", "gender": "M;M;F;;F;M", "homepage": "https://liamma.github.io/;https://www.researchgate.net/profile/Soumyasundar_Pal;;https://www.linkedin.com/in/jiaming-zhou-5986312a/;;http://www.ece.mcgill.ca/~mcoate/", "dblp": "244/4404;216/4097;;;174/0010-1.html;c/MarkCoates", "google_scholar": "abfvaXwAAAAJ;zbH9oDQAAAAJ;7qVp8CoAAAAJ;;4bsYpogAAAAJ;https://scholar.google.ca/citations?user=qxWORNoAAAAJ", "orcid": ";;;;;0000-0001-5030-1379", "linkedin": "liheng-ma-902103134/;soumyasundar-pal-17a56637/;;;yingxue-zhang-03971b112/;", "or_profile": "~Liheng_Ma1;~Soumyasundar_Pal1;~Yitian_Zhang2;~Jiaming_Zhou3;~Yingxue_Zhang1;~Mark_Coates1", "aff": "Noah's Ark Lab, Montreal, Huawei Technologies Ltd. 
;Huawei Technologies Ltd.;McGill University, McGill University;Huawei Technologies Ltd.;Huawei Canada, Huawei Noah's Ark Lab;McGill University", "aff_domain": "huawei.com;huawei.com;mail.mcgill.ca;huawei.com;huawei.com;mcgill.ca", "position": "Intern;Researcher;PhD student;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nma2024ckgconv,\ntitle={{CKGC}onv: General Graph Convolution with Continuous Kernels},\nauthor={Liheng Ma and Soumyasundar Pal and Yitian Zhang and Jiaming Zhou and Yingxue Zhang and Mark Coates},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KgfGxXbjjE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 675642, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17369780932995213654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "huawei.com;huawei.com;mail.mcgill.ca;huawei.com;huawei.com;mcgill.ca", "author_num": 6, "aff_unique_index": "0;0;1;0;0;1", "aff_unique_norm": "Huawei;McGill University", "aff_unique_dep": "Noah's Ark Lab;", "aff_unique_url": "https://www.huawei.com;https://www.mcgill.ca", "aff_unique_abbr": "Huawei;McGill", "aff_campus_unique_index": "0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;1;0;1;0;0", "aff_country_unique": "Canada;China" }, { "title": "Understanding the Effects of Iterative Prompting on Truthfulness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34330", "id": "KjazcKPMME", "proceeding": "https://proceedings.mlr.press/v235/krishna24a.html", "pdf": "https://openreview.net/pdf?id=KjazcKPMME", "openreview": "https://openreview.net/forum?id=KjazcKPMME", "author_site": "Satyapriya Krishna, Chirag Agarwal, Himabindu Lakkaraju", "tldr": "", "abstract": "The development of Large Language Models (LLMs) has notably transformed numerous sectors, offering impressive text generation capabilities. Yet, the reliability and truthfulness of these models remain pressing concerns. To this end, we investigate iterative prompting, a strategy hypothesized to refine LLM responses, assessing its impact on LLM truthfulness, an area which has not been thoroughly explored. Our extensive experiments explore the intricacies of iterative prompting variants, examining their influence on the accuracy and calibration of model responses. Our findings reveal that naive prompting methods significantly undermine truthfulness, leading to exacerbated calibration errors. In response to these challenges, we introduce several prompting variants designed to address the identified issues. These variants demonstrate marked improvements over existing baselines, signaling a promising direction for future research. 
Our work provides a nuanced understanding of iterative prompting and introduces novel approaches to enhance the truthfulness of LLMs, thereby contributing to the development of more accurate and trustworthy AI systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Satyapriya Krishna;Chirag Agarwal;Himabindu Lakkaraju", "authorids": "~Satyapriya_Krishna2;~Chirag_Agarwal1;~Himabindu_Lakkaraju1", "gender": "M;M;F", "homepage": "http://satyapriyakrishna.com/;https://chirag-agarwall.github.io/;http://web.stanford.edu/~himalv", "dblp": "251/9225;173/8821;68/9376", "google_scholar": "Q5bfPlkAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;", "linkedin": "satyapriya-krishna-50553084/;chirag-agarwal-0a6a43a1/;", "or_profile": "~Satyapriya_Krishna2;~Chirag_Agarwal1;~Hima_Lakkaraju1", "aff": "Harvard University;Harvard University;Harvard University", "aff_domain": "harvard.edu;hbs.edu;harvard.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nkrishna2024understanding,\ntitle={Understanding the Effects of Iterative Prompting on Truthfulness},\nauthor={Satyapriya Krishna and Chirag Agarwal and Himabindu Lakkaraju},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KjazcKPMME}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1100974, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1973336628482719854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "harvard.edu;hbs.edu;harvard.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "MathScale: Scaling Instruction Tuning for Mathematical Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34329", "id": "Kjww7ZN47M", "proceeding": "https://proceedings.mlr.press/v235/tang24k.html", "pdf": "https://openreview.net/pdf?id=Kjww7ZN47M", "openreview": "https://openreview.net/forum?id=Kjww7ZN47M", "author_site": "Zhengyang Tang, Xingxing Zhang, Benyou Wang, Furu Wei", "tldr": "", "abstract": "Large language models (LLMs) have demonstrated remarkable capabilities in problem-solving. However, their proficiency in solving mathematical problems remains inadequate. We propose MathScale, a simple and scalable method to create high-quality mathematical reasoning data using frontier LLMs (e.g., GPT-3.5). Inspired by the cognitive mechanism in human mathematical learning, it first extracts topics and knowledge points from seed math questions and then builds a concept graph, which is subsequently used to generate new math questions. MathScale exhibits effective scalability along the size axis of the math dataset that we generate. As a result, we create a mathematical reasoning dataset (MathScaleQA) containing two million math question-answer pairs. To evaluate the mathematical reasoning abilities of LLMs comprehensively, we construct MWPBench, a benchmark of Math Word Problems, which is a collection of 9 datasets (including GSM8K and MATH) covering K-12, college, and competition-level math problems. 
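To make the concept-graph pipeline in the MathScale abstract above concrete, here is a heavily simplified sketch: knowledge points that co-occur in seed questions form a graph, and random walks over that graph yield concept sets to prompt a generator. The seed labels and the `call_llm` stub are hypothetical placeholders, not artifacts from the paper.

```python
# Sketch of a MathScale-style pipeline (assumed details): build a concept
# co-occurrence graph from seed questions, sample connected concept sets,
# and hand them to a (stubbed) frontier-LLM generator.
import itertools
import random
from collections import defaultdict

seed_concepts = [  # concepts extracted per seed question (assumed labels)
    {"fractions", "ratios"},
    {"ratios", "percentages"},
    {"linear equations", "percentages"},
]

graph = defaultdict(set)  # concept -> co-occurring concepts
for concepts in seed_concepts:
    for a, b in itertools.permutations(concepts, 2):
        graph[a].add(b)

def sample_concepts(n=2, max_steps=10):
    """Random walk on the concept graph to pick a coherent concept set."""
    node = random.choice(list(graph))
    picked = {node}
    for _ in range(max_steps):
        if len(picked) >= n or not graph[node]:
            break
        node = random.choice(list(graph[node]))
        picked.add(node)
    return picked

def call_llm(prompt):  # hypothetical stand-in for an LLM API call
    return f"[LLM would generate a question here for: {prompt}]"

topic = sample_concepts()
print(call_llm(f"Write a new math word problem combining: {', '.join(sorted(topic))}"))
```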
We apply MathScaleQA to fine-tune open-source LLMs (e.g., LLaMA-2 and Mistral), resulting in significantly improved capabilities in mathematical reasoning. Evaluated on MWPBench, MathScale-7B achieves state-of-the-art performance across all datasets, surpassing its best peers of equivalent size by 42.8% in micro average accuracy and 43.6% in macro average accuracy, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengyang Tang;Xingxing Zhang;Benyou Wang;Furu Wei", "authorids": "~Zhengyang_Tang1;~Xingxing_Zhang1;~Benyou_Wang2;~Furu_Wei1", "gender": "M;M;M;M", "homepage": ";https://xingxingzhang.github.io/;https://wabyking.github.io/old.html;https://www.microsoft.com/en-us/research/people/fuwei/", "dblp": "247/3097;59/9985-2.html;169/1793;72/5870", "google_scholar": "2RRV0PQAAAAJ;5yX53usAAAAJ;Jk4vJU8AAAAJ;G-V1VpwAAAAJ", "orcid": ";;0000-0002-1501-9914;", "linkedin": ";;;", "or_profile": "~Zhengyang_Tang1;~Xingxing_Zhang1;~Benyou_Wang2;~Furu_Wei1", "aff": "The Chinese University of Hong Kong, Shenzhen;Microsoft Research Asia;The Chinese University of Hong Kong, Shenzhen;Microsoft Research", "aff_domain": "cuhk.edu.cn;microsoft.com;cuhk.edu.cn;microsoft.com", "position": "PhD student;Researcher;Assistant Professor;Distinguished Scientist", "bibtex": "@inproceedings{\ntang2024mathscale,\ntitle={MathScale: Scaling Instruction Tuning for Mathematical Reasoning},\nauthor={Zhengyang Tang and Xingxing Zhang and Benyou Wang and Furu Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Kjww7ZN47M}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 773623, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17896122923508373346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cuhk.edu.cn;microsoft.com;cuhk.edu.cn;microsoft.com", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Chinese University of Hong Kong;Microsoft", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "CUHK;MSR Asia", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Shenzhen;Asia;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Data-efficient Large Vision Models through Sequential Autoregression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34328", "id": "KmCoS6WkgG", "proceeding": "https://proceedings.mlr.press/v235/hao24b.html", "pdf": "https://openreview.net/pdf?id=KmCoS6WkgG", "openreview": "https://openreview.net/forum?id=KmCoS6WkgG", "author_site": "Zhiwei Hao, Jianyuan Guo, Chengcheng Wang, Yehui Tang, Han Wu, Han Hu, Kai Han, Chang Xu", "tldr": "", "abstract": "Training general-purpose vision models on purely sequential visual data, eschewing linguistic inputs, has heralded a new frontier in visual understanding. These models are intended to not only comprehend but also seamlessly transit to out-of-domain tasks. However, current endeavors are hamstrung by an over-reliance on colossal models, exemplified by models with upwards of 3B parameters, and the necessity for an extensive corpus of visual data, often comprising a staggering 400B tokens. 
In this paper, we delve into the development of an efficient, autoregression-based vision model, innovatively architected to operate on a limited dataset. We meticulously demonstrate how this model achieves proficiency in a spectrum of visual tasks spanning both high-level and low-level semantic understanding during the testing phase. Our empirical evaluations underscore the model's agility in adapting to various tasks, heralding a significant reduction in the parameter footprint, and a marked decrease in training data requirements, thereby paving the way for more sustainable and accessible advancements in the field of generalist vision models. The code is available at https://github.com/ggjy/DeLVM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiwei Hao;Jianyuan Guo;Chengcheng Wang;Yehui Tang;Han Wu;Han Hu;Kai Han;Chang Xu", "authorids": "~Zhiwei_Hao1;~Jianyuan_Guo1;~Chengcheng_Wang1;~Yehui_Tang1;~Han_Wu4;~Han_Hu6;~Kai_Han2;~Chang_Xu4", "gender": ";M;M;M;F;;M;", "homepage": ";https://ggjy.github.io/;;;https://www.sydney.edu.au/business/about/our-people/research-students/han-wu-293.html;;https://iamhankai.github.io;", "dblp": "125/5604;190/0258;;244/9659;13/1864;;51/4757-2;", "google_scholar": "MwDSTNAAAAAJ;https://scholar.google.com/citations?hl=en;OfmE9XUAAAAJ;TkSZQ6gAAAAJ;;;vThoBVcAAAAJ;", "orcid": ";;;;0000-0002-3750-0696;;0000-0002-9761-2702;", "linkedin": ";;;;;;;", "or_profile": "~Zhiwei_Hao1;~Jianyuan_Guo1;~Chengcheng_Wang1;~Yehui_Tang1;~Han_Wu4;~Han_Hu6;~Kai_Han2;~Chang_Xu4", "aff": "Beijing Institute of Technology;University of Sydney;Huawei Technologies Ltd.;Huawei Technologies Ltd.;University of Sydney;;Huawei Noah's Ark Lab;", "aff_domain": "bit.edu.cn;usyd.edu.au;huawei.com;huawei.com;sydney.edu.au;;huawei.com;", "position": "PhD student;PhD student;Researcher;Researcher;PhD student;;Principal Researcher;", "bibtex": "@inproceedings{\nhao2024dataefficient,\ntitle={Data-efficient Large Vision Models through Sequential Autoregression},\nauthor={Zhiwei Hao and Jianyuan Guo and Chengcheng Wang and Yehui Tang and Han Wu and Han Hu and Kai Han and Chang Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KmCoS6WkgG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9782932, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2222182606979396073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "bit.edu.cn;usyd.edu.au;huawei.com;huawei.com;sydney.edu.au;;huawei.com;", "author_num": 8, "aff_unique_index": "0;1;2;2;1;2", "aff_unique_norm": "Beijing Institute of Technology;University of Sydney;Huawei", "aff_unique_dep": ";;Huawei Technologies", "aff_unique_url": "http://www.bit.edu.cn/;https://www.sydney.edu.au;https://www.huawei.com", "aff_unique_abbr": "BIT;USYD;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0", "aff_country_unique": "China;Australia" }, { "title": "HGAP: Boosting Permutation Invariant and Permutation Equivariant in Multi-Agent Reinforcement Learning via Graph Attention Network", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34327", "id": "KpUdNe9lsr", "proceeding": "https://proceedings.mlr.press/v235/lin24m.html", "pdf": "https://openreview.net/pdf?id=KpUdNe9lsr", "openreview": "https://openreview.net/forum?id=KpUdNe9lsr", "author_site": "Bor 
Jiun Lin, Chun-Yi Lee", "tldr": "", "abstract": "Graph representation has gained widespread application across various machine learning domains, attributed to its ability to discern correlations among input nodes. In the realm of Multi-Agent Reinforcement Learning (MARL), agents are tasked with observing other entities within their environment to determine their behavior. Conventional MARL methodologies often suffer from training difficulties if Permutation Invariant (PI) and Permutation Equivariant (PE) properties are not considered during training. The adoption of graph representation offers a solution to these challenges by conceptualizing observed entities as a graph. In this context, we introduce the Hyper Graphical Attention Policy (HGAP) Network, which employs a graph attention mechanism to fulfill the PI and PE properties, while also understanding inter-entity interactions for decision-making. HGAP is assessed across various MARL benchmarks to confirm its effectiveness and efficiency. In addition, a series of ablation studies are provided to demonstrate its adaptability, transferability, and the capability to alleviate the complexities introduced by the POMDP constraint.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bor-Jiun Lin;Chun-Yi Lee", "authorids": "~Bor-Jiun_Lin1;~Chun-Yi_Lee1", "gender": "M;M", "homepage": ";https://elsalab.ai", "dblp": ";36/3668", "google_scholar": ";https://scholar.google.com.tw/citations?user=5mYNdo0AAAAJ", "orcid": ";0000-0002-4680-4800", "linkedin": "bor-jiun-lin-b99b80191/;", "or_profile": "~Bor-Jiun_Lin1;~Chun-Yi_Lee1", "aff": "Department of Computer Science, National Tsing Hua University, National Tsinghua University;National Tsing Hua University", "aff_domain": "cs.nthu.edu.tw;nthu.edu.tw", "position": "MS student;Full Professor", "bibtex": "@inproceedings{\nlin2024hgap,\ntitle={{HGAP}: Boosting Permutation Invariant and Permutation Equivariant in Multi-Agent Reinforcement Learning via Graph Attention Network},\nauthor={Bor-Jiun Lin and Chun-Yi Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KpUdNe9lsr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9406644, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9478079500411873308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "cs.nthu.edu.tw;nthu.edu.tw", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "National Tsinghua University;National Tsing Hua University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.nthu.edu.tw", "aff_unique_abbr": "THU;NTHU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Parallelized Spatiotemporal Slot Binding for Videos", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34326", "id": "KpeGdDzucX", "proceeding": "https://proceedings.mlr.press/v235/singh24g.html", "pdf": "https://openreview.net/pdf?id=KpeGdDzucX", "openreview": "https://openreview.net/forum?id=KpeGdDzucX", "author_site": "Gautam Singh, Yue Wang, Jiawei Yang, Boris Ivanovic, Sungjin Ahn, Marco Pavone, Tong Che", "tldr": "", "abstract": "While modern best practices advocate for scalable architectures that support long-range interactions, object-centric models 
are yet to fully embrace these architectures. In particular, existing object-centric models for handling sequential inputs, due to their reliance on RNN-based implementation, show poor stability and capacity and are slow to train on long sequences. We introduce Parallelizable Spatiotemporal Binder or PSB, the first temporally-parallelizable slot learning architecture for sequential inputs. Unlike conventional RNN-based approaches, PSB produces object-centric representations, known as slots, for all time-steps in parallel. This is achieved by refining the initial slots across all time-steps through a fixed number of layers equipped with causal attention. By capitalizing on the parallelism induced by our architecture, the proposed model exhibits a significant boost in efficiency. In experiments, we test PSB extensively as an encoder within an auto-encoding framework paired with a wide variety of decoder options. Compared to the state-of-the-art, our architecture demonstrates stable training on longer sequences, achieves parallelization that results in a 60% increase in training speed, and yields performance that is on par with or better on unsupervised 2D and 3D object-centric scene decomposition and understanding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gautam Singh;Yue Wang;Jiawei Yang;Boris Ivanovic;Sungjin Ahn;Marco Pavone;Tong Che", "authorids": "~Gautam_Singh3;~Yue_Wang2;~Jiawei_Yang1;~Boris_Ivanovic1;~Sungjin_Ahn1;~Marco_Pavone1;~Tong_Che1", "gender": "M;M;M;;;M;M", "homepage": "https://singhgautam.github.io;https://yuewang.xyz;https://jiawei-yang.github.io/;http://www.borisivanovic.com/;;https://web.stanford.edu/~pavone/;", "dblp": "35/2642;33/4822-41;96/2976;203/8356;;91/3382-1.html;125/0738", "google_scholar": "lXpFxDwAAAAJ;v-AEFIEAAAAJ;OYrpIa8AAAAJ;ey9AQcEAAAAJ;;RhOpyXcAAAAJ;7b5tlJkAAAAJ", "orcid": ";;;0000-0002-8698-202X;;;", "linkedin": "gautam-singh-61302463/;;;boris-ivanovic-a3103064;;;", "or_profile": "~Gautam_Singh3;~Yue_Wang2;~Jiawei_Yang1;~Boris_Ivanovic1;~Sungjin_Ahn1;~Marco_Pavone1;~Tong_Che1", "aff": "Rutgers University;NVIDIA;University of Southern California;NVIDIA;;Stanford University;NVIDIA", "aff_domain": "rutgers.edu;nvidia.com;usc.edu;nvidia.com;;stanford.edu;nvidia.com", "position": "PhD student;Researcher;PhD student;Researcher;;Associate Professor;Researcher", "bibtex": "@inproceedings{\nsingh2024parallelized,\ntitle={Parallelized Spatiotemporal Slot Binding for Videos},\nauthor={Gautam Singh and Yue Wang and Jiawei Yang and Boris Ivanovic and Sungjin Ahn and Marco Pavone and Tong Che},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KpeGdDzucX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4984210, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vDkAO24DCT0J:scholar.google.com/&scioq=Parallelized+Spatiotemporal+Slot+Binding+for+Videos&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "rutgers.edu;nvidia.com;usc.edu;nvidia.com;;stanford.edu;nvidia.com", "author_num": 7, "aff_unique_index": "0;1;2;1;3;1", "aff_unique_norm": "Rutgers University;NVIDIA;University of Southern California;Stanford University", "aff_unique_dep": ";NVIDIA Corporation;;", "aff_unique_url": "https://www.rutgers.edu;https://www.nvidia.com;https://www.usc.edu;https://www.stanford.edu", "aff_unique_abbr": "Rutgers;NVIDIA;USC;Stanford", 
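As a rough illustration of the temporally-parallel refinement described in the PSB abstract above, the sketch below refines initial slots for every time-step jointly, with an attention mask that lets slots at time t read only frames at or before t. The shapes, the single pooled feature per frame, and the residual update are simplifying assumptions, not the authors' architecture.

```python
# Sketch of parallel-in-time slot refinement with a causal time mask
# (assumed shapes and layers; not the PSB implementation).
import torch

B, T, S, D = 2, 5, 4, 32           # batch, time, slots per step, dim
inputs = torch.randn(B, T, D)      # one pooled feature per frame (toy)
slots = torch.randn(1, 1, S, D).expand(B, T, S, D).clone()

q_proj = torch.nn.Linear(D, D)
k_proj = torch.nn.Linear(D, D)
v_proj = torch.nn.Linear(D, D)

causal = torch.tril(torch.ones(T, T)).bool()   # slots at t see frames <= t

for _ in range(3):                 # fixed number of refinement layers
    q = q_proj(slots)                              # (B, T, S, D)
    k, v = k_proj(inputs), v_proj(inputs)          # (B, T, D)
    att = torch.einsum("btsd,bud->btsu", q, k) / D ** 0.5   # (B, T, S, T)
    att = att.masked_fill(~causal[None, :, None, :], float("-inf"))
    slots = slots + torch.einsum("btsu,bud->btsd", att.softmax(-1), v)

print(slots.shape)  # torch.Size([2, 5, 4, 32]) -- slots for all steps at once
```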
"aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Stanford", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Federated Continual Learning via Prompt-based Dual Knowledge Transfer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34325", "id": "Kqa5JakTjB", "proceeding": "https://proceedings.mlr.press/v235/piao24a.html", "pdf": "https://openreview.net/pdf?id=Kqa5JakTjB", "openreview": "https://openreview.net/forum?id=Kqa5JakTjB", "author_site": "Hongming Piao, Yichen WU, Dapeng Wu, Ying WEI", "tldr": "", "abstract": "In Federated Continual Learning (FCL), the challenge lies in effectively facilitating knowledge transfer and enhancing the performance across various tasks on different clients. Current FCL methods predominantly focus on avoiding interference between tasks, thereby overlooking the potential for positive knowledge transfer across tasks learned by different clients at separate time intervals. To address this issue, we introduce a **P**rompt-based kn**ow**le**d**ge transf**er** FCL algorithm, called **Powder**, designed to effectively foster the transfer of knowledge encapsulated in prompts between various sequentially learned tasks and clients. Furthermore, we have devised a unique approach for prompt generation and aggregation, intending to alleviate privacy protection concerns and communication overhead, while still promoting knowledge transfer. Comprehensive experimental results demonstrate the superiority of our method in terms of reduction in communication costs, and enhancement of knowledge transfer. Code is available at https://github.com/piaohongming/Powder.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongming Piao;Yichen Wu;Dapeng Wu;Ying Wei", "authorids": "~Hongming_Piao1;~Yichen_Wu2;~Dapeng_Wu1;~Ying_Wei1", "gender": "M;M;;F", "homepage": "https://github.com/piaohongming;https://wuyichen-97.github.io/;https://www.cs.cityu.edu.hk/~dapengwu/;https://wei-ying.net/", "dblp": "313/9284;;;14/4899-1", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;5UpFdKsAAAAJ", "orcid": ";0000-0003-2859-3285;;", "linkedin": ";;;", "or_profile": "~Hongming_Piao1;~Yichen_Wu2;~Dapeng_Wu1;~Ying_Wei1", "aff": "City University of Hong Kong;City University of Hong Kong;City University of Hong Kong;Nanyang Technological University", "aff_domain": "my.cityu.edu.hk;cityu.edu.hk;cityu.edu.hk;ntu.edu.sg", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\npiao2024federated,\ntitle={Federated Continual Learning via Prompt-based Dual Knowledge Transfer},\nauthor={Hongming Piao and Yichen Wu and Dapeng Wu and Ying Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Kqa5JakTjB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 569076, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9217933656250274748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "my.cityu.edu.hk;cityu.edu.hk;cityu.edu.hk;ntu.edu.sg", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "City University of Hong Kong;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cityu.edu.hk;https://www.ntu.edu.sg", "aff_unique_abbr": "CityU;NTU", "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Improving Accuracy-robustness Trade-off via Pixel Reweighted Adversarial Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34324", "id": "KsUddQl39v", "proceeding": "https://proceedings.mlr.press/v235/zhang24am.html", "pdf": "https://openreview.net/pdf?id=KsUddQl39v", "openreview": "https://openreview.net/forum?id=KsUddQl39v", "author_site": "Jiacheng Zhang, Feng Liu, Dawei Zhou, Jingfeng ZHANG, Tongliang Liu", "tldr": "", "abstract": "Adversarial training (AT) trains models using adversarial examples (AEs), which are natural images modified with specific perturbations to mislead the model. These perturbations are constrained by a predefined perturbation budget $\\epsilon$ and are equally applied to each pixel within an image. However, in this paper, we discover that not all pixels contribute equally to the accuracy on AEs (i.e., robustness) and accuracy on natural images (i.e., accuracy). Motivated by this finding, we propose Pixel-reweighted AdveRsarial Training (PART), a new framework that partially reduces $\\epsilon$ for less influential pixels, guiding the model to focus more on key regions that affect its outputs. Specifically, we first use class activation mapping (CAM) methods to identify important pixel regions, then we keep the perturbation budget for these regions while lowering it for the remaining regions when generating AEs. In the end, we use these pixel-reweighted AEs to train a model. PART achieves a notable improvement in accuracy without compromising robustness on CIFAR-10, SVHN and TinyImagenet-200, justifying the necessity to allocate distinct weights to different pixel regions in robust classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiacheng Zhang;Feng Liu;Dawei Zhou;Jingfeng Zhang;Tongliang Liu", "authorids": "~Jiacheng_Zhang7;~Feng_Liu2;~Dawei_Zhou3;~Jingfeng_Zhang1;~Tongliang_Liu1", "gender": "M;M;M;M;M", "homepage": "https://jiachengz01.github.io/;https://fengliu90.github.io/index.html;https://zjfheart.github.io;https://tongliang-liu.github.io/;", "dblp": ";77/1318-3;227/2664.html;150/6667;39/3130-4", "google_scholar": "https://scholar.google.com.au/citations?user=i9ESB0QAAAAJ;https://scholar.google.com/citations?hl=en;NS0P1FkAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.com.hk/citations?user=7H-LIigAAAAJ", "orcid": "0009-0003-0944-9736;0000-0002-5005-9129;0000-0003-3491-8074;;0000-0002-0694-3603", "linkedin": "jiacheng-zhang-0ab996217/;alexfengliu;;;", "or_profile": "~Jiacheng_Zhang7;~Feng_Liu2;~Jingfeng_Zhang1;~Tongliang_Liu1;~Zhou_Dawei1", "aff": "University of Melbourne;University of Melbourne;University of Auckland;Mohamed bin Zayed University of Artificial Intelligence;Xidian University", "aff_domain": "unimelb.edu.au;unimelb.edu.au;auckland.ac.nz;mbzuai.ac.ae;xidian.edu.cn", "position": "PhD student;Assistant Professor;Assistant Professor;Affiliated Associate Professor;PhD student", "bibtex": "@inproceedings{\nzhang2024improving,\ntitle={Improving Accuracy-robustness Trade-off via Pixel Reweighted Adversarial Training},\nauthor={Jiacheng Zhang and Feng Liu and Dawei Zhou and Jingfeng Zhang and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KsUddQl39v}\n}", "github": "", "project": "", "reviewers": "", 
"pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15896718076924114128&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "unimelb.edu.au;unimelb.edu.au;auckland.ac.nz;mbzuai.ac.ae;xidian.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "University of Melbourne;University of Auckland;Mohamed bin Zayed University of Artificial Intelligence;Xidian University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.unimelb.edu.au;https://www.auckland.ac.nz;https://mbzuai.ac.ae;http://www.xidian.edu.cn/", "aff_unique_abbr": "UniMelb;UoA;MBZUAI;Xidian", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;3", "aff_country_unique": "Australia;New Zealand;United Arab Emirates;China" }, { "title": "Path-Guided Particle-based Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34323", "id": "Kt4fwiuKqf", "proceeding": "https://proceedings.mlr.press/v235/fan24f.html", "pdf": "https://openreview.net/pdf?id=Kt4fwiuKqf", "openreview": "https://openreview.net/forum?id=Kt4fwiuKqf", "author_site": "Mingzhou Fan, Ruida Zhou, Chao Tian, Xiaoning Qian", "tldr": "", "abstract": "Particle-based Bayesian inference methods by sampling from a partition-free target (posterior) distribution, e.g., Stein variational gradient descent (SVGD), have attracted significant attention. We propose a path-guided particle-based sampling (PGPS) method based on a novel Log-weighted Shrinkage (LwS) density path linking an initial distribution to the target distribution. We propose to utilize a Neural network to learn a vector field motivated by the Fokker-Planck equation of the designed density path. Particles, initiated from the initial distribution, evolve according to the ordinary differential equation defined by the vector field. The distribution of these particles is guided along a density path from the initial distribution to the target distribution. The proposed LwS density path allows for an efficient search of modes of the target distribution while canonical methods fail. We theoretically analyze the Wasserstein distance of the distribution of the PGPS-generated samples and the target distribution due to approximation and discretization errors. 
Practically, the proposed PGPS-LwS method demonstrates higher Bayesian inference accuracy and better calibration ability in experiments conducted on both synthetic and real-world Bayesian learning tasks, compared to baselines, such as SVGD and Langevin dynamics, etc.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingzhou Fan;Ruida Zhou;Chao Tian;Xiaoning Qian", "authorids": "~Mingzhou_Fan1;~Ruida_Zhou1;~Chao_Tian2;~Xiaoning_Qian2", "gender": "M;M;;M", "homepage": "https://www.google.com/;https://sites.google.com/view/ruida-zhou;;https://www.ece.tamu.edu/~xqian", "dblp": "294/0813;215/2026;;62/4504", "google_scholar": ";kXbo1twAAAAJ;;dXGlddgAAAAJ", "orcid": ";;;0000-0002-4347-2476", "linkedin": ";;;", "or_profile": "~Mingzhou_Fan1;~Ruida_Zhou1;~Chao_Tian2;~Xiaoning_Qian2", "aff": "Texas A&M;University of California, Los Angeles;;Texas A&M", "aff_domain": "tamu.edu;ucla.edu;;tamu.edu", "position": "PhD student;Postdoc;;Full Professor", "bibtex": "@inproceedings{\nfan2024pathguided,\ntitle={Path-Guided Particle-based Sampling},\nauthor={Mingzhou Fan and Ruida Zhou and Chao Tian and Xiaoning Qian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Kt4fwiuKqf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 987467, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12116784415782814794&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "tamu.edu;ucla.edu;;tamu.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Texas A&M University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.tamu.edu;https://www.ucla.edu", "aff_unique_abbr": "TAMU;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "AI Control: Improving Safety Despite Intentional Subversion", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34322", "id": "KviM5k8pcP", "proceeding": "https://proceedings.mlr.press/v235/greenblatt24a.html", "pdf": "https://openreview.net/pdf?id=KviM5k8pcP", "openreview": "https://openreview.net/forum?id=KviM5k8pcP", "author_site": "Ryan Greenblatt, Buck Shlegeris, Kshitij Sachan, Fabien Roger", "tldr": "", "abstract": "As large language models (LLMs) become more powerful and are deployed more autonomously, it will be increasingly important to prevent them from causing harmful outcomes. To do so, safety measures either aim at making LLMs try to avoid harmful outcomes or aim at preventing LLMs from causing harmful outcomes, even if they try to cause them. In this paper, we focus on this second layer of defense. We develop and evaluate pipelines of safety techniques (protocols) that try to ensure safety despite intentional subversion - an approach we call AI control. We investigate a setting in which we want to solve a sequence of programming problems without ever submitting subtly wrong code, using access to a powerful but untrusted model (in our case, GPT-4), access to a less powerful trusted model (in our case, GPT-3.5), and limited access to high-quality trusted labor. We investigate a range of protocols and red-team them by exploring strategies that the untrusted model could use to subvert them. 
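One family of protocols evaluated in this paper, trusted monitoring, can be sketched in a few lines: the untrusted model proposes code, the trusted model assigns a suspicion score, and only the most suspicious fraction is escalated to scarce trusted (human) review. The two model stubs below are hypothetical placeholders, not an actual GPT-4/GPT-3.5 pipeline.

```python
# Minimal sketch of a trusted-monitoring control protocol (assumed stubs).
import random

def untrusted_solve(problem):          # stand-in for the untrusted model
    return f"def solve():  # solution to {problem!r}\n    return 42\n"

def trusted_score(code):               # stand-in for the trusted monitor
    return random.random()             # higher = more suspicious

def run_protocol(problems, audit_budget=0.02):
    """Score every submission; escalate only the top fraction for audit."""
    submissions = [(p, untrusted_solve(p)) for p in problems]
    ranked = sorted(submissions, key=lambda s: trusted_score(s[1]), reverse=True)
    n_audit = max(1, int(audit_budget * len(ranked)))
    return ranked[:n_audit], ranked[n_audit:]   # (audited, auto-accepted)

audited, accepted = run_protocol([f"problem {i}" for i in range(100)])
print(len(audited), "submissions escalated for trusted review")
```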
We find that using the trusted model to edit untrusted-model code or using the untrusted model as a monitor substantially improves on simple baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ryan Greenblatt;Buck Shlegeris;Kshitij Sachan;Fabien Roger", "authorids": "~Ryan_Greenblatt1;~Buck_Shlegeris1;~Kshitij_Sachan1;~Fabien_Roger1", "gender": ";M;M;M", "homepage": ";https://github.com/bshlgrs;http://kshitijsachan.com;", "dblp": ";228/8176;;336/6227", "google_scholar": ";oyDxKw0AAAAJ;;La75jqEAAAAJ", "orcid": ";;;", "linkedin": "ryan-greenblatt-4b9907134/;;http://linkedin.com/in/kshitij-sachan-70bb1615b/;", "or_profile": "~Ryan_Greenblatt1;~Buck_Shlegeris1;~Kshitij_Sachan1;~Fabien_Roger1", "aff": ";Redwood Research;;Redwood Research", "aff_domain": ";rdwrs.com;;redwoodresearch.org", "position": ";Principal Researcher;;Researcher", "bibtex": "@inproceedings{\ngreenblatt2024ai,\ntitle={{AI} Control: Improving Safety Despite Intentional Subversion},\nauthor={Ryan Greenblatt and Buck Shlegeris and Kshitij Sachan and Fabien Roger},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KviM5k8pcP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 759508, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5264180987323077783&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";rdwrs.com;;redwoodresearch.org", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Redwood Research", "aff_unique_dep": "", "aff_unique_url": "https://www.redwoodresearch.org", "aff_unique_abbr": "Redwood Research", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Improving Computational Complexity in Statistical Models with Local Curvature Information", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34321", "id": "KwgAThfxEd", "proceeding": "https://proceedings.mlr.press/v235/akbarian24a.html", "pdf": "https://openreview.net/pdf?id=KwgAThfxEd", "openreview": "https://openreview.net/forum?id=KwgAThfxEd", "author_site": "Pedram Akbarian, Tongzheng Ren, Jiacheng Zhuo, Sujay Sanghavi, Nhat Ho", "tldr": "", "abstract": "It is known that when the statistical models are singular, i.e., the Fisher information matrix at the true parameter is degenerate, the fixed step-size gradient descent algorithm takes a polynomial number of steps in terms of the sample size $n$ to converge to a final statistical radius around the true parameter, which can be unsatisfactory for practical applications. To further improve that computational complexity, we consider utilizing the local curvature information for parameter estimation. Even though there is a rich literature on using local curvature information for optimization, the statistical rate of these methods in statistical models, to the best of our knowledge, has not been studied rigorously. The major challenge of this problem is due to the non-convex nature of the sample loss function. To shed light on these problems, we specifically study the normalized gradient descent (NormGD) algorithm, a variant of the gradient descent algorithm whose step size is scaled by the maximum eigenvalue of the Hessian matrix of the empirical loss function, and deal with the aforementioned issue with a population-to-sample analysis. 
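The NormGD update described above differs from plain gradient descent only in its step size. A minimal sketch on an assumed non-convex toy loss follows; the Hessian is formed exactly here for clarity, whereas larger problems would estimate its top eigenvalue iteratively (e.g., by power iteration).

```python
# Sketch of NormGD: gradient descent whose step is divided by the largest
# Hessian eigenvalue of the empirical loss. The toy loss is an assumption.
import torch
from torch.autograd.functional import hessian

def loss_fn(theta):  # assumed non-convex toy objective
    return (theta ** 4).sum() + (theta[0] * theta[1] - 1.0) ** 2

theta = torch.tensor([2.0, -1.5])
for _ in range(50):
    theta = theta.detach().requires_grad_(True)
    grad, = torch.autograd.grad(loss_fn(theta), theta)
    H = hessian(loss_fn, theta.detach())                  # full 2x2 Hessian
    lam_max = torch.linalg.eigvalsh(H).max().clamp(min=1e-3)
    theta = theta.detach() - grad / lam_max               # NormGD step
print(theta)
```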
When the population loss function is homogeneous, the NormGD iterates reach a final statistical radius around the true parameter after a logarithmic number of iterations in terms of $n$. Therefore, for fixed dimension $d$, the NormGD algorithm achieves the optimal computational complexity $\\mathcal{O}(n)$ to reach the final statistical radius, which is cheaper than the complexity $\\mathcal{O}(n^{\\tau})$ of the fixed step-size gradient descent algorithm for some $\\tau > 1$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pedram Akbarian;Tongzheng Ren;Jiacheng Zhuo;sujay sanghavi;Nhat Ho", "authorids": "~Pedram_Akbarian1;~Tongzheng_Ren1;~Jiacheng_Zhuo1;~sujay_sanghavi1;~Nhat_Ho1", "gender": "M;M;;M;M", "homepage": "https://pedakb.github.io/;https://www.cs.utexas.edu/~tzren/;http://www.cs.utexas.edu/~jzhuo/;https://sites.utexas.edu/sanghavi;https://nhatptnk8912.github.io/", "dblp": "358/2800;211/8004;198/0672;69/4911.html;203/4479", "google_scholar": "eg68QWIAAAAJ;VgNDYeYAAAAJ;GlArL6AAAAAJ;O-DazBUAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;nhat-pham-minh-ho-267b8164/", "or_profile": "~Pedram_Akbarian1;~Tongzheng_Ren1;~Jiacheng_Zhuo1;~sujay_sanghavi1;~Nhat_Ho1", "aff": "University of Texas at Austin;University of Texas, Austin;University of Texas, Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nakbarian2024improving,\ntitle={Improving Computational Complexity in Statistical Models with Local Curvature Information},\nauthor={Pedram Akbarian and Tongzheng Ren and Jiacheng Zhuo and sujay sanghavi and Nhat Ho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KwgAThfxEd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 791502, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Tgppcx1RqwUJ:scholar.google.com/&scioq=Improving+Computational+Complexity+in+Statistical+Models+with+Local+Curvature+Information&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "utexas.edu;utexas.edu;utexas.edu;utexas.edu;utexas.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Group Robustness on Spurious Correlation Requires Preciser Group Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34320", "id": "KycvgOCBBR", "proceeding": "https://proceedings.mlr.press/v235/han24g.html", "pdf": "https://openreview.net/pdf?id=KycvgOCBBR", "openreview": "https://openreview.net/forum?id=KycvgOCBBR", "author_site": "Yujin Han, Difan Zou", "tldr": "", "abstract": "Standard empirical risk minimization (ERM) models may prioritize learning spurious correlations between spurious features and true labels, leading to poor accuracy on groups where these correlations do not hold. 
Mitigating this issue often requires expensive spurious attribute (group) labels or relies on trained ERM models to infer group labels when group information is unavailable. However, the significant performance gap in worst-group accuracy between using pseudo group labels and using oracle group labels inspires us to consider further improving group robustness through preciser group inference. Therefore, we propose GIC, a novel method that accurately infers group labels, resulting in improved worst-group performance. GIC trains a spurious attribute classifier based on two key properties of spurious correlations: (1) high correlation between spurious attributes and true labels, and (2) variability in this correlation between datasets with different group distributions. Empirical studies on multiple datasets demonstrate the effectiveness of GIC in inferring group labels, and combining GIC with various downstream invariant learning methods improves worst-group accuracy, showcasing its powerful flexibility. Additionally, through analyzing the misclassifications in GIC, we identify an interesting phenomenon called semantic consistency, which may contribute to better decoupling the association between spurious attributes and labels, thereby mitigating spurious correlation. The code for GIC is available at [https://github.com/yujinhanml/GIC](https://github.com/yujinhanml/GIC).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujin Han;Difan Zou", "authorids": "~Yujin_Han1;~Difan_Zou1", "gender": "F;M", "homepage": "https://yujinhanml.github.io/;https://difanzou.github.io/", "dblp": "317/6852;161/8923", "google_scholar": "https://scholar.google.co.kr/citations?user=SxpbS5YAAAAJ;Cp4fcTQAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yujin_Han1;~Difan_Zou1", "aff": "the University of Hong Kong;University of Hong Kong", "aff_domain": "cs.hku.hk;hku.hk", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nhan2024improving,\ntitle={Improving Group Robustness on Spurious Correlation Requires Preciser Group Inference},\nauthor={Yujin Han and Difan Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KycvgOCBBR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8297699, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3382369504954136845&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cs.hku.hk;hku.hk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "QUEST: Query-Aware Sparsity for Efficient Long-Context LLM Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34319", "id": "KzACYw0MTV", "proceeding": "https://proceedings.mlr.press/v235/tang24l.html", "pdf": "https://openreview.net/pdf?id=KzACYw0MTV", "openreview": "https://openreview.net/forum?id=KzACYw0MTV", "author_site": "Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, Song Han", "tldr": "", "abstract": "As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly
prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe that the criticality of a token depends heavily on the query. To this end, we propose Quest, a query-aware KV cache selection algorithm. Quest keeps track of the minimal and maximal Key values in KV cache pages and estimates the criticality of a given page using Query vectors. By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies, with negligible accuracy loss. Code is available at https://github.com/mit-han-lab/quest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaming Tang;Yilong Zhao;Kan Zhu;Guangxuan Xiao;Baris Kasikci;Song Han", "authorids": "~Jiaming_Tang1;~Yilong_Zhao1;~Kan_Zhu1;~Guangxuan_Xiao1;~Baris_Kasikci2;~Song_Han5", "gender": "M;M;M;;M;", "homepage": "https://jiamingtang.me;https://github.com/happierpig;https://kanzhu.netlify.app/;;https://homes.cs.washington.edu/~baris/;", "dblp": ";152/7119;175/1387;;31/11029;", "google_scholar": "lXLFEIAAAAAJ;xpY4dywAAAAJ;;;https://scholar.google.com.tw/citations?user=y5J0h7gAAAAJ;", "orcid": "0009-0004-4186-6561;;0009-0002-3462-3292;;;", "linkedin": "jiaming-tang-b71963263/;;;;bariskasikci/;", "or_profile": "~Jiaming_Tang1;~Yilong_Zhao1;~Kan_Zhu1;~Guangxuan_Xiao1;~Baris_Kasikci2;~Song_Han5", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;University of Washington;;Google;", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;uw.edu;;google.com;", "position": "Undergrad student;Undergrad student;PhD student;;Visiting Researcher;", "bibtex": "@inproceedings{\ntang2024quest,\ntitle={{QUEST}: Query-Aware Sparsity for Efficient Long-Context {LLM} Inference},\nauthor={Jiaming Tang and Yilong Zhao and Kan Zhu and Guangxuan Xiao and Baris Kasikci and Song Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KzACYw0MTV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2128506, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7628581081666854199&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;uw.edu;;google.com;", "author_num": 6, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Shanghai Jiao Tong University;University of Washington;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.washington.edu;https://www.google.com", "aff_unique_abbr": "SJTU;UW;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34318", "id": "L057s2Rq8O", "proceeding": "https://proceedings.mlr.press/v235/liu24bz.html", "pdf": "https://openreview.net/pdf?id=L057s2Rq8O", "openreview":
"https://openreview.net/forum?id=L057s2Rq8O", "author_site": "Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen (Henry) Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, Xia Hu", "tldr": "", "abstract": "Efficiently serving large language models (LLMs) requires batching many requests together to reduce the cost per request. Yet, the key-value (KV) cache, which stores attention keys and values to avoid re-computations, significantly increases memory demands and becomes the new bottleneck in speed and memory usage. This memory demand increases with larger batch sizes and longer context lengths. Additionally, the inference speed is limited by the size of KV cache, as the GPU's SRAM must load the entire KV cache from the main GPU memory for each token generated, causing the computational core to be idle during this process. A straightforward and effective solution to reduce KV cache size is quantization, which decreases the total bytes taken by KV cache. However, there is a lack of in-depth studies that explore the element distribution of KV cache to understand the hardness and limitation of KV cache quantization. To fill the gap, we conducted a comprehensive study on the element distribution in KV cache of popular LLMs. Our findings indicate that the key cache should be quantized per-channel, i.e., group elements along the channel dimension and quantize them together. In contrast, the value cache should be quantized per-token. From this analysis, we developed a tuning-free 2bit KV cache quantization algorithm, named KIVI. With the hardware-friendly implementation, KIVI can enable Llama (Llama-2), Falcon, and Mistral models to maintain almost the same quality while using 2.6$\\times$ less peak memory usage (including the model weight). This reduction in memory usage enables up to 4x larger batch size, bringing $2.35 \\times \\sim 3.47 \\times$ throughput on real LLM inference workload.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zirui Liu;Jiayi Yuan;Hongye Jin;Shaochen Zhong;Zhaozhuo Xu;Vladimir Braverman;Beidi Chen;Xia Hu", "authorids": "~Zirui_Liu1;~Jiayi_Yuan1;~Hongye_Jin1;~Shaochen_Zhong1;~Zhaozhuo_Xu2;~Vladimir_Braverman1;~Beidi_Chen1;~Xia_Hu4", "gender": "M;;M;M;;Unspecified;F;M", "homepage": "https://zirui-ray-liu.github.io/;https://jy-yuan.github.io/;https://github.com/Mooler0410;https://openreview.net/profile?id=~Shaochen_Zhong1;https://ottovonxu.github.io/;http://www.cs.jhu.edu/~vova/;https://www.andrew.cmu.edu/user/beidic/;https://cs.rice.edu/~xh37/index.html", "dblp": "196/8629-1.html;251/4029-1.html;268/7929;326/7286.html;195/4352;14/4758;192/1339;256/9406.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;XMrlrV8AAAAJ;;https://scholar.google.com/citations?hl=en;7tDlVAsAAAAJ;https://scholar.google.com.tw/citations?user=DTthB48AAAAJ;;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;shaochen-henry-zhong-96a941249/;;;;", "or_profile": "~Zirui_Liu1;~Jiayi_Yuan1;~Hongye_Jin1;~Shaochen_Zhong1;~Zhaozhuo_Xu2;~Vladimir_Braverman1;~Beidi_Chen1;~Xia_Hu2", "aff": "Rice University;Rice University;Texas A&M;Rice University;Rice University;Department of Computer Science, Whiting School of Engineering;Meta Facebook;Rice University", "aff_domain": "rice.edu;rice.edu;tamu.edu;rice.edu;rice.edu;cs.jhu.edu;fb.com;rice.edu", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;Full Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\nliu2024kivi,\ntitle={{KIVI}: A 
Tuning-Free Asymmetric 2bit Quantization for {KV} Cache},\nauthor={Zirui Liu and Jiayi Yuan and Hongye Jin and Shaochen Zhong and Zhaozhuo Xu and Vladimir Braverman and Beidi Chen and Xia Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L057s2Rq8O}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1979912, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 152, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3707282375065918325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "rice.edu;rice.edu;tamu.edu;rice.edu;rice.edu;cs.jhu.edu;fb.com;rice.edu", "author_num": 8, "aff_unique_index": "0;0;1;0;0;2;3;0", "aff_unique_norm": "Rice University;Texas A&M University;Johns Hopkins University;Meta", "aff_unique_dep": ";;Department of Computer Science;Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://www.tamu.edu;https://www.jhu.edu;https://meta.com", "aff_unique_abbr": "Rice;TAMU;JHU;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Baltimore", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning with 3D rotations, a hitchhiker's guide to SO(3)", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34317", "id": "L0VoOdjCUb", "proceeding": "https://proceedings.mlr.press/v235/geist24a.html", "pdf": "https://openreview.net/pdf?id=L0VoOdjCUb", "openreview": "https://openreview.net/forum?id=L0VoOdjCUb", "author_site": "Andreas Ren\u00e9 Geist, Jonas Frey, Mikel Zhobro, Anna Levina, Georg Martius", "tldr": "", "abstract": "Many settings in machine learning require the selection of a rotation representation. However, choosing a suitable representation from the many available options is challenging. This paper acts as a survey and guide through rotation representations. We walk through their properties that harm or benefit deep learning with gradient-based optimization. By consolidating insights from rotation-based learning, we provide a comprehensive overview of learning functions with rotation representations. 
We provide guidance on selecting representations based on whether rotations are in the model\u2019s input or output and whether the data primarily comprises small angles.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andreas Ren\u00e9 Geist;Jonas Frey;Mikel Zhobro;Anna Levina;Georg Martius", "authorids": "~Andreas_Ren\u00e9_Geist1;~Jonas_Frey1;zhobromikel@gmail.com;~Anna_Levina1;~Georg_Martius1", "gender": "M;M;;F;M", "homepage": "https://andregeist.github.io/;https://jonasfrey96.github.io/;;https://uni-tuebingen.de/index.php?id=161236;https://uni-tuebingen.de/de/264672", "dblp": ";;;82/2964;47/2706", "google_scholar": "JDBDDEgAAAAJ;e5uPDzcAAAAJ;;https://scholar.google.de/citations?user=KJSnbQoAAAAJ;https://scholar.google.de/citations?user=b-JF-UIAAAAJ", "orcid": "0000-0003-2551-2419;0000-0002-7401-2173;;0000-0003-1355-6617;", "linkedin": "andreas-rene-geist/;jonasfrey96/;;;", "or_profile": "~Andreas_Ren\u00e9_Geist1;~Jonas_Frey1;zhobromikel@gmail.com;~Anna_Levina1;~Georg_Martius1", "aff": "Max-Planck-Institute for Intelligent Systems, Max-Planck Institute;ETHZ - ETH Zurich;;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;Max Planck Institute for Intelligent Systems", "aff_domain": "is.mpg.de;ethz.ch;;uni-tuebingen.de;tuebingen.mpg.de", "position": "Postdoc;PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ngeist2024learning,\ntitle={Learning with 3D rotations, a hitchhiker's guide to {SO}(3)},\nauthor={Andreas Ren{\\'e} Geist and Jonas Frey and Mikel Zhobro and Anna Levina and Georg Martius},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L0VoOdjCUb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3141743, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1992145379102423981&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "is.mpg.de;ethz.ch;;uni-tuebingen.de;tuebingen.mpg.de", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Max-Planck-Institute for Intelligent Systems;ETH Zurich;Eberhard Karls University of T\u00fcbingen;Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Intelligent Systems;;;Intelligent Systems", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch;https://www.uni-tuebingen.de/;https://www.mpi-is.mpg.de", "aff_unique_abbr": "MPI-IS;ETHZ;Uni T\u00fcbingen;MPI-IS", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Germany;Switzerland" }, { "title": "Debiased Distribution Compression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34316", "id": "L1W9ZWPq9E", "proceeding": "https://proceedings.mlr.press/v235/li24r.html", "pdf": "https://openreview.net/pdf?id=L1W9ZWPq9E", "openreview": "https://openreview.net/forum?id=L1W9ZWPq9E", "author_site": "Lingxiao Li, Raaz Dwivedi, Lester Mackey", "tldr": "", "abstract": "Modern compression methods can summarize a target distribution $\\mathbb{P}$ more succinctly than i.i.d. sampling but require access to a low-bias input sequence like a Markov chain converging quickly to $\\mathbb{P}$. We introduce a new suite of compression methods suitable for compression with biased input sequences. 
Given $n$ points targeting the wrong distribution and quadratic time, Stein kernel thinning (SKT) returns $\\sqrt{n}$ equal-weighted points with $\\widetilde{O}(n^{-1/2})$ maximum mean discrepancy (MMD) to $\\mathbb{P}$. For larger-scale compression tasks, low-rank SKT achieves the same feat in sub-quadratic time using an adaptive low-rank debiasing procedure that may be of independent interest. For downstream tasks that support simplex or constant-preserving weights, Stein recombination and Stein Cholesky achieve even greater parsimony, matching the guarantees of SKT with as few as $\\text{poly-log}(n)$ weighted points. Underlying these advances are new guarantees for the quality of simplex-weighted coresets, the spectral decay of kernel matrices, and the covering numbers of Stein kernel Hilbert spaces. In our experiments, our techniques provide succinct and accurate posterior summaries while overcoming biases due to burn-in, approximate Markov chain Monte Carlo, and tempering.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lingxiao Li;Raaz Dwivedi;Lester Mackey", "authorids": "~Lingxiao_Li1;~Raaz_Dwivedi1;~Lester_Mackey1", "gender": "M;M;M", "homepage": "http://people.csail.mit.edu/lingxiao/;https://raazdwivedi.github.io/;https://stanford.edu/~lmackey", "dblp": ";180/9006;05/2961", "google_scholar": ";9ehX_58AAAAJ;erv7TP0AAAAJ", "orcid": ";;0000-0002-1102-0387", "linkedin": ";raaz-dwivedi;lester-mackey-5902909", "or_profile": "~Lingxiao_Li1;~Raaz_Dwivedi1;~Lester_Mackey1", "aff": "Massachusetts Institute of Technology;Cornell University;Microsoft Research New England", "aff_domain": "mit.edu;cornell.edu;microsoft.com", "position": "PhD student;Assistant Professor;Principal Researcher", "bibtex": "@inproceedings{\nli2024debiased,\ntitle={Debiased Distribution Compression},\nauthor={Lingxiao Li and Raaz Dwivedi and Lester Mackey},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L1W9ZWPq9E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1209656, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6019645921846462366&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mit.edu;cornell.edu;microsoft.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Cornell University;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://web.mit.edu;https://www.cornell.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-england", "aff_unique_abbr": "MIT;Cornell;MSR NE", "aff_campus_unique_index": "1", "aff_campus_unique": ";New England", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Compositional Capabilities of Autoregressive Transformers: A Study on Synthetic, Interpretable Tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34315", "id": "L1eJ3NKPCd", "proceeding": "https://proceedings.mlr.press/v235/ramesh24a.html", "pdf": "https://openreview.net/pdf?id=L1eJ3NKPCd", "openreview": "https://openreview.net/forum?id=L1eJ3NKPCd", "author_site": "Rahul Ramesh, Ekdeep Singh Lubana, Mikail Khona, Robert Dick, Hidenori Tanaka", "tldr": "", "abstract": "Transformers trained on huge text corpora exhibit a remarkable set of capabilities, e.g., performing simple logical operations. 
Given the inherent compositional nature of language, one can expect the model to learn to compose these capabilities, potentially yielding a combinatorial explosion of what operations it can perform on an input. Motivated by the above, we aim to assess in this paper \u201chow capable can a transformer become?\u201d Specifically, we train autoregressive Transformer models on a data-generating process that involves compositions of a set of well-defined monolithic capabilities. Through a series of extensive and systematic experiments on this data-generating process, we show that: (1) autoregressive Transformers can learn compositional structures from small amounts of training data and generalize to exponentially or even combinatorially many functions; (2) composing functions by generating intermediate outputs is more effective at generalizing to unseen compositions, compared to generating no intermediate outputs; (3) biases in the order of the compositions in the training data result in Transformers that fail to compose some combinations of functions; and (4) the attention layers seem to select the capability to apply while the feed-forward layers execute the capability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rahul Ramesh;Ekdeep Singh Lubana;Mikail Khona;Robert P. Dick;Hidenori Tanaka", "authorids": "~Rahul_Ramesh2;~Ekdeep_Singh_Lubana1;~Mikail_Khona2;~Robert_P._Dick1;~Hidenori_Tanaka1", "gender": "M;M;M;;M", "homepage": "https://cis.upenn.edu/~rahulram;https://ekdeepslubana.github.io/;http://robertdick.org/;https://sites.google.com/view/htanaka/home;", "dblp": "168/7029;228/2683;84/523.html;;", "google_scholar": "wCa6nygAAAAJ;https://scholar.google.co.in/citations?user=OP7S3vsAAAAJ;;f_pWOGIAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;mikail-khona-60656b141/", "or_profile": "~Rahul_Ramesh2;~Ekdeep_Singh_Lubana1;~Robert_P._Dick1;~Hidenori_Tanaka1;~mikail_khona1", "aff": "University of Pennsylvania;University of Michigan;University of Michigan;Physics & Informatics Lab, NTT Research, Inc.;", "aff_domain": "upenn.edu;umich.edu;umich.edu;ntt-research.com;", "position": "PhD student;PhD student;Full Professor;Senior Research Scientist;", "bibtex": "@inproceedings{\nramesh2024compositional,\ntitle={Compositional Capabilities of Autoregressive Transformers: A Study on Synthetic, Interpretable Tasks},\nauthor={Rahul Ramesh and Ekdeep Singh Lubana and Mikail Khona and Robert P.
Dick and Hidenori Tanaka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L1eJ3NKPCd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8618212, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4182666660493678681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "upenn.edu;umich.edu;umich.edu;ntt-research.com;", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University of Pennsylvania;University of Michigan;NTT Research, Inc.", "aff_unique_dep": ";;Physics & Informatics Lab", "aff_unique_url": "https://www.upenn.edu;https://www.umich.edu;https://www.ntt-research.com", "aff_unique_abbr": "UPenn;UM;NTT Research", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Removing Spurious Concepts from Neural Network Representations via Joint Subspace Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34314", "id": "L4ERlHrJRT", "proceeding": "https://proceedings.mlr.press/v235/holstege24a.html", "pdf": "https://openreview.net/pdf?id=L4ERlHrJRT", "openreview": "https://openreview.net/forum?id=L4ERlHrJRT", "author_site": "Floris Holstege, Bram Wouters, Noud van Giersbergen, Cees Diks", "tldr": "", "abstract": "An important challenge in the field of interpretable machine learning is to ensure that deep neural networks (DNNs) use the correct or desirable input features in performing their tasks. Concept-removal methods aim to do this by eliminating concepts that are spuriously correlated with the main task from the neural network representation of the data. However, existing methods tend to be overzealous by inadvertently removing part of the correct or desirable features as well, leading to wrong interpretations and hurting model performance. We propose an iterative algorithm that separates spurious from main-task concepts by jointly estimating two low-dimensional orthogonal subspaces of the neural network representation. 
By evaluating the algorithm on benchmark datasets from computer vision (Waterbirds, CelebA) and natural language processing (MultiNLI), we show it outperforms existing concept-removal methods in terms of identifying the main-task and spurious concepts, and removing only the latter.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Floris Holstege;Bram Wouters;Noud Van Giersbergen;Cees Diks", "authorids": "~Floris_Holstege1;~Bram_Wouters1;~Noud_Van_Giersbergen1;~Cees_Diks1", "gender": "M;;M;M", "homepage": "https://www.uva.nl/profiel/h/o/f.g.holstege/f.g.holstege.html;;https://www.uva.nl/profiel/g/i/n.p.a.vangiersbergen/n.p.a.vangiersbergen.html;https://www.uva.nl/en/profile/d/i/c.g.h.diks/c.g.h.diks.html", "dblp": ";;;", "google_scholar": ";https://scholar.google.nl/citations?user=_22XG4YAAAAJ;;kMBFQ64AAAAJ", "orcid": ";;0000-0002-1790-3105;", "linkedin": "fholstege/;;noud-van-giersbergen-26a5655;", "or_profile": "~Floris_Holstege1;~Bram_Wouters1;~Noud_Van_Giersbergen1;~Cees_Diks1", "aff": "University of Amsterdam;University of Amsterdam;University of Amsterdam;University of Amsterdam", "aff_domain": "uva.nl;uva.nl;uva.nl;uva.nl", "position": "PhD student;Lecturer;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nholstege2024removing,\ntitle={Removing Spurious Concepts from Neural Network Representations via Joint Subspace Estimation},\nauthor={Floris Holstege and Bram Wouters and Noud Van Giersbergen and Cees Diks},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L4ERlHrJRT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8993333, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=467918938177801081&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "uva.nl;uva.nl;uva.nl;uva.nl", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "LSEnet: Lorentz Structural Entropy Neural Network for Deep Graph Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34313", "id": "L6SRXG92s6", "proceeding": "https://proceedings.mlr.press/v235/sun24g.html", "pdf": "https://openreview.net/pdf?id=L6SRXG92s6", "openreview": "https://openreview.net/forum?id=L6SRXG92s6", "author_site": "Li Sun, Zhenhao Huang, Hao Peng, YuJie Wang, Chunyang Liu, Philip Yu", "tldr": "", "abstract": "Graph clustering is a fundamental problem in machine learning. Deep learning methods have achieved state-of-the-art results in recent years, but they still cannot work without predefined cluster numbers. This limitation motivates us to pose the more challenging problem of graph clustering with an unknown cluster number. We propose to address this problem from a fresh perspective of graph information theory (i.e., structural information). In the literature, structural information has not yet been introduced to deep clustering, and its classic definition is limited by its discrete formulation and its inability to model node features. In this work, we first formulate a differentiable structural information (DSI) in the continuous realm, accompanied by several theoretical results.
By minimizing DSI, we construct the optimal partitioning tree where densely connected nodes in the graph tend to have the same assignment, revealing the cluster structure. DSI is also theoretically presented as a new graph clustering objective, not requiring a pre-defined cluster number. Furthermore, we design a neural LSEnet in the Lorentz model of hyperbolic space, where we integrate node features into structural information via manifold-valued graph convolution. Extensive empirical results on real graphs show the superiority of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Li Sun;Zhenhao Huang;Hao Peng;Yujie Wang;Chunyang Liu;Philip S. Yu", "authorids": "~Li_Sun4;~Zhenhao_Huang1;~Hao_Peng7;~Yujie_Wang11;~Chunyang_Liu1;~Philip_S._Yu1", "gender": "M;M;M;;M;M", "homepage": ";;https://penghao-bdsc.github.io/;https://github.com/;;https://cs.uic.edu/profiles/philip-yu/", "dblp": "57/2405-8;;69/7742-1;;06/8730;y/PhilipSYu", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;R25rbyQAAAAJ;;nh5zdQ8AAAAJ;D0lL1r0AAAAJ", "orcid": "0000-0003-4562-2279;0009-0007-8944-0385;0000-0003-0458-5977;;;0000-0002-3491-5968", "linkedin": ";;;;;", "or_profile": "~Li_Sun4;~Zhenhao_Huang1;~Hao_Peng7;~Yujie_Wang11;~Chunyang_Liu1;~Philip_S._Yu1", "aff": "North China Electric Power University ;North China Electric Power University;Beihang University;NCEPU;;University of Illinois Chicago", "aff_domain": "ncepu.edu.cn;ncepubj.edu.cn;buaa.edu.cn;ncepu.edu;;uic.edu", "position": "Associate Professor;Undergrad student;Full Professor;Undergrad student;;Full Professor", "bibtex": "@inproceedings{\nsun2024lsenet,\ntitle={{LSE}net: Lorentz Structural Entropy Neural Network for Deep Graph Clustering},\nauthor={Li Sun and Zhenhao Huang and Hao Peng and Yujie Wang and Chunyang Liu and Philip S. Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L6SRXG92s6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3838622, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11537676105860886350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ncepu.edu.cn;ncepubj.edu.cn;buaa.edu.cn;ncepu.edu;;uic.edu", "author_num": 6, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "North China Electric Power University;Beihang University;University of Illinois at Chicago", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ncepu.edu.cn;http://www.buaa.edu.cn/;https://www.uic.edu", "aff_unique_abbr": "NCEPU;BUAA;UIC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Relaxed Quantile Regression: Prediction Intervals for Asymmetric Noise", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34312", "id": "L8nSGvoyvb", "proceeding": "https://proceedings.mlr.press/v235/pouplin24a.html", "pdf": "https://openreview.net/pdf?id=L8nSGvoyvb", "openreview": "https://openreview.net/forum?id=L8nSGvoyvb", "author_site": "Thomas Pouplin, Alan Jeffares, Nabeel Seedat, M van der Schaar", "tldr": "", "abstract": "Constructing valid prediction intervals rather than point estimates is a well-established approach for uncertainty quantification in the regression setting.
Models equipped with this capacity output an interval of values in which the ground truth target will fall with some prespecified probability. This is an essential requirement in many real-world applications where simple point predictions' inability to convey the magnitude and frequency of errors renders them insufficient for high-stakes decisions. Quantile regression is a leading approach for obtaining such intervals via the empirical estimation of quantiles in the (non-parametric) distribution of outputs. This method is simple, computationally inexpensive, interpretable, assumption-free, and effective. However, it does require that the specific quantiles being learned be chosen a priori. This results in (a) intervals that are arbitrarily symmetric around the median, which is sub-optimal for realistic skewed distributions, or (b) learning an excessive number of intervals. In this work, we propose Relaxed Quantile Regression (RQR), a direct alternative to quantile-regression-based interval construction that removes this arbitrary constraint whilst maintaining its strengths. We demonstrate that this added flexibility results in intervals with an improvement in desirable qualities (e.g. mean width) whilst retaining the essential coverage guarantees of quantile regression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Pouplin;Alan Jeffares;Nabeel Seedat;Mihaela van der Schaar", "authorids": "~Thomas_Pouplin1;~Alan_Jeffares1;~Nabeel_Seedat1;~Mihaela_van_der_Schaar2", "gender": "M;;;F", "homepage": ";https://alanjeffares.com;;https://www.vanderschaar-lab.com", "dblp": "339/7726;304/1985;227/8368;", "google_scholar": "VVCoRhgAAAAJ;e65kJ08AAAAJ;https://scholar.google.com/citations?hl=en;DZ3S--MAAAAJ", "orcid": ";;;", "linkedin": ";alanjeffares;nabeel-seedat/;", "or_profile": "~Thomas_Pouplin1;~Alan_Jeffares1;~Nabeel_Seedat1;~Mihaela_van_der_Schaar2", "aff": "University of Cambridge;University of Cambridge;AstraZeneca;University of California, Los Angeles", "aff_domain": "cam.ac.uk;cam.ac.uk;astrazeneca.com;ucla.edu", "position": "PhD student;PhD student;Intern;Full Professor", "bibtex": "@inproceedings{\npouplin2024relaxed,\ntitle={Relaxed Quantile Regression: Prediction Intervals for Asymmetric Noise},\nauthor={Thomas Pouplin and Alan Jeffares and Nabeel Seedat and Mihaela van der Schaar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L8nSGvoyvb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 879955, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6702785567998436126&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "cam.ac.uk;cam.ac.uk;astrazeneca.com;ucla.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Cambridge;AstraZeneca;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;https://www.astrazeneca.com;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;AZ;UCLA", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Cambridge;;Los Angeles", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Language Generation with Strictly Proper Scoring Rules", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34311", "id": "LALSZ88Xpx", "proceeding":
"https://proceedings.mlr.press/v235/shao24c.html", "pdf": "https://openreview.net/pdf?id=LALSZ88Xpx", "openreview": "https://openreview.net/forum?id=LALSZ88Xpx", "author_site": "Chenze Shao, Fandong Meng, Yijin Liu, Jie Zhou", "tldr": "", "abstract": "Language generation based on maximum likelihood estimation (MLE) has become the fundamental approach for text generation. Maximum likelihood estimation is typically performed by minimizing the log-likelihood loss, also known as the logarithmic score in statistical decision theory. The logarithmic score is strictly proper in the sense that it encourages honest forecasts, where the expected score is maximized only when the model reports true probabilities. Although many strictly proper scoring rules exist, the logarithmic score is the only local scoring rule among them that depends exclusively on the probability of the observed sample, making it capable of handling the exponentially large sample space of natural text. In this work, we propose a straightforward strategy for adapting scoring rules to language generation, allowing for language modeling with any non-local scoring rules. Leveraging this strategy, we train language generation models using two classic strictly proper scoring rules, the Brier score and the Spherical score, as alternatives to the logarithmic score. Experimental results indicate that simply substituting the loss function, without adjusting other hyperparameters, can yield substantial improvements in model's generation capabilities. Moreover, these improvements can scale up to large language models (LLMs) such as LLaMA-7B and LLaMA-13B. Source code: https://github.com/shaochenze/ScoringRulesLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenze Shao;Fandong Meng;Yijin Liu;Jie Zhou", "authorids": "~Chenze_Shao1;~Fandong_Meng3;~Yijin_Liu1;~Jie_Zhou8", "gender": "M;M;M;M", "homepage": ";http://fandongmeng.github.io/;;", "dblp": "227/3123;117/4056.html;242/7766;00/5012-16", "google_scholar": "LH_rZf8AAAAJ;sA8U4S0AAAAJ;3fDdU9AAAAAJ;https://scholar.google.com.hk/citations?user=OijxQCMAAAAJ", "orcid": ";0000-0002-8158-2377;;0000-0002-5899-5165", "linkedin": ";;;", "or_profile": "~Chenze_Shao1;~Fandong_Meng3;~Yijin_Liu1;~Jie_Zhou8", "aff": "Tencent Inc;WeChat AI, Tencent Inc.;Wechat AI;WeChat AI, Tencent Inc.", "aff_domain": "tencent.com;tencent.com;tencent.com;tencent.com", "position": "Researcher;Principal Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nshao2024language,\ntitle={Language Generation with Strictly Proper Scoring Rules},\nauthor={Chenze Shao and Fandong Meng and Yijin Liu and Jie Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LALSZ88Xpx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 602295, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15259049490793157052&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tencent.com;tencent.com;tencent.com;tencent.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tencent;WeChat", "aff_unique_dep": "Tencent;WeChat AI", "aff_unique_url": "https://www.tencent.com;https://www.wechat.com", "aff_unique_abbr": "Tencent;WeChat AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "CaM: Cache Merging for 
Memory-efficient LLMs Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34310", "id": "LCTmppB165", "proceeding": "https://proceedings.mlr.press/v235/zhang24n.html", "pdf": "https://openreview.net/pdf?id=LCTmppB165", "openreview": "https://openreview.net/forum?id=LCTmppB165", "author_site": "Yuxin Zhang, Yuxuan Du, Gen Luo, Yunshan Zhong, Zhenyu Zhang, Shiwei Liu, Rongrong Ji", "tldr": "", "abstract": "Despite the exceptional performance of Large Language Models (LLMs), the substantial volume of key-value (KV) pairs cached during inference presents a barrier to their efficient deployment. To ameliorate this, recent works have aimed to selectively eliminate these caches, informed by the attention scores of associated tokens. However, such cache eviction invariably leads to output perturbation, regardless of the token choice. This perturbation escalates with the compression ratio, which can precipitate a marked deterioration in LLM inference performance. This paper introduces Cache Merging (CaM) as a solution to mitigate this challenge. CaM adaptively merges to-be-evicted caches into the remaining ones, employing a novel sampling strategy governed by the prominence of attention scores within discarded locations. In this manner, CaM enables memory-efficient LLMs to preserve critical token information, even obviating the need to maintain their corresponding caches. Extensive experiments utilizing LLaMA, OPT, and GPT-NeoX across various benchmarks corroborate CaM's proficiency in bolstering the performance of memory-efficient LLMs. Code is released at https://github.com/zyxxmu/cam.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxin Zhang;Yuxuan Du;Gen Luo;Yunshan Zhong;Zhenyu Zhang;Shiwei Liu;Rongrong Ji", "authorids": "~Yuxin_Zhang3;~Yuxuan_Du3;~Gen_Luo1;~Yunshan_Zhong1;~Zhenyu_Zhang4;~Shiwei_Liu2;~Rongrong_Ji5", "gender": ";M;M;M;M;M;M", "homepage": ";https://github.com/duyuxuan1486;;https://zhenyu.gallery;https://shiweiliuiiiiiii.github.io/;http://mac.xmu.edu.cn/rrji-en.html;https://github.com/luogen1996", "dblp": "03/7346-2;;239/4066;01/1844-15;234/8697-3.html;86/5681;195/2078", "google_scholar": "6IeJLJoAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;ZLyJRxoAAAAJ;73IbXtsAAAAJ;;EyZqU9gAAAAJ", "orcid": "0000-0002-4409-7030;;;;;;", "linkedin": ";;;zhenyu-allen-zhang-a9b1391a3/;;;", "or_profile": "~Yuxin_Zhang3;~Yuxuan_Du3;~Yunshan_Zhong1;~Zhenyu_Zhang4;~Shiwei_Liu2;~Rongrong_Ji5;~Gen_Luogen1", "aff": "Xiamen University;Xiamen University;Xiamen University;University of Texas at Austin;University of Oxford;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;xmu.edu;xmu.edu.cn;utexas.edu;ox.ac.uk;xmu.edu.cn;xmu.edu.cn", "position": "PhD student;Undergrad student;PhD student;PhD student;Postdoc;Full Professor;PhD student", "bibtex": "@inproceedings{\nzhang2024cam,\ntitle={CaM: Cache Merging for Memory-efficient {LLM}s Inference},\nauthor={Yuxin Zhang and Yuxuan Du and Gen Luo and Yunshan Zhong and Zhenyu Zhang and Shiwei Liu and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LCTmppB165}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1091252, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14238357041916479276&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": 
"xmu.edu.cn;xmu.edu;xmu.edu.cn;utexas.edu;ox.ac.uk;xmu.edu.cn;xmu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;1;2;0;0", "aff_unique_norm": "Xiamen University;University of Texas at Austin;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xmu.edu.cn;https://www.utexas.edu;https://www.ox.ac.uk", "aff_unique_abbr": "XMU;UT Austin;Oxford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;1;2;0;0", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "Copyright Traps for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34309", "id": "LDq1JPdc55", "proceeding": "https://proceedings.mlr.press/v235/meeus24a.html", "pdf": "https://openreview.net/pdf?id=LDq1JPdc55", "openreview": "https://openreview.net/forum?id=LDq1JPdc55", "author_site": "Matthieu Meeus, Igor Shilov, Manuel Faysse, Yves-Alexandre de Montjoye", "tldr": "", "abstract": "Questions of fair use of copyright-protected content to train Large Language Models (LLMs) are being actively debated. Document-level inference has been proposed as a new task: inferring from black-box access to the trained model whether a piece of content has been seen during training. SOTA methods however rely on naturally occurring memorization of (part of) the content. While very effective against models that memorize significantly, we hypothesize - and later confirm - that they will not work against models that do not naturally memorize, e.g. medium-size 1B models. We here propose to use copyright traps, the inclusion of fictitious entries in original content, to detect the use of copyrighted materials in LLMs with a focus on models where memorization does not naturally occur. We carefully design a randomized controlled experimental setup, inserting traps into original content (books) and train a 1.3B LLM from scratch. We first validate that the use of content in our target model would be undetectable using existing methods. We then show, contrary to intuition, that even medium-length trap sentences repeated a significant number of times (100) are not detectable using existing methods. However, we show that longer sequences repeated a large number of times can be reliably detected (AUC=0.75) and used as copyright traps. 
Beyond copyright applications, our findings contribute to the study of LLM memorization: the randomized controlled setup enables us to draw causal relationships between memorization and certain sequence properties such as repetition in model training data and perplexity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthieu Meeus;Igor Shilov;Manuel Faysse;Yves-Alexandre de Montjoye", "authorids": "~Matthieu_Meeus1;~Igor_Shilov1;~Manuel_Faysse1;~Yves-Alexandre_de_Montjoye1", "gender": "M;M;M;", "homepage": ";https://igorshilov.com/;https://manuelfay.github.io/;", "dblp": ";;359/3589;", "google_scholar": "QaEzyhEAAAAJ;wkAVIooAAAAJ;ew4xsR4AAAAJ;", "orcid": ";;;", "linkedin": "matthieu-meeus-217316141/;igor-shilov-9988b0a0/;manuel-faysse/;", "or_profile": "~Matthieu_Meeus1;~Igor_Shilov1;~Manuel_Faysse1;~Yves-Alexandre_de_Montjoye1", "aff": "Imperial College London;Imperial College London;CentraleSupelec;", "aff_domain": "imperial.ac.uk;imperial.ac.uk;centralesupelec.fr;", "position": "PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\nmeeus2024copyright,\ntitle={Copyright Traps for Large Language Models},\nauthor={Matthieu Meeus and Igor Shilov and Manuel Faysse and Yves-Alexandre de Montjoye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LDq1JPdc55}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 391371, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14984186734570581450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "imperial.ac.uk;imperial.ac.uk;centralesupelec.fr;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Imperial College London;CentraleSup\u00e9lec", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://www.centralesupelec.fr", "aff_unique_abbr": "ICL;CS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;France" }, { "title": "On Statistical Learning Theory for Distributional Inputs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34308", "id": "LGDYsBslWi", "proceeding": "https://proceedings.mlr.press/v235/fiedler24a.html", "pdf": "https://openreview.net/pdf?id=LGDYsBslWi", "openreview": "https://openreview.net/forum?id=LGDYsBslWi", "author_site": "Christian Fiedler, Pierre-Fran\u00e7ois Massiani, Friedrich Solowjow, Sebastian Trimpe", "tldr": "", "abstract": "Kernel-based statistical learning on distributional inputs appears in many relevant applications, from medical diagnostics to causal inference, and poses intriguing theoretical questions. While this learning scenario received considerable attention from the machine learning community recently, many gaps in the theory remain. In particular, most works consider only the distributional regression setting, and focus on the regularized least-squares algorithm for this problem. In this work, we start to fill these gaps. We prove two oracle inequalities for kernel machines in general distributional learning scenarios, as well as a generalization result based on algorithmic stability. Our main results are formulated in great generality, utilizing general Hilbertian embeddings, which makes them applicable to a wide array of approaches to distributional learning. 
Additionally, we specialize our results to the cases of kernel mean embeddings and of the recently introduced Hilbertian embeddings based on sliced Wasserstein distances, providing concrete instances of the general setup. Our results considerably enlarge the scope of theoretically grounded distributional learning, and provide many interesting avenues for future work.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christian Fiedler;Pierre-Fran\u00e7ois Massiani;Friedrich Solowjow;Sebastian Trimpe", "authorids": "~Christian_Fiedler1;~Pierre-Fran\u00e7ois_Massiani2;~Friedrich_Solowjow1;~Sebastian_Trimpe1", "gender": ";M;;M", "homepage": "https://www.dsme.rwth-aachen.de/cms/DSME/Das-Institut/Team-CMS-Artikel-/~jptsq/Christian-Fiedler/;;https://www.dsme.rwth-aachen.de/cms/DSME/Das-Institut/Team-CMS-Artikel-/~jptyz/Friedrich-Solowjow/;https://www.dsme.rwth-aachen.de/trimpe", "dblp": "257/5782;;217/1553;15/8135", "google_scholar": "93Qt_hgAAAAJ;ax9cEIQAAAAJ;https://scholar.google.de/citations?user=gq_ESzoAAAAJ;https://scholar.google.de/citations?user=9kzHZssAAAAJ", "orcid": ";0000-0002-8019-4401;;0000-0002-2785-2487", "linkedin": ";;;sebastian-trimpe-2472a0a3/", "or_profile": "~Christian_Fiedler1;~Pierre-Fran\u00e7ois_Massiani2;~Friedrich_Solowjow1;~Sebastian_Trimpe1", "aff": "Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;RWTH Aachen University", "aff_domain": "rwth-aachen.de;rwth-aachen.de;rwth-aachen.de;rwth-aachen.de", "position": "PhD student;PhD student;Lecturer;Full Professor", "bibtex": "@inproceedings{\nfiedler2024on,\ntitle={On Statistical Learning Theory for Distributional Inputs},\nauthor={Christian Fiedler and Pierre-Fran{\\c{c}}ois Massiani and Friedrich Solowjow and Sebastian Trimpe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LGDYsBslWi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 434304, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_gfg79yFNS8J:scholar.google.com/&scioq=On+Statistical+Learning+Theory+for+Distributional+Inputs&hl=en&as_sdt=0,14", "gs_version_total": 4, "email": "rwth-aachen.de;rwth-aachen.de;rwth-aachen.de;rwth-aachen.de", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "RWTH Aachen University", "aff_unique_dep": "", "aff_unique_url": "https://www.rwth-aachen.de", "aff_unique_abbr": "RWTH", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Aachen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Switchable Decision: Dynamic Neural Generation Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34307", "id": "LGhtl9ktop", "proceeding": "https://proceedings.mlr.press/v235/zhang24bj.html", "pdf": "https://openreview.net/pdf?id=LGhtl9ktop", "openreview": "https://openreview.net/forum?id=LGhtl9ktop", "author_site": "Shujian Zhang, Korawat Tanwisuth, Chengyue Gong, Pengcheng He, Mingyuan Zhou", "tldr": "", "abstract": "Auto-regressive generation models achieve competitive performance across many different NLP tasks such as summarization, question answering, and classifications. However, they are also known for being slow in inference, which makes them challenging to deploy in real-time applications. 
We propose a switchable decision to accelerate inference by dynamically assigning computation resources to each data instance. Automatically making decisions on where to skip and how to balance quality and computation cost with constrained optimization, our dynamic neural generation networks enforce an efficient inference path and determine an optimized trade-off. Experiments across question answering, summarization, and classification benchmarks show that our method incurs less computation cost during inference while keeping the same accuracy. Extensive experiments and ablation studies demonstrate that our method can be general, effective, and beneficial for many NLP tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shujian Zhang;Korawat Tanwisuth;Chengyue Gong;Pengcheng He;Mingyuan Zhou", "authorids": "~Shujian_Zhang1;~Korawat_Tanwisuth1;~Chengyue_Gong1;~Pengcheng_He2;~Mingyuan_Zhou1", "gender": ";M;M;M;M", "homepage": "https://www.utexas.edu/;;;;http://mingyuanzhou.github.io", "dblp": "84/3190.html;;209/4862;116/8665;", "google_scholar": "7RmLVQkAAAAJ;;AscakBgAAAAJ;https://scholar.google.com/citations?hl=en;LXwCIisAAAAJ", "orcid": ";0009-0003-5875-5414;;;", "linkedin": ";korawat-tanwisuth-238401a7/;;;", "or_profile": "~Shujian_Zhang1;~Korawat_Tanwisuth1;~Chengyue_Gong1;~Pengcheng_He2;~Mingyuan_Zhou1", "aff": "University of Texas, Austin;University of Texas, Austin;University of Texas at Austin;Microsoft;Google", "aff_domain": "utexas.edu;utexas.edu;cs.utexas.edu;microsoft.com;google.com", "position": "PhD student;PhD student;grad student;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nzhang2024switchable,\ntitle={Switchable Decision: Dynamic Neural Generation Networks},\nauthor={Shujian Zhang and Korawat Tanwisuth and Chengyue Gong and Pengcheng He and Mingyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LGhtl9ktop}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 470956, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3-8taHoLXAMJ:scholar.google.com/&scioq=Switchable+Decision:+Dynamic+Neural+Generation+Networks&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "utexas.edu;utexas.edu;cs.utexas.edu;microsoft.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Texas at Austin;Microsoft;Google", "aff_unique_dep": ";Microsoft Corporation;Google", "aff_unique_url": "https://www.utexas.edu;https://www.microsoft.com;https://www.google.com", "aff_unique_abbr": "UT Austin;Microsoft;Google", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Austin;;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Hierarchical Adaptive Multi-Task Reinforcement Learning Framework for Multiplier Circuit Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34306", "id": "LGz7GaUSEB", "proceeding": "https://proceedings.mlr.press/v235/wang24bz.html", "pdf": "https://openreview.net/pdf?id=LGz7GaUSEB", "openreview": "https://openreview.net/forum?id=LGz7GaUSEB", "author_site": "Zhihai Wang, Jie Wang, Dongsheng Zuo, Ji Yunjie, Xilin Xia, Yuzhe Ma, Jianye Hao, Mingxuan Yuan, Yongdong Zhang, Feng Wu", "tldr": "", "abstract": "Multiplier design---which aims to explore a large combinatorial design space to
simultaneously optimize multiple conflicting objectives---is a fundamental problem in the integrated circuits industry. Although traditional approaches tackle the multi-objective multiplier optimization problem by manually designed heuristics, reinforcement learning (RL) offers a promising approach to discover high-speed and area-efficient multipliers. However, the existing RL-based methods struggle to find Pareto-optimal circuit designs for all possible preferences, i.e., weights over objectives, in a sample-efficient manner. To address this challenge, we propose a novel hierarchical adaptive (HAVE) multi-task reinforcement learning framework. The hierarchical framework consists of a meta-agent to generate diverse multiplier preferences, and an adaptive multi-task agent to collaboratively optimize multipliers conditioned on the dynamic preferences given by the meta-agent. To the best of our knowledge, HAVE is the first to well approximate Pareto-optimal circuit designs for the entire preference space with high sample efficiency. Experiments on multipliers across a wide range of input widths demonstrate that HAVE significantly Pareto-dominates state-of-the-art approaches, achieving up to 28% larger hypervolume. Moreover, experiments demonstrate that multipliers designed by HAVE can well generalize to large-scale computation-intensive circuits.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhihai Wang;Jie Wang;Dongsheng Zuo;Ji Yunjie;Xilin Xia;Yuzhe Ma;Jianye HAO;Mingxuan Yuan;Yongdong Zhang;Feng Wu", "authorids": "~Zhihai_Wang1;~Jie_Wang1;~Dongsheng_Zuo1;~Ji_Yunjie1;~Xilin_Xia1;~Yuzhe_Ma2;~Jianye_HAO1;~Mingxuan_Yuan1;~Yongdong_Zhang2;~Feng_Wu1", "gender": "M;M;M;M;M;M;M;M;M;M", "homepage": "https://miralab.ai/people/zhihai-wang/;http://staff.ustc.edu.cn/~jwangx;;https://github.com/Jedges?tab=repositories;https://github.com/dakfjalka;https://yuzhe-ma.com;http://www.icdai.org/jianye.html;;https://imcc.ustc.edu.cn/_upload/tpl/0d/13/3347/template3347/zhangyongdong.html;", "dblp": "35/4357;29/5259-5;;;;172/4863;21/7664.html;74/2356;z/YongdongZhang;25/3972-1", "google_scholar": "EdLIBG8AAAAJ;OugG4dUAAAAJ;https://scholar.google.com/citations?hl=en;;;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=hxGs4ukAAAAJ;5bInRDEAAAAJ", "orcid": ";;;;;;0000-0002-0422-8235;0000-0002-2236-8784;0000-0003-0066-3448;", "linkedin": ";;;;;;;;;", "or_profile": "~Zhihai_Wang1;~Jie_Wang1;~Dongsheng_Zuo1;~Ji_Yunjie1;~Xilin_Xia1;~Yuzhe_Ma2;~Jianye_HAO1;~Mingxuan_Yuan1;~Yongdong_Zhang2;~Feng_Wu1", "aff": "University of Science and Technology of China;University of Science and Technology of China;The Hong Kong University of Science and Technology (Guangzhou);Nanjing University;University of Science and Technology of China;The Hong Kong University of Science and Technology (Guangzhou);Tianjin University;Huawei Technologies Ltd.;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;hkust-gz.edu.cn;smail.nju.edu.cn;mail.ustc.edu.cn;hkust-gz.edu.cn;tju.edu.cn;huawei.com;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;Full Professor;PhD student;Undergrad student;Undergrad student;Assistant Professor;Associate Professor;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024a,\ntitle={A Hierarchical Adaptive Multi-Task Reinforcement Learning Framework for Multiplier Circuit Design},\nauthor={Zhihai Wang and Jie Wang and Dongsheng Zuo and Ji Yunjie and 
Xilin Xia and Yuzhe Ma and Jianye HAO and Mingxuan Yuan and Yongdong Zhang and Feng Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LGz7GaUSEB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1583477, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8756591860382985845&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ustc.edu.cn;ustc.edu.cn;hkust-gz.edu.cn;smail.nju.edu.cn;mail.ustc.edu.cn;hkust-gz.edu.cn;tju.edu.cn;huawei.com;ustc.edu.cn;ustc.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;2;0;1;3;4;0;0", "aff_unique_norm": "University of Science and Technology of China;Hong Kong University of Science and Technology;Nanjing University;Tianjin University;Huawei", "aff_unique_dep": ";;;;Huawei Technologies", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ust.hk;https://www.nju.edu.cn;http://www.tju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "USTC;HKUST;Nanjing U;TJU;Huawei", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "GLoRe: When, Where, and How to Improve LLM Reasoning via Global and Local Refinements", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34305", "id": "LH6R06NxdB", "proceeding": "https://proceedings.mlr.press/v235/havrilla24a.html", "pdf": "https://openreview.net/pdf?id=LH6R06NxdB", "openreview": "https://openreview.net/forum?id=LH6R06NxdB", "author_site": "Alexander Havrilla, Sharath Chandra Raparthy, Christoforos Nalmpantis, Jane Dwivedi-Yu, Maksym Zhuravinskyi, Eric Hambro, Roberta Raileanu", "tldr": "", "abstract": "State-of-the-art language models can exhibit reasoning refinement capabilities on math, science or coding tasks. However, recent work demonstrates that even the best models struggle to identify *when and where to refine* without access to external feedback. In this paper, we propose Stepwise ORMs (**SORMs**) which are trained, only on synthetic data, to approximate the expected future reward of the optimal policy or $V^{\\star}$ as a form of Process-based reward modeling. Our experiments show that SORMs can more accurately detect incorrect reasoning steps compared to ORMs, thus enabling them to give precise step-level feedback to refinement models. We then train *global* refinement models, which take only the question and a draft solution as input and predict a corrected solution, and *local* refinement models which also take as input a critique indicating the location of the first reasoning error. We generate training data for both models synthetically by reusing data used to train the SORM. We find combining global and local refinements, using the ORM as a reranker, significantly outperforms either one individually, as well as a best of three sample baseline. 
With this strategy we can improve the accuracy of a LLaMA-2 13B model (already fine-tuned with RL) on GSM8K from 53% to 65% when greedily sampled.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Havrilla;Sharath Chandra Raparthy;Christoforos Nalmpantis;Jane Dwivedi-Yu;Maksym Zhuravinskyi;Eric Hambro;Roberta Raileanu", "authorids": "~Alexander_Havrilla2;~Sharath_Chandra_Raparthy3;~Christoforos_Nalmpantis1;~Jane_Dwivedi-Yu1;~Maksym_Zhuravinskyi1;~Eric_Hambro1;~Roberta_Raileanu2", "gender": "M;M;;M;M;F;F", "homepage": "https://dahoas.github.io/;https://sharathraparthy.github.io/;;https://morphed.space;https://erichambro.com/;https://janedwivedi.github.io/;https://rraileanu.github.io/", "dblp": ";302/4190;222/6212;362/8201;290/1986;215/3352;215/5579", "google_scholar": ";https://scholar.google.ca/citations?user=S1R0_UMAAAAJ;https://scholar.google.co.uk/citations?user=1Z4PmxIAAAAJ;BLXPkDEAAAAJ;ehquBPIAAAAJ;ev8Ilx0AAAAJ;9hVXpJ0AAAAJ", "orcid": ";;0000-0002-7398-5862;;;;", "linkedin": ";;christoforos-nalmpantis/;;eric-hambro;janeaisleyyu/;roberta-raileanu-44b25660/", "or_profile": "~Alexander_Havrilla2;~Sharath_Chandra_Raparthy3;~Christoforos_Nalmpantis1;~Maksym_Zhuravinskyi1;~Eric_Hambro1;~Jane_Yu1;~Roberta_Raileanu1", "aff": "Georgia Institute of Technology;Meta Facebook;cfdx;Stability AI;Anthropic;Meta AI ;Meta Facebook", "aff_domain": "gatech.edu;fb.com;cfdx.io;stability.ai;anthropic.com;meta.com;fb.com", "position": "PhD student;Researcher;Researcher;Researcher;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nhavrilla2024glore,\ntitle={{GL}oRe: When, Where, and How to Improve {LLM} Reasoning via Global and Local Refinements},\nauthor={Alexander Havrilla and Sharath Chandra Raparthy and Christoforos Nalmpantis and Jane Dwivedi-Yu and Maksym Zhuravinskyi and Eric Hambro and Roberta Raileanu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LH6R06NxdB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 952979, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4633035866943617336&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "gatech.edu;fb.com;cfdx.io;stability.ai;anthropic.com;meta.com;fb.com", "author_num": 7, "aff_unique_index": "0;1;2;3;4;1;1", "aff_unique_norm": "Georgia Institute of Technology;Meta;cfdx;Stability AI;Anthropic", "aff_unique_dep": ";Meta Platforms, Inc.;;;", "aff_unique_url": "https://www.gatech.edu;https://meta.com;;https://stability.ai;https://www.anthropic.com", "aff_unique_abbr": "Georgia Tech;Meta;;Stability AI;Anthropic", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "EfficientZero V2: Mastering Discrete and Continuous Control with Limited Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34304", "id": "LHGMXcr6zx", "proceeding": "https://proceedings.mlr.press/v235/wang24at.html", "pdf": "https://openreview.net/pdf?id=LHGMXcr6zx", "openreview": "https://openreview.net/forum?id=LHGMXcr6zx", "author_site": "Shengjie Wang, Shaohuai Liu, Weirui Ye, Jiacheng You, Yang Gao", "tldr": "", "abstract": "Sample efficiency remains a crucial challenge in applying Reinforcement Learning (RL) to real-world tasks. 
While recent algorithms have made significant strides in improving sample efficiency, none have achieved consistently superior performance across diverse domains. In this paper, we introduce EfficientZero V2, a general framework designed for sample-efficient RL algorithms. We have expanded the performance of EfficientZero to multiple domains, encompassing both continuous and discrete actions, as well as visual and low-dimensional inputs. With a series of improvements we propose, EfficientZero V2 outperforms the current state-of-the-art (SoTA) by a significant margin in diverse tasks under the limited data setting. EfficientZero V2 exhibits a notable advancement over the prevailing general algorithm, DreamerV3, achieving superior outcomes in 50 of 66 evaluated tasks across multiple benchmarks, including Atari 100k, Proprio Control, and Vision Control.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengjie Wang;Shaohuai Liu;Weirui Ye;Jiacheng You;Yang Gao", "authorids": "~Shengjie_Wang2;~Shaohuai_Liu1;~Weirui_Ye1;~Jiacheng_You1;~Yang_Gao1", "gender": "M;M;M;M;M", "homepage": "https://shengjiewang-jason.github.io/;https://liushaohuai5.github.io;https://yewr.github.io/;https://github.com/YouJiacheng;http://yang-gao.weebly.com", "dblp": ";https://dblp.org/rec/conf/nips/YeLKAG21;245/3595;;89/4402-29", "google_scholar": ";https://scholar.google.com/citations?hl=en;_GgST9AAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;yang-gao-45245348/", "or_profile": "~Shengjie_Wang2;~Shaohuai_Liu1;~Weirui_Ye1;~Jiacheng_You1;~Yang_Gao1", "aff": "Tsinghua University;Texas A&M University - College Station;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tamu.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024efficientzero,\ntitle={EfficientZero V2: Mastering Discrete and Continuous Control with Limited Data},\nauthor={Shengjie Wang and Shaohuai Liu and Weirui Ye and Jiacheng You and Yang Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LHGMXcr6zx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3338191, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4633929382005936156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mails.tsinghua.edu.cn;tamu.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Tsinghua University;Texas A&M University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.tamu.edu", "aff_unique_abbr": "THU;TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Federated Offline Reinforcement Learning: Collaborative Single-Policy Coverage Suffices", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34303", "id": "LIPGadocTe", "proceeding": "https://proceedings.mlr.press/v235/woo24b.html", "pdf": "https://openreview.net/pdf?id=LIPGadocTe", "openreview": "https://openreview.net/forum?id=LIPGadocTe", "author_site": "Jiin Woo, Laixi Shi, Gauri Joshi, Yuejie Chi", "tldr": "", "abstract": "Offline 
reinforcement learning (RL), which seeks to learn an optimal policy using offline data, has garnered significant interest due to its potential in critical applications where online data collection is infeasible or expensive. This work explores the benefit of federated learning for offline RL, aiming at collaboratively leveraging offline datasets at multiple agents. Focusing on finite-horizon episodic tabular Markov decision processes (MDPs), we design FedLCB-Q, a variant of the popular model-free Q-learning algorithm tailored for federated offline RL. FedLCB-Q updates local Q-functions at agents with novel learning rate schedules and aggregates them at a central server using importance averaging and a carefully designed pessimistic penalty term. Our sample complexity analysis reveals that, with appropriately chosen parameters and synchronization schedules, FedLCB-Q achieves linear speedup in terms of the number of agents without requiring high-quality datasets at individual agents, as long as the local datasets collectively cover the state-action space visited by the optimal policy, highlighting the power of collaboration in the federated setting. In fact, the sample complexity almost matches that of the single-agent counterpart, as if all the data are stored at a central location, up to polynomial factors of the horizon length. Furthermore, FedLCB-Q is communication-efficient, where the number of communication rounds is only linear with respect to the horizon length up to logarithmic factors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiin Woo;Laixi Shi;Gauri Joshi;Yuejie Chi", "authorids": "~Jiin_Woo1;~Laixi_Shi1;~Gauri_Joshi1;~Yuejie_Chi1", "gender": ";F;;", "homepage": "https://jiinw.github.io/;https://laixishi.github.io/;;", "dblp": ";211/7965;;", "google_scholar": "fwDL_gMAAAAJ;V8RkRr8AAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jiin_Woo1;~Laixi_Shi1;~Gauri_Joshi1;~Yuejie_Chi1", "aff": "Carnegie Mellon University;California Institute of Technology;;", "aff_domain": "andrew.cmu.edu;caltech.edu;;", "position": "PhD student;Postdoc;;", "bibtex": "@inproceedings{\nwoo2024federated,\ntitle={Federated Offline Reinforcement Learning: Collaborative Single-Policy Coverage Suffices},\nauthor={Jiin Woo and Laixi Shi and Gauri Joshi and Yuejie Chi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LIPGadocTe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 908624, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15799507206131643379&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "andrew.cmu.edu;caltech.edu;;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.caltech.edu", "aff_unique_abbr": "CMU;Caltech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Federated Representation Learning in the Under-Parameterized Regime", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34302", "id": "LIQYhV45D4", "proceeding": "https://proceedings.mlr.press/v235/liu24ba.html", "pdf": "https://openreview.net/pdf?id=LIQYhV45D4", "openreview": 
"https://openreview.net/forum?id=LIQYhV45D4", "author_site": "Renpu Liu, Cong Shen, Jing Yang", "tldr": "", "abstract": "Federated representation learning (FRL) is a popular personalized federated learning (FL) framework where clients work together to train a common representation while retaining their personalized heads. Existing studies, however, largely focus on the over-parameterized regime. In this paper, we make the initial efforts to investigate FRL in the under-parameterized regime, where the FL model is insufficient to express the variations in all ground-truth models. We propose a novel FRL algorithm FLUTE, and theoretically characterize its sample complexity and convergence rate for linear models in the under-parameterized regime. To the best of our knowledge, this is the first FRL algorithm with provable performance guarantees in this regime. FLUTE features a data-independent random initialization and a carefully designed objective function that aids the distillation of subspace spanned by the global optimal representation from the misaligned local representations. On the technical side, we bridge low-rank matrix approximation techniques with the FL analysis, which may be of broad interest. We also extend FLUTE beyond linear representations. Experimental results demonstrate that FLUTE outperforms state-of-the-art FRL solutions in both synthetic and real-world tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Renpu Liu;Cong Shen;Jing Yang", "authorids": "~Renpu_Liu1;~Cong_Shen1;~Jing_Yang3", "gender": ";M;", "homepage": ";https://cshen317.github.io/;http://www.ee.psu.edu/yang", "dblp": ";79/6027-1.html;", "google_scholar": "RB_fv-kAAAAJ;70LBhKcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-3148-4453;", "linkedin": ";cong-shen-3372404/;", "or_profile": "~Renpu_Liu1;~Cong_Shen1;~Jing_Yang3", "aff": "Pennsylvania State University;University of Virginia;Pennsylvania State University", "aff_domain": "psu.edu;virginia.edu;psu.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024federated,\ntitle={Federated Representation Learning in the Under-Parameterized Regime},\nauthor={Renpu Liu and Cong Shen and Jing Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LIQYhV45D4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2818647, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1307579731251865311&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "psu.edu;virginia.edu;psu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Pennsylvania State University;University of Virginia", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.virginia.edu", "aff_unique_abbr": "PSU;UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Collaborative Heterogeneous Causal Inference Beyond Meta-analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34301", "id": "LJ34pX1U5g", "proceeding": "https://proceedings.mlr.press/v235/guo24c.html", "pdf": "https://openreview.net/pdf?id=LJ34pX1U5g", "openreview": "https://openreview.net/forum?id=LJ34pX1U5g", "author_site": "Tianyu Guo, Sai Praneeth Karimireddy, Michael Jordan", 
"tldr": "", "abstract": "Collaboration between different data centers is often challenged by heterogeneity across sites. To account for the heterogeneity, the state-of-the-art method is to re-weight the covariate distributions in each site to match the distribution of the target population. Nevertheless, this method still relies on the concept of traditional meta-analysis after adjusting for the distribution shift. This work proposes a collaborative inverse propensity score weighting estimator for causal inference with heterogeneous data. Instead of adjusting the distribution shift separately, we use weighted propensity score models to collaboratively adjust for the distribution shift. Our method shows significant improvements over the methods based on meta-analysis when heterogeneity increases. By incorporating outcome regression models, we prove the asymptotic normality when the covariates have dimension $d<8$. Our methods preserve privacy at individual sites by implementing federated learning protocols.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianyu Guo;Sai Praneeth Karimireddy;Michael Jordan", "authorids": "~Tianyu_Guo4;~Sai_Praneeth_Karimireddy1;~Michael_Jordan1", "gender": "M;M;M", "homepage": "https://statistics.berkeley.edu/people/tianyu-guo;https://spkreddy.org;http://www.cs.berkeley.edu/~jordan/", "dblp": ";217/3342;j/MichaelIJordan", "google_scholar": ";wKJeOQoAAAAJ;https://scholar.google.com.tw/citations?user=yxUduqMAAAAJ", "orcid": ";;0000-0001-8935-817X", "linkedin": ";;", "or_profile": "~Tianyu_Guo4;~Sai_Praneeth_Karimireddy1;~Michael_Jordan1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nguo2024collaborative,\ntitle={Collaborative Heterogeneous Causal Inference Beyond Meta-analysis},\nauthor={Tianyu Guo and Sai Praneeth Karimireddy and Michael Jordan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LJ34pX1U5g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1281156, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4234254374370127237&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Successor Features for Efficient Multi-Subject Controlled Text Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34300", "id": "LJcIIhqGDN", "proceeding": "https://proceedings.mlr.press/v235/cao24a.html", "pdf": "https://openreview.net/pdf?id=LJcIIhqGDN", "openreview": "https://openreview.net/forum?id=LJcIIhqGDN", "author_site": "Meng Cao, Mehdi Fatemi, Jackie Chi Kit Cheung, Samira Shabanian", "tldr": "", "abstract": "While large language models (LLMs) have achieved impressive performance in generating fluent and realistic text, controlling the generated text so that it exhibits properties such as safety, 
factuality, and non-toxicity remains challenging. Existing decoding-based controllable text generation methods are static in terms of the dimension of control; if the target subject is changed, they require new training. Moreover, it can quickly become prohibitive to concurrently control multiple subjects. To address these challenges, we first show that existing methods can be framed as a reinforcement learning problem, where an action-value function estimates the likelihood of a desired attribute appearing in the generated text. Then, we introduce a novel approach named SF-Gen, which leverages the concept of successor features to decouple the dynamics of LLMs from task-specific rewards. By employing successor features, our method proves to be memory-efficient and computationally efficient for both training and decoding, especially when dealing with multiple target subjects. To the best of our knowledge, our research represents the first application of successor features in text generation. In addition to its computational efficiency, the resultant language produced by our method is comparable to the SOTA (and outperforms baselines) in both control measures as well as language quality, which we demonstrate through a series of experiments in various controllable text generation tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meng Cao;Mehdi Fatemi;Jackie CK Cheung;Samira Shabanian", "authorids": "~Meng_Cao3;~Mehdi_Fatemi1;~Jackie_CK_Cheung1;~Samira_Shabanian1", "gender": "M;;M;F", "homepage": "https://mcao516.github.io/;;http://cs.mcgill.ca/~jcheung/;", "dblp": ";;00/9012;", "google_scholar": "https://scholar.google.ca/citations?user=CvHeFv8AAAAJ;X9_mSpYAAAAJ;https://scholar.google.com.tw/citations?user=Um-wmYQAAAAJ;https://scholar.google.ca/citations?user=CHkNfSMAAAAJ", "orcid": ";0000-0001-9598-6164;;", "linkedin": ";fatemi/;;samirashabanian/", "or_profile": "~Meng_Cao3;~Mehdi_Fatemi1;~Jackie_CK_Cheung1;~Samira_Shabanian1", "aff": "McGill University;Wand AI;Microsoft;", "aff_domain": "mcgill.ca;wand.ai;microsoft.com;", "position": "PhD student;Principal Researcher;Consulting Researcher;", "bibtex": "@inproceedings{\ncao2024successor,\ntitle={Successor Features for Efficient Multi-Subject Controlled Text Generation},\nauthor={Meng Cao and Mehdi Fatemi and Jackie CK Cheung and Samira Shabanian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LJcIIhqGDN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 545260, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12362210167003339067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mcgill.ca;wand.ai;microsoft.com;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "McGill University;Wand AI;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.mcgill.ca;https://www.wand.ai;https://www.microsoft.com", "aff_unique_abbr": "McGill;Wand AI;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Decentralized Convex Finite-Sum Optimization with Better Dependence on Condition Numbers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34299", "id": "LLdeUPOUXk", "proceeding": "https://proceedings.mlr.press/v235/liu24i.html", 
"pdf": "https://openreview.net/pdf?id=LLdeUPOUXk", "openreview": "https://openreview.net/forum?id=LLdeUPOUXk", "author_site": "Yuxing Liu, Lesi Chen, Luo Luo", "tldr": "", "abstract": "This paper studies decentralized optimization problem, where the local objective on each node is an average of a finite set of convex functions and the global function is strongly convex. We propose an efficient stochastic variance reduced first-order method that allows the different nodes to establish their stochastic local gradient estimator with different mini-batch sizes per iteration. We prove the upper bound on the computation time of the proposed method contains the dependence on the global condition number, which is sharper than the previous results that only depend on the local condition numbers. Compared with the state-of-the-art methods, we also show that our method requires less local incremental first-order oracle calls and comparable communication cost. We further perform numerical experiments to validate the advantage of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxing Liu;Lesi Chen;Luo Luo", "authorids": "~Yuxing_Liu1;~Lesi_Chen1;~Luo_Luo1", "gender": "M;M;M", "homepage": "https://infinity-stars.github.io/;https://truenobility303.github.io/;https://luoluo-sds.github.io/", "dblp": "11/8650;326/5433;https://dblp.org/pers/hd/l/Luo:Luo", "google_scholar": "ENZKdAUAAAAJ;ynGzhugAAAAJ;NggI9EsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuxing_Liu1;~Lesi_Chen1;~Luo_Luo1", "aff": "Fudan University;Tsinghua Univeristy;Fudan University", "aff_domain": "fudan.edu.cn;mails.tsinghua.edu.cn;fudan.edu.cn", "position": "Undergrad student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024decentralized,\ntitle={Decentralized Convex Finite-Sum Optimization with Better Dependence on Condition Numbers},\nauthor={Yuxing Liu and Lesi Chen and Luo Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LLdeUPOUXk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 920283, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:baYC50hBUnEJ:scholar.google.com/&scioq=Decentralized+Convex+Finite-Sum+Optimization+with+Better+Dependence+on+Condition+Numbers&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "fudan.edu.cn;mails.tsinghua.edu.cn;fudan.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Fudan University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.fudan.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Fudan;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Test-Time Regret Minimization in Meta Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34298", "id": "LM7j0zrUZB", "proceeding": "https://proceedings.mlr.press/v235/mutti24a.html", "pdf": "https://openreview.net/pdf?id=LM7j0zrUZB", "openreview": "https://openreview.net/forum?id=LM7j0zrUZB", "author_site": "Mirco Mutti, Aviv Tamar", "tldr": "", "abstract": "Meta reinforcement learning sets a distribution over a set of tasks on which the agent can train at will, then is asked to learn an optimal policy for any test task efficiently. 
In this paper, we consider a finite set of tasks modeled through Markov decision processes with various dynamics. We assume to have endured a long training phase, from which the set of tasks is perfectly recovered, and we focus on regret minimization against the optimal policy in the unknown test task. Under a separation condition that states the existence of a state-action pair revealing a task against another, Chen et al. (2022) show that $O(M^2 \\log(H))$ regret can be achieved, where $M, H$ are the number of tasks in the set and test episodes, respectively. In our first contribution, we demonstrate that the latter rate is nearly optimal by developing a novel lower bound for test-time regret minimization under separation, showing that a linear dependence with $M$ is unavoidable. Then, we present a family of stronger yet reasonable assumptions beyond separation, which we call strong identifiability, enabling algorithms achieving fast rates $\\log (H)$ and sublinear dependence with $M$ simultaneously. Our paper provides a new understanding of the statistical barriers of test-time regret minimization and when fast rates can be achieved.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mirco Mutti;Aviv Tamar", "authorids": "~Mirco_Mutti1;~Aviv_Tamar2", "gender": ";M", "homepage": ";https://avivt.github.io/avivt/", "dblp": "222/2815;49/10622", "google_scholar": "GlLkJ9UAAAAJ;https://scholar.google.co.il/citations?user=kppa2vgAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Mirco_Mutti1;~Aviv_Tamar2", "aff": "Technion - Israel Institute of Technology;Technion, Technion", "aff_domain": "technion.ac.il;technion.ac.il", "position": "Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nmutti2024testtime,\ntitle={Test-Time Regret Minimization in Meta Reinforcement Learning},\nauthor={Mirco Mutti and Aviv Tamar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LM7j0zrUZB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1277741, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9034779442085461013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "technion.ac.il;technion.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "DE-COP: Detecting Copyrighted Content in Language Models Training Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34297", "id": "LO4xhXmFal", "proceeding": "https://proceedings.mlr.press/v235/duarte24a.html", "pdf": "https://openreview.net/pdf?id=LO4xhXmFal", "openreview": "https://openreview.net/forum?id=LO4xhXmFal", "author_site": "Andr\u00e9 Duarte, Xuandong Zhao, Arlindo Oliveira, Lei Li", "tldr": "", "abstract": "*How can we detect if copyrighted content was used in the training process of a language model, considering that the training data is typically undisclosed?* We are motivated by the premise that a language model is likely to identify verbatim excerpts from its training text. We propose DE-COP, a method to determine whether a piece of copyrighted content is included in training. 
DE-COP's core approach is to probe an LLM with multiple-choice questions, whose options include both verbatim text and their paraphrases. We construct BookTection, a benchmark with excerpts from 165 books published prior and subsequent to a model's training cutoff, along with their paraphrases. Our experiments show that DE-COP outperforms the prior best method by 8.6% in detection accuracy (AUC) on models with logits available. Moreover, DE-COP also achieves an average accuracy of 72% for detecting suspect books on fully black-box models where prior methods give approximately 0% accuracy. The code and datasets are available at https://github.com/LeiLiLab/DE-COP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andr\u00e9 Vicente Duarte;Xuandong Zhao;Arlindo L. Oliveira;Lei Li", "authorids": "~Andr\u00e9_Vicente_Duarte1;~Xuandong_Zhao1;~Arlindo_L._Oliveira1;~Lei_Li11", "gender": "M;M;M;M", "homepage": "https://xuandongzhao.github.io/;http://web.tecnico.ulisboa.pt/arlindo.oliveira/;https://www.cs.cmu.edu/~leili;", "dblp": "244/8033;o/ArlindoLOliveira;13/7007-5.html;", "google_scholar": "CxeH4uoAAAAJ;dqtEnaoAAAAJ;BYXqAlwAAAAJ;Q8CoYScAAAAJ", "orcid": ";0000-0001-8638-5594;0000-0003-3095-9776;0000-0001-5987-0789", "linkedin": "xuandong-zhao-a3270610b/;arlindo-oliveira-4119a1a/;;andr%C3%A9-duarte-b69373181/", "or_profile": "~Xuandong_Zhao1;~Arlindo_L._Oliveira1;~Lei_Li11;~Andre_Vicente_Duarte1", "aff": "UC Santa Barbara;INESC-ID;School of Computer Science, Carnegie Mellon University;INESC-ID", "aff_domain": "ucsb.edu;inesc-id.pt;cs.cmu.edu;inesc-id.pt", "position": "PhD student;Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nduarte2024decop,\ntitle={{DE}-{COP}: Detecting Copyrighted Content in Language Models Training Data},\nauthor={Andr{\\'e} Vicente Duarte and Xuandong Zhao and Arlindo L. 
Oliveira and Lei Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LO4xhXmFal}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1501812, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9610023679067906312&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "ucsb.edu;inesc-id.pt;cs.cmu.edu;inesc-id.pt", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of California, Santa Barbara;INESC-ID;Carnegie Mellon University", "aff_unique_dep": ";;School of Computer Science", "aff_unique_url": "https://www.ucsb.edu;https://www.inesc-id.pt;https://www.cmu.edu", "aff_unique_abbr": "UCSB;INESC-ID;CMU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Santa Barbara;;Pittsburgh", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;Portugal" }, { "title": "VideoPoet: A Large Language Model for Zero-Shot Video Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34296", "id": "LRkJwPIDuE", "proceeding": "https://proceedings.mlr.press/v235/kondratyuk24a.html", "pdf": "https://openreview.net/pdf?id=LRkJwPIDuE", "openreview": "https://openreview.net/forum?id=LRkJwPIDuE", "author_site": "Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jose Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh N Birodkar, Jimmy Yan, Ming-Chang Chiu, Krishna Somandepalli, Hassan Akbari, Yair Alon, Yong Cheng, Joshua V Dillon, Agrim Gupta, Meera Hahn, Anja Hauth, David Hendon, Alonso Martinez, David Minnen, Mikhail Sirotenko, Kihyuk Sohn, Xuan Yang, Hartwig Adam, Ming-Hsuan Yang, Irfan Essa, Huisheng Wang, David Ross, Bryan Seybold, Lu Jiang", "tldr": "", "abstract": "We present VideoPoet, a language model capable of synthesizing high-quality video from a large variety of conditioning signals. VideoPoet employs a decoder-only transformer architecture that processes multimodal inputs -- including images, videos, text, and audio. The training protocol follows that of Large Language Models (LLMs), consisting of two stages: pretraining and task-specific adaptation. During pretraining, VideoPoet incorporates a mixture of multimodal generative objectives within an autoregressive Transformer framework. The pretrained LLM serves as a foundation that can be adapted for a range of video generation tasks. We present empirical results demonstrating the model's state-of-the-art capabilities in zero-shot video generation, specifically highlighting the ability to generate high-fidelity motions. Project page: http://sites.research.google/videopoet/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Kondratyuk;Lijun Yu;Xiuye Gu;Jose Lezama;Jonathan Huang;Grant Schindler;Rachel Hornung;Vighnesh Birodkar;Jimmy Yan;Ming-Chang Chiu;Krishna Somandepalli;Hassan Akbari;Yair Alon;Yong Cheng;Joshua V. 
Dillon;Agrim Gupta;Meera Hahn;Anja Hauth;David Hendon;Alonso Martinez;David Minnen;Mikhail Sirotenko;Kihyuk Sohn;Xuan Yang;Hartwig Adam;Ming-Hsuan Yang;Irfan Essa;Huisheng Wang;David A Ross;Bryan Seybold;Lu Jiang", "authorids": "~Dan_Kondratyuk1;~Lijun_Yu1;~Xiuye_Gu1;~Jose_Lezama1;~Jonathan_Huang1;~Grant_Schindler3;~Rachel_Hornung1;~Vighnesh_Birodkar1;~Jimmy_Yan1;~Ming-Chang_Chiu1;~Krishna_Somandepalli3;~Hassan_Akbari1;~Yair_Alon1;~Yong_Cheng3;~Joshua_V._Dillon1;~Agrim_Gupta1;~Meera_Hahn1;~Anja_Hauth1;hendon@google.com;~Alonso_Martinez2;~David_Minnen1;~Mikhail_Sirotenko1;~Kihyuk_Sohn1;~Xuan_Yang6;~Hartwig_Adam1;~Ming-Hsuan_Yang1;~Irfan_Essa1;~Huisheng_Wang1;~David_A_Ross1;~Bryan_Seybold1;~Lu_Jiang1", "gender": "M;M;F;M;M;M;;M;;;M;M;;M;M;;F;F;;M;M;M;M;F;He/him;M;M;;;M;M", "homepage": "https://dankondratyuk.com;https://me.lj-y.com/;https://laoreja.github.io/;https://iie.fing.edu.uy/~jlezama/;http://jonathan-huang.org/;http://www.grantschindler.com/;;;;;https://krishna.ai;https://hassanakbari.com;;;;;https://meerahahn.github.io/;;;;http://research.minnen.org;https://www.linkedin.com/in/mihail-sirotenko-33187913/;https://sites.google.com/site/kihyuksml/;;https://research.google/people/author37870/;https://faculty.ucmerced.edu/mhyang/;http://www.irfanessa.com/;;;;http://www.lujiang.info/", "dblp": ";94/5561;199/1920;151/8861;55/2421;82/4839;;186/8043;;;192/5601;207/8265;;34/6276.html;;200/8282;173/5203;;;246/4183;;263/7266;53/10771;;75/948;79/3711.html;e/IrfanAEssa;;;186/8317;22/752-4", "google_scholar": "hl1fJgIAAAAJ;IaDc0OcAAAAJ;qCrypnoAAAAJ;iDP84cQAAAAJ;-pu6i_4AAAAJ;lEQ3oDAAAAAJ;;whm4LjQAAAAJ;;;https://scholar.google.com/citations?hl=en;https://scholar.google.com;;rZ0mlMYAAAAJ;g8vrSV8AAAAJ;AxzVaI8AAAAJ;XNXylX0AAAAJ;;;;https://scholar.google.com/citations?hl=en;IpGXRaAAAAAJ;VxpypngAAAAJ;HaA2AWIAAAAJ;fWd88tEAAAAJ;p9-ohHsAAAAJ;https://scholar.google.com.tw/citations?user=XM97iScAAAAJ;4evU9_YAAAAJ;;JmKn3PwAAAAJ;jIKjjSYAAAAJ", "orcid": "0000-0002-7670-7243;0000-0003-0645-1657;;;;;;;;;;;;;;;;;;;;;;;0000-0003-1258-4341;0000-0003-4848-2304;0000-0002-6236-2969;;;;0000-0003-0286-8439", "linkedin": "dankondratyuk;lijun-yu/;xiuyegu/;;;grantschindler;;;;;;hassan-akbari-48a1b270/;;;jvdillon/;;;anjahauth/;;alonsomartinez/;;;;xuan-yang-3607484a/;hartwig-adam-1873392/;minghsuanyang/;irfanessa/;;;;roadjiang/", "or_profile": "~Dan_Kondratyuk1;~Lijun_Yu1;~Xiuye_Gu1;~Jose_Lezama1;~Jonathan_Huang1;~Grant_Schindler3;~Rachel_Hornung1;~Vighnesh_Birodkar1;~Jimmy_Yan1;~Ming-Chang_Chiu1;~Krishna_Somandepalli3;~Hassan_Akbari1;~Yair_Alon1;~Yong_Cheng3;~Joshua_V._Dillon1;~Agrim_Gupta1;~Meera_Hahn1;~Anja_Hauth1;hendon@google.com;~Alonso_Martinez2;~David_Minnen1;~Mikhail_Sirotenko1;~Kihyuk_Sohn1;~Xuan_Yang6;~Hartwig_Adam1;~Ming-Hsuan_Yang1;~Irfan_Essa1;~Huisheng_Wang1;~David_A_Ross1;~Bryan_Seybold1;~Lu_Jiang1", "aff": "Google;School of Computer Science, Carnegie Mellon University;Google;Google;Google;Google;;Google;;;Google;Google;;Google;Google;Stanford University;Google;Google;;Research, Google;Google;Google DeepMind;Google;Google;Google Research;University of California at Merced;Georgia Institute of Technology;Google;;Google;Google Research", "aff_domain": "google.com;cs.cmu.edu;google.com;google.com;google.com;google.com;;google.com;;;google.com;google.com;;google.com;google.com;stanford.edu;google.com;google.com;;research.google.com;google.com;google.com;google.com;google.com;google.com;ucmerced.edu;gatech.edu;google.com;;google.com;google.com", "position": "Researcher;PhD student;Researcher;Researcher;Research
Scientist;Researcher;;Research Engineer;;;Researcher;Research Scientist;;Researcher;Researcher;PhD student;Researcher;Researcher;;Researcher;Researcher;TLM;Research Scientist;Researcher;Principal Researcher;Professor;Full Professor;Researcher;;Researcher;Researcher", "bibtex": "@inproceedings{\nkondratyuk2024videopoet,\ntitle={VideoPoet: A Large Language Model for Zero-Shot Video Generation},\nauthor={Dan Kondratyuk and Lijun Yu and Xiuye Gu and Jose Lezama and Jonathan Huang and Grant Schindler and Rachel Hornung and Vighnesh Birodkar and Jimmy Yan and Ming-Chang Chiu and Krishna Somandepalli and Hassan Akbari and Yair Alon and Yong Cheng and Joshua V. Dillon and Agrim Gupta and Meera Hahn and Anja Hauth and David Hendon and Alonso Martinez and David Minnen and Mikhail Sirotenko and Kihyuk Sohn and Xuan Yang and Hartwig Adam and Ming-Hsuan Yang and Irfan Essa and Huisheng Wang and David A Ross and Bryan Seybold and Lu Jiang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LRkJwPIDuE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9603734, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 31, "gs_citation": 257, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5579274214599195084&as_sdt=805&sciodt=0,3&hl=en", "gs_version_total": 8, "email": "google.com;cs.cmu.edu;google.com;google.com;google.com;google.com;;google.com;;;google.com;google.com;;google.com;google.com;stanford.edu;google.com;google.com;;research.google.com;google.com;google.com;google.com;google.com;google.com;ucmerced.edu;gatech.edu;google.com;;google.com;google.com", "author_num": 31, "aff_unique_index": "0;1;0;0;0;0;0;0;0;0;0;2;0;0;0;0;0;0;0;0;3;4;0;0;0", "aff_unique_norm": "Google;Carnegie Mellon University;Stanford University;University of California, Merced;Georgia Institute of Technology", "aff_unique_dep": "Google;School of Computer Science;;;", "aff_unique_url": "https://www.google.com;https://www.cmu.edu;https://www.stanford.edu;https://www.ucmerced.edu;https://www.gatech.edu", "aff_unique_abbr": "Google;CMU;Stanford;UC Merced;Georgia Tech", "aff_campus_unique_index": "0;1;0;0;0;0;0;0;0;0;0;2;0;0;0;0;0;0;0;4;0;0;0", "aff_campus_unique": "Mountain View;Pittsburgh;Stanford;;Merced", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Refining Minimax Regret for Unsupervised Environment Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34295", "id": "LRnXPxDksA", "proceeding": "https://proceedings.mlr.press/v235/beukman24a.html", "pdf": "https://openreview.net/pdf?id=LRnXPxDksA", "openreview": "https://openreview.net/forum?id=LRnXPxDksA", "author_site": "Michael Beukman, Samuel Coward, Michael Matthews, Mattie Fellows, Minqi Jiang, Michael Dennis, Jakob Foerster", "tldr": "", "abstract": "In unsupervised environment design, reinforcement learning agents are trained on environment configurations (levels) generated by an adversary that maximises some objective. Regret is a commonly used objective that theoretically results in a minimax regret (MMR) policy with desirable robustness guarantees; in particular, the agent's maximum regret is bounded. However, once the agent reaches this regret bound on all levels, the adversary will only sample levels where regret cannot be further reduced.
Although there may be possible performance improvements to be made outside of these regret-maximising levels, learning stagnates. In this work, we introduce *Bayesian level-perfect MMR* (BLP), a refinement of the minimax regret objective that overcomes this limitation. We formally show that solving for this objective results in a subset of MMR policies, and that BLP policies act consistently with a Perfect Bayesian policy over all levels. We further introduce an algorithm, *ReMiDi*, that results in a BLP policy at convergence. We empirically demonstrate that training on levels from a minimax regret adversary causes learning to prematurely stagnate, but that ReMiDi continues learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Beukman;Samuel Coward;Michael Matthews;Mattie Fellows;Minqi Jiang;Michael D Dennis;Jakob Nicolaus Foerster", "authorids": "~Michael_Beukman1;~Samuel_Coward1;~Michael_Matthews4;~Mattie_Fellows1;~Minqi_Jiang1;~Michael_D_Dennis1;~Jakob_Nicolaus_Foerster1", "gender": ";;M;;M;M;M", "homepage": ";;https://www.mtmatthews.com/;;https://twitter.com/minqijiang;;https://www.jakobfoerster.com", "dblp": ";;217/7784.html;;270/7949;;176/5095", "google_scholar": ";;https://scholar.google.com/citations?authuser=1;;;WXXu26AAAAAJ;6z4lQzMAAAAJ", "orcid": ";;;;;;", "linkedin": ";;michael-matthews-b7a5b7158/;;minqi-jiang-585a6536/;;", "or_profile": "~Michael_Beukman1;~Samuel_Coward1;~Michael_Matthews4;~Mattie_Fellows1;~Minqi_Jiang1;~Michael_D_Dennis1;~Jakob_Nicolaus_Foerster1", "aff": ";;University of Oxford;;Google;Google DeepMind;University of Oxford, University of Oxford", "aff_domain": ";;ox.ac.uk;;google.com;google.com;eng.ox.ac.uk", "position": ";;PhD student;;Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nbeukman2024refining,\ntitle={Refining Minimax Regret for Unsupervised Environment Design},\nauthor={Michael Beukman and Samuel Coward and Michael Matthews and Mattie Fellows and Minqi Jiang and Michael D Dennis and Jakob Nicolaus Foerster},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LRnXPxDksA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1467722, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16419713483464234141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";;ox.ac.uk;;google.com;google.com;eng.ox.ac.uk", "author_num": 7, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Oxford;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ox.ac.uk;https://www.google.com", "aff_unique_abbr": "Oxford;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Data-free Neural Representation Compression with Riemannian Neural Dynamics", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34294", "id": "LTifAl5bKb", "proceeding": "https://proceedings.mlr.press/v235/pei24d.html", "pdf": "https://openreview.net/pdf?id=LTifAl5bKb", "openreview": "https://openreview.net/forum?id=LTifAl5bKb", "author_site": "Zhengqi Pei, Anran Zhang, Shuhui Wang, Xiangyang Ji, Qingming Huang", "tldr": "", "abstract": "Neural models are equivalent to dynamic systems from a physics-inspired view, implying that computation on neural networks can be 
interpreted as the dynamical interactions between neurons. However, existing work models neuronal interaction as a weight-based linear transformation, and the nonlinearity comes from the nonlinear activation functions, which leads to limited nonlinearity and data-fitting ability of the whole neural model. Inspired by Riemannian geometry, we interpret neural structures by projecting neurons onto the Riemannian neuronal state space and model neuronal interaction with Riemannian metric (${\\it RieM}$), which provides a more efficient neural representation with higher parameter efficiency. With ${\\it RieM}$, we further design a novel data-free neural compression mechanism that does not require additional fine-tuning with real data. Using backbones like ResNet and Vision Transformer, we conduct extensive experiments on datasets such as MNIST, CIFAR-100, ImageNet-1k, and COCO object detection. Empirical results show that, under equal compression rates and computational complexity, models compressed with ${\\it RieM}$ achieve superior inference accuracy compared to existing data-free compression methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengqi Pei;Anran Zhang;Shuhui Wang;Xiangyang Ji;Qingming Huang", "authorids": "~Zhengqi_Pei1;~Anran_Zhang2;~Shuhui_Wang1;~Xiangyang_Ji1;~Qingming_Huang1", "gender": "M;F;M;;", "homepage": ";;https://vipl.ict.ac.cn/people/shwang/;;", "dblp": "223/2296;;37/2537;;", "google_scholar": "Qs5zacQAAAAJ;;h-JxBSYAAAAJ;;", "orcid": ";;0000-0002-5931-0527;;", "linkedin": ";%E5%AE%89%E7%84%B6-%E5%BC%A0-a901a3276/;;;", "or_profile": "~Zhengqi_Pei1;~Anran_Zhang2;~Shuhui_Wang1;~Xiangyang_Ji1;~Qingming_Huang1", "aff": "University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;;", "aff_domain": "ucas.ac.cn;ucas.ac.cn;ict.ac.cn;;", "position": "MS student;MS student;Full Professor;;", "bibtex": "@inproceedings{\npei2024datafree,\ntitle={Data-free Neural Representation Compression with Riemannian Neural Dynamics},\nauthor={Zhengqi Pei and Anran Zhang and Shuhui Wang and Xiangyang Ji and Qingming Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LTifAl5bKb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1078763, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7484589089432719392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ucas.ac.cn;ucas.ac.cn;ict.ac.cn;;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Computing Technology", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.ac.cn", "aff_unique_abbr": "UCAS;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Exact Conversion of In-Context Learning to Model Weights in Linearized-Attention Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34293", "id": "LVF4P1NNwO", "proceeding": "https://proceedings.mlr.press/v235/chen24r.html", "pdf": "https://openreview.net/pdf?id=LVF4P1NNwO", "openreview": "https://openreview.net/forum?id=LVF4P1NNwO", "author_site": "Brian Chen, Tianyang Hu, Hui Jin, Hwee Lee, Kenji Kawaguchi", "tldr": 
"", "abstract": "In-Context Learning (ICL) has been a powerful emergent property of large language models that has attracted increasing attention in recent years. In contrast to regular gradient-based learning, ICL is highly interpretable and does not require parameter updates. In this paper, we show that, for linearized transformer networks, ICL can be made explicit and permanent through the inclusion of bias terms. We mathematically demonstrate the equivalence between a model with ICL demonstration prompts and the same model with the additional bias terms. Our algorithm (ICLCA) allows for exact conversion in an inexpensive manner. Existing methods are not exact and require expensive parameter updates. We demonstrate the efficacy of our approach through experiments that show the exact incorporation of ICL tokens into a linear transformer. We further suggest how our method can be adapted to achieve cheap approximate conversion of ICL tokens, even in regular transformer networks that are not linearized. Our experiments on GPT-2 show that, even though the conversion is only approximate, the model still gains valuable context from the included bias terms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Brian K Chen;Tianyang Hu;Hui Jin;Hwee Kuan Lee;Kenji Kawaguchi", "authorids": "~Brian_K_Chen1;~Tianyang_Hu1;~Hui_Jin1;~Hwee_Kuan_Lee1;~Kenji_Kawaguchi1", "gender": "M;M;M;M;", "homepage": ";https://hu-tianyang.github.io/;https://huijin12.github.io/;https://web.bii.a-star.edu.sg/~leehk/index.html;https://ml.comp.nus.edu.sg/#members", "dblp": ";170/2551;;;", "google_scholar": ";mlA_3r0AAAAJ;;;aLl3rYoAAAAJ", "orcid": "0009-0004-8069-7422;;;;", "linkedin": ";;;;", "or_profile": "~Brian_K_Chen1;~Tianyang_Hu1;~Hui_Jin1;~Hwee_Kuan_Lee1;~Kenji_Kawaguchi1", "aff": "National University of Singapore;Huawei Noah's Ark Lab;Huawei Technologies Ltd.;BII;National University of Singapore", "aff_domain": "nus.edu;huawei.com;huawei.com;astar.edu.sg;nus.edu", "position": "PhD student;Researcher;Researcher;Principal Researcher;Presidential Young Professor", "bibtex": "@inproceedings{\nchen2024exact,\ntitle={Exact Conversion of In-Context Learning to Model Weights in Linearized-Attention Transformers},\nauthor={Brian K Chen and Tianyang Hu and Hui Jin and Hwee Kuan Lee and Kenji Kawaguchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LVF4P1NNwO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 340291, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mMpSMjo63kwJ:scholar.google.com/&scioq=Exact+Conversion+of+In-Context+Learning+to+Model+Weights+in+Linearized-Attention+Transformers&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "nus.edu;huawei.com;huawei.com;astar.edu.sg;nus.edu", "author_num": 5, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "National University of Singapore;Huawei;Bioinformatics Institute", "aff_unique_dep": ";Noah's Ark Lab;", "aff_unique_url": "https://www.nus.edu.sg;https://www.huawei.com;https://www.bii.a-star.edu.sg", "aff_unique_abbr": "NUS;Huawei;BII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", "aff_country_unique": "Singapore;China" }, { "title": "tnGPS: Discovering Unknown Tensor Network Structure Search Algorithms via Large Language Models (LLMs)", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34292", "id": "LVgT0ShxN5", "proceeding": "https://proceedings.mlr.press/v235/zeng24b.html", "pdf": "https://openreview.net/pdf?id=LVgT0ShxN5", "openreview": "https://openreview.net/forum?id=LVgT0ShxN5", "author_site": "Junhua Zeng, Chao Li, Zhun Sun, Qibin Zhao, Guoxu Zhou", "tldr": "", "abstract": "Tensor networks are efficient for extremely high-dimensional representation, but their model selection, known as tensor network structure search (TN-SS), is a challenging problem. Although several works have targeted TN-SS, most existing algorithms are manually crafted heuristics with poor performance, suffering from the curse of dimensionality and local convergence. In this work, we jump out of the box, studying how to harness large language models (LLMs) to automatically discover new TN-SS algorithms, replacing the involvement of human experts. By observing how human experts innovate in research, we model their common workflow and propose an automatic algorithm discovery framework called tnGPS. The proposed framework is an elaborate prompting pipeline that instruct LLMs to generate new TN-SS algorithms through iterative refinement and enhancement. The experimental results demonstrate that the algorithms discovered by tnGPS exhibit superior performance in benchmarks compared to the current state-of-the-art methods. Our code is available at https://github.com/ChaoLiAtRIKEN/tngps.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junhua Zeng;Chao Li;Zhun Sun;Qibin Zhao;Guoxu Zhou", "authorids": "~Junhua_Zeng1;~Chao_Li12;~Zhun_Sun1;~Qibin_Zhao1;~Guoxu_Zhou1", "gender": ";M;Non-Binary;M;M", "homepage": "https://github.com/jhzeng24;https://chaoliatriken.github.io;https://minogame.github.io/;https://qibinzhao.github.io;https://teacher.gdut.edu.cn/gxzhou/zh_CN/index.htm", "dblp": "322/6152;;185/6899;13/1193;33/7727", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;i4JrumAAAAAJ;Y-3iZ9EAAAAJ;https://scholar.google.co.jp/citations?hl=en;BIUkSFEAAAAJ", "orcid": ";;;0000-0002-4442-3182;", "linkedin": ";;;;", "or_profile": "~Junhua_Zeng1;~Chao_Li12;~Zhun_Sun1;~Qibin_Zhao1;~Guoxu_Zhou1", "aff": "Guangdong University of Technology;RIKEN;Tencent;RIKEN;Guangdong University of Technology", "aff_domain": "gdut.edu.cn;riken.jp;tencent.com;riken.jp;gdut.edu.cn", "position": "PhD student;Researcher;Researcher;Team Leader;Full Professor", "bibtex": "@inproceedings{\nzeng2024tngps,\ntitle={tn{GPS}: Discovering Unknown Tensor Network Structure Search Algorithms via Large Language Models ({LLM}s)},\nauthor={Junhua Zeng and Chao Li and Zhun Sun and Qibin Zhao and Guoxu Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LVgT0ShxN5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3002767, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2398730569128461382&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "gdut.edu.cn;riken.jp;tencent.com;riken.jp;gdut.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Guangdong University of Technology;RIKEN;Tencent", "aff_unique_dep": ";;Tencent Holdings Limited", "aff_unique_url": "http://www.gdut.edu.cn;https://www.riken.jp;https://www.tencent.com", "aff_unique_abbr": "GDUT;RIKEN;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;1;0;1;0", "aff_country_unique": "China;Japan" }, { "title": "Differentially Private Synthetic Data via Foundation Model APIs 2: Text", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34291", "id": "LWD7upg1ob", "proceeding": "https://proceedings.mlr.press/v235/xie24g.html", "pdf": "https://openreview.net/pdf?id=LWD7upg1ob", "openreview": "https://openreview.net/forum?id=LWD7upg1ob", "author_site": "Chulin Xie, Zinan Lin, Arturs Backurs, Sivakanth Gopi, Da Yu, Huseyin Inan, Harsha Nori, Haotian Jiang, Huishuai Zhang, Yin Tat Lee, Bo Li, Sergey Yekhanin", "tldr": "", "abstract": "Text data has become extremely valuable due to the emergence of machine learning algorithms that learn from it. A lot of high-quality text data generated in the real world is private and therefore cannot be shared or used freely due to privacy concerns. Generating synthetic replicas of private text data with a formal privacy guarantee, i.e., differential privacy (DP), offers a promising and scalable solution. However, existing methods necessitate DP finetuning of large language models (LLMs) on private data to generate DP synthetic data. This approach is not viable for proprietary LLMs (e.g., GPT-3.5) and also demands considerable computational resources for open-source LLMs. Lin et al. (2024) recently introduced the Private Evolution (PE) algorithm to generate DP synthetic images with only API access to diffusion models. In this work, we propose an augmented PE algorithm, named Aug-PE, that applies to the complex setting of text. We use API access to an LLM and generate DP synthetic text without any model training. We conduct comprehensive experiments on three benchmark datasets. Our results demonstrate that Aug-PE produces DP synthetic text that yields competitive utility with the SOTA DP finetuning baselines. 
This underscores the feasibility of relying solely on API access to LLMs to produce high-quality DP synthetic texts, thereby facilitating more accessible routes to privacy-preserving LLM applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chulin Xie;Zinan Lin;Arturs Backurs;Sivakanth Gopi;Da Yu;Huseyin A Inan;Harsha Nori;Haotian Jiang;Huishuai Zhang;Yin Tat Lee;Bo Li;Sergey Yekhanin", "authorids": "~Chulin_Xie1;~Zinan_Lin1;~Arturs_Backurs1;~Sivakanth_Gopi1;~Da_Yu1;~Huseyin_A_Inan1;~Harsha_Nori1;~Haotian_Jiang2;~Huishuai_Zhang3;~Yin_Tat_Lee1;~Bo_Li19;~Sergey_Yekhanin1", "gender": "F;M;;M;M;;;M;;;F;M", "homepage": ";https://zinanlin.me/;http://www.mit.edu/~backurs/;https://aka.ms/sigopi;;;;https://jhtdavid96.wixsite.com/jianghaotian;;;http://boli.cs.illinois.edu/;https://www.microsoft.com/en-us/research/people/yekhanin/", "dblp": "245/4284;64/237-1;74/10669;123/7803.html;48/8545;;217/2494;;;;50/3402-26;29/1329", "google_scholar": "WeJnzAgAAAAJ;67nE-wQ_g_cC;UNHdIKoAAAAJ;bYhGFrwAAAAJ;FcRGdiwAAAAJ;;HmxjgMAAAAAJ;3mGx0eoAAAAJ;;;K8vJkTcAAAAJ;4WEQ8h0AAAAJ", "orcid": ";;;;;;;0000-0002-7501-2247;;;;", "linkedin": ";;;sivakanthgopi/;;;harshanori;;;;;sergey-yekhanin-5242ba192/", "or_profile": "~Chulin_Xie1;~Zinan_Lin1;~Arturs_Backurs1;~Sivakanth_Gopi1;~Da_Yu1;~Huseyin_A_Inan1;~Harsha_Nori1;~Haotian_Jiang2;~Huishuai_Zhang3;~Yin_Tat_Lee1;~Bo_Li19;~Sergey_Yekhanin1", "aff": "University of Illinois, Urbana Champaign;Microsoft;Microsoft;Microsoft Research;SUN YAT-SEN UNIVERSITY;;Microsoft;Microsoft Research, Redmond;;;University of Illinois, Urbana Champaign;Microsoft", "aff_domain": "illinois.edu;microsoft.com;microsoft.com;microsoft.com;sysu.edu.cn;;microsoft.com;microsoft.com;;;illinois.edu;microsoft.com", "position": "PhD student;Senior Researcher;Researcher;Senior Researcher;PhD student;;Research Engineer;Postdoc;;;Assistant Professor;Principal Researcher", "bibtex": "@inproceedings{\nxie2024differentially,\ntitle={Differentially Private Synthetic Data via Foundation Model {API}s 2: Text},\nauthor={Chulin Xie and Zinan Lin and Arturs Backurs and Sivakanth Gopi and Da Yu and Huseyin A Inan and Harsha Nori and Haotian Jiang and Huishuai Zhang and Yin Tat Lee and Bo Li and Sergey Yekhanin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LWD7upg1ob}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 973875, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12531327754360897872&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "illinois.edu;microsoft.com;microsoft.com;microsoft.com;sysu.edu.cn;;microsoft.com;microsoft.com;;;illinois.edu;microsoft.com", "author_num": 12, "aff_unique_index": "0;1;1;1;2;1;1;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft;Sun Yat-sen University", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com;http://www.sysu.edu.cn", "aff_unique_abbr": "UIUC;Microsoft;SYSU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Urbana-Champaign;;Redmond", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "eCeLLM: Generalizing Large Language Models for E-commerce from Large-scale, High-quality Instruction Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34290", "id": 
"LWRI4uPG2X", "proceeding": "https://proceedings.mlr.press/v235/peng24c.html", "pdf": "https://openreview.net/pdf?id=LWRI4uPG2X", "openreview": "https://openreview.net/forum?id=LWRI4uPG2X", "author_site": "Peng, Xinyi Ling, Ziru Chen, Huan Sun, Xia Ning", "tldr": "", "abstract": "With tremendous efforts on developing effective e-commerce models, conventional e-commerce models show limited success in generalist e-commerce modeling, and suffer from unsatisfactory performance on new users and new products \u2013 a typical out-of-domain generalization challenge. Meanwhile, large language models (LLMs) demonstrate outstanding performance in generalist modeling and out-of-domain generalizability in many fields. Toward fully unleashing their power for e-commerce, in this paper, we construct ECInstruct, the first open-sourced, large-scale, and high-quality benchmark instruction dataset for e-commerce. Leveraging ECInstruct, we develop eCeLLM, a series of e-commerce LLMs, by instruction-tuning general-purpose LLMs. Our comprehensive experiments and evaluation demonstrate that eCeLLM models substantially outperform baseline models, including the most advanced GPT-4, and the state-of-the-art task-specific models in in-domain evaluation. Moreover, eCeLLM exhibits excellent generalizability to out-of-domain settings, including unseen products and unseen instructions, highlighting its superiority as a generalist e-commerce model. Both the ECInstruct dataset and the eCeLLM models show great potential in empowering versatile and effective LLMs for e-commerce. ECInstruct and eCeLLM models are publicly accessible through this link.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bo Peng;Xinyi Ling;Ziru Chen;Huan Sun;Xia Ning", "authorids": "~Bo_Peng14;~Xinyi_Ling1;~Ziru_Chen1;~Huan_Sun1;~Xia_Ning1", "gender": "M;F;M;F;F", "homepage": ";;https://ronch99.github.io/;https://u.osu.edu/ihudas/people/;http://go.osu.edu/ninglab", "dblp": "03/5954-9;;200/8335;33/2952-1.html;28/1717", "google_scholar": "u0rEzyjYrGoC;;1-pt7zMAAAAJ;wIFkulcAAAAJ;YkPL8jMAAAAJ", "orcid": "0009-0000-7569-1828;;;;0000-0002-6842-1165", "linkedin": ";xinyi-ling-2371a1223/;;huan-sun-81527924/?originalSubdomain=cn;", "or_profile": "~Bo_Peng14;~Xinyi_Ling1;~Ziru_Chen1;~Huan_Sun1;~Xia_Ning1", "aff": "Ohio State University;Ohio State University, Columbus;Ohio State University, Columbus;The Ohio State University, Columbus;Ohio State University, Columbus", "aff_domain": "osu.edu;osu.edu;osu.edu;osu.edu;osu.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\npeng2024ecellm,\ntitle={eCe{LLM}: Generalizing Large Language Models for E-commerce from Large-scale, High-quality Instruction Data},\nauthor={Bo Peng and Xinyi Ling and Ziru Chen and Huan Sun and Xia Ning},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LWRI4uPG2X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 760390, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=508680132346977474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "osu.edu;osu.edu;osu.edu;osu.edu;osu.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Ohio State University", "aff_unique_dep": "", "aff_unique_url": "https://www.osu.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "1;1;1;1", 
"aff_campus_unique": ";Columbus", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Predictive Dynamic Fusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34289", "id": "LYpGLrC4oq", "proceeding": "https://proceedings.mlr.press/v235/cao24c.html", "pdf": "https://openreview.net/pdf?id=LYpGLrC4oq", "openreview": "https://openreview.net/forum?id=LYpGLrC4oq", "author_site": "Bing Cao, Yinan Xia, Yi Ding, Changqing Zhang, Qinghua Hu", "tldr": "", "abstract": "Multimodal fusion is crucial in joint decision-making systems for rendering holistic judgments. Since multimodal data changes in open environments, dynamic fusion has emerged and achieved remarkable progress in numerous applications. However, most existing dynamic multimodal fusion methods lack theoretical guarantees and easily fall into suboptimal problems, yielding unreliability and instability. To address this issue, we propose a Predictive Dynamic Fusion (PDF) framework for multimodal learning. We proceed to reveal the multimodal fusion from a generalization perspective and theoretically derive the predictable Collaborative Belief (Co-Belief) with Mono- and Holo-Confidence, which provably reduces the upper bound of generalization error. Accordingly, we further propose a relative calibration strategy to calibrate the predicted Co-Belief for potential uncertainty. Extensive experiments on multiple benchmarks confirm our superiority. Our code is available at https://github.com/Yinan-Xia/PDF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bing Cao;Yinan Xia;Yi Ding;Changqing Zhang;Qinghua Hu", "authorids": "~Bing_Cao1;~Yinan_Xia1;~Yi_Ding8;~Changqing_Zhang1;~Qinghua_Hu1", "gender": "M;F;M;M;M", "homepage": "https://bcaosudo.github.io;https://github.com/Yinan-Xia;https://dripnowhy.github.io;http://cic.tju.edu.cn/faculty/zhangchangqing/index.html;http://cic.tju.edu.cn/faculty/huqinghua/index.html", "dblp": "59/4329;;;78/2668;", "google_scholar": "6KeTXm4AAAAJ;DNl4KHgAAAAJ;https://scholar.google.com/citations?hl=en;yJGhdykAAAAJ;TVSNq_wAAAAJ", "orcid": "0000-0002-0316-5404;;;;0000-0001-7765-8095", "linkedin": ";;;;", "or_profile": "~Bing_Cao1;~Yinan_Xia1;~Yi_Ding8;~Changqing_Zhang1;~Qinghua_Hu1", "aff": "Tianjin University;Tianjin University;Tianjin University;Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn", "position": "Associate Professor;MS student;Undergrad student;Associate Professor;Professor", "bibtex": "@inproceedings{\ncao2024predictive,\ntitle={Predictive Dynamic Fusion},\nauthor={Bing Cao and Yinan Xia and Yi Ding and Changqing Zhang and Qinghua Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LYpGLrC4oq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 819013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17016517769633614574&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": 
"TabLog: Test-Time Adaptation for Tabular Data Using Logic Rules", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34288", "id": "LZeixIvQcB", "proceeding": "https://proceedings.mlr.press/v235/ren24b.html", "pdf": "https://openreview.net/pdf?id=LZeixIvQcB", "openreview": "https://openreview.net/forum?id=LZeixIvQcB", "author_site": "Weijieying Ren, Xiaoting Li, Huiyuan Chen, Vineeth Rakesh, Zhuoyi Wang, Mahashweta Das, Vasant Honavar", "tldr": "", "abstract": "We consider the problem of test-time adaptation of predictive models trained on tabular data. Effective solution of this problem requires adaptation of predictive models trained on the source domain to a target domain, using only unlabeled target domain data, without access to source domain data. Existing test-time adaptation methods for tabular data have difficulty coping with the heterogeneous features and their complex dependencies inherent in tabular data. To overcome these limitations, we consider test-time adaptation in the setting wherein the logical structure of the rules is assumed to remain invariant despite distribution shift between source and target domains whereas the numerical parameters associated with the rules and the weights assigned to them can vary to accommodate distribution shift. TabLog discretizes numerical features, models dependencies between heterogeneous features, introduces a novel contrastive loss for coping with distribution shift, and presents an end-to-end framework for efficient training and test-time adaptation by taking advantage of a logical neural network representation of a rule ensemble. We present results of experiments using several benchmark data sets that demonstrate TabLog is competitive with or improves upon the state-of-the-art methods for test-time adaptation of predictive models trained on tabular data. 
Our code is available at https://github.com/WeijieyingRen/TabLog.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weijieying Ren;Xiaoting Li;Huiyuan Chen;Vineeth Rakesh;Zhuoyi Wang;Mahashweta Das;Vasant G Honavar", "authorids": "~Weijieying_Ren1;~Xiaoting_Li3;~Huiyuan_Chen1;~Vineeth_Rakesh1;~Zhuoyi_Wang1;~Mahashweta_Das2;~Vasant_G_Honavar1", "gender": "F;F;M;M;M;F;M", "homepage": "https://weijieyingren.github.io/;https://xiaoting.me/;;;https://personal.utdallas.edu/~zxw151030/;;http://faculty.ist.psu.edu/vhonavar", "dblp": "204/2379;;204/5464;139/2250;194/7513;;https://dblp.uni-trier.de/pid/h/VasantHonavar.html", "google_scholar": "https://scholar.google.fi/citations?user=V-J8XikAAAAJ;wlf7M-cAAAAJ;j3y4dJwAAAAJ;oq_AneQAAAAJ;R5f2_FoAAAAJ;;GPqMVRkAAAAJ", "orcid": ";;0000-0002-6360-558X;;;;0000-0001-5399-3489", "linkedin": ";;;;zhuoyi-wang-51b83a86/;mahashwetadas/;vhonavar/", "or_profile": "~Weijieying_Ren1;~Xiaoting_Li3;~Huiyuan_Chen1;~Vineeth_Rakesh1;~Zhuoyi_Wang1;~Mahashweta_Das2;~Vasant_G_Honavar1", "aff": "Pennsylvania State University;VISA;Amazon;VISA;VISA;VISA;Pennsylvania State University", "aff_domain": "psu.edu;visa.com;amazon.com;visa.com;visa.com;visa.com;ist.psu.edu", "position": "PhD student;Researcher;Researcher;Researcher;Researcher;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nren2024tablog,\ntitle={TabLog: Test-Time Adaptation for Tabular Data Using Logic Rules},\nauthor={Weijieying Ren and Xiaoting Li and Huiyuan Chen and Vineeth Rakesh and Zhuoyi Wang and Mahashweta Das and Vasant G Honavar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LZeixIvQcB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 482218, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5698734737885390620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "psu.edu;visa.com;amazon.com;visa.com;visa.com;visa.com;ist.psu.edu", "author_num": 7, "aff_unique_index": "0;1;2;1;1;1;0", "aff_unique_norm": "Pennsylvania State University;VISA;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.psu.edu;https://www.visa.com;https://www.amazon.com", "aff_unique_abbr": "PSU;VISA;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adaptive Feature Selection for No-Reference Image Quality Assessment by Mitigating Semantic Noise Sensitivity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34287", "id": "LZkhKZvhHs", "proceeding": "https://proceedings.mlr.press/v235/li24w.html", "pdf": "https://openreview.net/pdf?id=LZkhKZvhHs", "openreview": "https://openreview.net/forum?id=LZkhKZvhHs", "author_site": "Xudong Li, Timin Gao, Runze Hu, Yan Zhang, Shengchuan Zhang, Xiawu Zheng, Jingyuan Zheng, Yunhang Shen, Ke Li, Yutao Liu, Pingyang Dai, Rongrong Ji", "tldr": "", "abstract": "The current state-of-the-art No-Reference Image Quality Assessment (NR-IQA) methods typically rely on feature extraction from upstream semantic backbone networks, assuming that all extracted features are relevant. However, we make a key observation that not all features are beneficial, and some may even be harmful, necessitating careful selection. 
Empirically, we find that many image pairs with small feature spatial distances can have vastly different quality scores, indicating that the extracted features may contain quality-irrelevant noise. To address this issue, we propose a Quality-Aware Feature Matching IQA Metric (QFM-IQM) that employs an adversarial perspective to remove harmful semantic noise features from the upstream task. Specifically, QFM-IQM enhances the ability to distinguish semantic noise by matching image pairs with similar quality scores but varying semantic features as adversarial semantic noise and adaptively adjusting the upstream task\u2019s features by reducing sensitivity to adversarial noise perturbation. Furthermore, we utilize a distillation framework to expand the dataset and improve the model's generalization ability. Extensive experiments conducted on eight standard IQA datasets have demonstrated the effectiveness of our proposed QFM-IQM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xudong Li;Timin Gao;Runze Hu;Yan Zhang;Shengchuan Zhang;Xiawu Zheng;Jingyuan Zheng;Yunhang Shen;Ke Li;Yutao Liu;Pingyang Dai;Rongrong Ji", "authorids": "~Xudong_Li7;~Timin_Gao1;~Runze_Hu1;~Yan_Zhang22;~Shengchuan_Zhang1;~Xiawu_Zheng1;~Jingyuan_Zheng1;~Yunhang_Shen1;~Ke_Li4;~Yutao_Liu2;~Pingyang_Dai1;~Rongrong_Ji5", "gender": "M;M;;;M;M;;M;M;M;M;M", "homepage": "https://github.com/LXDxmumac/LXDxmumac.github.io;https://github.com/ggg0919;;;https://informatics.xmu.edu.cn/info/1405/25059.htm;https://sites.google.com/view/zhengxiawu/%E9%A6%96%E9%A1%B5;;https://shenyunhang.github.io/;http://keli.info;https://www.researchgate.net/profile/Yutao-Liu;;http://mac.xmu.edu.cn/rrji-en.html", "dblp": ";;;;162/1064;222/7865;;146/1800;;;04/8207;86/5681", "google_scholar": ";;;;GToqXScAAAAJ;jBgXocYAAAAJ;;29teR74AAAAJ;mfWsFM0AAAAJ;;https://scholar.google.com.hk/citations?user=fEw3__QAAAAJ;", "orcid": ";;;;0000-0002-0800-0609;0000-0002-6855-5403;;0000-0002-3970-7519;0000-0001-7998-0731;0000-0002-3066-1884;;", "linkedin": ";;;;;;;;;;;", "or_profile": "~Xudong_Li7;~Timin_Gao1;~Runze_Hu1;~Yan_Zhang22;~Shengchuan_Zhang1;~Xiawu_Zheng1;~Jingyuan_Zheng1;~Yunhang_Shen1;~Ke_Li4;~Yutao_Liu2;~Pingyang_Dai1;~Rongrong_Ji5", "aff": "XMU;Xiamen University;;;Xiamen University;PengCheng Lab;;Tencent;Tencent;Ocean University of China;Xiamen University;Xiamen University", "aff_domain": "xmu.edu;xmu.edu.cn;;;xmu.edu.cn;pcl.ac.cn;;tencent.com;tencent.com;ouc.edu.cn;xmu.edu.cn;xmu.edu.cn", "position": "MS student;MS student;;;Assistant Professor;Postdoc;;Researcher;Principal Researcher;Associate Professor;Senior Engineer;Full Professor", "bibtex": "@inproceedings{\nli2024adaptive,\ntitle={Adaptive Feature Selection for No-Reference Image Quality Assessment by Mitigating Semantic Noise Sensitivity},\nauthor={Xudong Li and Timin Gao and Runze Hu and Yan Zhang and Shengchuan Zhang and Xiawu Zheng and Jingyuan Zheng and Yunhang Shen and Ke Li and Yutao Liu and Pingyang Dai and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LZkhKZvhHs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1929120, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=472974152700390176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "xmu.edu;xmu.edu.cn;;;xmu.edu.cn;pcl.ac.cn;;tencent.com;tencent.com;ouc.edu.cn;xmu.edu.cn;xmu.edu.cn", 
"author_num": 12, "aff_unique_index": "0;0;0;1;2;2;3;0;0", "aff_unique_norm": "Xiamen University;Pengcheng Lab;Tencent;Ocean University of China", "aff_unique_dep": ";;Tencent Holdings Limited;", "aff_unique_url": "https://www.xmu.edu.cn;;https://www.tencent.com;http://www.ouc.edu.cn", "aff_unique_abbr": "XMU;;Tencent;OUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Enabling Few-Shot Learning with PID Control: A Layer Adaptive Optimizer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34286", "id": "LabSWooau0", "proceeding": "https://proceedings.mlr.press/v235/yu24h.html", "pdf": "https://openreview.net/pdf?id=LabSWooau0", "openreview": "https://openreview.net/forum?id=LabSWooau0", "author_site": "Le Yu, Xinde Li, Pengfei Zhang, zhentong zhang, Fir Dunkin", "tldr": "", "abstract": "Model-Agnostic Meta-Learning (MAML) and its variants have shown remarkable performance in scenarios characterized by a scarcity of labeled data during the training phase of machine learning models. Despite these successes, MAMLbased approaches encounter significant challenges when there is a substantial discrepancy in the distribution of training and testing tasks, resulting in inefficient learning and limited generalization across domains. Inspired by classical proportional-integral-derivative (PID) control theory, this study introduces a Layer-Adaptive PID (LA-PID) Optimizer, a MAML-based optimizer that employs efficient parameter optimization methods to dynamically adjust task-specific PID control gains at each layer of the network, conducting a first-principles analysis of optimal convergence conditions. A series of experiments conducted on four standard benchmark datasets demonstrate the efficacy of the LA-PID optimizer, indicating that LA-PID achieves state-oftheart performance in few-shot classification and cross-domain tasks, accomplishing these objectives with fewer training steps. 
Code is available at https://github.com/yuguopin/LA-PID.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Le Yu;Xinde Li;Pengfei Zhang;zhentong zhang;Fir Dunkin", "authorids": "~Le_Yu6;~Xinde_Li1;~Pengfei_Zhang10;~zhentong_zhang1;~Fir_Dunkin1", "gender": "M;M;M;M;M", "homepage": ";;;;", "dblp": ";;;;363/9077.html", "google_scholar": "https://scholar.google.com.hk/citations?user=59Yf1a0AAAAJ;;;;JmUbRPsAAAAJ", "orcid": "0009-0004-2481-7426;0000-0002-1529-4537;0000-0003-3826-9008;0000-0002-9661-4597;0000-0003-0017-9808", "linkedin": ";;;;", "or_profile": "~Le_Yu6;~Xinde_Li1;~Pengfei_Zhang10;~zhentong_zhang1;~Fir_Dunkin1", "aff": "Southeast University;Southeast University;Southeast University;Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "PhD student;Full Professor;PhD student;PhD student;PhD student", "bibtex": "@inproceedings{\nyu2024enabling,\ntitle={Enabling Few-Shot Learning with {PID} Control: A Layer Adaptive Optimizer},\nauthor={Le Yu and Xinde Li and Pengfei Zhang and zhentong zhang and Fir Dunkin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LabSWooau0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 712058, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JXA_u0Tup0wJ:scholar.google.com/&scioq=Enabling+Few-Shot+Learning+with+PID+Control:+A+Layer+Adaptive+Optimizer&hl=en&as_sdt=0,44", "gs_version_total": 5, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Sign Rank Limitations for Inner Product Graph Decoders", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34285", "id": "Lb8G2dZjcB", "proceeding": "https://proceedings.mlr.press/v235/lee24ad.html", "pdf": "https://openreview.net/pdf?id=Lb8G2dZjcB", "openreview": "https://openreview.net/forum?id=Lb8G2dZjcB", "author_site": "Su Hyeong Lee, QINGQI ZHANG, Risi Kondor", "tldr": "", "abstract": "Inner product-based decoders are among the most influential frameworks used to extract meaningful data from latent embeddings. However, numerous works in the literature have shown that such decoders have limited representation capacity, a limitation that is particularly notable in graph reconstruction problems. 
In this paper, we provide the first theoretical elucidation of this pervasive phenomenon in graph data, and suggest straightforward modifications to circumvent this issue without deviating from the inner product framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Su Hyeong Lee;Qingqi Zhang;Risi Kondor", "authorids": "~Su_Hyeong_Lee1;qingqi@uchicago.edu;~Risi_Kondor1", "gender": ";;M", "homepage": "https://cam.uchicago.edu/people/profile/su-hyeong-lee/;;http://people.cs.uchicago.edu/~risi/", "dblp": "368/6247;;90/869", "google_scholar": ";;v12-jLUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Su_Hyeong_Lee1;qingqi@uchicago.edu;~Risi_Kondor1", "aff": "University of Chicago;;University of Chicago", "aff_domain": "uchicago.edu;;uchicago.edu", "position": "PhD student;;Associate Professor", "bibtex": "@inproceedings{\nlee2024sign,\ntitle={Sign Rank Limitations for Inner Product Graph Decoders},\nauthor={Su Hyeong Lee and Qingqi Zhang and Risi Kondor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lb8G2dZjcB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9151553, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ipYhIuxGiisJ:scholar.google.com/&scioq=Sign+Rank+Limitations+for+Inner+Product+Graph+Decoders&hl=en&as_sdt=0,48", "gs_version_total": 4, "email": "uchicago.edu;;uchicago.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uchicago.edu", "aff_unique_abbr": "UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "USTAD: Unified Single-model Training Achieving Diverse Scores for Information Retrieval", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34284", "id": "LbEB39lZqp", "proceeding": "https://proceedings.mlr.press/v235/kim24ad.html", "pdf": "https://openreview.net/pdf?id=LbEB39lZqp", "openreview": "https://openreview.net/forum?id=LbEB39lZqp", "author_site": "Seungyeon Kim, Ankit Singh Rawat, Manzil Zaheer, Wittawat Jitkrittum, Veeranjaneyulu Sadhanala, Sadeep Jayasumana, Aditya Menon, Rob Fergus, Sanjiv Kumar", "tldr": "", "abstract": "Modern information retrieval (IR) systems consist of multiple stages like retrieval and ranking, with Transformer-based models achieving state-of-the-art performance at each stage. In this paper, we challenge the tradition of using separate models for different stages and ask if a single Transformer encoder can provide the relevance scores needed at each stage. We present USTAD \u2013 a new unified approach to train a single network that can provide powerful ranking scores as a cross-encoder (CE) model as well as factorized embeddings for large-scale retrieval as a dual-encoder (DE) model. Empirically, we find a single USTAD model to be competitive with separate ranking CE and retrieval DE models. Furthermore, USTAD combines well with a novel embedding matching-based distillation, significantly improving CE to DE distillation. It further motivates novel asymmetric architectures for student models to ensure a better embedding alignment between the student and the teacher while keeping the online inference cost small. 
On standard benchmarks like MSMARCO, we demonstrate that USTAD with our proposed distillation method leads to asymmetric students with only 1/10th of the trainable parameters while retaining 95-97% of the teacher's performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seungyeon Kim;Ankit Singh Rawat;Manzil Zaheer;Wittawat Jitkrittum;Veeranjaneyulu Sadhanala;Sadeep Jayasumana;Aditya Krishna Menon;Rob Fergus;Sanjiv Kumar", "authorids": "~Seungyeon_Kim1;~Ankit_Singh_Rawat1;~Manzil_Zaheer1;~Wittawat_Jitkrittum1;~Veeranjaneyulu_Sadhanala1;~Sadeep_Jayasumana1;~Aditya_Krishna_Menon1;~Rob_Fergus1;~Sanjiv_Kumar1", "gender": ";M;M;M;M;;;M;", "homepage": "https://www.seungyeon.ai;https://ankitsrawat.github.io/home/;https://www.aclweb.org/anthology/people/m/manzil-zaheer/;http://wittawat.com;https://veeranjaneyulus.github.io/;;;http://cs.nyu.edu/fergus/;http://www.sanjivk.com/", "dblp": "74/7997-1.html;https://dblp.org/pers/hd/r/Rawat:Ankit_Singh;40/10701;95/3398.html;81/7249;;;77/3763;", "google_scholar": "zbcN_QIAAAAJ;http://scholar.google.com/citations?user=U0_ab4cAAAAJ;A33FhJMAAAAJ;https://scholar.google.co.uk/citations?hl=en;FuIExf4AAAAJ;;;https://scholar.google.com.tw/citations?user=GgQ9GEkAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-9400-9262;;;;;", "linkedin": ";;;wittawat-jitkrittum/;;;;;", "or_profile": "~Seungyeon_Kim1;~Ankit_Singh_Rawat1;~Manzil_Zaheer1;~Wittawat_Jitkrittum1;~Veeranjaneyulu_Sadhanala1;~Sadeep_Jayasumana1;~Aditya_Krishna_Menon1;~Rob_Fergus1;~Sanjiv_Kumar1", "aff": "Google;Google;Google DeepMind;Google Research;Google;;;Google;Google", "aff_domain": "google.com;google.com;deepmind.com;google.com;google.com;;;google.com;google.com", "position": "Researcher;Research Scientist;Researcher;Research Scientist;Researcher;;;Research scientist;Research Scientist", "bibtex": "@inproceedings{\nkim2024ustad,\ntitle={{USTAD}: Unified Single-model Training Achieving Diverse Scores for Information Retrieval},\nauthor={Seungyeon Kim and Ankit Singh Rawat and Manzil Zaheer and Wittawat Jitkrittum and Veeranjaneyulu Sadhanala and Sadeep Jayasumana and Aditya Krishna Menon and Rob Fergus and Sanjiv Kumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LbEB39lZqp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 771041, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZowgNr2-9HYJ:scholar.google.com/&scioq=USTAD:+Unified+Single-model+Training+Achieving+Diverse+Scores+for+Information+Retrieval&hl=en&as_sdt=0,44", "gs_version_total": 9, "email": "google.com;google.com;deepmind.com;google.com;google.com;;;google.com;google.com", "author_num": 9, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "How to Explore with Belief: State Entropy Maximization in POMDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34283", "id": "LbcNAIgNnB", "proceeding": "https://proceedings.mlr.press/v235/zamboni24a.html", "pdf": "https://openreview.net/pdf?id=LbcNAIgNnB", "openreview": "https://openreview.net/forum?id=LbcNAIgNnB", 
"author_site": "Riccardo Zamboni, Duilio Cirino, Marcello Restelli, Mirco Mutti", "tldr": "", "abstract": "Recent works have studied *state entropy maximization* in reinforcement learning, in which the agent's objective is to learn a policy inducing high entropy over states visitation (Hazan et al., 2019). They typically assume full observability of the state of the system, so that the entropy of the observations is maximized. In practice, the agent may only get *partial* observations, e.g., a robot perceiving the state of a physical space through proximity sensors and cameras. A significant mismatch between the entropy over observations and true states of the system can arise in those settings. In this paper, we address the problem of entropy maximization over the *true states* with a decision policy conditioned on partial observations *only*. The latter is a generalization of POMDPs, which is intractable in general. We develop a memory and computationally efficient *policy gradient* method to address a first-order relaxation of the objective defined on *belief* states, providing various formal characterizations of approximation gaps, the optimization landscape, and the *hallucination* problem. This paper aims to generalize state entropy maximization to more realistic domains that meet the challenges of applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Riccardo Zamboni;Duilio Cirino;Marcello Restelli;Mirco Mutti", "authorids": "~Riccardo_Zamboni1;~Duilio_Cirino1;~Marcello_Restelli1;~Mirco_Mutti1", "gender": ";M;M;", "homepage": "https://ricczamboni.github.io;https://github.com/duiliocirino;http://home.deib.polimi.it/restelli/;", "dblp": "275/1582;;64/1011;222/2815", "google_scholar": "jXy474MAAAAJ;;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ;GlLkJ9UAAAAJ", "orcid": ";;0000-0002-6322-1076;", "linkedin": "riccardo-zamboni-rz95/;;;", "or_profile": "~Riccardo_Zamboni1;~Duilio_Cirino1;~Marcello_Restelli1;~Mirco_Mutti1", "aff": "Polytechnic Institute of Milan;Polytechnic Institute of Milan;Politecnico di Milano;Technion - Israel Institute of Technology", "aff_domain": "polimi.it;mail.polimi.it;polimi.it;technion.ac.il", "position": "PhD student;MS student;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nzamboni2024how,\ntitle={How to Explore with Belief: State Entropy Maximization in {POMDP}s},\nauthor={Riccardo Zamboni and Duilio Cirino and Marcello Restelli and Mirco Mutti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LbcNAIgNnB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2623665, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11796983116661895485&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "polimi.it;mail.polimi.it;polimi.it;technion.ac.il", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Polytechnic Institute of Milan;Politecnico di Milano;Technion - Israel Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polimi.it/;https://www.polimi.it;https://www.technion.ac.il/en/", "aff_unique_abbr": "Politecnico di Milano;Polimi;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Italy;Israel" }, { "title": "Beyond Sole Strength: Customized Ensembles for Generalized Vision-Language Models", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34282", "id": "Lc1HlMo77m", "proceeding": "https://proceedings.mlr.press/v235/lu24a.html", "pdf": "https://openreview.net/pdf?id=Lc1HlMo77m", "openreview": "https://openreview.net/forum?id=Lc1HlMo77m", "author_site": "Zhihe Lu, Jiawang Bai, Xin Li, Zeyu Xiao, Xinchao Wang", "tldr": "", "abstract": "Fine-tuning pre-trained vision-language models (VLMs), e.g., CLIP, for the open-world generalization has gained increasing popularity due to its practical value. However, performance advancements are limited when relying solely on intricate algorithmic designs for a single model, even one exhibiting strong performance, e.g., CLIP-ViT-B/16. This paper, for the first time, explores the collaborative potential of leveraging much weaker VLMs to enhance the generalization of a robust single model. The affirmative findings motivate us to address the generalization problem from a novel perspective, i.e., ensemble of pre-trained VLMs. We introduce three customized ensemble strategies, each tailored to one specific scenario. Firstly, we introduce the zero-shot ensemble, automatically adjusting the logits of different models based on their confidence when only pre-trained VLMs are available. Furthermore, for scenarios with extra few-shot samples, we propose the training-free and tuning ensemble, offering flexibility based on the availability of computing resources. The code is available at https://github.com/zhiheLu/Ensemble_VLM.git.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhihe Lu;Jiawang Bai;Xin Li;Zeyu Xiao;Xinchao Wang", "authorids": "~Zhihe_Lu1;~Jiawang_Bai2;~Xin_Li28;~Zeyu_Xiao1;~Xinchao_Wang1", "gender": "M;M;M;;M", "homepage": "https://zhihelu.github.io/;;https://lixinustc.github.io;;https://sites.google.com/site/sitexinchaowang/", "dblp": "195/9141.html;237/9675;09/1365-82;276/3139;", "google_scholar": "X4LKIhgAAAAJ;https://scholar.google.com.hk/citations?user=sRksETcAAAAJ;sbiY97gAAAAJ;;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": "0000-0002-6917-8654;;;;", "linkedin": ";;;;", "or_profile": "~Zhihe_Lu1;~Jiawang_Bai2;~Xin_Li28;~Zeyu_Xiao1;~Xinchao_WANG3", "aff": "National University of Singapore;Tsinghua University;University of Science and Technology of China;University of Science and Technology of China;National University of Singapore", "aff_domain": "nus.edu;tsinghua.edu.cn;ustc.edu.cn;ustc.edu.cn;nus.edu", "position": "Postdoc;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlu2024beyond,\ntitle={Beyond Sole Strength: Customized Ensembles for Generalized Vision-Language Models},\nauthor={Zhihe Lu and Jiawang Bai and Xin Li and Zeyu Xiao and Xinchao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lc1HlMo77m}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 601340, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9805546250546941316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "nus.edu;tsinghua.edu.cn;ustc.edu.cn;ustc.edu.cn;nus.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "National University of Singapore;Tsinghua University;University of Science and Technology of China", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.nus.edu.sg;https://www.tsinghua.edu.cn;http://www.ustc.edu.cn", "aff_unique_abbr": "NUS;THU;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "On the Diminishing Returns of Width for Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34281", "id": "Ld255Mbx9F", "proceeding": "https://proceedings.mlr.press/v235/guha24a.html", "pdf": "https://openreview.net/pdf?id=Ld255Mbx9F", "openreview": "https://openreview.net/forum?id=Ld255Mbx9F", "author_site": "Etash Guha, Vihan Lakshman", "tldr": "", "abstract": "While deep neural networks have demonstrated groundbreaking performance in various settings, these models often suffer from *catastrophic forgetting* when trained on new tasks in sequence. Several works have empirically demonstrated that increasing the width of a neural network leads to a decrease in catastrophic forgetting but have yet to characterize the exact relationship between width and continual learning. We design one of the first frameworks to analyze Continual Learning Theory and prove that width is directly related to forgetting in Feed-Forward Networks (FFN), demonstrating that the diminishing returns of increasing widths to reduce forgetting. We empirically verify our claims at widths hitherto unexplored in prior studies where the diminishing returns are clearly observed as predicted by our theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Etash Kumar Guha;Vihan Lakshman", "authorids": "~Etash_Kumar_Guha1;~Vihan_Lakshman1", "gender": "M;", "homepage": "https://etash.me/;", "dblp": "331/5590;244/2300", "google_scholar": "https://scholar.google.com/citations?hl=en;EO6fMUUAAAAJ", "orcid": ";", "linkedin": "etash-guha-00097116a/;", "or_profile": "~Etash_Kumar_Guha1;~Vihan_Lakshman1", "aff": ";ThirdAI Corp", "aff_domain": ";thirdai.com", "position": ";Researcher", "bibtex": "@inproceedings{\nguha2024on,\ntitle={On the Diminishing Returns of Width for Continual Learning},\nauthor={Etash Kumar Guha and Vihan Lakshman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ld255Mbx9F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 583625, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7887969461515995694&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";thirdai.com", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "ThirdAI Corporation", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "ThirdAI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "DS-Agent: Automated Data Science by Empowering Large Language Models with Case-Based Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34280", "id": "LfJgeBNCFI", "proceeding": "https://proceedings.mlr.press/v235/guo24b.html", "pdf": "https://openreview.net/pdf?id=LfJgeBNCFI", "openreview": "https://openreview.net/forum?id=LfJgeBNCFI", "author_site": "Siyuan Guo, Cheng Deng, Ying Wen, Hechang Chen, Yi Chang, Jun Wang", "tldr": "", "abstract": "In this work, we investigate the potential of large language models (LLMs) based agents to automate data science tasks, with the goal of comprehending task requirements, then building and training 
the best-fit machine learning models. Despite their widespread success, existing LLM agents are hindered by generating unreasonable experiment plans within this scenario. To this end, we present DS-Agent, a novel automatic framework that harnesses LLM agents and case-based reasoning (CBR). In the development stage, DS-Agent follows the CBR framework to structure an automatic iteration pipeline, which can flexibly capitalize on the expert knowledge from Kaggle, and facilitate consistent performance improvement through the feedback mechanism. Moreover, DS-Agent implements a low-resource deployment stage with a simplified CBR paradigm to adapt past successful solutions from the development stage for direct code generation, significantly reducing the demand on foundational capabilities of LLMs. Empirically, DS-Agent with GPT-4 achieves a 100% success rate in the development stage, while attaining a 36% improvement in average one-pass rate across alternative LLMs in the deployment stage. In both stages, DS-Agent achieves the best rank in performance, costing $1.60 and $0.13 per run with GPT-4, respectively. Our data and code are open-sourced at https://github.com/guosyjlu/DS-Agent.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siyuan Guo;Cheng Deng;Ying Wen;Hechang Chen;Yi Chang;Jun Wang", "authorids": "~Siyuan_Guo2;~Cheng_Deng4;~Ying_Wen1;~Hechang_Chen2;~Yi_Chang4;~Jun_Wang2", "gender": "M;M;M;M;M;M", "homepage": ";https://www.cdeng.net/;https://yingwen.io;http://sai.jlu.edu.cn/info/1094/2387.htm;http://www.yichang-cs.com;http://www0.cs.ucl.ac.uk/staff/jun.wang/", "dblp": "244/5858;;41/4203-1;145/1142;02/5438.html;w/JunWang12", "google_scholar": "JE1Yco4AAAAJ;0VFxZy0AAAAJ;_A1CxG8AAAAJ;EezEcbgAAAAJ;https://scholar.google.com.hk/citations?user=drEkR50AAAAJ;https://scholar.google.co.uk/citations?user=wIE1tY4AAAAJ", "orcid": "0000-0002-9304-5405;;0000-0003-1247-2382;;0000-0003-2697-8093;", "linkedin": ";;wenying45;;;", "or_profile": "~Siyuan_Guo2;~Cheng_Deng4;~Ying_Wen1;~Hechang_Chen2;~Yi_Chang4;~Jun_Wang2", "aff": "Jilin University;Shanghai Jiaotong University;Shanghai Jiaotong University;Jilin University;Jilin University, China;University College London", "aff_domain": "jlu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;jlu.edu.cn;jlu.edu.cn;ucl.ac.uk", "position": "PhD student;PhD student;Associate Professor;Associate Professor;Full Professor;Professor", "bibtex": "@inproceedings{\nguo2024dsagent,\ntitle={{DS}-Agent: Automated Data Science by Empowering Large Language Models with Case-Based Reasoning},\nauthor={Siyuan Guo and Cheng Deng and Ying Wen and Hechang Chen and Yi Chang and Jun Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LfJgeBNCFI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 800104, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2541022239218935847&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 9, "email": "jlu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;jlu.edu.cn;jlu.edu.cn;ucl.ac.uk", "author_num": 6, "aff_unique_index": "0;1;1;0;0;2", "aff_unique_norm": "Jilin University;Shanghai Jiao Tong University;University College London", "aff_unique_dep": ";;", "aff_unique_url": "http://www.jlu.edu.cn;https://www.sjtu.edu.cn;https://www.ucl.ac.uk", "aff_unique_abbr": "JLU;SJTU;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;0;1", "aff_country_unique": "China;United Kingdom" }, { "title": "Improving Token-Based World Models with Parallel Observation Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34279", "id": "Lfp5Dk1xb6", "proceeding": "https://proceedings.mlr.press/v235/cohen24b.html", "pdf": "https://openreview.net/pdf?id=Lfp5Dk1xb6", "openreview": "https://openreview.net/forum?id=Lfp5Dk1xb6", "author_site": "Lior Cohen, Kaixin Wang, Bingyi Kang, Shie Mannor", "tldr": "", "abstract": "Motivated by the success of Transformers when applied to sequences of discrete symbols, token-based world models (TBWMs) were recently proposed as sample-efficient methods. In TBWMs, the world model consumes agent experience as a language-like sequence of tokens, where each observation constitutes a sub-sequence. However, during imagination, the sequential token-by-token generation of next observations results in a severe bottleneck, leading to long training times, poor GPU utilization, and limited representations. To resolve this bottleneck, we devise a novel Parallel Observation Prediction (POP) mechanism. POP augments a Retentive Network (RetNet) with a novel forward mode tailored to our reinforcement learning setting. We incorporate POP in a novel TBWM agent named REM (Retentive Environment Model), showcasing a 15.4x faster imagination compared to prior TBWMs. REM attains superhuman performance on 12 out of 26 games of the Atari 100K benchmark, while training in less than 12 hours. Our code is available at https://github.com/leor-c/REM", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lior Cohen;Kaixin Wang;Bingyi Kang;Shie Mannor", "authorids": "~Lior_Cohen1;~Kaixin_Wang1;~Bingyi_Kang1;~Shie_Mannor2", "gender": "M;;M;M", "homepage": "https://kaixin96.github.io;https://bingykang.github.io/;https://shie.net.technion.ac.il;", "dblp": ";;20/1669;", "google_scholar": "https://scholar.google.com.sg/citations?hl=en;https://scholar.google.com.sg/citations?user=NmHgX-wAAAAJ;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ;qJyZqtwAAAAJ", "orcid": "0000-0001-8237-9285;;;0000-0001-7913-2150", "linkedin": ";;;leor-cohen-11496aaa/", "or_profile": "~Kaixin_Wang1;~Bingyi_Kang1;~Shie_Mannor2;~Leor_Cohen1", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Bytedance;Technion - Israel Institute of Technology, Technion;Technion", "aff_domain": "campus.technion.ac.il;bytedance.com;technion.il;technion.ac.il", "position": "Postdoc;Researcher;Full Professor;PhD student", "bibtex": "@inproceedings{\ncohen2024improving,\ntitle={Improving Token-Based World Models with Parallel Observation Prediction},\nauthor={Lior Cohen and Kaixin Wang and Bingyi Kang and Shie Mannor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lfp5Dk1xb6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1448281, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13944801603213837011&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "campus.technion.ac.il;bytedance.com;technion.il;technion.ac.il", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology;ByteDance", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.bytedance.com", "aff_unique_abbr": 
"Technion;Bytedance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Israel;China" }, { "title": "Layerwise Proximal Replay: A Proximal Point Method for Online Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34278", "id": "Lg8nw3ltvX", "proceeding": "https://proceedings.mlr.press/v235/yoo24a.html", "pdf": "https://openreview.net/pdf?id=Lg8nw3ltvX", "openreview": "https://openreview.net/forum?id=Lg8nw3ltvX", "author_site": "Jinsoo Yoo, Yunpeng Liu, Frank Wood, Geoff Pleiss", "tldr": "", "abstract": "In online continual learning, a neural network incrementally learns from a non-i.i.d. data stream. Nearly all online continual learning methods employ experience replay to simultaneously prevent catastrophic forgetting and underfitting on past data. Our work demonstrates a limitation of this approach: neural networks trained with experience replay tend to have unstable optimization trajectories, impeding their overall accuracy. Surprisingly, these instabilities persist even when the replay buffer stores all previous training examples, suggesting that this issue is orthogonal to catastrophic forgetting. We minimize these instabilities through a simple modification of the optimization geometry. Our solution, Layerwise Proximal Replay (LPR), balances learning from new and replay data while only allowing for gradual changes in the hidden activation of past data. We demonstrate that LPR consistently improves replay-based online continual learning across multiple problem settings, regardless of the amount of available replay memory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinsoo Yoo;Yunpeng Liu;Frank Wood;Geoff Pleiss", "authorids": "~Jinsoo_Yoo1;~Yunpeng_Liu1;~Frank_Wood2;~Geoff_Pleiss1", "gender": "M;M;M;M", "homepage": "https://jason-yoo-108.github.io;;http://www.robots.ox.ac.uk/~fwood/;http://geoffpleiss.com", "dblp": "281/6703;02/8137-7.html;44/4750;199/1693.html", "google_scholar": "-fliFAcAAAAJ;;d4yNzXIAAAAJ;XO8T-Y4AAAAJ", "orcid": ";;;0000-0002-7009-0967", "linkedin": ";larry-liu-323b51126/;frank-wood-43529114?trk=hp-identity-name;", "or_profile": "~Jinsoo_Yoo1;~Yunpeng_Liu1;~Frank_Wood2;~Geoff_Pleiss1", "aff": "University of British Columbia;University of British Columbia;University of British Columbia;Vector Institute", "aff_domain": "ubc.ca;cs.ubc.ca;cs.ubc.ca;vectorinstitute.ai", "position": "PhD student;PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nyoo2024layerwise,\ntitle={Layerwise Proximal Replay: A Proximal Point Method for Online Continual Learning},\nauthor={Jinsoo Yoo and Yunpeng Liu and Frank Wood and Geoff Pleiss},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lg8nw3ltvX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6018874, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9198860649052913778&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 7, "email": "ubc.ca;cs.ubc.ca;cs.ubc.ca;vectorinstitute.ai", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of British Columbia;Vector Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.ubc.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "UBC;Vector Institute", "aff_campus_unique_index": "", "aff_campus_unique": 
"", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Disentangled 3D Scene Generation with Layout Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34277", "id": "Lgh8bhWpVC", "proceeding": "https://proceedings.mlr.press/v235/epstein24a.html", "pdf": "https://openreview.net/pdf?id=Lgh8bhWpVC", "openreview": "https://openreview.net/forum?id=Lgh8bhWpVC", "author_site": "Dave Epstein, Ben Poole, Ben Mildenhall, Alexei Efros, Aleksander Holynski", "tldr": "", "abstract": "We introduce a method to generate 3D scenes that are disentangled into their component objects. This disentanglement is unsupervised, relying only on the knowledge of a large pretrained text-to-image model. Our key insight is that objects can be discovered by finding parts of a 3D scene that, when rearranged spatially, still produce valid configurations of the same scene. Concretely, our method jointly optimizes multiple NeRFs---each representing its own object---along with a *set of layouts* that composite these objects into scenes. We then encourage these composited scenes to be in-distribution according to the image generator. We show that despite its simplicity, our approach successfully generates 3D scenes decomposed into individual objects, enabling new capabilities in text-to-3D content creation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dave Epstein;Ben Poole;Ben Mildenhall;Alexei A Efros;Aleksander Holynski", "authorids": "~Dave_Epstein1;~Ben_Poole1;~Ben_Mildenhall1;~Alexei_A_Efros1;~Aleksander_Holynski1", "gender": ";M;M;;M", "homepage": "https://dave.ml;https://cs.stanford.edu/~poole;https://bmild.github.io;https://holynski.org;http://www.eecs.berkeley.edu/~efros/", "dblp": "https://dblp.org/pers/hd/e/Epstein:Dave;16/10397;167/4350;230/7958;40/6158", "google_scholar": "https://scholar.google.com/citations?hl=en;i5FMLA4AAAAJ;NozIDL8AAAAJ;ypBMJMgAAAAJ;https://scholar.google.com.tw/citations?user=d97bGd8AAAAJ", "orcid": ";;;;0000-0001-5720-8070", "linkedin": ";;;;alexei-efros-890736a3/", "or_profile": "~Dave_Epstein1;~Ben_Poole1;~Ben_Mildenhall1;~Aleksander_Holynski1;~Alyosha_Efros1", "aff": "University of California, Berkeley;Google;Google;Google DeepMind;University of California, Berkeley", "aff_domain": "berkeley.edu;google.com;google.com;google.com;berkeley.edu", "position": "PhD student;Research Scientist;Researcher;Researcher;Professor", "bibtex": "@inproceedings{\nepstein2024disentangled,\ntitle={Disentangled 3D Scene Generation with Layout Learning},\nauthor={Dave Epstein and Ben Poole and Ben Mildenhall and Alexei A Efros and Aleksander Holynski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lgh8bhWpVC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7957440, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15775173507432401824&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "berkeley.edu;google.com;google.com;google.com;berkeley.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Berkeley;Mountain View;", 
"aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Stacking Deep Set Networks and Pooling by Quantiles", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34276", "id": "Lgq1E92h1U", "proceeding": "https://proceedings.mlr.press/v235/chen24bo.html", "pdf": "https://openreview.net/pdf?id=Lgq1E92h1U", "openreview": "https://openreview.net/forum?id=Lgq1E92h1U", "author_site": "Zhuojun Chen, Xinghua Zhu, Dongzhe Su, Justin CHUANG", "tldr": "", "abstract": "We propose Stacked Deep Sets and Quantile Pooling for learning tasks on set data. We introduce Quantile Pooling, a novel permutation-invariant pooling operation that synergizes max and average pooling. Just like max pooling, quantile pooling emphasizes the most salient features of the data. Like average pooling, it captures the overall distribution and subtle features of the data. Like both, it is lightweight and fast. We demonstrate the effectiveness of our approach in a variety of tasks, showing that quantile pooling can outperform both max and average pooling in each of their respective strengths. We also introduce a variant of deep set networks that is more expressive and universal. While Quantile Pooling balances robustness and sensitivity, Stacked Deep Sets enhances learning with depth.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuojun Chen;Xinghua Zhu;Dongzhe Su;Justin C. I. CHUANG", "authorids": "~Zhuojun_Chen1;~Xinghua_Zhu1;dzsu@astri.org;justinchuang@astri.org", "gender": "M;F;;", "homepage": ";https://scholar.google.com/citations?view_op=list_works&hl=en&user=gRHpPZQAAAAJ;;", "dblp": "184/0392.html;;;", "google_scholar": "dUYrwXIAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Zhuojun_Chen1;~Xinghua_Zhu1;dzsu@astri.org;justinchuang@astri.org", "aff": "Hong Kong Applied Science and Technology Research Institute;The Hong Kong Applied Science and Technology Research Institute;;", "aff_domain": "astri.org;astri.org;;", "position": "Researcher;Principal Researcher;;", "bibtex": "@inproceedings{\nchen2024stacking,\ntitle={Stacking Deep Set Networks and Pooling by Quantiles},\nauthor={Zhuojun Chen and Xinghua Zhu and Dongzhe Su and Justin C. I. 
CHUANG},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lgq1E92h1U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4312392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:G8FRJuhEhacJ:scholar.google.com/&scioq=Stacking+Deep+Set+Networks+and+Pooling+by+Quantiles&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "astri.org;astri.org;;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong Applied Science and Technology Research Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.astri.org", "aff_unique_abbr": "ASTRI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Hierarchical Neural Operator Transformer with Learnable Frequency-aware Loss Prior for Arbitrary-scale Super-resolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34275", "id": "LhAuVPWq6q", "proceeding": "https://proceedings.mlr.press/v235/luo24g.html", "pdf": "https://openreview.net/pdf?id=LhAuVPWq6q", "openreview": "https://openreview.net/forum?id=LhAuVPWq6q", "author_site": "Xihaier Luo, Xiaoning Qian, Byung-Jun Yoon", "tldr": "", "abstract": "In this work, we present an arbitrary-scale super-resolution (SR) method to enhance the resolution of scientific data, which often involves complex challenges such as continuity, multi-scale physics, and the intricacies of high-frequency signals. Grounded in operator learning, the proposed method is resolution-invariant. The core of our model is a hierarchical neural operator that leverages a Galerkin-type self-attention mechanism, enabling efficient learning of mappings between function spaces. Sinc filters are used to facilitate the information transfer across different levels in the hierarchy, thereby ensuring representation equivalence in the proposed neural operator. Additionally, we introduce a learnable prior structure that is derived from the spectral resizing of the input data. This loss prior is model-agnostic and is designed to dynamically adjust the weighting of pixel contributions, thereby balancing gradients effectively across the model. 
We conduct extensive experiments on diverse datasets from different domains and demonstrate consistent improvements compared to strong baselines, which consist of various state-of-the-art SR methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xihaier Luo;Xiaoning Qian;Byung-Jun Yoon", "authorids": "~Xihaier_Luo1;~Xiaoning_Qian2;~Byung-Jun_Yoon1", "gender": "M;M;M", "homepage": "https://xihaier.github.io/;https://www.ece.tamu.edu/~xqian;https://BioMLSP.com", "dblp": ";62/4504;14/1887", "google_scholar": "aZyVAYwAAAAJ;dXGlddgAAAAJ;KxPLjXkAAAAJ", "orcid": ";0000-0002-4347-2476;0000-0001-9328-1101", "linkedin": ";;", "or_profile": "~Xihaier_Luo1;~Xiaoning_Qian2;~Byung-Jun_Yoon1", "aff": "Brookhaven National Laboratory;Texas A&M;Brookhaven National Laboratory", "aff_domain": "bnl.gov;tamu.edu;bnl.gov", "position": "Researcher;Full Professor;Scientist", "bibtex": "@inproceedings{\nluo2024hierarchical,\ntitle={Hierarchical Neural Operator Transformer with Learnable Frequency-aware Loss Prior for Arbitrary-scale Super-resolution},\nauthor={Xihaier Luo and Xiaoning Qian and Byung-Jun Yoon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LhAuVPWq6q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6858235, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6760228127547381518&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "bnl.gov;tamu.edu;bnl.gov", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Brookhaven National Laboratory;Texas A&M University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bnl.gov;https://www.tamu.edu", "aff_unique_abbr": "BNL;TAMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Foundation Policies with Hilbert Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34274", "id": "LhNsSaAKub", "proceeding": "https://proceedings.mlr.press/v235/park24g.html", "pdf": "https://openreview.net/pdf?id=LhNsSaAKub", "openreview": "https://openreview.net/forum?id=LhNsSaAKub", "author_site": "Seohong Park, Tobias Kreiman, Sergey Levine", "tldr": "", "abstract": "Unsupervised and self-supervised objectives, such as next token prediction, have enabled pre-training generalist models from large amounts of unlabeled data. In reinforcement learning (RL), however, finding a truly general and scalable unsupervised pre-training objective for generalist policies from offline data remains a major open question. While a number of methods have been proposed to enable generic self-supervised RL, based on principles such as goal-conditioned RL, behavioral cloning, and unsupervised skill learning, such methods remain limited in terms of either the diversity of the discovered behaviors, the need for high-quality demonstration data, or the lack of a clear adaptation mechanism for downstream tasks. In this work, we propose a novel unsupervised framework to pre-train generalist policies that capture diverse, optimal, long-horizon behaviors from unlabeled offline data such that they can be quickly adapted to any arbitrary new tasks in a zero-shot manner. 
Our key insight is to learn a structured representation that preserves the temporal structure of the underlying environment, and then to span this learned latent space with directional movements, which enables various zero-shot policy \u201cprompting\u201d schemes for downstream tasks. Through our experiments on simulated robotic locomotion and manipulation benchmarks, we show that our unsupervised policies can solve goal-conditioned and general RL tasks in a zero-shot fashion, often even outperforming prior methods designed specifically for each setting. Our code and videos are available at https://seohong.me/projects/hilp/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seohong Park;Tobias Kreiman;Sergey Levine", "authorids": "~Seohong_Park1;~Tobias_Kreiman1;~Sergey_Levine1", "gender": ";M;M", "homepage": "https://seohong.me/;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "227/6308;;80/7594", "google_scholar": ";;8R35rCwAAAAJ", "orcid": ";;", "linkedin": ";toby-kreiman-90002b19b/;", "or_profile": "~Seohong_Park1;~Tobias_Kreiman1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;berkeley.edu;google.com", "position": "PhD student;PhD student;Research Scientist", "bibtex": "@inproceedings{\npark2024foundation,\ntitle={Foundation Policies with Hilbert Representations},\nauthor={Seohong Park and Tobias Kreiman and Sergey Levine},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LhNsSaAKub}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2073099, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10115353798390039788&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "berkeley.edu;berkeley.edu;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "FAFE: Immune Complex Modeling with Geodesic Distance Loss on Noisy Group Frames", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34273", "id": "Lhb39btw16", "proceeding": "https://proceedings.mlr.press/v235/wu24g.html", "pdf": "https://openreview.net/pdf?id=Lhb39btw16", "openreview": "https://openreview.net/forum?id=Lhb39btw16", "author_site": "Ruidong Wu, Ruihan Guo, Rui Wang, Shitong Luo, Xu Yue, Jiahan Li, Jianzhu Ma, qiang liu, Yunan Luo, Jian Peng", "tldr": "", "abstract": "Despite the striking success of general protein folding models such as AlphaFold2 (AF2), the accurate computational modeling of antibody-antigen complexes remains a challenging task. In this paper, we first analyze AF2's primary loss function, known as the Frame Aligned Point Error (FAPE), and raise a previously overlooked issue that FAPE tends to face a gradient vanishing problem on high-rotational-error targets. 
To address this fundamental limitation, we propose a novel geodesic loss called Frame Aligned Frame Error (FAFE, denoted as F2E to distinguish from FAPE), which enables the model to better optimize both the rotational and translational errors between two frames. We then prove that F2E can be reformulated as a group-aware geodesic loss, which translates the optimization of the residue-to-residue error to optimizing group-to-group geodesic frame distance. By fine-tuning AF2 with our proposed new loss function, we attain a correct rate of 52.3% (DockQ > 0.23) on an evaluation set and a correct rate of 43.8% on a subset with low homology, improving over AF2 by 182% and 100%, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruidong Wu;Ruihan Guo;Rui Wang;Shitong Luo;Yue Xu;Jiahan Li;Jianzhu Ma;qiang liu;Yunan Luo;Jian Peng", "authorids": "~Ruidong_Wu1;~Ruihan_Guo2;~Rui_Wang1;~Shitong_Luo1;xuyue@helixon.com;~Jiahan_Li2;~Jianzhu_Ma2;~qiang_liu4;~Yunan_Luo1;~Jian_Peng1", "gender": "M;;M;;;;M;;;M", "homepage": ";;https://www.ruiwang1998.com;https://luost.me;;;https://majianzhu.com/;;https://faculty.cc.gatech.edu/~yunan/;http://jianpeng.web.engr.illinois.edu/", "dblp": "224/4293;;06/2293;271/0339;;;24/9080.html;;225/8950;29/4181-1", "google_scholar": "lNeJlFYAAAAJ;;;z1BrjyIAAAAJ;;;;;N8RBFoAAAAAJ;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ", "orcid": "0009-0002-6402-4717;;;;;;;;0000-0001-7728-6412;", "linkedin": "ruidong-wu-0b7182224/;;;;;;;;;", "or_profile": "~Ruidong_Wu1;~Ruihan_Guo2;~Rui_Wang1;~Shitong_Luo1;xuyue@helixon.com;~Jiahan_Li2;~Jianzhu_Ma2;~qiang_liu4;~Yunan_Luo1;~Jian_Peng1", "aff": "Helixon AI;;Helixon;Massachusetts Institute of Technology;;;Tsinghua University;;Georgia Institute of Technology;University of Illinois, Urbana Champaign", "aff_domain": "helixon.com;;helixon.com;mit.edu;;;tsinghua.edu.cn;;gatech.edu;illinois.edu", "position": "Researcher;;Researcher;PhD student;;;Associate Professor;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwu2024fafe,\ntitle={{FAFE}: Immune Complex Modeling with Geodesic Distance Loss on Noisy Group Frames},\nauthor={Ruidong Wu and Ruihan Guo and Rui Wang and Shitong Luo and Yue Xu and Jiahan Li and Jianzhu Ma and qiang liu and Yunan Luo and Jian Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lhb39btw16}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1166011, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13154362718152166013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "helixon.com;;helixon.com;mit.edu;;;tsinghua.edu.cn;;gatech.edu;illinois.edu", "author_num": 10, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "Helixon AI;Helixon;Massachusetts Institute of Technology;Tsinghua University;Georgia Institute of Technology;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;;;;", "aff_unique_url": ";;https://web.mit.edu;https://www.tsinghua.edu.cn;https://www.gatech.edu;https://illinois.edu", "aff_unique_abbr": "Helixon AI;;MIT;THU;Georgia Tech;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;2;0;0", "aff_country_unique": "United States;;China" }, { "title": "Feature Contamination: Neural Networks Learn Uncorrelated Features and Fail to Generalize", "status": "Poster", 
"track": "main", "site": "https://icml.cc/virtual/2024/poster/34272", "id": "Ljhrv1Wmbr", "proceeding": "https://proceedings.mlr.press/v235/zhang24cj.html", "pdf": "https://openreview.net/pdf?id=Ljhrv1Wmbr", "openreview": "https://openreview.net/forum?id=Ljhrv1Wmbr", "author_site": "Tianren Zhang, Chujie Zhao, Guanyu Chen, Yizhou Jiang, Feng Chen", "tldr": "", "abstract": "Learning representations that generalize under distribution shifts is critical for building robust machine learning models. However, despite significant efforts in recent years, algorithmic advances in this direction have been limited. In this work, we seek to understand the fundamental difficulty of out-of-distribution generalization with deep neural networks. We first empirically show that perhaps surprisingly, even allowing a neural network to explicitly fit the representations obtained from a teacher network that can generalize out-of-distribution is insufficient for the generalization of the student network. Then, by a theoretical study of two-layer ReLU networks optimized by stochastic gradient descent (SGD) under a structured feature model, we identify a fundamental yet unexplored feature learning proclivity of neural networks, feature contamination: neural networks can learn uncorrelated features together with predictive features, resulting in generalization failure under distribution shifts. Notably, this mechanism essentially differs from the prevailing narrative in the literature that attributes the generalization failure to spurious correlations. Overall, our results offer new insights into the non-linear feature learning dynamics of neural networks and highlight the necessity of considering inductive biases in out-of-distribution generalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianren Zhang;Chujie Zhao;Guanyu Chen;Yizhou Jiang;Feng Chen", "authorids": "~Tianren_Zhang1;~Chujie_Zhao1;~Guanyu_Chen1;~Yizhou_Jiang1;~Feng_Chen1", "gender": "M;M;M;M;M", "homepage": ";;;;", "dblp": ";366/5498;;201/8247;21/3047-7", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;oM8ue_UAAAAJ;", "orcid": "0000-0001-9687-5263;0000-0003-3710-2644;0009-0008-0131-7006;;0000-0003-4813-2494", "linkedin": "%E5%A4%A9%E4%BB%BB-%E7%AB%A0-622b30110/;;;;", "or_profile": "~Tianren_Zhang1;~Chujie_Zhao1;~Guanyu_Chen1;~Yizhou_Jiang1;~Feng_Chen1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;MS student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2024feature,\ntitle={Feature Contamination: Neural Networks Learn Uncorrelated Features and Fail to Generalize},\nauthor={Tianren Zhang and Chujie Zhao and Guanyu Chen and Yizhou Jiang and Feng Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ljhrv1Wmbr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3503066, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13274477694685159611&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", 
"aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Collage: Light-Weight Low-Precision Strategy for LLM Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34271", "id": "LkJ6qOMv77", "proceeding": "https://proceedings.mlr.press/v235/yu24d.html", "pdf": "https://openreview.net/pdf?id=LkJ6qOMv77", "openreview": "https://openreview.net/forum?id=LkJ6qOMv77", "author_site": "Tao Yu, Gaurav Gupta, KARTHICK GOPALSWAMY, Amith Mamidala, Hao Zhou, Jeffrey Huynh, Youngsuk Park, Ron Diamant, Anoop Deoras, Luke Huan", "tldr": "", "abstract": "Large models training is plagued by the intense compute cost and limited hardware memory. A practical solution is low-precision representation but is troubled by loss in numerical accuracy and unstable training rendering the model less useful. We argue that low-precision floating points can perform well provided the error is properly compensated at the critical locations in the training process. We propose Collage which utilizes multi-component float representation in low-precision to accurately perform operations with numerical errors accounted. To understand the impact of imprecision to training, we propose a simple and novel metric which tracks the lost information during training as well as differentiates various precision strategies. Our method works with commonly used low-precision such as half-precision ($16$-bit floating points) and can be naturally extended to work with even lower precision such as $8$-bit. Experimental results show that pre-training using Collage removes the requirement of using $32$-bit floating-point copies of the model and attains similar/better training performance compared to $(16, 32)$-bit mixed-precision strategy, with up to $3.7\\times$ speedup and $\\sim 15\\%$ to $23\\%$ less memory usage in practice. 
The code is available at https://github.com/amazon-science/collage.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Yu;Gaurav Gupta;Karthick Gopalswamy;Amith R Mamidala;Hao Zhou;Jeffrey Huynh;Youngsuk Park;Ron Diamant;Anoop Deoras;Luke Huan", "authorids": "~Tao_Yu1;~Gaurav_Gupta2;~Karthick_Gopalswamy1;~Amith_R_Mamidala1;~Hao_Zhou12;~Jeffrey_Huynh1;~Youngsuk_Park1;diamant@amazon.com;~Anoop_Deoras1;~Luke_Huan2", "gender": "M;M;M;M;M;;M;;M;M", "homepage": "https://ydtydr.github.io/;http://guptagaurav.me/;;;;https://www.linkedin.com/in/jeff-huynh-9960154/;https://youngsuk0723.github.io/;;;", "dblp": ";;;;;;88/11095;;55/8761;", "google_scholar": "lbi95bUAAAAJ;Maqaq6MAAAAJ;GSrftGcAAAAJ;;8vaGcAcAAAAJ;;jWROvQ0AAAAJ;;QF_rhCIAAAAJ;", "orcid": ";;;;;;0000-0002-0970-9214;;;", "linkedin": "tao-yu-220720182/;gaurav71531/;karthickgopalswamy;amith-r-mamidala-5451bb14/;hao-zhou-55697aa4/;;y-park;;anoopdeoras/;jun-luke-huan-58963121", "or_profile": "~Tao_Yu1;~Gaurav_Gupta2;~Karthick_Gopalswamy1;~Amith_R_Mamidala1;~Hao_Zhou12;~Jeffrey_Huynh1;~Youngsuk_Park1;diamant@amazon.com;~Anoop_Deoras1;~Luke_Huan2", "aff": "Cornell University;Amazon;AWS AI ;Amazon;Amazon;;Amazon, AWS AI Labs;;Amazon;Department of Computer Science, University of Massachusetts at Amherst", "aff_domain": "cornell.edu;amazon.com;amazon.com;amazon.com;amazon.com;;amazon.com;;amazon.com;cs.umass.edu", "position": "PhD student;Applied Scientist-III;Researcher;Machine Learning Engineer;Researcher;;Research;;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\nyu2024collage,\ntitle={Collage: Light-Weight Low-Precision Strategy for {LLM} Training},\nauthor={Tao Yu and Gaurav Gupta and Karthick Gopalswamy and Amith R Mamidala and Hao Zhou and Jeffrey Huynh and Youngsuk Park and Ron Diamant and Anoop Deoras and Luke Huan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LkJ6qOMv77}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1030487, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6118304432905840221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cornell.edu;amazon.com;amazon.com;amazon.com;amazon.com;;amazon.com;;amazon.com;cs.umass.edu", "author_num": 10, "aff_unique_index": "0;1;1;1;1;1;1;2", "aff_unique_norm": "Cornell University;Amazon;University of Massachusetts Amherst", "aff_unique_dep": ";Amazon.com, Inc.;Department of Computer Science", "aff_unique_url": "https://www.cornell.edu;https://www.amazon.com;https://www.umass.edu", "aff_unique_abbr": "Cornell;Amazon;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34270", "id": "LlqphyBdeT", "proceeding": "https://proceedings.mlr.press/v235/shen24f.html", "pdf": "https://openreview.net/pdf?id=LlqphyBdeT", "openreview": "https://openreview.net/forum?id=LlqphyBdeT", "author_site": "Junhong Shen, Neil Tenenholtz, James Hall, David Alvarez-Melis, Nicol\u00f2 Fusi", "tldr": "", "abstract": "Large Language Models (LLMs) have demonstrated remarkable proficiency in understanding and generating natural language. 
However, their capabilities wane in highly specialized domains underrepresented in the pretraining corpus, such as physical and biomedical sciences. This work explores how to repurpose general LLMs into effective task solvers for specialized domains. We introduce a novel, model-agnostic framework for learning custom input tags, which are parameterized as continuous vectors appended to the LLM\u2019s embedding layer, to condition the LLM. We design two types of input tags: domain tags are used to delimit specialized representations (e.g., chemical formulas) and provide domain-relevant context; function tags are used to represent specific functions (e.g., predicting molecular properties) and compress function-solving instructions. We develop a three-stage protocol to learn these tags using auxiliary data and domain knowledge. By explicitly disentangling task domains from task functions, our method enables zero-shot generalization to unseen problems through diverse combinations of the input tags. It also boosts LLM\u2019s performance in various specialized domains, such as predicting protein or chemical properties and modeling drug-target interactions, outperforming expert models tailored to these tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junhong Shen;Neil Tenenholtz;James Brian Hall;David Alvarez-Melis;Nicolo Fusi", "authorids": "~Junhong_Shen1;~Neil_Tenenholtz1;~James_Brian_Hall1;~David_Alvarez-Melis1;~Nicolo_Fusi1", "gender": "F;;M;M;M", "homepage": "https://sjunhongshen.github.io;;https://www.microsoft.com/en-us/research/people/jamhall/;https://dmelis.github.io/;", "dblp": "256/9575;75/10171;;168/8255;86/10995", "google_scholar": "M561o6QAAAAJ;SGl2QI8AAAAJ;;XsxZrYYAAAAJ;GldD-lwAAAAJ", "orcid": "0009-0002-3156-4899;0000-0003-1250-3716;;0000-0002-9591-8986;", "linkedin": ";neil-tenenholtz/;;;", "or_profile": "~Junhong_Shen1;~Neil_Tenenholtz1;~James_Brian_Hall1;~David_Alvarez-Melis1;~Nicolo_Fusi1", "aff": "Carnegie Mellon University;American College of Radiology Data Science Institute;Microsoft;Microsoft;Microsoft", "aff_domain": "cmu.edu;acr.org;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;Senior Scientist;Researcher;Senior Researcher;Researcher", "bibtex": "@inproceedings{\nshen2024tagllm,\ntitle={Tag-{LLM}: Repurposing General-Purpose {LLM}s for Specialized Domains},\nauthor={Junhong Shen and Neil Tenenholtz and James Brian Hall and David Alvarez-Melis and Nicolo Fusi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LlqphyBdeT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1188117, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6816932966945650319&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 8, "email": "cmu.edu;acr.org;microsoft.com;microsoft.com;microsoft.com", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Carnegie Mellon University;American College of Radiology;Microsoft", "aff_unique_dep": ";Data Science Institute;Microsoft Corporation", "aff_unique_url": "https://www.cmu.edu;https://www.acr.org;https://www.microsoft.com", "aff_unique_abbr": "CMU;ACR;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning with Partial-Label and Unlabeled Data: A Uniform Treatment for Supervision 
Redundancy and Insufficiency", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34269", "id": "LmzsgSDkWs", "proceeding": "https://proceedings.mlr.press/v235/liu24ar.html", "pdf": "https://openreview.net/pdf?id=LmzsgSDkWs", "openreview": "https://openreview.net/forum?id=LmzsgSDkWs", "author_site": "Yangfan Liu, JIAQI LYU, Xin Geng, Ning Xu", "tldr": "", "abstract": "One major challenge in weakly supervised learning is learning from inexact supervision, ranging from partial labels (PLs) with *redundant* information to the extreme of unlabeled data with *insufficient* information. While recent work has made significant strides in specific inexact supervision contexts, supervision forms typically *coexist* in complex combinations. This is exemplified in *semi-supervised partial label learning*, where PLs act as the exclusive supervision in a semi-supervised setting. Current strategies addressing combined inexact scenarios are usually composite, which can lead to incremental solutions that essentially replicate existing methods. In this paper, we propose a novel approach to *uniformly* tackle both label redundancy and insufficiency, derived from a mutual information-based perspective. We design a label channel that facilitates dynamic label exchange within the candidate label sets, which identifies potential true labels and filters out likely incorrect ones, thereby minimizing error accumulation. Experimental results demonstrate the superiority of our method over existing state-of-the-art PL and semi-supervised learning approaches by directly integrating them. Furthermore, our extended experiments on partial-complementary label learning underscore the flexibility of our uniform treatment in managing diverse supervision scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yangfan Liu;Jiaqi Lv;Xin Geng;Ning Xu", "authorids": "~Yangfan_Liu1;~Jiaqi_Lv1;~Xin_Geng1;~Ning_Xu5", "gender": "M;F;M;M", "homepage": "https://palm.seu.edu.cn/homepage/liuyangfan/index.html;;http://palm.seu.edu.cn/xgeng/index.htm;http://palm.seu.edu.cn/xuning/", "dblp": ";191/9417;;04/5856-9", "google_scholar": ";PK8L9mYAAAAJ;ZOCxkIcAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yangfan_Liu1;~Jiaqi_Lv1;~Xin_Geng1;~Ning_Xu5", "aff": "Southeast University;Southeast University;Southeast University, China;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "MS student;Associate Professor;Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024learning,\ntitle={Learning with Partial-Label and Unlabeled Data: A Uniform Treatment for Supervision Redundancy and Insufficiency},\nauthor={Yangfan Liu and Jiaqi Lv and Xin Geng and Ning Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LmzsgSDkWs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 373413, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14595953672322951634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "China" }, { "title": "Relaxing the Accurate Imputation Assumption in Doubly Robust Learning for Debiased Collaborative Filtering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34268", "id": "Ln3moCobjO", "proceeding": "https://proceedings.mlr.press/v235/li24cq.html", "pdf": "https://openreview.net/pdf?id=Ln3moCobjO", "openreview": "https://openreview.net/forum?id=Ln3moCobjO", "author_site": "Haoxuan Li, Chunyuan Zheng, Shuyi Wang, Kunhan Wu, Eric Wang, Peng Wu, zhi geng, Xu Chen, Xiao-Hua Zhou", "tldr": "", "abstract": "Recommender system aims to recommend items or information that may interest users based on their behaviors and preferences. However, there may be sampling selection bias in the data collection process, i.e., the collected data is not a representative of the target population. Many debiasing methods are developed based on pseudo-labelings. Nevertheless, the validity of these methods relies heavily on accurate pseudo-labelings (i.e., the imputed labels), which is difficult to satisfy in practice. In this paper, we theoretically propose several novel doubly robust estimators that are unbiased when either (a) the pseudo-labelings deviate from the true labels with an arbitrary user-specific inductive bias, item-specific inductive bias, or a combination of both, or (b) the learned propensities are accurate. We further propose a propensity reconstruction learning approach that adaptively updates the constraint weights using an attention mechanism and effectively controls the variance. Extensive experiments show that our approach outperforms the state-of-the-art on one semi-synthetic and three real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoxuan Li;Chunyuan Zheng;Shuyi Wang;Kunhan Wu;Eric Wang;Peng Wu;Zhi Geng;Xu Chen;Xiao-Hua Zhou", "authorids": "~Haoxuan_Li6;~Chunyuan_Zheng1;~Shuyi_Wang3;~Kunhan_Wu1;~Eric_Wang3;~Peng_Wu5;~Zhi_Geng1;~Xu_Chen13;~Xiao-Hua_Zhou1", "gender": "M;M;M;;;M;M;M;", "homepage": "https://haoxuanli-pku.github.io/;;;;;https://pengwu.site/;https://stxy.btbu.edu.cn/szdw/bssds/34339356074b408c8650309f05f24558.htm;https://gsai.ruc.edu.cn/chenxu;", "dblp": "145/4965-1.html;;;;;15/6146-12;;83/6331-17;", "google_scholar": "gtDqiucAAAAJ;https://scholar.google.com/citations?hl=en;;;;https://scholar.google.com/citations?view_op=list_works;;loPoqy0AAAAJ;YJNYC40AAAAJ", "orcid": "0000-0003-3620-3769;0000-0002-0306-7310;0000-0002-5576-6308;0000-0002-8456-350X;;0000-0001-7154-8880;;0000-0003-0144-1775;", "linkedin": ";;shuyi-wang-9b6731254;;;;;;", "or_profile": "~Haoxuan_Li6;~Chunyuan_Zheng1;~Shuyi_Wang3;~Kunhan_Wu1;~Eric_Wang3;~Peng_Wu5;~Zhi_Geng1;~Xu_Chen13;~Xiao-Hua_Zhou1", "aff": "Peking University;Peking University;University of Pennsylvania;Carnegie Mellon University;;Beijing Technology and Business University;School of mathematical Science, Peking University, Peking University;Renmin University of China;", "aff_domain": "pku.edu.cn;stu.pku.edu.cn;upenn.edu;cmu.edu;;btbu.edu.cn;math.pku.edu.cn;ruc.edu.cn;", "position": "PhD student;PhD student;Undergrad student;MS student;;Associate Professor;Full Professor;Associate Professor;", "bibtex": "@inproceedings{\nli2024relaxing,\ntitle={Relaxing the Accurate Imputation Assumption in Doubly Robust Learning for Debiased Collaborative Filtering},\nauthor={Haoxuan Li and Chunyuan Zheng and Shuyi Wang and Kunhan Wu and Eric Wang and Peng Wu and Zhi Geng and Xu Chen and Xiao-Hua Zhou},\nbooktitle={Forty-first 
International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ln3moCobjO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 729692, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18385466742702955812&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "pku.edu.cn;stu.pku.edu.cn;upenn.edu;cmu.edu;;btbu.edu.cn;math.pku.edu.cn;ruc.edu.cn;", "author_num": 9, "aff_unique_index": "0;0;1;2;3;0;4", "aff_unique_norm": "Peking University;University of Pennsylvania;Carnegie Mellon University;Beijing Technology and Business University;Renmin University of China", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.upenn.edu;https://www.cmu.edu;http://www.btbu.edu.cn;http://www.ruc.edu.cn", "aff_unique_abbr": "Peking U;UPenn;CMU;BTBU;RUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Peking", "aff_country_unique_index": "0;0;1;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "DFA-RAG: Conversational Semantic Router for Large Language Model with Definite Finite Automaton", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34267", "id": "LpAzlcGzJ6", "proceeding": "https://proceedings.mlr.press/v235/sun24e.html", "pdf": "https://openreview.net/pdf?id=LpAzlcGzJ6", "openreview": "https://openreview.net/forum?id=LpAzlcGzJ6", "author_site": "Yiyou Sun, Junjie Hu, Wei Cheng, Haifeng Chen", "tldr": "", "abstract": "This paper introduces the retrieval-augmented large language model with Definite Finite Automaton (DFA-RAG), a novel framework designed to enhance the capabilities of conversational agents using large language models (LLMs). Traditional LLMs face challenges in generating regulated and compliant responses in special scenarios with predetermined response guidelines, like emotional support and customer service. Our framework addresses these challenges by embedding a Definite Finite Automaton (DFA), learned from training dialogues, within the LLM. This structured approach acts as a semantic router which enables the LLM to adhere to a deterministic response pathway. The routing is achieved by the retrieval-augmentation generation (RAG) strategy, which carefully selects dialogue examples aligned with the current conversational context. The advantages of DFA-RAG include an interpretable structure through human-readable DFA, context-aware retrieval for responses in conversations, and plug-and-play compatibility with existing LLMs. 
Extensive benchmarks validate DFA-RAG's effectiveness, indicating its potential as a valuable contribution to conversational agents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiyou Sun;Junjie Hu;Wei Cheng;Haifeng Chen", "authorids": "~Yiyou_Sun1;~Junjie_Hu2;~Wei_Cheng1;~Haifeng_Chen1", "gender": "M;M;;M", "homepage": "https://sunyiyou.github.io/;https://chengw07.github.io/;https://haifengchen.gitlab.io/intro/;https://junjiehu.github.io/", "dblp": "211/5630;89/2506-2.html;08/57-1.html;123/0773-1.html", "google_scholar": "IKqlQo4AAAAJ;PRrGVmoAAAAJ;QzakB68AAAAJ;j-42gHYAAAAJ", "orcid": ";;;0000-0001-7137-7719", "linkedin": ";wei-cheng-ml/;;junjie-hu-24b48b83/", "or_profile": "~Yiyou_Sun1;~Wei_Cheng1;~Haifeng_Chen1;~Junjie_Hu1", "aff": "University of California, Berkeley;NEC-Labs;NEC-Labs;University of Wisconsin, Madison", "aff_domain": "berkeley.edu;nec-labs.com;nec-labs.com;wisc.edu", "position": "Postdoc;Principal Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nsun2024dfarag,\ntitle={{DFA}-{RAG}: Conversational Semantic Router for Large Language Model with Definite Finite Automaton},\nauthor={Yiyou Sun and Junjie Hu and Wei Cheng and Haifeng Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LpAzlcGzJ6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4605134, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3290483695526212151&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": "berkeley.edu;nec-labs.com;nec-labs.com;wisc.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University of California, Berkeley;NEC Laboratories;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.nec-labs.com;https://www.wisc.edu", "aff_unique_abbr": "UC Berkeley;NEC-Labs;UW", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "COPAL: Continual Pruning in Large Language Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34266", "id": "Lt8Lk7IQ5b", "proceeding": "https://proceedings.mlr.press/v235/malla24a.html", "pdf": "https://openreview.net/pdf?id=Lt8Lk7IQ5b", "openreview": "https://openreview.net/forum?id=Lt8Lk7IQ5b", "author_site": "Srikanth Malla, Joon Hee Choi, Chiho Choi", "tldr": "", "abstract": "Adapting pre-trained large language models to different domains in natural language processing requires two key considerations: high computational demands and the model's inability to adapt continually. To simultaneously address both issues, this paper presents COPAL (**CO**ntinual **P**runing in **A**daptive **L**anguage settings), an algorithm developed for pruning large language generative models under a continual model adaptation setting. While avoiding resource-heavy finetuning or retraining, our pruning process is guided by the proposed sensitivity analysis. The sensitivity effectively measures the model's ability to withstand perturbations introduced by the new dataset and finds the model's weights that are relevant for all encountered datasets. As a result, COPAL allows seamless model adaptation to new domains while enhancing resource efficiency. 
Our empirical evaluation on LLMs of various sizes shows that COPAL outperforms baseline models, demonstrating its efficacy in efficiency and adaptability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Srikanth Malla;Joon Hee Choi;Chiho Choi", "authorids": "~Srikanth_Malla1;~Joon_Hee_Choi2;~Chiho_Choi2", "gender": "M;M;M", "homepage": "http://www.srikanthmalla.com;http://mllab.skku.edu/;https://chihochoi.github.io/index.html", "dblp": "223/4108;155/1950;176/1540", "google_scholar": "TjIKwLcAAAAJ;Rt1uQQcAAAAJ;iSFDVj4AAAAJ", "orcid": ";0000-0003-0816-4790;0000-0002-0196-2039", "linkedin": "srikanthmalla/;joonheechoi/;chihochoi/", "or_profile": "~Srikanth_Malla1;~Joon_Hee_Choi2;~Chiho_Choi2", "aff": "Samsung;Samsung;Samsung", "aff_domain": "samsung.com;samsung.com;samsung.com", "position": "staff machine learning engineer;Principal Researcher;Sr Staff Engineer", "bibtex": "@inproceedings{\nmalla2024copal,\ntitle={{COPAL}: Continual Pruning in Large Language Generative Models},\nauthor={Srikanth Malla and Joon Hee Choi and Chiho Choi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lt8Lk7IQ5b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 418586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16623894276813189729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "samsung.com;samsung.com;samsung.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Samsung", "aff_unique_dep": "Samsung", "aff_unique_url": "https://www.samsung.com", "aff_unique_abbr": "Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "S$\\Omega$I: Score-based O-INFORMATION Estimation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34265", "id": "LuhWZ2oJ5L", "proceeding": "https://proceedings.mlr.press/v235/bounoua24a.html", "pdf": "https://openreview.net/pdf?id=LuhWZ2oJ5L", "openreview": "https://openreview.net/forum?id=LuhWZ2oJ5L", "author_site": "Mustapha BOUNOUA, Giulio Franzese, Pietro Michiardi", "tldr": "", "abstract": "The analysis of scientific data and complex multivariate systems requires information quantities that capture relationships among multiple random variables. Recently, new information-theoretic measures have been developed to overcome the shortcomings of classical ones, such as mutual information, that are restricted to considering pairwise interactions. Among them, the concept of information synergy and redundancy is crucial for understanding the high-order dependencies between variables. One of the most prominent and versatile measures based on this concept is *O-information*, which provides a clear and scalable way to quantify the synergy-redundancy balance in multivariate systems. However, its practical application is limited to simplified cases. In this work, we introduce **S$\\Omega$I**, which allows computing *O-information* without restrictive assumptions about the system while leveraging a unique model. 
Our experiments validate our approach on synthetic data, and demonstrate the effectiveness of **S$\\Omega$I** in the context of a real-world use case.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mustapha BOUNOUA;Giulio Franzese;Pietro Michiardi", "authorids": "~Mustapha_BOUNOUA1;~Giulio_Franzese1;~Pietro_Michiardi1", "gender": "M;M;M", "homepage": "https://mustaphabounoua.github.io/;;http://www.eurecom.fr/~michiard/", "dblp": "348/9789;217/1859.html;54/3028", "google_scholar": "1ooHDEMAAAAJ;kEtx_WwAAAAJ;https://scholar.google.com.tw/citations?user=mlx1eCgAAAAJ", "orcid": "0009-0003-5244-8528;0000-0003-4244-2053;", "linkedin": "mustb/;;", "or_profile": "~Mustapha_BOUNOUA1;~Giulio_Franzese1;~Pietro_Michiardi1", "aff": "Eurecom;Eurecom;EURECOM", "aff_domain": "eurecom.fr;eurecom.fr;eurecom.fr", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nbounoua2024somegai,\ntitle={S\\${\\textbackslash}Omega\\$I: Score-based O-{INFORMATION} Estimation},\nauthor={Mustapha BOUNOUA and Giulio Franzese and Pietro Michiardi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LuhWZ2oJ5L}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8446766, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15441829042655754459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "eurecom.fr;eurecom.fr;eurecom.fr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "EURECOM", "aff_unique_dep": "", "aff_unique_url": "https://www.eurecom.fr", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Learning Causal Domain-Invariant Temporal Dynamics for Few-Shot Action Recognition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34264", "id": "LvuuYqU0BW", "proceeding": "https://proceedings.mlr.press/v235/li24h.html", "pdf": "https://openreview.net/pdf?id=LvuuYqU0BW", "openreview": "https://openreview.net/forum?id=LvuuYqU0BW", "author_site": "Yuke Li, Guangyi Chen, Ben Abramowitz, Stefano Anzellotti, Donglai Wei", "tldr": "", "abstract": "Few-shot action recognition aims at quickly adapting a pre-trained model to the novel data with a distribution shift using only a limited number of samples. Key challenges include how to identify and leverage the transferable knowledge learned by the pre-trained model. We therefore propose CDTD, or Causal Domain-Invariant Temporal Dynamics, for knowledge transfer. To identify the temporally invariant and variant representations, we employ causal representation learning methods for unsupervised pretraining, and then tune the classifier with supervision in the next stage. Specifically, we assume the domain information can be well estimated and the pre-trained temporal dynamic generation and transition models can be well transferred. During adaptation, we fix the transferable temporal dynamics and update the image encoder and domain estimator. 
The efficacy of our approach is revealed by the superior accuracy of CDTD over leading alternatives across standard few-shot action recognition datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuke Li;Guangyi Chen;Ben Abramowitz;Stefano Anzellotti;Donglai Wei", "authorids": "~Yuke_Li1;~Guangyi_Chen1;~Ben_Abramowitz1;~Stefano_Anzellotti1;~Donglai_Wei1", "gender": ";M;;M;M", "homepage": ";https://chengy12.github.io/;;https://sccnlab.bc.edu/;https://donglaiw.github.io/", "dblp": ";c/GuangyiChen-2;210/2612;;89/10116.html", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;tK5W2p0AAAAJ;xF2mhDoAAAAJ", "orcid": ";;;;0000-0002-2329-5484", "linkedin": ";;;;", "or_profile": "~Yuke_Li1;~Guangyi_Chen1;~Ben_Abramowitz1;~Stefano_Anzellotti1;~Donglai_Wei1", "aff": ";Carnegie Mellon University;;Boston College;Boston College", "aff_domain": ";cmu.edu;;bc.edu;bc.edu", "position": ";Postdoc;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024learning,\ntitle={Learning Causal Domain-Invariant Temporal Dynamics for Few-Shot Action Recognition},\nauthor={Yuke Li and Guangyi Chen and Ben Abramowitz and Stefano Anzellotti and Donglai Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LvuuYqU0BW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7728149, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8589012153024678697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";cmu.edu;;bc.edu;bc.edu", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Carnegie Mellon University;Boston College", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.bostoncollege.edu", "aff_unique_abbr": "CMU;BC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Machine Vision Therapy: Multimodal Large Language Models Can Enhance Visual Robustness via Denoising In-Context Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34263", "id": "LwOfVWgEzS", "proceeding": "https://proceedings.mlr.press/v235/huang24o.html", "pdf": "https://openreview.net/pdf?id=LwOfVWgEzS", "openreview": "https://openreview.net/forum?id=LwOfVWgEzS", "author_site": "Zhuo Huang, Chang Liu, Yinpeng Dong, Hang Su, Shibao Zheng, Tongliang Liu", "tldr": "", "abstract": "Although pre-trained models such as Contrastive Language-Image Pre-Training (CLIP) show impressive generalization results, their robustness is still limited under Out-of-Distribution (OOD) scenarios. Instead of undesirably leveraging human annotation as commonly done, it is possible to leverage the visual understanding power of Multi-modal Large Language Models (MLLMs). However, MLLMs struggle with vision problems due to task incompatibility, thus hindering their effectiveness. In this paper, we propose to effectively leverage MLLMs via Machine Vision Therapy which aims to rectify erroneous predictions of specific vision models. By supervising vision models using MLLM predictions, visual robustness can be boosted in a nearly unsupervised manner. Moreover, we propose a Denoising In-Context Learning (DICL) strategy to solve the incompatibility issue. 
Concretely, by examining the noise probability of each example through a transition matrix, we construct an instruction containing a correct exemplar and a probable erroneous one, which enables MLLMs to detect and rectify the incorrect predictions of vision models. Under mild assumptions, we theoretically show that our DICL method is guaranteed to find the ground truth. Through extensive experiments on various OOD datasets, our method demonstrates powerful capabilities for enhancing visual robustness under many OOD scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuo Huang;Chang Liu;Yinpeng Dong;Hang Su;Shibao Zheng;Tongliang Liu", "authorids": "~Zhuo_Huang2;~Chang_Liu17;~Yinpeng_Dong2;~Hang_Su3;~Shibao_Zheng1;~Tongliang_Liu1", "gender": "M;M;M;;M;M", "homepage": "https://zhuohuangai.github.io/;https://github.com/sunrise6513;https://dongyp13.github.io;https://ee.sjtu.edu.cn/FacultyDetail.aspx?id=17&infoid=66&flag=66;https://tongliang-liu.github.io/;", "dblp": ";;183/0980;20/1917;150/6667;26/5371-6", "google_scholar": ";;6_4ad84AAAAJ;;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;dxN1_X0AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Zhuo_Huang2;~Chang_Liu17;~Yinpeng_Dong2;~Shibao_Zheng1;~Tongliang_Liu1;~Hang_Su2", "aff": "University of Sydney;Shanghai Jiaotong University;Tsinghua University;Shanghai Jiaotong University;Mohamed bin Zayed University of Artificial Intelligence;Tsinghua University", "aff_domain": "uni.sydney.edu.au;sjtu.edu.cn;tsinghua.edu.cn;sjtu.edu.cn;mbzuai.ac.ae;tsinghua.edu.cn", "position": "PhD student;D.Eng;Postdoc;Instructor;Affiliated Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nhuang2024machine,\ntitle={Machine Vision Therapy: Multimodal Large Language Models Can Enhance Visual Robustness via Denoising In-Context Learning},\nauthor={Zhuo Huang and Chang Liu and Yinpeng Dong and Hang Su and Shibao Zheng and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LwOfVWgEzS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9292709, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10885369736734263730&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "uni.sydney.edu.au;sjtu.edu.cn;tsinghua.edu.cn;sjtu.edu.cn;mbzuai.ac.ae;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;1;3;2", "aff_unique_norm": "University of Sydney;Shanghai Jiao Tong University;Tsinghua University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn;https://mbzuai.ac.ae", "aff_unique_abbr": "USYD;SJTU;THU;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;2;1", "aff_country_unique": "Australia;China;United Arab Emirates" }, { "title": "Various Lengths, Constant Speed: Efficient Language Modeling with Lightning Attention", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34262", "id": "Lwm6TiUP4X", "proceeding": "https://proceedings.mlr.press/v235/qin24c.html", "pdf": "https://openreview.net/pdf?id=Lwm6TiUP4X", "openreview": "https://openreview.net/forum?id=Lwm6TiUP4X", "author_site": "Zhen Qin, Weigao Sun, Dong Li, Xuyang Shen, Weixuan Sun, Yiran Zhong", "tldr": "", "abstract": "We 
present Lightning Attention, the first linear attention implementation that maintains a constant training speed for various sequence lengths under fixed memory consumption. Due to the issue with cumulative summation operations (cumsum), previous linear attention implementations cannot achieve their theoretical advantage in a causal setting. However, this issue can be effectively solved by utilizing different attention calculation strategies to compute the different parts of attention. Specifically, we split the attention calculation into intra-blocks and inter-blocks and use conventional attention computation for intra-blocks and linear attention kernel tricks for inter-blocks. This eliminates the need for cumsum in the linear attention calculation. Furthermore, a tiling technique is adopted through both forward and backward procedures to take full advantage of the GPU hardware. To enhance accuracy while preserving efficacy, we introduce TransNormerLLM (TNL), a new architecture that is tailored to our lightning attention. We conduct rigorous testing on standard and self-collected datasets with varying model sizes and sequence lengths. TNL is notably more efficient than other language models. In addition, benchmark results indicate that TNL performs on par with state-of-the-art LLMs utilizing conventional transformer structures. The source code is released at github.com/OpenNLPLab/TransnormerLLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhen Qin;Weigao Sun;Dong Li;Xuyang Shen;Weixuan Sun;Yiran Zhong", "authorids": "~Zhen_Qin6;~Weigao_Sun1;~Dong_Li11;~Xuyang_Shen1;~Weixuan_Sun1;~Yiran_Zhong1", "gender": ";M;M;M;M;M", "homepage": "https://github.com/Doraemonzzz;https://weigao266.github.io/;;;https://weixuansun.github.io/weixuansun-github.io/;", "dblp": ";;;274/2342;186/6724;158/9624", "google_scholar": "https://scholar.google.com.sg/citations?user=IcBRtycAAAAJ;https://scholar.google.com/citations?hl=en;bxmsqZIAAAAJ;k6Q1mcoAAAAJ;vIS56AoAAAAJ;https://scholar.google.com.sg/citations?user=E9NVOBUAAAAJ", "orcid": ";;;0000-0002-1968-7055;;", "linkedin": ";weigao-sun-01ab4a1b3/;;;;", "or_profile": "~Zhen_Qin6;~Weigao_Sun1;~Dong_Li11;~Xuyang_Shen1;~Weixuan_Sun1;~Yiran_Zhong1", "aff": "TapTap;Shanghai Artificial Intelligence Laboratory;Shanghai AI Lab;Shanghai AI Lab;Tencent;Shanghai AI Lab", "aff_domain": "xd.com;pjlab.org.cn;org.cn;pjlab.org.cn;tencent.com;pjlab.org.cn", "position": "Researcher;Researcher;Researcher;Researcher;Researcher;PI", "bibtex": "@inproceedings{\nqin2024various,\ntitle={Various Lengths, Constant Speed: Efficient Language Modeling with Lightning Attention},\nauthor={Zhen Qin and Weigao Sun and Dong Li and Xuyang Shen and Weixuan Sun and Yiran Zhong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lwm6TiUP4X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 951890, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17193962114183100942&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "xd.com;pjlab.org.cn;org.cn;pjlab.org.cn;tencent.com;pjlab.org.cn", "author_num": 6, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "TapTap;Shanghai Artificial Intelligence Laboratory;Shanghai AI Lab;Tencent", "aff_unique_dep": ";;;Tencent Holdings Limited", "aff_unique_url":
"https://www.taptap.io;http://www.shailab.org/;https://www.shanghaiailab.com;https://www.tencent.com", "aff_unique_abbr": "TapTap;Shanghai AI Lab;SAIL;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "$\\texttt{MoE-RBench}$: Towards Building Reliable Language Models with Sparse Mixture-of-Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34261", "id": "LyJ85kgHFe", "proceeding": "https://proceedings.mlr.press/v235/chen24bg.html", "pdf": "https://openreview.net/pdf?id=LyJ85kgHFe", "openreview": "https://openreview.net/forum?id=LyJ85kgHFe", "author_site": "Guanjie Chen, Xinyu Zhao, Tianlong Chen, Yu Cheng", "tldr": "", "abstract": "Mixture-of-Experts (MoE) has gained increasing popularity as a promising framework for scaling up large language models (LLMs). However, the reliability assessment of MoE lags behind its surging applications. Moreover, when transferred to new domains such as in fine-tuning MoE models sometimes underperform their dense counterparts. Motivated by the research gap and counter-intuitive phenomenon, we propose $\\texttt{MoE-RBench}$, the first comprehensive assessment of SMoE reliability from three aspects: $\\textit{(i)}$ safety and hallucination, $\\textit{(ii)}$ resilience to adversarial attacks, and $\\textit{(iii)}$ out-of-distribution robustness. Extensive models and datasets are tested to compare the MoE to dense networks from these reliability dimensions. Our empirical observations suggest that with appropriate hyperparameters, training recipes, and inference techniques, we can build the MoE model more reliably than the dense LLM. In particular, we find that the robustness of SMoE is sensitive to the basic training settings. We hope that this study can provide deeper insights into how to adapt the pre-trained MoE model to other tasks with higher-generation security, quality, and stability. 
Codes are available at https://github.com/UNITES-Lab/MoE-RBench.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guanjie Chen;Xinyu Zhao;Tianlong Chen;Yu Cheng", "authorids": "~Guanjie_Chen3;~Xinyu_Zhao3;~Tianlong_Chen1;~Yu_Cheng1", "gender": "M;;M;M", "homepage": "https://guanjiechen118.github.io/;https://zhaocinyu.github.io/;https://tianlong-chen.github.io;https://ych133.github.io", "dblp": ";;;96/3060-1.html", "google_scholar": "cpBU1VgAAAAJ;https://scholar.google.com/citations?hl=en;LE3ctn0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0009-0000-0253-5488;0000-0001-7774-8197;", "linkedin": ";;tianlong-chen-783862167/;chengyu05/", "or_profile": "~Guanjie_Chen3;~Xinyu_Zhao3;~Tianlong_Chen1;~Yu_Cheng1", "aff": "Nanjing University;Peking University;Harvard University;The Chinese University of Hong Kong", "aff_domain": "smail.nju.edu.cn;pku.edu.cn;harvard.edu;cuhk.edu.hk", "position": "Undergrad student;MS student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nchen2024textttmoerbench,\ntitle={\\${\\textbackslash}texttt\\{MoE-{RB}ench\\}\\$: Towards Building Reliable Language Models with Sparse Mixture-of-Experts},\nauthor={Guanjie Chen and Xinyu Zhao and Tianlong Chen and Yu Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LyJ85kgHFe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 602801, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "smail.nju.edu.cn;pku.edu.cn;harvard.edu;cuhk.edu.hk", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Nanjing University;Peking University;Harvard University;Chinese University of Hong Kong", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nju.edu.cn;http://www.pku.edu.cn;https://www.harvard.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "Nanjing U;Peking U;Harvard;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Theoretical insights for diffusion guidance: A case study for Gaussian mixture models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34260", "id": "M1ADedSnlJ", "proceeding": "https://proceedings.mlr.press/v235/wu24b.html", "pdf": "https://openreview.net/pdf?id=M1ADedSnlJ", "openreview": "https://openreview.net/forum?id=M1ADedSnlJ", "author_site": "Yuchen Wu, Minshuo Chen, Zihao Li, Mengdi Wang, Yuting Wei", "tldr": "", "abstract": "Diffusion models benefit from instillation of task-specific information into the score function to steer the sample generation towards desired properties. Such information is referred to as guidance. For example, in text-to-image synthesis, text input is encoded as guidance to generate semantically aligned images. Proper guidance inputs are closely tied to the performance of diffusion models. A common observation is that strong guidance promotes a tight alignment to the task-specific information, while reducing the diversity of the generated samples. In this paper, we provide the first theoretical study towards the influence of guidance on diffusion models in the context of Gaussian mixture models.
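The diversity-reduction effect mentioned above can be checked numerically. This toy sketch (an illustration with assumed parameters, not the paper's analysis) tilts a one-dimensional two-component Gaussian mixture by the class posterior raised to a guidance weight w and evaluates the differential entropy on a grid; the entropy shrinks as w grows.

```python
# Guidance as posterior tilting on a 1-D Gaussian mixture (toy illustration).
import numpy as np
from scipy.stats import norm

x = np.linspace(-10, 10, 20001)
dx = x[1] - x[0]
p0, p1 = norm.pdf(x, -2, 1.0), norm.pdf(x, 2, 1.0)
p = 0.5 * p0 + 0.5 * p1          # unconditional mixture density
post1 = 0.5 * p1 / p             # posterior p(y=1 | x)

for w in [0.0, 0.5, 1.0, 2.0, 4.0]:
    guided = p * post1**w        # guided density, up to normalization
    guided /= guided.sum() * dx
    ent = -np.sum(guided * np.log(guided + 1e-300)) * dx
    print(f"w={w:3.1f}  differential entropy ~ {ent:.3f} nats")
```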
Under mild conditions, we prove that incorporating diffusion guidance not only boosts prediction confidence but also diminishes distribution diversity, leading to a reduction in the differential entropy of the output distribution. Our analysis covers the widely used DDPM and DDIM sampling schemes, and leverages comparison inequalities in differential equations as well as the Fokker-Planck equation that characterizes the evolution of the probability density function, which may be of independent theoretical interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Wu;Minshuo Chen;Zihao Li;Mengdi Wang;Yuting Wei", "authorids": "~Yuchen_Wu1;~Minshuo_Chen1;~Zihao_Li3;~Mengdi_Wang1;~Yuting_Wei1", "gender": "F;M;M;F;F", "homepage": "https://wuyc0114.github.io./;https://minshuochen.github.io;;http://mwang.princeton.edu;https://yutingwei.github.io/", "dblp": "26/317-2;217/1509;;;184/3856", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;qU9WvTgAAAAJ;;;fsbXdAYAAAAJ", "orcid": "0000-0002-9538-4558;;;;", "linkedin": "yuchen-wu-aab9b71aa/;;zihao-li-48b313235/;;", "or_profile": "~Yuchen_Wu1;~Minshuo_Chen1;~Zihao_Li3;~Mengdi_Wang1;~Yuting_Wei1", "aff": "The Wharton School, University of Pennsylvania;Princeton University;Princeton University;Princeton University;The Wharton School, University of Pennsylvania", "aff_domain": "wharton.upenn.edu;princeton.edu;princeton.edu;princeton.edu;wharton.upenn.edu", "position": "Postdoc;Postdoc;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwu2024theoretical,\ntitle={Theoretical insights for diffusion guidance: A case study for Gaussian mixture models},\nauthor={Yuchen Wu and Minshuo Chen and Zihao Li and Mengdi Wang and Yuting Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M1ADedSnlJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4792820, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16959469163613197696&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "wharton.upenn.edu;princeton.edu;princeton.edu;princeton.edu;wharton.upenn.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "University of Pennsylvania;Princeton University", "aff_unique_dep": "The Wharton School;", "aff_unique_url": "https://www.wharton.upenn.edu;https://www.princeton.edu", "aff_unique_abbr": "UPenn Wharton;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Key Claims in LLM Research Have a Long Tail of Footnotes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34259", "id": "M2cwkGleRL", "proceeding": "https://proceedings.mlr.press/v235/rogers24a.html", "pdf": "https://openreview.net/pdf?id=M2cwkGleRL", "openreview": "https://openreview.net/forum?id=M2cwkGleRL", "author_site": "Anna Rogers, Sasha Luccioni", "tldr": "", "abstract": "Much of the recent discourse within the ML community has been centered around Large Language Models (LLMs), their functionality and potential -- yet not only do we not have a working definition of LLMs, but much of this discourse relies on claims and assumptions that are worth re-examining.
We contribute a definition of LLMs, critically examine five common claims regarding their properties (including 'emergent properties'), and conclude with suggestions for future research directions and their framing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anna Rogers;Sasha Luccioni", "authorids": "~Anna_Rogers1;~Sasha_Luccioni1", "gender": "F;F", "homepage": "https://annargrs.github.io;http://sashaluccioni.com/", "dblp": "203/9462;162/5449", "google_scholar": "5oCYOE0AAAAJ;", "orcid": "0000-0002-4845-4023;0000-0001-6238-7050", "linkedin": "annargrs;alexandraluccioniphd/", "or_profile": "~Anna_Rogers1;~Alexandra_Luccioni1", "aff": "IT University of Copenhagen;Hugging Face", "aff_domain": "itu.dk;huggingface.co", "position": "Associate Professor;Researcher", "bibtex": "@inproceedings{\nrogers2024position,\ntitle={Position: Key Claims in {LLM} Research Have a Long Tail of Footnotes},\nauthor={Anna Rogers and Sasha Luccioni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M2cwkGleRL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 238209, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4840072130687808084&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "itu.dk;huggingface.co", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "IT University of Copenhagen;Hugging Face", "aff_unique_dep": ";", "aff_unique_url": "https://itu.dk;https://huggingface.co", "aff_unique_abbr": "ITU;Hugging Face", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Denmark;United States" }, { "title": "Sequential Asynchronous Action Coordination in Multi-Agent Systems: A Stackelberg Decision Transformer Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34258", "id": "M3qRRkOuTN", "proceeding": "https://proceedings.mlr.press/v235/zhang24au.html", "pdf": "https://openreview.net/pdf?id=M3qRRkOuTN", "openreview": "https://openreview.net/forum?id=M3qRRkOuTN", "author_site": "Bin Zhang, Hangyu Mao, Lijuan Li, Zhiwei Xu, dapeng Li, Rui Zhao, Guoliang Fan", "tldr": "", "abstract": "Asynchronous action coordination presents a pervasive challenge in Multi-Agent Systems (MAS), which can be represented as a Stackelberg game (SG). However, the scalability of existing Multi-Agent Reinforcement Learning (MARL) methods based on SG is severely restricted by network architectures or environmental settings. To address this issue, we propose the Stackelberg Decision Transformer (STEER). It efficiently manages decision-making processes by incorporating the hierarchical decision structure of SG, the modeling capability of autoregressive sequence models, and the exploratory learning methodology of MARL. Our approach exhibits broad applicability across diverse task types and environmental configurations in MAS. 
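As background for the SG structure STEER builds on, here is a minimal, self-contained sketch (random payoffs, purely illustrative) of a pure-strategy Stackelberg equilibrium computed by enumeration: the leader commits to the action whose follower best response yields the highest leader payoff.

```python
# Pure-strategy Stackelberg equilibrium of a random bimatrix game (toy).
import numpy as np

rng = np.random.default_rng(0)
leader_payoff = rng.integers(0, 10, size=(3, 4))    # rows: leader actions
follower_payoff = rng.integers(0, 10, size=(3, 4))  # cols: follower actions

best_value, best_pair = -np.inf, None
for a in range(leader_payoff.shape[0]):
    b = int(np.argmax(follower_payoff[a]))  # follower best-responds to a
    if leader_payoff[a, b] > best_value:
        best_value, best_pair = leader_payoff[a, b], (a, b)

print("leader commits to", best_pair[0], "follower replies", best_pair[1],
      "leader payoff", best_value)
```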
Experimental results demonstrate both the convergence of our method towards Stackelberg equilibrium strategies and its superiority over strong baselines in complex scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bin Zhang;Hangyu Mao;Lijuan Li;Zhiwei Xu;Dapeng Li;Rui Zhao;Guoliang Fan", "authorids": "~Bin_Zhang12;~Hangyu_Mao2;~Lijuan_Li2;~Zhiwei_Xu3;~Dapeng_Li2;~Rui_Zhao6;~Guoliang_Fan3", "gender": ";;F;M;;M;M", "homepage": ";;;;;http://zhaorui.xyz/;http://www.ia.ac.cn", "dblp": ";;06/3198-2;262/0620-5;;26/2578-1;f/GuoliangFan", "google_scholar": ";;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.co.uk/citations?user=kZoG7ssAAAAJ;;1c9oQNMAAAAJ;", "orcid": ";;0000-0002-2758-7433;0000-0002-0754-5295;;;", "linkedin": ";;;;;;", "or_profile": "~Bin_Zhang12;~Hangyu_Mao2;~Lijuan_Li2;~Zhiwei_Xu3;~Dapeng_Li2;~Rui_Zhao6;~Guoliang_Fan3", "aff": ";;;Institute of Automation, Chinese Academy of Sciences;;SenseTime Research;Institute of Automation, Chinese Academy of Sciences", "aff_domain": ";;;ia.ac.cn;;sensetime.com;ia.ac.cn", "position": ";;;PhD student;;Researcher;Full Professor", "bibtex": "@inproceedings{\nzhang2024sequential,\ntitle={Sequential Asynchronous Action Coordination in Multi-Agent Systems: A Stackelberg Decision Transformer Approach},\nauthor={Bin Zhang and Hangyu Mao and Lijuan Li and Zhiwei Xu and Dapeng Li and Rui Zhao and Guoliang Fan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M3qRRkOuTN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8050975, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6919078722480087898&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";;;ia.ac.cn;;sensetime.com;ia.ac.cn", "author_num": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;SenseTime", "aff_unique_dep": "Institute of Automation;SenseTime Research", "aff_unique_url": "http://www.ia.cas.cn;https://www.sensetime.com", "aff_unique_abbr": "CAS;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "DUPLEX: Dual GAT for Complex Embedding of Directed Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34257", "id": "M3uv4qDKOL", "proceeding": "https://proceedings.mlr.press/v235/ke24c.html", "pdf": "https://openreview.net/pdf?id=M3uv4qDKOL", "openreview": "https://openreview.net/forum?id=M3uv4qDKOL", "author_site": "Zhaoru Ke, Hang Yu, Jianguo Li, Haipeng Zhang", "tldr": "", "abstract": "Current directed graph embedding methods build upon undirected techniques but often inadequately capture directed edge information, leading to challenges such as: (1) Suboptimal representations for nodes with low in/out-degrees, due to the insufficient neighbor interactions; (2) Limited inductive ability for representing new nodes post-training; (3) Narrow generalizability, as training is overly coupled with specific tasks. In response, we propose DUPLEX, an inductive framework for complex embeddings of directed graphs. It (1) leverages Hermitian adjacency matrix decomposition for comprehensive neighbor integration, (2) employs a dual GAT encoder for directional neighbor modeling, and (3) features two parameter-free decoders to decouple training from particular tasks. 
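To make the Hermitian ingredient concrete, this sketch builds the standard Hermitian adjacency matrix of a toy digraph (an assumed example, not data from the paper): reciprocal edges map to 1, while a one-way edge u->v maps to i at (u, v) and -i at (v, u).

```python
# Hermitian adjacency matrix of a small directed graph.
import numpy as np

edges = [(0, 1), (1, 0), (1, 2), (3, 2)]  # toy digraph on 4 nodes
n = 4
A = np.zeros((n, n))
for u, v in edges:
    A[u, v] = 1.0

H = np.zeros((n, n), dtype=complex)
H[(A == 1) & (A.T == 1)] = 1.0     # digon (edges both ways): real entry
H[(A == 1) & (A.T == 0)] = 1j      # forward-only edge
H[(A == 0) & (A.T == 1)] = -1j     # reverse-only edge
assert np.allclose(H, H.conj().T)  # Hermitian by construction
print(H)
```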
DUPLEX outperforms state-of-the-art models, especially for nodes with sparse connectivity, and demonstrates robust inductive capability and adaptability across various tasks. The code will be available upon publication.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhaoru Ke;Hang Yu;Jianguo Li;Haipeng Zhang", "authorids": "~Zhaoru_Ke1;~Hang_Yu1;~Jianguo_Li2;~Haipeng_Zhang3", "gender": "F;M;M;M", "homepage": ";;https://faculty.sist.shanghaitech.edu.cn/zhanghp/;https://sites.google.com/site/leeplus/", "dblp": ";74/2568-2;;70/6237", "google_scholar": ";;377DmKgAAAAJ;n44GlFcAAAAJ", "orcid": "0000-0001-8991-5239;;;", "linkedin": ";hang-yu-7ba38844/;;", "or_profile": "~Zhaoru_Ke1;~Hang_Yu1;~Haipeng_Zhang3;~jianguo_Li1", "aff": "ShanghaiTech University;Ant Group;ShanghaiTech University;Ant Group", "aff_domain": "shanghaitech.edu.cn;antgroup.com;shanghaitech.edu.cn;antgroup.com", "position": "PhD student;Senior Algorithm Expert;Assistant Professor;Director", "bibtex": "@inproceedings{\nke2024duplex,\ntitle={{DUPLEX}: Dual {GAT} for Complex Embedding of Directed Graphs},\nauthor={Zhaoru Ke and Hang Yu and Jianguo Li and Haipeng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M3uv4qDKOL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 795836, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16157431825328787641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "shanghaitech.edu.cn;antgroup.com;shanghaitech.edu.cn;antgroup.com", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "ShanghaiTech University;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "ShanghaiTech;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Overcoming Saturation in Density Ratio Estimation by Iterated Regularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34256", "id": "M407RM0z6h", "proceeding": "https://proceedings.mlr.press/v235/gruber24b.html", "pdf": "https://openreview.net/pdf?id=M407RM0z6h", "openreview": "https://openreview.net/forum?id=M407RM0z6h", "author_site": "Lukas Gruber, Markus Holzleitner, johannes lehner, Sepp Hochreiter, Werner Zellinger", "tldr": "", "abstract": "Estimating the ratio of two probability densities from finitely many samples, is a central task in machine learning and statistics. In this work, we show that a large class of kernel methods for density ratio estimation suffers from error saturation, which prevents algorithms from achieving fast error convergence rates on highly regular learning problems. To resolve saturation, we introduce iterated regularization in density ratio estimation to achieve fast error rates. 
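The mechanism is easiest to see in the textbook instance of iterated regularization, iterated Tikhonov for kernel least squares, sketched below on assumed synthetic data; each pass folds the previous solution back into the target, raising the qualification of the regularizer beyond one-shot ridge. This illustrates the idea only; the paper's estimators operate on density ratios.

```python
# Iterated Tikhonov for kernel least squares (illustrative data and kernel).
import numpy as np

def rbf_kernel(X, Y, gamma=10.0):
    d2 = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

rng = np.random.default_rng(1)
X = rng.uniform(-1, 1, (200, 1))
y = np.sin(3 * X[:, 0]) + 0.1 * rng.normal(size=200)

K, lam = rbf_kernel(X, X), 1e-1
alpha = np.zeros(200)
for _ in range(5):  # number of iterations acts as the regularization order
    alpha = np.linalg.solve(K + lam * np.eye(200), y + lam * alpha)

print("train MSE:", np.mean((K @ alpha - y) ** 2))
```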
Our methods outperform their non-iteratively regularized versions on benchmarks for density ratio estimation as well as on large-scale evaluations for importance-weighted ensembling of deep unsupervised domain adaptation models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lukas Gruber;Markus Holzleitner;Johannes Lehner;Sepp Hochreiter;Werner Zellinger", "authorids": "~Lukas_Gruber2;~Markus_Holzleitner1;~Johannes_Lehner1;~Sepp_Hochreiter1;~Werner_Zellinger1", "gender": "Not Specified;;M;M;", "homepage": "https://www.jku.at/en/institute-for-machine-learning/;;;https://www.jku.at/en/institute-for-machine-learning/about-us/team/sepp-hochreiter/;", "dblp": "18/7703;271/0626;232/0972;h/SeppHochreiter.html;", "google_scholar": ";518MXv8AAAAJ;W-kY2_oAAAAJ;https://scholar.google.at/citations?user=tvUH3WMAAAAJ;", "orcid": ";;;0000-0001-7449-2528;", "linkedin": ";;;https://linkedin.com/in/sepp-hochreiter-41514846;", "or_profile": "~Lukas_Gruber2;~Markus_Holzleitner1;~Johannes_Lehner1;~Sepp_Hochreiter1;~Werner_Zellinger1", "aff": "Johannes Kepler University Linz;Johannes Kepler University Linz;Johannes Kepler University Linz;Johannes Kepler University Linz;", "aff_domain": "jku.at;jku.at;jku.at;jku.at;", "position": "PhD student;Postdoc;PhD student;Full Professor;", "bibtex": "@inproceedings{\ngruber2024overcoming,\ntitle={Overcoming Saturation in Density Ratio Estimation by Iterated Regularization},\nauthor={Lukas Gruber and Markus Holzleitner and Johannes Lehner and Sepp Hochreiter and Werner Zellinger},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M407RM0z6h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 603345, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8419038629199975791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "jku.at;jku.at;jku.at;jku.at;", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Johannes Kepler University", "aff_unique_dep": "", "aff_unique_url": "https://www.jku.at", "aff_unique_abbr": "JKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Linz", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Austria" }, { "title": "Embodied CoT Distillation From LLM To Off-the-shelf Agents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34255", "id": "M4Htd52HMH", "proceeding": "https://proceedings.mlr.press/v235/choi24d.html", "pdf": "https://openreview.net/pdf?id=M4Htd52HMH", "openreview": "https://openreview.net/forum?id=M4Htd52HMH", "author_site": "Wonje Choi, Woo Kyung Kim, Minjong Yoo, Honguk Woo", "tldr": "", "abstract": "We address the challenge of utilizing large language models (LLMs) for complex embodied tasks, in environments where decision-making systems must operate in a timely manner on capacity-limited, off-the-shelf devices. We present DeDer, a framework for decomposing and distilling the embodied reasoning capabilities from LLMs to efficient, small language model (sLM)-based policies. In DeDer, the decision-making process of LLM-based strategies is restructured into a hierarchy with a reasoning-policy and a planning-policy. The reasoning-policy is distilled from the data that is generated through the embodied in-context learning and self-verification of an LLM, so it can produce effective rationales.
The planning-policy, guided by the rationales, can render optimized plans efficiently. In turn, DeDer allows for adopting sLMs for both policies, deployed on off-the-shelf devices. Furthermore, to enhance the quality of intermediate rationales, specific to embodied tasks, we devise the embodied knowledge graph, and to generate multiple rationales timely through a single inference, we also use the contrastively prompted attention model. Our experiments with the ALFRED benchmark demonstrate that DeDer surpasses leading language planning and distillation approaches, indicating the applicability and efficiency of sLM-based embodied policies derived through DeDer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wonje Choi;Woo Kyung Kim;Minjong Yoo;Honguk Woo", "authorids": "~Wonje_Choi2;~Woo_Kyung_Kim1;~Minjong_Yoo2;~Honguk_Woo1", "gender": "M;M;M;M", "homepage": "http://115.145.179.118/students/;;https://sites.google.com/view/csi-agent-group/about;", "dblp": "163/3705-3;306/0140;63/6072;253/2606.html", "google_scholar": "L4d1CjEAAAAJ;OFFacb0AAAAJ;https://scholar.google.co.kr/citations?user=Gaxjc7UAAAAJ;O6L-PkgAAAAJ", "orcid": "0000-0001-5138-0101;0000-0001-6214-4171;0000-0001-6948-3440;", "linkedin": ";;;", "or_profile": "~Wonje_Choi2;~Woo_Kyung_Kim1;~Honguk_Woo1;~minjong_Yoo1", "aff": "Sung Kyun Kwan University;Sungkyunkwan University;Sungkyunkwan University;Sungkyunkwan University", "aff_domain": "skku.edu;skku.edu;skku.edu;skku.edu", "position": "PhD student;PhD student;Associate Professor;PhD student", "bibtex": "@inproceedings{\nchoi2024embodied,\ntitle={Embodied CoT Distillation From {LLM} To Off-the-shelf Agents},\nauthor={Wonje Choi and Woo Kyung Kim and Minjong Yoo and Honguk Woo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M4Htd52HMH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3295529, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17538030351028788212&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "skku.edu;skku.edu;skku.edu;skku.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Sungkyunkwan University", "aff_unique_dep": "", "aff_unique_url": "https://www.skku.edu", "aff_unique_abbr": "SKKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "No Double Descent in Principal Component Regression: A High-Dimensional Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34254", "id": "M4ejBhNNrn", "proceeding": "https://proceedings.mlr.press/v235/gedon24a.html", "pdf": "https://openreview.net/pdf?id=M4ejBhNNrn", "openreview": "https://openreview.net/forum?id=M4ejBhNNrn", "author_site": "Daniel Gedon, Antonio Ribeiro, Thomas Sch\u00f6n", "tldr": "", "abstract": "Understanding the generalization properties of large-scale models necessitates incorporating realistic data assumptions into the analysis. Therefore, we consider Principal Component Regression (PCR)---combining principal component analysis and linear regression---on data from a low-dimensional manifold. We present an analysis of PCR when the data is sampled from a spiked covariance model, obtaining fundamental asymptotic guarantees for the generalization risk of this model. 
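For reference, a minimal PCR pipeline on synthetic spiked-covariance data (all dimensions assumed): center the inputs, project onto the top-k principal components, and fit least squares in the reduced space.

```python
# Principal component regression on a spiked covariance model (toy sizes).
import numpy as np

rng = np.random.default_rng(2)
n, d, k = 500, 200, 5
U = np.linalg.qr(rng.normal(size=(d, k)))[0]   # spiked directions
Z = rng.normal(size=(n, k))                    # latent coordinates
X = Z @ (5.0 * U.T) + rng.normal(size=(n, d))  # strong signal + isotropic noise
y = Z @ rng.normal(size=k) + 0.1 * rng.normal(size=n)

Xc = X - X.mean(axis=0)
_, _, Vt = np.linalg.svd(Xc, full_matrices=False)
P = Vt[:k].T                                    # top-k principal directions
w, *_ = np.linalg.lstsq(Xc @ P, y, rcond=None)  # OLS in the k-dim PC space
print("train R^2:", 1 - np.var(y - Xc @ P @ w) / np.var(y))
```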
Our analysis is based on random matrix theory and allows us to provide guarantees for high-dimensional data. We additionally present an analysis of the distribution shift between training and test data. The results allow us to disentangle the effects of (1) the number of parameters, (2) the data-generating model, and (3) model misspecification on the generalization risk. The use of PCR effectively regularizes the model and prevents the interpolation peak of the double descent. Our theoretical findings are empirically validated in simulation, demonstrating their practical relevance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Gedon;Antonio H. Ribeiro;Thomas B. Sch\u00f6n", "authorids": "~Daniel_Gedon1;~Antonio_H._Ribeiro1;~Thomas_B._Sch\u00f6n1", "gender": "M;M;M", "homepage": "https://dgedon.github.io/;https://antonior92.github.io/;http://user.it.uu.se/~thosc112/index.html", "dblp": "253/6219;202/1699.html;85/4891", "google_scholar": "https://scholar.google.de/citations?user=5wF916YAAAAJ;https://scholar.google.com.br/citations?user=5t_sZdMAAAAJ;https://scholar.google.se/citations?user=FUqUC2oAAAAJ", "orcid": "0000-0003-4397-9952;0000-0003-3632-8529;0000-0001-5183-234X", "linkedin": "dgedon/;;thomas-sch%C3%B6n-2b587b1/", "or_profile": "~Daniel_Gedon1;~Antonio_H._Ribeiro1;~Thomas_B._Sch\u00f6n1", "aff": "Uppsala University;Uppsala University;Uppsala University", "aff_domain": "uu.se;uu.se;uu.se", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ngedon2024no,\ntitle={No Double Descent in Principal Component Regression: A High-Dimensional Analysis},\nauthor={Daniel Gedon and Antonio H. Ribeiro and Thomas B. Sch{\\\"o}n},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M4ejBhNNrn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 653171, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17210501602492072343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uu.se;uu.se;uu.se", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Uppsala University", "aff_unique_dep": "", "aff_unique_url": "https://www.uu.se", "aff_unique_abbr": "UU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "title": "SAM as the Guide: Mastering Pseudo-Label Refinement in Semi-Supervised Referring Expression Segmentation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34253", "id": "M5kn9NKIs4", "proceeding": "https://proceedings.mlr.press/v235/yang24k.html", "pdf": "https://openreview.net/pdf?id=M5kn9NKIs4", "openreview": "https://openreview.net/forum?id=M5kn9NKIs4", "author_site": "Danni Yang, Jiayi Ji, Yiwei Ma, Tianyu Guo, Haowei Wang, Xiaoshuai Sun, Rongrong Ji", "tldr": "", "abstract": "In this paper, we introduce SemiRES, a semi-supervised framework that effectively leverages a combination of labeled and unlabeled data to perform RES. A significant hurdle in applying semi-supervised techniques to RES is the prevalence of noisy pseudo-labels, particularly at the boundaries of objects. SemiRES incorporates the Segment Anything Model (SAM), renowned for its precise boundary demarcation, to improve the accuracy of these pseudo-labels.
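One plausible reading of this refinement step, sketched with assumed binary numpy masks and hypothetical helper names (not the authors' implementation): score each SAM candidate against the noisy pseudo-label by intersection-over-union and keep the best match, falling back to the raw pseudo-label when nothing overlaps well enough.

```python
# IoU-based selection of a SAM candidate mask for a noisy pseudo-label.
import numpy as np

def iou(a: np.ndarray, b: np.ndarray) -> float:
    union = np.logical_or(a, b).sum()
    return np.logical_and(a, b).sum() / union if union else 0.0

def refine_pseudo_label(pseudo: np.ndarray, candidates: list, thresh: float = 0.5):
    scores = [iou(pseudo, m) for m in candidates]
    best = int(np.argmax(scores))
    return candidates[best] if scores[best] >= thresh else pseudo

rng = np.random.default_rng(0)
pseudo = rng.random((32, 32)) > 0.5
candidates = [rng.random((32, 32)) > 0.5 for _ in range(3)]
print(refine_pseudo_label(pseudo, candidates).shape)
```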
Within SemiRES, we offer two alternative matching strategies: IoU-based Optimal Matching (IOM) and Composite Parts Integration (CPI). These strategies are designed to extract the most accurate masks from SAM's output, thus guiding the training of the student model with enhanced precision. In instances where a precise mask cannot be matched from the available candidates, we develop the Pixel-Wise Adjustment (PWA) strategy, guiding the student model's training directly by the pseudo-labels. Extensive experiments on three RES benchmarks (RefCOCO, RefCOCO+, and G-Ref) reveal its superior performance compared to fully supervised methods, especially in low-data scenarios. Remarkably, with only 1% labeled data, our SemiRES outperforms the supervised baseline by a large margin, e.g., +18.64% gains on the RefCOCO val set.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Danni Yang;Jiayi Ji;Yiwei Ma;Tianyu Guo;Haowei Wang;Xiaoshuai Sun;Rongrong Ji", "authorids": "~Danni_Yang1;~Jiayi_Ji1;~Yiwei_Ma1;~Tianyu_Guo3;~Haowei_Wang1;~Xiaoshuai_Sun3;~Rongrong_Ji5", "gender": "F;M;M;M;M;M;", "homepage": "https://mac.xmu.edu.cn/members.htm;https://scholar.google.com/citations?user=xp_rICcAAAAJ&hl=zh-CN;https://xmu-xiaoma666.github.io/;https://mr-neko.github.io;https://sites.google.com/view/xssun;http://mac.xmu.edu.cn/rrji-en.html;https://github.com/lakergogogo", "dblp": ";250/9459;;94/10479-1;26/5787.html;86/5681;", "google_scholar": ";xp_rICcAAAAJ;KIDY5pUAAAAJ;https://scholar.google.com.hk/citations?user=SkV_NNsAAAAJ;KPMK3B4AAAAJ;;", "orcid": ";0000-0002-9956-6308;0000-0002-8744-3423;0009-0006-0289-9672;0000-0003-3912-9306;;", "linkedin": ";;;;;;", "or_profile": "~Danni_Yang1;~Jiayi_Ji1;~Yiwei_Ma1;~Haowei_Wang1;~Xiaoshuai_Sun3;~Rongrong_Ji5;~\u5929\u5b87_\u90ed1", "aff": "Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "position": "MS student;Postdoc;PhD student;MS student;Associate Professor;Full Professor;MS student", "bibtex": "@inproceedings{\nyang2024sam,\ntitle={{SAM} as the Guide: Mastering Pseudo-Label Refinement in Semi-Supervised Referring Expression Segmentation},\nauthor={Danni Yang and Jiayi Ji and Yiwei Ma and Tianyu Guo and Haowei Wang and Xiaoshuai Sun and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M5kn9NKIs4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9568493, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2925730082720050554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Xiamen University", "aff_unique_dep": "", "aff_unique_url": "https://www.xmu.edu.cn", "aff_unique_abbr": "XMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Hypergraph-enhanced Dual Semi-supervised Graph Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34252", "id": "M5ne8enLcr", "proceeding": "https://proceedings.mlr.press/v235/ju24a.html", "pdf": "https://openreview.net/pdf?id=M5ne8enLcr",
"openreview": "https://openreview.net/forum?id=M5ne8enLcr", "author_site": "Wei Ju, Zhengyang Mao, Siyu Yi, Yifang Qin, Yiyang Gu, Zhiping Xiao, Yifan Wang, Xiao Luo, Ming Zhang", "tldr": "", "abstract": "In this paper, we study semi-supervised graph classification, which aims at accurately predicting the categories of graphs in scenarios with limited labeled graphs and abundant unlabeled graphs. Despite the promising capability of graph neural networks (GNNs), they typically require a large number of costly labeled graphs, while a wealth of unlabeled graphs fail to be effectively utilized. Moreover, GNNs are inherently limited to encoding local neighborhood information using message-passing mechanisms, thus lacking the ability to model higher-order dependencies among nodes. To tackle these challenges, we propose a Hypergraph-Enhanced DuAL framework named HEAL for semi-supervised graph classification, which captures graph semantics from the perspective of the hypergraph and the line graph, respectively. Specifically, to better explore the higher-order relationships among nodes, we design a hypergraph structure learning to adaptively learn complex node dependencies beyond pairwise relations. Meanwhile, based on the learned hypergraph, we introduce a line graph to capture the interaction between hyperedges, thereby better mining the underlying semantic structures. Finally, we develop a relational consistency learning to facilitate knowledge transfer between the two branches and provide better mutual guidance. Extensive experiments on real-world graph datasets verify the effectiveness of the proposed method against existing state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Ju;Zhengyang Mao;Siyu Yi;Yifang Qin;Yiyang Gu;Zhiping Xiao;Yifan Wang;Xiao Luo;Ming Zhang", "authorids": "~Wei_Ju1;~Zhengyang_Mao1;~Siyu_Yi1;~Yifang_Qin1;~Yiyang_Gu1;~Zhiping_Xiao1;~Yifan_Wang19;~Xiao_Luo3;~Ming_Zhang5", "gender": ";M;F;M;;F;M;M;F", "homepage": ";;;https://yifang-qin.github.io/;;https://patriciaxiao.github.io/www/;https://jamesyifan.github.io/;http://luoxiao12.github.io;https://cs.pku.edu.cn/info/1080/1371.htm", "dblp": ";354/6192.html;;59/11524;;176/5397-1.html;47/6959-14;50/1585-1;73/1844-4", "google_scholar": ";kiVujZgAAAAJ;Jr2ELOMAAAAJ;DK1jxxYAAAAJ;;tF8GQawAAAAJ;olsrsjEAAAAJ;https://scholar.google.com.hk/citations?;LbzoQBsAAAAJ", "orcid": ";0000-0002-2277-6008;0000-0001-5124-2382;0000-0002-7520-8039;;0000-0002-8583-4789;0000-0001-7764-8698;;0000-0002-9809-3430", "linkedin": ";;;;;zpxiao/;;%E9%9C%84-%E7%BD%97-303548214/;", "or_profile": "~Wei_Ju1;~Zhengyang_Mao1;~Siyu_Yi1;~Yifang_Qin1;~Yiyang_Gu1;~Zhiping_Xiao1;~Yifan_Wang19;~Xiao_Luo3;~Ming_Zhang5", "aff": ";Peking University;Nankai University;Peking University;;University of California, Los Angeles;University of International Business and Economics;University of California, Los Angeles;Peking University", "aff_domain": ";pku.edu.cn;nankai.edu.cn;pku.edu.cn;;cs.ucla.edu;uibe.edu.cn;cs.ucla.edu;pku.edu.cn", "position": ";MS student;PhD student;PhD student;;PhD student;Assistant Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nju2024hypergraphenhanced,\ntitle={Hypergraph-enhanced Dual Semi-supervised Graph Classification},\nauthor={Wei Ju and Zhengyang Mao and Siyu Yi and Yifang Qin and Yiyang Gu and Zhiping Xiao and Yifan Wang and Xiao Luo and Ming Zhang},\nbooktitle={Forty-first International Conference on Machine 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M5ne8enLcr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 919921, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17490117851942820561&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";pku.edu.cn;nankai.edu.cn;pku.edu.cn;;cs.ucla.edu;uibe.edu.cn;cs.ucla.edu;pku.edu.cn", "author_num": 9, "aff_unique_index": "0;1;0;2;3;2;0", "aff_unique_norm": "Peking University;Nankai University;University of California, Los Angeles;University of International Business and Economics", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;http://www.nankai.edu.cn;https://www.ucla.edu;http://www.uibe.edu.cn", "aff_unique_abbr": "Peking U;NKU;UCLA;UIBE", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;1;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Provable Multi-Task Representation Learning by Two-Layer ReLU Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34251", "id": "M8UbECx485", "proceeding": "https://proceedings.mlr.press/v235/collins24a.html", "pdf": "https://openreview.net/pdf?id=M8UbECx485", "openreview": "https://openreview.net/forum?id=M8UbECx485", "author_site": "Liam Collins, Hamed Hassani, Mahdi Soltanolkotabi, Aryan Mokhtari, Sanjay Shakkottai", "tldr": "", "abstract": "An increasingly popular machine learning paradigm is to pretrain a neural network (NN) on many tasks offline, then adapt it to downstream tasks, often by re-training only the last linear layer of the network. This approach yields strong downstream performance in a variety of contexts, demonstrating that multitask pretraining leads to effective feature learning. Although several recent theoretical studies have shown that shallow NNs learn meaningful features when either (i) they are trained on a *single* task or (ii) they are *linear*, very little is known about the closer-to-practice case of *nonlinear* NNs trained on *multiple* tasks. In this work, we present the first results proving that feature learning occurs during training with a nonlinear model on multiple tasks. Our key insight is that multi-task pretraining induces a pseudo-contrastive loss that favors representations that align points that typically have the same label across tasks. Using this observation, we show that when the tasks are binary classification tasks with labels depending on the projection of the data onto an $r$-dimensional subspace within the $d\\gg r$-dimensional input space, a simple gradient-based multitask learning algorithm on a two-layer ReLU NN recovers this projection, allowing for generalization to downstream tasks with sample and neuron complexity independent of $d$. 
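A toy experiment in the spirit of this result (sizes, loss, and optimizer are assumptions): pretrain a shared two-layer ReLU representation on several binary tasks whose labels depend only on an r-dimensional projection, then adapt to a fresh task by retraining only a linear head on the frozen features.

```python
# Multi-task pretraining of a two-layer ReLU net, then last-layer adaptation.
import torch

torch.manual_seed(0)
d, r, n_tasks, n = 50, 3, 10, 2000
U = torch.linalg.qr(torch.randn(d, r))[0]          # ground-truth r-dim subspace
X = torch.randn(n, d)
Y = torch.sign(X @ U @ torch.randn(n_tasks, r).T)  # labels use only the projection

feat = torch.nn.Sequential(torch.nn.Linear(d, 64), torch.nn.ReLU())
heads = torch.nn.Linear(64, n_tasks)
opt = torch.optim.Adam([*feat.parameters(), *heads.parameters()], lr=1e-2)
for _ in range(300):  # multi-task pretraining
    loss = torch.nn.functional.soft_margin_loss(heads(feat(X)), Y)
    opt.zero_grad(); loss.backward(); opt.step()

y_new = torch.sign(X @ U @ torch.randn(r))  # unseen downstream task
with torch.no_grad():
    Zf = feat(X)                            # frozen pretrained features
head = torch.nn.Linear(64, 1)
opt2 = torch.optim.Adam(head.parameters(), lr=1e-2)
for _ in range(300):  # retrain only the last linear layer
    loss = torch.nn.functional.soft_margin_loss(head(Zf).squeeze(-1), y_new)
    opt2.zero_grad(); loss.backward(); opt2.step()
print("downstream acc:", (head(Zf).squeeze(-1).sign() == y_new).float().mean().item())
```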
In contrast, we show that with high probability over the draw of a single task, training on this single task cannot guarantee to learn all $r$ ground-truth features.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liam Collins;Hamed Hassani;Mahdi Soltanolkotabi;Aryan Mokhtari;Sanjay Shakkottai", "authorids": "~Liam_Collins1;~Hamed_Hassani2;~Mahdi_Soltanolkotabi1;~Aryan_Mokhtari3;~Sanjay_Shakkottai1", "gender": ";M;M;M;M", "homepage": "https://liamc2196.github.io/;https://www.seas.upenn.edu/~hassani/;http://www-bcf.usc.edu/~soltanol/;https://sites.utexas.edu/mokhtari/;https://sites.google.com/view/sanjay-shakkottai/", "dblp": "170/1157;73/4984;75/6691;140/7407;61/4596", "google_scholar": "MRLe02cAAAAJ;;narJyMAAAAAJ;glcep6EAAAAJ;", "orcid": "0009-0006-3139-3339;;;;", "linkedin": ";;;;", "or_profile": "~Liam_Collins1;~Hamed_Hassani2;~Mahdi_Soltanolkotabi1;~Aryan_Mokhtari3;~Sanjay_Shakkottai1", "aff": "University of Texas, Austin;University of Pennsylvania;University of Southern California;University of Texas, Austin;University of Texas at Austin", "aff_domain": "utexas.edu;upenn.edu;usc.edu;utexas.edu;utexas.edu", "position": "PhD student;;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ncollins2024provable,\ntitle={Provable Multi-Task Representation Learning by Two-Layer Re{LU} Neural Networks},\nauthor={Liam Collins and Hamed Hassani and Mahdi Soltanolkotabi and Aryan Mokhtari and Sanjay Shakkottai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M8UbECx485}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 907089, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10017794835185655920&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 10, "email": "utexas.edu;upenn.edu;usc.edu;utexas.edu;utexas.edu", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "University of Texas at Austin;University of Pennsylvania;University of Southern California", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utexas.edu;https://www.upenn.edu;https://www.usc.edu", "aff_unique_abbr": "UT Austin;UPenn;USC", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Austin;;Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Predicting Dose-Response Curves with Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34250", "id": "MDAg5Q7IsI", "proceeding": "https://proceedings.mlr.press/v235/alonso-campana24a.html", "pdf": "https://openreview.net/pdf?id=MDAg5Q7IsI", "openreview": "https://openreview.net/forum?id=MDAg5Q7IsI", "author_site": "Pedro A. Campana, Paul Prasse, Tobias Scheffer", "tldr": "", "abstract": "Dose-response curves characterize the relationship between the concentration of drugs and their inhibitory effect on the growth of specific types of cells. The predominant Hill-equation model of an ideal enzymatic inhibition unduly simplifies the biochemical reality of many drugs; and for these drugs the widely-used drug performance indicator of the half-inhibitory concentration $IC_{50}$ can lead to poor therapeutic recommendations and poor selections of promising drug candidates. 
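For concreteness, the Hill model in question, with assumed IC50 and Hill-coefficient values: inhibition(c) = c^h / (c^h + IC50^h), so inhibition is exactly one half at c = IC50 and the entire curve is summarized by two scalars, which is precisely the rigidity the paper argues against.

```python
# The Hill equation for growth inhibition (parameter values are assumptions).
import numpy as np

def hill_inhibition(c, ic50=1e-6, h=1.5):
    """Fraction of growth inhibited at drug concentration c (molar)."""
    return c**h / (c**h + ic50**h)

for c in [1e-8, 1e-7, 1e-6, 1e-5]:
    print(f"c={c:.0e} M  inhibition={hill_inhibition(c):.2f}")
```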
We develop a neural model that uses an embedding of the interaction between drug molecules and the tissue transcriptome to estimate the entire dose-response curve rather than a scalar aggregate. We find that, compared to the prior state of the art, this model excels at interpolating and extrapolating the inhibitory effect of untried concentrations. Unlike prevalent parametric models, it is able to accurately predict dose-response curves of drugs on previously unseen tumor tissues as well as of previously untested drug molecules on established tumor cell lines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pedro Alonso Campana;Paul Prasse;Tobias Scheffer", "authorids": "~Pedro_Alonso_Campana1;~Paul_Prasse1;~Tobias_Scheffer1", "gender": "M;;", "homepage": ";https://www.uni-potsdam.de/de/cs-ml/staff/phd/prasse;https://www.uni-potsdam.de/en/cs-ml/staff/contacts/scheffer", "dblp": ";116/3028;s/TobiasScheffer", "google_scholar": ";https://scholar.google.de/citations?user=qAbXPJQAAAAJ;UjV0M9QAAAAJ", "orcid": ";0000-0003-1842-3645;0000-0003-4405-7925", "linkedin": "pedro-a-campana-9a509111b/;;tobiasscheffer/", "or_profile": "~Pedro_Alonso_Campana1;~Paul_Prasse1;~Tobias_Scheffer1", "aff": "Universit\u00e4t Potsdam;Universit\u00e4t Potsdam;Universit\u00e4t Potsdam", "aff_domain": "uni-potsdam.de;uni-potsdam.de;uni-potsdam.de", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\ncampana2024predicting,\ntitle={Predicting Dose-Response Curves with Deep Neural Networks},\nauthor={Pedro Alonso Campana and Paul Prasse and Tobias Scheffer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MDAg5Q7IsI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 859259, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XhHsxnwyS2EJ:scholar.google.com/&scioq=Predicting+Dose-Response+Curves+with+Deep+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "uni-potsdam.de;uni-potsdam.de;uni-potsdam.de", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Potsdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-potsdam.de", "aff_unique_abbr": "UP", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Two Fists, One Heart: Multi-Objective Optimization Based Strategy Fusion for Long-tailed Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34249", "id": "MEZydkOr3l", "proceeding": "https://proceedings.mlr.press/v235/zhao24o.html", "pdf": "https://openreview.net/pdf?id=MEZydkOr3l", "openreview": "https://openreview.net/forum?id=MEZydkOr3l", "author_site": "Zhe Zhao, Pengkun Wang, HaiBin Wen, Wei Xu, LAI Song, Qingfu Zhang, Yang Wang", "tldr": "", "abstract": "Real-world data generally follows a long-tailed distribution, which makes traditional high-performance training strategies unable to show their usual effects. Various insights have been proposed to mitigate the challenges of this distribution. However, some observations indicate that models trained on long-tailed distributions always show a trade-off between the performance of head and tail classes.
For a profound understanding of the trade-off, we first theoretically analyze the trade-off problem in long-tailed learning and creatively transform it into a multi-objective optimization (MOO) problem. Motivated by these analyses, we propose the idea of strategy fusion for MOO long-tailed learning and point out the potential conflict problem. We further design a Multi-Objective Optimization based Strategy Fusion (MOOSF), which effectively resolves conflicts, and achieves an efficient fusion of heterogeneous strategies. Comprehensive experiments on mainstream datasets show that even the simplest strategy fusion can outperform complex long-tailed strategies. More importantly, it provides a new perspective for generalized long-tailed learning. The code is available in the accompanying supplementary materials.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhe Zhao;Pengkun Wang;HaiBin Wen;Wei Xu;Song Lai;Qingfu Zhang;Yang Wang", "authorids": "~Zhe_Zhao5;~Pengkun_Wang1;~HaiBin_Wen1;~Wei_Xu21;~Song_Lai1;~Qingfu_Zhang1;~Yang_Wang32", "gender": "M;M;M;;M;M;M", "homepage": "https://di.ustc.edu.cn/_upload/tpl/14/f7/5367/template5367/members.html;http://home.ustc.edu.cn/~pengkun/index.html;https://github.com/haibin65535;https://home.cnblogs.com/u/Embiid;https://scholars.cityu.edu.hk/en/persons/song-lai(bbd49b19-00a6-41b6-a18a-02559baf45dc).html;https://www.cs.cityu.edu.hk/~qzhan7/index.html;http://staff.ustc.edu.cn/~angyan/", "dblp": ";;209/2186;;48/3684;98/1240.html;", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;;3P3jmP8AAAAJ;https://scholar.google.co.uk/citations?user=nhL9PHwAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-8942-8761;0000-0002-2680-4563;0009-0009-5019-2390;;0000-0002-4835-0945;;0000-0002-6079-7053", "linkedin": ";;;;;;", "or_profile": "~Zhe_Zhao5;~Pengkun_Wang1;~HaiBin_Wen1;~Wei_Xu21;~Song_Lai1;~Qingfu_Zhang1;~Yang_Wang32", "aff": "University of Science and Technology of China;University of Science and Technology of China;Shaoguan University;University of Science and Technology of China;City University of Hong Kong;City University of Hong Kong;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;sgu.edu.cn;ustc.edu.cn;cityu.edu.hk;cityu.edu.hk;ustc.edu.cn", "position": "PhD student;Researcher;Undergrad student;MS student;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nzhao2024two,\ntitle={Two Fists, One Heart: Multi-Objective Optimization Based Strategy Fusion for Long-tailed Learning},\nauthor={Zhe Zhao and Pengkun Wang and HaiBin Wen and Wei Xu and Song Lai and Qingfu Zhang and Yang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MEZydkOr3l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2890365, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8673040561820512242&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;ustc.edu.cn;sgu.edu.cn;ustc.edu.cn;cityu.edu.hk;cityu.edu.hk;ustc.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;0;2;2;0", "aff_unique_norm": "University of Science and Technology of China;Shaoguan University;City University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;http://www.gdsgu.edu.cn;https://www.cityu.edu.hk",
"aff_unique_abbr": "USTC;;CityU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Bottleneck-Minimal Indexing for Generative Document Retrieval", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34248", "id": "MFPYCvWsNR", "proceeding": "https://proceedings.mlr.press/v235/du24j.html", "pdf": "https://openreview.net/pdf?id=MFPYCvWsNR", "openreview": "https://openreview.net/forum?id=MFPYCvWsNR", "author_site": "Xin Du, Lixin Xiu, Kumiko Tanaka-Ishii", "tldr": "", "abstract": "We apply an information-theoretic perspective to reconsider generative document retrieval (GDR), in which a document $x \\in \\mathcal{X}$ is indexed by $t \\in \\mathcal{T}$, and a neural autoregressive model is trained to map queries $\\mathcal{Q}$ to $\\mathcal{T}$. GDR can be considered to involve information transmission from documents $\\mathcal{X}$ to queries $\\mathcal{Q}$, with the requirement to transmit more bits via the indexes $\\mathcal{T}$. By applying Shannon's rate-distortion theory, the optimality of indexing can be analyzed in terms of the mutual information, and the design of the indexes $\\mathcal{T}$ can then be regarded as a *bottleneck* in GDR. After reformulating GDR from this perspective, we empirically quantify the bottleneck underlying GDR. Finally, using the NQ320K and MARCO datasets, we evaluate our proposed bottleneck-minimal indexing method in comparison with various previous indexing methods, and we show that it outperforms those methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xin Du;Lixin Xiu;Kumiko Tanaka-Ishii", "authorids": "~Xin_Du4;~Lixin_Xiu1;~Kumiko_Tanaka-Ishii2", "gender": "M;M;", "homepage": "https://kduxin.com;https://riishin.github.io/;", "dblp": ";;", "google_scholar": "8of0O7YAAAAJ;;", "orcid": "0000-0001-9135-2906;;", "linkedin": ";;", "or_profile": "~Xin_Du4;~Lixin_Xiu1;~Kumiko_Tanaka-Ishii2", "aff": "Waseda University;The University of Tokyo;", "aff_domain": "waseda.jp;u-tokyo.ac.jp;", "position": "Assistant Professor;MS student;", "bibtex": "@inproceedings{\ndu2024bottleneckminimal,\ntitle={Bottleneck-Minimal Indexing for Generative Document Retrieval},\nauthor={Xin Du and Lixin Xiu and Kumiko Tanaka-Ishii},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MFPYCvWsNR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1528813, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2040680919902885258&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 8, "email": "waseda.jp;u-tokyo.ac.jp;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Waseda University;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.waseda.jp/top;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Waseda;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Reason for Future, Act for Now: A Principled Architecture for Autonomous LLM Agents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34247", "id": "MGkeWJxQVl", "proceeding": "https://proceedings.mlr.press/v235/liu24ab.html", "pdf": "https://openreview.net/pdf?id=MGkeWJxQVl", "openreview": 
"https://openreview.net/forum?id=MGkeWJxQVl", "author_site": "Zhihan Liu, Hao Hu, Shenao Zhang, Hongyi Guo, Shuqi Ke, Boyi Liu, Zhaoran Wang", "tldr": "", "abstract": "Large language models (LLMs) demonstrate impressive reasoning abilities, but translating reasoning into actions in the real world remains challenging. In particular, it is unclear how to complete a given task provably within a minimum number of interactions with the external environment, e.g., through an internal mechanism of reasoning. To this end, we propose the first framework with provable regret guarantees to orchestrate reasoning and acting, which we call *reason for future, act for now* (**RAFA**). Specifically, we design a prompt template for reasoning that learns from the memory buffer and plans a future trajectory over a long horizon (*reason for future*). At each step, the LLM agent takes the initial action of the planned trajectory (*act for now*), stores the collected feedback in the memory buffer, and reinvokes the reasoning routine to replan the future trajectory from the new state. The key idea is to cast reasoning in LLMs as learning and planning in Bayesian adaptive Markov decision processes (MDPs). Correspondingly, we prompt LLMs with the memory buffer to estimate the unknown environment (learning) and generate an optimal trajectory for multiple future steps that maximize a value function (planning). The learning and planning subroutines are performed in an in-context manner to emulate the actor-critic update for MDPs. Our theoretical analysis establishes a $\\sqrt{T}$ regret, while our experimental validation demonstrates superior empirical performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhihan Liu;Hao Hu;Shenao Zhang;Hongyi Guo;Shuqi Ke;Boyi Liu;Zhaoran Wang", "authorids": "~Zhihan_Liu1;~Hao_Hu3;~Shenao_Zhang1;~Hongyi_Guo1;~Shuqi_Ke1;~Boyi_Liu1;~Zhaoran_Wang1", "gender": "M;M;M;M;;M;Not Specified", "homepage": ";https://mousehu.github.io;https://shenao-zhang.github.io/;https://gohsyi.github.io/;;;https://zhaoranwang.github.io/", "dblp": ";67/6924-6;253/4543.html;;;;117/2756", "google_scholar": "0VVg_R4AAAAJ;https://scholar.google.com/citations?hl=en;8NamuusAAAAJ;https://scholar.google.com/citations?hl=en;;1G8RH_YAAAAJ;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ", "orcid": ";;;;;;", "linkedin": ";hao-hu-tsinghua;shenao-zhang-055a53178/;;;;", "or_profile": "~Zhihan_Liu1;~Hao_Hu3;~Shenao_Zhang1;~Hongyi_Guo1;~Shuqi_Ke1;~Boyi_Liu1;~Zhaoran_Wang1", "aff": "Northwestern University;Tsinghua University;Georgia Institute of Technology;Northwestern University, Northwestern University;;ByteDance Inc.;Northwestern University", "aff_domain": "northwestern.edu;tsinghua.edu.cn;gatech.edu;u.northwestern.edu;;bytedance.com;northwestern.edu", "position": "PhD student;PhD student;MS student;PhD student;;Researcher;Associate Professor", "bibtex": "@inproceedings{\nliu2024reason,\ntitle={Reason for Future, Act for Now: A Principled Architecture for Autonomous {LLM} Agents},\nauthor={Zhihan Liu and Hao Hu and Shenao Zhang and Hongyi Guo and Shuqi Ke and Boyi Liu and Zhaoran Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MGkeWJxQVl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3628066, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4081571305387143859&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "northwestern.edu;tsinghua.edu.cn;gatech.edu;u.northwestern.edu;;bytedance.com;northwestern.edu", "author_num": 7, "aff_unique_index": "0;1;2;0;3;0", "aff_unique_norm": "Northwestern University;Tsinghua University;Georgia Institute of Technology;ByteDance", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.northwestern.edu;https://www.tsinghua.edu.cn;https://www.gatech.edu;https://www.bytedance.com", "aff_unique_abbr": "NU;THU;Georgia Tech;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Differentially private exact recovery for stochastic block models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34246", "id": "MIRQ3L8vtn", "proceeding": "https://proceedings.mlr.press/v235/nguyen24j.html", "pdf": "https://openreview.net/pdf?id=MIRQ3L8vtn", "openreview": "https://openreview.net/forum?id=MIRQ3L8vtn", "author_site": "Dung Nguyen, Anil Vullikanti", "tldr": "", "abstract": "Stochastic block models (SBMs) are a very commonly studied network model for community detection algorithms. In the standard form of an SBM, the $n$ vertices (or nodes) of a graph are generally divided into multiple pre-determined communities (or clusters). Connections between pairs of vertices are generated randomly and independently with pre-defined probabilities, which depend on the communities containing the two nodes. A fundamental problem in SBMs is the recovery of the community structure, and sharp information-theoretic bounds are known for recoverability for many versions of SBMs. Our focus here is the recoverability problem in SBMs when the network is private. Under the edge differential privacy model, we derive conditions for exact recoverability in three different versions of SBMs, namely Asymmetric SBM (when communities have non-uniform sizes), General Structure SBM (with outliers), and Censored SBM (with edge features). Our private algorithms have polynomial running time w.r.t. the input graph's size, and match the recovery thresholds of the non-private setting when $\\epsilon\\rightarrow\\infty$. 
In contrast, the previous best results for recoverability in SBMs only hold for the symmetric case (equal size communities), and run in quasi-polynomial time, or in polynomial time with recovery thresholds that are tight only up to constant factors relative to the non-private setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dung Nguyen;Anil Kumar Vullikanti", "authorids": "~Dung_Nguyen2;~Anil_Vullikanti1", "gender": ";M", "homepage": ";https://engineering.virginia.edu/faculty/anil-vullikanti", "dblp": ";89/7912", "google_scholar": ";MNJ-E9UAAAAJ", "orcid": ";0000-0002-8597-6197", "linkedin": ";", "or_profile": "~Dung_Nguyen2;~Anil_Vullikanti1", "aff": ";University of Virginia", "aff_domain": ";virginia.edu", "position": ";Professor", "bibtex": "@inproceedings{\nnguyen2024differentially,\ntitle={Differentially private exact recovery for stochastic block models},\nauthor={Dung Nguyen and Anil Kumar Vullikanti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MIRQ3L8vtn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 579380, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6466425600237235970&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";virginia.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "A Sparsity Principle for Partially Observable Causal Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34245", "id": "MKGrRVODWR", "proceeding": "https://proceedings.mlr.press/v235/xu24ac.html", "pdf": "https://openreview.net/pdf?id=MKGrRVODWR", "openreview": "https://openreview.net/forum?id=MKGrRVODWR", "author_site": "Danru Xu, Dingling Yao, S\u00e9bastien Lachapelle, Perouz Taslakian, Julius von K\u00fcgelgen, Francesco Locatello, Sara Magliacane", "tldr": "", "abstract": "Causal representation learning aims at identifying high-level causal variables from perceptual data. Most methods assume that all latent causal variables are captured in the high-dimensional observations. We instead consider a partially observed setting, in which each measurement only provides information about a subset of the underlying causal state. Prior work has studied this setting with multiple domains or views, each depending on a fixed subset of latents. Here, we focus on learning from unpaired observations from a dataset with an instance-dependent partial observability pattern. Our main contribution is to establish two identifiability results for this setting: one for linear mixing functions without parametric assumptions on the underlying causal model, and one for piecewise linear mixing functions with Gaussian latent causal variables. Based on these insights, we propose two methods for estimating the underlying causal variables by enforcing sparsity in the inferred representation.
Experiments on different simulated datasets and established benchmarks highlight the effectiveness of our approach in recovering the ground-truth latents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Danru Xu;Dingling Yao;Sebastien Lachapelle;Perouz Taslakian;Julius von K\u00fcgelgen;Francesco Locatello;Sara Magliacane", "authorids": "~Danru_Xu1;~Dingling_Yao1;~Sebastien_Lachapelle1;~Perouz_Taslakian1;~Julius_von_K\u00fcgelgen2;~Francesco_Locatello1;~Sara_Magliacane1", "gender": ";F;M;F;;M;F", "homepage": ";;https://slachapelle.github.io/;http://www.perouz.com;;https://twitter.com/FrancescoLocat8;http://saramagliacane.github.io", "dblp": ";298/8057;224/0080;52/1849;;195/6074;120/5256", "google_scholar": ";;uxHoJp8AAAAJ;LJ7gHkQAAAAJ;;;https://scholar.google.nl/citations?user=H3j_zQ4AAAAJ", "orcid": ";;;;;;", "linkedin": ";dingling-yao-b28b161a2/;s%C3%A9bastien-lachapelle-a4321a122/;perouz/;;;magliacane/", "or_profile": "~Danru_Xu1;~Dingling_Yao1;~Sebastien_Lachapelle1;~Perouz_Taslakian1;~Julius_von_K\u00fcgelgen2;~Francesco_Locatello1;~Sara_Magliacane1", "aff": ";Institute of Science and Technology;University of Montreal;ServiceNow;;Institute of Science and Technology;University of Amsterdam", "aff_domain": ";ist.ac.at;umontreal.ca;servicenow.com;;ist.ac.at;uva.nl", "position": ";PhD student;PhD student;Researcher;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024a,\ntitle={A Sparsity Principle for Partially Observable Causal Representation Learning},\nauthor={Danru Xu and Dingling Yao and Sebastien Lachapelle and Perouz Taslakian and Julius von K{\\"u}gelgen and Francesco Locatello and Sara Magliacane},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MKGrRVODWR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4153482, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11652700419967049378&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";ist.ac.at;umontreal.ca;servicenow.com;;ist.ac.at;uva.nl", "author_num": 7, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Institute of Science and Technology;University of Montreal;ServiceNow;University of Amsterdam", "aff_unique_dep": ";;;", "aff_unique_url": ";https://www.umontreal.ca;https://www.servicenow.com;https://www.uva.nl", "aff_unique_abbr": ";UM;ServiceNow;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;3", "aff_country_unique": ";Canada;United States;Netherlands" }, { "title": "ArtWhisperer: A Dataset for Characterizing Human-AI Interactions in Artistic Creations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34244", "id": "MKzgqtRtGY", "proceeding": "https://proceedings.mlr.press/v235/vodrahalli24a.html", "pdf": "https://openreview.net/pdf?id=MKzgqtRtGY", "openreview": "https://openreview.net/forum?id=MKzgqtRtGY", "author_site": "Kailas Vodrahalli, James Zou", "tldr": "", "abstract": "In this work, we investigate how people use text-to-image models to generate desired target images. To study this interaction, we created ArtWhisperer, an online game where users are given a target image and are tasked with iteratively finding a prompt that creates an image that looks similar to the target.
Through this game, we recorded over 50,000 human-AI interactions; each interaction corresponds to one text prompt created by a user and the corresponding generated image. The majority of these are repeated interactions where a user iterates to find the best prompt for their target image, making this a unique sequential dataset for studying human-AI collaborations. In an initial analysis of this dataset, we identify several characteristics of prompt interactions and user strategies. People submit diverse prompts and are able to discover a variety of text descriptions that generate similar images. Interestingly, prompt diversity does not decrease as users find better prompts. We further propose a new metric to quantify AI model *steerability* using our dataset. We define steerability as the expected number of interactions required to adequately complete a task. We estimate this value by fitting a Markov chain for each target task and calculating the expected time to reach an adequate score. We quantify and compare AI steerability across different types of target images and two different models, finding that images of cities and nature are more steerable than artistic and fantasy images. We also evaluate popular vision-language models to assess their image understanding and ability to incorporate feedback. These findings provide insights into human-AI interaction behavior, present a concrete method of assessing AI steerability, and demonstrate the general utility of the ArtWhisperer dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kailas Vodrahalli;James Zou", "authorids": "~Kailas_Vodrahalli1;~James_Zou1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": "0DeyGMcAAAAJ;23ZXZvEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Kailas_Vodrahalli1;~James_Zou1", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nvodrahalli2024artwhisperer,\ntitle={ArtWhisperer: A Dataset for Characterizing Human-{AI} Interactions in Artistic Creations},\nauthor={Kailas Vodrahalli and James Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MKzgqtRtGY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9774515, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13182927881668708200&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "The Non-linear $F$-Design and Applications to Interactive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34243", "id": "MMMHufVc2v", "proceeding": "https://proceedings.mlr.press/v235/agarwal24e.html", "pdf": "https://openreview.net/pdf?id=MMMHufVc2v", "openreview": "https://openreview.net/forum?id=MMMHufVc2v", "author_site": "Alekh Agarwal, Jian Qian, Alexander Rakhlin, Tong Zhang", "tldr": "", "abstract": "We propose a generalization of the classical G-optimal design concept to non-linear function classes. 
The criterion, termed F-design, coincides with G-design in the linear case. We compute the value of the optimal design, termed the F-condition number, for several non-linear function classes. We further provide algorithms to construct designs with a bounded F-condition number. Finally, we employ the F-design in a variety of interactive machine learning tasks, where the design is naturally useful for data collection or exploration. We show that in four diverse settings of confidence band construction, contextual bandits, model-free reinforcement learning, and active learning, F-design can be combined with existing approaches in a black-box manner to yield state-of-the-art results in known problem settings as well as to generalize to novel ones.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alekh Agarwal;Jian Qian;Alexander Rakhlin;Tong Zhang", "authorids": "~Alekh_Agarwal2;~Jian_Qian2;~Alexander_Rakhlin1;~Tong_Zhang2", "gender": "M;;M;M", "homepage": "https://alekhagarwal.net;https://sites.google.com/view/jianqian/about;http://www.mit.edu/~rakhlin/;http://tongzhang-ml.org", "dblp": ";;59/407;07/4227-1", "google_scholar": "9nnDvooAAAAJ;;https://scholar.google.com.tw/citations?user=fds2VpgAAAAJ;LurWtuYAAAAJ", "orcid": ";;;0000-0002-5511-2558", "linkedin": ";jianQ/;;", "or_profile": "~Alekh_Agarwal2;~Jian_Qian2;~Alexander_Rakhlin1;~Tong_Zhang2", "aff": "Google;Massachusetts Institute of Technology;Massachusetts Institute of Technology;UIUC", "aff_domain": "google.com;mit.edu;mit.edu;illinois.edu", "position": "Researcher;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nagarwal2024the,\ntitle={The Non-linear \\$F\\$-Design and Applications to Interactive Learning},\nauthor={Alekh Agarwal and Jian Qian and Alexander Rakhlin and Tong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MMMHufVc2v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 593850, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3396217979993988996&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "google.com;mit.edu;mit.edu;illinois.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Google;Massachusetts Institute of Technology;University of Illinois Urbana-Champaign", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://web.mit.edu;https://www.illinois.edu", "aff_unique_abbr": "Google;MIT;UIUC", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Mountain View;;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Optimization in SciML Should Employ the Function Space Geometry", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34242", "id": "MOrvoYrlOg", "proceeding": "https://proceedings.mlr.press/v235/muller24d.html", "pdf": "https://openreview.net/pdf?id=MOrvoYrlOg", "openreview": "https://openreview.net/forum?id=MOrvoYrlOg", "author_site": "Johannes M\u00fcller, Marius Zeinhofer", "tldr": "", "abstract": "We provide an infinite-dimensional view on optimization problems encountered in scientific machine learning (SciML) and advocate for the paradigm *first optimize, then discretize* for their solution.
This amounts to first choosing an appropriate infinite-dimensional algorithm, which is then discretized in a second step. To illustrate this point, we discuss recently proposed state-of-the-art algorithms for SciML applications and see that they can be derived within this framework. Hence, this perspective provides a principled guide for the design of optimization algorithms for SciML. As the infinite-dimensional viewpoint is presently underdeveloped, we formalize it here to foster the development of novel optimization algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johannes M\u00fcller;Marius Zeinhofer", "authorids": "~Johannes_M\u00fcller1;~Marius_Zeinhofer1", "gender": "M;M", "homepage": "https://math.ethz.ch/sam/the-institute/people.ethz_search.html?u=mzeinhofer;https://muellerjohannes.github.io/", "dblp": "255/5011;", "google_scholar": ";https://scholar.google.de/citations?user=Wfww-P8AAAAJ", "orcid": ";0000-0001-8729-0466", "linkedin": ";", "or_profile": "~Marius_Zeinhofer1;~Johannes_Christoph_M\u00fcller1", "aff": "Simula Research Laboratory;RWTH Aachen University", "aff_domain": "simula.no;mathc.rwth-aachen.de", "position": "Postdoc;Postdoc", "bibtex": "@inproceedings{\nm{\\"u}ller2024position,\ntitle={Position: Optimization in Sci{ML} Should Employ the Function Space Geometry},\nauthor={Johannes M{\\"u}ller and Marius Zeinhofer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MOrvoYrlOg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1491299, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6274255651233227864&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "simula.no;mathc.rwth-aachen.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Simula Research Laboratory;RWTH Aachen University", "aff_unique_dep": ";", "aff_unique_url": "https://www.simula.no;https://www.rwth-aachen.de", "aff_unique_abbr": "Simula;RWTH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Aachen", "aff_country_unique_index": "0;1", "aff_country_unique": "Norway;Germany" }, { "title": "Rotational Equilibrium: How Weight Decay Balances Learning Across Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34241", "id": "MQirNNU2pC", "proceeding": "https://proceedings.mlr.press/v235/kosson24a.html", "pdf": "https://openreview.net/pdf?id=MQirNNU2pC", "openreview": "https://openreview.net/forum?id=MQirNNU2pC", "author_site": "Atli Kosson, Bettina Messmer, Martin Jaggi", "tldr": "", "abstract": "This study investigates how weight decay affects the update behavior of individual neurons in deep neural networks through a combination of applied analysis and experimentation. Weight decay can cause the expected magnitude and angular updates of a neuron's weight vector to converge to a steady state we call rotational equilibrium. These states can be highly homogeneous, effectively balancing the average rotation---a proxy for the effective learning rate---across different layers and neurons. Our work analyzes these dynamics across optimizers like Adam, Lion, and SGD with momentum, offering a new simple perspective on training that elucidates the efficacy of widely used but poorly understood methods in deep learning.
We demonstrate how balanced rotation plays a key role in the effectiveness of normalization methods like Weight Standardization, as well as that of AdamW over Adam with L2-regularization. Finally, we show that explicitly controlling the rotation provides the benefits of weight decay while substantially reducing the need for learning rate warmup.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Atli Kosson;Bettina Messmer;Martin Jaggi", "authorids": "~Atli_Kosson1;~Bettina_Messmer1;~Martin_Jaggi1", "gender": ";;M", "homepage": ";https://people.epfl.ch/bettina.messmer;https://mlo.epfl.ch", "dblp": ";;17/4402", "google_scholar": ";;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ", "orcid": ";;0000-0003-1579-5558", "linkedin": ";;", "or_profile": "~Atli_Kosson1;~Bettina_Messmer1;~Martin_Jaggi1", "aff": ";EPFL - EPF Lausanne;EPFL", "aff_domain": ";epfl.ch;epfl.ch", "position": ";PhD student;Associate Professor", "bibtex": "@inproceedings{\nkosson2024rotational,\ntitle={Rotational Equilibrium: How Weight Decay Balances Learning Across Neural Networks},\nauthor={Atli Kosson and Bettina Messmer and Martin Jaggi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MQirNNU2pC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1289190, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14899743428852739833&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": ";epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Integrating Global Context Contrast and Local Sensitivity for Blind Image Quality Assessment", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34240", "id": "MRYS3Zb4iV", "proceeding": "https://proceedings.mlr.press/v235/li24ac.html", "pdf": "https://openreview.net/pdf?id=MRYS3Zb4iV", "openreview": "https://openreview.net/forum?id=MRYS3Zb4iV", "author_site": "Xudong Li, Runze Hu, Jingyuan Zheng, Yan Zhang, Shengchuan Zhang, Xiawu Zheng, Ke Li, Yunhang Shen, Yutao Liu, Pingyang Dai, Rongrong Ji", "tldr": "", "abstract": "Blind Image Quality Assessment (BIQA) mirrors the subjective quality judgments made by human observers. Generally, humans favor comparing relative qualities over predicting absolute qualities directly. However, current BIQA models focus on mining the \"local\" context, i.e., the relationship between information among individual images and the absolute quality of the image, ignoring the \"global\" context of the relative quality contrast among different images in the training data. In this paper, we present the Perceptual Context and Sensitivity BIQA (CSIQA), a novel contrastive learning paradigm that seamlessly integrates \"global'' and \"local'' perspectives into BIQA. Specifically, the CSIQA comprises two primary components: 1) A Quality Context Contrastive Learning module, which is equipped with different contrastive learning strategies to effectively capture potential quality correlations in the global context of the dataset.
2) A Quality-aware Mask Attention Module, which employs a random mask to ensure consistency with visual local sensitivity, thereby improving the model's perception of local distortions. Extensive experiments on eight standard BIQA datasets demonstrate performance superior to state-of-the-art BIQA methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xudong Li;Runze Hu;Jingyuan Zheng;Yan Zhang;Shengchuan Zhang;Xiawu Zheng;Ke Li;Yunhang Shen;Yutao Liu;Pingyang Dai;Rongrong Ji", "authorids": "~Xudong_Li7;~Runze_Hu1;~Jingyuan_Zheng1;~Yan_Zhang22;~Shengchuan_Zhang1;~Xiawu_Zheng1;~Ke_Li4;~Yunhang_Shen1;~Yutao_Liu2;~Pingyang_Dai1;~Rongrong_Ji5", "gender": "M;;;;M;M;M;M;M;M;M", "homepage": "https://github.com/LXDxmumac/LXDxmumac.github.io;;;;https://informatics.xmu.edu.cn/info/1405/25059.htm;https://sites.google.com/view/zhengxiawu/%E9%A6%96%E9%A1%B5;http://keli.info;https://shenyunhang.github.io/;https://www.researchgate.net/profile/Yutao-Liu;;http://mac.xmu.edu.cn/rrji-en.html", "dblp": ";;;;162/1064;222/7865;;146/1800;;04/8207;86/5681", "google_scholar": ";;;;GToqXScAAAAJ;jBgXocYAAAAJ;mfWsFM0AAAAJ;29teR74AAAAJ;;https://scholar.google.com.hk/citations?user=fEw3__QAAAAJ;", "orcid": ";;;;0000-0002-0800-0609;0000-0002-6855-5403;0000-0001-7998-0731;0000-0002-3970-7519;0000-0002-3066-1884;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Xudong_Li7;~Runze_Hu1;~Jingyuan_Zheng1;~Yan_Zhang22;~Shengchuan_Zhang1;~Xiawu_Zheng1;~Ke_Li4;~Yunhang_Shen1;~Yutao_Liu2;~Pingyang_Dai1;~Rongrong_Ji5", "aff": "XMU;;;;Xiamen University;PengCheng Lab;Tencent;Tencent;Ocean University of China;Xiamen University;Xiamen University", "aff_domain": "xmu.edu;;;;xmu.edu.cn;pcl.ac.cn;tencent.com;tencent.com;ouc.edu.cn;xmu.edu.cn;xmu.edu.cn", "position": "MS student;;;;Assistant Professor;Postdoc;Principal Researcher;Researcher;Associate Professor;Senior Engineer;Full Professor", "bibtex": "@inproceedings{\nli2024integrating,\ntitle={Integrating Global Context Contrast and Local Sensitivity for Blind Image Quality Assessment},\nauthor={Xudong Li and Runze Hu and Jingyuan Zheng and Yan Zhang and Shengchuan Zhang and Xiawu Zheng and Ke Li and Yunhang Shen and Yutao Liu and Pingyang Dai and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MRYS3Zb4iV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2572869, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1498493598516752713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "xmu.edu;;;;xmu.edu.cn;pcl.ac.cn;tencent.com;tencent.com;ouc.edu.cn;xmu.edu.cn;xmu.edu.cn", "author_num": 11, "aff_unique_index": "0;0;1;2;2;3;0;0", "aff_unique_norm": "Xiamen University;Pengcheng Lab;Tencent;Ocean University of China", "aff_unique_dep": ";;Tencent Holdings Limited;", "aff_unique_url": "https://www.xmu.edu.cn;;https://www.tencent.com;http://www.ouc.edu.cn", "aff_unique_abbr": "XMU;;Tencent;OUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Near-Linear Time Approximation Algorithm for Beyond-Worst-Case Graph Clustering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34239", "id": "MSFxOMM0gK", "proceeding": "https://proceedings.mlr.press/v235/cohen-addad24c.html", "pdf":
"https://openreview.net/pdf?id=MSFxOMM0gK", "openreview": "https://openreview.net/forum?id=MSFxOMM0gK", "author_site": "Vincent Cohen-Addad, Tommaso d'Orsi, Aida Mousavifar", "tldr": "", "abstract": "We consider the semi-random graph model of [Makarychev, Makarychev and Vijayaraghavan, STOC'12], where, given a random bipartite graph with $\\alpha$ edges and an unknown bipartition $(A, B)$ of the vertex set, an adversary can add arbitrary edges inside each community and remove arbitrary edges from the cut $(A, B)$ (i.e. all adversarial changes are *monotone* with respect to the bipartition). For this model, a polynomial time algorithm [MMV'12] is known to approximate the Balanced Cut problem up to value $O(\\alpha)$ as long as the cut $(A, B)$ has size $\\Omega(\\alpha)$. However, it consists of slow subroutines requiring optimal solutions for logarithmically many semidefinite programs. We study the fine-grained complexity of the problem and present the first near-linear time algorithm that achieves similar performances to that of [MMV'12]. Our algorithm runs in time $O(|V(G)|^{1+o(1)} + |E(G)|^{1+o(1)})$ and finds a balanced cut of value $O(\\alpha).$ Our approach appears easily extendible to related problem, such as Sparsest Cut, and also yields an near-linear time $O(1)$-approximation to Dagupta's objective function for hierarchical clustering [Dasgupta, STOC'16] for the semi-random hierarchical stochastic block model inputs of [Cohen-Addad, Kanade, Mallmann-Trenn, Mathieu, JACM'19].", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Cohen-Addad;Tommaso d'Orsi;Aida Mousavifar", "authorids": "~Vincent_Cohen-Addad1;~Tommaso_d'Orsi1;~Aida_Mousavifar2", "gender": ";;F", "homepage": ";https://tommasodorsi.github.io;https://www.linkedin.com/in/aida-mousavifar/", "dblp": "136/5814;275/8135;198/5203", "google_scholar": ";;https://scholar.google.ch/citations?user=WhqsjcoAAAAJ", "orcid": ";;", "linkedin": ";;aida-mousavifar/", "or_profile": "~Vincent_Cohen-Addad1;~Tommaso_d'Orsi1;~Aida_Sadat_Mousavifar1", "aff": "Google;Bocconi University;Google", "aff_domain": "google.com;unibocconi.it;google.com", "position": "Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\ncohen-addad2024a,\ntitle={A Near-Linear Time Approximation Algorithm for Beyond-Worst-Case Graph Clustering},\nauthor={Vincent Cohen-Addad and Tommaso d'Orsi and Aida Mousavifar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MSFxOMM0gK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 435074, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8623842008975847871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;unibocconi.it;google.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;Bocconi University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.bocconi.edu", "aff_unique_abbr": "Google;Bocconi", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Italy" }, { "title": "Amortized Variational Deep Kernel Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34238", "id": "MSMKQuZhD5", "proceeding": "https://proceedings.mlr.press/v235/matias24a.html", "pdf": 
"https://openreview.net/pdf?id=MSMKQuZhD5", "openreview": "https://openreview.net/forum?id=MSMKQuZhD5", "author_site": "Alan Matias, C\u00e9sar Lincoln Mattos, Joao Paulo Gomes, Diego Mesquita", "tldr": "", "abstract": "Deep kernel learning (DKL) marries the uncertainty quantification of Gaussian processes (GPs) and the representational power of deep neural networks. However, training DKL is challenging and often leads to overfitting. Most notably, DKL often learns \u201cnon-local\u201d kernels \u2014 incurring spurious correlations. To remedy this issue, we propose using amortized inducing points and a parameter-sharing scheme, which ties together the amortization and DKL networks. This design imposes an explicit dependency between the ELBO\u2019s model fit and capacity terms. In turn, this prevents the former from dominating the optimization procedure and incurring the aforementioned spurious correlations. Extensive experiments show that our resulting method, *amortized varitional* DKL (AVDKL), i) consistently outperforms DKL and standard GPs for tabular data; ii) achieves significantly higher accuracy than DKL in node classification tasks; and iii) leads to substantially better accuracy and negative log-likelihood than DKL on CIFAR100.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alan L. S. Matias;C\u00e9sar Lincoln Mattos;Jo\u00e3o Paulo Pordeus Gomes;Diego Mesquita", "authorids": "~Alan_L._S._Matias1;~C\u00e9sar_Lincoln_Mattos1;~Jo\u00e3o_Paulo_Pordeus_Gomes1;~Diego_Mesquita1", "gender": "M;M;M;M", "homepage": ";;https://weakly-informative.github.io;https://scholar.google.com.br/citations?hl=pt-BR&user=ebsoHiUAAAAJ", "dblp": "150/2808;163/4376.html;163/4293;", "google_scholar": "DCKOV4oAAAAJ;https://scholar.google.com.br/citations?user=q3mkKj8AAAAJ;;", "orcid": "0000-0002-2404-3625;0000-0003-1686-595X;;", "linkedin": ";;;", "or_profile": "~C\u00e9sar_Lincoln_Mattos1;~Jo\u00e3o_Paulo_Pordeus_Gomes1;~Diego_Mesquita1;~Alan_Matias1", "aff": "Federal University of Cear\u00e1;Universidade Federal do Cear\u00e1;Getulio Vargas Foundation;Universidade Federal do Cear\u00e1", "aff_domain": "ufc.br;ufc.br;fgv.br;ufc.br", "position": "Associate Professor;Associate Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nmatias2024amortized,\ntitle={Amortized Variational Deep Kernel Learning},\nauthor={Alan L. S. 
Matias and C{\\'e}sar Lincoln Mattos and Jo{\\~a}o Paulo Pordeus Gomes and Diego Mesquita},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MSMKQuZhD5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2597497, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10601779890748092484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "ufc.br;ufc.br;fgv.br;ufc.br", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Federal University of Cear\u00e1;Universidade Federal do Cear\u00e1;Getulio Vargas Foundation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ufc.br;https://www.ufc.br;https://fgv.br", "aff_unique_abbr": "UFC;UFC;FGV", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Brazil" }, { "title": "Unifying Image Processing as Visual Prompting Question Answering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34237", "id": "MUXTt9Yr4T", "proceeding": "https://proceedings.mlr.press/v235/liu24k.html", "pdf": "https://openreview.net/pdf?id=MUXTt9Yr4T", "openreview": "https://openreview.net/forum?id=MUXTt9Yr4T", "author_site": "Yihao Liu, Xiangyu Chen, Xianzheng Ma, Xintao Wang, Jiantao Zhou, Yu Qiao, Chao Dong", "tldr": "", "abstract": "Image processing is a fundamental task in computer vision, which aims at enhancing image quality and extracting essential features for subsequent vision applications. Traditionally, task-specific models are developed for individual tasks and designing such models requires distinct expertise. Building upon the success of large language models (LLMs) in natural language processing (NLP), there is a similar trend in computer vision, which focuses on developing large-scale models through pretraining and in-context learning. This paradigm shift reduces the reliance on task-specific models, yielding a powerful unified model to deal with various tasks. However, these advances have predominantly concentrated on high-level vision tasks, with less attention paid to low-level vision tasks. To address this issue, we propose a universal model for general image processing that covers image restoration, image enhancement, image feature extraction tasks, etc. Our proposed framework, named PromptGIP, unifies these diverse image processing tasks within a universal framework. Inspired by NLP question answering (QA) techniques, we employ a visual prompting question answering paradigm. Specifically, we treat the input-output image pair as a structured question-answer sentence, thereby reprogramming the image processing task as a prompting QA problem. PromptGIP can undertake diverse cross-domain tasks using provided visual prompts, eliminating the need for task-specific finetuning. Capable of handling up to 15 different image processing tasks, PromptGIP represents a versatile and adaptive approach to general image processing. While PromptGIP has demonstrated a certain degree of out-of-domain task generalization capability, further research is expected to fully explore its more powerful emergent generalization. 
Codes will be available at https://github.com/lyh-18/PromptGIP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihao Liu;Xiangyu Chen;Xianzheng Ma;Xintao Wang;Jiantao Zhou;Yu Qiao;Chao Dong", "authorids": "~Yihao_Liu1;~Xiangyu_Chen5;~Xianzheng_Ma1;~Xintao_Wang1;~Jiantao_Zhou1;~Yu_Qiao1;~Chao_Dong4", "gender": "M;M;;;M;;M", "homepage": ";https://chxy95.github.io/;;;https://www.fst.um.edu.mo/en/staff/jtzhou.html;;http://xpixel.group/2010/01/20/chaodong.html", "dblp": "200/6534-1;84/7543-6;;;52/4786-1;;16/1278-5", "google_scholar": "WRIYcNwAAAAJ;https://scholar.google.com.hk/citations?user=_gkTxJUAAAAJ;;;mcROAxAAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-9874-0602;;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yihao_Liu1;~Xiangyu_Chen5;~Xianzheng_Ma1;~Xintao_Wang1;~Jiantao_Zhou1;~Yu_Qiao1;~Chao_Dong4", "aff": "Shanghai Artificial Intelligence Laboratory;Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences;;;University of Macau;;Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences", "aff_domain": "pjlab.org.cn;siat.ac.cn;;;umac.mo;;siat.ac.cn", "position": "Researcher;PhD student;;;Full Professor;;Full Professor", "bibtex": "@inproceedings{\nliu2024unifying,\ntitle={Unifying Image Processing as Visual Prompting Question Answering},\nauthor={Yihao Liu and Xiangyu Chen and Xianzheng Ma and Xintao Wang and Jiantao Zhou and Yu Qiao and Chao Dong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MUXTt9Yr4T}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6887214, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17480952699504828273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pjlab.org.cn;siat.ac.cn;;;umac.mo;;siat.ac.cn", "author_num": 7, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Chinese Academy of Sciences;University of Macau", "aff_unique_dep": ";Shenzhen Institutes of Advanced Technology;", "aff_unique_url": "http://www.shailab.org/;http://www.cas.cn;https://www.um.edu.mo", "aff_unique_abbr": "Shanghai AI Lab;CAS;UM", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Shenzhen;Macau SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Consistent Adversarially Robust Linear Classification: Non-Parametric Setting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34236", "id": "MV2b44zDd3", "proceeding": "https://proceedings.mlr.press/v235/dohmatob24a.html", "pdf": "https://openreview.net/pdf?id=MV2b44zDd3", "openreview": "https://openreview.net/forum?id=MV2b44zDd3", "tldr": "", "abstract": "For binary classification in $d$ dimensions, it is known that with a sample size of $n$, an excess adversarial risk of $O(d/n)$ is achievable under strong parametric assumptions about the underlying data distribution (e.g., assuming a Gaussian mixture model). In the case of well-separated distributions, this rate can be further refined to $O(1/n)$. Our work studies the non-parametric setting, where very little is known. 
With only mild regularity conditions on the conditional distribution of the features, we examine adversarial attacks with respect to arbitrary norms and introduce a straightforward yet effective estimator with provable consistency w.r.t. adversarial risk. Our estimator is given by minimizing a series of smoothed versions of the robust 0/1 loss, with a smoothing bandwidth that adapts to both $n$ and $d$. Furthermore, we demonstrate that our estimator can achieve the minimax excess adversarial risk of $\\widetilde O(\\sqrt{d/n})$ for linear classifiers, at the cost of solving possibly rougher optimization problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elvis Dohmatob", "authorids": "~Elvis_Dohmatob1", "gender": "M", "homepage": "http://dohmatob.github.io/", "dblp": "134/9794", "google_scholar": "https://scholar.google.fr/citations?user=FDWgJY8AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Elvis_Dohmatob1", "aff": "Meta Facebook", "aff_domain": "facebook.com", "position": "Researcher", "bibtex": "@inproceedings{\ndohmatob2024consistent,\ntitle={Consistent Adversarially Robust Linear Classification: Non-Parametric Setting},\nauthor={Elvis Dohmatob},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MV2b44zDd3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 456219, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XtEhhSr2ZD4J:scholar.google.com/&scioq=Consistent+Adversarially+Robust+Linear+Classification:+Non-Parametric+Setting&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "facebook.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Optimistic Multi-Agent Policy Gradient", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34235", "id": "MWTicAxmRP", "proceeding": "https://proceedings.mlr.press/v235/zhao24v.html", "pdf": "https://openreview.net/pdf?id=MWTicAxmRP", "openreview": "https://openreview.net/forum?id=MWTicAxmRP", "author_site": "Wenshuai Zhao, Yi Zhao, Zhiyuan Li, Juho Kannala, Joni Pajarinen", "tldr": "", "abstract": "*Relative overgeneralization* (RO) occurs in cooperative multi-agent learning tasks when agents converge towards a suboptimal joint policy due to overfitting to suboptimal behaviors of other agents. No methods have been proposed for addressing RO in multi-agent policy gradient (MAPG) methods, although these methods produce state-of-the-art results. To address this gap, we propose a general, yet simple, framework to enable optimistic updates in MAPG methods that alleviate the RO problem. Our approach involves clipping the advantage to eliminate negative values, thereby facilitating optimistic updates in MAPG. The optimism prevents individual agents from quickly converging to a local optimum. Additionally, we provide a formal analysis to show that the proposed method retains optimality at a fixed point.
In extensive evaluations on a diverse set of tasks including the *Multi-agent MuJoCo* and *Overcooked* benchmarks, our method outperforms strong baselines on 13 out of 19 tested tasks and matches the performance on the rest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenshuai Zhao;Yi Zhao;Zhiyuan Li;Juho Kannala;Joni Pajarinen", "authorids": "~Wenshuai_Zhao1;~Yi_Zhao6;~Zhiyuan_Li9;~Juho_Kannala5;~Joni_Pajarinen2", "gender": "M;M;M;M;", "homepage": "https://wenshuaizhao.github.io/;https://zhaoyi11.github.io/;https://lizhyun.github.io/;https://users.aalto.fi/~kannalj1/;", "dblp": "246/5109;51/4138-1;39/7780;47/4656.html;23/8355", "google_scholar": "cuNOys8AAAAJ;https://scholar.google.com/citations?hl=en;1GYbhX0AAAAJ;c4mWQPQAAAAJ;https://scholar.google.fi/citations?user=-2fJStwAAAAJ", "orcid": ";0009-0002-9979-595X;0000-0002-1804-3485;0000-0001-5088-4041;0000-0003-4469-8191", "linkedin": ";;nostalduli/;;", "or_profile": "~Wenshuai_Zhao1;~Yi_Zhao6;~Zhiyuan_Li9;~Juho_Kannala5;~Joni_Pajarinen2", "aff": "Aalto University;Max Planck Institute for Intelligent Systems;University of Electronic Science and Technology of China;Aalto University;Aalto University", "aff_domain": "aalto.fi;mpg.tuebingen.de;uestc.edu.cn;aalto.fi;aalto.fi", "position": "PhD student;Intern;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhao2024optimistic,\ntitle={Optimistic Multi-Agent Policy Gradient},\nauthor={Wenshuai Zhao and Yi Zhao and Zhiyuan Li and Juho Kannala and Joni Pajarinen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MWTicAxmRP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1029991, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18023942035056385505&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 12, "email": "aalto.fi;mpg.tuebingen.de;uestc.edu.cn;aalto.fi;aalto.fi", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Aalto University;Max Planck Institute for Intelligent Systems;University of Electronic Science and Technology of China", "aff_unique_dep": ";Intelligent Systems;", "aff_unique_url": "https://www.aalto.fi;https://www.mpi-is.mpg.de;https://www.uestc.edu.cn", "aff_unique_abbr": "Aalto;MPI-IS;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0", "aff_country_unique": "Finland;Germany;China" }, { "title": "TinyTrain: Resource-Aware Task-Adaptive Sparse Training of DNNs at the Data-Scarce Edge", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34234", "id": "MWZWUyfFHC", "proceeding": "https://proceedings.mlr.press/v235/kwon24c.html", "pdf": "https://openreview.net/pdf?id=MWZWUyfFHC", "openreview": "https://openreview.net/forum?id=MWZWUyfFHC", "author_site": "Young Kwon, Rui Li, Stylianos Venieris, Jagmohan Chauhan, Nicholas Lane, Cecilia Mascolo", "tldr": "", "abstract": "On-device training is essential for user personalisation and privacy. With the pervasiveness of IoT devices and microcontroller units (MCUs), this task becomes more challenging due to the constrained memory and compute resources, and the limited availability of labelled user data. Nonetheless, prior works neglect the data scarcity issue, require excessively long training time ($\\textit{e.g.}$ a few hours), or induce substantial accuracy loss ($\\geq$10%). 
In this paper, we propose TinyTrain, an on-device training approach that drastically reduces training time by selectively updating parts of the model and explicitly coping with data scarcity. TinyTrain introduces a task-adaptive sparse-update method that $\\textit{dynamically}$ selects the layer/channel to update based on a multi-objective criterion that jointly captures user data, the memory, and the compute capabilities of the target device, leading to high accuracy on unseen tasks with reduced computation and memory footprint. TinyTrain outperforms vanilla fine-tuning of the entire network by 3.6-5.0% in accuracy, while reducing the backward-pass memory and computation cost by up to 1,098$\\times$ and 7.68$\\times$, respectively. Targeting broadly used real-world edge devices, TinyTrain achieves 9.5$\\times$ faster and 3.5$\\times$ more energy-efficient training over status-quo approaches, and 2.23$\\times$ smaller memory footprint than SOTA methods, while remaining within the 1 MB memory envelope of MCU-grade platforms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Young D. Kwon;Rui Li;Stylianos Venieris;Jagmohan Chauhan;Nicholas Donald Lane;Cecilia Mascolo", "authorids": "~Young_D._Kwon1;~Rui_Li11;~Stylianos_Venieris1;~Jagmohan_Chauhan1;~Nicholas_Donald_Lane1;~Cecilia_Mascolo1", "gender": "M;F;;;;F", "homepage": "https://theyoungkwon.github.io;https://ruihuili.github.io/;https://steliosven10.github.io/;;;http://www.cl.cam.ac.uk/users/cm542", "dblp": "77/5405;;169/2322;;;21/6419", "google_scholar": "_rp_S9MAAAAJ;;https://scholar.google.co.uk/citations?user=A1QXa5cAAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;0000-0001-9614-4380", "linkedin": "theyoungkwon/;;stylianos-i-venieris-300446155;;;", "or_profile": "~Young_D._Kwon1;~Rui_Li11;~Stylianos_Venieris1;~Jagmohan_Chauhan1;~Nicholas_Donald_Lane1;~Cecilia_Mascolo1", "aff": "University of Cambridge;Samsung AI Center;Samsung AI;;;University of Cambridge", "aff_domain": "cam.ac.uk;samsung.com;samsung.com;;;cam.ac.uk", "position": "PhD student;Researcher;Researcher;;;Full Professor", "bibtex": "@inproceedings{\nkwon2024tinytrain,\ntitle={TinyTrain: Resource-Aware Task-Adaptive Sparse Training of {DNN}s at the Data-Scarce Edge},\nauthor={Young D. 
Kwon and Rui Li and Stylianos Venieris and Jagmohan Chauhan and Nicholas Donald Lane and Cecilia Mascolo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MWZWUyfFHC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3469850, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1594029977386440080&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 12, "email": "cam.ac.uk;samsung.com;samsung.com;;;cam.ac.uk", "author_num": 6, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Cambridge;Samsung", "aff_unique_dep": ";AI Center", "aff_unique_url": "https://www.cam.ac.uk;https://www.samsung.com/global/careers/ai-center/", "aff_unique_abbr": "Cambridge;Samsung AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United Kingdom;South Korea" }, { "title": "An Efficient Maximal Ancestral Graph Listing Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34233", "id": "MZkqjV4FRT", "proceeding": "https://proceedings.mlr.press/v235/wang24o.html", "pdf": "https://openreview.net/pdf?id=MZkqjV4FRT", "openreview": "https://openreview.net/forum?id=MZkqjV4FRT", "author_site": "Tian-Zuo Wang, Wen-Bo Du, Zhi-Hua Zhou", "tldr": "", "abstract": "Maximal ancestral graph (MAG) is a prevalent graphical model to characterize causal relations in the presence of *latent variables* including latent confounders and selection variables. Given observational data, only a Markov equivalence class (MEC) of MAGs is identifiable in the absence of additional assumptions. Due to this fact, MAG listing, listing all the MAGs in the MEC, is usually demanded in many downstream tasks. To the best of our knowledge, there are no relevant methods for MAG listing other than brute force in the literature. In this paper, we propose the first brute-force-free MAG listing method, by determining the local structures of each vertex recursively. We provide the graphical characterization for each valid local transformation of a vertex, and present sound and complete rules to incorporate the valid local transformation in the presence of latent confounders and selection variables. Based on these components, our method can efficiently output all the MAGs in the MEC with no redundancy, that is, every intermediate graph in the recursive process is necessary for the MAG listing task. 
The empirical analysis demonstrates the superiority of our proposed method in terms of both efficiency and effectiveness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tian-Zuo Wang;Wen-Bo Du;Zhi-Hua Zhou", "authorids": "~Tian-Zuo_Wang1;~Wen-Bo_Du1;~Zhi-Hua_Zhou2", "gender": "M;M;M", "homepage": "http://www.lamda.nju.edu.cn/wangtz/;https://www.lamda.nju.edu.cn/duwb/;https://cs.nju.edu.cn/zhouzh/", "dblp": "249/9504;35/7086-2;z/ZhiHuaZhou", "google_scholar": "xUyl98AAAAAJ;AQyRj3oAAAAJ;https://scholar.google.com.tw/citations?user=rSVIHasAAAAJ", "orcid": ";;0000-0003-0746-1494", "linkedin": ";;", "or_profile": "~Tian-Zuo_Wang1;~Wen-Bo_Du1;~Zhi-hua_Zhou1", "aff": "Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "Researcher;PhD student;Full Professor", "bibtex": "@inproceedings{\nwang2024an,\ntitle={An Efficient Maximal Ancestral Graph Listing Algorithm},\nauthor={Tian-Zuo Wang and Wen-Bo Du and Zhi-Hua Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MZkqjV4FRT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 692973, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7540703876341291024&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "I/O Complexity of Attention, or How Optimal is FlashAttention?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34232", "id": "MdPBVWTfwG", "proceeding": "https://proceedings.mlr.press/v235/saha24a.html", "pdf": "https://openreview.net/pdf?id=MdPBVWTfwG", "openreview": "https://openreview.net/forum?id=MdPBVWTfwG", "author_site": "Barna Saha, Christopher Ye", "tldr": "", "abstract": "Attention is at the heart of the popular Transformer architecture, yet suffers from quadratic time and memory complexity. In a recent significant development, FlashAttention shows that the I/O complexity of attention is the true bottleneck in scaling Transformers. Given two levels of memory hierarchy, a fast cache (e.g. GPU on-chip SRAM) where computation happens and a slow memory (e.g. GPU high-bandwidth memory) where the data resides, the I/O complexity measures the number of accesses to the slow memory. FlashAttention is an I/O-aware algorithm for self-attention that requires $\\frac{N^2d^2}{M}$ I/O operations where $N$ is the dimension of the attention matrix, $d$ is the head-dimension and $M$ is the size of cache. Naturally, to further reduce the computational costs of Attention, the authors ask the question: is FlashAttention's I/O complexity optimal for every value of $M$? We resolve the above question in its full generality by showing an I/O complexity lower bound that matches the upper bound provided by FlashAttention for any values of $M \\geq d^2$ within any constant factors. Moreover, our lower bounds do not rely on using combinatorial matrix multiplication for computing the attention matrix: even if one uses fast matrix multiplication, the above I/O complexity bounds cannot be improved. 
Further, we give a better algorithm with lower I/O complexity for $M < d^2$, and show that it is optimal for combinatorial algorithms. We do so by introducing a new communication complexity protocol for matrix compression, and connecting communication complexity to I/O complexity. We believe this connection could be of independent interest and will find more applications in proving I/O complexity lower bounds in the future.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Barna Saha;Christopher Ye", "authorids": "~Barna_Saha3;~Christopher_Ye1", "gender": ";M", "homepage": "https://barnasaha.net;https://czye17.github.io", "dblp": ";304/2086", "google_scholar": "BsmjRdoAAAAJ;WmAZ-WIAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Barna_Saha3;~Christopher_Ye1", "aff": "University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu", "position": "Associate Professor;PhD student", "bibtex": "@inproceedings{\nsaha2024io,\ntitle={I/O Complexity of Attention, or How Optimal is FlashAttention?},\nauthor={Barna Saha and Christopher Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MdPBVWTfwG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 441335, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mtgVVeCBp4oJ:scholar.google.com/&scioq=I/O+Complexity+of+Attention,+or+How+Optimal+is+FlashAttention%3F&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "ucsd.edu;ucsd.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Stereo Risk: A Continuous Modeling Approach to Stereo Matching", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34231", "id": "Mfk6ZbD6eY", "proceeding": "https://proceedings.mlr.press/v235/liu24af.html", "pdf": "https://openreview.net/pdf?id=Mfk6ZbD6eY", "openreview": "https://openreview.net/forum?id=Mfk6ZbD6eY", "author_site": "Ce Liu, Suryansh Kumar, Shuhang Gu, Radu Timofte, Yao Yao, Luc Van Gool", "tldr": "", "abstract": "We introduce Stereo Risk, a new deep-learning approach to solve the classical stereo-matching problem in computer vision. As is well known, stereo matching boils down to a per-pixel disparity estimation problem, and popular state-of-the-art stereo-matching approaches widely rely on regressing the scene disparity values, albeit via discretization of those values. Such discretization often fails to capture the nuanced, continuous nature of scene depth. Stereo Risk departs from the conventional discretization approach by formulating the scene disparity as an optimal solution to a continuous risk minimization problem, hence the name \"stereo risk\". We demonstrate that $L^1$ minimization of the proposed continuous risk function enhances stereo-matching performance for deep networks, particularly for disparities with multi-modal probability distributions. Furthermore, to enable the end-to-end network training of the non-differentiable $L^1$ risk optimization, we exploit the implicit function theorem, ensuring a fully differentiable network. 
A comprehensive analysis demonstrates our method's theoretical soundness and superior performance over the state-of-the-art methods across various benchmark datasets, including KITTI 2012, KITTI 2015, ETH3D, SceneFlow, and Middlebury 2014.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ce Liu;Suryansh Kumar;Shuhang Gu;Radu Timofte;Yao Yao;Luc Van Gool", "authorids": "~Ce_Liu3;~Suryansh_Kumar1;~Shuhang_Gu3;~Radu_Timofte1;~Yao_Yao1;~Luc_Van_Gool1", "gender": "M;M;M;M;M;", "homepage": "https://vision.ee.ethz.ch/people-details.MjYzNDA1.TGlzdC8zMjg5LC0xOTcxNDY1MTc4.html;https://suryanshkumar.github.io/;;https://www.informatik.uni-wuerzburg.de/computervision/;https://yoyo000.github.io/;", "dblp": ";124/2783;126/1028;24/8616;07/4410-8;61/5017", "google_scholar": ";wbk0QAcAAAAJ;-kSTt40AAAAJ;https://scholar.google.ch/citations?user=u3MwH5kAAAAJ;MGxaDVEAAAAJ;https://scholar.google.be/citations?user=TwMib_QAAAAJ", "orcid": ";;;0000-0002-1478-0402;;", "linkedin": ";;;https://ch.linkedin.com/in/radutimofte;;", "or_profile": "~Ce_Liu3;~Suryansh_Kumar1;~Shuhang_Gu3;~Radu_Timofte1;~Yao_Yao1;~Luc_Van_Gool1", "aff": "ETH Zurich;Texas A&M University - College Station;University of Electronic Science and Technology of China;Bayerische Julius-Maximilians-Universit\u00e4t W\u00fcrzburg;Nanjing University;KU Leuven", "aff_domain": "vision.ee.ethz.ch;tamu.edu;uestc.edu.cn;uni-wuerzburg.de;nju.edu.cn;kuleuven.be", "position": "PhD student;Assistant Professor;Full Professor;Full Professor;Associate Professor;Emeritus", "bibtex": "@inproceedings{\nliu2024stereo,\ntitle={Stereo Risk: A Continuous Modeling Approach to Stereo Matching},\nauthor={Ce Liu and Suryansh Kumar and Shuhang Gu and Radu Timofte and Yao Yao and Luc Van Gool},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Mfk6ZbD6eY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9994842, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15593071776498799921&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "vision.ee.ethz.ch;tamu.edu;uestc.edu.cn;uni-wuerzburg.de;nju.edu.cn;kuleuven.be", "author_num": 6, "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "ETH Zurich;Texas A&M University;University of Electronic Science and Technology of China;University of W\u00fcrzburg;Nanjing University;Katholieke Universiteit Leuven", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.ethz.ch;https://www.tamu.edu;https://www.uestc.edu.cn;https://www.uni-wuerzburg.de;https://www.nju.edu.cn;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;TAMU;UESTC;JMU;Nanjing U;KU Leuven", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";College Station;W\u00fcrzburg", "aff_country_unique_index": "0;1;2;3;2;4", "aff_country_unique": "Switzerland;United States;China;Germany;Belgium" }, { "title": "Instruction Tuning for Secure Code Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34230", "id": "MgTzMaYHvG", "proceeding": "https://proceedings.mlr.press/v235/he24k.html", "pdf": "https://openreview.net/pdf?id=MgTzMaYHvG", "openreview": "https://openreview.net/forum?id=MgTzMaYHvG", "author_site": "Jingxuan He, Mark Vero, Gabriela Krasnopolska, Martin Vechev", "tldr": "", "abstract": "Modern language models (LMs) have gained widespread acceptance in everyday and professional contexts, particularly in 
programming. An essential procedure enabling this adoption is instruction tuning, which substantially enhances LMs' practical utility by training them to follow user instructions and human preferences. However, existing instruction tuning schemes overlook a crucial aspect: the security of generated code. As a result, even the state-of-the-art instruction-tuned LMs frequently produce unsafe code, posing significant security risks. In this work, we introduce SafeCoder to address this gap. SafeCoder performs security-centric fine-tuning using a diverse and high-quality dataset that we collected using an automated pipeline. We integrate the security fine-tuning with standard instruction tuning, to facilitate a joint optimization of both security and utility. Despite its simplicity, we show that SafeCoder is effective across a variety of popular LMs and datasets. It is able to drastically improve security (by about 30%), while preserving utility.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingxuan He;Mark Vero;Gabriela Krasnopolska;Martin Vechev", "authorids": "~Jingxuan_He1;~Mark_Vero1;~Gabriela_Krasnopolska1;~Martin_Vechev1", "gender": "M;M;F;M", "homepage": "https://www.sri.inf.ethz.ch/people/jingxuan;https://www.sri.inf.ethz.ch/people/markvero;;https://www.sri.inf.ethz.ch/people/martin", "dblp": ";319/4985;;93/2189.html", "google_scholar": "ylHZY58AAAAJ;vguDYtQAAAAJ;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";;;", "linkedin": ";https://linkedin.com/in/mark-vero-9a32bb17a;gabriela-krasnopolska-59a326231?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;", "or_profile": "~Jingxuan_He1;~Mark_Vero1;~Gabriela_Krasnopolska1;~Martin_Vechev1", "aff": "ETHZ - ETH Zurich;ETHZ-ETH Zurich;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;inf.ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nhe2024instruction,\ntitle={Instruction Tuning for Secure Code Generation},\nauthor={Jingxuan He and Mark Vero and Gabriela Krasnopolska and Martin Vechev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MgTzMaYHvG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 474838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3265167645748773017&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ethz.ch;inf.ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Knowledge-aware Reinforced Language Models for Protein Directed Evolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34229", "id": "MikandLqtW", "proceeding": "https://proceedings.mlr.press/v235/wang24cq.html", "pdf": "https://openreview.net/pdf?id=MikandLqtW", "openreview": "https://openreview.net/forum?id=MikandLqtW", "author_site": "Yuhao Wang, Qiang Zhang, Ming Qin, Xiang Zhuang, Xiaotong Li, Zhichen Gong, Zeyuan Wang, Yu Zhao, Jianhua Yao, Keyan Ding, Huajun Chen", "tldr": "", 
"abstract": "Directed evolution, a cornerstone of protein optimization, is to harness natural mutational processes to enhance protein functionality. Existing Machine Learning-assisted Directed Evolution (MLDE) methodologies typically rely on data-driven strategies and often overlook the profound domain knowledge in biochemical fields. In this paper, we introduce a novel Knowledge-aware Reinforced Language Model (KnowRLM) for MLDE. An Amino Acid Knowledge Graph (AAKG) is constructed to represent the intricate biochemical relationships among amino acids. We further propose a Protein Language Model (PLM)-based policy network that iteratively samples mutants through preferential random walks on the AAKG using a dynamic sliding window mechanism. The novel mutants are actively sampled to fine-tune a fitness predictor as the reward model, providing feedback to the knowledge-aware policy. Finally, we optimize the whole system in an active learning approach that mimics biological settings in practice.KnowRLM stands out for its ability to utilize contextual amino acid information from knowledge graphs, thus attaining advantages from both statistical patterns of protein sequences and biochemical properties of amino acids.Extensive experiments demonstrate the superior performance of KnowRLM in more efficiently identifying high-fitness mutants compared to existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhao Wang;Qiang Zhang;Ming Qin;Xiang Zhuang;Xiaotong Li;Zhichen Gong;Zeyuan Wang;Yu Zhao;Jianhua Yao;Keyan Ding;Huajun Chen", "authorids": "~Yuhao_Wang12;~Qiang_Zhang6;~Ming_Qin3;~Xiang_Zhuang1;~Xiaotong_Li3;~Zhichen_Gong1;~Zeyuan_Wang3;~Yu_Zhao8;~Jianhua_Yao3;~Keyan_Ding1;~Huajun_Chen1", "gender": ";;M;;;;M;M;M;M;M", "homepage": "https://github.com/zju-wyh;https://qiangairesearcher.github.io;;;https://github.com/MercuryDemo;;;;;;", "dblp": ";72/3527-26;76/3104;;;;;57/2056-9;;195/3500;94/5089", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;;;https://scholar.google.co.uk/citations?hl=zh-CN;;7XOW0wcAAAAJ;https://scholar.google.com/citations?hl=en;A7u-ZowAAAAJ;", "orcid": "0009-0000-0013-7259;;0000-0001-8607-8965;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Yuhao_Wang12;~Qiang_Zhang6;~Ming_Qin3;~Xiang_Zhuang1;~Xiaotong_Li3;~Zhichen_Gong1;~Zeyuan_Wang3;~Yu_Zhao8;~Jianhua_Yao3;~Keyan_Ding1;~Huajun_Chen1", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;;Zhejiang University;University College London, University of London;Zhejiang University;Tencent AI Lab;Tencent AI Lab;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;;zju.edu.cn;ucl.ac.uk;zju.edu.cn;tencent.com;tencent.com;zju.edu.cn;zju.edu.cn", "position": "MS student;Principal Researcher;PhD student;;MS student;PhD student;PhD student;Researcher;Principal Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nwang2024knowledgeaware,\ntitle={Knowledge-aware Reinforced Language Models for Protein Directed Evolution},\nauthor={Yuhao Wang and Qiang Zhang and Ming Qin and Xiang Zhuang and Xiaotong Li and Zhichen Gong and Zeyuan Wang and Yu Zhao and Jianhua Yao and Keyan Ding and Huajun Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MikandLqtW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 783619, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 2, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9011732922546567359&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn;;zju.edu.cn;ucl.ac.uk;zju.edu.cn;tencent.com;tencent.com;zju.edu.cn;zju.edu.cn", "author_num": 11, "aff_unique_index": "0;0;0;0;1;0;2;2;0;0", "aff_unique_norm": "Zhejiang University;University College London;Tencent", "aff_unique_dep": ";;Tencent AI Lab", "aff_unique_url": "https://www.zju.edu.cn;https://www.ucl.ac.uk;https://ai.tencent.com", "aff_unique_abbr": "ZJU;UCL;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0;0", "aff_country_unique": "China;United Kingdom" }, { "title": "LaMAGIC: Language-Model-based Topology Generation for Analog Integrated Circuits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34228", "id": "MjGCD8wk1k", "proceeding": "https://proceedings.mlr.press/v235/chang24c.html", "pdf": "https://openreview.net/pdf?id=MjGCD8wk1k", "openreview": "https://openreview.net/forum?id=MjGCD8wk1k", "author_site": "Chen-Chia Chang, Yikang Shen, Shaoze Fan, Jing Li, Shun Zhang, Ningyuan Cao, Yiran Chen, Xin Zhang", "tldr": "", "abstract": "In the realm of electronic and electrical engineering, automation of analog circuit is increasingly vital given the complexity and customized requirements of modern applications. However, existing methods only develop search-based algorithms that require many simulation iterations to design a custom circuit topology, which is usually a time-consuming process. To this end, we introduce LaMAGIC, a pioneering language model-based topology generation model that leverages supervised finetuning for automated analog circuit design. LaMAGIC can efficiently generate an optimized circuit design from the custom specification in a single pass. Our approach involves a meticulous development and analysis of various input and output formulations for circuit. These formulations can ensure canonical representations of circuits and align with the autoregressive nature of LMs to effectively addressing the challenges of representing analog circuits as graphs. The experimental results show that LaMAGIC achieves a success rate of up to 96% under a strict tolerance of 0.01. We also examine the scalability and adaptability of LaMAGIC, specifically testing its performance on more complex circuits. Our findings reveal the enhanced effectiveness of our adjacency matrix-based circuit formulation with floating-point input, suggesting its suitability for handling intricate circuit designs. 
This research not only demonstrates the potential of language models in graph generation, but also builds a foundational framework for future explorations in automated analog circuit design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen-Chia Chang;Yikang Shen;Shaoze Fan;Jing Li;Shun Zhang;Ningyuan Cao;Yiran Chen;Xin Zhang", "authorids": "~Chen-Chia_Chang1;~Yikang_Shen1;sf392@njit.edu;~Jing_Li21;~Shun_Zhang6;ncao@nd.edu;~Yiran_Chen1;~Xin_Zhang49", "gender": "M;M;;;;;M;Not Specified", "homepage": ";;;;https://shunzh.github.io/;;https://ece.duke.edu/people/yiran-chen/;", "dblp": ";152/8226;;;;;80/1641;", "google_scholar": "https://scholar.google.com.tw/citations?user=w1-TnxYAAAAJ;qff5rRYAAAAJ;;;;;;KiLiEcYAAAAJ", "orcid": ";;;;;;0000-0002-1486-8412;", "linkedin": ";;;;;;;xinzhangibm/", "or_profile": "~Chen-Chia_Chang1;~Yikang_Shen1;sf392@njit.edu;~Jing_Li21;~Shun_Zhang6;ncao@nd.edu;~Yiran_Chen1;~Xin_Zhang49", "aff": "Duke University;International Business Machines;;;MIT-IBM Watson AI Lab;;Duke University;IBM T. J. Watson Research Center", "aff_domain": "duke.edu;ibm.com;;;ibm.com;;duke.edu;us.ibm.com", "position": "PhD student;Researcher;;;Researcher;;Professor;Researcher", "bibtex": "@inproceedings{\nchang2024lamagic,\ntitle={La{MAGIC}: Language-Model-based Topology Generation for Analog Integrated Circuits},\nauthor={Chen-Chia Chang and Yikang Shen and Shaoze Fan and Jing Li and Shun Zhang and Ningyuan Cao and Yiran Chen and Xin Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MjGCD8wk1k}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 645399, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6501909701401032938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "duke.edu;ibm.com;;;ibm.com;;duke.edu;us.ibm.com", "author_num": 8, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Duke University;International Business Machines Corporation;Massachusetts Institute of Technology;IBM", "aff_unique_dep": ";;IBM Watson AI Lab;IBM", "aff_unique_url": "https://www.duke.edu;https://www.ibm.com;https://www.mitibmwatsonailab.org;https://www.ibm.com/research/watson", "aff_unique_abbr": "Duke;IBM;MIT-IBM AI Lab;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";T. J. Watson", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Prototypical Visual Explanations with Reward Reweighing, Reselection, and Retraining", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34227", "id": "MlzUD5CKvZ", "proceeding": "https://proceedings.mlr.press/v235/li24ba.html", "pdf": "https://openreview.net/pdf?id=MlzUD5CKvZ", "openreview": "https://openreview.net/forum?id=MlzUD5CKvZ", "author_site": "Aaron Li, Robin Netzorg, Zhihan Cheng, Zhuoqin Zhang, Bin Yu", "tldr": "", "abstract": "In recent years, work has gone into developing deep interpretable methods for image classification that clearly attribute a model's output to specific features of the data. One such method is the Prototypical Part Network (ProtoPNet), which attempts to classify images based on meaningful parts of the input. While this architecture is able to produce visually interpretable classifications, it often learns to classify based on parts of the image that are not semantically meaningful.
To address this problem, we propose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing framework, which performs three additional corrective updates to a pretrained ProtoPNet in an offline and efficient manner. The first two steps involve learning a reward model based on collected human feedback and then aligning the prototypes with human preferences. The final step is retraining, which realigns the base features and the classifier layer of the original model with the updated prototypes. We find that our R3 framework consistently improves both the interpretability and the predictive accuracy of ProtoPNet and its variants.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron Jiaxun Li;Robin Netzorg;Zhihan Cheng;Zhuoqin Zhang;Bin Yu", "authorids": "~Aaron_Jiaxun_Li1;~Robin_Netzorg1;~Zhihan_Cheng1;~Zhuoqin_Zhang1;~Bin_Yu5", "gender": "M;F;F;F;M", "homepage": "https://aaron-jx-li.github.io/;https://www.stat.berkeley.edu/~yugroup/people/Robbie.html;;https://www.linkedin.com/in/grace-z-439b1a1a7/;https://binyu.stat.berkeley.edu", "dblp": "133/7864-2.html;232/1837;;;27/116", "google_scholar": "MGqle3EAAAAJ;;;;https://scholar.google.com.hk/citations?user=z1iJa3UAAAAJ", "orcid": ";;;;0000-0003-3097-1433", "linkedin": "aaronjxli5351/;;zhihan-cheng-259223210/;;bin-yu-b665063/", "or_profile": "~Aaron_Jiaxun_Li1;~Robin_Netzorg1;~Zhihan_Cheng1;~Zhuoqin_Zhang1;~Bin_Yu5", "aff": "Harvard University;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "g.harvard.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": "MS student;PhD student;Undergrad student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nli2024improving,\ntitle={Improving Prototypical Visual Explanations with Reward Reweighing, Reselection, and Retraining},\nauthor={Aaron Jiaxun Li and Robin Netzorg and Zhihan Cheng and Zhuoqin Zhang and Bin Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MlzUD5CKvZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5608183, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17930755672376693398&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "g.harvard.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Harvard University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://www.berkeley.edu", "aff_unique_abbr": "Harvard;UC Berkeley", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "What Would Gauss Say About Representations? 
Probing Pretrained Image Models using Synthetic Gaussian Benchmarks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34226", "id": "MmZJ3kJXjX", "proceeding": "https://proceedings.mlr.press/v235/ko24a.html", "pdf": "https://openreview.net/pdf?id=MmZJ3kJXjX", "openreview": "https://openreview.net/forum?id=MmZJ3kJXjX", "author_site": "Ching-Yun (Irene) Ko, Pin-Yu Chen, Payel Das, Jeet Mohapatra, Luca Daniel", "tldr": "", "abstract": "Recent years have witnessed a paradigm shift in deep learning from task-centric model design to task-agnostic representation learning and task-specific fine-tuning. Pretrained model representations are commonly evaluated extensively across various real-world tasks and used as a foundation for different downstream tasks. This paper proposes a solution for assessing the quality of representations in a task-agnostic way. To circumvent the need for real-world data in evaluation, we explore the use of synthetic binary classification tasks with Gaussian mixtures to probe pretrained models and compare the robustness-accuracy performance on pretrained representations with an idealized reference. Our approach offers a holistic evaluation, revealing intrinsic model capabilities and reducing the dependency on real-life data for model evaluation. Evaluated with various pretrained image models, the experimental results confirm that our task-agnostic evaluation correlates with actual linear probing performance on downstream tasks and can also guide parameter choice in robust linear probing to achieve a better robustness-accuracy trade-off.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ching-Yun Ko;Pin-Yu Chen;Payel Das;Jeet Mohapatra;Luca Daniel", "authorids": "~Ching-Yun_Ko1;~Pin-Yu_Chen1;~Payel_Das1;~Jeet_Mohapatra1;~Luca_Daniel1", "gender": "F;M;F;M;", "homepage": ";http://www.pinyuchen.com;;;https://www.mit.edu/~dluca/", "dblp": "206/6472;39/8969;56/7926;210/2304;35/5202", "google_scholar": ";jxwlCUUAAAAJ;;;", "orcid": ";0000-0003-1039-8369;;;0000-0002-5880-3151", "linkedin": ";pin-yu-chen-940062a2;;;", "or_profile": "~Ching-Yun_Ko1;~Pin-Yu_Chen1;~Payel_Das1;~Jeet_Mohapatra1;~Luca_Daniel1", "aff": "Massachusetts Institute of Technology;International Business Machines;IBM, International Business Machines;;", "aff_domain": "mit.edu;ibm.com;us.ibm.com;;", "position": "PhD student;Principal Researcher;Principal Researcher;;", "bibtex": "@inproceedings{\nko2024what,\ntitle={What Would Gauss Say About Representations? 
Probing Pretrained Image Models using Synthetic Gaussian Benchmarks},\nauthor={Ching-Yun Ko and Pin-Yu Chen and Payel Das and Jeet Mohapatra and Luca Daniel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MmZJ3kJXjX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2816239, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6465548558865857826&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "mit.edu;ibm.com;us.ibm.com;;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;International Business Machines Corporation;International Business Machines", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com;https://www.ibm.com", "aff_unique_abbr": "MIT;IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "DeCoOp: Robust Prompt Tuning with Out-of-Distribution Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34225", "id": "MoTUdh9ZCc", "proceeding": "https://proceedings.mlr.press/v235/zhou24s.html", "pdf": "https://openreview.net/pdf?id=MoTUdh9ZCc", "openreview": "https://openreview.net/forum?id=MoTUdh9ZCc", "author_site": "Zhi Zhou, Ming Yang, Jiang-Xin Shi, Lan-Zhe Guo, Yu-Feng Li", "tldr": "", "abstract": "Vision-language models (VLMs), such as CLIP, have demonstrated impressive zero-shot capabilities for various downstream tasks. Their performance can be further enhanced through few-shot prompt tuning methods. However, current studies evaluate the performance of learned prompts separately on base and new classes. This evaluation lacks practicality for real-world applications since downstream tasks cannot determine whether the data belongs to base or new classes in advance. In this paper, we explore a problem setting called ***O**pen-world **P**rompt **T**uning* (OPT), which involves tuning prompts on base classes and evaluating on a combination of base and new classes. By introducing ***De**composed **P**rompt **T**uning* framework (DePT), we theoretically demonstrate that OPT can be solved by incorporating out-of-distribution detection into prompt tuning, thereby enhancing the base-to-new discriminability. Based on DePT, we present a novel prompt tuning approach, namely, ***De**composed **Co**ntext **Op**timization* (DeCoOp), which introduces new-class detectors and sub-classifiers to further enhance the base-class and new-class discriminability. 
Experimental results on 11 benchmark datasets validate the effectiveness of DePT and demonstrate that DeCoOp outperforms current state-of-the-art methods, providing a significant 2% average accuracy improvement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhi Zhou;Ming Yang;Jiang-Xin Shi;Lan-Zhe Guo;Yu-Feng Li", "authorids": "~Zhi_Zhou2;~Ming_Yang22;~Jiang-Xin_Shi1;~Lan-Zhe_Guo2;~Yu-Feng_Li1", "gender": "M;;M;M;M", "homepage": "http://www.lamda.nju.edu.cn/zhouz/;http://www.lamda.nju.edu.cn/shijx;http://www.lamda.nju.edu.cn/guolz;https://github.com/MingYang1010;https://cs.nju.edu.cn/liyf/index.htm", "dblp": "04/2090-7;299/5485.html;216/4845;;57/413", "google_scholar": "VzvP5a8AAAAJ;KEgtGncAAAAJ;dpunvqgAAAAJ;iOQobvoAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-0318-0911;;;0000-0002-2220-5248", "linkedin": ";;;;", "or_profile": "~Zhi_Zhou2;~Jiang-Xin_Shi1;~Lan-Zhe_Guo2;~Ming_Yang30;~Yu-feng_Li2", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing University of Aeronautics and Astronautics;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nuaa.edu.cn;nju.edu.cn", "position": "PhD student;PhD student;Assistant Professor;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nzhou2024decoop,\ntitle={DeCoOp: Robust Prompt Tuning with Out-of-Distribution Detection},\nauthor={Zhi Zhou and Ming Yang and Jiang-Xin Shi and Lan-Zhe Guo and Yu-Feng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MoTUdh9ZCc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3435035, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7119900341021184684&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nuaa.edu.cn;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Nanjing University;Nanjing University of Aeronautics and Astronautics", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;http://www.nuaa.edu.cn", "aff_unique_abbr": "Nanjing U;NUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Robust Yet Efficient Conformal Prediction Sets", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34224", "id": "MrNq6rbcUi", "proceeding": "https://proceedings.mlr.press/v235/h-zargarbashi24a.html", "pdf": "https://openreview.net/pdf?id=MrNq6rbcUi", "openreview": "https://openreview.net/forum?id=MrNq6rbcUi", "author_site": "Soroush H. Zargarbashi, Mohammad Sadegh Akhondzadeh, Aleksandar Bojchevski", "tldr": "", "abstract": "Conformal prediction (CP) can convert any model's output into prediction sets guaranteed to include the true label with any user-specified probability. However, like the model itself, CP is vulnerable to adversarial test examples (evasion) and perturbed calibration data (poisoning). We derive provably robust sets by bounding the worst-case change in conformity scores. Our tighter bounds lead to more efficient sets. We cover both continuous and discrete (sparse) data and our guarantees work both for evasion and poisoning attacks (on both features and labels).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Soroush H.
Zargarbashi;Mohammad Sadegh Akhondzadeh;Aleksandar Bojchevski", "authorids": "~Soroush_H._Zargarbashi1;~Mohammad_Sadegh_Akhondzadeh1;~Aleksandar_Bojchevski1", "gender": "M;M;M", "homepage": "https://www.linkedin.com/in/soroushzargar/;https://msadegh97.github.io/;https://abojchevski.github.io/", "dblp": "354/2876;264/0078;203/8114", "google_scholar": "https://scholar.google.com/citations?hl=en;zmlh-HMAAAAJ;https://scholar.google.de/citations?user=F1APiN4AAAAJ", "orcid": ";;", "linkedin": "soroushzargar/;;", "or_profile": "~Soroush_H._Zargarbashi1;~Mohammad_Sadegh_Akhondzadeh1;~Aleksandar_Bojchevski1", "aff": "CISPA, saarland university, saarland informatics campus;Universit\u00e4t K\u00f6ln;University of Cologne", "aff_domain": "cispa.saarland;uni-koeln.de;uni-koeln.de", "position": "PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nzargarbashi2024robust,\ntitle={Robust Yet Efficient Conformal Prediction Sets},\nauthor={Soroush H. Zargarbashi and Mohammad Sadegh Akhondzadeh and Aleksandar Bojchevski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MrNq6rbcUi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 546176, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5298941596553084039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cispa.saarland;uni-koeln.de;uni-koeln.de", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Saarland University;University of Cologne", "aff_unique_dep": "CISPA;", "aff_unique_url": "https://www.uni-saarland.de;https://www.uni-koeln.de/", "aff_unique_abbr": "Saarland U;UC", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Local Feature Selection without Label or Feature Leakage for Interpretable Machine Learning Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34223", "id": "Msjovr9hUe", "proceeding": "https://proceedings.mlr.press/v235/oosterhuis24a.html", "pdf": "https://openreview.net/pdf?id=Msjovr9hUe", "openreview": "https://openreview.net/forum?id=Msjovr9hUe", "author_site": "Harrie Oosterhuis, Lijun Lyu, Avishek Anand", "tldr": "", "abstract": "Local feature selection in machine learning provides instance-specific explanations by focusing on the most relevant features for each prediction, enhancing the interpretability of complex models. However, such methods tend to produce misleading explanations by encoding additional information in their selections. In this work, we address the problem of misleading selections by formalizing the concepts of label and feature leakage. We rigorously derive the necessary and sufficient conditions under which we can guarantee no leakage, and show that existing methods do not meet these conditions. Furthermore, we propose the first local feature selection method that is proven to have no leakage, called SUWR. Our experimental results indicate that SUWR is less prone to overfitting and combines state-of-the-art predictive performance with high feature-selection sparsity.
Our generic and easily extendable formal approach provides a strong theoretical basis for future work on interpretability with reliable explanations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harrie Oosterhuis;Lijun Lyu;Avishek Anand", "authorids": "~Harrie_Oosterhuis2;~Lijun_Lyu1;~Avishek_Anand1", "gender": "M;F;M", "homepage": "https://harrieo.github.io/;;https://www.avishekanand.com", "dblp": ";;02/7062", "google_scholar": "e9JynrAAAAAJ;OdC-530AAAAJ;https://scholar.google.de/citations?user=BMdfo4UAAAAJ", "orcid": "0000-0002-0458-9233;;", "linkedin": ";;avishek-anand-4615694/", "or_profile": "~Harrie_Oosterhuis2;~Lijun_Lyu1;~Avishek_Anand1", "aff": "Google;Delft University of Technology;Delft University of Technology", "aff_domain": "google.com;tudelft.nl;tudelft.nl", "position": "Researcher;PhD student;Associate Professor", "bibtex": "@inproceedings{\noosterhuis2024local,\ntitle={Local Feature Selection without Label or Feature Leakage for Interpretable Machine Learning Predictions},\nauthor={Harrie Oosterhuis and Lijun Lyu and Avishek Anand},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Msjovr9hUe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 663841, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8030310740735384345&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "google.com;tudelft.nl;tudelft.nl", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Google;Delft University of Technology", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.tudelft.nl", "aff_unique_abbr": "Google;TU Delft", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Netherlands" }, { "title": "Easing Concept Bleeding in Diffusion via Entity Localization and Anchoring", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34222", "id": "MsnJl6JkZS", "proceeding": "https://proceedings.mlr.press/v235/zhang24s.html", "pdf": "https://openreview.net/pdf?id=MsnJl6JkZS", "openreview": "https://openreview.net/forum?id=MsnJl6JkZS", "author_site": "Jiewei Zhang, Song Guo, Peiran Dong, Jie ZHANG, Ziming Liu, Yue Yu, Xiao-Ming Wu", "tldr": "", "abstract": "Recent diffusion models have manifested extraordinary capabilities in generating high-quality, diverse, and innovative images guided by textual prompts. Nevertheless, these state-of-the-art models may encounter the challenge of concept bleeding when generating images with multiple entities or attributes in the prompt, leading to the unanticipated merging or overlapping of distinct objects in the synthesized result. Existing work exploits auxiliary networks to produce mask-constrained regions for entities, necessitating the training of an object detection network. In this paper, we investigate the bleeding reason and find that the cross-attention map associated with a specific entity or attribute tends to extend beyond its intended focus, encompassing the background or other unrelated objects and thereby acting as the primary source of concept bleeding. Motivated by this, we propose Entity Localization and Anchoring (ELA) to drive the entity to concentrate on the expected region accurately during inference, eliminating the necessity for training.
Specifically, we initially identify the region corresponding to each entity and subsequently employ a tailored loss function to anchor entities within their designated positioning areas. Extensive experiments demonstrate its superior capability in precisely generating multiple objects as specified in the textual prompts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiewei Zhang;Song Guo;Peiran Dong;Jie ZHANG;Ziming Liu;Yue Yu;Xiao-Ming Wu", "authorids": "~Jiewei_Zhang1;~Song_Guo5;~Peiran_Dong1;~Jie_ZHANG18;~Ziming_Liu1;~Yue_Yu8;~Xiao-Ming_Wu1", "gender": "M;M;M;F;M;M;F", "homepage": "http://peilab.comp.polyu.edu.hk/people/jiewei-zhang/;https://cse.hkust.edu.hk/~songguo/;https://polyu.netlify.app/people/peiran-dong/;https://cugzj.github.io/zhangjie.github.io/;;http://yuyue.github.io/;http://www4.comp.polyu.edu.hk/~csxmwu/", "dblp": "15/10697;01/267-1;243/6454;84/6889-76;;55/2008-1;98/2898-3", "google_scholar": "https://scholar.google.com.hk/citations?user=gak5NX0AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?hl=zh-CN;JRCNlI8AAAAJ;b1WCs5kAAAAJ;VnqWgEwAAAAJ;3KbaUFkAAAAJ", "orcid": "0000-0003-2841-6422;;0000-0002-1129-9218;0000-0002-8073-2118;0000-0001-8001-9585;0000-0002-9865-2212;", "linkedin": ";;;;;;", "or_profile": "~Jiewei_Zhang1;~Song_Guo5;~Peiran_Dong1;~Jie_ZHANG18;~Ziming_Liu1;~Yue_Yu8;~Xiao-Ming_Wu1", "aff": "The Hong Kong Polytechnic University;Department of Computer Science and Engineering, Hong Kong University of Science and Technology;Hong Kong Polytechnic University;The Hong Kong Polytechnic University;The Hong Kong Polytechnic University;National University of Defense Technology;Hong Kong Polytechnic University", "aff_domain": "polyu.edu.hk;cse.ust.hk;polyu.edu.hk;polyu.edu.hk;connect.polyu.hk;nudt.edu.cn;polyu.edu.hk", "position": "PhD student;Full Professor;PhD student;Postdoc;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2024easing,\ntitle={Easing Concept Bleeding in Diffusion via Entity Localization and Anchoring},\nauthor={Jiewei Zhang and Song Guo and Peiran Dong and Jie ZHANG and Ziming Liu and Yue Yu and Xiao-Ming Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MsnJl6JkZS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3044178, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6600086703213733806&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "polyu.edu.hk;cse.ust.hk;polyu.edu.hk;polyu.edu.hk;connect.polyu.hk;nudt.edu.cn;polyu.edu.hk", "author_num": 7, "aff_unique_index": "0;1;0;0;0;2;0", "aff_unique_norm": "Hong Kong Polytechnic University;Hong Kong University of Science and Technology;National University of Defense Technology", "aff_unique_dep": ";Department of Computer Science and Engineering;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.ust.hk;http://www.nudt.edu.cn/", "aff_unique_abbr": "PolyU;HKUST;NUDT", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Balancing Feature Similarity and Label Variability for Optimal Size-Aware One-shot Subset Selection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34221", "id": "MurkwIl0h3", "proceeding": 
"https://proceedings.mlr.press/v235/acharya24a.html", "pdf": "https://openreview.net/pdf?id=MurkwIl0h3", "openreview": "https://openreview.net/forum?id=MurkwIl0h3", "author_site": "Abhinab Acharya, Dayou Yu, Qi Yu, Xumin Liu", "tldr": "", "abstract": "Subset or core-set selection offers a data-efficient way for training deep learning models. One-shot subset selection poses additional challenges as subset selection is only performed once and full set data become unavailable after the selection. However, most existing methods tend to choose either diverse or difficult data samples, which fail to faithfully represent the joint data distribution that is comprised of both feature and label information. The selection is also performed independently from the subset size, which plays an essential role in choosing what types of samples. To address this critical gap, we propose to conduct Feature similarity and Label variability Balanced One-shot Subset Selection (BOSS), aiming to construct an optimal size-aware subset for data-efficient deep learning. We show that a novel balanced core-set loss bound theoretically justifies the need to simultaneously consider both diversity and difficulty to form an optimal subset. It also reveals how the subset size influences the bound. We further connect the inaccessible bound to a practical surrogate target which is tailored to subset sizes and varying levels of overall difficulty. We design a novel Beta-scoring importance function to delicately control the optimal balance of diversity and difficulty. Comprehensive experiments conducted on both synthetic and real data justify the important theoretical properties and demonstrate the superior performance of BOSS as compared with the competitive baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhinab Acharya;Dayou Yu;Qi Yu;Xumin Liu", "authorids": "~Abhinab_Acharya1;~Dayou_Yu1;~Qi_Yu1;~Xumin_Liu1", "gender": "M;;M;F", "homepage": ";https://people.rit.edu/~dy2507/;https://www.rit.edu/mining/;https://www.cs.rit.edu/people/faculty/xl/", "dblp": ";319/4611;58/6957-1;61/5010", "google_scholar": "jSwlu-YAAAAJ;Obh2NOwAAAAJ;L3gWdfEAAAAJ;https://scholar.google.com.tw/citations?user=2Qq9lnUAAAAJ", "orcid": ";0009-0002-2373-4907;0000-0002-0426-5407;", "linkedin": "abhi303/;;;", "or_profile": "~Abhinab_Acharya1;~Dayou_Yu1;~Qi_Yu1;~Xumin_Liu1", "aff": "Rochester Institute of Technology;Rochester Institute of Technology;Rochester Institute of Technology;Rochester Institute of Technology", "aff_domain": "rit.edu;rit.edu;rit.edu;rit.edu", "position": "PhD student;PhD student;Professor;Full Professor", "bibtex": "@inproceedings{\nacharya2024balancing,\ntitle={Balancing Feature Similarity and Label Variability for Optimal Size-Aware One-shot Subset Selection},\nauthor={Abhinab Acharya and Dayou Yu and Qi Yu and Xumin Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MurkwIl0h3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2941467, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15907245553198989363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "rit.edu;rit.edu;rit.edu;rit.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Rochester Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.rit.edu", "aff_unique_abbr": "RIT", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Risk Aware Benchmarking of Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34220", "id": "Mv8y13wfDm", "proceeding": "https://proceedings.mlr.press/v235/nitsure24a.html", "pdf": "https://openreview.net/pdf?id=Mv8y13wfDm", "openreview": "https://openreview.net/forum?id=Mv8y13wfDm", "author_site": "Apoorva Nitsure, Youssef Mroueh, Mattia Rigotti, Kristjan Greenewald, Brian Belgodere, Mikhail Yurochkin, Jiri Navratil, Igor Melnyk, Jarret Ross", "tldr": "", "abstract": "We propose a distributional framework for benchmarking socio-technical risks of foundation models with quantified statistical significance. Our approach hinges on a new statistical relative testing based on first and second order stochastic dominance of real random variables. We show that the second order statistics in this test are linked to mean-risk models commonly used in econometrics and mathematical finance to balance risk and utility when choosing between alternatives. Using this framework, we formally develop a risk-aware approach for foundation model selection given guardrails quantified by specified metrics. Inspired by portfolio optimization and selection theory in mathematical finance, we define a metrics portfolio for each model as a means to aggregate a collection of metrics, and perform model selection based on the stochastic dominance of these portfolios. The statistical significance of our tests is backed theoretically by an asymptotic analysis via central limit theorems instantiated in practice via a bootstrap variance estimate. We use our framework to compare various large language models regarding risks related to drifting from instructions and outputting toxic content.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Apoorva Nitsure;Youssef Mroueh;Mattia Rigotti;Kristjan Greenewald;Brian Belgodere;Mikhail Yurochkin;Jiri Navratil;Igor Melnyk;Jarret Ross", "authorids": "~Apoorva_Nitsure1;~Youssef_Mroueh1;~Mattia_Rigotti1;~Kristjan_Greenewald1;~Brian_Belgodere1;~Mikhail_Yurochkin1;~Jiri_Navratil1;~Igor_Melnyk1;~Jarret_Ross1", "gender": ";;;;M;M;;M;", "homepage": ";;http://www.matrig.net;https://researcher.watson.ibm.com/researcher/view.php?person=ibm-Kristjan.H.Greenewald;;https://moonfolk.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=us-jiri;https://imelnyk.github.io/;", "dblp": ";http://dblp.uni-trier.de/pers/hd/m/Mroueh:Youssef;01/9816;146/0563;https://dblp.uni-trier.de/pid/139/2237;191/6719;00/680-1.html;;192/1669", "google_scholar": ";https://scholar.google.com/citations?hl=en;TmHt7CwAAAAJ;L3zNUG4AAAAJ;;QjBF9sUAAAAJ;H41S5AgAAAAJ;4vDRTWwAAAAJ;", "orcid": ";;0000-0001-6466-2810;;;;0009-0007-5230-7679;;", "linkedin": "apoorvanitsure/;;;;brian-belgodere-050a953/;mikhail-yurochkin-a45659114/;jiri-navratil-62641497/;;", "or_profile": "~Apoorva_Nitsure1;~Youssef_Mroueh1;~Mattia_Rigotti1;~Kristjan_Greenewald1;~Brian_Belgodere1;~Mikhail_Yurochkin1;~Jiri_Navratil1;~Igor_Melnyk1;~Jarret_Ross1", "aff": "International Business Machines;IBM;International Business Machines;MIT-IBM Watson AI Lab, IBM Research;IBM Research;IBM Research;International Business Machines;International Business Machines;International Business Machines", "aff_domain": "ibm.com;us.ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com", "position": "Researcher;Research Staff 
member;Researcher;Research Scientist;Researcher;Researcher;Principal Research Staff Member;Researcher;Researcher", "bibtex": "@inproceedings{\nnitsure2024risk,\ntitle={Risk Aware Benchmarking of Large Language Models},\nauthor={Apoorva Nitsure and Youssef Mroueh and Mattia Rigotti and Kristjan Greenewald and Brian Belgodere and Mikhail Yurochkin and Jiri Navratil and Igor Melnyk and Jarret Ross},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Mv8y13wfDm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1759281, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=481712924328392926&as_sdt=8000005&sciodt=0,19&hl=en", "gs_version_total": 10, "email": "ibm.com;us.ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com", "author_num": 9, "aff_unique_index": "0;0;0;1;1;1;0;0;0", "aff_unique_norm": "International Business Machines Corporation;IBM", "aff_unique_dep": ";AI Lab", "aff_unique_url": "https://www.ibm.com;https://www.ibmwatsonai.org/", "aff_unique_abbr": "IBM;MIT-IBM AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Attribute Based Interpretable Evaluation Metrics for Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34219", "id": "Mw8kNVfdMs", "proceeding": "https://proceedings.mlr.press/v235/kim24t.html", "pdf": "https://openreview.net/pdf?id=Mw8kNVfdMs", "openreview": "https://openreview.net/forum?id=Mw8kNVfdMs", "author_site": "Dongkyun Kim, Mingi Kwon, Youngjung Uh", "tldr": "", "abstract": "When the training dataset comprises a 1:1 proportion of dogs to cats, a generative model that produces 1:1 dogs and cats better resembles the training species distribution than another model with 3:1 dogs and cats. Can we capture this phenomenon using existing metrics? Unfortunately, we cannot, because these metrics do not provide any interpretability beyond \u201cdiversity\u201d. In this context, we propose a new evaluation protocol that measures the divergence of a set of generated images from the training set regarding the distribution of attribute strengths as follows. Single-attribute Divergence (SaD) reveals the attributes that are generated excessively or insufficiently by measuring the divergence of PDFs of individual attributes. Paired-attribute Divergence (PaD) reveals such pairs of attributes by measuring the divergence of joint PDFs of pairs of attributes. For measuring the attribute strengths of an image, we propose Heterogeneous CLIPScore (HCS), which measures the cosine similarity between image and text vectors with heterogeneous initial points. With SaD and PaD, we reveal the following about existing generative models. ProjectedGAN generates implausible attribute relationships such as a baby with a beard even though it has competitive scores of existing metrics. Diffusion models struggle to capture diverse colors in the datasets. Larger sampling timesteps of the latent diffusion model generate more minor objects, including earrings and necklaces. Stable Diffusion v1.5 better captures the attributes than v2.1.
Our metrics lay a foundation for explainable evaluations of generative models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongkyun Kim;Mingi Kwon;Youngjung Uh", "authorids": "~Dongkyun_Kim2;~Mingi_Kwon1;~Youngjung_Uh2", "gender": "M;M;", "homepage": "https://vilab.yonsei.ac.kr;https://github.com/kwonminki;https://vilab.yonsei.ac.kr/member/professor", "dblp": ";327/3276;57/10511", "google_scholar": ";https://scholar.google.co.kr/citations?user=W8vK8BwAAAAJ;BWBGrEEAAAAJ", "orcid": ";;", "linkedin": ";kwonmingi/;youngjung-uh-78b459b5/", "or_profile": "~Dongkyun_Kim2;~Mingi_Kwon1;~Youngjung_Uh2", "aff": "Yonsei University;Yonsei University;Yonsei University", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr", "position": "MS student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nkim2024attribute,\ntitle={Attribute Based Interpretable Evaluation Metrics for Generative Models},\nauthor={Dongkyun Kim and Mingi Kwon and Youngjung Uh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Mw8kNVfdMs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9564816, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8110686798915227576&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Matroid Semi-Bandits in Sublinear Time", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34218", "id": "MwQ53xAIPs", "proceeding": "https://proceedings.mlr.press/v235/tzeng24a.html", "pdf": "https://openreview.net/pdf?id=MwQ53xAIPs", "openreview": "https://openreview.net/forum?id=MwQ53xAIPs", "author_site": "Ruo-Chun Tzeng, Naoto Ohsaka, Kaito Ariu", "tldr": "", "abstract": "We study the matroid semi-bandits problem, where at each round the learner plays a subset of $K$ arms from a feasible set, and the goal is to maximize the expected cumulative linear rewards. Existing algorithms have per-round time complexity at least $\\Omega(K)$, which becomes expensive when $K$ is large. To address this computational issue, we propose FasterCUCB whose sampling rule takes time sublinear in $K$ for common classes of matroids: $\\mathcal{O}(D\\text{ polylog}(K)\\text{ polylog}(T))$ for uniform matroids, partition matroids, and graphical matroids, and $\\mathcal{O}(D\\sqrt{K}\\text{ polylog}(T))$ for transversal matroids. Here, $D$ is the maximum number of elements in any feasible subset of arms, and $T$ is the horizon. Our technique is based on dynamic maintenance of an approximate maximum-weight basis over inner-product weights. Although the introduction of an approximate maximum-weight basis presents a challenge in regret analysis, we can still guarantee an upper bound on regret as tight as CUCB in the sense that it matches the gap-dependent lower bound by Kveton et al. 
(2014a) asymptotically.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruo-Chun Tzeng;Naoto Ohsaka;Kaito Ariu", "authorids": "~Ruo-Chun_Tzeng1;~Naoto_Ohsaka2;~Kaito_Ariu1", "gender": "F;M;M", "homepage": "https://rctzeng.github.io/;https://todo314.github.io/;https://researchmap.jp/ariu?lang=en", "dblp": "242/3884;81/10779;229/7578", "google_scholar": "jntcHQ0AAAAJ;https://scholar.google.co.jp/citations?user=Qgkc9DgAAAAJ;https://scholar.google.co.jp/citations?user=4zXjxhsAAAAJ", "orcid": ";0000-0001-9584-4764;", "linkedin": ";;", "or_profile": "~Ruo-Chun_Tzeng1;~Naoto_Ohsaka2;~Kaito_Ariu1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;CyberAgent, Inc.;CyberAgent, Inc.", "aff_domain": "kth.se;cyberagent.co.jp;cyberagent.co.jp", "position": "PhD student;Researcher;Research Scientist", "bibtex": "@inproceedings{\ntzeng2024matroid,\ntitle={Matroid Semi-Bandits in Sublinear Time},\nauthor={Ruo-Chun Tzeng and Naoto Ohsaka and Kaito Ariu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MwQ53xAIPs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 633016, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WVuGnteW3GoJ:scholar.google.com/&scioq=Matroid+Semi-Bandits+in+Sublinear+Time&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "kth.se;cyberagent.co.jp;cyberagent.co.jp", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "KTH Royal Institute of Technology;CyberAgent", "aff_unique_dep": ";", "aff_unique_url": "https://www.kth.se;https://www.cyberagent.co.jp", "aff_unique_abbr": "KTH;CyberAgent", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Sweden;Japan" }, { "title": "Online Learning in Betting Markets: Profit versus Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34217", "id": "Mz1lcJPymz", "proceeding": "https://proceedings.mlr.press/v235/zhu24k.html", "pdf": "https://openreview.net/pdf?id=Mz1lcJPymz", "openreview": "https://openreview.net/forum?id=Mz1lcJPymz", "author_site": "Haiqing Zhu, Alexander Soen, Yun Kuen Cheung, Lexing Xie", "tldr": "", "abstract": "We examine two types of binary betting markets, whose primary goal is either profit (such as sports gambling) or information gain (such as prediction markets). We articulate the interplay between belief and price-setting to analyse both types of markets, and show that the goals of maximising bookmaker profit and eliciting information are fundamentally incompatible. A key insight is that profit hinges on the deviation between (the distribution of) bettor and true beliefs, and that heavier tails in the bettor belief distribution imply higher profit. Our algorithmic contribution is to introduce online learning methods for price-setting. Traditionally, bookmakers update their prices rather infrequently; we present two algorithms that guide price updates upon seeing each bet, assuming very little about bettor belief distributions. The online pricing algorithm achieves stochastic regret of $\\mathcal{O}(\\sqrt{T})$ against the worst local maximum, or $\\mathcal{O}(\\sqrt{T \\log T})$ with high probability against the global maximum under fair odds.
More broadly, the inherent tradeoff between profit and information-seeking in binary betting may inspire new understandings of large-scale multi-agent behaviour.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haiqing Zhu;Alexander Soen;Yun Kuen Cheung;Lexing Xie", "authorids": "~Haiqing_Zhu1;~Alexander_Soen1;~Yun_Kuen_Cheung1;~Lexing_Xie1", "gender": "M;M;M;F", "homepage": ";https://alexandersoen.github.io/;http://comp-math-econ.academy/;https://users.cecs.anu.edu.au/~xlx/", "dblp": ";245/9661.html;https://dblp.org/pers/hd/c/Cheung:Yun_Kuen;59/4002.html", "google_scholar": ";apRX4awAAAAJ;7rlVH7gAAAAJ;https://scholar.google.com.tw/citations?user=u0xUDSoAAAAJ", "orcid": "0000-0002-5395-046X;;0000-0002-9280-0149;0000-0001-8319-0118", "linkedin": ";;;", "or_profile": "~Haiqing_Zhu1;~Alexander_Soen1;~Yun_Kuen_Cheung1;~Lexing_Xie1", "aff": "Australian National University;Australian National University;Australian National University;University of Chicago", "aff_domain": "anu.edu;anu.edu.au;anu.edu.au;cs.uchicago.edu", "position": "PhD student;PhD student;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nzhu2024online,\ntitle={Online Learning in Betting Markets: Profit versus Prediction},\nauthor={Haiqing Zhu and Alexander Soen and Yun Kuen Cheung and Lexing Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Mz1lcJPymz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1132403, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=626018305332310446&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "anu.edu;anu.edu.au;anu.edu.au;cs.uchicago.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Australian National University;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.anu.edu.au;https://www.uchicago.edu", "aff_unique_abbr": "ANU;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Australia;United States" }, { "title": "Trust the Model Where It Trusts Itself - Model-Based Actor-Critic with Uncertainty-Aware Rollout Adaption", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34216", "id": "N0ntTjTfHb", "proceeding": "https://proceedings.mlr.press/v235/frauenknecht24a.html", "pdf": "https://openreview.net/pdf?id=N0ntTjTfHb", "openreview": "https://openreview.net/forum?id=N0ntTjTfHb", "author_site": "Bernd Frauenknecht, Artur Eisele, Devdutt Subhasish, Friedrich Solowjow, Sebastian Trimpe", "tldr": "", "abstract": "Dyna-style model-based reinforcement learning (MBRL) combines model-free agents with predictive transition models through model-based rollouts. This combination raises a critical question: \u201cWhen to trust your model?\u201d; i.e., which rollout length results in the model providing useful data? Janner et al. (2019) address this question by gradually increasing rollout lengths throughout the training. While theoretically tempting, assuming uniform model accuracy is a fallacy that collapses, at the latest, when the model extrapolates. Instead, we propose asking the question \u201cWhere to trust your model?\u201d. Using inherent model uncertainty to consider local accuracy, we obtain the Model-Based Actor-Critic with Uncertainty-Aware Rollout Adaption (MACURA) algorithm.
We propose an easy-to-tune rollout mechanism and demonstrate substantial improvements in data efficiency and performance compared to state-of-the-art deep MBRL methods on the MuJoCo benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bernd Frauenknecht;Artur Eisele;Devdutt Subhasish;Friedrich Solowjow;Sebastian Trimpe", "authorids": "~Bernd_Frauenknecht1;~Artur_Eisele1;~Devdutt_Subhasish1;~Friedrich_Solowjow1;~Sebastian_Trimpe1", "gender": ";M;M;;M", "homepage": "https://www.dsme.rwth-aachen.de/cms/DSME/das-institut/Team/~mtiqw/Bernd-Frauenknecht/lidx/1/;;;https://www.dsme.rwth-aachen.de/cms/DSME/Das-Institut/Team-CMS-Artikel-/~jptyz/Friedrich-Solowjow/;https://www.dsme.rwth-aachen.de/trimpe", "dblp": ";;;217/1553;15/8135", "google_scholar": ";;;https://scholar.google.de/citations?user=gq_ESzoAAAAJ;https://scholar.google.de/citations?user=9kzHZssAAAAJ", "orcid": ";;;;0000-0002-2785-2487", "linkedin": ";artur-eisele-87b585276;dsubhasish09;;sebastian-trimpe-2472a0a3/", "or_profile": "~Bernd_Frauenknecht1;~Artur_Eisele1;~Devdutt_Subhasish1;~Friedrich_Solowjow1;~Sebastian_Trimpe1", "aff": "Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;RWTH Aachen University", "aff_domain": "rwth-aachen.de;rwth-aachen.de;rwth-aachen.de;rwth-aachen.de;rwth-aachen.de", "position": "PhD student;MS student;MS student;Lecturer;Full Professor", "bibtex": "@inproceedings{\nfrauenknecht2024trust,\ntitle={Trust the Model Where It Trusts Itself - Model-Based Actor-Critic with Uncertainty-Aware Rollout Adaption},\nauthor={Bernd Frauenknecht and Artur Eisele and Devdutt Subhasish and Friedrich Solowjow and Sebastian Trimpe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=N0ntTjTfHb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2885740, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14614830303334543396&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "rwth-aachen.de;rwth-aachen.de;rwth-aachen.de;rwth-aachen.de;rwth-aachen.de", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "RWTH Aachen University", "aff_unique_dep": "", "aff_unique_url": "https://www.rwth-aachen.de", "aff_unique_abbr": "RWTH", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Aachen", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Parameter-Dependent Competitive Analysis for Online Capacitated Coverage Maximization through Boostings and Attenuations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34215", "id": "N1BPyf7wC2", "proceeding": "https://proceedings.mlr.press/v235/xu24a.html", "pdf": "https://openreview.net/pdf?id=N1BPyf7wC2", "openreview": "https://openreview.net/forum?id=N1BPyf7wC2", "tldr": "", "abstract": "In this paper, we consider a model called *Online Capacitated Coverage Maximization*, characterized by two features: (1) the dynamic arrival of online agents drawn independently and identically from a known distribution, and (2) each offline agent is associated with a specific coverage valuation over the groundset of online agents. 
Additionally, both offline and online agents are assigned integer capacities, reflecting finite budgets and operational constraints. We introduce and analyze two matching policies. The first, a non-adaptive policy, utilizes offline statistics derived from solving a benchmark linear program. The second is an enhanced version equipped with real-time boostings and attenuations. We conduct a comprehensive competitive analysis and characterize the competitive ratio for both policies as functions of two crucial parameters: a lower bound on the matching capacity among offline agents and an upper bound on the number of online agents covering any specific feature for offline agents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pan Xu", "authorids": "~Pan_Xu2", "gender": "Not Specified", "homepage": "https://sites.google.com/site/panxupi/", "dblp": "11/9718-1.html", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Pan_Xu2", "aff": "New Jersey Institute of Technology", "aff_domain": "cs.njit.edu", "position": "Assistant Professor", "bibtex": "@inproceedings{\nxu2024parameterdependent,\ntitle={Parameter-Dependent Competitive Analysis for Online Capacitated Coverage Maximization through Boostings and Attenuations},\nauthor={Pan Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=N1BPyf7wC2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 511063, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RAAXBp3_gR4J:scholar.google.com/&scioq=Parameter-Dependent+Competitive+Analysis+for+Online+Capacitated+Coverage+Maximization+through+Boostings+and+Attenuations&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "cs.njit.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "New Jersey Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.njit.edu", "aff_unique_abbr": "NJIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Conditional Normalizing Flows for Active Learning of Coarse-Grained Molecular Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34214", "id": "N3ZrpSCJcJ", "proceeding": "https://proceedings.mlr.press/v235/schopmans24a.html", "pdf": "https://openreview.net/pdf?id=N3ZrpSCJcJ", "openreview": "https://openreview.net/forum?id=N3ZrpSCJcJ", "author_site": "Henrik Schopmans, Pascal Friederich", "tldr": "", "abstract": "Efficient sampling of the Boltzmann distribution of molecular systems is a long-standing challenge. Recently, instead of generating long molecular dynamics simulations, generative machine learning methods such as normalizing flows have been used to learn the Boltzmann distribution directly, without samples. However, this approach is susceptible to mode collapse and thus often does not explore the full configurational space. In this work, we address this challenge by separating the problem into two levels, the fine-grained and coarse-grained degrees of freedom. A normalizing flow conditioned on the coarse-grained space yields a probabilistic connection between the two levels. To explore the configurational space, we employ coarse-grained simulations with active learning which allows us to update the flow and make all-atom potential energy evaluations only when necessary. 
Using alanine dipeptide as an example, we show that our methods obtain a speedup over molecular dynamics simulations of approximately $15.9$ to $216.2$, compared to the speedup of $4.5$ achieved by the current state-of-the-art machine learning approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Henrik Schopmans;Pascal Friederich", "authorids": "~Henrik_Schopmans1;~Pascal_Friederich1", "gender": "M;M", "homepage": ";https://aimat.science", "dblp": "327/3756;182/0165", "google_scholar": "0asJercAAAAJ;3B5h6u0AAAAJ", "orcid": "0000-0002-6414-3591;0000-0003-4465-1465", "linkedin": ";pascal-friederich-6088b9117/", "or_profile": "~Henrik_Schopmans1;~Pascal_Friederich1", "aff": "Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie", "aff_domain": "kit.edu;kit.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nschopmans2024conditional,\ntitle={Conditional Normalizing Flows for Active Learning of Coarse-Grained Molecular Representations},\nauthor={Henrik Schopmans and Pascal Friederich},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=N3ZrpSCJcJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9838141, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14158631398965455704&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "kit.edu;kit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie", "aff_unique_dep": "", "aff_unique_url": "https://www.kit.edu", "aff_unique_abbr": "KIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Enabling Uncertainty Estimation in Iterative Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34213", "id": "N6A6t6xlKm", "proceeding": "https://proceedings.mlr.press/v235/durasov24a.html", "pdf": "https://openreview.net/pdf?id=N6A6t6xlKm", "openreview": "https://openreview.net/forum?id=N6A6t6xlKm", "author_site": "Nikita Durasov, Doruk Oner, Jonathan Donier, Hieu Le, Pascal Fua", "tldr": "", "abstract": "Turning pass-through network architectures into iterative ones, which use their own output as input, is a well-known approach for boosting performance. In this paper, we argue that such architectures offer an additional benefit: The convergence rate of their successive outputs is highly correlated with the accuracy of the value to which they converge. Thus, we can use the convergence rate as a useful proxy for uncertainty. This results in an approach to uncertainty estimation that provides state-of-the-art estimates at a much lower computational cost than techniques like Ensembles, and without requiring any modifications to the original iterative model. 
We demonstrate its practical value by embedding it in two application domains: road detection in aerial images and the estimation of aerodynamic properties of 2D and 3D shapes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikita Durasov;Doruk Oner;Jonathan Donier;Hieu Le;Pascal Fua", "authorids": "~Nikita_Durasov1;~Doruk_Oner1;~Jonathan_Donier1;~Hieu_Le2;~Pascal_Fua1", "gender": "M;M;;M;M", "homepage": "https://www.norange.io/about/;;;https://hieulem.github.io/;https://people.epfl.ch/pascal.fua/bio?lang=en", "dblp": "230/4660;217/1719;236/5194;130/6199.html;f/PFua", "google_scholar": "KMMvTfcAAAAJ;https://scholar.google.com.tr/citations?user=ESA2CsAAAAAJ;https://scholar.google.fr/citations?user=3hoYiLAAAAAJ;Bj9g-EEAAAAJ;https://scholar.google.com/citations?view_op=list_works", "orcid": ";0000-0002-9403-4628;;;", "linkedin": ";doruk-oner/;;;pascal-fua-epfl/", "or_profile": "~Nikita_Durasov1;~Doruk_Oner1;~Jonathan_Donier1;~Hieu_Le2;~Pascal_Fua1", "aff": "NVIDIA;EPFL - EPF Lausanne;Neural Concept;EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": "nvidia.com;epfl.ch;neuralconcept.com;epfl.ch;epfl.ch", "position": "Intern;Postdoc;Principal Researcher;Postdoc;Full Professor", "bibtex": "@inproceedings{\ndurasov2024enabling,\ntitle={Enabling Uncertainty Estimation in Iterative Neural Networks},\nauthor={Nikita Durasov and Doruk Oner and Jonathan Donier and Hieu Le and Pascal Fua},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=N6A6t6xlKm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8884148, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15445813933822159439&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "nvidia.com;epfl.ch;neuralconcept.com;epfl.ch;epfl.ch", "author_num": 5, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "NVIDIA;EPFL;Neural Concept", "aff_unique_dep": "NVIDIA Corporation;;", "aff_unique_url": "https://www.nvidia.com;https://www.epfl.ch;", "aff_unique_abbr": "NVIDIA;EPFL;", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;Switzerland;" }, { "title": "Residual Quantization with Implicit Neural Codebooks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34212", "id": "NBAc36V00H", "proceeding": "https://proceedings.mlr.press/v235/huijben24a.html", "pdf": "https://openreview.net/pdf?id=NBAc36V00H", "openreview": "https://openreview.net/forum?id=NBAc36V00H", "author_site": "Iris Huijben, Matthijs Douze, Matthew Muckley, Ruud J. G. van Sloun, Jakob Verbeek", "tldr": "", "abstract": "Vector quantization is a fundamental operation for data compression and vector search. To obtain high accuracy, multi-codebook methods represent each vector using codewords across several codebooks. Residual quantization (RQ) is one such method, which iteratively quantizes the error of the previous step. While the error distribution is dependent on previously-selected codewords, this dependency is not accounted for in conventional RQ as it uses a fixed codebook per quantization step. 
In this paper, we propose QINCo, a neural RQ variant that constructs specialized codebooks per step that depend on the approximation of the vector from previous steps. Experiments show that QINCo outperforms state-of-the-art methods by a large margin on several datasets and code sizes. For example, QINCo achieves better nearest-neighbor search accuracy using 12-byte codes than the state-of-the-art UNQ using 16 bytes on the BigANN1M and Deep1M datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Iris A.M. Huijben;Matthijs Douze;Matthew J. Muckley;Ruud Van Sloun;Jakob Verbeek", "authorids": "~Iris_A.M._Huijben1;~Matthijs_Douze1;~Matthew_J._Muckley1;~Ruud_Van_Sloun1;~Jakob_Verbeek1", "gender": ";M;F;Not Specified;F", "homepage": "https://research.facebook.com/people/douze-matthijs/;https://mmuckley.github.io/;https://www.tue.nl/en/research/researchers/ruud-van-sloun;http://lear.inrialpes.fr/~verbeek;", "dblp": "64/5801;158/8226;162/9715.html;v/JakobJVerbeek;247/0968", "google_scholar": "yZmnFbkAAAAJ;Iz9v6dcAAAAJ;gQQJgocAAAAJ;oZGA-rAAAAAJ;https://scholar.google.nl/citations?user=1ReBr6sAAAAJ", "orcid": ";0000-0002-6525-8817;;0000-0003-1419-1816;0000-0002-2629-3898", "linkedin": ";matthew-muckley-33a9b558/;;jakob-verbeek-3b11aa14a/;", "or_profile": "~Matthijs_Douze1;~Matthew_J._Muckley1;~Ruud_Van_Sloun1;~Jakob_Verbeek1;~Iris_Anne_Marie_Huijben1", "aff": "Meta;Meta;Eindhoven University of Technology;Meta;Eindhoven University of Technology", "aff_domain": "meta.com;fb.com;tue.nl;meta.com;tue.nl", "position": "researcher;Research Engineer;Associate Professor;Research Scientist;PhD student", "bibtex": "@inproceedings{\nhuijben2024residual,\ntitle={Residual Quantization with Implicit Neural Codebooks},\nauthor={Iris A.M. Huijben and Matthijs Douze and Matthew J. Muckley and Ruud Van Sloun and Jakob Verbeek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NBAc36V00H}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 770731, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5585472531270470986&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "meta.com;fb.com;tue.nl;meta.com;tue.nl", "author_num": 5, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "Meta;Eindhoven University of Technology", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.tue.nl", "aff_unique_abbr": "Meta;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "United States;Netherlands" }, { "title": "Unraveling the Impact of Heterophilic Structures on Graph Positive-Unlabeled Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34211", "id": "NCT3w7VKjo", "proceeding": "https://proceedings.mlr.press/v235/wu24ad.html", "pdf": "https://openreview.net/pdf?id=NCT3w7VKjo", "openreview": "https://openreview.net/forum?id=NCT3w7VKjo", "author_site": "Yuhao Wu, Jiangchao Yao, Bo Han, Lina Yao, Tongliang Liu", "tldr": "", "abstract": "While Positive-Unlabeled (PU) learning is vital in many real-world scenarios, its application to graph data still remains under-explored. 
We unveil that a critical challenge for PU learning on graphs lies in edge heterophily, which directly violates the $\textit{irreducibility assumption}$ for $\textit{Class-Prior Estimation}$ (the class prior is essential for building PU learning algorithms) and degrades the latent label inference on unlabeled nodes during classifier training. In response to this challenge, we introduce a new method, named $\textit{$\underline{G}$raph $\underline{P}$U Learning with $\underline{L}$abel Propagation Loss}$ (GPL). Specifically, GPL considers learning from PU nodes along with an intermediate heterophily reduction, which helps mitigate the negative impact of the heterophilic structure. We formulate this procedure as a bilevel optimization that reduces heterophily in the inner loop and efficiently learns a classifier in the outer loop. Extensive experiments across a variety of datasets have shown that GPL significantly outperforms baseline methods, confirming its effectiveness and superiority.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhao Wu;Jiangchao Yao;Bo Han;Lina Yao;Tongliang Liu", "authorids": "~Yuhao_Wu2;~Jiangchao_Yao1;~Bo_Han1;~Lina_Yao2;~Tongliang_Liu1", "gender": "M;M;F;M;M", "homepage": "https://white1818.github.io/;https://sunarker.github.io/;https://www.linayao.com/;https://tongliang-liu.github.io/;https://bhanml.github.io/", "dblp": ";166/5900;56/6651-1;150/6667;241/0472-3", "google_scholar": ";w8oDh9QAAAAJ;https://scholar.google.com.au/citations?user=EU3snBgAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ", "orcid": ";;;;", "linkedin": ";;linayao/;;", "or_profile": "~Yuhao_Wu2;~Jiangchao_Yao1;~Lina_Yao2;~Tongliang_Liu1;~bo_han2", "aff": "University of Sydney;Shanghai Artificial Intelligence Laboratory;CSIRO's Data61;Mohamed bin Zayed University of Artificial Intelligence;MBZUAI", "aff_domain": "usyd.edu.au;pjlab.org.cn;data61.csiro.au;mbzuai.ac.ae;mbzuai.ac.ae", "position": "PhD student;Researcher;Principal Researcher;Affiliated Associate Professor;Researcher", "bibtex": "@inproceedings{\nwu2024unraveling,\ntitle={Unraveling the Impact of Heterophilic Structures on Graph Positive-Unlabeled Learning},\nauthor={Yuhao Wu and Jiangchao Yao and Bo Han and Lina Yao and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NCT3w7VKjo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 687001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9720624032815038142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "usyd.edu.au;pjlab.org.cn;data61.csiro.au;mbzuai.ac.ae;mbzuai.ac.ae", "author_num": 5, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "University of Sydney;Shanghai Artificial Intelligence Laboratory;CSIRO;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;Data61;", "aff_unique_url": "https://www.sydney.edu.au;http://www.shailab.org/;https://www.csiro.au;https://mbzuai.ac.ae", "aff_unique_abbr": "USYD;Shanghai AI Lab;CSIRO;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;2", "aff_country_unique": "Australia;China;United Arab Emirates" }, { "title": "Hidden Traveling Waves bind Working Memory Variables in Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34210", 
"id": "NCjlFw1Ab0", "proceeding": "https://proceedings.mlr.press/v235/karuvally24a.html", "pdf": "https://openreview.net/pdf?id=NCjlFw1Ab0", "openreview": "https://openreview.net/forum?id=NCjlFw1Ab0", "author_site": "Arjun Karuvally, Terrence Sejnowski, Hava Siegelmann", "tldr": "", "abstract": "Traveling waves are a fundamental phenomenon in the brain, playing a crucial role in short-term information storage. In this study, we leverage the concept of traveling wave dynamics within a neural lattice to formulate a theoretical model of neural working memory in Recurrent Neural Networks (RNNs), study its properties, and its real world implications in AI. The proposed model diverges from traditional approaches, which assume information storage in static, register-like locations updated by interference. Instead, the model stores data as waves that is updated by the wave's boundary conditions. We rigorously examine the model's capabilities in representing and learning state histories, which are vital for learning history-dependent dynamical systems. The findings reveal that the model reliably stores external information and enhances the learning process by addressing the diminishing gradient problem of RNNs. To understand the model's real-world applicability, we explore two cases: linear boundary condition and non-linear, self-attention-driven boundary condition. The experiments reveal that the linear scenario is effectively *learned* by RNNs through backpropagation when modeling history-dependent dynamical systems. Conversely, the non-linear scenario parallels an attention-only transformer. Collectively, our findings suggest the broader relevance of traveling waves in AI and its potential in advancing neural network architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arjun Karuvally;Terrence Sejnowski;Hava T Siegelmann", "authorids": "~Arjun_Karuvally1;~Terrence_Sejnowski2;~Hava_T_Siegelmann1", "gender": "M;M;F", "homepage": "https://arjunkaruvally.github.io/;https://cnl.salk.edu/;https://www.cics.umass.edu/faculty/directory/siegelmann_hava", "dblp": "215/5092;;s/HavaTSiegelmann.html", "google_scholar": "YfurSO4AAAAJ;;https://scholar.google.co.il/citations?user=A2fiOI0AAAAJ", "orcid": "0000-0002-8298-1409;;0000-0003-4938-8723", "linkedin": "arjunksuresh;;hava-siegelmann-4b272a/", "or_profile": "~Arjun_Karuvally1;~Terrence_Sejnowski2;~Hava_T_Siegelmann1", "aff": "University of Massachusetts, Amherst;Salk Institute;University of Massachusetts at Amherst", "aff_domain": "umass.edu;salk.edu;umass.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nkaruvally2024hidden,\ntitle={Hidden Traveling Waves bind Working Memory Variables in Recurrent Neural Networks},\nauthor={Arjun Karuvally and Terrence Sejnowski and Hava T Siegelmann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NCjlFw1Ab0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7743418, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3402708467380210558&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "umass.edu;salk.edu;umass.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Massachusetts Amherst;Salk Institute for Biological Studies", "aff_unique_dep": ";", "aff_unique_url": "https://www.umass.edu;https://www.salk.edu", 
"aff_unique_abbr": "UMass Amherst;Salk Institute", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "LoRA+: Efficient Low Rank Adaptation of Large Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34209", "id": "NEv8YqBROO", "proceeding": "https://proceedings.mlr.press/v235/hayou24a.html", "pdf": "https://openreview.net/pdf?id=NEv8YqBROO", "openreview": "https://openreview.net/forum?id=NEv8YqBROO", "author_site": "Soufiane Hayou, Nikhil Ghosh, Bin Yu", "tldr": "", "abstract": "In this paper, we show that Low Rank Adaptation (LoRA) as originally introduced in (Hu et al., 2021) leads to suboptimal finetuning of models with large width. This is due to the fact that adapter matrices A and B in LoRA are updated with the same learning rate in ADAM. Using scaling arguments for large width networks, we demonstrate that the same learning rate does not allow efficient feature learning. We then show that this suboptimality of LoRA can be corrected simply by setting different learning rates for the LoRA adapter matrices A and B with a well-chosen fixed ratio. We call this proposed algorithm LoRA+. In our extensive experiments, LoRA+ improves finetuning speed (up to \u223c 2X SpeedUp) and performance (1% \u2212 2% improvements), at the same computational cost as LoRA. The code is available at https://github.com/nikhil-ghosh-berkeley/loraplus", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Soufiane Hayou;Nikhil Ghosh;Bin Yu", "authorids": "~Soufiane_Hayou1;~Nikhil_Ghosh1;~Bin_Yu5", "gender": "M;M;M", "homepage": "https://www.soufianehayou.com/;;https://binyu.stat.berkeley.edu", "dblp": "220/5617;251/8779;27/116", "google_scholar": "https://scholar.google.com/citations?hl=en;0Fv4bikAAAAJ;https://scholar.google.com.hk/citations?user=z1iJa3UAAAAJ", "orcid": ";;0000-0003-3097-1433", "linkedin": ";nikhil-ghosh-03389199/;bin-yu-b665063/", "or_profile": "~Soufiane_Hayou1;~Nikhil_Ghosh1;~Bin_Yu5", "aff": "National University of Singapore;University of California, Berkeley;University of California, Berkeley", "aff_domain": "nus.edu.sg;berkeley.edu;berkeley.edu", "position": "Assistant Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nhayou2024lora,\ntitle={Lo{RA}+: Efficient Low Rank Adaptation of Large Models},\nauthor={Soufiane Hayou and Nikhil Ghosh and Bin Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NEv8YqBROO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1281181, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16075723915769445688&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "nus.edu.sg;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "National University of Singapore;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.berkeley.edu", "aff_unique_abbr": "NUS;UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Singapore;United States" }, { "title": "Optimal Differentially Private Model Training with Public Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34208", 
"id": "NFEJQn7vX0", "proceeding": "https://proceedings.mlr.press/v235/lowy24a.html", "pdf": "https://openreview.net/pdf?id=NFEJQn7vX0", "openreview": "https://openreview.net/forum?id=NFEJQn7vX0", "author_site": "Andrew Lowy, Zeman Li, Tianjian Huang, Meisam Razaviyayn", "tldr": "", "abstract": "Differential privacy (DP) ensures that training a machine learning model does not leak private data. In practice, we may have access to auxiliary public data that is free of privacy concerns. In this work, we assume access to a given amount of public data and settle the following fundamental open questions: 1. What is the optimal (worst-case) error of a DP model trained over a private data set while having access to side public data? 2. How can we harness public data to improve DP model training in practice? We consider these questions in both the local and central models of pure and approximate DP. To answer the first question, we prove tight (up to log factors) lower and upper bounds that characterize the optimal error rates of three fundamental problems: mean estimation, empirical risk minimization, and stochastic convex optimization. We show that the optimal error rates can be attained (up to log factors) by either discarding private data and training a public model, or treating public data like it is private and using an optimal DP algorithm. To address the second question, we develop novel algorithms that are \"even more optimal\" (i.e. better constants) than the asymptotically optimal approaches described above. For local DP mean estimation, our algorithm is optimal including constants. Empirically, our algorithms show benefits over the state-of-the-art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Lowy;Zeman Li;Tianjian Huang;Meisam Razaviyayn", "authorids": "~Andrew_Lowy1;~Zeman_Li1;~Tianjian_Huang2;~Meisam_Razaviyayn1", "gender": ";;Unspecified;M", "homepage": "https://sites.google.com/view/andrewlowy;;https://tianjian-huang.net/;https://sites.usc.edu/razaviyayn/", "dblp": "285/5314;;245/2447;43/8577", "google_scholar": "https://scholar.google.com/citations?hl=en;;TNdMwp0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;tianjian-huang-7801a0119;", "or_profile": "~Andrew_Lowy1;~Zeman_Li1;~Tianjian_Huang2;~Meisam_Razaviyayn1", "aff": "University of Wisconsin - Madison;;University of Southern California;Google", "aff_domain": "wisc.edu;;usc.edu;google.com", "position": "Postdoc;;PhD student;Researcher", "bibtex": "@inproceedings{\nlowy2024optimal,\ntitle={Optimal Differentially Private Model Training with Public Data},\nauthor={Andrew Lowy and Zeman Li and Tianjian Huang and Meisam Razaviyayn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NFEJQn7vX0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1352463, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17290812636097684222&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "wisc.edu;;usc.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Wisconsin-Madison;University of Southern California;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.wisc.edu;https://www.usc.edu;https://www.google.com", "aff_unique_abbr": "UW-Madison;USC;Google", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Madison;Los 
Angeles;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Random Scaling and Momentum for Non-smooth Non-convex Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34207", "id": "NKirMgDsut", "proceeding": "https://proceedings.mlr.press/v235/zhang24k.html", "pdf": "https://openreview.net/pdf?id=NKirMgDsut", "openreview": "https://openreview.net/forum?id=NKirMgDsut", "author_site": "Qinzi Zhang, Ashok Cutkosky", "tldr": "", "abstract": "Training neural networks requires optimizing a loss function that may be highly irregular, and in particular neither convex nor smooth. Popular training algorithms are based on stochastic gradient descent with momentum (SGDM), for which classical analysis applies only if the loss is either convex or smooth. We show that a very small modification to SGDM closes this gap: simply scale the update at each time point by an exponentially distributed random scalar. The resulting algorithm achieves optimal convergence guarantees. Intriguingly, this result is not derived by a specific analysis of SGDM: instead, it falls naturally out of a more general framework for converting online convex optimization algorithms to non-convex optimization algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qinzi Zhang;Ashok Cutkosky", "authorids": "~Qinzi_Zhang1;~Ashok_Cutkosky1", "gender": "M;", "homepage": ";http://www.cs.stanford.edu/~ashokc", "dblp": "275/8559;191/6725", "google_scholar": "QYP73uQAAAAJ;h4AbGp0AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Qinzi_Zhang1;~Ashok_Cutkosky1", "aff": "Boston University, Boston University;Boston University", "aff_domain": "bu.edu;bu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024random,\ntitle={Random Scaling and Momentum for Non-smooth Non-convex Optimization},\nauthor={Qinzi Zhang and Ashok Cutkosky},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NKirMgDsut}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 831259, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11573267175641240484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "bu.edu;bu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "0", "aff_campus_unique": "Boston;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Exploring Training on Heterogeneous Data with Mixture of Low-rank Adapters", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34206", "id": "NQ6KDfSDFK", "proceeding": "https://proceedings.mlr.press/v235/zhou24w.html", "pdf": "https://openreview.net/pdf?id=NQ6KDfSDFK", "openreview": "https://openreview.net/forum?id=NQ6KDfSDFK", "author_site": "Yuhang Zhou, Zhao Zihua, Siyuan Du, Haolin li, Jiangchao Yao, Ya Zhang, Yanfeng Wang", "tldr": "", "abstract": "Training a unified model to take multiple targets into account is a trend towards artificial general intelligence. However, how to efficiently mitigate the training conflicts among heterogeneous data collected from different domains or tasks remains under-explored. 
In this study, we explore leveraging Mixture of Low-rank Adapters (MoLA) to mitigate conflicts in heterogeneous data training, which requires jointly training the multiple low-rank adapters and their shared backbone. Specifically, we introduce two variants of MoLA, namely, MoLA-Grad and MoLA-Router, to respectively handle the target-aware and target-agnostic scenarios during inference. The former uses task identifiers to assign personalized low-rank adapters to each task, disentangling task-specific knowledge towards their adapters, thereby mitigating heterogeneity conflicts. The latter uses a novel Task-wise Decorrelation (TwD) loss to guide the router to learn oriented weight combinations of adapters for homogeneous tasks, achieving similar effects. We conduct comprehensive experiments to verify the superiority of MoLA over previous state-of-the-art methods and present an in-depth analysis of its working mechanism. Source code is available at: https://github.com/MediaBrain-SJTU/MoLA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhang Zhou;Zihua Zhao;Siyuan Du;Haolin li;Jiangchao Yao;Ya Zhang;Yanfeng Wang", "authorids": "~Yuhang_Zhou4;~Zihua_Zhao3;~Siyuan_Du1;~Haolin_li2;~Jiangchao_Yao1;~Ya_Zhang1;~Yanfeng_Wang1", "gender": ";M;M;M;M;F;M", "homepage": ";https://github.com/ZihuaZhao;https://cs.fudan.edu.cn/;https://cs.fudan.edu.cn/main.htm;https://sunarker.github.io/;https://annzhanglion.github.io/;https://cmic.sjtu.edu.cn/wangyanfeng/", "dblp": ";;207/9464;;166/5900;85/3714-2;55/5407-1.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;1jVWGqIAAAAJ;;;w8oDh9QAAAAJ;pbjw9sMAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;;0000-0002-5390-9053;0000-0002-3196-2347", "linkedin": ";;;;;;", "or_profile": "~Yuhang_Zhou4;~Zihua_Zhao3;~Siyuan_Du1;~Haolin_li2;~Jiangchao_Yao1;~Ya_Zhang1;~Yanfeng_Wang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Fudan University;Fudan University;Shanghai Artificial Intelligence Laboratory;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;fudan.edu.cn;fudan.edu.cn;pjlab.org.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;PhD student;PhD student;PhD student;Researcher;Professor;Full Professor", "bibtex": "@inproceedings{\nzhou2024exploring,\ntitle={Exploring Training on Heterogeneous Data with Mixture of Low-rank Adapters},\nauthor={Yuhang Zhou and Zihua Zhao and Siyuan Du and Haolin li and Jiangchao Yao and Ya Zhang and Yanfeng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NQ6KDfSDFK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5746465, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4702810503133426975&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "sjtu.edu.cn;sjtu.edu.cn;fudan.edu.cn;fudan.edu.cn;pjlab.org.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;1;2;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Fudan University;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.fudan.edu.cn;http://www.shailab.org/", "aff_unique_abbr": "SJTU;Fudan;Shanghai AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { 
"title": "An Information-Theoretic Analysis of In-Context Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34205", "id": "NQn2tYLv5I", "proceeding": "https://proceedings.mlr.press/v235/jeon24a.html", "pdf": "https://openreview.net/pdf?id=NQn2tYLv5I", "openreview": "https://openreview.net/forum?id=NQn2tYLv5I", "author_site": "Hong Jun Jeon, Jason Lee, Qi Lei, Benjamin Van Roy", "tldr": "", "abstract": "Previous theoretical results pertaining to meta-learning on sequences build on contrived and convoluted mixing time assumptions. We introduce new information-theoretic tools that lead to a concise yet general decomposition of error for a Bayes optimal predictor into two components: meta-learning error and intra-task error. These tools unify analyses across many meta-learning challenges. To illustrate, we apply them to establish new results about in-context learning with transformers and corroborate existing results a simple linear setting. Our theoretical results characterize how error decays in both the number of training sequences and sequence lengths. Our results are very general; for example, they avoid contrived mixing time assumptions made by all prior results that establish decay of error with sequence length.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hong Jun Jeon;Jason D. Lee;Qi Lei;Benjamin Van Roy", "authorids": "~Hong_Jun_Jeon1;~Jason_D._Lee1;~Qi_Lei1;~Benjamin_Van_Roy1", "gender": "M;M;F;M", "homepage": ";https://jasondlee88.github.io/;https://cecilialeiqi.github.io/;https://web.stanford.edu/~bvr/", "dblp": ";88/3262;;41/4314", "google_scholar": "HEMmAd0AAAAJ;GR_DsT0AAAAJ;kGOgaowAAAAJ;05sMX8MAAAAJ", "orcid": ";;;", "linkedin": "hong-jun-jeon-850199146/;;;", "or_profile": "~Hong_Jun_Jeon1;~Jason_D._Lee1;~Qi_Lei1;~Benjamin_Van_Roy1", "aff": "Stanford University;Princeton University;New York University;Stanford University", "aff_domain": "stanford.edu;princeton.edu;nyu.edu;stanford.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\njeon2024an,\ntitle={An Information-Theoretic Analysis of In-Context Learning},\nauthor={Hong Jun Jeon and Jason D. 
Lee and Qi Lei and Benjamin Van Roy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NQn2tYLv5I}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 552960, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4130102671033613991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "stanford.edu;princeton.edu;nyu.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Stanford University;Princeton University;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.princeton.edu;https://www.nyu.edu", "aff_unique_abbr": "Stanford;Princeton;NYU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Bayesian Approach to Online Planning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34204", "id": "NS8z5FinYl", "proceeding": "https://proceedings.mlr.press/v235/greshler24a.html", "pdf": "https://openreview.net/pdf?id=NS8z5FinYl", "openreview": "https://openreview.net/forum?id=NS8z5FinYl", "author_site": "Nir Greshler, David Ben Eli, Carmel Rabinovitz, Gabi Guetta, Liran Gispan, Guy Zohar, Aviv Tamar", "tldr": "", "abstract": "The combination of Monte Carlo tree search and neural networks has revolutionized online planning. As neural network approximations are often imperfect, we ask whether uncertainty estimates about the network outputs could be used to improve planning. We develop a Bayesian planning approach that facilitates such uncertainty quantification, inspired by classical ideas from the meta-reasoning literature. We propose a Thompson sampling based algorithm for searching the tree of possible actions, for which we prove the first (to our knowledge) finite time Bayesian regret bound, and propose an efficient implementation for a restricted family of posterior distributions. In addition we propose a variant of the Bayes-UCB method applied to trees. Empirically, we demonstrate that on the ProcGen Maze and Leaper environments, when the uncertainty estimates are accurate but the neural network output is inaccurate, our Bayesian approach searches the tree much more effectively. 
In addition, we investigate whether popular uncertainty estimation methods are accurate enough to yield significant gains in planning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nir Greshler;David Ben Eli;Carmel Rabinovitz;Gabi Guetta;Liran Gispan;Guy Zohar;Aviv Tamar", "authorids": "~Nir_Greshler1;dudi.beneli@gm.com;~Carmel_Rabinovitz1;gabi.guetta@gm.com;~Liran_Gispan1;guy.zohar@gm.com;~Aviv_Tamar2", "gender": "M;;M;;;;M", "homepage": ";;;;;;https://avivt.github.io/avivt/", "dblp": ";;226/2056.html;;;;49/10622", "google_scholar": "https://scholar.google.co.il/citations?user=N662HPsAAAAJ;;tLk2Hq4AAAAJ;;;;https://scholar.google.co.il/citations?user=kppa2vgAAAAJ", "orcid": ";;;;;;", "linkedin": "nirgreshler;;carmel-rabinovitz/;;;;", "or_profile": "~Nir_Greshler1;dudi.beneli@gm.com;~Carmel_Rabinovitz1;gabi.guetta@gm.com;~Liran_Gispan1;guy.zohar@gm.com;~Aviv_Tamar2", "aff": ";;;;;;Technion, Technion", "aff_domain": ";;;;;;technion.ac.il", "position": ";;;;;;Assistant Professor", "bibtex": "@inproceedings{\ngreshler2024a,\ntitle={A Bayesian Approach to Online Planning},\nauthor={Nir Greshler and David Ben Eli and Carmel Rabinovitz and Gabi Guetta and Liran Gispan and Guy Zohar and Aviv Tamar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NS8z5FinYl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1948341, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7L3ZZS3_AA0J:scholar.google.com/&scioq=A+Bayesian+Approach+to+Online+Planning&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": ";;;;;;technion.ac.il", "author_num": 7, "aff_unique_index": "0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_country_unique_index": "0", "aff_country_unique": "Israel" }, { "title": "Diffusion Language Models Are Versatile Protein Learners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34203", "id": "NUAbSFqyqb", "proceeding": "https://proceedings.mlr.press/v235/wang24ct.html", "pdf": "https://openreview.net/pdf?id=NUAbSFqyqb", "openreview": "https://openreview.net/forum?id=NUAbSFqyqb", "author_site": "Xinyou Wang, Zaixiang Zheng, Fei YE, Dongyu Xue, Shujian Huang, Quanquan Gu", "tldr": "", "abstract": "This paper introduces the diffusion protein language model (DPLM), a versatile protein language model that demonstrates strong generative and predictive capabilities for protein sequences. We first pre-train scalable DPLMs from evolutionary-scale protein sequences within a generative self-supervised discrete diffusion probabilistic framework, which generalizes language modeling for proteins in a principled way. After pre-training, DPLM exhibits the ability to generate structurally plausible, novel and diverse protein sequences for unconditional generation. We further demonstrate that the proposed diffusion generative pre-training makes DPLM possess a better understanding of proteins, making it a superior representation learner, which can be fine-tuned for various predictive tasks, comparing favorably to ESM2. 
Moreover, DPLM can be tailored for various needs, which showcases its prowess of conditional generation in several ways: (1) conditioning on partial peptide sequences, e.g., generating scaffolds for functional motifs with a high success rate; (2) incorporating other modalities as conditioners, e.g., structure-conditioned generation for inverse folding; and (3) steering sequence generation towards desired properties, e.g., satisfying specified secondary structures, through plug-and-play classifier guidance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyou Wang;Zaixiang Zheng;Fei YE;Dongyu Xue;Shujian Huang;Quanquan Gu", "authorids": "~Xinyou_Wang1;~Zaixiang_Zheng2;~Fei_YE4;~Dongyu_Xue1;~Shujian_Huang1;~Quanquan_Gu1", "gender": "M;M;F;M;M;M", "homepage": "https://nlp.nju.edu.cn;https://zhengzx-nlp.github.io/;;;http://nlp.nju.edu.cn/huangsj/;http://web.cs.ucla.edu/~qgu/", "dblp": "333/1334;205/2769;;;57/8451;50/4597", "google_scholar": ";JPSrehMAAAAJ;4MB4orsAAAAJ;;HF3-E9kAAAAJ;GU9HgNAAAAAJ", "orcid": ";;;0000-0003-1896-4222;;", "linkedin": ";;;;;", "or_profile": "~Xinyou_Wang1;~Zaixiang_Zheng2;~Fei_YE4;~Dongyu_Xue1;~Shujian_Huang1;~Quanquan_Gu1", "aff": "Nanjing University;ByteDance Research;;ByteDance AI Lab;Nanjing University;University of California, Los Angeles", "aff_domain": "nju.edu.cn;bytedance.com;;bytedance.com;nju.edu.cn;cs.ucla.edu", "position": "PhD student;Research Scientist;;Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024diffusion,\ntitle={Diffusion Language Models Are Versatile Protein Learners},\nauthor={Xinyou Wang and Zaixiang Zheng and Fei YE and Dongyu Xue and Shujian Huang and Quanquan Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NUAbSFqyqb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3547695, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13816037030111725821&as_sdt=20005&sciodt=0,9&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;bytedance.com;;bytedance.com;nju.edu.cn;cs.ucla.edu", "author_num": 6, "aff_unique_index": "0;1;1;0;2", "aff_unique_norm": "Nanjing University;ByteDance;University of California, Los Angeles", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.nju.edu.cn;https://www.bytedance.com;https://www.ucla.edu", "aff_unique_abbr": "Nanjing U;ByteDance;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "LoCoCo: Dropping In Convolutions for Long Context Compression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34202", "id": "NUlyqMyhO9", "proceeding": "https://proceedings.mlr.press/v235/cai24g.html", "pdf": "https://openreview.net/pdf?id=NUlyqMyhO9", "openreview": "https://openreview.net/forum?id=NUlyqMyhO9", "author_site": "Ruisi Cai, Yuandong Tian, Zhangyang \u201cAtlas\u201d Wang, Beidi Chen", "tldr": "", "abstract": "This paper tackles the memory hurdle of processing long context sequences in Large Language Models (LLMs), by presenting a novel approach, Dropping In Convolutions for **Lo**ng **Co**ntext **Co**mpression (**LoCoCo**). LoCoCo employs only a fixed-size Key-Value (KV) cache, and can enhance efficiency in both inference and fine-tuning stages. 
Diverging from prior methods that selectively drop KV pairs based on heuristics, LoCoCo leverages a data-driven adaptive fusion technique, blending previous KV pairs with incoming tokens to minimize the loss of contextual information and ensure accurate attention modeling. This token integration is achieved through injecting one-dimensional convolutional kernels that dynamically calculate mixing weights for each KV cache slot. Designed for broad compatibility with existing LLM frameworks, LoCoCo allows for straightforward \"drop-in\" integration without needing architectural modifications, while incurring minimal tuning overhead. Experiments demonstrate that LoCoCo maintains consistently outstanding performance across various context lengths and can achieve a high context compression rate during both inference and fine-tuning phases. During inference, we successfully compressed up to $3482$ tokens into a $128$-size KV cache, while retaining comparable performance to the full sequence - an accuracy improvement of up to $0.2791$ compared to baselines at the same cache size. During post-training tuning, we also effectively extended the context length from 4K to 32K using a KV cache of fixed size 512, achieving performance similar to fine-tuning with entire sequences.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruisi Cai;Yuandong Tian;Zhangyang Wang;Beidi Chen", "authorids": "~Ruisi_Cai1;~Yuandong_Tian1;~Zhangyang_Wang1;~Beidi_Chen1", "gender": "F;M;M;F", "homepage": "https://cairuisi.github.io;http://yuandong-tian.com;https://vita-group.github.io;https://www.andrew.cmu.edu/user/beidic/", "dblp": "341/1491;t/YuandongTian;119/4026;192/1339", "google_scholar": "B0chY1AAAAAJ;0mgEF28AAAAJ;pxFyKAIAAAAJ;", "orcid": ";0000-0003-4202-4847;;", "linkedin": ";yuandongtian;;", "or_profile": "~Ruisi_Cai1;~Yuandong_Tian1;~Zhangyang_Wang1;~Beidi_Chen1", "aff": "University of Texas at Austin;Meta AI (FAIR);University of Texas at Austin;Meta Facebook", "aff_domain": "utexas.edu;meta.com;utexas.edu;fb.com", "position": "PhD student;Research Scientist;Associate Professor;Researcher", "bibtex": "@inproceedings{\ncai2024lococo,\ntitle={LoCoCo: Dropping In Convolutions for Long Context Compression},\nauthor={Ruisi Cai and Yuandong Tian and Zhangyang Wang and Beidi Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NUlyqMyhO9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 963707, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=743574776137258585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "utexas.edu;meta.com;utexas.edu;fb.com", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Texas at Austin;Meta", "aff_unique_dep": ";Facebook AI Research (FAIR)", "aff_unique_url": "https://www.utexas.edu;https://ai.facebook.com", "aff_unique_abbr": "UT Austin;Meta AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Enhancing Vision Transformer: Amplifying Non-Linearity in Feedforward Network Module", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34201", "id": "NV0q2jdwo0", "proceeding": "https://proceedings.mlr.press/v235/xu24n.html", "pdf": "https://openreview.net/pdf?id=NV0q2jdwo0", "openreview": 
"https://openreview.net/forum?id=NV0q2jdwo0", "author_site": "Yixing Xu, Chao Li, Dong Li, Xiao Sheng, Fan Jiang, Lu Tian, Ashish Sirasao, Emad Barsoum", "tldr": "", "abstract": "Transformer models have been gaining substantial interest in the field of computer vision tasks nowadays. Although a vision transformer contains two important components which are self-attention module and feedforward network (FFN) module, the majority of research tends to concentrate on modifying the former while leaving the latter in its original form. In this paper, we focus on improving the FFN module within the vision transformer. Through theoretical analysis, we demonstrate that the effect of the FFN module primarily lies in providing non-linearity, whose degree corresponds to the hidden dimensions. Thus, the computational cost of the FFN module can be reduced by enhancing the degree of non-linearity in the nonlinear function. Leveraging this insight, we propose an improved FFN (IFFN) module for vision transformers which involves the usage of the arbitrary GeLU (AGeLU) function and integrating multiple instances of it to augment non-linearity so that the number of hidden dimensions can be effectively reduced. Besides, a spatial enhancement part is involved to further enrich the non-linearity in the proposed IFFN module. Experimental results show that we can apply our method to a wide range of state-of-the-art vision transformer models irrespective of how they modify their self-attention part and the overall architecture, and reduce FLOPs and parameters without compromising classification accuracy on the ImageNet dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yixing Xu;Chao Li;Dong Li;Xiao Sheng;Fan Jiang;Lu Tian;Ashish Sirasao;Emad Barsoum", "authorids": "~Yixing_Xu2;~Chao_Li27;~Dong_Li13;~Xiao_Sheng1;~Fan_Jiang5;~Lu_Tian3;~Ashish_Sirasao1;~Emad_Barsoum1", "gender": "M;M;;M;M;F;M;", "homepage": ";;;;;;;", "dblp": "142/1013;;;;;;19/4292.html;", "google_scholar": "32tJoOkAAAAJ;;;https://scholar.google.com/citations?view_op=list_works;;edbuKpcAAAAJ;;", "orcid": ";;;;;;;", "linkedin": ";%E8%B6%85-%E6%9D%8E-6164a6a0/;;;fan-jiang-996514268/;;;", "or_profile": "~Yixing_Xu2;~Chao_Li27;~Dong_Li13;~Xiao_Sheng1;~Fan_Jiang5;~Lu_Tian3;~Ashish_Sirasao1;~Emad_Barsoum1", "aff": "Advanced Micro Devices;;;;Advanced Micro Devices;AMD;Amd inc;", "aff_domain": "amd.com;;;;amd.com;amd.com;amd.com;", "position": "Principal Researcher;;;;Principal Researcher;Researcher;Researcher;", "bibtex": "@inproceedings{\nxu2024enhancing,\ntitle={Enhancing Vision Transformer: Amplifying Non-Linearity in Feedforward Network Module},\nauthor={Yixing Xu and Chao Li and Dong Li and Xiao Sheng and Fan Jiang and Lu Tian and Ashish Sirasao and Emad Barsoum},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NV0q2jdwo0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 713057, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13745992612807829149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "amd.com;;;;amd.com;amd.com;amd.com;", "author_num": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Advanced Micro Devices, Inc.", "aff_unique_dep": "", "aff_unique_url": "https://www.amd.com", "aff_unique_abbr": "AMD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United States" }, { "title": "NExT-GPT: Any-to-Any Multimodal LLM", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34200", "id": "NZQkumsNlf", "proceeding": "https://proceedings.mlr.press/v235/wu24e.html", "pdf": "https://openreview.net/pdf?id=NZQkumsNlf", "openreview": "https://openreview.net/forum?id=NZQkumsNlf", "author_site": "Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, Tat-Seng Chua", "tldr": "", "abstract": "While recently Multimodal Large Language Models (MM-LLMs) have made exciting strides, they mostly fall prey to the limitation of only input-side multimodal understanding, without the ability to produce content in multiple modalities. As we humans always perceive the world and communicate with people through various modalities, developing any-to-any MM-LLMs capable of accepting and delivering content in any modality becomes essential to human-level AI. To fill the gap, we present an end-to-end general-purpose any-to-any MM-LLM system, NExT-GPT. We connect an LLM with multimodal adaptors and different diffusion decoders, enabling NExT-GPT to perceive inputs and generate outputs in arbitrary combinations of text, image, video, and audio. By leveraging the existing well-trained high-performing encoders and decoders, NExT-GPT is tuned with only a small amount of parameter (1%) of certain projection layers, which not only benefits low-cost training but also facilitates convenient expansion to more potential modalities. Moreover, we introduce a modality-switching instruction tuning (MosIT) and manually curate a high-quality dataset for MosIT, based on which NExT-GPT is empowered with complex cross-modal semantic understanding and content generation. Overall, our research showcases the promising possibility of building a unified AI agent capable of modeling universal modalities, paving the way for more human-like AI research in the community.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengqiong Wu;Hao Fei;Leigang Qu;Wei Ji;Tat-Seng Chua", "authorids": "~Shengqiong_Wu2;~Hao_Fei1;~Leigang_Qu1;~Wei_Ji1;~Tat-Seng_Chua2", "gender": "F;M;M;M;M", "homepage": "https://chocowu.github.io/;https://haofei.vip/;https://leigang-qu.github.io/;https://jiwei0523.github.io/;http://www.comp.nus.edu.sg/~chuats/", "dblp": "274/7191;81/3569-1;276/3150;52/3220-8;", "google_scholar": "RJJLKR0AAAAJ;YGDX46AAAAAJ;1W2Tio4AAAAJ;69OFB-AAAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ", "orcid": "0000-0001-6192-1194;0000-0003-3026-6347;0009-0004-6555-3834;0000-0002-8106-9768;0000-0001-6097-7807", "linkedin": ";;;;", "or_profile": "~Shengqiong_Wu2;~Hao_Fei1;~Leigang_Qu1;~Wei_Ji1;~Tat-seng_Chua1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore;Nanjing University;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu.sg;u.nus.edu;nju.edu.cn;nus.edu.sg", "position": "PhD student;Postdoc;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nwu2024nextgpt,\ntitle={{NE}xT-{GPT}: Any-to-Any Multimodal {LLM}},\nauthor={Shengqiong Wu and Hao Fei and Leigang Qu and Wei Ji and Tat-Seng Chua},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NZQkumsNlf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8857405, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 616, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=174615942206434624&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "u.nus.edu;nus.edu.sg;u.nus.edu;nju.edu.cn;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "National University of Singapore;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.nju.edu.cn", "aff_unique_abbr": "NUS;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Revisit the Essence of Distilling Knowledge through Calibration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34199", "id": "NZgbwzaOIx", "proceeding": "https://proceedings.mlr.press/v235/fan24d.html", "pdf": "https://openreview.net/pdf?id=NZgbwzaOIx", "openreview": "https://openreview.net/forum?id=NZgbwzaOIx", "author_site": "Wen-Shu Fan, Su Lu, Xin-Chun Li, De-Chuan Zhan, Le Gan", "tldr": "", "abstract": "Knowledge Distillation (KD) has evolved into a practical technology for transferring knowledge from a well-performing model (teacher) to a weak model (student). A counter-intuitive phenomenon known as capacity mismatch has been identified, wherein KD performance may not be good when a better teacher instructs the student. Various preliminary methods have been proposed to alleviate capacity mismatch, but a unifying explanation for its cause remains lacking. In this paper, we propose *a unifying analytical framework to pinpoint the core of capacity mismatch based on calibration*. Through extensive analytical experiments, we observe a positive correlation between the calibration of the teacher model and the KD performance with original KD methods. As this correlation arises due to the sensitivity of metrics (e.g., KL divergence) to calibration, we recommend employing measurements insensitive to calibration such as ranking-based loss. 
Our experiments demonstrate that ranking-based loss can effectively replace KL divergence, aiding large models with poor calibration to teach better.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wen-Shu Fan;Su Lu;Xin-Chun Li;De-Chuan Zhan;Le Gan", "authorids": "~Wen-Shu_Fan2;~Su_Lu1;~Xin-Chun_Li1;~De-Chuan_Zhan1;~Le_Gan1", "gender": "M;M;M;M;M", "homepage": "http://www.lamda.nju.edu.cn/lus/;http://www.lamda.nju.edu.cn/zhandc/;;http://www.lamda.nju.edu.cn/fanws/;http://www.lamda.nju.edu.cn/lixc/", "dblp": "86/7819;74/498;199/0588.html;;https://dblp.uni-trier.de/pid/246/2947", "google_scholar": ";mYJf4TcAAAAJ;cCD5SDoAAAAJ;;7WOxRe0AAAAJ", "orcid": ";0000-0002-3533-2078;0000-0002-8260-6932;;", "linkedin": ";;;;", "or_profile": "~Su_Lu1;~De-Chuan_Zhan1;~Le_Gan1;~Wen-shu_Fan1;~Li_Xin-Chun1", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;Full Professor;Researcher;PhD student;PhD student", "bibtex": "@inproceedings{\nfan2024revisit,\ntitle={Revisit the Essence of Distilling Knowledge through Calibration},\nauthor={Wen-Shu Fan and Su Lu and Xin-Chun Li and De-Chuan Zhan and Le Gan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NZgbwzaOIx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1368105, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18220701228131397041&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34198", "id": "NbOlmrB59Z", "proceeding": "https://proceedings.mlr.press/v235/du24b.html", "pdf": "https://openreview.net/pdf?id=NbOlmrB59Z", "openreview": "https://openreview.net/forum?id=NbOlmrB59Z", "author_site": "Chaoqun Du, Yizeng Han, Gao Huang", "tldr": "", "abstract": "Recent advancements in semi-supervised learning have focused on a more realistic yet challenging task: addressing imbalances in labeled data while the class distribution of unlabeled data remains both unknown and potentially mismatched. Current approaches in this sphere often presuppose rigid assumptions regarding the class distribution of unlabeled data, thereby limiting the adaptability of models to only certain distribution ranges. In this study, we propose a novel approach, introducing a highly adaptable framework, designated as **SimPro**, which does not rely on any predefined assumptions about the distribution of unlabeled data. Our framework, grounded in a probabilistic model, innovatively refines the expectation-maximization (EM) method by separating the modeling of conditional and marginal class distributions. 
This separation facilitates a closed-form solution for class distribution estimation during the maximization phase, leading to the formulation of a Bayes classifier. The Bayes classifier, in turn, enhances the quality of pseudo-labels in the expectation phase. Remarkably, the SimPro framework is not only straightforward to implement but also comes with theoretical guarantees. Moreover, we introduce two novel class distributions, broadening the scope of the evaluation. Our method showcases consistent state-of-the-art performance across diverse benchmarks and data distribution scenarios. Our code is available at https://github.com/LeapLabTHU/SimPro.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chaoqun Du;Yizeng Han;Gao Huang", "authorids": "~Chaoqun_Du1;~Yizeng_Han1;~Gao_Huang1", "gender": "M;M;M", "homepage": ";https://yizenghan.top/;http://www.gaohuang.net", "dblp": "308/6110;217/9548;", "google_scholar": "0PSKJuYAAAAJ;25mubAsAAAAJ;-P9LwcgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chaoqun_Du1;~Yizeng_Han1;~Gao_Huang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\ndu2024simpro,\ntitle={SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed Semi-Supervised Learning},\nauthor={Chaoqun Du and Yizeng Han and Gao Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NbOlmrB59Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1083693, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10846194835459509011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Resisting Stochastic Risks in Diffusion Planners with the Trajectory Aggregation Tree", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34197", "id": "NbYAmsFJrc", "proceeding": "https://proceedings.mlr.press/v235/feng24b.html", "pdf": "https://openreview.net/pdf?id=NbYAmsFJrc", "openreview": "https://openreview.net/forum?id=NbYAmsFJrc", "author_site": "Lang Feng, Pengjie Gu, Bo An, Gang Pan", "tldr": "", "abstract": "Diffusion planners have shown promise in handling long-horizon and sparse-reward tasks due to the non-autoregressive plan generation. However, their inherent stochastic risk of generating infeasible trajectories presents significant challenges to their reliability and stability. We introduce a novel approach, the Trajectory Aggregation Tree (TAT), to address this issue in diffusion planners. Compared to prior methods that rely solely on raw trajectory predictions, TAT aggregates information from both historical and current trajectories, forming a dynamic tree-like structure. Each trajectory is conceptualized as a branch and individual states as nodes. 
As the structure evolves with the integration of new trajectories, unreliable states are marginalized, and the most impactful nodes are prioritized for decision-making. TAT can be deployed without modifying the original training and sampling pipelines of diffusion planners, making it a training-free, ready-to-deploy solution. We provide both theoretical analysis and empirical evidence to support TAT's effectiveness. Our results highlight its remarkable ability to resist the risk from unreliable trajectories, guarantee performance gains for diffusion planners in 100% of tasks, and exhibit an appreciable tolerance margin for sample quality, thereby enabling planning with a more than $3\\times$ acceleration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lang Feng;Pengjie Gu;Bo An;Gang Pan", "authorids": "~Lang_Feng1;~Pengjie_Gu1;~Bo_An2;~Gang_Pan1", "gender": ";M;M;", "homepage": "https://github.com/langfengQ;;https://personal.ntu.edu.sg/boan/;", "dblp": "211/0071-2;226/1222;42/6178-1.html;", "google_scholar": "9bT7-I0AAAAJ;;PEEpuNwAAAAJ;", "orcid": "0000-0003-2543-1344;;0000-0002-7064-7438;", "linkedin": ";;;", "or_profile": "~Lang_Feng1;~Pengjie_Gu1;~Bo_An2;~Gang_Pan1", "aff": "Zhejiang University;Nanyang Technological University;Nanyang Technological University;", "aff_domain": "zju.edu.cn;ntu.edu.sg;ntu.edu.sg;", "position": "MS student;PhD student;Full Professor;", "bibtex": "@inproceedings{\nfeng2024resisting,\ntitle={Resisting Stochastic Risks in Diffusion Planners with the Trajectory Aggregation Tree},\nauthor={Lang Feng and Pengjie Gu and Bo An and Gang Pan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NbYAmsFJrc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3081505, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16654309507595422112&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;ntu.edu.sg;ntu.edu.sg;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Zhejiang University;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.ntu.edu.sg", "aff_unique_abbr": "ZJU;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;Singapore" }, { "title": "Privately Learning Smooth Distributions on the Hypercube by Projections", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34196", "id": "NeEbsvnaWE", "proceeding": "https://proceedings.mlr.press/v235/lalanne24a.html", "pdf": "https://openreview.net/pdf?id=NeEbsvnaWE", "openreview": "https://openreview.net/forum?id=NeEbsvnaWE", "author_site": "Cl\u00e9ment Lalanne, S\u00e9bastien Gadat", "tldr": "", "abstract": "Fueled by the ever-increasing need for statistics that guarantee the privacy of their training sets, this article studies the centrally-private estimation of Sobolev-smooth probability densities over the hypercube in dimension d. 
The contributions of this article are two-fold: Firstly, it generalizes the one-dimensional results of (Lalanne et al., 2023) to non-integer levels of smoothness and to a high-dimensional setting, which is important for two reasons: it is more suited for modern learning tasks, and it allows understanding the relations between privacy, dimensionality and smoothness, which is a central question in differential privacy. Secondly, this article presents a private estimation strategy that is data-driven (usually referred to as adaptive in Statistics) in order to privately choose an estimator that achieves a good bias-variance trade-off among a finite family of private projection estimators, without prior knowledge of the ground-truth smoothness \u03b2. This is achieved by adapting the Lepskii method for private selection and adding a new penalization term that makes the estimation privacy-aware.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cl\u00e9ment Lalanne;S\u00e9bastien Gadat", "authorids": "~Cl\u00e9ment_Lalanne1;~S\u00e9bastien_Gadat1", "gender": ";", "homepage": "https://www.clemlal.github.io;https://perso.math.univ-toulouse.fr/gadat/", "dblp": ";", "google_scholar": "uGRMC34AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Cl\u00e9ment_Lalanne1;~S\u00e9bastien_Gadat1", "aff": "Toulouse School of Economics;Toulouse School of Economics", "aff_domain": "tse-fr.eu;tse.fr", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nlalanne2024privately,\ntitle={Privately Learning Smooth Distributions on the Hypercube by Projections},\nauthor={Cl{\\'e}ment Lalanne and S{\\'e}bastien Gadat},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NeEbsvnaWE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 616984, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2062558356750959413&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "tse-fr.eu;tse.fr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Toulouse School of Economics", "aff_unique_dep": "", "aff_unique_url": "https://www.tse-fr.eu", "aff_unique_abbr": "TSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Augmenting Decision with Hypothesis in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34195", "id": "NeO2hoSexj", "proceeding": "https://proceedings.mlr.press/v235/quang24a.html", "pdf": "https://openreview.net/pdf?id=NeO2hoSexj", "openreview": "https://openreview.net/forum?id=NeO2hoSexj", "author_site": "Nguyen Minh Quang, Hady Lauw", "tldr": "", "abstract": "Value-based reinforcement learning is the current state of the art due to its high sampling efficiency. However, our study shows that it suffers from low exploitation in the early training period and from sensitivity to bias. To address these issues, we propose to augment the decision-making process with a hypothesis, a weak form of environment description. Our approach relies on prompting the learning agent with accurate hypotheses, and designing a ready-to-adapt policy through incremental learning. We propose the ALH algorithm, with detailed analyses on a typical learning scheme and a diverse set of Mujoco benchmarks. 
Our algorithm produces a significant improvement over value-based learning algorithms and other strong baselines. Our code is available at [Github URL](https://github.com/nbtpj/ALH).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nguyen Minh Quang;Hady W. Lauw", "authorids": "~Nguyen_Minh_Quang1;~Hady_W._Lauw1", "gender": "M;M", "homepage": "https://www.nguyenminhquang.com/;http://www.hadylauw.com", "dblp": "384/4286;00/2494", "google_scholar": "vPOA7EcAAAAJ;HTC1z2gAAAAJ", "orcid": "0009-0006-5080-0702;0000-0002-8245-8677", "linkedin": "mquang-nguyen/;hadylauw", "or_profile": "~Nguyen_Minh_Quang1;~Hady_W_Lauw1", "aff": "Singapore Management University;Singapore Management University", "aff_domain": "smu.edu.sg;smu.edu.sg", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nquang2024augmenting,\ntitle={Augmenting Decision with Hypothesis in Reinforcement Learning},\nauthor={Nguyen Minh Quang and Hady W. Lauw},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NeO2hoSexj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2062047, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e1tYcy2TNQUJ:scholar.google.com/&scioq=Augmenting+Decision+with+Hypothesis+in+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "smu.edu.sg;smu.edu.sg", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Singapore Management University", "aff_unique_dep": "", "aff_unique_url": "https://www.smu.edu.sg", "aff_unique_abbr": "SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "SpikeZIP-TF: Conversion is All You Need for Transformer-based SNN", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34194", "id": "NeotatlYOL", "proceeding": "https://proceedings.mlr.press/v235/you24b.html", "pdf": "https://openreview.net/pdf?id=NeotatlYOL", "openreview": "https://openreview.net/forum?id=NeotatlYOL", "author_site": "kang you, Zekai Xu, Chen Nie, Zhijie Deng, Qinghai Guo, Xiang Wang, Zhezhi He", "tldr": "", "abstract": "Spiking neural networks (SNNs) have attracted great attention due to their high efficiency and accuracy. Currently, ANN-to-SNN conversion methods can obtain CNN-structured SNNs whose accuracy is on par with their ANN counterparts at ultra-low latency (8 time-steps) on computer vision (CV) tasks. However, while Transformer-based networks have achieved prevailing accuracy on both CV and natural language processing (NLP) tasks, Transformer-based SNNs still encounter lower accuracy than their ANN counterparts. In this work, we introduce a novel ANN-to-SNN conversion method called SpikeZIP-TF, where the ANN and the SNN are exactly equivalent, thus incurring no accuracy degradation. SpikeZIP-TF achieves 83.82% accuracy on a CV dataset (ImageNet) and 93.79% accuracy on an NLP dataset (SST-2), which are higher than those of SOTA Transformer-based SNNs. 
The code is available on GitHub: https://github.com/Intelligent-Computing-Research-Group/SpikeZIP_transformer", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "kang you;Zekai Xu;Chen Nie;Zhijie Deng;Qinghai Guo;Xiang Wang;Zhezhi He", "authorids": "~kang_you2;~Zekai_Xu1;~Chen_Nie1;~Zhijie_Deng1;~Qinghai_Guo1;~Xiang_Wang14;~Zhezhi_He1", "gender": "M;M;M;M;M;;M", "homepage": ";;https://www.baidu.com;https://thudzj.github.io/;https://www.semanticscholar.org/author/Qinghai-Guo/47747957;;https://elliothe.github.io/", "dblp": ";244/3434;;209/4959;12/8502;;184/1264", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;;J3dR0sUAAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0002-0932-1631;0000-0003-4697-9464;0000-0003-1735-633X;0000-0002-6357-236X", "linkedin": ";;;;;;", "or_profile": "~kang_you2;~Zekai_Xu1;~Chen_Nie1;~Zhijie_Deng1;~Qinghai_Guo1;~Xiang_Wang14;~Zhezhi_He1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;huawei.com;huawei.com;sjtu.edu.cn", "position": "PhD student;PhD student;PhD student;Assistant Professor;Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nyou2024spikeziptf,\ntitle={Spike{ZIP}-{TF}: Conversion is All You Need for Transformer-based {SNN}},\nauthor={kang you and Zekai Xu and Chen Nie and Zhijie Deng and Qinghai Guo and Xiang Wang and Zhezhi He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NeotatlYOL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1891704, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9560996825776379125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;huawei.com;huawei.com;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;1;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Geometry-Calibrated DRO: Combating Over-Pessimism with Free Energy Implications", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34193", "id": "NgaYcefBnZ", "proceeding": "https://proceedings.mlr.press/v235/liu24br.html", "pdf": "https://openreview.net/pdf?id=NgaYcefBnZ", "openreview": "https://openreview.net/forum?id=NgaYcefBnZ", "author_site": "Jiashuo Liu, Jiayun Wu, Tianyu Wang, Hao Zou, Bo Li, Peng Cui", "tldr": "", "abstract": "Machine learning algorithms minimizing average risk are susceptible to distributional shifts. Distributionally Robust Optimization (DRO) addresses this issue by optimizing the worst-case risk within an uncertainty set. However, DRO suffers from over-pessimism, leading to low-confidence predictions, poor parameter estimations as well as poor generalization. In this work, we conduct a theoretical analysis of a probable root cause of over-pessimism: excessive focus on noisy samples. 
To alleviate the impact of noise, we incorporate data geometry into calibration terms in DRO, resulting in our novel Geometry-Calibrated DRO (GCDRO) for regression. We establish the connection between our risk objective and the Helmholtz free energy in statistical physics, and this free-energy-based risk can extend to standard DRO methods. Leveraging gradient flow in Wasserstein space, we develop an approximate minimax optimization algorithm with a bounded error ratio and elucidate how our approach mitigates noisy sample effects. Comprehensive experiments confirm GCDRO's superiority over conventional DRO methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiashuo Liu;Jiayun Wu;Tianyu Wang;Hao Zou;Bo Li;Peng Cui", "authorids": "~Jiashuo_Liu1;~Jiayun_Wu1;~Tianyu_Wang6;~Hao_Zou1;~Bo_Li29;~Peng_Cui1", "gender": "M;M;M;M;M;M", "homepage": "https://ljsthu.github.io;https://ic-hub.github.io;https://wangtianyu61.github.io;https://scholar.google.com/citations?user=f5cbI4cAAAAJ&hl=en;http://www.sem.tsinghua.edu.cn/en/libo;http://pengcui.thumedialab.com/", "dblp": "180/2823;00/9456;;13/4741-1;50/3402-64;31/891-1", "google_scholar": "b7bpt5MAAAAJ;https://scholar.google.com/citations?hl=en;mKT6mKEAAAAJ;f5cbI4cAAAAJ;GaJXFWMAAAAJ;https://scholar.google.com.tw/citations?user=G8x97ZgAAAAJ", "orcid": ";0009-0007-7131-7290;0009-0000-2095-431X;0000-0002-6000-6936;0000-0001-5599-8857;0000-0003-2957-8511", "linkedin": "jiashuo-liu-244a6b1a4;jiayun-wu-4aa86323a/;;;;", "or_profile": "~Jiashuo_Liu1;~Jiayun_Wu1;~Tianyu_Wang6;~Hao_Zou1;~Bo_Li29;~Peng_Cui1", "aff": "University of Cambridge;Tsinghua University;Columbia University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "cam.ac.uk;mails.tsinghua.edu.cn;columbia.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Researcher;MS student;PhD student;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024geometrycalibrated,\ntitle={Geometry-Calibrated {DRO}: Combating Over-Pessimism with Free Energy Implications},\nauthor={Jiashuo Liu and Jiayun Wu and Tianyu Wang and Hao Zou and Bo Li and Peng Cui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NgaYcefBnZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5398606, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10152050426102300533&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "cam.ac.uk;mails.tsinghua.edu.cn;columbia.edu;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "University of Cambridge;Tsinghua University;Columbia University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;https://www.tsinghua.edu.cn;https://www.columbia.edu", "aff_unique_abbr": "Cambridge;THU;Columbia", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;2;1;1;1", "aff_country_unique": "United Kingdom;China;United States" }, { "title": "A Federated Stochastic Multi-level Compositional Minimax Algorithm for Deep AUC Maximization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34192", "id": "NkN6wrYXe5", "proceeding": "https://proceedings.mlr.press/v235/zhang24aw.html", "pdf": "https://openreview.net/pdf?id=NkN6wrYXe5", "openreview": 
"https://openreview.net/forum?id=NkN6wrYXe5", "author_site": "Xinwen Zhang, Ali Payani, Myungjin Lee, Richard Souvenir, Hongchang Gao", "tldr": "", "abstract": "AUC maximization is an effective approach to address the imbalanced data classification problem in federated learning. In the past few years, a couple of federated AUC maximization approaches have been developed based on the minimax optimization. However, directly solving a minimax optimization problem to maximize the AUC score cannot achieve satisfactory performance. To address this issue, we propose to maximize AUC via optimizing a federated multi-level compositional minimax problem. Specifically, we develop a novel federated multi-level compositional minimax algorithm with rigorous theoretical guarantees to solve this new learning paradigm in both algorithmic design and theoretical analysis. To the best of our knowledge, this is the first work studying the multi-level minimax optimization problem. Additionally, extensive empirical evaluations confirm the efficacy of our proposed approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinwen Zhang;Ali Payani;Myungjin Lee;Richard Souvenir;Hongchang Gao", "authorids": "~Xinwen_Zhang3;~Ali_Payani1;~Myungjin_Lee1;~Richard_Souvenir2;~Hongchang_Gao3", "gender": "F;M;M;M;", "homepage": ";;;https://cis.temple.edu/~souvenir/;", "dblp": "14/3612;184/3921;;95/5553;", "google_scholar": "6pZyGBQAAAAJ;9rHwD8wAAAAJ;XjWpxJUAAAAJ;1sMNiJIAAAAJ;", "orcid": "0009-0002-1981-7523;0000-0003-4054-2958;0000-0003-2360-7019;0000-0002-6066-0946;", "linkedin": "xinwen-zhang-54a485249;ali-payani-59267515;;;", "or_profile": "~Xinwen_Zhang3;~Ali_Payani1;~Myungjin_Lee1;~Richard_Souvenir2;~Hongchang_Gao3", "aff": "Temple University;Cisco;Cisco;Temple University;", "aff_domain": "temple.edu;cisco.com;cisco.com;temple.edu;", "position": "PhD student;Researcher;Principal Researcher;Full Professor;", "bibtex": "@inproceedings{\nzhang2024a,\ntitle={A Federated Stochastic Multi-level Compositional Minimax Algorithm for Deep {AUC} Maximization},\nauthor={Xinwen Zhang and Ali Payani and Myungjin Lee and Richard Souvenir and Hongchang Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NkN6wrYXe5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 675927, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VdXAAt-mu0AJ:scholar.google.com/&scioq=A+Federated+Stochastic+Multi-level+Compositional+Minimax+Algorithm+for+Deep+AUC+Maximization&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "temple.edu;cisco.com;cisco.com;temple.edu;", "author_num": 5, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Temple University;Cisco Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.temple.edu;https://www.cisco.com", "aff_unique_abbr": "Temple;Cisco", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Topological Deep Learning is the New Frontier for Relational Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34191", "id": "Nl3RG5XWAt", "proceeding": "https://proceedings.mlr.press/v235/papamarkou24a.html", "pdf": "https://openreview.net/pdf?id=Nl3RG5XWAt", "openreview": "https://openreview.net/forum?id=Nl3RG5XWAt", "author_site": "Theodore Papamarkou, 
Tolga Birdal, Michael Bronstein, Gunnar Carlsson, Justin Curry, Yue Gao, Mustafa Hajij, Roland Kwitt, Pietro Li\u00f3, Paolo Di Lorenzo, Vasileios Maroulas, Nina Miolane, Farzana Nasrin, Karthikeyan Ramamurthy, Bastian Rieck, Simone Scardapane, Michael Schaub, Petar Veli\u010dkovi\u0107, Bei Wang, Yusu Wang, Guowei Wei, Ghada Zam", "tldr": "", "abstract": "Topological deep learning (TDL) is a rapidly evolving field that uses topological features to understand and design deep learning models. This paper posits that TDL is the new frontier for relational learning. TDL may complement graph representation learning and geometric deep learning by incorporating topological concepts, and can thus provide a natural choice for various machine learning settings. To this end, this paper discusses open problems in TDL, ranging from practical benefits to theoretical foundations. For each problem, it outlines potential solutions and future research opportunities. At the same time, this paper serves as an invitation to the scientific community to actively participate in TDL research to unlock the potential of this emerging field.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Theodore Papamarkou;Tolga Birdal;Michael M. Bronstein;Gunnar E. Carlsson;Justin Curry;Yue Gao;Mustafa Hajij;Roland Kwitt;Pietro Lio;Paolo Di Lorenzo;Vasileios Maroulas;Nina Miolane;Farzana Nasrin;Karthikeyan Natesan Ramamurthy;Bastian Rieck;Simone Scardapane;Michael T Schaub;Petar Veli\u010dkovi\u0107;Bei Wang;Yusu Wang;Guowei Wei;Ghada Zamzmi", "authorids": "~Theodore_Papamarkou1;~Tolga_Birdal3;~Michael_M._Bronstein1;~Gunnar_E._Carlsson1;~Justin_Curry1;~Yue_Gao4;~Mustafa_Hajij1;~Roland_Kwitt1;~Pietro_Lio1;~Paolo_Di_Lorenzo1;vmaroula@utk.edu;~Nina_Miolane2;fnasrin@hawaii.edu;~Karthikeyan_Natesan_Ramamurthy1;~Bastian_Rieck1;~Simone_Scardapane1;~Michael_T_Schaub1;~Petar_Veli\u010dkovi\u01071;~Bei_Wang3;~Yusu_Wang1;~Guowei_Wei1;~Ghada_Zamzmi2", "gender": "M;M;M;M;M;M;M;M;M;M;;;;;M;M;;M;F;;M;", "homepage": "https://www.theopapamarkou.com/;http://tolgabirdal.github.io;http://www.inf.usi.ch/bronstein/;;http://justinmcurry.com/;http://www.gaoyue.org;;http://rkwitt.org;https://www.cst.cam.ac.uk/people/pl219;https://sites.google.com/site/paolodilorenzohp/;;https://www.ece.ucsb.edu/people/faculty/nina-miolane;;https://nrkarthikeyan.github.io/;https://bastian.rieck.me;http://ispac.diet.uniroma1.it/scardapane/;https://michaelschaub.github.io/;https://petar-v.com;http://www.sci.utah.edu/~beiwang/;;https://users.math.msu.edu/users/weig/;", "dblp": ";143/7056;07/2668;24/6871;;33/3099-2;;60/4140;l/PietroLio.html;42/9879;;;;58/7800;119/8860;144/2184;72/10263;184/4786.html;08/6391-1;;;", "google_scholar": "ydMfbhAAAAAJ;_Bxd5ggAAAAJ;UU3N6-UAAAAJ;;9RL0BDsAAAAJ;UTDfWocAAAAJ;6fwWEFoAAAAJ;https://scholar.google.at/citations?user=sfGFi6UAAAAJ;https://scholar.google.co.uk/citations?user=3YrWf7EAAAAJ;https://scholar.google.it/citations?user=VZYvspQAAAAJ;;;;mG8HuhEAAAAJ;https://scholar.google.ch/citations?user=La7zuKQAAAAJ;https://scholar.google.it/citations?user=aSuosYoAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=kcTK_FAAAAAJ;;;If_mksUAAAAJ;", "orcid": "0000-0002-9689-543X;0000-0001-7915-7964;;;0000-0003-2504-8388;;;;0000-0002-0540-5053;;;;;0000-0002-6021-5930;0000-0003-4335-0302;0000-0003-0881-8344;0000-0003-2426-6404;0000-0002-2820-4692;;;;", "linkedin": 
"papamarkou/;https://linkedin.com/in/tbirdal;mbronstein/;;;;mustafa-hajij/;;;;;;;;br-ml/;simonescardapane;;petarvelickovic;;;;", "or_profile": "~Theodore_Papamarkou1;~Tolga_Birdal3;~Michael_M._Bronstein1;~Gunnar_E._Carlsson1;~Justin_Curry1;~Yue_Gao4;~Mustafa_Hajij1;~Roland_Kwitt1;~Pietro_Lio1;~Paolo_Di_Lorenzo1;vmaroula@utk.edu;~Nina_Miolane2;fnasrin@hawaii.edu;~Karthikeyan_Natesan_Ramamurthy1;~Bastian_Rieck1;~Simone_Scardapane1;~Michael_T_Schaub1;~Petar_Veli\u010dkovi\u01071;~Bei_Wang3;~Yusu_Wang1;~Guowei_Wei1;~Ghada_Zamzmi2", "aff": "University of Manchester;Imperial College London;University of Oxford;;;Tsinghua University;Santa Clara University;University of Salzburg;University of Cambridge;University of Roma \"La Sapienza\";;University of California, Santa Barbara;;International Business Machines;Helmholtz Zentrum M\u00fcnchen;Sapienza University of Rome;RWTH Aachen University;Google DeepMind;University of Utah;;Michigan State University;", "aff_domain": "manchester.ac.uk;imperial.ac.uk;ox.ac.uk;;;tsinghua.edu.cn;scu.edu;sbg.ac.at;cam.ac.uk;uniroma1.it;;ucsb.edu;;ibm.com;helmholtz-munich.de;uniroma1.it;rwth-aachen.de;google.com;utah.edu;;msu.edu;", "position": "Full Professor;Assistant Professor;Full Professor;;;Associate Professor;Assistant Professor;Full Professor;Full Professor;Associate Professor;;Assistant Professor;;Research Staff Member;Principal Investigator;Assistant Professor;Assistant Professor;Senior Staff Research Scientist;Associate Professor;;Full Professor;", "bibtex": "@inproceedings{\npapamarkou2024position,\ntitle={Position: Topological Deep Learning is the New Frontier for Relational Learning},\nauthor={Theodore Papamarkou and Tolga Birdal and Michael M. Bronstein and Gunnar E. Carlsson and Justin Curry and Yue Gao and Mustafa Hajij and Roland Kwitt and Pietro Lio and Paolo Di Lorenzo and Vasileios Maroulas and Nina Miolane and Farzana Nasrin and Karthikeyan Natesan Ramamurthy and Bastian Rieck and Simone Scardapane and Michael T Schaub and Petar Veli{\\v{c}}kovi{\\'c} and Bei Wang and Yusu Wang and Guowei Wei and Ghada Zamzmi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Nl3RG5XWAt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 505675, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 22, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2892841484993613602&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "manchester.ac.uk;imperial.ac.uk;ox.ac.uk;;;tsinghua.edu.cn;scu.edu;sbg.ac.at;cam.ac.uk;uniroma1.it;;ucsb.edu;;ibm.com;helmholtz-munich.de;uniroma1.it;rwth-aachen.de;google.com;utah.edu;;msu.edu;", "author_num": 22, "aff_unique_index": "0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15", "aff_unique_norm": "University of Manchester;Imperial College London;University of Oxford;Tsinghua University;Santa Clara University;University of Salzburg;University of Cambridge;University of Rome La Sapienza;University of California, Santa Barbara;International Business Machines Corporation;Helmholtz Zentrum M\u00fcnchen;Sapienza University of Rome;RWTH Aachen University;Google;University of Utah;Michigan State University", "aff_unique_dep": ";;;;;;;;;;;;;Google DeepMind;;", "aff_unique_url": 
"https://www.manchester.ac.uk;https://www.imperial.ac.uk;https://www.ox.ac.uk;https://www.tsinghua.edu.cn;https://www.scu.edu;https://www.uni-salzburg.at;https://www.cam.ac.uk;https://www.uniroma1.it;https://www.ucsb.edu;https://www.ibm.com;https://www.helmholtz-muenchen.de;https://www.uniroma1.it;https://www.rwth-aachen.de;https://deepmind.com;https://www.utah.edu;https://www.msu.edu", "aff_unique_abbr": "UoM;ICL;Oxford;THU;SCU;USAL;Cambridge;La Sapienza;UCSB;IBM;;Sapienza;RWTH;DeepMind;Utah;MSU", "aff_campus_unique_index": "1;2;3;2;4", "aff_campus_unique": ";Cambridge;Rome;Santa Barbara;Aachen", "aff_country_unique_index": "0;0;0;1;2;3;0;4;2;2;5;4;5;0;2;2", "aff_country_unique": "United Kingdom;China;United States;Austria;Italy;Germany" }, { "title": "Sequence Compression Speeds Up Credit Assignment in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34190", "id": "NlM4gp8hyO", "proceeding": "https://proceedings.mlr.press/v235/ramesh24b.html", "pdf": "https://openreview.net/pdf?id=NlM4gp8hyO", "openreview": "https://openreview.net/forum?id=NlM4gp8hyO", "author_site": "Aditya A. Ramesh, Kenny Young, Louis Kirsch, J\u00fcrgen Schmidhuber", "tldr": "", "abstract": "Temporal credit assignment in reinforcement learning is challenging due to delayed and stochastic outcomes. Monte Carlo targets can bridge long delays between action and consequence but lead to high-variance targets due to stochasticity. Temporal difference (TD) learning uses bootstrapping to overcome variance but introduces a bias that can only be corrected through many iterations. TD($\\lambda$) provides a mechanism to navigate this bias-variance tradeoff smoothly. Appropriately selecting $\\lambda$ can significantly improve performance. Here, we propose Chunked-TD, which uses predicted probabilities of transitions from a model for computing $\\lambda$-return targets. Unlike other model-based solutions to credit assignment, Chunked-TD is less vulnerable to model inaccuracies. Our approach is motivated by the principle of history compression and \u2018chunks\u2019 trajectories for conventional TD learning. Chunking with learned world models compresses near-deterministic regions of the environment-policy interaction to speed up credit assignment while still bootstrapping when necessary. 
We propose algorithms that can be implemented online and show that they solve some problems much faster than conventional TD($\\lambda$).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aditya Ramesh;Kenny John Young;Louis Kirsch;J\u00fcrgen Schmidhuber", "authorids": "~Aditya_Ramesh2;~Kenny_John_Young1;~Louis_Kirsch1;~J\u00fcrgen_Schmidhuber1", "gender": "M;M;;M", "homepage": "https://adityaramesh.in;;http://louiskirsch.com;http://people.idsia.ch/~juergen/", "dblp": ";179/2526;202/2379;s/JurgenSchmidhuber", "google_scholar": "https://scholar.google.ch/citations?user=60K82BkAAAAJ;;w8AkOEAAAAAJ;https://scholar.google.ch/citations?user=gLnCTgIAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Aditya_Ramesh2;~Kenny_John_Young1;~Louis_Kirsch1;~J\u00fcrgen_Schmidhuber1", "aff": "University of Alberta;University of Alberta;Scuola universitaria professionale della Svizzera italiana (SUPSI);IDSIA", "aff_domain": "ualberta.ca;ualberta.ca;supsi.ch;idsia.ch", "position": "Visiting PhD student;PhD student;PhD student;Scientific Director", "bibtex": "@inproceedings{\nramesh2024sequence,\ntitle={Sequence Compression Speeds Up Credit Assignment in Reinforcement Learning},\nauthor={Aditya Ramesh and Kenny John Young and Louis Kirsch and J{\\\"u}rgen Schmidhuber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NlM4gp8hyO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3826335, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14156264324748273926&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ualberta.ca;ualberta.ca;supsi.ch;idsia.ch", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Alberta;Scuola universitaria professionale della Svizzera italiana;Institute of Digital Technologies", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ualberta.ca;https://www.supsi.ch;https://www.idsia.ch", "aff_unique_abbr": "UAlberta;SUPSI;IDSIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "Canada;Switzerland" }, { "title": "Improving Context Understanding in Multimodal Large Language Models via Multimodal Composition Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34189", "id": "Nm6jYZsBum", "proceeding": "https://proceedings.mlr.press/v235/li24s.html", "pdf": "https://openreview.net/pdf?id=Nm6jYZsBum", "openreview": "https://openreview.net/forum?id=Nm6jYZsBum", "author_site": "Wei Li, Hehe Fan, Yongkang Wong, Yi Yang, Mohan Kankanhalli", "tldr": "", "abstract": "Previous efforts using frozen Large Language Models (LLMs) for visual understanding, via image captioning or image-text retrieval tasks, face challenges when dealing with complex multimodal scenarios. In order to enhance the capabilities of Multimodal Large Language Models (MLLM) in comprehending the context of vision and language, we introduce Multimodal Composition Learning (MCL) for the purpose of mapping or aligning the vision and language input. In particular, we introduce two tasks: Multimodal-Context Captioning (MC-Cap) and Multimodal-Context Retrieval (MC-Ret) to guide a frozen LLM in comprehending the vision and language context. 
These specialized tasks are crafted to improve the LLM\u2019s capacity for efficient processing and utilization of multimodal inputs, thereby enhancing its proficiency in generating more accurate text or visual representations. Extensive experiments on both retrieval tasks (i.e., zero-shot composed image retrieval, visual storytelling image retrieval and visual dialog image retrieval) and text generation tasks (i.e., visual question answering) demonstrate the effectiveness of the proposed method. The code is available at: https://github.com/dhg-wei/MCL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Li;Hehe Fan;Yongkang Wong;Yi Yang;Mohan Kankanhalli", "authorids": "~Wei_Li55;~Hehe_Fan1;~Yongkang_Wong1;~Yi_Yang4;~Mohan_Kankanhalli1", "gender": "M;M;M;M;M", "homepage": "https://github.com/lw-2018;https://hehefan.github.io;https://sites.google.com/site/yongkangwong/;http://reler.net/;https://www.comp.nus.edu.sg/~mohan", "dblp": ";184/5722.html;89/7407;;09/3613.html", "google_scholar": "hDubMJwAAAAJ;hVuflMQAAAAJ;https://scholar.google.com.sg/citations?user=Xa0mxggAAAAJ;https://scholar.google.com.au/citations?user=RMSuNFwAAAAJ;6Lx_eowAAAAJ", "orcid": ";0000-0001-9572-2345;0000-0002-1239-4428;;0000-0002-4846-2015", "linkedin": ";;yongkangwong/;;mohan-kankanhalli-583417221", "or_profile": "~Wei_Li55;~Hehe_Fan1;~Yongkang_Wong1;~Yi_Yang4;~Mohan_Kankanhalli1", "aff": "National University of Singapore;Zhejiang University;National University of Singapore;Zhejiang University;National University of Singapore", "aff_domain": "nus.edu;zju.edu.cn;nus.edu.sg;zju.edu.cn;nus.edu.sg", "position": "Intern;Assistant Professor;Senior Research Fellow;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024improving,\ntitle={Improving Context Understanding in Multimodal Large Language Models via Multimodal Composition Learning},\nauthor={Wei Li and Hehe Fan and Yongkang Wong and Yi Yang and Mohan Kankanhalli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Nm6jYZsBum}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4686452, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18012283362063815007&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "nus.edu;zju.edu.cn;nus.edu.sg;zju.edu.cn;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "National University of Singapore;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.zju.edu.cn", "aff_unique_abbr": "NUS;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "Singapore;China" }, { "title": "LIDAO: Towards Limited Interventions for Debiasing (Large) Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34188", "id": "NsHxeSCtgr", "proceeding": "https://proceedings.mlr.press/v235/liu24bm.html", "pdf": "https://openreview.net/pdf?id=NsHxeSCtgr", "openreview": "https://openreview.net/forum?id=NsHxeSCtgr", "author_site": "Tianci Liu, Haoyu Wang, Shiyang Wang, Yu Cheng, Jing Gao", "tldr": "", "abstract": "Large language models (LLMs) have achieved impressive performance on various natural language generation tasks. 
Nonetheless, they suffer from generating negative and harmful content that is biased against certain demographic groups (e.g., women), raising severe fairness concerns. As remedies, prior works intervened in the generation by removing attitude or demographic information, inevitably degrading the generation quality and resulting in notable *fairness-fluency* trade-offs. However, it is still under-explored to what extent the fluency *has to* be affected in order to achieve a desired level of fairness. In this work, we conduct the first formal study from an information-theoretic perspective. We show that previous approaches are excessive for debiasing and propose LIDAO, a general framework that provably debiases a (L)LM while better preserving fluency. We further robustify LIDAO in adversarial scenarios, where a carefully-crafted prompt may stimulate LLMs exhibiting instruction-following abilities to generate texts whose fairness issues appear only when the prompt is also taken into account. Experiments on three LMs ranging from 0.7B to 7B parameters demonstrate the superiority of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianci Liu;Haoyu Wang;Shiyang Wang;Yu Cheng;Jing Gao", "authorids": "~Tianci_Liu1;~Haoyu_Wang6;wang5348@purdue.edu;~Yu_Cheng1;~Jing_Gao2", "gender": "M;M;;M;F", "homepage": "https://lliutianc.github.io;https://sites.google.com/view/haoyuwang/home;;https://ych133.github.io;https://engineering.purdue.edu/~jinggao/", "dblp": "148/1911-3;50/8499-4;;96/3060-1.html;67/4834-4", "google_scholar": ";https://scholar.google.com.hk/citations?user=5Lw9_jcAAAAJ;;https://scholar.google.com/citations?hl=en;Ftj1h4cAAAAJ", "orcid": ";0000-0001-7485-6213;;;", "linkedin": ";;;chengyu05/;", "or_profile": "~Tianci_Liu1;~Haoyu_Wang6;wang5348@purdue.edu;~Yu_Cheng1;~Jing_Gao2", "aff": "Purdue University;Purdue University;;The Chinese University of Hong Kong;Purdue University", "aff_domain": "purdue.edu;purdue.edu;;cuhk.edu.hk;purdue.edu", "position": "PhD student;PhD student;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024lidao,\ntitle={{LIDAO}: Towards Limited Interventions for Debiasing (Large) Language Models},\nauthor={Tianci Liu and Haoyu Wang and Shiyang Wang and Yu Cheng and Jing Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NsHxeSCtgr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 385466, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e09jJFkU1h4J:scholar.google.com/&scioq=LIDAO:+Towards+Limited+Interventions+for+Debiasing+(Large)+Language+Models&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "purdue.edu;purdue.edu;;cuhk.edu.hk;purdue.edu", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Purdue University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "Purdue;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Multigroup Robustness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34187", "id": "Nue7KgVZ6e", "proceeding": "https://proceedings.mlr.press/v235/hu24l.html", "pdf": "https://openreview.net/pdf?id=Nue7KgVZ6e", "openreview": 
"https://openreview.net/forum?id=Nue7KgVZ6e", "author_site": "Lunjia Hu, Charlotte Peale, Judy Hanwen Shen", "tldr": "", "abstract": "To address the shortcomings of real-world datasets, robust learning algorithms have been designed to overcome arbitrary and indiscriminate data corruption. However, practical processes of gathering data may lead to patterns of data corruption that are localized to specific partitions of the training dataset. Motivated by critical applications where the learned model is deployed to make predictions about people from a rich collection of overlapping subpopulations, we initiate the study of *multigroup robust* algorithms whose robustness guarantees for each subpopulation only degrade with the amount of data corruption *inside* that subpopulation. When the data corruption is not distributed uniformly over subpopulations, our algorithms provide more meaningful robustness guarantees than standard guarantees that are oblivious to how the data corruption and the affected subpopulations are related. Our techniques establish a new connection between multigroup fairness and robustness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lunjia Hu;Charlotte Peale;Judy Hanwen Shen", "authorids": "~Lunjia_Hu1;cpeale@stanford.edu;~Judy_Hanwen_Shen1", "gender": "M;;F", "homepage": "https://lunjiahu.com;;http://heyyjudes.github.io/", "dblp": "195/6273;;217/2243", "google_scholar": "ss7CIgcAAAAJ;;LCjSZ3eS8pIC", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lunjia_Hu1;cpeale@stanford.edu;~Judy_Hanwen_Shen1", "aff": "Stanford University;;Apple", "aff_domain": "stanford.edu;;apple.com", "position": "PhD student;;Intern", "bibtex": "@inproceedings{\nhu2024multigroup,\ntitle={Multigroup Robustness},\nauthor={Lunjia Hu and Charlotte Peale and Judy Hanwen Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Nue7KgVZ6e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 919769, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3096944193569623146&as_sdt=805&sciodt=0,3&hl=en", "gs_version_total": 8, "email": "stanford.edu;;apple.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://www.stanford.edu;https://www.apple.com", "aff_unique_abbr": "Stanford;Apple", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Spectral Preconditioning for Gradient Methods on Graded Non-convex Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34186", "id": "NvBJOcmti6", "proceeding": "https://proceedings.mlr.press/v235/doikov24a.html", "pdf": "https://openreview.net/pdf?id=NvBJOcmti6", "openreview": "https://openreview.net/forum?id=NvBJOcmti6", "author_site": "Nikita Doikov, Sebastian Stich, Martin Jaggi", "tldr": "", "abstract": "The performance of optimization methods is often tied to the spectrum of the objective Hessian. Yet, conventional assumptions, such as smoothness, do often not enable us to make finely-grained convergence statements\u2014particularly not for non-convex problems. Striving for a more intricate characterization of complexity, we introduce a unique concept termed graded non-convexity. 
This allows us to partition the class of non-convex problems into a nested chain of subclasses. Interestingly, many traditional non-convex objectives, including partially convex problems, matrix factorizations, and neural networks, fall within these subclasses. As a second contribution, we propose gradient methods with spectral preconditioning, which employ inexact top eigenvectors of the Hessian to address the ill-conditioning of the problem, contingent on the grade. Our analysis reveals that these new methods provide provably superior convergence rates compared to basic gradient descent on applicable problem classes, particularly when large gaps exist between the top eigenvalues of the Hessian. Our theory is validated by numerical experiments executed on multiple practical machine learning problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikita Doikov;Sebastian U Stich;Martin Jaggi", "authorids": "~Nikita_Doikov1;~Sebastian_U_Stich1;~Martin_Jaggi1", "gender": ";M;M", "homepage": "https://doikov.com;https://www.sstich.ch;https://mlo.epfl.ch", "dblp": "222/9897;04/10549;17/4402", "google_scholar": "YNBhhjUAAAAJ;https://scholar.google.ch/citations?user=8l-mDfQAAAAJ;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ", "orcid": ";;0000-0003-1579-5558", "linkedin": ";;", "or_profile": "~Nikita_Doikov1;~Sebastian_U_Stich1;~Martin_Jaggi1", "aff": "EPFL - EPF Lausanne;CISPA Helmholtz Center for Information Security;EPFL", "aff_domain": "epfl.ch;cispa.de;epfl.ch", "position": "Postdoc;Tenure Track Faculty;Associate Professor", "bibtex": "@inproceedings{\ndoikov2024spectral,\ntitle={Spectral Preconditioning for Gradient Methods on Graded Non-convex Functions},\nauthor={Nikita Doikov and Sebastian U Stich and Martin Jaggi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NvBJOcmti6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1227289, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7062226831035378659&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "epfl.ch;cispa.de;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;CISPA Helmholtz Center for Information Security", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.cispa.de/", "aff_unique_abbr": "EPFL;CISPA", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "Differentially Private Representation Learning via Image Captioning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34185", "id": "Nw7yOe8nBi", "proceeding": "https://proceedings.mlr.press/v235/sander24b.html", "pdf": "https://openreview.net/pdf?id=Nw7yOe8nBi", "openreview": "https://openreview.net/forum?id=Nw7yOe8nBi", "author_site": "Tom Sander, Yaodong Yu, Maziar Sanjabi, Alain Oliviero Durmus, Yi Ma, Kamalika Chaudhuri, Chuan Guo", "tldr": "", "abstract": "Differentially private (DP) machine learning is considered the gold-standard solution for training a model from sensitive data while still preserving privacy. However, a major barrier to achieving this ideal is its sub-optimal privacy-accuracy trade-off, which is particularly visible in DP representation learning. 
Specifically, it has been shown that under modest privacy budgets, most models learn representations that are not significantly better than hand-crafted features. In this work, we show that effective DP representation learning can be done via image captioning and scaling up to internet-scale multimodal datasets. Through a series of engineering tricks, we successfully train a DP image captioner (DP-Cap) on a 233M subset of LAION-2B from scratch using a reasonable amount of computation, and obtain image features of unprecedented quality that can be used in a variety of downstream vision and vision-language tasks. For example, under a privacy budget of $\\varepsilon=8$ for the LAION dataset, a linear classifier trained on top of learned DP-Cap features attains $65.8\\%$ accuracy on ImageNet-1K, considerably improving the previous SOTA of $56.5\\%$. Our work challenges the prevailing sentiment that high-utility DP representation learning cannot be achieved by training from scratch.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tom Sander;Yaodong Yu;Maziar Sanjabi;Alain Oliviero Durmus;Yi Ma;Kamalika Chaudhuri;Chuan Guo", "authorids": "~Tom_Sander1;~Yaodong_Yu4;~Maziar_Sanjabi1;~Alain_Oliviero_Durmus1;~Yi_Ma4;~Kamalika_Chaudhuri1;~Chuan_Guo1", "gender": "M;M;M;M;F;M;M", "homepage": ";https://yaodongyu.github.io;https://sites.google.com/view/maziar;http://people.eecs.berkeley.edu/~yima/;http://cseweb.ucsd.edu/users/kamalika;https://sites.google.com/view/chuanguo;", "dblp": ";;21/8577;;56/6435;;01/11275", "google_scholar": ";bZ9oyW8AAAAJ;bc_N2-oAAAAJ;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ;I-DJ7EsAAAAJ;0gp5M-kAAAAJ;", "orcid": ";;;;;;", "linkedin": "tomsdr;;;;;;", "or_profile": "~Tom_Sander1;~Yaodong_Yu4;~Maziar_Sanjabi1;~Yi_Ma4;~Kamalika_Chaudhuri1;~Chuan_Guo1;~Alain_Durmus1", "aff": "\u00c9cole Polytechnique;Electrical Engineering & Computer Science Department, University of California Berkeley;Meta;University of California, Berkeley;University of California, San Diego;Meta;\u00c9cole Polytechnique", "aff_domain": "polytechnique.fr;eecs.berkeley.edu;meta.com;berkeley.edu;ucsd.edu;meta.com;polytechnique.fr", "position": "PhD student;PhD student;Researcher;Full Professor;Associate Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nsander2024differentially,\ntitle={Differentially Private Representation Learning via Image Captioning},\nauthor={Tom Sander and Yaodong Yu and Maziar Sanjabi and Alain Oliviero Durmus and Yi Ma and Kamalika Chaudhuri and Chuan Guo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Nw7yOe8nBi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5454400, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12684934702415887309&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "polytechnique.fr;eecs.berkeley.edu;meta.com;berkeley.edu;ucsd.edu;meta.com;polytechnique.fr", "author_num": 7, "aff_unique_index": "0;1;2;1;3;2;0", "aff_unique_norm": "Ecole Polytechnique;University of California, Berkeley;Meta;University of California, San Diego", "aff_unique_dep": ";Electrical Engineering & Computer Science Department;Meta Platforms, Inc.;", "aff_unique_url": "https://www.polytechnique.edu;https://www.berkeley.edu;https://meta.com;https://www.ucsd.edu", "aff_unique_abbr": "X;UC Berkeley;Meta;UCSD", 
"aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Berkeley;San Diego", "aff_country_unique_index": "0;1;1;1;1;1;0", "aff_country_unique": "France;United States" }, { "title": "Dynamic Byzantine-Robust Learning: Adapting to Switching Byzantine Workers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34184", "id": "NwYsuFuelg", "proceeding": "https://proceedings.mlr.press/v235/dorfman24a.html", "pdf": "https://openreview.net/pdf?id=NwYsuFuelg", "openreview": "https://openreview.net/forum?id=NwYsuFuelg", "author_site": "Ron Dorfman, Naseem Yehya, Kfir Levy", "tldr": "", "abstract": "Byzantine-robust learning has emerged as a prominent fault-tolerant distributed machine learning framework. However, most techniques focus on the *static* setting, wherein the identity of Byzantine workers remains unchanged throughout the learning process. This assumption fails to capture real-world *dynamic* Byzantine behaviors, which may include intermittent malfunctions or targeted, time-limited attacks. Addressing this limitation, we propose DynaBRO -- a new method capable of withstanding any sub-linear number of identity changes across rounds. Specifically, when the number of such changes is $\\mathcal{O}(\\sqrt{T})$ (where $T$ is the total number of training rounds), DynaBRO nearly matches the state-of-the-art asymptotic convergence rate of the static setting. Our method utilizes a multi-level Monte Carlo (MLMC) gradient estimation technique applied at the server to robustly aggregated worker updates. By additionally leveraging an adaptive learning rate, we circumvent the need for prior knowledge of the fraction of Byzantine workers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ron Dorfman;Naseem Amin Yehya;Kfir Yehuda Levy", "authorids": "~Ron_Dorfman2;~Naseem_Amin_Yehya1;~Kfir_Yehuda_Levy1", "gender": "M;M;M", "homepage": ";https://sites.google.com/view/naseem-yehya/about;http://kfiryehud.wixsite.com/kfir-y-levy", "dblp": "271/8319;;83/11388", "google_scholar": "baGUoIEAAAAJ;;", "orcid": ";;", "linkedin": "ron-dorfman-756b9a13a/;;", "or_profile": "~Ron_Dorfman2;~Naseem_Amin_Yehya1;~Kfir_Yehuda_Levy1", "aff": "Technion, Technion;;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;;technion.ac.il", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\ndorfman2024dynamic,\ntitle={Dynamic Byzantine-Robust Learning: Adapting to Switching Byzantine Workers},\nauthor={Ron Dorfman and Naseem Amin Yehya and Kfir Yehuda Levy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NwYsuFuelg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1021686, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6558978237993357186&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "technion.ac.il;;technion.ac.il", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "An Empirical Examination of Balancing Strategy for Counterfactual Estimation on Time Series", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34183", "id": "Nxz3CDtGXp", "proceeding": "https://proceedings.mlr.press/v235/huang24r.html", "pdf": "https://openreview.net/pdf?id=Nxz3CDtGXp", "openreview": "https://openreview.net/forum?id=Nxz3CDtGXp", "author_site": "Qiang Huang, Chuizheng Meng, Defu Cao, Biwei Huang, Yi Chang, Yan Liu", "tldr": "", "abstract": "Counterfactual estimation from observations represents a critical endeavor in numerous application fields, such as healthcare and finance, with the primary challenge being the mitigation of treatment bias. The balancing strategy aimed at reducing covariate disparities between different treatment groups serves as a universal solution. However, when it comes to the time series data, the effectiveness of balancing strategies remains an open question, with a thorough analysis of the robustness and applicability of balancing strategies still lacking. This paper revisits counterfactual estimation in the temporal setting and provides a brief overview of recent advancements in balancing strategies. More importantly, we conduct a critical empirical examination for the effectiveness of the balancing strategies within the realm of temporal counterfactual estimation in various settings on multiple datasets. Our findings could be of significant interest to researchers and practitioners and call for a reexamination of the balancing strategy in time series settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiang Huang;Chuizheng Meng;Defu Cao;Biwei Huang;Yi Chang;Yan Liu", "authorids": "~Qiang_Huang4;~Chuizheng_Meng1;~Defu_Cao1;~Biwei_Huang1;~Yi_Chang4;~Yan_Liu1", "gender": "M;M;M;F;M;F", "homepage": "https://15754311016.github.io/qianghuang.github.io/;;https://idevede.github.io/;;http://www.yichang-cs.com;http://www-bcf.usc.edu/~liu32/", "dblp": "80/2732-1.html;207/8096.html;274/1535;165/3288;02/5438.html;150/4295", "google_scholar": "https://scholar.google.com/citations?hl=en;nzkOdekAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?user=drEkR50AAAAJ;UUKLPMYAAAAJ", "orcid": "0000-0003-0046-0923;;0000-0003-0240-3818;;0000-0003-2697-8093;0000-0002-7055-9518", "linkedin": ";;;;;", "or_profile": "~Qiang_Huang4;~Chuizheng_Meng1;~Defu_Cao1;~Biwei_Huang1;~Yi_Chang4;~Yan_Liu1", "aff": "Jilin University, China;University of Southern California;University of Southern California;University of California, San Diego;Jilin University, China;University of Southern California", "aff_domain": "jlu.edu.cn;usc.edu;usc.edu;ucsd.edu;jlu.edu.cn;usc.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Professor", "bibtex": "@inproceedings{\nhuang2024an,\ntitle={An Empirical Examination of Balancing Strategy for Counterfactual Estimation on Time Series},\nauthor={Qiang Huang and Chuizheng Meng and Defu Cao and Biwei Huang and Yi Chang and Yan Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Nxz3CDtGXp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1748605, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14054895992938310755&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "jlu.edu.cn;usc.edu;usc.edu;ucsd.edu;jlu.edu.cn;usc.edu", "author_num": 6, "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "Jilin University;University of Southern 
California;University of California, San Diego", "aff_unique_dep": ";;", "aff_unique_url": "http://www.jlu.edu.cn;https://www.usc.edu;https://www.ucsd.edu", "aff_unique_abbr": "JLU;USC;UCSD", "aff_campus_unique_index": "1;1;2;1", "aff_campus_unique": ";Los Angeles;San Diego", "aff_country_unique_index": "0;1;1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Automated Loss function Search for Class-imbalanced Node Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34182", "id": "O1hmwi51pp", "proceeding": "https://proceedings.mlr.press/v235/guo24h.html", "pdf": "https://openreview.net/pdf?id=O1hmwi51pp", "openreview": "https://openreview.net/forum?id=O1hmwi51pp", "author_site": "Xinyu Guo, KAI WU, Xiaoyu Zhang, Jing Liu", "tldr": "", "abstract": "Class-imbalanced node classification tasks are prevalent in real-world scenarios. Due to the uneven distribution of nodes across different classes, learning high-quality node representations remains a challenging endeavor. The engineering of loss functions has shown promising potential in addressing this issue. It involves the meticulous design of loss functions, utilizing information about the quantities of nodes in different categories and the network's topology to learn unbiased node representations. However, the design of these loss functions heavily relies on human expert knowledge and exhibits limited adaptability to specific target tasks. In this paper, we introduce a high-performance, flexible, and generalizable automated loss function search framework to tackle this challenge. Across 15 combinations of graph neural networks and datasets, our framework achieves a significant improvement in performance compared to state-of-the-art methods. 
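To make the loss-search idea concrete, here is a toy sketch in the spirit of the abstract above (Guo et al.): grid-search a parameterized loss family and keep the parameters with the best balanced accuracy. The focal-style loss family, the linear model standing in for a GNN, and the synthetic imbalanced data are all our assumptions; the paper's framework searches a far richer space.

```python
# Toy sketch of loss-function search for imbalanced classification:
# grid-search a parameterized (focal-style) loss family and keep the
# parameters with the best balanced accuracy. Everything here is an
# illustrative assumption, not the paper's search framework.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
n_maj, n_min, d = 900, 60, 16                  # heavily imbalanced two-class data
X = torch.cat([torch.randn(n_maj, d), torch.randn(n_min, d) + 1.0])
y = torch.cat([torch.zeros(n_maj, dtype=torch.long), torch.ones(n_min, dtype=torch.long)])

def candidate_loss(logits, target, gamma, alpha):
    """Cross-entropy reweighted by (1 - p_t)^gamma and a class weight alpha."""
    logp = F.log_softmax(logits, dim=1).gather(1, target[:, None]).squeeze(1)
    w = torch.where(target == 1, torch.full_like(logp, alpha), torch.full_like(logp, 1 - alpha))
    return (-w * (1 - logp.exp()) ** gamma * logp).mean()

def train_and_eval(gamma, alpha):
    model = torch.nn.Linear(d, 2)              # a linear stand-in for a GNN
    opt = torch.optim.Adam(model.parameters(), lr=1e-2)
    for _ in range(300):
        opt.zero_grad()
        candidate_loss(model(X), y, gamma, alpha).backward()
        opt.step()
    pred = model(X).argmax(dim=1)
    per_class = [(pred[y == c] == c).float().mean() for c in (0, 1)]
    return (sum(per_class) / 2).item()         # balanced accuracy (on train, for brevity)

best = max(((g, a) for g in (0.0, 1.0, 2.0) for a in (0.5, 0.75, 0.9)),
           key=lambda p: train_and_eval(*p))
print("selected (gamma, alpha):", best)
```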
Additionally, we observe that homophily in graph-structured data significantly contributes to the transferability of the proposed framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyu Guo;Kai Wu;Xiaoyu Zhang;Jing Liu", "authorids": "~Xinyu_Guo2;~Kai_Wu3;~Xiaoyu_Zhang6;~Jing_Liu20", "gender": "M;;F;F", "homepage": "https://evoigroup.netlify.app/;;https://scholar.google.com.hk/citations?user=XtfE1f0AAAAJ&hl=zh-CN;https://faculty.xidian.edu.cn/LJ22/zh_CN/index.htm", "dblp": ";;12/5927-10.html;72/2590-6", "google_scholar": ";;https://scholar.google.com.hk/citations?user=XtfE1f0AAAAJ;kqRxf3MAAAAJ", "orcid": ";;;0000-0002-6834-5350", "linkedin": ";;;", "or_profile": "~Xinyu_Guo2;~Kai_Wu3;~Xiaoyu_Zhang6;~Jing_Liu20", "aff": "Xidian University;;Xidian University;Xidian University, China", "aff_domain": "xidian.edu.cn;;xidian.edu.cn;mail.xidian.edu.cn", "position": "MS student;;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nguo2024automated,\ntitle={Automated Loss function Search for Class-imbalanced Node Classification},\nauthor={Xinyu Guo and Kai Wu and Xiaoyu Zhang and Jing Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O1hmwi51pp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1695881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8534111097050687808&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "xidian.edu.cn;;xidian.edu.cn;mail.xidian.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Xidian University", "aff_unique_dep": "", "aff_unique_url": "http://www.xidian.edu.cn/", "aff_unique_abbr": "Xidian", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Exploring Correlations of Self-Supervised Tasks for Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34181", "id": "O3CFN1VIwt", "proceeding": "https://proceedings.mlr.press/v235/fang24b.html", "pdf": "https://openreview.net/pdf?id=O3CFN1VIwt", "openreview": "https://openreview.net/forum?id=O3CFN1VIwt", "author_site": "Taoran Fang, Wei Chow, Yifei Sun, Kaiqiao Han, Lvbin Ma, Yang Yang", "tldr": "", "abstract": "Graph self-supervised learning has sparked a research surge in training informative representations without accessing any labeled data. However, our understanding of graph self-supervised learning remains limited, and the inherent relationships between various self-supervised tasks are still unexplored. Our paper aims to provide a fresh understanding of graph self-supervised learning based on task correlations. Specifically, we evaluate the performance of the representations trained by one specific task on other tasks and define correlation values to quantify task correlations. Through this process, we unveil the task correlations between various self-supervised tasks and can measure their expressive capabilities, which are closely related to downstream performance. By analyzing the correlation values between tasks across various datasets, we reveal the complexity of task correlations and the limitations of existing multi-task learning methods. 
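The correlation values mentioned in the GraphTCM abstract can be pictured as a matrix of cross-task evaluations. The sketch below uses made-up numbers and one plausible normalization; the paper defines its own correlation measure.

```python
# Sketch: quantifying correlations between self-supervised tasks from
# cross-task evaluations. perf[i, j] = downstream score on task j of a
# representation trained with task i (numbers are made up). The column
# normalization is one plausible choice, not the paper's definition.
import numpy as np

tasks = ["attr-mask", "edge-pred", "contrastive", "dgi"]
perf = np.array([                      # rows: training task, cols: evaluation task
    [0.92, 0.61, 0.55, 0.58],
    [0.57, 0.90, 0.62, 0.60],
    [0.60, 0.64, 0.94, 0.71],
    [0.59, 0.62, 0.73, 0.91],
])

# Normalize each column by the best achievable score on that task, so
# corr[i, j] ~ how much of task j's headroom task i's representation captures.
corr = perf / perf.max(axis=0, keepdims=True)

for i, t in enumerate(tasks):
    print(t, np.round(corr[i], 2))
```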
To obtain more capable representations, we propose Graph Task Correlation Modeling (GraphTCM) to illustrate the task correlations and utilize it to enhance graph self-supervised training. The experimental results indicate that our method significantly outperforms existing methods across various downstream tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taoran Fang;Wei Chow;Yifei Sun;Kaiqiao Han;Lvbin Ma;Yang Yang", "authorids": "~Taoran_Fang2;~Wei_Chow1;~Yifei_Sun1;~Kaiqiao_Han1;gmmmfly@163.com;~Yang_Yang35", "gender": "M;M;M;M;;M", "homepage": "https://www.baidu.com;http://none.com;https://sunefei.github.io/;;;http://yangy.org", "dblp": ";;27/3389-2;356/3989;;", "google_scholar": ";;9mxdFawAAAAJ;gFBnb-AAAAAJ;;", "orcid": ";;0000-0002-6814-5527;0009-0008-1389-1291;;0000-0002-5058-4417", "linkedin": ";;yifeis;kaiqiao-han-30a6a7329/;;", "or_profile": "~Taoran_Fang2;~Wei_Chow1;~Yifei_Sun1;~Kaiqiao_Han1;gmmmfly@163.com;~Yang_Yang35", "aff": "Zhejiang University;Zhejiang University;National University of Singapore;Zhejiang University;;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;comp.nus.edu.sg;zju.edu.cn;;zju.edu.cn", "position": "PhD student;Undergrad student;PhD student;Undergrad student;;Associate Professor", "bibtex": "@inproceedings{\nfang2024exploring,\ntitle={Exploring Correlations of Self-Supervised Tasks for Graphs},\nauthor={Taoran Fang and Wei Chow and Yifei Sun and Kaiqiao Han and Lvbin Ma and Yang Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O3CFN1VIwt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4440409, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9993889436384043153&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "zju.edu.cn;zju.edu.cn;comp.nus.edu.sg;zju.edu.cn;;zju.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Zhejiang University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "ZJU;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Towards Resource-friendly, Extensible and Stable Incomplete Multi-view Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34180", "id": "O45u81aby2", "proceeding": "https://proceedings.mlr.press/v235/yu24b.html", "pdf": "https://openreview.net/pdf?id=O45u81aby2", "openreview": "https://openreview.net/forum?id=O45u81aby2", "author_site": "Shengju Yu, Dong Zhibin, Siwei Wang, Xinhang Wan, Yue Liu, Weixuan Liang, Pei Zhang, Wenxuan Tu, Xinwang Liu", "tldr": "", "abstract": "Incomplete multi-view clustering (IMVC) methods typically encounter three drawbacks: (1) intense time and/or space overheads; (2) intractable hyper-parameters; (3) non-zero variance results. With these concerns in mind, we give a simple yet effective IMVC scheme, termed as ToRES. Concretely, instead of self-expression affinity, we manage to construct prototype-sample affinity for incomplete data so as to decrease the memory requirements. 
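The memory argument in the ToRES abstract, replacing an n-by-n self-expression affinity with an n-by-k prototype-sample affinity, can be sketched directly. The Gaussian kernel and k-means prototypes below are our stand-ins, not the paper's actual construction.

```python
# Sketch: prototype-sample affinity vs. self-expression affinity for
# incomplete multi-view data. We build an affinity between observed
# samples and k prototypes instead of an n x n self-expression matrix.
# Kernel and prototypes are illustrative stand-ins.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
n, d, k = 20_000, 64, 10
X = rng.normal(size=(n, d))
observed = rng.random(n) > 0.3                 # this view is missing for ~30% of samples

protos = KMeans(n_clusters=k, n_init=4, random_state=0).fit(X[observed]).cluster_centers_

d2 = ((X[observed][:, None, :] - protos[None, :, :]) ** 2).sum(axis=-1)
affinity = np.exp(-d2 / d2.mean())             # (n_observed, k): sample-to-prototype

print("self-expression entries :", n * n)                 # 400,000,000
print("prototype-sample entries:", observed.sum() * k)    # ~140,000
```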
To eliminate hyper-parameters, besides mining complementary features among views by view-wise prototypes, we also attempt to devise cross-view prototypes to capture consensus features for jointly forming a high-quality clustering representation. To avoid variance in the results, we unify representation learning and the clustering operation, and directly optimize the discrete cluster indicators from incomplete data. Then, for the resulting objective function, we provide two equivalent solutions from perspectives of feasible region partitioning and objective transformation. Extensive results suggest that ToRES exhibits advantages over 20 SOTA algorithms, even in scenarios with a higher ratio of incomplete data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengju Yu;Zhibin Dong;Siwei Wang;Xinhang Wan;Yue Liu;Weixuan Liang;Pei Zhang;Wenxuan Tu;Xinwang Liu", "authorids": "~Shengju_Yu1;~Zhibin_Dong1;~Siwei_Wang4;~Xinhang_Wan1;~Yue_Liu10;~Weixuan_Liang1;~Pei_Zhang9;~Wenxuan_Tu1;~Xinwang_Liu1", "gender": ";M;M;M;M;M;F;;M", "homepage": ";https://dzboop.GitHub.io;https://wangsiwei2010.github.io/;https://wanxinhang.github.io/;https://yueliu1999.github.io/;;;;https://xinwangliu.github.io/", "dblp": ";227/6683;51/8279-1;331/1513;74/1932-8;274/1152;78/5323-8;;45/6569-2.html", "google_scholar": ";;5o9hK3EAAAAJ;4CxuLpsAAAAJ;5tfpu3MAAAAJ;https://scholar.google.com/citations?hl=en;Vh1eFgoAAAAJ;;A56vWC4AAAAJ", "orcid": ";0000-0001-7829-4924;0000-0001-9517-262X;0000-0001-8749-2869;;0000-0002-1868-5445;;;", "linkedin": ";;;;;;;;", "or_profile": "~Shengju_Yu1;~Zhibin_Dong1;~Siwei_Wang4;~Xinhang_Wan1;~Yue_Liu10;~Weixuan_Liang1;~Pei_Zhang9;~Wenxuan_Tu1;~Xinwang_Liu1", "aff": ";National University of Defense Technology;Intelligent Game and Decision Lab;National University of Defense Technology;University of Illinois, Urbana Champaign;National University of Defense Technology;National University of Defense Technology;;National University of Defense Technology", "aff_domain": ";nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;uiuc.edu;nudt.edu.cn;nudt.edu.cn;;nudt.edu.cn", "position": ";PhD student;Assistant Professor;PhD student;Intern;PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nyu2024towards,\ntitle={Towards Resource-friendly, Extensible and Stable Incomplete Multi-view Clustering},\nauthor={Shengju Yu and Zhibin Dong and Siwei Wang and Xinhang Wan and Yue Liu and Weixuan Liang and Pei Zhang and Wenxuan Tu and Xinwang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O45u81aby2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1153348, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11721857635249693948&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": ";nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;uiuc.edu;nudt.edu.cn;nudt.edu.cn;;nudt.edu.cn", "author_num": 9, "aff_unique_index": "0;1;0;2;0;0;0", "aff_unique_norm": "National University of Defense Technology;Intelligent Game and Decision Lab;University of Illinois Urbana-Champaign", "aff_unique_dep": ";Intelligent Game and Decision Lab;", "aff_unique_url": "http://www.nudt.edu.cn/;;https://illinois.edu", "aff_unique_abbr": "NUDT;;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;2;0;0;0", "aff_country_unique": "China;;United States" }, { "title": "Self-Play 
Fine-Tuning Converts Weak Language Models to Strong Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34179", "id": "O4cHTxW9BS", "proceeding": "https://proceedings.mlr.press/v235/chen24j.html", "pdf": "https://openreview.net/pdf?id=O4cHTxW9BS", "openreview": "https://openreview.net/forum?id=O4cHTxW9BS", "author_site": "Zixiang Chen, Yihe Deng, Huizhuo Yuan, Kaixuan Ji, Quanquan Gu", "tldr": "", "abstract": "Harnessing the power of human-annotated data through Supervised Fine-Tuning (SFT) is pivotal for advancing Large Language Models (LLMs). In this paper, we delve into the prospect of growing a strong LLM out of a weak one without the need for acquiring additional human-annotated data. We propose a new fine-tuning method called Self-Play fIne-tuNing (SPIN), which starts from a supervised fine-tuned model. At the heart of SPIN lies a self-play mechanism, where the LLM refines its capability by playing against instances of itself. More specifically, the LLM generates its own training data from its previous iterations, refining its policy by discerning these self-generated responses from those obtained from human-annotated data. Our method progressively elevates the LLM from a nascent model to a formidable one, unlocking the full potential of human-annotated demonstration data for SFT. Theoretically, we prove that the global optimum to the training objective function of our method is achieved only when the LLM policy aligns with the target data distribution. Empirically, we evaluate our method on several benchmark datasets including the HuggingFace Open LLM Leaderboard, MT-Bench, and datasets from Big-Bench. Our results show that SPIN can significantly improve the LLM's performance across a variety of benchmarks and even outperform models trained through direct preference optimization (DPO) supplemented with extra GPT-4 preference data. 
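Read from the abstract, one SPIN round trains the current model to assign a higher reference-relative likelihood to human responses than to responses generated by its previous iterate, via a pairwise logistic loss on log-likelihood ratios. The sketch below is schematic: the exact objective may differ from the paper's, and the log-probability tensors are placeholders you would compute with your own model.

```python
# Schematic of a SPIN-style self-play objective, written from the
# abstract: the current model is pushed to prefer human responses over
# responses generated by its previous iterate, measured relative to
# that previous iterate. Loss form and values are illustrative.
import torch
import torch.nn.functional as F

def spin_loss(logp_human, logp_self, ref_logp_human, ref_logp_self, lam=0.1):
    """Pairwise logistic loss on log-likelihood ratios vs. the previous iterate."""
    margin = (logp_human - ref_logp_human) - (logp_self - ref_logp_self)
    return -F.logsigmoid(lam * margin).mean()

# One self-play round, schematically:
#   1. y_self ~ previous_model.generate(x)          (the opponent is the old self)
#   2. compute sequence log-probs under current and previous model
#   3. minimize spin_loss, then promote the current model to "previous"
logp_human = torch.tensor([-12.3, -9.8])            # placeholder sequence log-probs
logp_self = torch.tensor([-11.1, -10.5])
loss = spin_loss(logp_human, logp_self,
                 torch.tensor([-12.0, -10.0]), torch.tensor([-10.9, -10.2]))
print(loss.item())
```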
This sheds light on the promise of self-play, enabling the achievement of human-level performance in LLMs without the need for expert opponents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixiang Chen;Yihe Deng;Huizhuo Yuan;Kaixuan Ji;Quanquan Gu", "authorids": "~Zixiang_Chen1;~Yihe_Deng1;~Huizhuo_Yuan1;~Kaixuan_Ji2;~Quanquan_Gu1", "gender": "M;F;;Not Specified;M", "homepage": "https://sites.google.com/view/zxchen;;;https://github.com/jkx19;http://web.cs.ucla.edu/~qgu/", "dblp": "137/3624;230/8011;;252/7475;50/4597", "google_scholar": "6nrCHr0AAAAJ;7Lix1poAAAAJ;;FOoKDukAAAAJ;GU9HgNAAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Zixiang_Chen1;~Yihe_Deng1;~Huizhuo_Yuan1;~Kaixuan_Ji2;~Quanquan_Gu1", "aff": " University of California, Los Angeles;University of California, Los Angeles;;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "cs.ucla.edu;ucla.edu;;ucla.edu;cs.ucla.edu", "position": "PhD student;PhD student;;PhD student;Associate Professor", "bibtex": "@inproceedings{\nchen2024selfplay,\ntitle={Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models},\nauthor={Zixiang Chen and Yihe Deng and Huizhuo Yuan and Kaixuan Ji and Quanquan Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O4cHTxW9BS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1270411, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 548, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5894388903338122881&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "email": "cs.ucla.edu;ucla.edu;;ucla.edu;cs.ucla.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Enhancing Class-Imbalanced Learning with Pre-Trained Guidance through Class-Conditional Knowledge Distillation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34178", "id": "O4nXWHPl6g", "proceeding": "https://proceedings.mlr.press/v235/li24ao.html", "pdf": "https://openreview.net/pdf?id=O4nXWHPl6g", "openreview": "https://openreview.net/forum?id=O4nXWHPl6g", "author_site": "Lan Li, Xin-Chun Li, Han-Jia Ye, De-Chuan Zhan", "tldr": "", "abstract": "In class-imbalanced learning, the scarcity of information about minority classes presents challenges in obtaining generalizable features for these classes. Leveraging large-scale pre-trained models with powerful generalization capabilities as teacher models can help fill this information gap. Traditional knowledge distillation transfers the label distribution $p(\\boldsymbol{y}|\\boldsymbol{x})$ predicted by the teacher model to the student model. However, this method falls short on imbalanced data as it fails to capture the class-conditional probability distribution $p(\\boldsymbol{x}|\\boldsymbol{y})$ from the teacher model, which is crucial for enhancing generalization. To overcome this, we propose Class-Conditional Knowledge Distillation (CCKD), a novel approach that enables learning of the teacher model\u2019s class-conditional probability distribution during the distillation process. 
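For contrast with CCKD, here is the standard knowledge distillation loss that transfers only the label distribution p(y|x), which the abstract argues falls short on imbalanced data. CCKD's class-conditional extension capturing p(x|y) is not reproduced here; this is only the textbook baseline it builds on, with the usual temperature T.

```python
# Sketch: standard knowledge distillation, i.e. the p(y|x) transfer the
# CCKD abstract identifies as insufficient under class imbalance. The
# temperature and mixing weight are conventional defaults.
import torch
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, target, T=4.0, beta=0.5):
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * T * T                                   # T^2 keeps gradient scale comparable
    hard = F.cross_entropy(student_logits, target)
    return beta * soft + (1 - beta) * hard

s = torch.randn(8, 10, requires_grad=True)      # toy student / teacher logits
t = torch.randn(8, 10)
y = torch.randint(0, 10, (8,))
kd_loss(s, t, y).backward()
```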
Additionally, we introduce Augmented CCKD (ACCKD), which involves distillation on a constructed class-balanced dataset (formed through data mixing) and feature imitation on the entire dataset to further facilitate the learning of features. Experimental results on various imbalanced datasets demonstrate an average accuracy improvement of 7.4% using our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lan Li;Xin-Chun Li;Han-Jia Ye;De-Chuan Zhan", "authorids": "~Lan_Li2;~Xin-Chun_Li1;~Han-Jia_Ye1;~De-Chuan_Zhan1", "gender": "M;M;M;M", "homepage": "http://www.lamda.nju.edu.cn/lil/;http://www.lamda.nju.edu.cn/yehj;http://www.lamda.nju.edu.cn/zhandc/;http://www.lamda.nju.edu.cn/lixc/", "dblp": "21/820-1;165/3014;74/498;https://dblp.uni-trier.de/pid/246/2947", "google_scholar": "https://scholar.google.com.hk/citations?user=D2vVpPUAAAAJ;mgOYhtoAAAAJ;mYJf4TcAAAAJ;7WOxRe0AAAAJ", "orcid": ";;0000-0002-3533-2078;", "linkedin": ";;;", "or_profile": "~Lan_Li2;~Han-Jia_Ye1;~De-Chuan_Zhan1;~Li_Xin-Chun1", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nli2024enhancing,\ntitle={Enhancing Class-Imbalanced Learning with Pre-Trained Guidance through Class-Conditional Knowledge Distillation},\nauthor={Lan Li and Xin-Chun Li and Han-Jia Ye and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O4nXWHPl6g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4491918, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14718396799587047378&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Provable Representation with Efficient Planning for Partially Observable Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34177", "id": "O6tenHWTUU", "proceeding": "https://proceedings.mlr.press/v235/zhang24bd.html", "pdf": "https://openreview.net/pdf?id=O6tenHWTUU", "openreview": "https://openreview.net/forum?id=O6tenHWTUU", "author_site": "Hongming Zhang, Tongzheng Ren, Chenjun Xiao, Dale Schuurmans, Bo Dai", "tldr": "", "abstract": "In most real-world reinforcement learning applications, state information is only partially observable, which breaks the Markov decision process assumption and leads to inferior performance for algorithms that conflate observations with state. Partially Observable Markov Decision Processes (POMDPs), on the other hand, provide a general framework that allows for partial observability to be accounted for in *learning, exploration and planning*, but presents significant computational and statistical challenges. To address these difficulties, we develop a representation-based perspective that leads to a coherent framework and tractable algorithmic approach for practical reinforcement learning from partial observations. 
We provide a theoretical analysis justifying the statistical efficiency of the proposed algorithm, and also empirically demonstrate that it can surpass state-of-the-art performance with partial observations across various benchmarks, advancing reliable reinforcement learning towards more practical applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongming Zhang;Tongzheng Ren;Chenjun Xiao;Dale Schuurmans;Bo Dai", "authorids": "~Hongming_Zhang3;~Tongzheng_Ren1;~Chenjun_Xiao1;~Dale_Schuurmans1;~Bo_Dai1", "gender": "M;M;;;", "homepage": "https://github.com/initial-h;https://www.cs.utexas.edu/~tzren/;https://chenjun-x.github.io/;;https://bo-dai.github.io/", "dblp": ";211/8004;178/8641;;64/2903", "google_scholar": "https://scholar.google.ca/citations?user=mwbsY3AAAAAJ;VgNDYeYAAAAJ;;;TIKl_foAAAAJ", "orcid": ";;0000-0002-5493-1500;;0009-0002-8070-574X", "linkedin": ";;;;", "or_profile": "~Hongming_Zhang3;~Tongzheng_Ren1;~Chenjun_Xiao1;~Dale_Schuurmans1;~Bo_Dai1", "aff": "University of Alberta;University of Texas, Austin;Huawei Technologies Ltd.;;Google Brain", "aff_domain": "ualberta.ca;utexas.edu;huawei.com;;google.com", "position": "PhD student;PhD student;Researcher;;Research Scientist", "bibtex": "@inproceedings{\nzhang2024provable,\ntitle={Provable Representation with Efficient Planning for Partially Observable Reinforcement Learning},\nauthor={Hongming Zhang and Tongzheng Ren and Chenjun Xiao and Dale Schuurmans and Bo Dai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O6tenHWTUU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2031972, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5187570997639685783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ualberta.ca;utexas.edu;huawei.com;;google.com", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Alberta;University of Texas at Austin;Huawei;Google", "aff_unique_dep": ";;Huawei Technologies;Google Brain", "aff_unique_url": "https://www.ualberta.ca;https://www.utexas.edu;https://www.huawei.com;https://brain.google.com", "aff_unique_abbr": "UAlberta;UT Austin;Huawei;Google Brain", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Austin;Mountain View", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "Canada;United States;China" }, { "title": "What needs to go right for an induction head? A mechanistic study of in-context learning circuits and their formation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34176", "id": "O8rrXl71D5", "proceeding": "https://proceedings.mlr.press/v235/singh24c.html", "pdf": "https://openreview.net/pdf?id=O8rrXl71D5", "openreview": "https://openreview.net/forum?id=O8rrXl71D5", "author_site": "Aaditya Singh, Ted Moskovitz, Felix Hill, Stephanie Chan, Andrew Saxe", "tldr": "", "abstract": "In-context learning is a powerful emergent ability in transformer models. Prior work in mechanistic interpretability has identified a circuit element that may be critical for in-context learning \u2013 the induction head (IH), which performs a match-and-copy operation. During training of large transformers on natural language data, IHs emerge around the same time as a notable phase change in the loss. 
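The match-and-copy operation attributed to induction heads above has a simple algorithmic reading, written out here in plain Python on a toy token sequence: to predict the next token, find the most recent earlier occurrence of the current token and copy its successor.

```python
# The "match-and-copy" operation an induction head implements, written
# out as plain Python: match the current token against earlier context,
# then copy the token that followed the match.
def induction_head_predict(tokens):
    cur = tokens[-1]
    for i in range(len(tokens) - 2, -1, -1):   # scan backwards for a match
        if tokens[i] == cur:
            return tokens[i + 1]               # ...and copy its successor
    return None                                # no earlier occurrence to copy from

seq = list("abcab")                            # earlier "b" was followed by "c"
print(induction_head_predict(seq))             # -> 'c'
```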
Despite the robust evidence for IHs and this interesting coincidence with the phase change, relatively little is known about the diversity and emergence dynamics of IHs. Why is there more than one IH, and how are they dependent on each other? Why do IHs appear all of a sudden, and what are the subcircuits that enable them to emerge? We answer these questions by studying IH emergence dynamics in a controlled setting by training on synthetic data. In doing so, we develop and share a novel optogenetics-inspired causal framework for modifying activations throughout training. Using this framework, we delineate the diverse and additive nature of IHs. By \"clamping\" subsets of activations throughout training, we then identify three underlying subcircuits that interact to drive IH formation, yielding the phase change. Furthermore, these subcircuits shed light on data-dependent properties of formation, such as phase change timing, already showing the promise of this more in-depth understanding of subcircuits that need to \"go right\" for an induction head.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaditya K Singh;Ted Moskovitz;Felix Hill;Stephanie C.Y. Chan;Andrew M Saxe", "authorids": "~Aaditya_K_Singh1;~Ted_Moskovitz1;~Felix_Hill1;~Stephanie_C.Y._Chan1;~Andrew_M_Saxe1", "gender": "M;;F;M;M", "homepage": "https://tedmoskovitz.github.io/;https://fh295.github.io/;https://scychan.github.io/;https://www.saxelab.org;https://aadityasingh.github.io/", "dblp": ";116/0509;255/7866;39/6894;", "google_scholar": "pPVXrTYAAAAJ;https://scholar.google.co.uk/citations?user=4HLUnhIAAAAJ;https://scholar.google.com/citations?hl=en;h0Al1fcAAAAJ;9OPKqmMAAAAJ", "orcid": ";;;0000-0002-9831-8812;", "linkedin": ";;scychan;;", "or_profile": "~Ted_Moskovitz1;~Felix_Hill1;~Stephanie_C.Y._Chan1;~Andrew_M_Saxe1;~Aaditya_Singh1", "aff": "Gatsby Computational Neuroscience Unit;Google;Google DeepMind;University College London, University of London;University College London, University of London", "aff_domain": "gatsby.ucl.ac.uk;google.com;deepmind.com;ucl.ac.uk;ucl.ac.uk", "position": "PhD student;Researcher;Research Scientist;Full Professor;PhD student", "bibtex": "@inproceedings{\nsingh2024what,\ntitle={What needs to go right for an induction head? A mechanistic study of in-context learning circuits and their formation},\nauthor={Aaditya K Singh and Ted Moskovitz and Felix Hill and Stephanie C.Y. 
Chan and Andrew M Saxe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O8rrXl71D5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2340420, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10906725491303699815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "gatsby.ucl.ac.uk;google.com;deepmind.com;ucl.ac.uk;ucl.ac.uk", "author_num": 5, "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "University College London;Google", "aff_unique_dep": "Gatsby Computational Neuroscience Unit;Google", "aff_unique_url": "https://www.ucl.ac.uk;https://www.google.com", "aff_unique_abbr": "UCL;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "KV-Runahead: Scalable Causal LLM Inference by Parallel Key-Value Cache Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34175", "id": "OBs0AjXE3F", "proceeding": "https://proceedings.mlr.press/v235/cho24e.html", "pdf": "https://openreview.net/pdf?id=OBs0AjXE3F", "openreview": "https://openreview.net/forum?id=OBs0AjXE3F", "author_site": "Minsik Cho, Mohammad Rastegari, Devang Naik", "tldr": "", "abstract": "Large Language Model or LLM inference has two phases, the prompt (or prefill) phase to output the first token and the extension (or decoding) phase to generate subsequent tokens. In this work, we propose an efficient parallelization scheme, KV-Runahead to accelerate the prompt phase. The key observation is that the extension phase generates tokens faster than the prompt phase because of key-value cache (KV-cache). Hence, KV-Runahead parallelizes the prompt phase by orchestrating multiple processes to populate the KV-cache and minimizes the time-to-first-token (TTFT). Dual-purposing the KV-cache scheme has two main benefits. First, since KV-cache is designed to leverage the causal attention map, we minimize computation and communication automatically. Second, since it already exists for the extension phase, KV-Runahead is easy to implement. We further propose context-level load-balancing to handle uneven KV-cache generation (due to the causal attention) and to optimize TTFT. 
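A sketch of what context-level load balancing has to solve: under causal attention, token j attends to a prefix of length j+1, so equal-length chunks do unequal work during parallel KV-cache prefill. Cutting the prompt where the cumulative attention cost is equalized makes the processes finish together. The greedy partitioner below is our simplification; the paper's actual balancer may differ.

```python
# Sketch of context-level load balancing for parallel KV-cache prefill:
# per-token attention cost grows linearly with position, so we cut the
# prompt where cumulative cost (not token count) is equalized. Greedy
# splitting is an assumed simplification.
def balanced_cuts(n_tokens, n_procs):
    cost = [j + 1 for j in range(n_tokens)]          # token j attends to j+1 positions
    total = sum(cost)
    cuts, acc, target = [], 0, total / n_procs
    for j, c in enumerate(cost):
        acc += c
        if acc >= target * (len(cuts) + 1) and len(cuts) < n_procs - 1:
            cuts.append(j + 1)                       # chunk boundary after token j
    return [0] + cuts + [n_tokens]

bounds = balanced_cuts(n_tokens=1000, n_procs=4)
print(bounds)   # later chunks are shorter, since their tokens are costlier
# Process i would compute K/V for tokens [bounds[i], bounds[i+1]) and hand
# its cache to the next process, so TTFT is bounded by the slowest chunk.
```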
Compared with an existing parallelization scheme such as tensor or sequential parallelization where keys and values are locally generated and exchanged via all-gather collectives, our experimental results demonstrate that KV-Runahead can offer over 1.4\u00d7 and 1.6\u00d7 speedups for Llama 7B and Falcon 7B respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minsik Cho;Mohammad Rastegari;Devang Naik", "authorids": "~Minsik_Cho1;~Mohammad_Rastegari2;~Devang_Naik1", "gender": "M;M;M", "homepage": ";https://mrastegari.github.io/;", "dblp": ";31/5228;66/9317", "google_scholar": "_AZys7EAAAAJ;N4-2Z_cAAAAJ;wIQcv5sAAAAJ", "orcid": ";;", "linkedin": ";;https://linkedin.com/in/denaik", "or_profile": "~Minsik_Cho1;~Mohammad_Rastegari2;~Devang_Naik1", "aff": ";Department of Computer Science, University of Washington;", "aff_domain": ";cs.washington.edu;", "position": ";Assistant Professor;", "bibtex": "@inproceedings{\ncho2024kvrunahead,\ntitle={{KV}-Runahead: Scalable Causal {LLM} Inference by Parallel Key-Value Cache Generation},\nauthor={Minsik Cho and Mohammad Rastegari and Devang Naik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OBs0AjXE3F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6137886, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9201429752328346092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";cs.washington.edu;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "DNA-SE: Towards Deep Neural-Nets Assisted Semiparametric Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34174", "id": "OERwuPzHdh", "proceeding": "https://proceedings.mlr.press/v235/liu24bk.html", "pdf": "https://openreview.net/pdf?id=OERwuPzHdh", "openreview": "https://openreview.net/forum?id=OERwuPzHdh", "author_site": "Qinshuo Liu, Zixin Wang, Xi'an Li, Xinyao Ji, Lei Zhang, Lin Liu, Zhonghua Liu", "tldr": "", "abstract": "Semiparametric statistics play a pivotal role in a wide range of domains, including but not limited to missing data, causal inference, and transfer learning, to name a few. In many settings, semiparametric theory leads to (nearly) statistically optimal procedures that yet involve numerically solving Fredholm integral equations of the second kind. Traditional numerical methods, such as polynomial or spline approximations, are difficult to scale to multi-dimensional problems. Alternatively, statisticians may choose to approximate the original integral equations by ones with closed-form solutions, resulting in computationally more efficient, but statistically suboptimal or even incorrect procedures. To bridge this gap, we propose a novel framework by formulating the semiparametric estimation problem as a bi-level optimization problem; and then we propose a scalable algorithm called **D**eep **N**eural-Nets **A**ssisted **S**emiparametric **E**stimation ($\\mathsf{DNA\\mbox{-}SE}$) by leveraging the universal approximation property of Deep Neural-Nets (DNN) to streamline semiparametric procedures. 
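The numerical core the DNA-SE abstract names, a Fredholm integral equation of the second kind b(x) = f(x) + lam * \int_0^1 K(x,t) b(t) dt, can be attacked with a small neural network by minimizing the equation residual on a quadrature grid. The sketch below is a generic residual-minimization solver with an assumed kernel and a known closed-form solution; it is not the paper's bi-level algorithm.

```python
# Sketch: a small neural network solving a Fredholm integral equation of
# the second kind by minimizing the squared residual on a grid. With the
# assumed K(x,t) = x*t, f(x) = x, lam = 0.5, the exact solution is
# b(x) = 1.2x, which lets us check the fit.
import torch

torch.manual_seed(0)
lam = 0.5
xs = torch.linspace(0, 1, 101)[:, None]        # quadrature grid on [0, 1]

net = torch.nn.Sequential(torch.nn.Linear(1, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

K = xs @ xs.T                                  # K(x_i, t_j) = x_i * t_j
for _ in range(2000):
    b = net(xs)                                # b_theta evaluated on the grid
    integral = (K @ b) / len(xs)               # simple Riemann quadrature
    residual = b - (xs + lam * integral)       # equation residual on the grid
    loss = (residual ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()

print(net(torch.tensor([[0.5]])).item(), "vs exact", 0.6)
```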
Through extensive numerical experiments and a real data analysis, we demonstrate the numerical and statistical advantages of $\\mathsf{DNA\\mbox{-}SE}$ over traditional methods. To the best of our knowledge, we are the first to bring DNN into semiparametric statistics as a numerical solver of integral equations in our proposed general framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qinshuo Liu;Zixin Wang;Xi'an Li;Xinyao Ji;Lei Zhang;Lin Liu;Zhonghua Liu", "authorids": "~Qinshuo_Liu2;~Zixin_Wang2;~Xi'an_Li1;~Xinyao_Ji1;~Lei_Zhang42;~Lin_Liu7;~Zhonghua_Liu2", "gender": "M;;M;F;M;;", "homepage": "https://saasweb.hku.hk/pgs.php;;https://www.researchgate.net/profile/Xian-Li-71;;https://ins.sjtu.edu.cn/people/lzhang/home.html;https://linliu-stats.github.io/;https://sites.google.com/view/drliu/home", "dblp": ";;;;;;", "google_scholar": ";https://scholar.google.co.uk/citations?view_op=list_works;;;bsoSk6EAAAAJ;2xESgioAAAAJ;", "orcid": ";;0000-0002-1509-9328;;0000-0002-2917-9652;;", "linkedin": ";;;xinyao-ji-97698a46/;;;", "or_profile": "~Qinshuo_Liu2;~Zixin_Wang2;~Xi'an_Li1;~Xinyao_Ji1;~Lei_Zhang42;~Lin_Liu7;~Zhonghua_Liu2", "aff": "University of Hong Kong;;Shandong University;Citadel LLC;;Shanghai Jiaotong University;Columbia University", "aff_domain": "hku.hk;;sdu.edu.cn;citadel.com;;sjtu.edu.cn;columbia.edu", "position": "PhD student;;Postdoc;Researcher;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2024dnase,\ntitle={{DNA}-{SE}: Towards Deep Neural-Nets Assisted Semiparametric Estimation},\nauthor={Qinshuo Liu and Zixin Wang and Xi'an Li and Xinyao Ji and Lei Zhang and Lin Liu and Zhonghua Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OERwuPzHdh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1899514, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ywZQa8cG6dgJ:scholar.google.com/&scioq=DNA-SE:+Towards+Deep+Neural-Nets+Assisted+Semiparametric+Estimation&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "hku.hk;;sdu.edu.cn;citadel.com;;sjtu.edu.cn;columbia.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Hong Kong;Shandong University;Citadel LLC;Shanghai Jiao Tong University;Columbia University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.hku.hk;http://www.sdu.edu.cn;https://www.citadel.com;https://www.sjtu.edu.cn;https://www.columbia.edu", "aff_unique_abbr": "HKU;SDU;Citadel;SJTU;Columbia", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Q-Star Meets Scalable Posterior Sampling: Bridging Theory and Practice via HyperAgent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34173", "id": "OF7e0w1uon", "proceeding": "https://proceedings.mlr.press/v235/li24by.html", "pdf": "https://openreview.net/pdf?id=OF7e0w1uon", "openreview": "https://openreview.net/forum?id=OF7e0w1uon", "author_site": "Yingru Li, Jiawei Xu, Lei Han, Zhi-Quan Luo", "tldr": "", "abstract": "We propose HyperAgent, a reinforcement learning (RL) algorithm based on the hypermodel framework for exploration in RL. 
HyperAgent allows for the efficient incremental approximation of posteriors associated with an optimal action-value function ($Q^\\star$) without the need for conjugacy and follows the greedy policies w.r.t. these approximate posterior samples. We demonstrate that HyperAgent offers robust performance in large-scale deep RL benchmarks. It can solve Deep Sea hard exploration problems with episodes that optimally scale with problem size and exhibits significant efficiency gains in the Atari suite. Implementing HyperAgent requires minimal code addition to well-established deep RL frameworks like DQN. We theoretically prove that, under tabular assumptions, HyperAgent achieves logarithmic per-step computational complexity while attaining sublinear regret, matching the best known randomized tabular RL algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yingru Li;Jiawei Xu;Lei Han;Zhi-Quan Luo", "authorids": "~Yingru_Li1;~Jiawei_Xu1;~Lei_Han1;~Zhi-Quan_Luo1", "gender": "M;M;M;M", "homepage": "https://richardli.xyz;https://github.com/jiawei415;https://www.leihan.org;", "dblp": "156/7684;;75/2307-1;", "google_scholar": "OOhB7fcAAAAJ;;Tz4_zi8AAAAJ;dW3gcXoAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yingru_Li1;~Jiawei_Xu1;~Lei_Han1;~Zhi-Quan_Luo1", "aff": "The Chinese University of Hong Kong, Shenzhen, China;CUHK(SZ);Tencent Robotics X;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;link.cuhk.edu.cn;tencent.com;cuhk.edu.cn", "position": "PhD student;PhD student;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nli2024qstar,\ntitle={Q-Star Meets Scalable Posterior Sampling: Bridging Theory and Practice via HyperAgent},\nauthor={Yingru Li and Jiawei Xu and Lei Han and Zhi-Quan Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OF7e0w1uon}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3248152, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1108345384570524826&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cuhk.edu.cn;link.cuhk.edu.cn;tencent.com;cuhk.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Chinese University of Hong Kong;Chinese University of Hong Kong, Shenzhen;Tencent", "aff_unique_dep": ";;Tencent Robotics X", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.cuhk.edu.cn/sz;https://www.tencent.com", "aff_unique_abbr": "CUHK;CUHK(SZ);Tencent Robotics X", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Diffusion-based Missing-view Generation With the Application on Incomplete Multi-view Clustering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34172", "id": "OHFxcU9jwW", "proceeding": "https://proceedings.mlr.press/v235/wen24c.html", "pdf": "https://openreview.net/pdf?id=OHFxcU9jwW", "openreview": "https://openreview.net/forum?id=OHFxcU9jwW", "author_site": "Jie Wen, Shijie Deng, Waikeung Wong, Guoqing Chao, Chao Huang, Lunke Fei, Yong Xu", "tldr": "", "abstract": "As a branch of clustering, multi-view clustering has received much attention in recent years. 
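Returning to the HyperAgent abstract above, the hypermodel idea can be sketched concretely: a network maps a random index z to Q-values, so drawing z ~ N(0, I) yields an approximate posterior sample of Q*, and the agent acts greedily under that sample. The architecture and sizes below are illustrative assumptions, not the paper's.

```python
# Sketch of a hypermodel for posterior sampling: a mean Q-table plus an
# index-conditioned perturbation. Each sampled index z gives one
# approximate posterior sample of Q*, acted on greedily. Sizes and
# architecture are illustrative only.
import torch

n_states, n_actions, index_dim = 10, 4, 8

class HyperQ(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mu = torch.nn.Parameter(torch.zeros(n_states, n_actions))
        self.A = torch.nn.Parameter(0.1 * torch.randn(n_states, n_actions, index_dim))

    def forward(self, z):
        return self.mu + self.A @ z            # Q_z(s, a): one posterior sample per z

hyper_q = HyperQ()
z = torch.randn(index_dim)                     # resample once per episode
q_sample = hyper_q(z)                          # (n_states, n_actions)
state = 3
action = int(q_sample[state].argmax())         # greedy w.r.t. the sampled Q
print("sampled action:", action)
# Training (not shown) would regress TD targets under the same z, keeping
# the z-ensemble diverse so that index sampling drives deep exploration.
```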
In practical applications, a common phenomenon is that partial views of some samples may be missing in the collected multi-view data, which poses a severe challenge to design the multi-view learning model and explore complementary and consistent information. Currently, most of the incomplete multi-view clustering methods only focus on exploring the information of available views while few works study the missing view recovery for incomplete multi-view learning. To this end, we propose an innovative diffusion-based missing view generation (DMVG) network. Moreover, for the scenarios with high missing rates, we further propose an incomplete multi-view data augmentation strategy to enhance the recovery quality for the missing views. Extensive experimental results show that the proposed DMVG can not only accurately predict missing views, but also further enhance the subsequent clustering performance in comparison with several state-of-the-art incomplete multi-view clustering methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie Wen;Shijie Deng;Waikeung Wong;Guoqing Chao;Chao Huang;Lunke Fei;Yong Xu", "authorids": "~Jie_Wen1;~Shijie_Deng1;~Waikeung_Wong1;~Guoqing_Chao1;~Chao_Huang16;~Lunke_Fei1;~Yong_Xu9", "gender": ";M;M;M;M;M;M", "homepage": ";;;https://guoqingchao.github.io/;;;https://www.yongxu.org", "dblp": ";;122/1539.html;120/8804;;157/4083;", "google_scholar": ";;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=zOVgYQYAAAAJ", "orcid": ";0000-0001-9573-7352;0000-0002-5214-7114;0000-0002-2410-650X;;;", "linkedin": ";;;;;;", "or_profile": "~Jie_Wen1;~Shijie_Deng1;~Waikeung_Wong1;~Guoqing_Chao1;~Chao_Huang16;~Lunke_Fei1;~Yong_Xu9", "aff": ";Harbin Institute of Technology;Hong Kong Polytechnic University;Harbin Institute of Technology;University of Macau;Guangdong University of Technology;Harbin Institute of Technology", "aff_domain": ";hit.edu.cn;polyu.edu.hk;hit.edu.cn;um.edu.mo;gdut.edu.cn;hit.edu.cn", "position": ";MS student;Full Professor;Associate Professor;Intern;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nwen2024diffusionbased,\ntitle={Diffusion-based Missing-view Generation With the Application on Incomplete Multi-view Clustering},\nauthor={Jie Wen and Shijie Deng and Waikeung Wong and Guoqing Chao and Chao Huang and Lunke Fei and Yong Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OHFxcU9jwW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9368256, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15518682679137132983&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "email": ";hit.edu.cn;polyu.edu.hk;hit.edu.cn;um.edu.mo;gdut.edu.cn;hit.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;2;3;0", "aff_unique_norm": "Harbin Institute of Technology;Hong Kong Polytechnic University;University of Macau;Guangdong University of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.polyu.edu.hk;https://www.um.edu.mo;http://www.gdut.edu.cn", "aff_unique_abbr": "HIT;PolyU;UM;GDUT", "aff_campus_unique_index": "0;1;0;2;0", "aff_campus_unique": "Harbin;Hong Kong SAR;Macau SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "ReDiffuser: Reliable Decision-Making Using a 
Diffuser with Confidence Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34171", "id": "OI1YP53WKI", "proceeding": "https://proceedings.mlr.press/v235/he24e.html", "pdf": "https://openreview.net/pdf?id=OI1YP53WKI", "openreview": "https://openreview.net/forum?id=OI1YP53WKI", "author_site": "Nantian He, Shaohui Li, Zhi Li, Yu LIU, You He", "tldr": "", "abstract": "The diffusion model has demonstrated impressive performance in offline reinforcement learning. However, non-deterministic sampling in diffusion models can lead to unstable performance. Furthermore, the lack of confidence measurements makes it difficult to evaluate the reliability and trustworthiness of the sampled decisions. To address these issues, we present ReDiffuser, which utilizes confidence estimation to ensure reliable decision-making. We achieve this by learning a confidence function based on Random Network Distillation. The confidence function measures the reliability of sampled decisions and contributes to quantitative recognition of reliable decisions. Additionally, we integrate the confidence function into task-specific sampling procedures to realize adaptive-horizon planning and value-embedded planning. Experiments show that the proposed ReDiffuser achieves state-of-the-art performance on standard offline RL datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nantian He;Shaohui Li;Zhi Li;Yu LIU;You He", "authorids": "~Nantian_He1;~Shaohui_Li3;~Zhi_Li5;~Yu_LIU31;~You_He2", "gender": "M;M;;M;M", "homepage": "https://github.com/he-nantian;;;;", "dblp": ";;;;", "google_scholar": ";UDQR5QkAAAAJ;;;", "orcid": ";0000-0002-9650-8874;;0000-0002-5216-3181;0000-0002-6111-340X", "linkedin": ";;;;", "or_profile": "~Nantian_He1;~Shaohui_Li3;~Zhi_Li5;~Yu_LIU31;~You_He2", "aff": "Tsinghua University; Tsinghua University;;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;Postdoc;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhe2024rediffuser,\ntitle={ReDiffuser: Reliable Decision-Making Using a Diffuser with Confidence Estimation},\nauthor={Nantian He and Shaohui Li and Zhi Li and Yu LIU and You He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OI1YP53WKI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 849813, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7405826888058259703&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn;;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Error Feedback Can Accurately Compress Preconditioners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34170", "id": "OJTKlubFk1", "proceeding": "https://proceedings.mlr.press/v235/modoranu24a.html", "pdf": "https://openreview.net/pdf?id=OJTKlubFk1", "openreview": "https://openreview.net/forum?id=OJTKlubFk1", "author_site": "Ionut-Vlad Modoranu, Aleksei Kalinov, Eldar Kurtic, Elias 
Frantar, Dan Alistarh", "tldr": "", "abstract": "Leveraging second-order information about the loss at the scale of deep networks is one of the main lines of approach for improving the performance of current optimizers for deep learning. Yet, existing approaches for accurate full-matrix preconditioning, such as Full-Matrix Adagrad (GGT) or Matrix-Free Approximate Curvature (M-FAC) suffer from massive storage costs when applied even to small-scale models, as they must store a sliding window of gradients, whose memory requirements are multiplicative in the model dimension. In this paper, we address this issue via a novel and efficient error-feedback technique that can be applied to compress preconditioners by up to two orders of magnitude in practice, without loss of convergence. Specifically, our approach compresses the gradient information via sparsification or low-rank compression before it is fed into the preconditioner, feeding the compression error back into future iterations. Extensive experiments on deep neural networks show that this approach can compress full-matrix preconditioners to up to 99% sparsity without accuracy loss, effectively removing the memory overhead of full-matrix preconditioners such as GGT and M-FAC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ionut-Vlad Modoranu;Aleksei Kalinov;Eldar Kurtic;Elias Frantar;Dan Alistarh", "authorids": "~Ionut-Vlad_Modoranu1;~Aleksei_Kalinov1;~Eldar_Kurtic1;~Elias_Frantar1;~Dan_Alistarh7", "gender": "M;;M;M;M", "homepage": ";https://alekseika.com;;;http://people.csail.mit.edu/alistarh/", "dblp": "275/9983;;297/3713;259/2210;36/3251.html", "google_scholar": "N56bz4gAAAAJ;iyPrfM0AAAAJ;https://scholar.google.com/citations?hl=en;hjdlwz8AAAAJ;https://scholar.google.com.tw/citations?user=75q-6ZQAAAAJ", "orcid": ";0000-0003-2189-3904;;;", "linkedin": "ionut-vlad-modoranu/;;eldar-kurti%C4%87-77963b160/;elias-frantar-5b43181a4;", "or_profile": "~Ionut-Vlad_Modoranu1;~Aleksei_Kalinov1;~Eldar_Kurtic1;~Elias_Frantar1;~Dan_Alistarh1", "aff": "Institute of Science and Technology Austria;Institute of Science and Technology;Institute of Science and Technology Austria;Institute of Science and Technology Austria;Institute of Science and Technology", "aff_domain": "ist.ac.at;ist.ac.at;ist.ac.at;ist.ac.at;ist.ac.at", "position": "PhD student;PhD student;Researcher;PhD student;Full Professor", "bibtex": "@inproceedings{\nmodoranu2024error,\ntitle={Error Feedback Can Accurately Compress Preconditioners},\nauthor={Ionut-Vlad Modoranu and Aleksei Kalinov and Eldar Kurtic and Elias Frantar and Dan Alistarh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OJTKlubFk1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 663206, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18338425770365168027&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "ist.ac.at;ist.ac.at;ist.ac.at;ist.ac.at;ist.ac.at", "author_num": 5, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Institute of Science and Technology Austria;Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ist.ac.at;", "aff_unique_abbr": "IST Austria;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Austria;" }, { "title": "Knowledge Transfer from Vision Foundation 
Models for Efficient Training of Small Task-specific Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34169", "id": "OKYfaYQlML", "proceeding": "https://proceedings.mlr.press/v235/vemulapalli24a.html", "pdf": "https://openreview.net/pdf?id=OKYfaYQlML", "openreview": "https://openreview.net/forum?id=OKYfaYQlML", "author_site": "Raviteja Vemulapalli, Hadi Pouransari, Fartash Faghri, Sachin Mehta, Mehrdad Farajtabar, Mohammad Rastegari, Oncel Tuzel", "tldr": "", "abstract": "Vision Foundation Models (VFMs) pretrained on massive datasets exhibit impressive performance on various downstream tasks, especially with limited labeled target data. However, due to their high inference compute cost, these models cannot be deployed for many real-world applications. Motivated by this, we ask the following important question, \"How can we leverage the knowledge from a large VFM to train a small task-specific model for a new target task with limited labeled training data?\", and propose a simple task-oriented knowledge transfer approach as a highly effective solution to this problem. Our experimental results on five target tasks show that the proposed approach outperforms task-agnostic VFM distillation, web-scale CLIP pretraining, supervised ImageNet pretraining, and self-supervised DINO pretraining by up to 11.6%, 22.1%, 13.7%, and 29.8%, respectively. Furthermore, the proposed approach also demonstrates up to 9x, 4x and 15x reduction in pretraining compute cost when compared to task-agnostic VFM distillation, ImageNet pretraining and DINO pretraining, respectively, while outperforming them. We also show that the dataset used for transferring knowledge has a significant effect on the final target task performance, and introduce a retrieval-augmented knowledge transfer strategy that uses web-scale image retrieval to curate effective transfer sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Raviteja Vemulapalli;Hadi Pouransari;Fartash Faghri;Sachin Mehta;Mehrdad Farajtabar;Mohammad Rastegari;Oncel Tuzel", "authorids": "~Raviteja_Vemulapalli1;~Hadi_Pouransari1;~Fartash_Faghri1;~Sachin_Mehta1;~Mehrdad_Farajtabar1;~Mohammad_Rastegari2;~Oncel_Tuzel2", "gender": "M;M;M;M;M;M;M", "homepage": "http://ravitejav.weebly.com/;;;https://sacmehta.github.io/;https://www.cc.gatech.edu/~mfarajta/;https://mrastegari.github.io/;http://www.onceltuzel.net", "dblp": "135/4940;162/5187;115/7922;34/11140;21/9988;31/5228;73/2943.html", "google_scholar": "0OFqm7YAAAAJ;besz69AAAAAJ;https://scholar.google.ca/citations?user=KUG_tG0AAAAJ;https://scholar.google.co.in/citations?user=cnRJ0GUAAAAJ;shkKxnQAAAAJ;N4-2Z_cAAAAJ;Fe7NTe0AAAAJ", "orcid": ";;;;;;", "linkedin": "raviteja-vemulapalli-85146113?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;;fartash-faghri;;;;", "or_profile": "~Raviteja_Vemulapalli1;~Hadi_Pouransari1;~Fartash_Faghri1;~Sachin_Mehta1;~Mehrdad_Farajtabar1;~Mohammad_Rastegari2;~Oncel_Tuzel2", "aff": "Apple;Apple;Apple;Apple;Apple;Department of Computer Science, University of Washington;Apple", "aff_domain": "apple.com;apple.com;apple.com;apple.com;apple.com;cs.washington.edu;apple.com", "position": "Researcher;Principal Researcher;Researcher;Researcher;Researcher;Assistant Professor;Principal Researcher", "bibtex": "@inproceedings{\nvemulapalli2024knowledge,\ntitle={Knowledge Transfer from Vision Foundation Models for Efficient Training of Small Task-specific Models},\nauthor={Raviteja Vemulapalli and Hadi 
Pouransari and Fartash Faghri and Sachin Mehta and Mehrdad Farajtabar and Mohammad Rastegari and Oncel Tuzel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OKYfaYQlML}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8418804, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5539576296736494986&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "apple.com;apple.com;apple.com;apple.com;apple.com;cs.washington.edu;apple.com", "author_num": 7, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Apple;University of Washington", "aff_unique_dep": "Apple Inc.;Department of Computer Science", "aff_unique_url": "https://www.apple.com;https://www.washington.edu", "aff_unique_abbr": "Apple;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Exploiting Code Symmetries for Learning Program Semantics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34168", "id": "OLvgrLtv6J", "proceeding": "https://proceedings.mlr.press/v235/pei24b.html", "pdf": "https://openreview.net/pdf?id=OLvgrLtv6J", "openreview": "https://openreview.net/forum?id=OLvgrLtv6J", "author_site": "Kexin Pei, Weichen Li, Qirui Jin, Shuyang Liu, Scott Geng, Lorenzo Cavallaro, Junfeng Yang, Suman Jana", "tldr": "", "abstract": "This paper tackles the challenge of teaching code semantics to Large Language Models (LLMs) for program analysis by incorporating code symmetries into the model architecture. We introduce a group-theoretic framework that defines code symmetries as semantics-preserving transformations, where forming a code symmetry group enables precise and efficient reasoning of code semantics. Our solution, SymC, develops a novel variant of self-attention that is provably equivariant to code symmetries from the permutation group defined over the program dependence graph. SymC obtains superior performance on five program analysis tasks, outperforming state-of-the-art code models, including GPT-4, without any pre-training. 
Our results suggest that code LLMs that encode the code structural prior via the code symmetry group generalize better and faster.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kexin Pei;Weichen Li;Qirui Jin;Shuyang Liu;Scott Geng;Lorenzo Cavallaro;Junfeng Yang;Suman Jana", "authorids": "~Kexin_Pei1;~Weichen_Li1;~Qirui_Jin1;~Shuyang_Liu2;~Scott_Geng1;~Lorenzo_Cavallaro1;~Junfeng_Yang1;~Suman_Jana1", "gender": "M;;M;F;;M;M;M", "homepage": "https://sites.google.com/site/kexinpeisite/;https://weichenxli.github.io/;https://c-lister.github.io;;https://www.scottgeng.com/;https://s2lab.cs.ucl.ac.uk/people/sullivan;https://www.cs.columbia.edu/~junfeng/;http://sumanj.info", "dblp": "145/6061;;;;330/4056.html;95/5162;71/3724.html;74/28", "google_scholar": "XzSkny0AAAAJ;qxhN-ocAAAAJ;0g6YBJMAAAAJ;;jCg1gRoAAAAJ;oWT7fIYAAAAJ;JJ9AvbAAAAAJ;https://scholar.google.com.tw/citations?user=SDY9FwUAAAAJ", "orcid": "0000-0001-5052-9808;;;;;0000-0002-3878-2680;0009-0000-2277-6545;", "linkedin": "kexin-pei/;weichen-li-2b09871a7/;;shuyang-liu-3422ab271/;;lorenzocavallaro/;;", "or_profile": "~Kexin_Pei1;~Weichen_Li1;~Qirui_Jin1;~Shuyang_Liu2;~Scott_Geng1;~Lorenzo_Cavallaro1;~Junfeng_Yang1;~Suman_Jana1", "aff": "The University of Chicago;Columbia University;University of Michigan - Ann Arbor;Huazhong University of Science and Technology;University of Washington;University College London;Columbia University;, Columbia University", "aff_domain": "uchicago.edu;columbia.edu;umich.edu;hust.edu.cn;cs.washington.edu;ucl.ac.uk;columbia.edu;cs.columbia.edu", "position": "Assistant Professor;MS student;Undergrad student;Undergrad student;PhD student;Full Professor;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\npei2024exploiting,\ntitle={Exploiting Code Symmetries for Learning Program Semantics},\nauthor={Kexin Pei and Weichen Li and Qirui Jin and Shuyang Liu and Scott Geng and Lorenzo Cavallaro and Junfeng Yang and Suman Jana},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OLvgrLtv6J}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 938866, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17308434802228616897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "uchicago.edu;columbia.edu;umich.edu;hust.edu.cn;cs.washington.edu;ucl.ac.uk;columbia.edu;cs.columbia.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;4;5;1;1", "aff_unique_norm": "University of Chicago;Columbia University;University of Michigan;Huazhong University of Science and Technology;University of Washington;University College London", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.uchicago.edu;https://www.columbia.edu;https://www.umich.edu;http://www.hust.edu.cn;https://www.washington.edu;https://www.ucl.ac.uk", "aff_unique_abbr": "UChicago;Columbia;UM;HUST;UW;UCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;1;0;2;0;0", "aff_country_unique": "United States;China;United Kingdom" }, { "title": "Liouville Flow Importance Sampler", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34167", "id": "OMKNBzf6HJ", "proceeding": "https://proceedings.mlr.press/v235/tian24c.html", "pdf": "https://openreview.net/pdf?id=OMKNBzf6HJ", "openreview": "https://openreview.net/forum?id=OMKNBzf6HJ", "author_site": "Yifeng Tian, 
Nishant Panda, Yen Ting Lin", "tldr": "", "abstract": "We present the Liouville Flow Importance Sampler (LFIS), an innovative flow-based model for generating samples from unnormalized density functions. LFIS learns a time-dependent velocity field that deterministically transports samples from a simple initial distribution to a complex target distribution, guided by a prescribed path of annealed distributions. The training of LFIS utilizes a unique method that enforces the structure of a derived partial differential equation on the neural networks modeling the velocity fields. By considering the neural velocity field as an importance sampler, sample weights can be computed through accumulating errors along the sample trajectories driven by neural velocity fields, ensuring unbiased and consistent estimation of statistical quantities. We demonstrate the effectiveness of LFIS through its application to a range of benchmark problems, on many of which LFIS achieved state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifeng Tian;Nishant Panda;Yen Ting Lin", "authorids": "~Yifeng_Tian1;~Nishant_Panda1;~Yen_Ting_Lin1", "gender": "M;M;M", "homepage": "https://www.researchgate.net/profile/Yifeng-Tian-2;;", "dblp": ";146/9708;", "google_scholar": "Pk57n7YAAAAJ;zOV6TUAAAAAJ;wUhVn34AAAAJ", "orcid": ";0000-0001-9754-2794;0000-0001-6893-8423", "linkedin": ";;yen-ting-lin-95858265/", "or_profile": "~Yifeng_Tian1;~Nishant_Panda1;~Yen_Ting_Lin1", "aff": "Los Alamos National Laboratory;Los Alamos National Laboratory;Los Alamos National Laboratory", "aff_domain": "lanl.gov;lanl.gov;lanl.gov", "position": "Postdoc;Researcher;Researcher", "bibtex": "@inproceedings{\ntian2024liouville,\ntitle={Liouville Flow Importance Sampler},\nauthor={Yifeng Tian and Nishant Panda and Yen Ting Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OMKNBzf6HJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3702021, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18257582919528247755&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "lanl.gov;lanl.gov;lanl.gov", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Los Alamos National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.lanl.gov", "aff_unique_abbr": "LANL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "LongRoPE: Extending LLM Context Window Beyond 2 Million Tokens", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34166", "id": "ONOtpXLqqw", "proceeding": "https://proceedings.mlr.press/v235/ding24i.html", "pdf": "https://openreview.net/pdf?id=ONOtpXLqqw", "openreview": "https://openreview.net/forum?id=ONOtpXLqqw", "author_site": "Yiran Ding, Li Lyna Zhang, Chengruidong Zhang, Yuanyuan Xu, Ning Shang, Jiahang Xu, Fan Yang, Mao Yang", "tldr": "", "abstract": "Large context window is a desirable feature in large language models (LLMs). However, due to high fine-tuning costs, scarcity of long texts, and catastrophic values introduced by new token positions, current extended context windows are limited to around 128k tokens. 
This paper introduces LongRoPE that, for the first time, extends the context window of pre-trained LLMs to an impressive 2048k tokens, with up to only 1k fine-tuning steps at training lengths within 256k, while maintaining performance at the original short context window. This is achieved by three key innovations: (i) we identify and exploit two forms of non-uniformities in positional interpolation through an efficient search, providing a better initialization for fine-tuning and enabling an 8x extension in non-fine-tuning scenarios; (ii) we introduce a progressive extension strategy that first fine-tunes a 256k length LLM and then conducts a second positional interpolation on the fine-tuned extended LLM to achieve a 2048k context window; (iii) we readjust LongRoPE on 8k length to recover the short context window performance. Extensive experiments on LLaMA2 and Mistral across various tasks demonstrate the effectiveness of our method. Models extended via LongRoPE retain the original architecture with minor modifications to the positional embedding, and can reuse most pre-existing optimizations. Code is available at https://github.com/microsoft/LongRoPE", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiran Ding;Li Lyna Zhang;Chengruidong Zhang;Yuanyuan Xu;Ning Shang;Jiahang Xu;Fan Yang;Mao Yang", "authorids": "~Yiran_Ding1;~Li_Lyna_Zhang1;~Chengruidong_Zhang1;~Yuanyuan_Xu3;~Ning_Shang1;~Jiahang_Xu2;~Fan_Yang28;~Mao_Yang1", "gender": ";F;M;M;M;F;M;", "homepage": "https://yiyi-philosophy.github.io/yiran.ding/;https://www.microsoft.com/en-us/research/people/lzhani/;https://github.com/Starmys/;https://github.com/Wonderful-Me;;https://jiahangxu.github.io/;https://fanyangcs.github.io/;", "dblp": "260/4763;195/5224;;;;133/5457;29/3081-24.html;", "google_scholar": ";-_ItfAoAAAAJ;WbvQ5JEAAAAJ;;;PuecdZgAAAAJ;https://scholar.google.com/citations?hl=en;LgJqohwAAAAJ", "orcid": ";;;;0000-0002-2510-4856;0000-0001-9186-619X;0000-0002-0378-060X;", "linkedin": ";;;;;;;", "or_profile": "~Yiran_Ding1;~Li_Lyna_Zhang1;~Chengruidong_Zhang1;~Yuanyuan_Xu3;~Ning_Shang1;~Jiahang_Xu2;~Fan_Yang28;~Mao_Yang1", "aff": "Hangzhou Dianzi University;Microsoft Research Asia;Microsoft;University of Science and Technology of China;;Microsoft Research;Microsoft Research;", "aff_domain": "hdu.edu.cn;microsoft.com;microsoft.com;ustc.edu.cn;;research.microsoft.com;research.microsoft.com;", "position": "Undergrad student;Researcher;Researcher;Undergrad student;;Researcher;Senior Principal Researcher;", "bibtex": "@inproceedings{\nding2024longrope,\ntitle={LongRo{PE}: Extending {LLM} Context Window Beyond 2 Million Tokens},\nauthor={Yiran Ding and Li Lyna Zhang and Chengruidong Zhang and Yuanyuan Xu and Ning Shang and Jiahang Xu and Fan Yang and Mao Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ONOtpXLqqw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1149782, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6760895382382251119&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "hdu.edu.cn;microsoft.com;microsoft.com;ustc.edu.cn;;research.microsoft.com;research.microsoft.com;", "author_num": 8, "aff_unique_index": "0;1;1;2;1;1", "aff_unique_norm": "Hangzhou Dianzi University;Microsoft;University of Science and Technology of China", "aff_unique_dep": ";Research;", "aff_unique_url": 
"http://www.hdu.edu.cn/;https://www.microsoft.com/en-us/research/group/asia;http://www.ustc.edu.cn", "aff_unique_abbr": "HGHDU;MSR Asia;USTC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Enhancing Trajectory Prediction through Self-Supervised Waypoint Distortion Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34165", "id": "OQ7TlOphGX", "proceeding": "https://proceedings.mlr.press/v235/chib24b.html", "pdf": "https://openreview.net/pdf?id=OQ7TlOphGX", "openreview": "https://openreview.net/forum?id=OQ7TlOphGX", "author_site": "Pranav Singh Chib, Pravendra Singh", "tldr": "", "abstract": "Trajectory prediction is an important task that involves modeling the indeterminate nature of agents to forecast future trajectories given the observed trajectory sequences. The task of predicting trajectories poses significant challenges, as agents not only move individually through time but also interact spatially. The learning of complex spatio-temporal representations stands as a fundamental challenge in trajectory prediction. To this end, we propose a novel approach called SSWDP (Self-Supervised Waypoint Distortion Prediction). We propose a simple yet highly effective self-supervised task of predicting distortion present in the observed trajectories to improve the representation learning of the model. Our approach can complement existing trajectory prediction methods. The experimental results highlight a significant improvement with relative percentage differences of 22.7%/38.9%, 33.8%/36.4%, and 16.60%/23.20% in ADE/FDE for the NBA, TrajNet++, and ETH-UCY datasets, respectively, compared to the baseline methods. 
Our approach also demonstrates a significant improvement over baseline methods with relative percentage differences of 76.8%/82.5% and 61.0%/36.1% in ADE/FDE for TrajNet++ and NBA datasets in distorted environments, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pranav singh chib;Pravendra Singh", "authorids": "~Pranav_singh_chib1;~Pravendra_Singh1", "gender": ";M", "homepage": ";https://sites.google.com/view/pravendra/", "dblp": ";160/8743", "google_scholar": ";YwDTxJMAAAAJ", "orcid": ";0000-0003-1001-2219", "linkedin": ";", "or_profile": "~Pranav_singh_chib1;~Pravendra_Singh1", "aff": ";Indian Institute of Technology, Roorkee", "aff_domain": ";iitr.ac.in", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nchib2024enhancing,\ntitle={Enhancing Trajectory Prediction through Self-Supervised Waypoint Distortion Prediction},\nauthor={Pranav singh chib and Pravendra Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OQ7TlOphGX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3021448, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8961308353366758667&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3, "email": ";iitr.ac.in", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Indian Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.iitr.ac.in", "aff_unique_abbr": "IIT Roorkee", "aff_campus_unique_index": "0", "aff_campus_unique": "Roorkee", "aff_country_unique_index": "0", "aff_country_unique": "India" }, { "title": "On The Complexity of First-Order Methods in Stochastic Bilevel Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34164", "id": "OQ97v7uRGc", "proceeding": "https://proceedings.mlr.press/v235/kwon24b.html", "pdf": "https://openreview.net/pdf?id=OQ97v7uRGc", "openreview": "https://openreview.net/forum?id=OQ97v7uRGc", "author_site": "Jeongyeol Kwon, Dohyun Kwon, Hanbaek Lyu", "tldr": "", "abstract": "We consider the problem of finding stationary points in Bilevel optimization when the lower-level problem is unconstrained and strongly convex. The problem has been extensively studied in recent years; the main technical challenge is to keep track of lower-level solutions $y^*(x)$ in response to the changes in the upper-level variables $x$. Subsequently, all existing approaches tie their analyses to a genie algorithm that knows lower-level solutions and, therefore, need not query any points far from them. We consider a dual question to such approaches: suppose we have an oracle, which we call $y^*$-aware, that returns an $O(\\epsilon)$-estimate of the lower-level solution, in addition to first-order gradient estimators *locally unbiased* within the $\\Theta(\\epsilon)$-ball around $y^*(x)$. We study the complexity of finding stationary points with such a $y^*$-aware oracle: we propose a simple first-order method that converges to an $\\epsilon$-stationary point using $O(\\epsilon^{-6}), O(\\epsilon^{-4})$ access to first-order $y^*$-aware oracles. Our upper bounds also apply to standard unbiased first-order oracles, improving the best-known complexity of first-order methods by $O(\\epsilon)$ with minimal assumptions. 
We then provide the matching $\\Omega(\\epsilon^{-6})$, $\\Omega(\\epsilon^{-4})$ lower bounds without and with an additional smoothness assumption, respectively. Our results imply that any approach that simulates an algorithm with a $y^*$-aware oracle must suffer the same lower bounds.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeongyeol Kwon;Dohyun Kwon;Hanbaek Lyu", "authorids": "~Jeongyeol_Kwon1;~Dohyun_Kwon1;~Hanbaek_Lyu1", "gender": "M;M;", "homepage": "https://kwonchungli.github.io/;https://www.dohyunkwon.com/;https://www.hanbaeklyu.com", "dblp": "https://dblp.uni-trier.de/pid/228/9224;218/1797-2;", "google_scholar": "cnyMCYMAAAAJ;dBxpstQAAAAJ;gDFWvgQAAAAJ", "orcid": ";0000-0001-9198-4735;", "linkedin": ";;", "or_profile": "~Jeongyeol_Kwon1;~Dohyun_Kwon1;~Hanbaek_Lyu1", "aff": "University of Wisconsin - Madison;University of Seoul;University of Wisconsin, Madison", "aff_domain": "wisc.edu;uos.ac.kr;wisc.edu", "position": "Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nkwon2024on,\ntitle={On The Complexity of First-Order Methods in Stochastic Bilevel Optimization},\nauthor={Jeongyeol Kwon and Dohyun Kwon and Hanbaek Lyu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OQ97v7uRGc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 449064, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=792768782860506600&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "wisc.edu;uos.ac.kr;wisc.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Wisconsin-Madison;University of Seoul;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;http://www.useoul.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UOS;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;South Korea" }, { "title": "Disentangled Graph Self-supervised Learning for Out-of-Distribution Generalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34163", "id": "OS0szhkPmF", "proceeding": "https://proceedings.mlr.press/v235/li24br.html", "pdf": "https://openreview.net/pdf?id=OS0szhkPmF", "openreview": "https://openreview.net/forum?id=OS0szhkPmF", "author_site": "Haoyang Li, Xin Wang, Zeyang Zhang, Haibo Chen, Ziwei Zhang, Wenwu Zhu", "tldr": "", "abstract": "Graph out-of-distribution (OOD) generalization, aiming to generalize graph neural networks (GNNs) under distribution shifts between training and testing environments, has attracted ever-increasing attention recently. However, existing literature heavily relies on sufficient task-dependent graph labels, which are often scarce or even unavailable, limiting their applications in real-world scenarios. In this paper, we study the self-supervised graph OOD generalization problem, i.e., learning GNNs capable of achieving relatively stable performances under distribution shifts without graph labels. However, the problem remains largely unexplored, with the critical challenge that the invariant and variant information are highly entangled in graphs. 
To solve this problem, we propose an OOD generalized disentangled graph contrastive learning model (OOD-GCL), which is capable of learning disentangled graph-level representations with self-supervision that can handle distribution shifts between training and testing graph data. Specifically, we first introduce a disentangled graph encoder to map each input graph into the factorized graph representation. Then we propose a tailored disentangled invariant self-supervised learning module that maximizes the predictive ability of the representations and ensures that the representations from channels other than one specific channel are invariant to the environments partitioned by the corresponding latent factor, thereby excluding the information of this latent factor and achieving disentanglement. Finally, the disentangled graph representations are fed into a linear predictor and finetuned for the downstream tasks. We provide comprehensive theoretical analyses to show that our model can learn disentangled graph representations and achieve OOD generalization. Extensive experiments on real-world datasets demonstrate the superiority of our model against state-of-the-art baselines under distribution shifts for graph classification tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyang Li;Xin Wang;Zeyang Zhang;Haibo Chen;Ziwei Zhang;Wenwu Zhu", "authorids": "~Haoyang_Li1;~Xin_Wang17;~Zeyang_Zhang1;~Haibo_Chen7;~Ziwei_Zhang1;~Wenwu_Zhu1", "gender": "M;M;;M;;M", "homepage": "https://haoyang.li;http://mn.cs.tsinghua.edu.cn/xinwang/;https://zzythu.com;https://github.com/haibo12;;http://media.cs.tsinghua.edu.cn/en/zww", "dblp": "118/0004-1.html;10/5630-19;236/0242;;;97/6308-1.html", "google_scholar": "86RE16gAAAAJ;YPOBHYUAAAAJ;w_njVcAAAAAJ;GdCBTssAAAAJ;;https://scholar.google.com.tw/citations?user=7t2jzpgAAAAJ", "orcid": "0000-0003-3544-5563;0000-0002-0351-2939;0000-0003-1329-1313;0009-0006-0608-0111;;0000-0003-2236-9290", "linkedin": ";;zeyang-zhang-a7a039159;;;", "or_profile": "~Haoyang_Li1;~Xin_Wang17;~Zeyang_Zhang1;~Haibo_Chen7;~Ziwei_Zhang1;~Wenwu_Zhu1", "aff": "Cornell University;Tsinghua University;Tsinghua University;Central South University;;Tsinghua University", "aff_domain": "med.cornell.edu;cs.tsinghua.edu.cn;tsinghua.edu.cn;csu.edu.cn;;tsinghua.edu.cn", "position": "Postdoc;Associate Professor;PhD student;Undergrad student;;Full Professor", "bibtex": "@inproceedings{\nli2024disentangled,\ntitle={Disentangled Graph Self-supervised Learning for Out-of-Distribution Generalization},\nauthor={Haoyang Li and Xin Wang and Zeyang Zhang and Haibo Chen and Ziwei Zhang and Wenwu Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OS0szhkPmF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 583613, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6765960129261242517&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "med.cornell.edu;cs.tsinghua.edu.cn;tsinghua.edu.cn;csu.edu.cn;;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Cornell University;Tsinghua University;Central South University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cornell.edu;https://www.tsinghua.edu.cn;https://www.csu.edu.cn", "aff_unique_abbr": "Cornell;THU;CSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", 
"aff_country_unique": "United States;China" }, { "title": "SparQ Attention: Bandwidth-Efficient LLM Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34162", "id": "OS5dqxmmtl", "proceeding": "https://proceedings.mlr.press/v235/ribar24a.html", "pdf": "https://openreview.net/pdf?id=OS5dqxmmtl", "openreview": "https://openreview.net/forum?id=OS5dqxmmtl", "author_site": "Luka Ribar, Ivan Chelombiev, Luke Hudlass-Galley, Charlie Blake, Carlo Luschi, Douglas Orr", "tldr": "", "abstract": "The computational difficulties of large language model (LLM) inference remain a significant obstacle to their widespread deployment. The need for many applications to support long input sequences and process them in large batches typically causes token-generation to be bottlenecked by data transfer. For this reason, we introduce **SparQ Attention**, a technique for increasing the inference throughput of LLMs by utilising memory bandwidth more efficiently within the attention layers, through selective fetching of the cached history. Our proposed technique can be applied directly to off-the-shelf LLMs during inference, without requiring any modification to the pre-training setup or additional fine-tuning. We show that SparQ Attention brings up to 8x savings in attention data transfers without substantial drops in accuracy, by evaluating Llama 2 and 3, Mistral, Gemma and Pythia models on a wide range of downstream tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luka Ribar;Ivan Chelombiev;Luke Hudlass-Galley;Charlie Blake;Carlo Luschi;Douglas Orr", "authorids": "~Luka_Ribar1;~Ivan_Chelombiev1;lukehg@graphcore.ai;~Charlie_Blake1;~Carlo_Luschi1;~Douglas_Orr1", "gender": "M;M;;M;M;M", "homepage": ";;;https://thecharlieblake.co.uk/;;https://douglasorr.github.io/", "dblp": "224/1508;;;243/6977;72/10621;33/8535", "google_scholar": "O3OBF-QAAAAJ;https://scholar.google.com/citations?hl=en;;kvibgXMAAAAJ;;", "orcid": ";;;;;", "linkedin": "luka-ribar/;ivan-chelombiev-5a7790a9/;;;carlo-luschi-1908144/;", "or_profile": "~Luka_Ribar1;~Ivan_Chelombiev1;lukehg@graphcore.ai;~Charlie_Blake1;~Carlo_Luschi1;~Douglas_Orr1", "aff": "Graphcore;Graphcore;;;Graphcore;Graphcore", "aff_domain": "graphcore.ai;graphcore.ai;;;graphcore.ai;graphcore.ai", "position": "Researcher;Researcher;;;VP & Head of Research;Researcher", "bibtex": "@inproceedings{\nribar2024sparq,\ntitle={SparQ Attention: Bandwidth-Efficient {LLM} Inference},\nauthor={Luka Ribar and Ivan Chelombiev and Luke Hudlass-Galley and Charlie Blake and Carlo Luschi and Douglas Orr},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OS5dqxmmtl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1649074, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2789462403763633938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "graphcore.ai;graphcore.ai;;;graphcore.ai;graphcore.ai", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Graphcore", "aff_unique_dep": "", "aff_unique_url": "https://www.graphcore.ai", "aff_unique_abbr": "Graphcore", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "A Human-Inspired Reading Agent with Gist Memory of Very Long Contexts", "status": "Poster", "track": "main", 
"site": "https://icml.cc/virtual/2024/poster/34161", "id": "OTmcsyEO5G", "proceeding": "https://proceedings.mlr.press/v235/lee24c.html", "pdf": "https://openreview.net/pdf?id=OTmcsyEO5G", "openreview": "https://openreview.net/forum?id=OTmcsyEO5G", "author_site": "Kuang-Huei Lee, Xinyun Chen, Hiroki Furuta, John Canny, Ian Fischer", "tldr": "", "abstract": "Current Large Language Models (LLMs) are not only limited to some maximum context length, but also are not able to robustly consume long inputs. To address these limitations, we propose ReadAgent, an LLM agent system that increases effective context length up to 20x in our experiments. Inspired by how humans interactively read long documents, we implement ReadAgent as a simple prompting system that uses the advanced language capabilities of LLMs to (1) decide what content to store together in a memory episode, (2) compress those memory episodes into short episodic memories called *gist memories*, and (3) take actions to look up passages in the original text if ReadAgent needs to remind itself of relevant details to complete a task. We evaluate ReadAgent against baselines using retrieval methods, using the original long contexts, and using the gist memories. These evaluations are performed on three long-document reading comprehension tasks: QuALITY, NarrativeQA, and QMSum. ReadAgent outperforms the baselines on all three tasks while extending the effective context window by 3.5-20x.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kuang-Huei Lee;Xinyun Chen;Hiroki Furuta;John Canny;Ian Fischer", "authorids": "~Kuang-Huei_Lee1;~Xinyun_Chen1;~Hiroki_Furuta1;~John_Canny1;~Ian_Fischer1", "gender": "M;M;M;M;F", "homepage": "https://kuanghuei.github.io/;https://github.com/frt03;http://www.cs.berkeley.edu/~jfc/;;https://jungyhuk.github.io/", "dblp": "66/11466;267/2065;;17/5600;", "google_scholar": "rE7-N30AAAAJ;M0OhM1UAAAAJ;https://scholar.google.com.tw/citations?user=LAv0HTEAAAAJ;tPnf61gAAAAJ;d4W1UT0AAAAJ", "orcid": ";;;;", "linkedin": ";;;iantfischer;", "or_profile": "~Kuang-Huei_Lee1;~Hiroki_Furuta1;~John_Canny1;~Ian_Fischer1;~Xinyun_Chen2", "aff": "Google;Google DeepMind;University of California, Berkeley;Google;Google", "aff_domain": "google.com;google.com;berkeley.edu;google.com;google.com", "position": "Researcher;Intern;Full Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nlee2024a,\ntitle={A Human-Inspired Reading Agent with Gist Memory of Very Long Contexts},\nauthor={Kuang-Huei Lee and Xinyun Chen and Hiroki Furuta and John Canny and Ian Fischer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OTmcsyEO5G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 479301, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9575781618566728504&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "google.com;google.com;berkeley.edu;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.berkeley.edu", "aff_unique_abbr": "Google;UC Berkeley", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Mountain View;;Berkeley", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "One-Shot 
Strategic Classification Under Unknown Costs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34160", "id": "OURP5Z58jt", "proceeding": "https://proceedings.mlr.press/v235/rosenfeld24a.html", "pdf": "https://openreview.net/pdf?id=OURP5Z58jt", "openreview": "https://openreview.net/forum?id=OURP5Z58jt", "author_site": "Elan Rosenfeld, Nir Rosenfeld", "tldr": "", "abstract": "The goal of strategic classification is to learn decision rules which are robust to strategic input manipulation. Earlier works assume that these responses are known; while some recent works handle unknown responses, they exclusively study online settings with repeated model deployments. But there are many domains \u2013 particularly in public policy, a common motivating use case \u2013 where multiple deployments are infeasible, or where even one bad round is unacceptable. To address this gap, we initiate the formal study of *one-shot* strategic classification under unknown responses, which requires committing to a single classifier once. Focusing on uncertainty in the users' cost function, we begin by proving that for a broad class of costs, even a small mis-estimation of the true cost can entail trivial accuracy in the worst case. In light of this, we frame the task as a minimax problem, aiming to minimize worst-case risk over an uncertainty set of costs. We design efficient algorithms for both the full-batch and stochastic settings, which we prove converge (offline) to the minimax solution at the rate of $\\tilde{\\mathcal{O}}(T^{-\\frac{1}{2}})$. Our analysis reveals important structure stemming from strategic responses, particularly the value of *dual norm regularization* with respect to the cost function.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elan Rosenfeld;Nir Rosenfeld", "authorids": "~Elan_Rosenfeld1;~Nir_Rosenfeld2", "gender": "M;M", "homepage": ";https://nirr.cswp.cs.technion.ac.il", "dblp": "236/4508;145/9800", "google_scholar": "f0j0K8QAAAAJ;WTlgnYkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Elan_Rosenfeld1;~Nir_Rosenfeld2", "aff": "Carnegie Mellon University;Technion, Technion", "aff_domain": "andrew.cmu.edu;technion.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nrosenfeld2024oneshot,\ntitle={One-Shot Strategic Classification Under Unknown Costs},\nauthor={Elan Rosenfeld and Nir Rosenfeld},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OURP5Z58jt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 485360, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16334157912011981321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "andrew.cmu.edu;technion.ac.il", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;Technion - Israel Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.technion.ac.il/en/", "aff_unique_abbr": "CMU;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Israel" }, { "title": "Universal Consistency of Wide and Deep ReLU Neural Networks and Minimax Optimal Convergence Rates for Kolmogorov-Donoho Optimal Function Classes", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/34159", "id": "OVn8FpeBpG", "proceeding": "https://proceedings.mlr.press/v235/ko24b.html", "pdf": "https://openreview.net/pdf?id=OVn8FpeBpG", "openreview": "https://openreview.net/forum?id=OVn8FpeBpG", "author_site": "Hyunouk Ko, Xiaoming Huo", "tldr": "", "abstract": "In this paper, we prove the universal consistency of wide and deep ReLU neural network classifiers. We also give sufficient conditions for a class of probability measures for which classifiers based on neural networks achieve minimax optimal rates of convergence. The result applies to a wide range of known function classes. In particular, while most previous works impose explicit smoothness assumptions on the regression function, our framework encompasses more general settings. The proposed neural networks are either the minimizers of the $0$-$1$ loss that exhibit a benign overfitting behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyunouk Ko;Xiaoming Huo", "authorids": "~Hyunouk_Ko1;~Xiaoming_Huo1", "gender": "M;M", "homepage": "https://www.isye.gatech.edu/users/hyunouk-ko;https://www.isye.gatech.edu/users/xiaoming-huo", "dblp": ";67/3392", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";0000-0003-0101-1206", "linkedin": ";xiaoming-huo-9653374/", "or_profile": "~Hyunouk_Ko1;~Xiaoming_Huo1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nko2024universal,\ntitle={Universal Consistency of Wide and Deep Re{LU} Neural Networks and Minimax Optimal Convergence Rates for Kolmogorov-Donoho Optimal Function Classes},\nauthor={Hyunouk Ko and Xiaoming Huo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OVn8FpeBpG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 365819, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16511908463028323786&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 8, "email": "gatech.edu;gatech.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Counterfactual Image Editing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34158", "id": "OXzkw7vFIO", "proceeding": "https://proceedings.mlr.press/v235/pan24a.html", "pdf": "https://openreview.net/pdf?id=OXzkw7vFIO", "openreview": "https://openreview.net/forum?id=OXzkw7vFIO", "author_site": "Yushu Pan, Elias Bareinboim", "tldr": "", "abstract": "Counterfactual image editing is a challenging task within generative AI. The current literature on the topic focuses primarily on changing individual features while being silent about the causal relationships between features, which are present in the real world. In this paper, we first formalize this task through causal language, modeling the causal relationships between latent generative factors and images through a special type of causal model called *augmented structural causal models (ASCMs)*. 
Second, we show two fundamental impossibility results: (1) counterfactual editing is impossible from i.i.d. image samples and their corresponding labels alone; (2) even when the causal relationships between latent generative factors and images are available, no guarantees regarding the output of the generative model can be provided. Third, we propose a relaxation of this hard problem aiming to approximate the non-identifiable target counterfactual distributions while still preserving features the users care about and that are causally consistent with the true generative model, which we call **ctf-consistent estimators**. Finally, we develop an efficient algorithm to generate counterfactual image samples leveraging neural causal models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yushu Pan;Elias Bareinboim", "authorids": "~Yushu_Pan1;~Elias_Bareinboim2", "gender": ";M", "homepage": ";https://causalai.net", "dblp": "243/6652;85/9005", "google_scholar": "https://scholar.google.com/citations?hl=en;r5U-D7YAAAAJ", "orcid": ";", "linkedin": "yushu-pan-699a37194/;", "or_profile": "~Yushu_Pan1;~Elias_Bareinboim2", "aff": "Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\npan2024counterfactual,\ntitle={Counterfactual Image Editing},\nauthor={Yushu Pan and Elias Bareinboim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OXzkw7vFIO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5498086, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9356881294253163888&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "columbia.edu;columbia.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Controllable Prompt Tuning For Balancing Group Distributional Robustness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34157", "id": "OYL91MHfuU", "proceeding": "https://proceedings.mlr.press/v235/phan24b.html", "pdf": "https://openreview.net/pdf?id=OYL91MHfuU", "openreview": "https://openreview.net/forum?id=OYL91MHfuU", "author_site": "Hoang Phan, Andrew Wilson, Qi Lei", "tldr": "", "abstract": "Models trained on data composed of different groups or domains can suffer from severe performance degradation under distribution shifts. While recent methods have largely focused on optimizing the worst-group objective, this often comes at the expense of good performance on other groups. To address this problem, we introduce an optimization scheme to achieve good performance across groups and find a good solution for all without severely sacrificing performance on any of them. However, directly applying such optimization involves updating the parameters of the entire network, making it both computationally expensive and challenging. Thus, we introduce Controllable Prompt Tuning (CPT), which couples our approach with prompt-tuning techniques. 
On spurious correlation benchmarks, our procedures achieve state-of-the-art results across both transformer and non-transformer architectures, as well as unimodal and multimodal data, while requiring only $0.4\\%$ tunable parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hoang Phan;Andrew Gordon Wilson;Qi Lei", "authorids": "~Hoang_Phan1;~Andrew_Gordon_Wilson1;~Qi_Lei1", "gender": "Not Specified;F;M", "homepage": "https://cims.nyu.edu/~andrewgw;https://cecilialeiqi.github.io/;https://viethoang1512.github.io/", "dblp": "65/10453;;295/0299", "google_scholar": "https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ;kGOgaowAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Andrew_Gordon_Wilson1;~Qi_Lei1;~Hoang_Viet_Phan1", "aff": "New York University;New York University;New York University", "aff_domain": "nyu.edu;nyu.edu;nyu.edu", "position": "Associate Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nphan2024controllable,\ntitle={Controllable Prompt Tuning For Balancing Group Distributional Robustness},\nauthor={Hoang Phan and Andrew Gordon Wilson and Qi Lei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OYL91MHfuU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4917368, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7015975429193605255&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "nyu.edu;nyu.edu;nyu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Exploration Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34156", "id": "OYw6sS8QmL", "proceeding": "https://proceedings.mlr.press/v235/fellows24a.html", "pdf": "https://openreview.net/pdf?id=OYw6sS8QmL", "openreview": "https://openreview.net/forum?id=OYw6sS8QmL", "author_site": "Mattie Fellows, Brandon Kaplowitz, Christian Schroeder, Shimon Whiteson", "tldr": "", "abstract": "Bayesian reinforcement learning (RL) offers a principled and elegant approach for sequential decision making under uncertainty. Most notably, Bayesian agents do not face an exploration/exploitation dilemma, a major pathology of frequentist methods. However, theoretical understanding of model-free approaches is lacking. In this paper, we introduce a novel Bayesian model-free formulation and the first analysis showing that model-free approaches can yield Bayes-optimal policies. We show that all existing model-free approaches make approximations that yield policies that can be arbitrarily Bayes-suboptimal. As a first step towards model-free Bayes optimality, we introduce the Bayesian exploration network (BEN) which uses normalising flows to model both the aleatoric uncertainty (via density estimation) and epistemic uncertainty (via variational inference) in the Bellman operator. In the limit of complete optimisation, BEN learns true Bayes-optimal policies, but as in variational expectation-maximisation, partial optimisation renders our approach tractable. 
Empirical results demonstrate that BEN can learn true Bayes-optimal policies in tasks where existing model-free approaches fail.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mattie Fellows;Brandon Gary Kaplowitz;Christian Schroeder de Witt;Shimon Whiteson", "authorids": "~Mattie_Fellows1;~Brandon_Gary_Kaplowitz1;~Christian_Schroeder_de_Witt1;~Shimon_Whiteson1", "gender": "M;M;;Unspecified", "homepage": ";https://www.schroederdewitt.com;;http://whirl.cs.ox.ac.uk/member/matthew-fellows/", "dblp": ";;https://dblp.uni-trier.de/pers/w/Whiteson:Shimon.html;26/4512", "google_scholar": ";DE60h_0AAAAJ;;", "orcid": ";;;", "linkedin": "brandon-kaplowitz-40271571/;;;", "or_profile": "~Brandon_Gary_Kaplowitz1;~Christian_Schroeder_de_Witt1;~Shimon_Whiteson1;~Matthew_Fellows1", "aff": "New York University;University of Oxford;University of Oxford;Department of Computer Science", "aff_domain": "nyu.edu;oxford.ac.uk;ox.ac.uk;cs.ox.ac.uk", "position": "PhD student;Lecturer;Professor;Postdoc", "bibtex": "@inproceedings{\nfellows2024bayesian,\ntitle={Bayesian Exploration Networks},\nauthor={Mattie Fellows and Brandon Gary Kaplowitz and Christian Schroeder de Witt and Shimon Whiteson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OYw6sS8QmL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3395234, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11889924292021021006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nyu.edu;oxford.ac.uk;ox.ac.uk;cs.ox.ac.uk", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "New York University;University of Oxford;Unknown Institution", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "https://www.nyu.edu;https://www.ox.ac.uk;", "aff_unique_abbr": "NYU;Oxford;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom;" }, { "title": "Sparse and Structured Hopfield Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34155", "id": "OdPlFWExX1", "proceeding": "https://proceedings.mlr.press/v235/santos24a.html", "pdf": "https://openreview.net/pdf?id=OdPlFWExX1", "openreview": "https://openreview.net/forum?id=OdPlFWExX1", "author_site": "Sa\u00fal Santos, Vlad Niculae, Daniel McNamee, Andre Martins", "tldr": "", "abstract": "Modern Hopfield networks have enjoyed recent interest due to their connection to attention in transformers. Our paper provides a unified framework for sparse Hopfield networks by establishing a link with Fenchel-Young losses. The result is a new family of Hopfield-Fenchel-Young energies whose update rules are end-to-end differentiable sparse transformations. We reveal a connection between loss margins, sparsity, and exact memory retrieval. We further extend this framework to structured Hopfield networks via the SparseMAP transformation, which can retrieve pattern associations instead of a single pattern. 
Experiments on multiple instance learning and text rationalization demonstrate the usefulness of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saul Jos\u00e9 Rodrigues dos Santos;Vlad Niculae;Daniel C McNamee;Andre Martins", "authorids": "~Saul_Jos\u00e9_Rodrigues_dos_Santos1;~Vlad_Niculae2;~Daniel_C_McNamee1;~Andre_Martins1", "gender": "M;M;;M", "homepage": "https://ssantos97.github.io/;https://vene.ro;https://fchampalimaud.org/news/daniel-mcnamee;https://andre-martins.github.io/", "dblp": ";40/10489;304/8886;m/AndreFTMartins", "google_scholar": "mq8rYZYAAAAJ;7_3UAgQAAAAJ;xRCY13MAAAAJ;https://scholar.google.pt/citations?user=mT7ppvwAAAAJ", "orcid": ";;0000-0001-9928-4960;", "linkedin": "saul-santos-63449b15b/;;;", "or_profile": "~Saul_Jos\u00e9_Rodrigues_dos_Santos1;~Vlad_Niculae2;~Daniel_C_McNamee1;~Andre_Martins1", "aff": "Instituto Superior T\u00e9cnico;University of Amsterdam;Champalimaud Neuroscience Programme;Unbabel", "aff_domain": "tecnico.ulisboa.pt;uva.nl;fchampalimaud.org;unbabel.com", "position": "PhD student;Assistant Professor;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nsantos2024sparse,\ntitle={Sparse and Structured Hopfield Networks},\nauthor={Saul Jos{\\'e} Rodrigues dos Santos and Vlad Niculae and Daniel C McNamee and Andre Martins},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OdPlFWExX1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2050223, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7569954455071475156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tecnico.ulisboa.pt;uva.nl;fchampalimaud.org;unbabel.com", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Instituto Superior T\u00e9cnico;University of Amsterdam;Champalimaud Centre for the Unknown;Unbabel", "aff_unique_dep": ";;Neuroscience Programme;", "aff_unique_url": "https://www.ist.utl.pt;https://www.uva.nl;https://www.champalimaud.org;https://www.unbabel.com", "aff_unique_abbr": "IST;UvA;CCU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Portugal;Netherlands" }, { "title": "Compact Optimality Verification for Optimization Proxies", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34154", "id": "OdsZS0E0AO", "proceeding": "https://proceedings.mlr.press/v235/chen24bj.html", "pdf": "https://openreview.net/pdf?id=OdsZS0E0AO", "openreview": "https://openreview.net/forum?id=OdsZS0E0AO", "author_site": "Wenbo Chen, Haoruo Zhao, Mathieu Tanneau, Pascal Van Hentenryck", "tldr": "", "abstract": "Recent years have witnessed increasing interest in optimization proxies, i.e., machine learning models that approximate the input-output mapping of parametric optimization problems and return near-optimal feasible solutions. Following recent work by Nellikkath & Chatzivasileiadis (2021), this paper reconsiders the optimality verification problem for optimization proxies, i.e., the determination of the worst-case optimality gap over the instance distribution. The paper proposes a compact formulation for optimality verification and a gradient-based primal heuristic that brings significant computational benefits to the original formulation. 
The compact formulation is also more general and applies to non-convex optimization problems. The benefits of the compact formulation are demonstrated on large-scale DC Optimal Power Flow and knapsack problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenbo Chen;Haoruo Zhao;Mathieu Tanneau;Pascal Van Hentenryck", "authorids": "~Wenbo_Chen2;~Haoruo_Zhao1;~Mathieu_Tanneau1;~Pascal_Van_Hentenryck2", "gender": "M;F;;M", "homepage": "https://wenbo11.github.io/;;;https://sites.gatech.edu/pascal-van-hentenryck/", "dblp": "80/1502-1;;251/3115.html;h/PVHentenryck.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;69NakqoAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-9967-0578;0000-0001-7085-9994", "linkedin": "wenbo-chen-919603184;haoruo-zhao-66a96a251/;;", "or_profile": "~Wenbo_Chen2;~Haoruo_Zhao1;~Mathieu_Tanneau1;~Pascal_Van_Hentenryck2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nchen2024compact,\ntitle={Compact Optimality Verification for Optimization Proxies},\nauthor={Wenbo Chen and Haoruo Zhao and Mathieu Tanneau and Pascal Van Hentenryck},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OdsZS0E0AO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1422493, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8606769638016656836&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Structure Your Data: Towards Semantic Graph Counterfactuals", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34153", "id": "OenMwDPqWn", "proceeding": "https://proceedings.mlr.press/v235/dimitriou24a.html", "pdf": "https://openreview.net/pdf?id=OenMwDPqWn", "openreview": "https://openreview.net/forum?id=OenMwDPqWn", "author_site": "Angeliki Dimitriou, Maria Lymperaiou, Giorgos Filandrianos, Konstantinos Thomas, Giorgos Stamou", "tldr": "", "abstract": "Counterfactual explanations (CEs) based on concepts are explanations that consider alternative scenarios to understand which high-level semantic features contributed to particular model predictions. In this work, we propose CEs based on the semantic graphs accompanying input data to achieve more descriptive, accurate, and human-aligned explanations. Building upon state-of-the-art (SotA) conceptual attempts, we adopt a model-agnostic edit-based approach and introduce leveraging GNNs for efficient Graph Edit Distance (GED) computation. With a focus on the visual domain, we represent images as scene graphs and obtain their GNN embeddings to bypass solving the NP-hard graph similarity problem for all input pairs, an integral part of the CE computation process. 
We apply our method to benchmark and real-world datasets with varying difficulty and availability of semantic annotations. Testing on diverse classifiers, we find that our CEs outperform previous SotA explanation models based on semantics, including both white and black-box as well as conceptual and pixel-level approaches. Their superiority is proven quantitatively and qualitatively, as validated by human subjects, highlighting the significance of leveraging semantic edges in the presence of intricate relationships. Our model-agnostic graph-based approach is widely applicable and easily extensible, producing actionable explanations across different contexts. The code is available at https://github.com/aggeliki-dimitriou/SGCE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Angeliki Dimitriou;Maria Lymperaiou;Georgios Filandrianos;Konstantinos Thomas;Giorgos Stamou", "authorids": "~Angeliki_Dimitriou1;~Maria_Lymperaiou1;~Georgios_Filandrianos1;~Konstantinos_Thomas1;~Giorgos_Stamou1", "gender": "F;F;M;;M", "homepage": "https://www.ails.ece.ntua.gr/people/angelikidim;https://www.ails.ece.ntua.gr/people/marialymp;https://www.ails.ece.ntua.gr/people/geofila;;https://www.ece.ntua.gr/en/staff/174", "dblp": "367/3837;329/4552;290/5533.html;290/5433.html;s/GBStamou", "google_scholar": "pyiokhkAAAAJ;YNikyhIAAAAJ;oPIyXYcAAAAJ;;https://scholar.google.gr/citations?user=R3y5dxMAAAAJ", "orcid": "0009-0001-5817-3794;0000-0001-9442-4186;0000-0002-7015-7746;;", "linkedin": "angeliki-dimitriou-81079621b/;maria-lymperaiou-55a5b964/;george-filandrianos-7693b9188/;;", "or_profile": "~Angeliki_Dimitriou1;~Maria_Lymperaiou1;~Georgios_Filandrianos1;~Konstantinos_Thomas1;~Giorgos_B._Stamou1", "aff": "National Technical University of Athens;National Technical University of Athens;National Technical University of Athens;National Technical University of Athens;National Technical University of Athens", "aff_domain": "ntua.gr;ntua.gr;ntua.gr;ntua.gr;ntua.gr", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\ndimitriou2024structure,\ntitle={Structure Your Data: Towards Semantic Graph Counterfactuals},\nauthor={Angeliki Dimitriou and Maria Lymperaiou and Georgios Filandrianos and Konstantinos Thomas and Giorgos Stamou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OenMwDPqWn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9465025, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4335702462262391799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ntua.gr;ntua.gr;ntua.gr;ntua.gr;ntua.gr", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "National Technical University of Athens", "aff_unique_dep": "", "aff_unique_url": "https://www.ntua.gr", "aff_unique_abbr": "NTUA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Greece" }, { "title": "Vanilla Bayesian Optimization Performs Great in High Dimensions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34152", "id": "OfT8MgIqHT", "proceeding": "https://proceedings.mlr.press/v235/hvarfner24a.html", "pdf": "https://openreview.net/pdf?id=OfT8MgIqHT", "openreview": "https://openreview.net/forum?id=OfT8MgIqHT", "author_site": "Carl Hvarfner, Erik Hellsten, Luigi 
Nardi", "tldr": "", "abstract": "High-dimensional optimization problems have long been considered the Achilles' heel of Bayesian optimization algorithms. Spurred by the curse of dimensionality, a large collection of algorithms aim to make BO more performant in this setting, commonly by imposing various simplifying assumptions on the objective, thereby decreasing its presumed complexity. In this paper, we identify the degeneracies that make vanilla BO poorly suited to high-dimensional tasks, and further show how existing algorithms address these degeneracies through the lens of model complexity. Motivated by the model complexity measure, we derive an enhancement to the prior assumptions that are typical of the vanilla BO algorithm, which reduces the complexity to manageable levels without imposing structural restrictions on the objective. Our modification - a simple scaling of the Gaussian process lengthscale prior in the dimensionality - reveals that standard BO works drastically better than previously thought in high dimensions. Our insights are supplemented by substantial out-performance of existing state-of-the-art on multiple commonly considered real-world high-dimensional tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Carl Hvarfner;Erik Orm Hellsten;Luigi Nardi", "authorids": "~Carl_Hvarfner1;~Erik_Orm_Hellsten1;~Luigi_Nardi1", "gender": "M;M;M", "homepage": "https://portal.research.lu.se/portal/sv/persons/carl-hvarfner(cd140b82-9fed-4e88-868e-1cf569dcbeb7).html;;", "dblp": "319/3033;;60/7206", "google_scholar": "https://scholar.google.se/citations?hl=en;https://scholar.google.se/citations?user=mK5N-xQAAAAJ;https://scholar.google.it/citations?user=Kgs3zQoAAAAJ", "orcid": ";;0000-0002-4601-2264", "linkedin": "carl-hvarfner-a97421153/;;nardiluigi/", "or_profile": "~Carl_Hvarfner1;~Erik_Orm_Hellsten1;~Luigi_Nardi1", "aff": "Lund University;Lund University;Stanford University", "aff_domain": "lu.se;lu.se;stanford.edu", "position": "PhD student;Postdoc;Researcher", "bibtex": "@inproceedings{\nhvarfner2024vanilla,\ntitle={Vanilla Bayesian Optimization Performs Great in High Dimensions},\nauthor={Carl Hvarfner and Erik Orm Hellsten and Luigi Nardi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OfT8MgIqHT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6476145, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1386777226300846433&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "lu.se;lu.se;stanford.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Lund University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.lunduniversity.lu.se;https://www.stanford.edu", "aff_unique_abbr": "LU;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Sweden;United States" }, { "title": "Pragmatic Feature Preferences: Learning Reward-Relevant Preferences from Human Input", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34151", "id": "OgG0I5toZZ", "proceeding": "https://proceedings.mlr.press/v235/peng24d.html", "pdf": "https://openreview.net/pdf?id=OgG0I5toZZ", "openreview": "https://openreview.net/forum?id=OgG0I5toZZ", "author_site": "Andi Peng, Yuying Sun, Tianmin Shu, David Abel", "tldr": "", 
"abstract": "Humans use context to specify preferences over behaviors, i.e. their reward functions. Yet, algorithms for inferring reward models from preference data do not take this social learning view into account. Inspired by pragmatic human communication, we study how to extract fine-grained data regarding why an example is preferred that is useful for learning an accurate reward model. We propose to enrich preference queries to ask both (1) which features of a given example are preferable in addition to (2) comparisons between objects. We derive an approach for learning from these feature-level preferences, both for cases where users specify which features are reward-relevant, and when users do not. We evaluate our approach on linear bandit settings in both visual and language-based domains. Results support the efficiency of our approach in quickly converging to accurate rewards with less comparisons vs. example-only labels. Finally, we validate the real-world applicability with a behavioral experiment on a mushroom foraging task. Our findings suggest that incorporating pragmatic feature preferences is a promising approach for more efficient user-aligned reward learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andi Peng;Yuying Sun;Tianmin Shu;David Abel", "authorids": "~Andi_Peng1;ysunboston@gmail.com;~Tianmin_Shu1;~David_Abel1", "gender": "F;;;M", "homepage": "https://andipeng.com/;;;http://david-abel.github.io", "dblp": "242/9185;;163/2175.html;162/9926", "google_scholar": "S63gb38AAAAJ;;YT_ffdwAAAAJ;lvBJlmwAAAAJ", "orcid": ";;;0000-0003-0302-7543", "linkedin": ";;;", "or_profile": "~Andi_Peng1;ysunboston@gmail.com;~Tianmin_Shu1;~David_Abel1", "aff": "Massachusetts Institute of Technology;;Johns Hopkins University;Google DeepMind", "aff_domain": "mit.edu;;jhu.edu;google.com", "position": "PhD student;;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\npeng2024pragmatic,\ntitle={Pragmatic Feature Preferences: Learning Reward-Relevant Preferences from Human Input},\nauthor={Andi Peng and Yuying Sun and Tianmin Shu and David Abel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OgG0I5toZZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1312738, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17014348803230874776&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mit.edu;;jhu.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Johns Hopkins University;Google", "aff_unique_dep": ";;Google DeepMind", "aff_unique_url": "https://web.mit.edu;https://www.jhu.edu;https://deepmind.com", "aff_unique_abbr": "MIT;JHU;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Investigating Pre-Training Objectives for Generalization in Vision-Based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34150", "id": "OiI12sNbgD", "proceeding": "https://proceedings.mlr.press/v235/kim24u.html", "pdf": "https://openreview.net/pdf?id=OiI12sNbgD", "openreview": "https://openreview.net/forum?id=OiI12sNbgD", "author_site": "Donghu Kim, Hojoon Lee, Kyungmin Lee, Dongyoon Hwang, Jaegul Choo", "tldr": "", "abstract": 
"Recently, various pre-training methods have been introduced in vision-based Reinforcement Learning (RL). However, their generalization ability remains unclear due to evaluations being limited to in-distribution environments and non-unified experimental setups. To address this, we introduce the Atari Pre-training Benchmark (Atari-PB), which pre-trains a ResNet-50 model on 10 million transitions from 50 Atari games and evaluates it across diverse environment distributions. Our experiments show that pre-training objectives focused on learning task-agnostic features (e.g., identifying objects and understanding temporal dynamics) enhance generalization across different environments. In contrast, objectives focused on learning task-specific knowledge (e.g., identifying agents and fitting reward functions) improve performance in environments similar to the pre-training dataset but not in varied ones. We publicize our codes, datasets, and model checkpoints at https://github.com/dojeon-ai/Atari-PB.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Donghu Kim;Hojoon Lee;Kyungmin Lee;Dongyoon Hwang;Jaegul Choo", "authorids": "~Donghu_Kim1;~Hojoon_Lee1;~Kyungmin_Lee2;~Dongyoon_Hwang1;~Jaegul_Choo1", "gender": "M;M;M;M;M", "homepage": "https://i-am-proto.github.io;https://joonleesky.github.io/;https://github.com/kyungminn;;https://sites.google.com/site/jaegulchoo/", "dblp": "379/3468;;;;07/2074", "google_scholar": "LcYjQYcAAAAJ;;;https://scholar.google.com/citations?view_op=list_works;GHJYsLEAAAAJ", "orcid": ";;;;", "linkedin": "donghu-kim-3b57972b6/;;;;", "or_profile": "~Donghu_Kim1;~Hojoon_Lee1;~Kyungmin_Lee2;~Dongyoon_Hwang1;~Jaegul_Choo1", "aff": "Korea University;Sony AI;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "korea.ac.kr;sony.com;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "Undergrad student;Intern;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nkim2024investigating,\ntitle={Investigating Pre-Training Objectives for Generalization in Vision-Based Reinforcement Learning},\nauthor={Donghu Kim and Hojoon Lee and Kyungmin Lee and Dongyoon Hwang and Jaegul Choo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OiI12sNbgD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1634666, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17539706079043661944&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "korea.ac.kr;sony.com;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Korea University;Sony;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";Sony AI;", "aff_unique_url": "https://www.korea.ac.kr;https://www.sony.com;https://www.kaist.ac.kr", "aff_unique_abbr": "KU;Sony AI;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "South Korea;Japan" }, { "title": "Straight-Through Meets Sparse Recovery: the Support Exploration Algorithm", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34149", "id": "Oj18qGN1gC", "proceeding": "https://proceedings.mlr.press/v235/mohamed24a.html", "pdf": "https://openreview.net/pdf?id=Oj18qGN1gC", "openreview": 
"https://openreview.net/forum?id=Oj18qGN1gC", "author_site": "Mimoun Mohamed, Francois Malgouyres, Valentin Emiya, Caroline Chaux", "tldr": "", "abstract": "The *straight-through estimator* (STE) is commonly used to optimize quantized neural networks, yet its contexts of effective performance are still unclear despite empirical successes. To make a step forward in this comprehension, we apply STE to a well-understood problem: *sparse support recovery*. We introduce the *Support Exploration Algorithm* (SEA), a novel algorithm promoting sparsity, and we analyze its performance in support recovery (a.k.a. model selection) problems. SEA explores more supports than the state-of-the-art, leading to superior performance in experiments, especially when the columns of $A$ are strongly coherent. The theoretical analysis considers recovery guarantees when the linear measurements matrix $A$ satisfies the *Restricted Isometry Property* (RIP). The sufficient conditions of recovery are comparable but more stringent than those of the state-of-the-art in sparse support recovery. Their significance lies mainly in their applicability to an instance of the STE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mimoun Mohamed;Francois Malgouyres;Valentin Emiya;Caroline Chaux", "authorids": "~Mimoun_Mohamed1;~Francois_Malgouyres1;~Valentin_Emiya2;~Caroline_Chaux2", "gender": "M;M;M;F", "homepage": "https://mimoun-mohamed-lab.github.io/;https://www.math.univ-toulouse.fr/~fmalgouy/;https://pageperso.lis-lab.fr/valentin.emiya/;https://ipal.cnrs.fr/caroline-chaux-personal-page/", "dblp": ";97/5816;67/7645;95/6069", "google_scholar": ";ECRBHzwAAAAJ;https://scholar.google.fr/citations?user=073tcVsAAAAJ;https://scholar.google.fr/citations?user=Mo_nD_gAAAAJ", "orcid": "0009-0005-1214-3396;;0000-0001-7102-6943;0000-0003-1056-673X", "linkedin": "mimoun-mohamed-bb57aa150/;;;", "or_profile": "~Mimoun_Mohamed1;~Francois_Malgouyres1;~Valentin_Emiya2;~Caroline_Chaux2", "aff": "Universit\u00e9 d'Aix-Marseille;Universit\u00e9 de Toulouse;Universit\u00e9 d'Aix-Marseille;CNRS", "aff_domain": "univ-amu.fr;univ-tlse3.fr;univ-amu.fr;cnrs.fr", "position": "PhD student;Full Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nmohamed2024straightthrough,\ntitle={Straight-Through Meets Sparse Recovery: the Support Exploration Algorithm},\nauthor={Mimoun Mohamed and Francois Malgouyres and Valentin Emiya and Caroline Chaux},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Oj18qGN1gC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1639219, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7505289243251131976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 21, "email": "univ-amu.fr;univ-tlse3.fr;univ-amu.fr;cnrs.fr", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Aix-Marseille University;Universit\u00e9 de Toulouse;Centre National de la Recherche Scientifique", "aff_unique_dep": ";;", "aff_unique_url": "https://www.univ-amu.fr;https://www.univ-toulouse.fr;https://www.cnrs.fr", "aff_unique_abbr": "AMU;UT;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "MGit: A Model Versioning and Management System", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34148", 
"id": "OjBW993g79", "proceeding": "https://proceedings.mlr.press/v235/hao24c.html", "pdf": "https://openreview.net/pdf?id=OjBW993g79", "openreview": "https://openreview.net/forum?id=OjBW993g79", "author_site": "Wei Hao, Daniel Mendoza, Rafael Mendes, Deepak Narayanan, Amar Phanishayee, Asaf Cidon, Junfeng Yang", "tldr": "", "abstract": "New ML models are often derived from existing ones (e.g., through fine-tuning, quantization or distillation), forming an ecosystem where models are *related* to each other and can share structure or even parameter values. Managing such a large and evolving ecosystem of model derivatives is challenging. For instance, the overhead of storing all such models is high, and models may inherit bugs from related models, complicating error attribution and debugging. In this paper, we propose a model versioning and management system called MGit that makes it easier to store, test, update, and collaborate on related models. MGit introduces a lineage graph that records the relationships between models, optimizations to efficiently store model parameters, and abstractions over this lineage graph that facilitate model testing, updating and collaboration. We find that MGit works well in practice: MGit is able to reduce model storage footprint by up to 7$\\times$. Additionally, in a user study with 20 ML practitioners, users complete a model updating task 3$\\times$ faster on average with MGit.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Hao;Daniel Mendoza;Rafael Mendes;Deepak Narayanan;Amar Phanishayee;Asaf Cidon;Junfeng Yang", "authorids": "~Wei_Hao4;~Daniel_Mendoza1;~Rafael_Mendes1;~Deepak_Narayanan2;~Amar_Phanishayee1;~Asaf_Cidon1;~Junfeng_Yang1", "gender": ";M;M;M;M;M;M", "homepage": "https://weihao97.github.io/;https://deepakn94.github.io/;https://aka.ms/amar;https://www.asafcidon.com/;https://www.cs.columbia.edu/~junfeng/;https://cs.stanford.edu/people/dmendo/;https://www.microsoft.com/en-us/research/people/rdasilva/", "dblp": ";;14/877;35/10805;71/3724.html;211/4496;", "google_scholar": "https://scholar.google.com/citations?hl=en;sTzb6LAAAAAJ;;yKzSdygAAAAJ;JJ9AvbAAAAAJ;aNSWanAAAAAJ;", "orcid": "0009-0000-8575-2540;;;0009-0007-4046-2022;0009-0000-2277-6545;;", "linkedin": ";;;asaf-cidon-a7ab289/;;daniel-mendoza-b0a810137/;", "or_profile": "~Wei_Hao4;~Deepak_Narayanan2;~Amar_Phanishayee1;~Asaf_Cidon1;~Junfeng_Yang1;~Daniel_Marcos_Mendoza1;~Rafael_Mendes_Da_Silva1", "aff": "Columbia University;NVIDIA;Microsoft;Columbia University;Columbia University;Stanford University;Microsoft Research", "aff_domain": "columbia.edu;nvidia.com;microsoft.com;columbia.edu;columbia.edu;stanford.edu;research.microsoft.com", "position": "PhD student;Researcher;Sr. 
Principal Researcher;Associate Professor;Associate Professor;PhD student;Researcher", "bibtex": "@inproceedings{\nhao2024mgit,\ntitle={{MG}it: A Model Versioning and Management System},\nauthor={Wei Hao and Daniel Mendoza and Rafael Mendes and Deepak Narayanan and Amar Phanishayee and Asaf Cidon and Junfeng Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OjBW993g79}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 476968, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4358366350416789867&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "columbia.edu;nvidia.com;microsoft.com;columbia.edu;columbia.edu;stanford.edu;research.microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;0;0;3;2", "aff_unique_norm": "Columbia University;NVIDIA;Microsoft;Stanford University", "aff_unique_dep": ";NVIDIA Corporation;Microsoft Corporation;", "aff_unique_url": "https://www.columbia.edu;https://www.nvidia.com;https://www.microsoft.com;https://www.stanford.edu", "aff_unique_abbr": "Columbia;NVIDIA;Microsoft;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Verification of Machine Unlearning is Fragile", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34147", "id": "OkChMnjF6s", "proceeding": "https://proceedings.mlr.press/v235/zhang24h.html", "pdf": "https://openreview.net/pdf?id=OkChMnjF6s", "openreview": "https://openreview.net/forum?id=OkChMnjF6s", "author_site": "Binchi Zhang, Zihan Chen, Cong Shen, Jundong Li", "tldr": "", "abstract": "As privacy concerns escalate in the realm of machine learning, data owners now have the option to utilize machine unlearning to remove their data from machine learning models, following recent legislation. To enhance transparency in machine unlearning and avoid potential dishonesty by model providers, various verification strategies have been proposed. These strategies enable data owners to ascertain whether their target data has been effectively unlearned from the model. However, our understanding of the safety issues of machine unlearning verification remains nascent. In this paper, we explore the novel research question of whether model providers can circumvent verification strategies while retaining the information of data supposedly unlearned. Our investigation leads to a pessimistic answer: the verification of machine unlearning is fragile. Specifically, we categorize the current verification strategies regarding potential dishonesty among model providers into two types. Subsequently, we introduce two novel adversarial unlearning processes capable of circumventing both types. We validate the efficacy of our methods through theoretical analysis and empirical experiments using real-world datasets. 
This study highlights the vulnerabilities and limitations in machine unlearning verification, paving the way for further research into the safety of machine unlearning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Binchi Zhang;Zihan Chen;Cong Shen;Jundong Li", "authorids": "~Binchi_Zhang1;~Zihan_Chen5;~Cong_Shen1;~Jundong_Li2", "gender": "M;;M;M", "homepage": "https://zhangbinchi.github.io/;;https://cshen317.github.io/;https://jundongli.github.io/", "dblp": "304/7647;;79/6027-1.html;144/7997.html", "google_scholar": "c8Z36PAAAAAJ;;70LBhKcAAAAJ;uY6ek7sAAAAJ", "orcid": "0000-0001-7321-3822;;0000-0002-3148-4453;", "linkedin": "binchi-zhang-274922221/;;cong-shen-3372404/;", "or_profile": "~Binchi_Zhang1;~Zihan_Chen5;~Cong_Shen1;~Jundong_Li2", "aff": "University of Virginia, Charlottesville;;University of Virginia;University of Virginia", "aff_domain": "virginia.edu;;virginia.edu;virginia.edu", "position": "PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024verification,\ntitle={Verification of Machine Unlearning is Fragile},\nauthor={Binchi Zhang and Zihan Chen and Cong Shen and Jundong Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OkChMnjF6s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 673014, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1564256925988973071&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "virginia.edu;;virginia.edu;virginia.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0", "aff_campus_unique": "Charlottesville;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Unified Linear Programming Framework for Offline Reward Learning from Human Demonstrations and Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34146", "id": "Olix9pk6nV", "proceeding": "https://proceedings.mlr.press/v235/kim24ak.html", "pdf": "https://openreview.net/pdf?id=Olix9pk6nV", "openreview": "https://openreview.net/forum?id=Olix9pk6nV", "author_site": "Kihyun Kim, Jiawei Zhang, Asuman Ozdaglar, Pablo A. Parrilo", "tldr": "", "abstract": "Inverse Reinforcement Learning (IRL) and Reinforcement Learning from Human Feedback (RLHF) are pivotal methodologies in reward learning, which involve inferring and shaping the underlying reward function of sequential decision-making problems based on observed human demonstrations and feedback. Most prior work in reward learning has relied on prior knowledge or assumptions about decision or preference models, potentially leading to robustness issues. In response, this paper introduces a novel linear programming (LP) framework tailored for offline reward learning. Utilizing pre-collected trajectories without online exploration, this framework estimates a feasible reward set from the primal-dual optimality conditions of a suitably designed LP, and offers an optimality guarantee with provable sample efficiency. Our LP framework also enables aligning the reward functions with human feedback, such as pairwise trajectory comparison data, while maintaining computational tractability and sample efficiency. 
We demonstrate that our framework potentially achieves better performance compared to the conventional maximum likelihood estimation (MLE) approach through analytical examples and numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kihyun Kim;Jiawei Zhang;Asuman E. Ozdaglar;Pablo Parrilo", "authorids": "~Kihyun_Kim1;~Jiawei_Zhang6;~Asuman_E._Ozdaglar1;~Pablo_Parrilo1", "gender": ";M;F;M", "homepage": "https://kihyun.xyz/;https://www.cuhk.edu.cn/;https://asu.mit.edu/;https://www.mit.edu/~parrilo/", "dblp": ";;35/2875;09/456", "google_scholar": "WEQPfK8AAAAJ;;https://scholar.google.com.tw/citations?user=nWnBSOsAAAAJ;https://scholar.google.com.tw/citations?user=s_IPfEYAAAAJ", "orcid": "0000-0003-4372-0967;0000-0002-9420-384X;;0000-0003-1132-8477", "linkedin": "kihyun-kim-88010a200/;;;", "or_profile": "~Kihyun_Kim1;~Jiawei_Zhang6;~Asuman_E._Ozdaglar1;~Pablo_Parrilo1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nkim2024a,\ntitle={A Unified Linear Programming Framework for Offline Reward Learning from Human Demonstrations and Feedback},\nauthor={Kihyun Kim and Jiawei Zhang and Asuman E. Ozdaglar and Pablo Parrilo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Olix9pk6nV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 401894, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11137300185461139543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mit.edu;mit.edu;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "FRAG: Frequency Adapting Group for Diffusion Video Editing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34145", "id": "OnEaBGU3LO", "proceeding": "https://proceedings.mlr.press/v235/yoon24c.html", "pdf": "https://openreview.net/pdf?id=OnEaBGU3LO", "openreview": "https://openreview.net/forum?id=OnEaBGU3LO", "author_site": "Sunjae Yoon, Gwanhyeong Koo, Geonwoo Kim, Chang Yoo", "tldr": "", "abstract": "In video editing, the hallmark of a quality edit lies in its consistent and unobtrusive adjustment. Modification, when integrated, must be smooth and subtle, preserving the natural flow and aligning seamlessly with the original vision. Therefore, our primary focus is on overcoming the current challenges in high quality edit to ensure that each edit enhances the final product without disrupting its intended essence. However, quality deterioration such as blurring and flickering is routinely observed in recent diffusion video editing systems. We confirm that this deterioration often stems from high-frequency leak: the diffusion model fails to accurately synthesize high-frequency components during denoising process. 
To this end, we devise Frequency Adapting Group (FRAG), which enhances video quality in terms of consistency and fidelity by introducing a novel receptive field branch to preserve high-frequency components during the denoising process. FRAG operates in a model-agnostic manner without additional training, and its effectiveness is validated on video editing benchmarks (i.e., TGVE, DAVIS).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sunjae Yoon;Gwanhyeong Koo;Geonwoo Kim;Chang D. Yoo", "authorids": "~Sunjae_Yoon1;~Gwanhyeong_Koo2;~Geonwoo_Kim2;~Chang_D._Yoo1", "gender": "M;M;M;M", "homepage": "https://dbstjswo505.github.io/;;https://sanctusfactory.com/family.php;https://kookie12.github.io/", "dblp": "273/3911;;31/7819;358/7119", "google_scholar": "2A2lRoUAAAAJ;;gFWgUQEAAAAJ;https://scholar.google.co.kr/citations?user=qDCTLZgAAAAJ", "orcid": "0000-0001-7458-5273;;0000-0002-0756-7179;", "linkedin": "sunjae-yoon-133294333/;geonwookim-6a25b72b0/;;", "or_profile": "~Sunjae_Yoon1;~Geonwoo_Kim2;~Chang_D._Yoo1;~GwanHyeong_Koo1", "aff": "Korea Advanced Institute of Science and Technology (KAIST);KAIST, Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science and Technology (KAIST)", "aff_domain": "kaist.ac.kr;ee.kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Undergrad student;Full Professor;MS student", "bibtex": "@inproceedings{\nyoon2024frag,\ntitle={{FRAG}: Frequency Adapting Group for Diffusion Video Editing},\nauthor={Sunjae Yoon and Gwanhyeong Koo and Geonwoo Kim and Chang D. Yoo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OnEaBGU3LO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6015854, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10641112772685413256&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "kaist.ac.kr;ee.kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "A Simple Early Exiting Framework for Accelerated Sampling in Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34144", "id": "OnOaj3g9fi", "proceeding": "https://proceedings.mlr.press/v235/moon24a.html", "pdf": "https://openreview.net/pdf?id=OnOaj3g9fi", "openreview": "https://openreview.net/forum?id=OnOaj3g9fi", "author_site": "Taehong Moon, Moonseok Choi, EungGu Yun, Jongmin Yoon, Gayoung Lee, Jaewoong Cho, Juho Lee", "tldr": "", "abstract": "Diffusion models have shown remarkable performance in generation problems over various domains including images, videos, text, and audio. A practical bottleneck of diffusion models is their sampling speed, due to the repeated evaluation of score estimation networks during the inference. In this work, we propose a novel framework capable of adaptively allocating compute required for the score estimation, thereby reducing the overall sampling time of diffusion models. 
We observe that the amount of computation required for the score estimation may vary along the time step for which the score is estimated. Based on this observation, we propose an early-exiting scheme, where we skip the subset of parameters in the score estimation network during the inference, based on a time-dependent exit schedule. Using the diffusion models for image synthesis, we show that our method could significantly improve the sampling throughput of the diffusion models without compromising image quality. Furthermore, we also demonstrate that our method seamlessly integrates with various types of solvers for faster sampling, capitalizing on their compatibility to enhance overall efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taehong Moon;Moonseok Choi;EungGu Yun;Jongmin Yoon;Gayoung Lee;Jaewoong Cho;Juho Lee", "authorids": "~Taehong_Moon1;~Moonseok_Choi1;~EungGu_Yun1;~Jongmin_Yoon1;~Gayoung_Lee1;~Jaewoong_Cho1;~Juho_Lee2", "gender": "M;M;M;M;F;;M", "homepage": "https://taehong-moon.github.io;;https://yuneg11.github.io;https://jmyoon1.github.io;;https://sites.google.com/view/jaewoongcho;https://juho.lee.github.io", "dblp": "391/5924;331/2083;;04/390;179/2468;184/3848;55/3410-1", "google_scholar": "https://scholar.google.co.kr/citations?user=wBwIIYQAAAAJ;i-pOb1IAAAAJ;r7-847MAAAAJ;https://scholar.google.co.kr/citations?user=WKsaDwQAAAAJ;;;Py4URJUAAAAJ", "orcid": ";;0000-0002-4648-1415;;;;", "linkedin": "taehongmoon;moonseok-choi/;yuneg/;jongmin-yoon-9915469a;gayoung-lee-0824548a/;;", "or_profile": "~Taehong_Moon1;~Moonseok_Choi1;~EungGu_Yun1;~Jongmin_Yoon1;~Gayoung_Lee1;~Jaewoong_Cho1;~Juho_Lee2", "aff": "Krafton;Korea Advanced Institute of Science & Technology;SAIGE;Korea Advanced Institute of Science & Technology;NAVER AI Lab;KRAFTON;Korea Advanced Institute of Science & Technology", "aff_domain": "krafton.com;kaist.ac.kr;saige.ai;kaist.ac.kr;navercorp.com;krafton.com;kaist.ac.kr", "position": "Researcher;PhD student;Researcher;PhD student;Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nmoon2024a,\ntitle={A Simple Early Exiting Framework for Accelerated Sampling in Diffusion Models},\nauthor={Taehong Moon and Moonseok Choi and EungGu Yun and Jongmin Yoon and Gayoung Lee and Jaewoong Cho and Juho Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OnOaj3g9fi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10195211, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5021745180009990507&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "krafton.com;kaist.ac.kr;saige.ai;kaist.ac.kr;navercorp.com;krafton.com;kaist.ac.kr", "author_num": 7, "aff_unique_index": "0;1;2;1;3;0;1", "aff_unique_norm": "KRAFTON Inc.;Korea Advanced Institute of Science and Technology;SAIGE;NAVER Corporation", "aff_unique_dep": ";;;NAVER AI Lab", "aff_unique_url": "https://www.krafton.com;https://www.kaist.ac.kr;;https://www.naver.com", "aff_unique_abbr": "Krafton;KAIST;;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea;" }, { "title": "A Study of First-Order Methods with a Deterministic Relative-Error Gradient Oracle", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34143", "id": "OndZHBUA1G", "proceeding": 
"https://proceedings.mlr.press/v235/hallak24a.html", "pdf": "https://openreview.net/pdf?id=OndZHBUA1G", "openreview": "https://openreview.net/forum?id=OndZHBUA1G", "author_site": "Nadav Hallak, Kfir Levy", "tldr": "", "abstract": "This paper studies the theoretical guarantees of the classical projected gradient and conditional gradient methods applied to constrained optimization problems with biased relative-error gradient oracles. These oracles are used in various settings, such as distributed optimization systems or derivative-free optimization, and are particularly common when gradients are compressed, quantized, or estimated via finite differences computations. Several settings are investigated: Optimization over the box with a coordinate-wise erroneous gradient oracle, optimization over a general compact convex set, and three more specific scenarios. Convergence guarantees are established with respect to the relative-error magnitude, and in particular, we show that the conditional gradient is invariant to relative-error when applied over the box with a coordinate-wise erroneous gradient oracle, and the projected gradient maintains its convergence guarantees when optimizing a nonconvex objective function.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nadav Hallak;Kfir Yehuda Levy", "authorids": "~Nadav_Hallak1;~Kfir_Yehuda_Levy1", "gender": "M;M", "homepage": "https://sites.google.com/view/ndvhllk/home;http://kfiryehud.wixsite.com/kfir-y-levy", "dblp": "178/2441;83/11388", "google_scholar": "https://scholar.google.co.il/citations?user=4wKbph8AAAAJ;", "orcid": "0000-0002-0045-6636;", "linkedin": "nadav-hallak-701a8516/;", "or_profile": "~Nadav_Hallak1;~Kfir_Yehuda_Levy1", "aff": "Technion, Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;technion.ac.il", "position": "Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhallak2024a,\ntitle={A Study of First-Order Methods with a Deterministic Relative-Error Gradient Oracle},\nauthor={Nadav Hallak and Kfir Yehuda Levy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OndZHBUA1G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 450131, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3999906851418707212&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "technion.ac.il;technion.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Diffusion Model-Augmented Behavioral Cloning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34142", "id": "OnidGtOhg3", "proceeding": "https://proceedings.mlr.press/v235/chen24as.html", "pdf": "https://openreview.net/pdf?id=OnidGtOhg3", "openreview": "https://openreview.net/forum?id=OnidGtOhg3", "author_site": "Shang-Fu Chen, Hsiang-Chun Wang, Ming-Hao Hsu, Chun-Mao Lai, Shao-Hua Sun", "tldr": "", "abstract": "Imitation learning addresses the challenge of learning by observing an expert\u2019s demonstrations without access to reward signals from environments. 
Most existing imitation learning methods that do not require interacting with environments either model the expert distribution as the conditional probability p(a|s) (e.g., behavioral cloning, BC) or the joint probability p(s, a). Despite the simplicity of modeling the conditional probability with BC, it usually struggles with generalization. While modeling the joint probability can improve generalization performance, the inference procedure is often time-consuming, and the model can suffer from manifold overfitting. This work proposes an imitation learning framework that benefits from modeling both the conditional and joint probability of the expert distribution. Our proposed Diffusion Model-Augmented Behavioral Cloning (DBC) employs a diffusion model trained to model expert behaviors and learns a policy to optimize both the BC loss (conditional) and our proposed diffusion model loss (joint). DBC outperforms baselines in various continuous control tasks in navigation, robot arm manipulation, dexterous manipulation, and locomotion. We design additional experiments to verify the limitations of modeling either the conditional probability or the joint probability of the expert distribution, as well as compare different generative models. Ablation studies justify the effectiveness of our design choices.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shang-Fu Chen;Hsiang-Chun Wang;Ming-Hao Hsu;Chun-Mao Lai;Shao-Hua Sun", "authorids": "~Shang-Fu_Chen2;~Hsiang-Chun_Wang1;~Ming-Hao_Hsu1;~Chun-Mao_Lai1;~Shao-Hua_Sun1", "gender": "M;;M;M;M", "homepage": "https://www.linkedin.com/in/shang-fu-chen-354914199/;https://hsiangchun0205.github.io/;https://qaz159qaz159.github.io/;https://mecoli1219.github.io/;http://shaohua0116.github.io", "dblp": "203/9102;;325/4631;325/4767;158/9680", "google_scholar": "https://scholar.google.com.tw/citations?user=ZKOpgs4AAAAJ;https://scholar.google.com.tw/citations?user=vpJMSjMAAAAJ;;;uXsfnaQAAAAJ", "orcid": ";;;;0000-0001-7579-6734", "linkedin": ";https://tw.linkedin.com/in/hsiang-chun-wang-8a4798269;;;shaohua0116/", "or_profile": "~Shang-Fu_Chen2;~Hsiang-Chun_Wang1;~Ming-Hao_Hsu1;~Chun-Mao_Lai1;~Shao-Hua_Sun1", "aff": "National Taiwan University;National Taiwan University;National Taiwan University;National Taiwan University;National Taiwan University", "aff_domain": "ntu.edu.tw;ntu.edu.tw;ntu.edu.tw;ntu.edu.tw;ntu.edu.tw", "position": "PhD student;MS student;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nchen2024diffusion,\ntitle={Diffusion Model-Augmented Behavioral Cloning},\nauthor={Shang-Fu Chen and Hsiang-Chun Wang and Ming-Hao Hsu and Chun-Mao Lai and Shao-Hua Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OnidGtOhg3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3638993, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=499986354715790633&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 9, "email": "ntu.edu.tw;ntu.edu.tw;ntu.edu.tw;ntu.edu.tw;ntu.edu.tw", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "National Taiwan University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.tw", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": 
"Triadic-OCD: Asynchronous Online Change Detection with Provable Robustness, Optimality, and Convergence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34141", "id": "OnkA4zaEU9", "proceeding": "https://proceedings.mlr.press/v235/huang24ad.html", "pdf": "https://openreview.net/pdf?id=OnkA4zaEU9", "openreview": "https://openreview.net/forum?id=OnkA4zaEU9", "author_site": "Yancheng Huang, Kai Yang, Zelin Zhu, Leian Chen", "tldr": "", "abstract": "The primary goal of online change detection (OCD) is to promptly identify changes in the data stream. OCD problem find a wide variety of applications in diverse areas, e.g., security detection in smart grids and intrusion detection in communication networks. Prior research usually assumes precise knowledge of the system parameters. Nevertheless, this presumption often proves unattainable in practical scenarios due to factors such as estimation errors, system updates, etc. This paper aims to take the first attempt to develop a triadic-OCD framework with certifiable robustness, provable optimality, and guaranteed convergence. In addition, the proposed triadic-OCD algorithm can be realized in a fully asynchronous distributed manner, easing the necessity of transmitting the data to a single server. This asynchronous mechanism could also mitigate the straggler issue that faced by traditional synchronous algorithm. Moreover, the non-asymptotic convergence property of Triadic-OCD is theoretically analyzed, and its iteration complexity to achieve an $\\epsilon$-optimal point is derived. Extensive experiments have been conducted to elucidate the effectiveness of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yancheng Huang;Kai Yang;Zelin Zhu;Leian Chen", "authorids": "~Yancheng_Huang1;~Kai_Yang3;2111136@tongji.edu.cn;chen.leian@columbia.edu", "gender": "M;;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": "0000-0001-8271-6085;;;", "linkedin": ";;;", "or_profile": "~Yancheng_Huang1;~Kai_Yang3;2111136@tongji.edu.cn;chen.leian@columbia.edu", "aff": "Tongji University;;;", "aff_domain": "tongji.edu.cn;;;", "position": "MS student;;;", "bibtex": "@inproceedings{\nhuang2024triadicocd,\ntitle={Triadic-{OCD}: Asynchronous Online Change Detection with Provable Robustness, Optimality, and Convergence},\nauthor={Yancheng Huang and Kai Yang and Zelin Zhu and Leian Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OnkA4zaEU9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 510328, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-yr8uUwPFioJ:scholar.google.com/&scioq=Triadic-OCD:+Asynchronous+Online+Change+Detection+with+Provable+Robustness,+Optimality,+and+Convergence&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "tongji.edu.cn;;;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "Tongji University", "aff_unique_dep": "", "aff_unique_url": "https://www.tongji.edu.cn", "aff_unique_abbr": "Tongji", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Sparse Cocktail: Every Sparse Pattern Every Sparse Ratio All At Once", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34140", "id": "OrVl8R13Wy", "proceeding": "https://proceedings.mlr.press/v235/li24av.html", "pdf": 
"https://openreview.net/pdf?id=OrVl8R13Wy", "openreview": "https://openreview.net/forum?id=OrVl8R13Wy", "author_site": "Zhangheng Li, Shiwei Liu, Tianlong Chen, Ajay Jaiswal, Zhenyu Zhang, Dilin Wang, Raghuraman Krishnamoorthi, Shiyu Chang, Zhangyang \u201cAtlas\u201d Wang", "tldr": "", "abstract": "Sparse Neural Networks (SNNs) have received voluminous attention for mitigating the explosion in computational costs and memory footprints of modern deep neural networks. Despite their popularity, most state-of-the-art training approaches seek to find a single high-quality sparse subnetwork with a preset sparsity pattern and ratio, making them inadequate to satiate platform and resource variability. Recently proposed approaches attempt to jointly train multiple subnetworks (we term as ``sparse co-training\") with a *fixed sparsity pattern*, to allow switching sparsity ratios subject to resource requirements. In this work, we take one more step forward and expand the scope of sparse co-training to cover diverse sparsity patterns and multiple sparsity ratios *at once*. We introduce **Sparse Cocktail**, the first sparse co-training framework that co-trains a suite of sparsity patterns simultaneously, loaded with multiple sparsity ratios which facilitate harmonious switch across various sparsity patterns and ratios at inference depending on the hardware availability. More specifically, Sparse Cocktail alternatively trains subnetworks generated from different sparsity patterns with a gradual increase in sparsity ratios across patterns and relies on an *unified mask generation process* and the *Dense Pivot Co-training* to ensure the subnetworks of different patterns orchestrate their shared parameters without canceling each other\u2019s performance. Experiment results on image classification, object detection, and instance segmentation illustrate the favorable effectiveness and flexibility of Sparse Cocktail, pointing to a promising direction for sparse co-training. 
Codes will be released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhangheng LI;Shiwei Liu;Tianlong Chen;AJAY KUMAR JAISWAL;Zhenyu Zhang;Dilin Wang;Raghuraman Krishnamoorthi;Shiyu Chang;Zhangyang Wang", "authorids": "~Zhangheng_LI2;~Shiwei_Liu2;~Tianlong_Chen1;~AJAY_KUMAR_JAISWAL1;~Zhenyu_Zhang4;~Dilin_Wang1;~Raghuraman_Krishnamoorthi1;~Shiyu_Chang2;~Zhangyang_Wang1", "gender": "M;M;M;M;M;M;Unspecified;M;M", "homepage": ";https://shiweiliuiiiiiii.github.io/;https://tianlong-chen.github.io;https://ajay1994.github.io/;https://zhenyu.gallery;;http://people.csail.mit.edu/chang87/;https://vita-group.github.io;", "dblp": ";234/8697-3.html;;30/9707;01/1844-15;;28/9988;119/4026;142/7035", "google_scholar": "https://scholar.google.co.uk/citations?user=NZCLqZMAAAAJ;73IbXtsAAAAJ;LE3ctn0AAAAJ;I783HxYAAAAJ;ZLyJRxoAAAAJ;F1mr9C0AAAAJ;r21asW4AAAAJ;pxFyKAIAAAAJ;dmTy9EIAAAAJ", "orcid": ";;0000-0001-7774-8197;;;;;;", "linkedin": "%E7%AB%A0%E6%81%92-%E6%9D%8E-b1b19711a/;;tianlong-chen-783862167/;;zhenyu-allen-zhang-a9b1391a3/;raghuraman-krishnamoorthi-b8670a5/;;;", "or_profile": "~Zhangheng_LI2;~Shiwei_Liu2;~Tianlong_Chen1;~AJAY_KUMAR_JAISWAL1;~Zhenyu_Zhang4;~Raghuraman_Krishnamoorthi1;~Shiyu_Chang2;~Zhangyang_Wang1;~Dilin_Wang2", "aff": "University of Texas at Austin;University of Oxford;Harvard University;University of Texas, Austin;University of Texas at Austin;Meta Facebook;University of California, Santa Barbara;University of Texas at Austin;Meta", "aff_domain": "utexas.edu;ox.ac.uk;harvard.edu;utexas.edu;utexas.edu;meta.com;ucsb.edu;utexas.edu;meta.com", "position": "PhD student;Postdoc;Postdoc;PhD student;PhD student;Researcher;Assistant Professor;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nli2024sparse,\ntitle={Sparse Cocktail: Every Sparse Pattern Every Sparse Ratio All At Once},\nauthor={Zhangheng LI and Shiwei Liu and Tianlong Chen and AJAY KUMAR JAISWAL and Zhenyu Zhang and Dilin Wang and Raghuraman Krishnamoorthi and Shiyu Chang and Zhangyang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OrVl8R13Wy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 891123, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8961225764107459830&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "email": "utexas.edu;ox.ac.uk;harvard.edu;utexas.edu;utexas.edu;meta.com;ucsb.edu;utexas.edu;meta.com", "author_num": 9, "aff_unique_index": "0;1;2;0;0;3;4;0;3", "aff_unique_norm": "University of Texas at Austin;University of Oxford;Harvard University;Meta;University of California, Santa Barbara", "aff_unique_dep": ";;;Meta Platforms, Inc.;", "aff_unique_url": "https://www.utexas.edu;https://www.ox.ac.uk;https://www.harvard.edu;https://meta.com;https://www.ucsb.edu", "aff_unique_abbr": "UT Austin;Oxford;Harvard;Meta;UCSB", "aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Austin;;Santa Barbara", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Invariant Risk Minimization Is A Total Variation Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34139", "id": "P7qwBmzwwZ", "proceeding": "https://proceedings.mlr.press/v235/lai24c.html", "pdf": "https://openreview.net/pdf?id=P7qwBmzwwZ", "openreview": "https://openreview.net/forum?id=P7qwBmzwwZ", "author_site": 
"Zhao-Rong Lai, Weiwen Wang", "tldr": "", "abstract": "Invariant risk minimization (IRM) is an arising approach to generalize invariant features to different environments in machine learning. While most related works focus on new IRM settings or new application scenarios, the mathematical essence of IRM remains to be properly explained. We verify that IRM is essentially a total variation based on $L^2$ norm (TV-$\\ell_2$) of the learning risk with respect to the classifier variable. Moreover, we propose a novel IRM framework based on the TV-$\\ell_1$ model. It not only expands the classes of functions that can be used as the learning risk and the feature extractor, but also has robust performance in denoising and invariant feature preservation based on the coarea formula. We also illustrate some requirements for IRM-TV-$\\ell_1$ to achieve out-of-distribution generalization. Experimental results show that the proposed framework achieves competitive performance in several benchmark machine learning scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhao-Rong Lai;Weiwen Wang", "authorids": "~Zhao-Rong_Lai1;~Weiwen_Wang2", "gender": "M;M", "homepage": "https://cybsec.jnu.edu.cn/2023/1120/c39593a781893/page.htm;https://wangyuanhao.github.io/", "dblp": "142/3902;", "google_scholar": "https://scholar.google.com.hk/citations?user=psPB6TsAAAAJ;uVAHiMUAAAAJ", "orcid": ";0000-0002-5435-2680", "linkedin": ";", "or_profile": "~Zhao-Rong_Lai1;~Weiwen_Wang2", "aff": "Jinan University;Jinan University", "aff_domain": "jnu.edu.cn;jnu.edu.cn", "position": "Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nlai2024invariant,\ntitle={Invariant Risk Minimization Is A Total Variation Model},\nauthor={Zhao-Rong Lai and Weiwen Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=P7qwBmzwwZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 687834, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rqcfDVC1DjIJ:scholar.google.com/&scioq=Invariant+Risk+Minimization+Is+A+Total+Variation+Model&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "jnu.edu.cn;jnu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Jinan University", "aff_unique_dep": "", "aff_unique_url": "https://www.jnu.edu.cn", "aff_unique_abbr": "JNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "In-Context Principle Learning from Mistakes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34138", "id": "PAPY0cAB3C", "proceeding": "https://proceedings.mlr.press/v235/zhang24at.html", "pdf": "https://openreview.net/pdf?id=PAPY0cAB3C", "openreview": "https://openreview.net/forum?id=PAPY0cAB3C", "author_site": "Tianjun Zhang, Aman Madaan, Luyu Gao, Steven Zheng, Swaroop Mishra, Yiming Yang, Niket Tandon, Uri Alon", "tldr": "", "abstract": "In-context learning (ICL, also known as few-shot prompting) has been the standard method of adapting LLMs to downstream tasks, by learning from a few input-output examples. Nonetheless, all ICL-based approaches only learn from correct input-output pairs. In this paper, we revisit this paradigm, by learning more from the few given input-output examples. 
We introduce Learning Principles (LEAP): First, we intentionally induce the model to make mistakes on these few examples; then we reflect on these mistakes, and learn explicit task-specific \u201cprinciples\u201d from them, which help solve similar problems and avoid common mistakes; finally, we prompt the model to answer unseen test questions using the original few-shot examples and these learned general principles. We evaluate LEAP on a wide range of benchmarks, including multi-hop question answering (Hotpot QA), textual QA (DROP), Big-Bench Hard reasoning, and math problems (GSM8K and MATH); in all these benchmarks, LEAP improves the strongest available LLMs such as GPT-3.5-turbo, GPT-4, GPT-4-turbo and Claude-2.1. For example, LEAP improves over the standard few-shot prompting using GPT-4 by 7.5% in DROP, and by 3.3% in HotpotQA. Importantly, LEAP does not require any more input or examples than the standard few-shot prompting settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianjun Zhang;Aman Madaan;Luyu Gao;Steven Zheng;Swaroop Mishra;Yiming Yang;Niket Tandon;Uri Alon", "authorids": "~Tianjun_Zhang1;~Aman_Madaan1;~Luyu_Gao1;~Steven_Zheng1;~Swaroop_Mishra1;~Yiming_Yang1;~Niket_Tandon2;~Uri_Alon1", "gender": ";;M;M;F;M;M;M", "homepage": "https://tianjunz.github.io;https://madaan.github.io;https://luyug.github.io/;https://swarooprm.github.io/;http://www.cs.cmu.edu/~yiming/;https://niket.tandon.info;https://urialon.ml/;", "dblp": ";138/1043;;249/2784;25/1666;29/9923;40/2257-2;307/3201", "google_scholar": "UE9jz_MAAAAJ;jW9ts2cAAAAJ;https://scholar.google.com/citations?hl=zh-CN;-7LK2SwAAAAJ;MlZq4XwAAAAJ;9uWuZkUAAAAJ;https://scholar.google.co.il/citations?user=QBn7vq8AAAAJ;PyK4x4wAAAAJ", "orcid": ";;;;0000-0001-8322-607X;;;", "linkedin": ";amnmadaan/;;;yiming-yang-24100924/;;https://linkedin.com/in/urialon1/;", "or_profile": "~Tianjun_Zhang1;~Aman_Madaan1;~Luyu_Gao1;~Swaroop_Mishra1;~Yiming_Yang1;~Niket_Tandon2;~Uri_Alon1;~Huaixiu_Steven_Zheng1", "aff": "University of California, Berkeley;Carnegie Mellon University;Carnegie Mellon University;Google;School of Computer Science, Carnegie Mellon University;Microsoft Research;Google DeepMind;Google", "aff_domain": "berkeley.edu;cmu.edu;cmu.edu;google.com;cs.cmu.edu;research.microsoft.com;google.com;google.com", "position": "PhD student;PhD student;PhD student;Researcher;Full Professor;Principal Researcher;Researcher;Software Engineer", "bibtex": "@inproceedings{\nzhang2024incontext,\ntitle={In-Context Principle Learning from Mistakes},\nauthor={Tianjun Zhang and Aman Madaan and Luyu Gao and Steven Zheng and Swaroop Mishra and Yiming Yang and Niket Tandon and Uri Alon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PAPY0cAB3C}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 678136, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12482585053692312076&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "berkeley.edu;cmu.edu;cmu.edu;google.com;cs.cmu.edu;research.microsoft.com;google.com;google.com", "author_num": 8, "aff_unique_index": "0;1;1;2;1;3;2;2", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Google;Microsoft", "aff_unique_dep": ";;Google;Microsoft Research", "aff_unique_url": 
"https://www.berkeley.edu;https://www.cmu.edu;https://www.google.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UC Berkeley;CMU;Google;MSR", "aff_campus_unique_index": "0;2;3;2", "aff_campus_unique": "Berkeley;;Mountain View;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Limited Preference Aided Imitation Learning from Imperfect Demonstrations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34137", "id": "PAbkWU0KDG", "proceeding": "https://proceedings.mlr.press/v235/cao24b.html", "pdf": "https://openreview.net/pdf?id=PAbkWU0KDG", "openreview": "https://openreview.net/forum?id=PAbkWU0KDG", "author_site": "Xingchen Cao, Fan-Ming Luo, Junyin Ye, Tian Xu, Zhilong Zhang, Yang Yu", "tldr": "", "abstract": "Imitation learning mimics high-quality policies from expert data for sequential decision-making tasks. However, its efficacy is hindered in scenarios where optimal demonstrations are unavailable, and only imperfect demonstrations are present. To address this issue, introducing additional limited human preferences is a suitable approach as it can be obtained in a human-friendly manner, offering a promising way to learn the policy that exceeds the performance of imperfect demonstrations. In this paper, we propose a novel imitation learning (IL) algorithm, **P**reference **A**ided **I**mitation **L**earning from imperfect demonstrations (PAIL). Specifically, PAIL learns a preference reward by querying experts for limited preferences from imperfect demonstrations. This serves two purposes during training: 1) Reweighting imperfect demonstrations with the preference reward for higher quality. 2) Selecting explored trajectories with high cumulative preference rewards to augment imperfect demonstrations. The dataset with continuously improving quality empowers the performance of PAIL to transcend the initial demonstrations. 
Comprehensive empirical results across a synthetic task and two locomotion benchmarks show that PAIL surpasses baselines by **73.2%** and breaks through the performance bottleneck of imperfect demonstrations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingchen Cao;Fan-Ming Luo;Junyin Ye;Tian Xu;Zhilong Zhang;Yang Yu", "authorids": "~Xingchen_Cao1;~Fan-Ming_Luo1;~Junyin_Ye1;~Tian_Xu2;~Zhilong_Zhang2;~Yang_Yu5", "gender": "M;;M;M;M;", "homepage": "https://grimreaperno218.github.io/;;http://www.lamda.nju.edu.cn/yejy/;http://www.lamda.nju.edu.cn/xut/;http://www.lamda.nju.edu.cn/zhangzl/;", "dblp": ";;;07/2985-3;;", "google_scholar": ";;;e5mnk1wAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Xingchen_Cao1;~Fan-Ming_Luo1;~Junyin_Ye1;~Tian_Xu2;~Zhilong_Zhang2;~Yang_Yu5", "aff": "Nanjing University;;Nanjing University;Nanjing University;Nanjing University;", "aff_domain": "lamda.nju.edu.cn;;nju.edu.cn;nju.edu.cn;nju.edu.cn;", "position": "MS student;;MS student;PhD student;MS student;", "bibtex": "@inproceedings{\ncao2024limited,\ntitle={Limited Preference Aided Imitation Learning from Imperfect Demonstrations},\nauthor={Xingchen Cao and Fan-Ming Luo and Junyin Ye and Tian Xu and Zhilong Zhang and Yang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PAbkWU0KDG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14867370472920079223&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "lamda.nju.edu.cn;;nju.edu.cn;nju.edu.cn;nju.edu.cn;", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Bootstrapping Fisher Market Equilibrium and First-Price Pacing Equilibrium", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34136", "id": "PApqOVbHYF", "proceeding": "https://proceedings.mlr.press/v235/liao24b.html", "pdf": "https://openreview.net/pdf?id=PApqOVbHYF", "openreview": "https://openreview.net/forum?id=PApqOVbHYF", "author_site": "Luofeng Liao, Christian Kroer", "tldr": "", "abstract": "Linear Fisher market (LFM) is an equilibrium model for fair and efficient resource allocation, and first-price pacing equilibrium (FPPE) is a model for budget-management in first-price auctions. One thing they have in common is that both have a corresponding Eisenberg-Gale convex program characterization. In this paper, we introduce and devise several statistically valid bootstrap inference procedures for LFM and FPPE. The most challenging part is to bootstrap general FPPE, which reduces to bootstrapping constrained M-estimators, a largely unexplored problem. We are able to devise a bootstrap procedure for FPPE with structures by using the powerful tool of epi-convergence theory. 
Experiments with synthetic and semi-real data verify our theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luofeng Liao;Christian Kroer", "authorids": "~Luofeng_Liao1;~Christian_Kroer1", "gender": "M;M", "homepage": ";http://www.columbia.edu/~ck2945/", "dblp": ";64/10660", "google_scholar": "2kVrHEUAAAAJ;https://scholar.google.ch/citations?user=ckHwjPAAAAAJ", "orcid": ";0000-0002-9009-8683", "linkedin": "luofeng-liao-7a1027181/;", "or_profile": "~Luofeng_Liao1;~Christian_Kroer1", "aff": "Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nliao2024bootstrapping,\ntitle={Bootstrapping Fisher Market Equilibrium and First-Price Pacing Equilibrium},\nauthor={Luofeng Liao and Christian Kroer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PApqOVbHYF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 758122, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=456499811308333479&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "columbia.edu;columbia.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "High-Order Contrastive Learning with Fine-grained Comparative Levels for Sparse Ordinal Tensor Completion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34135", "id": "PDO2Oc1cS1", "proceeding": "https://proceedings.mlr.press/v235/dai24c.html", "pdf": "https://openreview.net/pdf?id=PDO2Oc1cS1", "openreview": "https://openreview.net/forum?id=PDO2Oc1cS1", "author_site": "Yu Dai, Junchen Shen, Zijie Zhai, Danlin Liu, Jingyang Chen, Yu Sun, Ping Li, Jie Zhang, Kai Zhang", "tldr": "", "abstract": "Contrastive learning is a powerful paradigm for representation learning with prominent success in computer vision and NLP, but how to extend its success to high-dimensional tensors remains a challenge. This is because tensor data often exhibit high-order mode-interactions that are hard to profile, with negative samples growing combinatorially faster than in second-order contrastive learning; furthermore, many real-world tensors have ordinal entries that necessitate more delicate comparative levels. To address this challenge, we propose High-Order Contrastive Tensor Completion (HOCTC), an innovative network that extends contrastive learning to sparse ordinal tensor data. HOCTC employs a novel attention-based strategy with query-expansion to capture high-order mode interactions even in the case of very limited tokens, which goes beyond second-order learning scenarios. Besides, it extends two-level comparisons (positive-vs-negative) to fine-grained contrast levels using ordinal tensor entries as natural guidance. An efficient sampling scheme is proposed to enforce such delicate comparative structures, generating comprehensive self-supervised signals for high-order representation learning. 
Extensive experiments show that HOCTC has promising results in sparse tensor completion in traffic/recommender applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Dai;Junchen Shen;Zijie Zhai;Danlin Liu;Jingyang Chen;Yu Sun;Ping Li;Jie Zhang;Kai Zhang", "authorids": "~Yu_Dai4;~Junchen_Shen2;~Zijie_Zhai1;~Danlin_Liu1;~Jingyang_Chen1;sunyu9910@gmail.com;~Ping_Li12;~Jie_Zhang10;~Kai_Zhang1", "gender": "M;;M;F;M;;F;M;M", "homepage": ";;;;;;;https://istbi.fudan.edu.cn/lnen/info/1157/1639.htm;https://cis.temple.edu/user/635", "dblp": ";;;;156/8435;;62/5860-24;84/6889-12;55/957-1.html", "google_scholar": ";;;https://scholar.google.cz/citations?user=mM-246AAAAAJ;zzyePNsAAAAJ;;TwSm5CUAAAAJ;https://scholar.google.com.hk/citations?user=epTfECgAAAAJ;I6ifR7YAAAAJ", "orcid": ";;;;;;0000-0002-8391-6510;;0000-0001-6297-4423", "linkedin": "yudai1203;;blank-zhai-2890a31ba;;%E5%8A%B2%E6%9D%A8-%E9%99%88-38a93623a;;;;kai-zhang-1b939430/", "or_profile": "~Yu_Dai4;~Junchen_Shen2;~Zijie_Zhai1;~Danlin_Liu1;~Jingyang_Chen1;sunyu9910@gmail.com;~Ping_Li12;~Jie_Zhang10;~Kai_Zhang1", "aff": "East China Normal University;;East China Normal University;East China Normal University;Fudan University;;Southwest Petroleum University;Fudan University;East China Normal University", "aff_domain": "stu.ecnu.edu.cn;;ecnu.edu.cn;ecnu.edu.cn;fudan.edu.cn;;swpu.edu.cn;fudan.edu.cn;ecnu.edu.cn", "position": "MS student;;MS student;Postdoc;PhD student;;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\ndai2024highorder,\ntitle={High-Order Contrastive Learning with Fine-grained Comparative Levels for Sparse Ordinal Tensor Completion},\nauthor={Yu Dai and Junchen Shen and Zijie Zhai and Danlin Liu and Jingyang Chen and Yu Sun and Ping Li and Jie Zhang and Kai Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PDO2Oc1cS1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1007809, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GuI9nHWzu2AJ:scholar.google.com/&scioq=High-Order+Contrastive+Learning+with+Fine-grained+Comparative+Levels+for+Sparse+Ordinal+Tensor+Completion&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "stu.ecnu.edu.cn;;ecnu.edu.cn;ecnu.edu.cn;fudan.edu.cn;;swpu.edu.cn;fudan.edu.cn;ecnu.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;1;2;1;0", "aff_unique_norm": "East China Normal University;Fudan University;Southwest Petroleum University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.fudan.edu.cn;https://www.swpu.edu.cn", "aff_unique_abbr": "ECNU;Fudan;SWPU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Distributed High-Dimensional Quantile Regression: Estimation Efficiency and Support Recovery", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34134", "id": "PDUQRBPkks", "proceeding": "https://proceedings.mlr.press/v235/wang24bk.html", "pdf": "https://openreview.net/pdf?id=PDUQRBPkks", "openreview": "https://openreview.net/forum?id=PDUQRBPkks", "author_site": "Caixing Wang, Ziliang Shen", "tldr": "", "abstract": "In this paper, we focus on distributed estimation and support recovery for high-dimensional linear quantile regression. 
Quantile regression is a popular alternative to least squares regression for its robustness against outliers and data heterogeneity. However, the non-smoothness of the check loss function poses significant challenges to both computation and theory in the distributed setting. To tackle these problems, we transform the original quantile regression into a least-squares optimization. By applying a double-smoothing approach, we extend a previous Newton-type distributed approach without the restrictive assumption of independence between the error term and covariates. An efficient algorithm is developed, which enjoys high computational and communication efficiency. Theoretically, the proposed distributed estimator achieves a near-oracle convergence rate and high support recovery accuracy after a constant number of iterations. Extensive experiments on synthetic examples and a real data application further demonstrate the effectiveness of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Caixing Wang;Ziliang Shen", "authorids": "~Caixing_Wang1;~Ziliang_Shen1", "gender": "M;M", "homepage": "https://pinzek.github.io/;http://wangcaixing96.com/", "dblp": ";", "google_scholar": ";SLEH6XYAAAAJ", "orcid": ";0009-0009-3068-6094", "linkedin": ";", "or_profile": "~Ziliang_Shen1;~Wang_Caixing1", "aff": "Shanghai University of Finance and Economics;Shanghai University of Finance and Economics", "aff_domain": "sufe.edu.cn;shufe.edu.cn", "position": "PhD student;PhD student", "bibtex": "@inproceedings{\nwang2024distributed,\ntitle={Distributed High-Dimensional Quantile Regression: Estimation Efficiency and Support Recovery},\nauthor={Caixing Wang and Ziliang Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PDUQRBPkks}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1724967, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17537142036480025125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sufe.edu.cn;shufe.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Shanghai University of Finance and Economics", "aff_unique_dep": "", "aff_unique_url": "http://www.sufe.edu.cn", "aff_unique_abbr": "SUFE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34133", "id": "PEpbUobfJv", "proceeding": "https://proceedings.mlr.press/v235/cai24b.html", "pdf": "https://openreview.net/pdf?id=PEpbUobfJv", "openreview": "https://openreview.net/forum?id=PEpbUobfJv", "author_site": "Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason Lee, Deming Chen, Tri Dao", "tldr": "", "abstract": "Large Language Models (LLMs) employ auto-regressive decoding that requires sequential computation, with each step reliant on the previous one's output. This creates a bottleneck as each step necessitates moving the full model parameters from High-Bandwidth Memory (HBM) to the accelerator's cache. While methods such as speculative decoding have been suggested to address this issue, their implementation is impeded by the challenges associated with acquiring and maintaining a separate draft model. 
In this paper, we present Medusa, an efficient method that augments LLM inference by adding extra decoding heads to predict multiple subsequent tokens in parallel. Using a tree-based attention mechanism, Medusa constructs multiple candidate continuations and verifies them simultaneously in each decoding step. By leveraging parallel processing, Medusa reduces the number of decoding steps required. We present two levels of fine-tuning procedures for Medusa to meet the needs of different use cases: Medusa-1: Medusa is directly fine-tuned on top of a frozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa is fine-tuned together with the backbone LLM, enabling better prediction accuracy of Medusa heads and higher speedup but needing a special training recipe that preserves the model's capabilities. Moreover, we propose several extensions that improve or expand the utility of Medusa, including a self-distillation to handle situations where no training data is available and a typical acceptance scheme to boost the acceptance rate while maintaining generation quality. We evaluate Medusa on models of various sizes and training procedures. Our experiments demonstrate that Medusa-1 can achieve over 2.2$\\times$ speedup without compromising generation quality, while Medusa-2 further improves the speedup to 2.3-2.8$\\times$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianle Cai;Yuhong Li;Zhengyang Geng;Hongwu Peng;Jason D. Lee;Deming Chen;Tri Dao", "authorids": "~Tianle_Cai1;~Yuhong_Li2;~Zhengyang_Geng1;~Hongwu_Peng1;~Jason_D._Lee1;~Deming_Chen1;~Tri_Dao1", "gender": "M;M;;M;M;;", "homepage": "https://tianle.website;https://leeyeehoo.github.io/;https://gsunshine.github.io/;https://harveyp123.github.io/;https://jasondlee88.github.io/;;https://tridao.me/", "dblp": "241/9458;;250/2651.html;292/5365;88/3262;;206/7018", "google_scholar": "CvwLRSMAAAAJ;Qh-6mV8AAAAJ;lNkw3QYAAAAJ;9P2qtQoAAAAJ;GR_DsT0AAAAJ;;NQRw0bQAAAAJ", "orcid": ";0000-0002-3769-6772;;;;;", "linkedin": ";;;hongwu-peng-374893119/;;;", "or_profile": "~Tianle_Cai1;~Yuhong_Li2;~Zhengyang_Geng1;~Hongwu_Peng1;~Jason_D._Lee1;~Deming_Chen1;~Tri_Dao1", "aff": "Princeton University;University of Illinois, Urbana Champaign;Massachusetts Institute of Technology;University of Connecticut;Princeton University;;Princeton University", "aff_domain": "princeton.edu;illinois.edu;mit.edu;uconn.edu;princeton.edu;;princeton.edu", "position": "PhD student;PhD student;Visiting student;PhD student;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\ncai2024medusa,\ntitle={Medusa: Simple {LLM} Inference Acceleration Framework with Multiple Decoding Heads},\nauthor={Tianle Cai and Yuhong Li and Zhengyang Geng and Hongwu Peng and Jason D. 
Lee and Deming Chen and Tri Dao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PEpbUobfJv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1970523, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 229, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8066158661998700751&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "princeton.edu;illinois.edu;mit.edu;uconn.edu;princeton.edu;;princeton.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;0;0", "aff_unique_norm": "Princeton University;University of Illinois Urbana-Champaign;Massachusetts Institute of Technology;University of Connecticut", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.princeton.edu;https://illinois.edu;https://web.mit.edu;https://www.uconn.edu", "aff_unique_abbr": "Princeton;UIUC;MIT;UConn", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "LESS: Selecting Influential Data for Targeted Instruction Tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34132", "id": "PG5fV50maR", "proceeding": "https://proceedings.mlr.press/v235/xia24c.html", "pdf": "https://openreview.net/pdf?id=PG5fV50maR", "openreview": "https://openreview.net/forum?id=PG5fV50maR", "author_site": "Mengzhou Xia, Sadhika Malladi, Suchin Gururangan, Sanjeev Arora, Danqi Chen", "tldr": "", "abstract": "Instruction tuning has unlocked powerful capabilities in large language models (LLMs), using combined datasets to develop general-purpose chatbots. However, real-world applications often require a specialized suite of skills (e.g., reasoning). The challenge lies in identifying the most relevant data from these extensive datasets to effectively develop specific capabilities, a setting we frame as *targeted instruction tuning*. We propose LESS, an optimizer-aware and practically efficient algorithm to estimate data influences and perform **L**ow-rank gradi**E**nt **S**imilarity **S**earch for instruction data selection. Crucially, LESS adapts existing influence formulations to work with the Adam optimizer and variable-length instruction data. LESS first constructs a highly reusable and transferable *gradient datastore* with low-dimensional gradient features and then selects examples based on their similarity to few-shot examples embodying a specific capability. Experiments show that training on a LESS-selected 5% of the data can often outperform training on the full dataset across diverse downstream tasks. Furthermore, the selected data is highly transferable: smaller models can be leveraged to select useful data for larger models and models from different families. Our qualitative analysis shows that our method goes beyond surface form cues to identify data that exemplifies the necessary reasoning skills for the intended downstream application. 
To facilitate future work, we release code and data at [princeton-nlp/LESS](https://github.com/princeton-nlp/LESS).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengzhou Xia;Sadhika Malladi;Suchin Gururangan;Sanjeev Arora;Danqi Chen", "authorids": "~Mengzhou_Xia1;~Sadhika_Malladi2;~Suchin_Gururangan1;~Sanjeev_Arora1;~Danqi_Chen1", "gender": "F;F;M;;F", "homepage": "https://xiamengzhou.github.io/;https://www.cs.princeton.edu/~smalladi/;https://suchin.io;http://www.cs.princeton.edu/~arora/;https://www.cs.princeton.edu/~danqic/", "dblp": "241/9329;176/9810;217/1570;a/SArora;87/7949", "google_scholar": "zyJn1IcAAAAJ;9HCmTcwAAAAJ;CJIKhNIAAAAJ;RUP4S68AAAAJ;sVR8ktkAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Mengzhou_Xia1;~Sadhika_Malladi2;~Suchin_Gururangan1;~Sanjeev_Arora1;~Danqi_Chen1", "aff": "Princeton University;Princeton University;University of Washington, Seattle;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;uw.edu;princeton.edu;cs.princeton.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxia2024less,\ntitle={{LESS}: Selecting Influential Data for Targeted Instruction Tuning},\nauthor={Mengzhou Xia and Sadhika Malladi and Suchin Gururangan and Sanjeev Arora and Danqi Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PG5fV50maR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 880116, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15578855076689041298&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu;uw.edu;princeton.edu;cs.princeton.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Princeton University;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.washington.edu", "aff_unique_abbr": "Princeton;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "AegisFL: Efficient and Flexible Privacy-Preserving Byzantine-Robust Cross-silo Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34131", "id": "PHUAG63Efe", "proceeding": "https://proceedings.mlr.press/v235/chen24ag.html", "pdf": "https://openreview.net/pdf?id=PHUAG63Efe", "openreview": "https://openreview.net/forum?id=PHUAG63Efe", "author_site": "Dong Chen, Hongyuan Qu, Guangwu Xu", "tldr": "", "abstract": "Privacy attacks and poisoning attacks are two of the thorniest problems in federated learning (FL). Homomorphic encryption (HE), which allows certain mathematical operations to be done in the ciphertext state, provides a way to solve these two problems simultaneously. However, existing Paillier-based and CKKS-based privacy-preserving byzantine-robust FL (PBFL) solutions not only suffer from low efficiency but also expose the final model to the server. Additionally, these methods are limited to one robust aggregation algorithm (AGR) and are therefore vulnerable to AGR-tailored poisoning attacks. In this paper, we present AegisFL, an efficient PBFL system that provides the flexibility to change the AGR. 
We first observe that the core of the existing advanced AGRs is to calculate the inner products, $L_2$ norms and mean values for vectors. Based on this observation, we tailor a packing scheme for PBFL, which fits perfectly with RLWE-based fully homomorphic encryption. Under this packing scheme, the server only needs to perform one ciphertext multiplication to construct any required AGR, while the global model only belongs to honest clients. Finally, we conduct extensive experiments on different datasets and adversary settings, which also confirm the effectiveness and efficiency of our scheme.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dong Chen;Hongyuan Qu;Guangwu Xu", "authorids": "~Dong_Chen13;~Hongyuan_Qu1;gxu4sdq@sdu.edu.cn", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": "0000-0001-7779-8186;0000-0002-1684-0819;", "linkedin": ";;", "or_profile": "~Dong_Chen13;~Hongyuan_Qu1;gxu4sdq@sdu.edu.cn", "aff": "Shandong University;;", "aff_domain": "sdu.edu.cn;;", "position": "MS student;;", "bibtex": "@inproceedings{\nchen2024aegisfl,\ntitle={Aegis{FL}: Efficient and Flexible Privacy-Preserving Byzantine-Robust Cross-silo Federated Learning},\nauthor={Dong Chen and Hongyuan Qu and Guangwu Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PHUAG63Efe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 610050, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7980169492262075731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "sdu.edu.cn;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Shandong University", "aff_unique_dep": "", "aff_unique_url": "http://www.sdu.edu.cn", "aff_unique_abbr": "SDU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34130", "id": "PHjkVjR78A", "proceeding": "https://proceedings.mlr.press/v235/wu24ah.html", "pdf": "https://openreview.net/pdf?id=PHjkVjR78A", "openreview": "https://openreview.net/forum?id=PHjkVjR78A", "author_site": "Haoning Wu, Zicheng Zhang, Weixia Zhang, Chaofeng Chen, Liang Liao, Chunyi Li, Yixuan Gao, Annan Wang, Erli Zhang, Wenxiu Sun, Qiong Yan, Xiongkuo Min, Guangtao Zhai, Weisi Lin", "tldr": "", "abstract": "The explosion of visual content available online underscores the requirement for an accurate machine assessor to robustly evaluate scores across diverse types of visual content. While recent studies have demonstrated the exceptional potential of large multi-modality models (LMMs) in a wide range of related fields, in this work, we explore how to teach them to perform visual rating in alignment with human opinions. Observing that human raters only learn and judge discrete text-defined levels in subjective studies, we propose to emulate this subjective process and teach LMMs with text-defined rating levels instead of scores. The proposed Q-Align achieves state-of-the-art accuracy on image quality assessment (IQA), image aesthetic assessment (IAA), as well as video quality assessment (VQA) under the original LMM structure. With the syllabus, we further unify the three tasks into one model, termed the OneAlign. 
Our experiments demonstrate the advantage of discrete levels over direct scores on training, and that LMMs can learn beyond the discrete levels and provide effective finer-grained evaluations. Code and weights will be released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoning Wu;Zicheng Zhang;Weixia Zhang;Chaofeng Chen;Liang Liao;Chunyi Li;Yixuan Gao;Annan Wang;Erli Zhang;Wenxiu Sun;Qiong Yan;Xiongkuo Min;Guangtao Zhai;Weisi Lin", "authorids": "~Haoning_Wu1;~Zicheng_Zhang7;~Weixia_Zhang1;~Chaofeng_Chen1;~Liang_Liao3;~Chunyi_Li1;~Yixuan_Gao2;~Annan_Wang1;~Erli_Zhang1;~Wenxiu_Sun1;~Qiong_Yan1;~Xiongkuo_Min1;~Guangtao_Zhai1;~Weisi_Lin1", "gender": "M;M;M;M;M;F;M;M;F;;M;M;M;M", "homepage": "https://teowu.github.io;;https://chaofengc.github.io/;https://liaoliang92.github.io/homepage/;https://lcysyzxdxc.github.io;;;;http://wenxiusun.com/;;;https://faculty.sjtu.edu.cn/zhaiguangtao/en/index.htm;http://www.ntu.edu.sg/home/wslin/;", "dblp": "264/5802-1;196/3124;198/2537;;192/6758;202/1021;;32/749;16/9879;122/4814;139/6983;19/3230;14/3737.html;", "google_scholar": "https://scholar.google.com.hk/citations?user=wth-VbMAAAAJ;KK2nLnQAAAAJ;lxiqnI0AAAAJ;kqTUHSIAAAAJ;https://scholar.google.com/citations?hl=en;;;gfjYZKMAAAAJ;X9lE6O4AAAAJ;uT9CtPYAAAAJ;91sjuWIAAAAJ;E6zbSYgAAAAJ;https://scholar.google.com.tw/citations?user=D_S41X4AAAAJ;QICTEckAAAAJ", "orcid": "0000-0001-8642-8101;;0000-0001-6137-5162;0000-0002-2238-2420;;0000-0002-6292-0529;0009-0004-2998-9817;;;;0000-0001-5693-0416;;;", "linkedin": ";;;;;;annan-wang-1026241a4;zhang-erli/;;;;;;", "or_profile": "~Haoning_Wu1;~Weixia_Zhang1;~Chaofeng_Chen1;~Liang_Liao3;~Chunyi_Li1;~Yixuan_Gao2;~Annan_Wang1;~Erli_Zhang1;~Wenxiu_Sun1;~Qiong_Yan1;~Xiongkuo_Min1;~Guangtao_Zhai1;~Weisi_Lin1;~zicheng_zhang6", "aff": "Nanyang Technological University;Shanghai Jiaotong University;Nanyang Technological University;Nanyang Technological University;Shanghai Artificial Intelligence Laboratory;Shanghai Jiaotong University;Nanyang Technological University;Nanyang Technological University;SenseTime Group Limited;SenseTime Research;Shanghai Jiaotong University;Shanghai Jiaotong University;Nanyang Technological University;Shanghai Jiaotong University", "aff_domain": "ntu.edu.sg;sjtu.edu.cn;ntu.edu.sg;ntu.edu.sg;pjlab.org.cn;sjtu.edu.cn;ntu.edu.sg;ntu.edu.sg;sensetime.com;sensetime.com;sjtu.edu.cn;sjtu.edu.cn;ntu.edu.sg;sjtu.edu.cn", "position": "PhD student;Researcher;Postdoc;Postdoc;Intern;PhD student;Researcher;Undergrad student;Principal Researcher;Research Director;Associate Professor;Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nwu2024qalign,\ntitle={Q-Align: Teaching {LMM}s for Visual Scoring via Discrete Text-Defined Levels},\nauthor={Haoning Wu and Zicheng Zhang and Weixia Zhang and Chaofeng Chen and Liang Liao and Chunyi Li and Yixuan Gao and Annan Wang and Erli Zhang and Wenxiu Sun and Qiong Yan and Xiongkuo Min and Guangtao Zhai and Weisi Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PHjkVjR78A}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8753259, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 14, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2101774103586253104&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": 
"ntu.edu.sg;sjtu.edu.cn;ntu.edu.sg;ntu.edu.sg;pjlab.org.cn;sjtu.edu.cn;ntu.edu.sg;ntu.edu.sg;sensetime.com;sensetime.com;sjtu.edu.cn;sjtu.edu.cn;ntu.edu.sg;sjtu.edu.cn", "author_num": 14, "aff_unique_index": "0;1;0;0;2;1;0;0;3;4;1;1;0;1", "aff_unique_norm": "Nanyang Technological University;Shanghai Jiao Tong University;Shanghai Artificial Intelligence Laboratory;SenseTime Group Limited;SenseTime", "aff_unique_dep": ";;;;SenseTime Research", "aff_unique_url": "https://www.ntu.edu.sg;https://www.sjtu.edu.cn;http://www.shailab.org/;https://www.sensetime.com;https://www.sensetime.com", "aff_unique_abbr": "NTU;SJTU;Shanghai AI Lab;SenseTime;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;1;0;0;1;1;1;1;0;1", "aff_country_unique": "Singapore;China" }, { "title": "RICE: Breaking Through the Training Bottlenecks of Reinforcement Learning with Explanation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34129", "id": "PKJqsZD5nQ", "proceeding": "https://proceedings.mlr.press/v235/cheng24j.html", "pdf": "https://openreview.net/pdf?id=PKJqsZD5nQ", "openreview": "https://openreview.net/forum?id=PKJqsZD5nQ", "author_site": "Zelei Cheng, Xian Wu, Jiahao Yu, Sabrina Yang, Gang Wang, Xinyu Xing", "tldr": "", "abstract": "Deep reinforcement learning (DRL) is playing an increasingly important role in real-world applications. However, obtaining an optimally performing DRL agent for complex tasks, especially with sparse rewards, remains a significant challenge. The training of a DRL agent can be often trapped in a bottleneck without further progress. In this paper, we propose RICE, an innovative refining scheme for reinforcement learning that incorporates explanation methods to break through the training bottlenecks. The high-level idea of RICE is to construct a new initial state distribution that combines both the default initial states and critical states identified through explanation methods, thereby encouraging the agent to explore from the mixed initial states. Through careful design, we can theoretically guarantee that our refining scheme has a tighter sub-optimality bound. We evaluate RICE in various popular RL environments and real-world applications. 
The results demonstrate that RICE significantly outperforms existing refining schemes in enhancing agent performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zelei Cheng;Xian Wu;Jiahao Yu;Sabrina Yang;Gang Wang;Xinyu Xing", "authorids": "~Zelei_Cheng1;~Xian_Wu8;~Jiahao_Yu1;~Sabrina_Yang1;~Gang_Wang13;~Xinyu_Xing3", "gender": ";M;M;F;M;M", "homepage": ";https://nuwuxian.github.io/;https://sherdencooper.github.io/;https://www.linkedin.com/in/sabrina-yang-3673a6293/;https://gangw.cs.illinois.edu/;http://xinyuxing.org/", "dblp": "258/0335;03/5595-7.html;238/6241-1;;71/4292-11.html;", "google_scholar": "https://scholar.google.com/citations?hl=en;ptWUm0EAAAAJ;mB4eowUAAAAJ;;-DfTtxgAAAAJ;71rdofMAAAAJ", "orcid": "0000-0001-7478-933X;;;;0000-0002-8910-8979;", "linkedin": ";;;sabrina-yang-3673a6293/;;", "or_profile": "~Zelei_Cheng1;~Xian_Wu8;~Jiahao_Yu1;~Sabrina_Yang1;~Gang_Wang13;~Xinyu_Xing3", "aff": "Northwestern University;Northwestern University;Northwestern University;Normalyze, Inc.;University of Illinois, Urbana Champaign;Northwestern University", "aff_domain": "northwestern.edu;northwestern.edu;northwestern.edu;normalyze.ai;illinois.edu;northwestern.edu", "position": "PhD student;PhD student;PhD student;Intern;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\ncheng2024rice,\ntitle={{RICE}: Breaking Through the Training Bottlenecks of Reinforcement Learning with Explanation},\nauthor={Zelei Cheng and Xian Wu and Jiahao Yu and Sabrina Yang and Gang Wang and Xinyu Xing},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PKJqsZD5nQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2434801, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6245417976251358014&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 14, "email": "northwestern.edu;northwestern.edu;northwestern.edu;normalyze.ai;illinois.edu;northwestern.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Northwestern University;Normalyze, Inc.;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northwestern.edu;;https://illinois.edu", "aff_unique_abbr": "NU;;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Mixup on Approximate Gromov\u2013Wasserstein Geodesics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34128", "id": "PKdege0U6Z", "proceeding": "https://proceedings.mlr.press/v235/zeng24e.html", "pdf": "https://openreview.net/pdf?id=PKdege0U6Z", "openreview": "https://openreview.net/forum?id=PKdege0U6Z", "author_site": "Zhichen Zeng, Ruizhong Qiu, Zhe Xu, Zhining Liu, Yuchen Yan, Tianxin Wei, Lei Ying, Jingrui He, Hanghang Tong", "tldr": "", "abstract": "Mixup, which generates synthetic training samples on the data manifold, has been shown to be highly effective in augmenting Euclidean data. However, finding a proper data manifold for graph data is non-trivial, as graphs are non-Euclidean data in disparate spaces. Though efforts have been made, most of the existing graph mixup methods neglect the intrinsic geodesic guarantee, thereby generating inconsistent sample-label pairs. 
To address this issue, we propose GeoMix to mixup graphs on the Gromov-Wasserstein (GW) geodesics. A joint space over input graphs is first defined based on the GW distance, and graphs are then transformed into the GW space through equivalence-preserving transformations. We further show that the linear interpolation of the transformed graph pairs defines a geodesic connecting the original pairs on the GW manifold, hence ensuring the consistency between generated samples and labels. An accelerated mixup algorithm on the approximate low-dimensional GW manifold is further proposed. Extensive experiments show that the proposed GeoMix promotes the generalization and robustness of GNN models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhichen Zeng;Ruizhong Qiu;Zhe Xu;Zhining Liu;Yuchen Yan;Tianxin Wei;Lei Ying;Jingrui He;Hanghang Tong", "authorids": "~Zhichen_Zeng1;~Ruizhong_Qiu1;~Zhe_Xu5;~Zhining_Liu1;~Yuchen_Yan1;~Tianxin_Wei1;~Lei_Ying1;~Jingrui_He1;~Hanghang_Tong3", "gender": ";M;M;M;;;M;F;", "homepage": "https://zhichenz98.github.io/;https://q-rz.github.io/;https://pricexu.github.io/;https://zhiningliu.com/;;https://weitianxin.github.io/;http://leiying.engin.umich.edu/;https://www.hejingrui.org;http://tonghanghang.org", "dblp": "345/6632-1;330/9860;97/3701-7;195/4399-2;;277/5800;27/4818;34/2685;58/1757", "google_scholar": "rFdX368AAAAJ;REKarmcAAAAJ;7IhVDFsAAAAJ;5WORAUQAAAAJ;;_LU2-kMAAAAJ;7f3HKI8AAAAJ;hXpZynkAAAAJ;RaINcuUAAAAJ", "orcid": "0000-0002-5534-3401;0009-0000-3253-8890;0000-0002-6675-1398;0000-0003-1828-2109;;0000-0003-4450-2005;;0000-0002-6429-6272;0000-0003-4405-3887", "linkedin": ";ruizhong-qiu/;;zhiningliu/;;tianxin-wei-7063a2180/;;;htong/", "or_profile": "~Zhichen_Zeng1;~Ruizhong_Qiu1;~Zhe_Xu5;~Zhining_Liu1;~Yuchen_Yan1;~Tianxin_Wei1;~Lei_Ying1;~Jingrui_He1;~Hanghang_Tong3", "aff": "University of Illinois Urbana-Champaign;University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;;University of Illinois, Urbana-Champaign;University of Michigan, Ann Arbor;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;illinois.edu;illinois.edu;;uiuc.edu;umich.edu;illinois.edu;illinois.edu", "position": "PhD student;MS student;PhD student;PhD student;;PhD student;Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nzeng2024graph,\ntitle={Graph Mixup on Approximate Gromov{\\textendash}Wasserstein Geodesics},\nauthor={Zhichen Zeng and Ruizhong Qiu and Zhe Xu and Zhining Liu and Yuchen Yan and Tianxin Wei and Lei Ying and Jingrui He and Hanghang Tong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PKdege0U6Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8954264, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15618477522880302068&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "illinois.edu;illinois.edu;illinois.edu;illinois.edu;;uiuc.edu;umich.edu;illinois.edu;illinois.edu", "author_num": 9, "aff_unique_index": "0;0;0;0;1;2;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Illinois;University of Michigan", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://illinois.edu;https://www.umich.edu", "aff_unique_abbr": "UIUC;UIUC;UM", 
"aff_campus_unique_index": "0;0;0;0;0;1;0;0", "aff_campus_unique": "Urbana-Champaign;Ann Arbor", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34127", "id": "PLAGGbssT8", "proceeding": "https://proceedings.mlr.press/v235/wang24bd.html", "pdf": "https://openreview.net/pdf?id=PLAGGbssT8", "openreview": "https://openreview.net/forum?id=PLAGGbssT8", "author_site": "Boxin Wang, Wei Ping, Lawrence McAfee, Peng Xu, Bo Li, Mohammad Shoeybi, Bryan Catanzaro", "tldr": "", "abstract": "Pretraining auto-regressive large language models (LLMs) with retrieval demonstrates better perplexity and factual accuracy by leveraging external databases. However, the size of existing pretrained retrieval-augmented LLM is still limited (e.g., Retro has 7.5B parameters), which limits the effectiveness of instruction tuning and zero-shot generalization. In this work, we introduce Retro 48B, the largest LLM pretrained with retrieval. Specifically, we continue to pretrain a 43B GPT model on additional 100 billion tokens using the Retro augmentation method by retrieving from 1.2 trillion tokens. Notably, the obtained foundation model, Retro 48B, largely outperforms the counterpart GPT 43B trained on 1.2T tokens in terms of perplexity with only 2.58% additional GPU hours, demonstrating the significant scaling potential of the method. After instruction tuning on Retro, InstructRetro demonstrates significant improvement over the instruction-tuned GPT on a wide range of zero-shot tasks. Specifically, the average improvement of InstructRetro is 7% over its GPT counterpart across 8 short-form QA and reading comprehension tasks, 10% over GPT across 4 challenging long-form QA tasks, and 16% over GPT across 3 summarization tasks. Surprisingly, we find that one can ablate the encoder from InstructRetro architecture and directly use its decoder backbone, while achieving comparable results. Our results highlight the promising direction to obtain a better GPT decoder through continued pretraining with retrieval before instruction tuning. 
Our code and checkpoints are publicly available at: https://huggingface.co/nvidia/retro-48b-instruct-4k.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boxin Wang;Wei Ping;Lawrence McAfee;Peng Xu;Bo Li;Mohammad Shoeybi;Bryan Catanzaro", "authorids": "~Boxin_Wang1;~Wei_Ping1;~Lawrence_McAfee1;~Peng_Xu7;~Bo_Li19;~Mohammad_Shoeybi1;~Bryan_Catanzaro1", "gender": ";M;M;M;F;M;M", "homepage": "https://wbx.life;https://wpingnet.github.io/;https://nvidia.com;https://scholar.google.com.hk/citations?user=PQ26NTIAAAAJ&hl=en;http://boli.cs.illinois.edu/;;https://ctnzr.io", "dblp": "236/6319;08/8399.html;;84/586-8;50/3402-26;53/9742;14/4826", "google_scholar": "YOf2ATIAAAAJ;6gKEYRgAAAAJ;;https://scholar.google.com.hk/citations?user=PQ26NTIAAAAJ;K8vJkTcAAAAJ;62ElavIAAAAJ;UZ6kI2AAAAAJ", "orcid": ";;;;;;0000-0003-0034-7728", "linkedin": ";wei-ping/;;;;shoeybi/;bryancatanzaro/", "or_profile": "~Boxin_Wang1;~Wei_Ping1;~Lawrence_McAfee1;~Peng_Xu7;~Bo_Li19;~Mohammad_Shoeybi1;~Bryan_Catanzaro1", "aff": "NVIDIA;NVIDIA;NVIDIA;NVIDIA;University of Illinois, Urbana Champaign;NVIDIA;NVIDIA", "aff_domain": "nvidia.com;nvidia.com;nvidia.com;nvidia.com;illinois.edu;nvidia.com;nvidia.com", "position": "Senior Research Scientist;Principal Researcher;Researcher;Researcher;Assistant Professor;Director of Applied Resesrch;Vice President", "bibtex": "@inproceedings{\nwang2024instructretro,\ntitle={InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining},\nauthor={Boxin Wang and Wei Ping and Lawrence McAfee and Peng Xu and Bo Li and Mohammad Shoeybi and Bryan Catanzaro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PLAGGbssT8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 631148, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18383393396082703199&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nvidia.com;nvidia.com;nvidia.com;nvidia.com;illinois.edu;nvidia.com;nvidia.com", "author_num": 7, "aff_unique_index": "0;0;0;0;1;0;0", "aff_unique_norm": "NVIDIA;University of Illinois Urbana-Champaign", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://illinois.edu", "aff_unique_abbr": "NVIDIA;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sobolev Space Regularised Pre Density Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34126", "id": "PMASooqgoq", "proceeding": "https://proceedings.mlr.press/v235/kozdoba24a.html", "pdf": "https://openreview.net/pdf?id=PMASooqgoq", "openreview": "https://openreview.net/forum?id=PMASooqgoq", "author_site": "Mark Kozdoba, Binyamin Perets, Shie Mannor", "tldr": "", "abstract": "We propose a new approach to non-parametric density estimation that is based on regularizing a Sobolev norm of the density. This method is statistically consistent, and makes the inductive bias of the model clear and interpretable. While there is no closed analytic form for the associated kernel, we show that one can approximate it using sampling. The optimization problem needed to determine the density is non-convex, and standard gradient methods do not perform well. 
However, we show that with an appropriate initialization and using natural gradients, one can obtain well-performing solutions. Finally, while the approach provides pre-densities (i.e. not necessarily integrating to 1), which prevents the use of log-likelihood for cross validation, we show that one can instead adapt Fisher divergence based score matching methods for this task. We evaluate the resulting method on the comprehensive recent anomaly detection benchmark suite, ADBench, and find that it ranks second best among more than 15 algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mark Kozdoba;Binyamin Perets;Shie Mannor", "authorids": "~Mark_Kozdoba1;~Binyamin_Perets1;~Shie_Mannor2", "gender": ";M;M", "homepage": "https://www.linkedin.com/in/mark-kozdoba-5b6bb835/;;https://shie.net.technion.ac.il", "dblp": "161/9885;;20/1669", "google_scholar": "PHE-SswAAAAJ;7jP7ra0AAAAJ;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ", "orcid": "0000-0002-8451-023X;;", "linkedin": "mark-kozdoba-5b6bb835/;benny-perets-876853102/;", "or_profile": "~Mark_Kozdoba1;~Binyamin_Perets1;~Shie_Mannor2", "aff": "Technion;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;campus.technion.ac.il;technion.il", "position": "Principal Researcher;PhD student;Full Professor", "bibtex": "@inproceedings{\nkozdoba2024sobolev,\ntitle={Sobolev Space Regularised Pre Density Models},\nauthor={Mark Kozdoba and Binyamin Perets and Shie Mannor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PMASooqgoq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6663835, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:s9rSBjcfh94J:scholar.google.com/&scioq=Sobolev+Space+Regularised+Pre+Density+Models&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "technion.ac.il;campus.technion.ac.il;technion.il", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Extending Test-Time Augmentation with Metamorphic Relations for Combinatorial Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34125", "id": "PNsdnl8blk", "proceeding": "https://proceedings.mlr.press/v235/wei24i.html", "pdf": "https://openreview.net/pdf?id=PNsdnl8blk", "openreview": "https://openreview.net/forum?id=PNsdnl8blk", "author_site": "Siwei Wei, Xudong Zhang, Zhiyang Zhou, Yan Cai", "tldr": "", "abstract": "The application of machine learning methods to solve combinatorial problems has garnered considerable research interest. In this paper, we propose MAgg (**M**etamorphic **Agg**regation), a method to augment machine learning models for combinatorial problems at inference time using metamorphic relations. MAgg models metamorphic relations using directed graphs, which are then fed to a Graph Neural Network (GNN) model to improve the aggregation of predictions across transformed input instances. 
By incorporating metamorphic relations, MAgg essentially extends standard Test-Time Augmentation (TTA), eliminating the need for label-preserving transformations and expanding its applicability to a broader range of supervised learning tasks for combinatorial problems. We evaluate the proposed MAgg method on three mainstream machine learning tasks for combinatorial problems, namely Boolean Satisfiability Prediction (SAT), Decision Traveling Salesman Problem Satisfiability Prediction (Decision TSP), and Graph Edit Distance Estimation (GED). The evaluation results show significant improvements over base models in all three tasks, corroborating the effectiveness and versatility of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siwei Wei;Xudong Zhang;Zhiyang Zhou;Yan Cai", "authorids": "~Siwei_Wei1;zhangxd20@ios.ac.cn;~Zhiyang_Zhou3;~Yan_Cai3", "gender": "M;;M;M", "homepage": ";;https://github.com/zhiyang3344;http://yancai.site", "dblp": "208/9736;;162/3480;60/3060-1.html", "google_scholar": ";;;https://scholar.google.com.tw/citations?user=QotyxBEAAAAJ", "orcid": "0009-0003-3073-3117;;;", "linkedin": ";;;", "or_profile": "~Siwei_Wei1;zhangxd20@ios.ac.cn;~Zhiyang_Zhou3;~Yan_Cai3", "aff": "Chinese Academy of Sciences, Chinese Academy of Sciences;;University of Chinese Academy of Sciences;Institute of Software, Chinese Academy of Sciences", "aff_domain": "ios.ac.cn;;ucas.ac.cn;iscas.ac.cn", "position": "PhD student;;Postdoc;Full Professor", "bibtex": "@inproceedings{\nwei2024extending,\ntitle={Extending Test-Time Augmentation with Metamorphic Relations for Combinatorial Problems},\nauthor={Siwei Wei and Xudong Zhang and Zhiyang Zhou and Yan Cai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PNsdnl8blk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1046531, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:y8R6L_irmEkJ:scholar.google.com/&scioq=Extending+Test-Time+Augmentation+with+Metamorphic+Relations+for+Combinatorial+Problems&hl=en&as_sdt=0,23", "gs_version_total": 4, "email": "ios.ac.cn;;ucas.ac.cn;iscas.ac.cn", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_unique_dep": ";", "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "CAS;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Prompt-based Visual Alignment for Zero-shot Policy Transfer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34124", "id": "PPoQz8K4GZ", "proceeding": "https://proceedings.mlr.press/v235/gao24r.html", "pdf": "https://openreview.net/pdf?id=PPoQz8K4GZ", "openreview": "https://openreview.net/forum?id=PPoQz8K4GZ", "author_site": "Haihan Gao, Rui Zhang, Qi Yi, Hantao Yao, Haochen Li, Jiaming Guo, Shaohui Peng, Yunkai Gao, QiCheng Wang, Xing Hu, Yuanbo Wen, Zihao Zhang, Zidong Du, Ling Li, Qi Guo, Yunji Chen", "tldr": "", "abstract": "Overfitting has become one of the main obstacles to real-world applications of reinforcement learning (RL). 
Existing methods do not provide an explicit semantic constraint for the feature extractor, hindering the agent from learning a unified cross-domain representation and resulting in performance degradation on unseen domains. Moreover, abundant data from multiple domains are needed. To address these issues, in this work, we propose prompt-based visual alignment (PVA), a robust framework to mitigate the detrimental domain bias in the image for zero-shot policy transfer. Inspired by the fact that a Visual-Language Model (VLM) can serve as a bridge connecting the text space and the image space, we leverage the semantic information contained in a text sequence as an explicit constraint to train a visual aligner. Thus, the visual aligner can map images from multiple domains to a unified domain and achieve good generalization performance. To better depict semantic information, prompt tuning is applied to learn a sequence of learnable tokens. With explicit constraints of semantic information, PVA can learn a unified cross-domain representation under limited access to cross-domain data and achieves great zero-shot generalization ability in unseen domains. We verify PVA on a vision-based autonomous driving task with the CARLA simulator. Experiments show that the agent generalizes well on unseen domains under limited access to multi-domain data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haihan Gao;Rui Zhang;Qi Yi;Hantao Yao;Haochen Li;Jiaming Guo;Shaohui Peng;Yunkai Gao;QiCheng Wang;Xing Hu;Yuanbo Wen;Zihao Zhang;Zidong Du;Ling Li;Qi Guo;Yunji Chen", "authorids": "~Haihan_Gao1;~Rui_Zhang1;~Qi_Yi1;~Hantao_Yao2;~Haochen_Li2;~Jiaming_Guo2;~Shaohui_Peng2;~Yunkai_Gao1;~QiCheng_Wang2;~Xing_Hu3;~Yuanbo_Wen1;~Zihao_Zhang4;~Zidong_Du1;~Ling_Li6;~Qi_Guo4;~Yunji_Chen1", "gender": "M;F;M;M;M;M;M;M;F;M;M;;F;M;M;M", "homepage": ";;;http://www.hantaoyao.com/;https://github.com/Therock90421;;;https://github.com/Outstandingggg;;;http://www.zihaozhang.tech;https://zidongdu.github.io/;;http://novel.ict.ac.cn/qguo;;", "dblp": ";60/2536-40;295/8813;167/3478;49/11531-2;63/8512;44/8056-1.html;;49/10052-1;262/3144;;44/11216;92/5001-1;67/398-1;48/474;246/8768", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;dse6jAsAAAAJ;veu6_ykAAAAJ;;QxfHHQcAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;Hc3iRxUAAAAJ;;;https://scholar.google.com.sg/citations?user=8N9ym9YAAAAJ;;;;", "orcid": ";;;;0000-0003-0813-6351;;0000-0003-4126-7441;;;0000-0002-7775-2724;0000-0001-6859-7518;0000-0002-7603-4210;0000-0001-8877-9052;;;", "linkedin": ";;;;;;;;;;;;;;;", "or_profile": "~Haihan_Gao1;~Rui_Zhang1;~Qi_Yi1;~Hantao_Yao2;~Haochen_Li2;~Jiaming_Guo2;~Yunkai_Gao1;~QiCheng_Wang2;~Xing_Hu3;~Yuanbo_Wen1;~Zihao_Zhang4;~Zidong_Du1;~Ling_Li6;~Qi_Guo4;~Yunji_Chen1;~shaohui_peng1", "aff": "University of Science and Technology of China;Institute of Computing Technology, CAS;University of Science and Technology of China;Institute of Automation, Chinese Academy of Sciences;Institute of Software, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;University of Science and Technology of China;University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Software, CAS;Institute of Computing Technology, Chinese Academy of Sciences;Institute of
Computing Technology, Chinese Academy of Sciences; Chinese Academy of Sciences", "aff_domain": "ustc.edu.cn;ict.ac.cn;ustc.edu.cn;nlpr.ia.ac.cn;iscas.ac.cn;ict.ac.cn;ustc.edu.cn;ucas.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;iscas.ac.cn;ict.ac.cn;ict.ac.cn;iscas.ac.cn", "position": "MS student;Assistant Professor;PhD student;Associate Professor;PhD student;Postdoc;PhD student;MS student;Full Professor;Postdoc;Assistant Professor;Full Professor;Full Professor;Full Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\ngao2024promptbased,\ntitle={Prompt-based Visual Alignment for Zero-shot Policy Transfer},\nauthor={Haihan Gao and Rui Zhang and Qi Yi and Hantao Yao and Haochen Li and Jiaming Guo and Shaohui Peng and Yunkai Gao and QiCheng Wang and Xing Hu and Yuanbo Wen and Zihao Zhang and Zidong Du and Ling Li and Qi Guo and Yunji Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PPoQz8K4GZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 713250, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 16, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Dys4AwX6TC0J:scholar.google.com/&scioq=Prompt-based+Visual+Alignment+for+Zero-shot+Policy+Transfer&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "ustc.edu.cn;ict.ac.cn;ustc.edu.cn;nlpr.ia.ac.cn;iscas.ac.cn;ict.ac.cn;ustc.edu.cn;ucas.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;iscas.ac.cn;ict.ac.cn;ict.ac.cn;iscas.ac.cn", "author_num": 16, "aff_unique_index": "0;1;0;1;1;1;0;2;1;1;3;1;1;1;1;1", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Computing Technology;;", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ict.ac.cn;http://www.ucas.ac.cn;http://www.ict.ac.cn", "aff_unique_abbr": "USTC;CAS;UCAS;ICT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Position: Mission Critical \u2013 Satellite Data is a Distinct Modality in Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34123", "id": "PQ0ERKKYJu", "proceeding": "https://proceedings.mlr.press/v235/rolf24a.html", "pdf": "https://openreview.net/pdf?id=PQ0ERKKYJu", "openreview": "https://openreview.net/forum?id=PQ0ERKKYJu", "author_site": "Esther Rolf, Konstantin Klemmer, Caleb Robinson, Hannah Kerner", "tldr": "", "abstract": "Satellite data has the potential to inspire a seismic shift for machine learning---one in which we rethink existing practices designed for traditional data modalities. As machine learning for satellite data (SatML) gains traction for its real-world impact, our field is at a crossroads. We can either continue applying ill-suited approaches, or we can initiate a new research agenda that centers around the unique characteristics and challenges of satellite data. This position paper argues that satellite data constitutes a distinct modality for machine learning research and that we must recognize it as such to advance the quality and impact of SatML research across theory, methods, and deployment. 
We outline research directions, critical discussion questions, and actionable suggestions to transform SatML from merely an intriguing application area to a dedicated research discipline that helps move the needle on big challenges for machine learning and society.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Esther Rolf;Konstantin Klemmer;Caleb Robinson;Hannah Kerner", "authorids": "~Esther_Rolf1;~Konstantin_Klemmer1;~Caleb_Robinson1;~Hannah_Kerner1", "gender": ";;M;F", "homepage": ";https://konstantinklemmer.github.io/;http://calebrob.com;https://hannah-rae.github.io/", "dblp": ";189/0395;194/7729;218/2646", "google_scholar": ";https://scholar.google.co.uk/citations?user=ltmGyokAAAAJ;cjYgLT0AAAAJ;g5CD7dQAAAAJ", "orcid": ";0000-0002-7096-0133;;0000-0002-3259-7759", "linkedin": ";;;hannahkerner/", "or_profile": "~Esther_Rolf1;~Konstantin_Klemmer1;~Caleb_Robinson1;~Hannah_Kerner1", "aff": ";Microsoft;Microsoft;Arizona State University", "aff_domain": ";microsoft.com;microsoft.com;asu.edu", "position": ";Postdoc;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nrolf2024position,\ntitle={Position: Mission Critical {\\textendash} Satellite Data is a Distinct Modality in Machine Learning},\nauthor={Esther Rolf and Konstantin Klemmer and Caleb Robinson and Hannah Kerner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PQ0ERKKYJu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7340465, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9665880935398896752&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";microsoft.com;microsoft.com;asu.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Microsoft;Arizona State University", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.asu.edu", "aff_unique_abbr": "Microsoft;ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: The Reasonable Person Standard for AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34122", "id": "PQWVUbqQtQ", "proceeding": "https://proceedings.mlr.press/v235/rane24a.html", "pdf": "https://openreview.net/pdf?id=PQWVUbqQtQ", "openreview": "https://openreview.net/forum?id=PQWVUbqQtQ", "tldr": "", "abstract": "As AI systems are increasingly incorporated into domains where human behavior has set the norm, a challenge for AI governance and AI alignment research is to regulate their behavior in a way that is useful and constructive for society. One way to approach this challenge is to ask: how do we govern the human behavior that the models are emulating? To evaluate human behavior, the American legal system often uses the \"Reasonable Person Standard.\" The idea of \"reasonable\" behavior comes up in nearly every area of law. The legal system often judges the actions of parties with respect to what a reasonable person would have done under similar circumstances. This paper argues that the reasonable person standard provides useful guidelines for the type of behavior we should develop, probe, and stress-test in models.
It explains how reasonableness is defined and used in key areas of the law using illustrative cases, how the reasonable person standard could apply to AI behavior in each of these areas and contexts, and how our societal understanding of \"reasonable\" behavior provides useful technical goals for AI researchers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sunayana Rane", "authorids": "~Sunayana_Rane1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nrane2024position,\ntitle={Position: The Reasonable Person Standard for {AI}},\nauthor={Sunayana Rane},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PQWVUbqQtQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 176664, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Wj9OeC3I1G0J:scholar.google.com/&scioq=Position:+The+Reasonable+Person+Standard+for+AI&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "", "author_num": 1 }, { "title": "Think Before You Act: Decision Transformers with Working Memory", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34121", "id": "PSQ5Z920M8", "proceeding": "https://proceedings.mlr.press/v235/kang24b.html", "pdf": "https://openreview.net/pdf?id=PSQ5Z920M8", "openreview": "https://openreview.net/forum?id=PSQ5Z920M8", "author_site": "Jikun Kang, Romain Laroche, Xingdi Yuan, Adam Trischler, Xue Liu, Jie Fu", "tldr": "", "abstract": "Decision Transformer-based decision-making agents have shown the ability to generalize across multiple tasks. However, their performance relies on massive data and computation. We argue that this inefficiency stems from the forgetting phenomenon, in which a model memorizes its behaviors in parameters throughout training. As a result, training on a new task may deteriorate the model's performance on previous tasks. In contrast to LLMs' implicit memory mechanism, the human brain utilizes distributed memory storage, which helps manage and organize multiple skills efficiently, mitigating the forgetting phenomenon. Inspired by this, we propose a working memory module to store, blend, and retrieve information for different downstream tasks. Evaluation results show that the proposed method improves training efficiency and generalization in Atari games and Meta-World object manipulation tasks. 
Moreover, we demonstrate that memory fine-tuning further enhances the adaptability of the proposed architecture.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jikun Kang;Romain Laroche;Xingdi Yuan;Adam Trischler;Xue Liu;Jie Fu", "authorids": "~Jikun_Kang1;~Romain_Laroche1;~Xingdi_Yuan2;~Adam_Trischler1;~Xue_Liu1;~Jie_Fu2", "gender": "M;M;M;M;M;M", "homepage": "https://luciferkonn.github.io;https://www.researchgate.net/profile/Romain_Laroche;https://www.microsoft.com/en-us/research/people/adtrisch/;http://www.cs.mcgill.ca/~xueliu/;https://bigaidream.github.io/;https://xingdi-eric-yuan.github.io/", "dblp": "299/0233;65/9019;177/9137;l/XueLiu;;40/10147", "google_scholar": "Jikun%20Kang;RiIOKJMAAAAJ;https://scholar.google.ca/citations?user=EvUM6UUAAAAJ;https://scholar.google.com.tw/citations?user=rfLIRakAAAAJ;66osleIAAAAJ;hYfE-B8AAAAJ", "orcid": "0009-0001-1334-7092;;;;0000-0002-4494-843X;", "linkedin": "kang-jikun-91993814b/;romain-laroche-6282397/?originalSubdomain=ca;;;;", "or_profile": "~Jikun_Kang1;~Romain_Laroche1;~Adam_Trischler1;~Xue_Liu1;~Jie_Fu1;~Eric_Yuan1", "aff": "Convergence AI;Wayve;;McGill University;Hong Kong University of Science and Technology;Microsoft Research", "aff_domain": "convergence.ai;wayve.ai;;mcgill.ca;ust.hk;microsoft.com", "position": "Researcher;Principal Researcher;;Full Professor;Researcher;Senior Researcher", "bibtex": "@inproceedings{\nkang2024think,\ntitle={Think Before You Act: Decision Transformers with Working Memory},\nauthor={Jikun Kang and Romain Laroche and Xingdi Yuan and Adam Trischler and Xue Liu and Jie Fu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PSQ5Z920M8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2984688, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2693893706142154305&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "convergence.ai;wayve.ai;;mcgill.ca;ust.hk;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Convergence AI;Wayve;McGill University;Hong Kong University of Science and Technology;Microsoft", "aff_unique_dep": ";;;;Microsoft Research", "aff_unique_url": ";https://www.wayve.ai;https://www.mcgill.ca;https://www.ust.hk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": ";;McGill;HKUST;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "1;2;3;4", "aff_country_unique": ";United Kingdom;Canada;China;United States" }, { "title": "An Online Optimization Perspective on First-Order and Zero-Order Decentralized Nonsmooth Nonconvex Stochastic Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34120", "id": "PSzyBN7LIA", "proceeding": "https://proceedings.mlr.press/v235/sahinoglu24a.html", "pdf": "https://openreview.net/pdf?id=PSzyBN7LIA", "openreview": "https://openreview.net/forum?id=PSzyBN7LIA", "author_site": "Emre Sahinoglu, Shahin Shahrampour", "tldr": "", "abstract": "We investigate the finite-time analysis of finding ($\\delta, \\epsilon$)-stationary points for nonsmooth nonconvex objectives in decentralized stochastic optimization. A set of agents aim at minimizing a global function using only their local information by interacting over a network. 
We present a novel algorithm, called Multi Epoch Decentralized Online Learning (ME-DOL), for which we establish the sample complexity in various settings. First, using a recently proposed online-to-nonconvex technique, we show that our algorithm recovers the optimal convergence rate of smooth nonconvex objectives. We then extend our analysis to the nonsmooth setting, building on properties of randomized smoothing and Goldstein-subdifferential sets. We establish the sample complexity of $O(\\delta^{-1}\\epsilon^{-3})$, which to the best of our knowledge is the first finite-time guarantee for decentralized nonsmooth nonconvex stochastic optimization in the first-order setting (without weak-convexity), matching its optimal centralized counterpart. We further prove the same rate for the zero-order oracle setting without using variance reduction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emre Sahinoglu;Shahin Shahrampour", "authorids": "~Emre_Sahinoglu1;~Shahin_Shahrampour2", "gender": "M;", "homepage": ";", "dblp": ";127/7489", "google_scholar": "0eSjG6gAAAAJ;nr4EJS8AAAAJ", "orcid": ";", "linkedin": "https://linkedin.com/in/emresahinoglu607;shahin-shahrampour-425a8823/", "or_profile": "~Emre_Sahinoglu1;~Shahin_Shahrampour2", "aff": "Northeastern University;Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsahinoglu2024an,\ntitle={An Online Optimization Perspective on First-Order and Zero-Order Decentralized Nonsmooth Nonconvex Stochastic Optimization},\nauthor={Emre Sahinoglu and Shahin Shahrampour},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PSzyBN7LIA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3268251, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7434931183136101111&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "northeastern.edu;northeastern.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Private Vector Mean Estimation in the Shuffle Model: Optimal Rates Require Many Messages", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34119", "id": "PTGJOUlQ68", "proceeding": "https://proceedings.mlr.press/v235/asi24a.html", "pdf": "https://openreview.net/pdf?id=PTGJOUlQ68", "openreview": "https://openreview.net/forum?id=PTGJOUlQ68", "author_site": "Hilal Asi, Vitaly Feldman, Jelani Nelson, Huy Nguyen, Kunal Talwar, Samson Zhou", "tldr": "", "abstract": "We study the problem of private vector mean estimation in the shuffle model of privacy where $n$ users each have a unit vector $v^{(i)} \\in \\mathbb{R}^d$. We propose a new multi-message protocol that achieves the optimal error using $O(\\min(n\\varepsilon^2,d))$ messages per user. Moreover, we show that any (unbiased) protocol that achieves optimal error must require each user to send $\\Omega(\\min(n\\varepsilon^2,d)/\\log(n))$ messages, demonstrating the optimality of our message complexity up to logarithmic factors. 
Additionally, we study the single-message setting and design a protocol that achieves mean squared error $O(dn^{d/(d+2)}\\varepsilon^{-4/(d+2)})$. Moreover, we show that *any* single-message protocol must incur mean squared error $\\Omega(dn^{d/(d+2)})$, showing that our protocol is optimal in the standard setting where $\\varepsilon = \\Theta(1)$. Finally, we study robustness to malicious users and show that malicious users can incur large additive error with a single shuffler.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hilal Asi;Vitaly Feldman;Jelani Nelson;Huy Nguyen;Kunal Talwar;Samson Zhou", "authorids": "~Hilal_Asi1;~Vitaly_Feldman1;~Jelani_Nelson2;~Huy_Nguyen3;~Kunal_Talwar1;~Samson_Zhou1", "gender": "M;M;M;;M;M", "homepage": "http://web.stanford.edu/~asi/;https://vtaly.net;http://www.kunaltalwar.org;https://samsonzhou.github.io/;https://www.khoury.northeastern.edu/~hlnguyen/;http://people.eecs.berkeley.edu/~minilek", "dblp": ";67/1162;06/3696;179/2683;62/3796;68/3296.html", "google_scholar": "QGcz9-kAAAAJ;GqZBmfgAAAAJ;XD_01h8AAAAJ;NpjsgocAAAAJ;https://scholar.google.com.tw/citations?user=MDCu0WEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": ";;kunal-talwar-128a6159;;;minilek/", "or_profile": "~Hilal_Asi1;~Vitaly_Feldman1;~Kunal_Talwar1;~Samson_Zhou1;~Huy_Nguyen1;~Jelani_Nelson1", "aff": "Apple;Apple AI Research;Apple;Texas A&M University - College Station;Northeastern University;University of California, Berkeley", "aff_domain": "apple.com;apple.com;apple.com;tamu.edu;northeastern.edu;berkeley.edu", "position": "Researcher;Research Scientist;Research Scientist;Assistant Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nasi2024private,\ntitle={Private Vector Mean Estimation in the Shuffle Model: Optimal Rates Require Many Messages},\nauthor={Hilal Asi and Vitaly Feldman and Jelani Nelson and Huy Nguyen and Kunal Talwar and Samson Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PTGJOUlQ68}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 432011, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13200638913846685365&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "apple.com;apple.com;apple.com;tamu.edu;northeastern.edu;berkeley.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "Apple;Texas A&M University;Northeastern University;University of California, Berkeley", "aff_unique_dep": "Apple Inc.;;;", "aff_unique_url": "https://www.apple.com;https://www.tamu.edu;https://www.northeastern.edu;https://www.berkeley.edu", "aff_unique_abbr": "Apple;TAMU;NEU;UC Berkeley", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";College Station;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generalization in Kernel Regression Under Realistic Assumptions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34118", "id": "PY3bKuorBI", "proceeding": "https://proceedings.mlr.press/v235/barzilai24a.html", "pdf": "https://openreview.net/pdf?id=PY3bKuorBI", "openreview": "https://openreview.net/forum?id=PY3bKuorBI", "author_site": "Daniel Barzilai, Ohad Shamir", "tldr": "", "abstract": "It is by now well-established that modern over-parameterized models seem to elude the bias-variance 
tradeoff and generalize well despite overfitting noise. Many recent works attempt to analyze this phenomenon in the relatively tractable setting of kernel regression. However, as we argue in detail, most past works on this topic either make unrealistic assumptions, or focus on a narrow problem setup. This work aims to provide a unified theory to upper bound the excess risk of kernel regression for nearly all common and realistic settings. When applied to common kernels, our results imply benign overfitting in high input dimensions, nearly tempered overfitting in fixed dimensions, and explicit convergence rates for regularized regression. As a by-product, we obtain time-dependent bounds for neural networks trained in the kernel regime. Our results rely on new relative perturbation bounds for the eigenvalues of kernel matrices, which may be of independent interest. These reveal a self-regularization phenomenon, whereby a heavy tail in the eigendecomposition of the kernel implicitly leads to good generalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Barzilai;Ohad Shamir", "authorids": "~Daniel_Barzilai1;~Ohad_Shamir1", "gender": "M;", "homepage": ";http://www.wisdom.weizmann.ac.il/~shamiro/", "dblp": "334/4656;12/5897", "google_scholar": "B6zVOFoAAAAJ;all0DHsAAAAJ", "orcid": ";", "linkedin": "daniel-barzilai-1a9b61219/;", "or_profile": "~Daniel_Barzilai1;~Ohad_Shamir1", "aff": "Weizmann Institute of Science;Weizmann Institute", "aff_domain": "weizmann.ac.il;weizmann.ac.il", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nbarzilai2024generalization,\ntitle={Generalization in Kernel Regression Under Realistic Assumptions},\nauthor={Daniel Barzilai and Ohad Shamir},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PY3bKuorBI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 786708, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=944603360716883429&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "weizmann.ac.il;weizmann.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Weizmann Institute of Science", "aff_unique_dep": "", "aff_unique_url": "https://www.weizmann.org.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "QBMK: Quantum-based Matching Kernels for Un-attributed Graphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34117", "id": "PYDCwWvbG7", "proceeding": "https://proceedings.mlr.press/v235/bai24a.html", "pdf": "https://openreview.net/pdf?id=PYDCwWvbG7", "openreview": "https://openreview.net/forum?id=PYDCwWvbG7", "author_site": "Lu Bai, Lixin Cui, Ming Li, Yue Wang, Edwin Hancock", "tldr": "", "abstract": "In this work, we develop a new Quantum-based Matching Kernel (QBMK) for un-attributed graphs, by computing the kernel-based similarity between the quantum Shannon entropies of aligned vertices through the Continuous-time Quantum Walk (CTQW). 
The theoretical analysis reveals that the proposed QBMK kernel not only addresses the shortcoming of neglecting the structural correspondence information between graphs arising in existing R-convolution graph kernels, but also overcomes the problem of neglecting the structural differences between pairs of aligned vertices arising in existing vertex-based matching kernels. Moreover, the proposed QBMK kernel can simultaneously capture both global and local structural characteristics through the quantum Shannon entropies. Experimental evaluations on standard graph datasets demonstrate that the proposed QBMK kernel is able to outperform state-of-the-art graph kernels and graph deep learning approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lu Bai;Lixin Cui;Ming Li;Yue Wang;Edwin Hancock", "authorids": "~Lu_Bai3;cuilixin@cufe.edu.cn;~Ming_Li15;~Yue_Wang41;~Edwin_Hancock1", "gender": "M;;M;;M", "homepage": ";;https://mingli-ai.github.io;;https://pure.york.ac.uk/portal/en/persons/edwin-r-hancock", "dblp": "26/1137-1;;181/2821-65;;h/EdwinRHancock", "google_scholar": ";;Z7yEoOQAAAAJ;;EjDU2ncAAAAJ", "orcid": ";;0000-0002-1218-2804;;0000-0003-4496-2028", "linkedin": ";;;;edwin-hancock-11913117/?originalSubdomain=uk", "or_profile": "~Lu_Bai3;cuilixin@cufe.edu.cn;~Ming_Li15;~Yue_Wang41;~Edwin_Hancock1", "aff": ";;Zhejiang Normal University;;Anhui University", "aff_domain": ";;zjnu.edu.cn;;ahu.edu.cn", "position": ";;Full Professor;;Honorary Professor", "bibtex": "@inproceedings{\nbai2024qbmk,\ntitle={{QBMK}: Quantum-based Matching Kernels for Un-attributed Graphs},\nauthor={Lu Bai and Lixin Cui and Ming Li and Yue Wang and Edwin Hancock},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PYDCwWvbG7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1059410, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tXf3E3ZbswkJ:scholar.google.com/&scioq=QBMK:+Quantum-based+Matching+Kernels+for+Un-attributed+Graphs&hl=en&as_sdt=0,44", "gs_version_total": 5, "email": ";;zjnu.edu.cn;;ahu.edu.cn", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Zhejiang Normal University;Anhui University", "aff_unique_dep": ";", "aff_unique_url": "http://www.zjnu.edu.cn;http://www.ahu.edu.cn/", "aff_unique_abbr": "ZJNU;AHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "A Sober Look at LLMs for Material Discovery: Are They Actually Good for Bayesian Optimization Over Molecules?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34116", "id": "Pa3GyTe3kf", "proceeding": "https://proceedings.mlr.press/v235/kristiadi24a.html", "pdf": "https://openreview.net/pdf?id=Pa3GyTe3kf", "openreview": "https://openreview.net/forum?id=Pa3GyTe3kf", "author_site": "Agustinus Kristiadi, Felix Strieth-Kalthoff, Marta Skreta, Pascal Poupart, Alan Aspuru-Guzik, Geoff Pleiss", "tldr": "", "abstract": "Automation is one of the cornerstones of contemporary material discovery. Bayesian optimization (BO) is an essential part of such workflows, enabling scientists to leverage prior domain knowledge into efficient exploration of a large molecular space. 
While such prior knowledge can take many forms, there has been significant fanfare around the ancillary scientific knowledge encapsulated in large language models (LLMs). However, existing work thus far has only explored LLMs for heuristic materials searches. Indeed, recent work obtains the uncertainty estimate---an integral part of BO---from point-estimated, _non-Bayesian_ LLMs. In this work, we study the question of whether LLMs are actually useful to accelerate principled _Bayesian_ optimization in the molecular space. We take a sober, dispassionate stance in answering this question. This is done by carefully (i) viewing LLMs as fixed feature extractors for standard but principled BO surrogate models and by (ii) leveraging parameter-efficient finetuning methods and Bayesian neural networks to obtain the posterior of the LLM surrogate. Our extensive experiments with real-world chemistry problems show that LLMs can be useful for BO over molecules, but only if they have been pretrained or finetuned with domain-specific data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Agustinus Kristiadi;Felix Strieth-Kalthoff;Marta Skreta;Pascal Poupart;Alan Aspuru-Guzik;Geoff Pleiss", "authorids": "~Agustinus_Kristiadi1;~Felix_Strieth-Kalthoff1;~Marta_Skreta1;~Pascal_Poupart2;~Alan_Aspuru-Guzik2;~Geoff_Pleiss1", "gender": ";M;F;M;M;M", "homepage": "https://agustinus.kristia.de;https://fsk-lab.github.io;;https://cs.uwaterloo.ca/~ppoupart;http://matter.toronto.edu;http://geoffpleiss.com", "dblp": "215/3954;;255/5167;26/2122;;199/1693.html", "google_scholar": "_1qe2mYAAAAJ;https://scholar.google.ca/citations?user=LGZDAgIAAAAJ;https://scholar.google.ca/citations?user=OYd3hjYAAAAJ;https://scholar.google.ca/citations?user=KhAJWroAAAAJ;Ag_6KEgAAAAJ;XO8T-Y4AAAAJ", "orcid": "0000-0003-1615-1121;0000-0003-1357-5500;;;0000-0002-8277-4434;0000-0002-7009-0967", "linkedin": "agustinus-kristiadi/;felix-strieth-kalthoff-11115b150/?locale=en_US;martaskreta/;;;", "or_profile": "~Agustinus_Kristiadi1;~Felix_Strieth-Kalthoff1;~Marta_Skreta1;~Pascal_Poupart2;~Alan_Aspuru-Guzik2;~Geoff_Pleiss1", "aff": "Vector Institute;University of Toronto;Department of Computer Science, University of Toronto;University of Waterloo;University of Toronto;Vector Institute", "aff_domain": "vectorinstitute.ai;utoronto.ca;cs.toronto.edu;uwaterloo.ca;utoronto.ca;vectorinstitute.ai", "position": "Postdoc;Postdoc;PhD student;Full Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nkristiadi2024a,\ntitle={A Sober Look at {LLM}s for Material Discovery: Are They Actually Good for Bayesian Optimization Over Molecules?},\nauthor={Agustinus Kristiadi and Felix Strieth-Kalthoff and Marta Skreta and Pascal Poupart and Alan Aspuru-Guzik and Geoff Pleiss},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Pa3GyTe3kf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1172976, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13426905606348665598&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "vectorinstitute.ai;utoronto.ca;cs.toronto.edu;uwaterloo.ca;utoronto.ca;vectorinstitute.ai", "author_num": 6, "aff_unique_index": "0;1;1;2;1;0", "aff_unique_norm": "Vector Institute;University of Toronto;University of Waterloo", "aff_unique_dep": ";;", "aff_unique_url": 
"https://vectorinstitute.ai/;https://www.utoronto.ca;https://uwaterloo.ca", "aff_unique_abbr": "Vector Institute;U of T;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Robust Universal Adversarial Perturbations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34115", "id": "Paw0BkPaTN", "proceeding": "https://proceedings.mlr.press/v235/xu24v.html", "pdf": "https://openreview.net/pdf?id=Paw0BkPaTN", "openreview": "https://openreview.net/forum?id=Paw0BkPaTN", "author_site": "Changming Xu, Gagandeep Singh", "tldr": "", "abstract": "Universal Adversarial Perturbations (UAPs) are imperceptible, image-agnostic vectors that cause deep neural networks (DNNs) to misclassify inputs with high probability. In practical attack scenarios, adversarial perturbations may undergo transformations such as changes in pixel intensity, scaling, etc. before being added to DNN inputs. Existing methods do not create UAPs robust to these real-world transformations, thereby limiting their applicability in practical attack scenarios. In this work, we introduce and formulate UAPs robust against real-world transformations. We build an iterative algorithm using probabilistic robustness bounds and construct UAPs robust to transformations generated by composing arbitrary sub-differentiable transformation functions. We perform an extensive evaluation on the popular CIFAR-10 and ILSVRC 2012 datasets measuring our UAPs' robustness under a wide range common, real-world transformations such as rotation, contrast changes, etc. We further show that by using a set of primitive transformations our method generalizes well to unseen transformations such as fog, JPEG compression, etc. 
Our results show that our method can generate UAPs up to 23% more robust than state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Changming Xu;Gagandeep Singh", "authorids": "~Changming_Xu2;~Gagandeep_Singh1", "gender": "M;M", "homepage": "https://cmxu.io;https://ggndpsngh.github.io/", "dblp": ";64/3747-1", "google_scholar": ";https://scholar.google.ch/citations?user=m4b2ruEAAAAJ", "orcid": "0000-0003-3079-5652;0000-0002-9299-2961", "linkedin": "calvin-xu-8236451a2/;gagandeep-singh-1bb01b49/", "or_profile": "~Changming_Xu2;~Gagandeep_Singh1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxu2024robust,\ntitle={Robust Universal Adversarial Perturbations},\nauthor={Changming Xu and Gagandeep Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Paw0BkPaTN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4328892, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2644493942476496226&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "illinois.edu;illinois.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neural SPH: Improved Neural Modeling of Lagrangian Fluid Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34114", "id": "Pbey7LqBRl", "proceeding": "https://proceedings.mlr.press/v235/toshev24a.html", "pdf": "https://openreview.net/pdf?id=Pbey7LqBRl", "openreview": "https://openreview.net/forum?id=Pbey7LqBRl", "author_site": "Artur Toshev, Jonas Erbesdobler, Nikolaus Adams, Johannes Brandstetter", "tldr": "", "abstract": "Smoothed particle hydrodynamics (SPH) is omnipresent in modern engineering and scientific disciplines. SPH is a class of Lagrangian schemes that discretize fluid dynamics via finite material points that are tracked through the evolving velocity field. Due to the particle-like nature of the simulation, graph neural networks (GNNs) have emerged as appealing and successful surrogates. However, the practical utility of such GNN-based simulators relies on their ability to faithfully model physics, providing accurate and stable predictions over long time horizons - which is a notoriously hard problem. In this work, we identify particle clustering originating from tensile instabilities as one of the primary pitfalls. Based on these insights, we enhance both training and rollout inference of state-of-the-art GNN-based simulators with varying components from standard SPH solvers, including pressure, viscous, and external force components. All Neural SPH-enhanced simulators achieve better performance than the baseline GNNs, often by orders of magnitude in terms of rollout error, allowing for significantly longer rollouts and significantly better physics modeling. Code available under https://github.com/tumaer/neuralsph.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Artur Toshev;Jonas A. 
Erbesdobler;Nikolaus A. Adams;Johannes Brandstetter", "authorids": "~Artur_Toshev1;~Jonas_A._Erbesdobler1;~Nikolaus_A._Adams1;~Johannes_Brandstetter1", "gender": ";;M;M", "homepage": "https://arturtoshev.github.io/;;;", "dblp": "344/3672;;;251/8691", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?user=baGPClkAAAAJ;https://scholar.google.de/citations?user=Oer7zf4AAAAJ;KiRvOHcAAAAJ", "orcid": "0000-0003-0486-5565;0009-0007-8535-3401; 0000-0001-5048-8639;", "linkedin": ";;;", "or_profile": "~Artur_Toshev1;~Jonas_A._Erbesdobler1;~Nikolaus_A._Adams1;~Johannes_Brandstetter1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t M\u00fcnchen;Microsoft", "aff_domain": "tum.de;tum.de;tum.de;microsoft.com", "position": "PhD student;MS student;Full Professor;Researcher", "bibtex": "@inproceedings{\ntoshev2024neural,\ntitle={Neural {SPH}: Improved Neural Modeling of Lagrangian Fluid Dynamics},\nauthor={Artur Toshev and Jonas A. Erbesdobler and Nikolaus A. Adams and Johannes Brandstetter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Pbey7LqBRl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6876670, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8032572330235354227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "tum.de;tum.de;tum.de;microsoft.com", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.tum.de;https://www.microsoft.com", "aff_unique_abbr": "TUM;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Germany;United States" }, { "title": "Prospector Heads: Generalized Feature Attribution for Large Models & Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34113", "id": "PjVqEErDgK", "proceeding": "https://proceedings.mlr.press/v235/machiraju24a.html", "pdf": "https://openreview.net/pdf?id=PjVqEErDgK", "openreview": "https://openreview.net/forum?id=PjVqEErDgK", "author_site": "Gautam Machiraju, Alexander Derry, Arjun Desai, Neel Guha, Amir-Hossein Karimi, James Zou, Russ B Altman, Christopher Re, Parag Mallick", "tldr": "", "abstract": "Feature attribution, the ability to localize regions of the input data that are relevant for classification, is an important capability for ML models in scientific and biomedical domains. Current methods for feature attribution, which rely on \"explaining\" the predictions of end-to-end classifiers, suffer from imprecise feature localization and are inadequate for use with small sample sizes and high-dimensional datasets due to computational challenges. We introduce prospector heads, an efficient and interpretable alternative to explanation-based attribution methods that can be applied to any encoder and any data modality. Prospector heads generalize across modalities through experiments on sequences (text), images (pathology), and graphs (protein structures), outperforming baseline attribution methods by up to 26.3 points in mean localization AUPRC. We also demonstrate how prospector heads enable improved interpretation and discovery of class-specific patterns in input data. 
Through their high performance, flexibility, and generalizability, prospectors provide a framework for improving trust and transparency for ML models in complex domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gautam Machiraju;Alexander Derry;Arjun D Desai;Neel Guha;Amir-Hossein Karimi;James Zou;Russ B Altman;Christopher Re;Parag Mallick", "authorids": "~Gautam_Machiraju1;~Alexander_Derry1;~Arjun_D_Desai1;~Neel_Guha1;~Amir-Hossein_Karimi1;~James_Zou1;~Russ_B_Altman1;~Christopher_Re1;~Parag_Mallick1", "gender": "M;M;;M;;;M;;M", "homepage": "https://gmachiraju.github.io/;;;http://neelguha.com;https://sites.google.com/view/amirhkarimi;;https://rbaltman.people.stanford.edu;;http://www.mallicklab.org", "dblp": "305/8425;;;130/0311;;;;;88/2810", "google_scholar": "nxq3KagAAAAJ;L3_m0UYAAAAJ;;YI5N4HQAAAAJ;https://scholar.google.ca/citations?user=umI56k0AAAAJ;23ZXZvEAAAAJ;s6XjtCMAAAAJ;;", "orcid": "0000-0003-4743-0123;0000-0003-2076-1184;;;;;0000-0003-3859-2905;;", "linkedin": "https://www.linkedin.com/gmachi;;;;amir-hossein-karimi-4a008538/;;russ-altman-81b17/;;", "or_profile": "~Gautam_Machiraju1;~Alexander_Derry1;~Arjun_D_Desai1;~Neel_Guha1;~Amir-Hossein_Karimi1;~James_Zou1;~Russ_B_Altman1;~Christopher_Re1;~Parag_Mallick1", "aff": "Stanford University;Stanford University;;Computer Science Department, Stanford University;University of Waterloo;Stanford University;Stanford University;;Stanford University", "aff_domain": "stanford.edu;stanford.edu;;cs.stanford.edu;uwaterloo.ca;stanford.edu;stanford.edu;;stanford.edu", "position": "PhD student;PhD student;;PhD student;Assistant Professor;Assistant Professor;Full Professor;;Associate Professor", "bibtex": "@inproceedings{\nmachiraju2024prospector,\ntitle={Prospector Heads: Generalized Feature Attribution for Large Models \\& Data},\nauthor={Gautam Machiraju and Alexander Derry and Arjun D Desai and Neel Guha and Amir-Hossein Karimi and James Zou and Russ B Altman and Christopher Re and Parag Mallick},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PjVqEErDgK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8219131, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9627195403607211480&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "stanford.edu;stanford.edu;;cs.stanford.edu;uwaterloo.ca;stanford.edu;stanford.edu;;stanford.edu", "author_num": 9, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "Stanford University;University of Waterloo", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://uwaterloo.ca", "aff_unique_abbr": "Stanford;UW", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "title": "ConTextual: Evaluating Context-Sensitive Text-Rich Visual Reasoning in Large Multimodal Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34112", "id": "PjiRSyUt7e", "proceeding": "https://proceedings.mlr.press/v235/wadhawan24a.html", "pdf": "https://openreview.net/pdf?id=PjiRSyUt7e", "openreview": "https://openreview.net/forum?id=PjiRSyUt7e", "author_site": "Rohan Wadhawan, Hritik Bansal, Kai-Wei Chang, Nanyun Peng", "tldr": "", "abstract": "Many real-world tasks require an agent to reason jointly over text and visual objects, 
e.g., navigating in public spaces, which we refer to as context-sensitive text-rich visual reasoning. Specifically, these tasks require an understanding of the context in which the text interacts with visual elements within an image. However, there is a lack of existing datasets to benchmark the state-of-the-art multimodal models' capability on context-sensitive text-rich visual reasoning. In this paper, we introduce ConTextual, a novel dataset featuring human-crafted instructions that require context-sensitive reasoning for text-rich images. We conduct experiments to assess the performance of 14 foundation models (GPT-4V, Gemini-Pro-Vision, LLaVA-Next) and establish a human performance baseline. Further, we perform human evaluations of the model responses and observe a significant performance gap of 30.8% between GPT-4V (the current best-performing Large Multimodal Model) and human performance. Our fine-grained analysis reveals that GPT-4V encounters difficulties interpreting time-related data and infographics. However, it demonstrates proficiency in comprehending abstract visual contexts such as memes and quotes. Finally, our qualitative analysis uncovers various factors contributing to poor performance, including a lack of precise visual perception and hallucinations. Our dataset, code, and leaderboard can be found on the project page https://con-textual.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rohan Wadhawan;Hritik Bansal;Kai-Wei Chang;Nanyun Peng", "authorids": "~Rohan_Wadhawan1;~Hritik_Bansal2;~Kai-Wei_Chang1;~Nanyun_Peng1", "gender": ";M;M;F", "homepage": ";https://sites.google.com/view/hbansal;http://kwchang.net;https://violetpeng.github.io/", "dblp": ";239/5922;18/2428;117/4036", "google_scholar": ";gAKTYtoAAAAJ;fqDBtzYAAAAJ;XxRXvX0AAAAJ", "orcid": ";;0000-0001-5365-0072;", "linkedin": ";hritik-bansal/;kai-wei-chang-41239040;", "or_profile": "~Rohan_Wadhawan1;~Hritik_Bansal2;~Kai-Wei_Chang1;~Nanyun_Peng1", "aff": ";University of California, Los Angeles;Amazon;University of California, Los Angeles", "aff_domain": ";ucla.edu;amazon.com;ucla.edu", "position": ";PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nwadhawan2024contextual,\ntitle={ConTextual: Evaluating Context-Sensitive Text-Rich Visual Reasoning in Large Multimodal Models},\nauthor={Rohan Wadhawan and Hritik Bansal and Kai-Wei Chang and Nanyun Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PjiRSyUt7e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5318011, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6870638609360303902&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";ucla.edu;amazon.com;ucla.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Los Angeles;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.ucla.edu;https://www.amazon.com", "aff_unique_abbr": "UCLA;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning 1-Bit Tiny Object Detector with Discriminative Feature Refinement", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34111", "id": "PlM30j9i80", "proceeding":
"https://proceedings.mlr.press/v235/xu24z.html", "pdf": "https://openreview.net/pdf?id=PlM30j9i80", "openreview": "https://openreview.net/forum?id=PlM30j9i80", "author_site": "Sheng Xu, Mingze Wang, Yanjing Li, Mingbao Lin, Baochang Zhang, David Doermann, Xiao Sun", "tldr": "", "abstract": "1-bit detectors show impressive performance comparable to their real-valued counterparts when detecting commonly sized objects while exhibiting significant performance degradation on tiny objects. The challenge stems from the fact that high-level features extracted by 1-bit convolutions seem less compelling to reveal the discriminative foreground features. To address these issues, we introduce a Discriminative Feature Refinement method for 1-bit Detectors (DFR-Det), aiming to enhance the discriminative ability of foreground representation for tiny objects in aerial images. This is accomplished by refining the feature representation using an information bottleneck (IB) to achieve a distinctive representation of tiny objects. Specifically, we introduce a new decoder with a foreground mask, aiming to enhance the discriminative ability of high-level features for the target but suppress the background impact. Additionally, our decoder is simple but effective and can be easily mounted on existing detectors without extra burden added to the inference procedure. Extensive experiments on various tiny object detection (TOD) tasks demonstrate DFR-Det's superiority over state-of-the-art 1-bit detectors. For example, 1-bit FCOS achieved by DFR-Det achieves the 12.8% AP on AI-TOD dataset, approaching the performance of the real-valued counterpart.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sheng Xu;Mingze Wang;Yanjing Li;Mingbao Lin;Baochang Zhang;David Doermann;Xiao Sun", "authorids": "~Sheng_Xu4;~Mingze_Wang3;~Yanjing_Li2;~Mingbao_Lin1;~Baochang_Zhang1;~David_Doermann2;~Xiao_Sun8", "gender": "M;M;;M;M;M;M", "homepage": ";https://github.com/Meize0729;;http://lmb.bjbxit.cn/;https://dblp.uni-trier.de/pid/80/3887-1.html;https://cse.buffalo.edu/~doermann/;https://jimmysuen.github.io/", "dblp": "10/1887-7.html;;62/201;211/5903;https://dblp.uni-trier.de/pid/80/3887-1.html;;151/8845", "google_scholar": "https://scholar.google.com.hk/citations?user=ZLR31ccAAAAJ;;2rE-GM8AAAAJ;Dp3L1bsAAAAJ;;RoGOW9AAAAAJ;wYIe0tYAAAAJ", "orcid": "0000-0002-7742-275X;;0000-0003-3745-8755;0000-0003-1764-1894;;0000-0003-1639-4561;", "linkedin": ";;;mingbao-lin-890444105/;;david-doermann-bb7757/;", "or_profile": "~Sheng_Xu4;~Mingze_Wang3;~Yanjing_Li2;~Mingbao_Lin1;~Baochang_Zhang1;~David_Doermann2;~Xiao_Sun2", "aff": "Beihang University;Beihang University;Beihang University;Xiamen University;Beihang University;State University of New York at Buffalo;Shanghai Artificial Intelligence Laboratory", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;xmu.edu.cn;buaa.edu.cn;buffalo.edu;pjlab.org.cn", "position": "PhD student;MS student;PhD student;PhD student;Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nxu2024learning,\ntitle={Learning 1-Bit Tiny Object Detector with Discriminative Feature Refinement},\nauthor={Sheng Xu and Mingze Wang and Yanjing Li and Mingbao Lin and Baochang Zhang and David Doermann and Xiao Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PlM30j9i80}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2700694, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, 
"gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6369034184727137079&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;xmu.edu.cn;buaa.edu.cn;buffalo.edu;pjlab.org.cn", "author_num": 7, "aff_unique_index": "0;0;0;1;0;2;3", "aff_unique_norm": "Beihang University;Xiamen University;State University of New York at Buffalo;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.xmu.edu.cn;https://www.buffalo.edu;http://www.shailab.org/", "aff_unique_abbr": "BUAA;XMU;SUNY Buffalo;Shanghai AI Lab", "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Consistent Diffusion Meets Tweedie: Training Exact Ambient Diffusion Models with Noisy Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34110", "id": "PlVjIGaFdH", "proceeding": "https://proceedings.mlr.press/v235/daras24a.html", "pdf": "https://openreview.net/pdf?id=PlVjIGaFdH", "openreview": "https://openreview.net/forum?id=PlVjIGaFdH", "author_site": "Giannis Daras, Alexandros Dimakis, Constantinos Daskalakis", "tldr": "", "abstract": "Ambient diffusion is a recently proposed framework for training diffusion models using corrupted data. Both Ambient Diffusion and alternative SURE-based approaches for learning diffusion models from corrupted data resort to approximations which deteriorate performance. We present the first framework for training diffusion models that provably sample from the uncorrupted distribution given only noisy training data, solving an open problem in Ambient diffusion. Our key technical contribution is a method that uses a double application of Tweedie's formula and a consistency loss function that allows us to extend sampling at noise levels below the observed data noise. We also provide further evidence that diffusion models memorize from their training sets by identifying extremely corrupted images that are almost perfectly reconstructed, raising copyright and privacy concerns. Our method for training using corrupted samples can be used to mitigate this problem. We demonstrate this by fine-tuning Stable Diffusion XL to generate samples from a distribution using only noisy samples. 
Our framework reduces the amount of memorization of the fine-tuning dataset while maintaining competitive performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Giannis Daras;Alex Dimakis;Constantinos Costis Daskalakis", "authorids": "~Giannis_Daras1;~Alex_Dimakis1;~Constantinos_Costis_Daskalakis1", "gender": "M;M;M", "homepage": "https://giannisdaras.github.io/;https://people.eecs.berkeley.edu/~alexdimakis/;http://people.csail.mit.edu/costis/", "dblp": "254/2703;19/5000.html;", "google_scholar": "LaScvbQAAAAJ;JSFmVQEAAAAJ;iTv2cOgAAAAJ", "orcid": ";;", "linkedin": ";alex-dimakis-b1b20320/;", "or_profile": "~Giannis_Daras1;~Alex_Dimakis1;~Constantinos_Costis_Daskalakis1", "aff": "University of Texas, Austin;University of Texas at Austin;Massachusetts Institute of Technology", "aff_domain": "utexas.edu;utexas.edu;mit.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\ndaras2024consistent,\ntitle={Consistent Diffusion Meets Tweedie: Training Exact Ambient Diffusion Models with Noisy Data},\nauthor={Giannis Daras and Alex Dimakis and Constantinos Costis Daskalakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PlVjIGaFdH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3603558, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1647212041279964627&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "utexas.edu;utexas.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Texas at Austin;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://web.mit.edu", "aff_unique_abbr": "UT Austin;MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Vocabulary for Universal Approximation: A Linguistic Perspective of Mapping Compositions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34109", "id": "PnyYgWMMwj", "proceeding": "https://proceedings.mlr.press/v235/cai24a.html", "pdf": "https://openreview.net/pdf?id=PnyYgWMMwj", "openreview": "https://openreview.net/forum?id=PnyYgWMMwj", "tldr": "", "abstract": "In recent years, deep learning-based sequence models, such as language models, have received much attention and success, which has pushed researchers to explore the possibility of transforming non-sequential problems into a sequential form. Following this thought, deep neural networks can be represented as composite functions of a sequence of mappings, linear or nonlinear, where each composition can be viewed as a word. However, the weights of linear mappings are undetermined and hence require an infinite number of words. In this article, we investigate the finite case and constructively prove the existence of a finite vocabulary $V=\\{\\phi_i: \\mathbb{R}^d \\to \\mathbb{R}^d \\mid i=1,\\dots,n\\}$ with $n=O(d^2)$ for universal approximation. That is, for any continuous mapping $f: \\mathbb{R}^d \\to \\mathbb{R}^d$, compact domain $\\Omega$, and $\\varepsilon>0$, there is a sequence of mappings $\\phi_{i_1}, \\dots, \\phi_{i_m} \\in V$, $m \\in \\mathbb{Z}^+$, such that the composition $\\phi_{i_m} \\circ \\cdots \\circ \\phi_{i_1}$ approximates $f$ on $\\Omega$ with an error less than $\\varepsilon$.
Our results demonstrate an unusual approximation power of mapping compositions and motivate a novel compositional model for regular languages.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongqiang Cai", "authorids": "~Yongqiang_Cai1", "gender": "M", "homepage": "", "dblp": "228/6809", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2666-0539", "linkedin": "", "or_profile": "~Yongqiang_Cai1", "aff": "Beijing Normal University", "aff_domain": "bnu.edu.cn", "position": "Lecturer", "bibtex": "@inproceedings{\ncai2024vocabulary,\ntitle={Vocabulary for Universal Approximation: A Linguistic Perspective of Mapping Compositions},\nauthor={Yongqiang Cai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PnyYgWMMwj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 490977, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13576541971883970788&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "bnu.edu.cn", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Beijing Normal University", "aff_unique_dep": "", "aff_unique_url": "https://www.bnu.edu.cn", "aff_unique_abbr": "BNU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Bridging Data Gaps in Diffusion Models with Adversarial Noise-Based Transfer Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34108", "id": "PpBs2iL0jv", "proceeding": "https://proceedings.mlr.press/v235/wang24ap.html", "pdf": "https://openreview.net/pdf?id=PpBs2iL0jv", "openreview": "https://openreview.net/forum?id=PpBs2iL0jv", "author_site": "Xiyu Wang, Baijiong Lin, Daochang Liu, YINGCONG CHEN, Chang Xu", "tldr": "", "abstract": "Diffusion Probabilistic Models (DPMs) show significant potential in image generation, yet their performance hinges on having access to large datasets. Previous works, like Generative Adversarial Networks (GANs), have tackled the limited data problem by transferring pre-trained models learned with sufficient data. However, those methods are hard to utilize in DPMs because of the distinct differences between DPM-based and GAN-based methods, which show in DPMs' unique iterative denoising process and their need for many timesteps with non-targeted noise. In this paper, we propose a novel DPM-based transfer learning method, ANT, to address the limited data problem. It includes two strategies: similarity-guided training, which boosts transfer with a classifier, and adversarial noise selection, which adaptively chooses targeted noise based on the input image.
Extensive experiments in the context of few-shot image generation tasks demonstrate that our method is not only efficient but also excels in terms of image quality and diversity when compared to existing GAN-based and DDPM-based methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiyu Wang;Baijiong Lin;Daochang Liu;Ying-Cong Chen;Chang Xu", "authorids": "~Xiyu_Wang2;~Baijiong_Lin1;~Daochang_Liu1;~Ying-Cong_Chen1;~Chang_Xu4", "gender": "M;M;M;M;", "homepage": ";https://baijiong-lin.github.io/;https://finspire13.github.io;https://www.yingcong.me/;https://sydney.edu.au/engineering/about/our-people/academic-staff/c-xu.html", "dblp": ";279/2950;222/2701;137/6578;97/2966-2", "google_scholar": ";KVdbYTYAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=n7j4bJUAAAAJ;N4F_3eoAAAAJ", "orcid": ";0000-0002-4257-0226;;;0000-0002-4756-0609", "linkedin": "%E6%9B%A6%E5%AE%87-%E7%8E%8B-66b6aa1b3/;;;;", "or_profile": "~Xiyu_Wang2;~Baijiong_Lin1;~Daochang_Liu1;~Ying-Cong_Chen1;~Charles_Xu1", "aff": "University of Sydney;The Hong Kong University of Science and Technology (Guangzhou);University of Sydney;Hong Kong University of Science and Technology;University of Sydney", "aff_domain": "usyd.edu.au;connect.hkust-gz.edu.cn;usyd.edu.au;hkust-gz.edu.cn;sydney.eud.au", "position": "PhD student;PhD student;Postdoc;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024bridging,\ntitle={Bridging Data Gaps in Diffusion Models with Adversarial Noise-Based Transfer Learning},\nauthor={Xiyu Wang and Baijiong Lin and Daochang Liu and Ying-Cong Chen and Chang Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PpBs2iL0jv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2891859, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18233294786549196092&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "usyd.edu.au;connect.hkust-gz.edu.cn;usyd.edu.au;hkust-gz.edu.cn;sydney.eud.au", "author_num": 5, "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "University of Sydney;Hong Kong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.sydney.edu.au;https://www.ust.hk", "aff_unique_abbr": "USYD;HKUST", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Guangzhou;Hong Kong SAR", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "Australia;China" }, { "title": "Efficient Exploration for LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34107", "id": "PpPZ6W7rxy", "proceeding": "https://proceedings.mlr.press/v235/dwaracherla24a.html", "pdf": "https://openreview.net/pdf?id=PpPZ6W7rxy", "openreview": "https://openreview.net/forum?id=PpPZ6W7rxy", "author_site": "Vikranth Dwaracherla, Seyed Mohammad Asghari, Botao Hao, Benjamin Van Roy", "tldr": "", "abstract": "We present evidence of substantial benefit from efficient exploration in gathering human feedback to improve large language models. In our experiments, an agent sequentially generates queries while fitting a reward model to the feedback received. Our best-performing agent generates queries using double Thompson sampling, with uncertainty represented by an epistemic neural network. 
Our results demonstrate that efficient exploration enables high levels of performance with far fewer queries. Further, both uncertainty estimation and the choice of exploration scheme play critical roles.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vikranth Dwaracherla;Seyed Mohammad Asghari;Botao Hao;Benjamin Van Roy", "authorids": "~Vikranth_Dwaracherla1;~Seyed_Mohammad_Asghari1;~Botao_Hao1;~Benjamin_Van_Roy3", "gender": "M;;;", "homepage": "https://vikranth.people.stanford.edu/;;https://haobotao000.github.io/;https://web.stanford.edu/~bvr", "dblp": "182/7585;;222/2211;41/4314.html", "google_scholar": "ir7j5AkAAAAJ;;;05sMX8MAAAAJ", "orcid": ";;;", "linkedin": ";seyed-mohammad-asghari;;", "or_profile": "~Vikranth_Dwaracherla1;~Seyed_Mohammad_Asghari1;~Botao_Hao1;~Benjamin_Van_Roy3", "aff": "Google DeepMind;Google DeepMind;Google Deepmind;", "aff_domain": "deepmind.com;deepmind.com;google.com;", "position": "Researcher;Research Engineer;Research Scientist;", "bibtex": "@inproceedings{\ndwaracherla2024efficient,\ntitle={Efficient Exploration for {LLM}s},\nauthor={Vikranth Dwaracherla and Seyed Mohammad Asghari and Botao Hao and Benjamin Van Roy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PpPZ6W7rxy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 791773, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18288998842436153311&as_sdt=5,39&sciodt=0,39&hl=en", "gs_version_total": 6, "email": "deepmind.com;deepmind.com;google.com;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google DeepMind;DeepMind", "aff_unique_url": "https://deepmind.com;https://deepmind.com", "aff_unique_abbr": "DeepMind;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Position: Bayesian Deep Learning is Needed in the Age of Large-Scale AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34106", "id": "PrmxFWI1Fr", "proceeding": "https://proceedings.mlr.press/v235/papamarkou24b.html", "pdf": "https://openreview.net/pdf?id=PrmxFWI1Fr", "openreview": "https://openreview.net/forum?id=PrmxFWI1Fr", "author_site": "Theodore Papamarkou, Maria Skoularidou, Konstantina Palla, Laurence Aitchison, Julyan Arbel, David Dunson, Maurizio Filippone, Vincent Fortuin, Philipp Hennig, Jose Miguel Hernandez-Lobato, Aliaksandr Hubin, Alexander Immer, Theofanis Karaletsos, Khan Emtiyaz, Agustinus Kristiadi, Yingzhen Li, Stephan Mandt, Chris Nemeth, Michael A Osborne, Tim G. J. Rudner, David R\u00fcgamer, Yee-Whye Teh, Max Welling, Andrew Wilson, Ruqi Zhang", "tldr": "", "abstract": "In the current landscape of deep learning research, there is a predominant emphasis on achieving high predictive accuracy in supervised tasks involving large image and language datasets. However, a broader perspective reveals a multitude of overlooked metrics, tasks, and data types, such as uncertainty, active and continual learning, and scientific data, that demand attention. Bayesian deep learning (BDL) constitutes a promising avenue, offering advantages across these diverse settings. This paper posits that BDL can elevate the capabilities of deep learning. 
It revisits the strengths of BDL, acknowledges existing challenges, and highlights some exciting research avenues aimed at addressing these obstacles. Looking ahead, the discussion focuses on possible ways to combine large-scale foundation models with BDL to unlock their full potential.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Theodore Papamarkou;Maria Skoularidou;Konstantina Palla;Laurence Aitchison;Julyan Arbel;David Dunson;Maurizio Filippone;Vincent Fortuin;Philipp Hennig;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Aliaksandr Hubin;Alexander Immer;Theofanis Karaletsos;Mohammad Emtiyaz Khan;Agustinus Kristiadi;Yingzhen Li;Stephan Mandt;Christopher Nemeth;Michael A Osborne;Tim G. J. Rudner;David R\u00fcgamer;Yee Whye Teh;Max Welling;Andrew Gordon Wilson;Ruqi Zhang", "authorids": "~Theodore_Papamarkou1;~Maria_Skoularidou1;~Konstantina_Palla1;~Laurence_Aitchison1;~Julyan_Arbel1;~David_Dunson1;~Maurizio_Filippone1;~Vincent_Fortuin1;~Philipp_Hennig1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;aliaksah@math.uio.no;~Alexander_Immer1;~Theofanis_Karaletsos1;~Mohammad_Emtiyaz_Khan1;~Agustinus_Kristiadi1;~Yingzhen_Li1;~Stephan_Mandt1;~Christopher_Nemeth1;~Michael_A_Osborne1;~Tim_G._J._Rudner2;~David_R\u00fcgamer1;~Yee_Whye_Teh2;~Max_Welling1;~Andrew_Gordon_Wilson1;~Ruqi_Zhang1", "gender": "M;F;F;;M;;M;M;M;;;;M;M;;F;;M;;;M;;M;Not Specified;F", "homepage": "https://www.theopapamarkou.com/;;https://konstantinapalla.netlify.app/;http://www.gatsby.ucl.ac.uk/~laurence/;http://www.julyanarbel.com/;https://www.daviddunson.com/;;https://fortuin.github.io/;http://mml.inf.uni-tuebingen.de;;;;http://karaletsos.com/;https://emtiyaz.github.io/;https://agustinus.kristia.de;http://yingzhenli.net/home/en/;;http://www.lancs.ac.uk/~nemeth/;;;https://davidruegamer.github.io/;;https://staff.fnwi.uva.nl/m.welling/;https://cims.nyu.edu/~andrewgw;https://ruqizhang.github.io/", "dblp": ";166/1561.html;38/2865;155/1918.html;172/8198;;35/5597;218/7489;08/9077;;;;31/11191;58/10432;215/3954;117/9230;;88/10513;;;220/5560;;16/2286;65/10453;", "google_scholar": "ydMfbhAAAAAJ;https://scholar.google.co.uk/citations?user=Lkm90i8AAAAJ;https://scholar.google.co.uk/citations?user=7QmCrjcAAAAJ;;Q7P4K3wAAAAJ;https://scholar.google.co.uk/citations?user=KwEOawwAAAAJ;https://scholar.google.com.tw/citations?user=ILUeAloAAAAJ;https://scholar.google.ch/citations?user=XBlrYTIAAAAJ;https://scholar.google.de/citations?user=UeG5w08AAAAJ;;;;zrxafGsAAAAJ;https://scholar.google.com/citations?hl=en;_1qe2mYAAAAJ;https://scholar.google.se/citations?hl=en;;https://scholar.google.co.uk/citations?user=17-Ze24AAAAJ;;;https://scholar.google.de/citations?user=_DYguksAAAAJ;;https://scholar.google.nl/citations?user=8200InoAAAAJ;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ;4ojpmc8AAAAJ", "orcid": "0000-0002-9689-543X;;;;0000-0002-2525-4416;;;0000-0002-0640-2671;0000-0001-7293-6092;;;;;;0000-0003-1615-1121;;;0000-0002-9084-3866;;;;;0000-0003-1484-2121;;", "linkedin": "papamarkou/;maria-skoularidou-1289b62a/;;;julyanarbel/;;;vincent-fortuin-42426b134/;;;;;;;agustinus-kristiadi/;;;christopher-nemeth-815963233/;;;;;;;", "or_profile": 
"~Theodore_Papamarkou1;~Maria_Skoularidou1;~Konstantina_Palla1;~Laurence_Aitchison1;~Julyan_Arbel1;~David_Dunson1;~Maurizio_Filippone1;~Vincent_Fortuin1;~Philipp_Hennig1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;aliaksah@math.uio.no;~Alexander_Immer1;~Theofanis_Karaletsos1;~Mohammad_Emtiyaz_Khan1;~Agustinus_Kristiadi1;~Yingzhen_Li1;~Stephan_Mandt1;~Christopher_Nemeth1;~Michael_A_Osborne1;~Tim_G._J._Rudner2;~David_R\u00fcgamer1;~Yee_Whye_Teh2;~Max_Welling1;~Andrew_Gordon_Wilson1;~Ruqi_Zhang1", "aff": "University of Manchester;Broad Institute;Spotify Research;University of Bristol;Inria;Duke University;Eurecom;Helmholtz AI;University of T\u00fcbingen;;;;Pyramidal, Inc;RIKEN Center for AI Project;Vector Institute;Imperial College London;;Lancaster University;;;LMU Munich;;University of Amsterdam;New York University;Purdue University", "aff_domain": "manchester.ac.uk;broadinstitute.org;spotify.com;bristol.ac.uk;inria.fr;duke.edu;eurecom.fr;helmholtz.ai;uni-tuebingen.de;;;;pyramidal.ai;riken.jp;vectorinstitute.ai;imperial.ac.uk;;lancaster.ac.uk;;;lmu.de;;uva.nl;nyu.edu;purdue.edu", "position": "Full Professor;Postdoc;Researcher;Assistant Professor;Researcher;Full Professor;Associate Professor;Principal Researcher;Full Professor;;;;Researcher;Full Professor;Postdoc;Associate Professor;;Full Professor;;;Associate Professor;;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\npapamarkou2024position,\ntitle={Position: Bayesian Deep Learning is Needed in the Age of Large-Scale {AI}},\nauthor={Theodore Papamarkou and Maria Skoularidou and Konstantina Palla and Laurence Aitchison and Julyan Arbel and David Dunson and Maurizio Filippone and Vincent Fortuin and Philipp Hennig and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato and Aliaksandr Hubin and Alexander Immer and Theofanis Karaletsos and Mohammad Emtiyaz Khan and Agustinus Kristiadi and Yingzhen Li and Stephan Mandt and Christopher Nemeth and Michael A Osborne and Tim G. J. 
Rudner and David R{\\\"u}gamer and Yee Whye Teh and Max Welling and Andrew Gordon Wilson and Ruqi Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PrmxFWI1Fr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 510857, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 25, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7972611535590656652&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "manchester.ac.uk;broadinstitute.org;spotify.com;bristol.ac.uk;inria.fr;duke.edu;eurecom.fr;helmholtz.ai;uni-tuebingen.de;;;;pyramidal.ai;riken.jp;vectorinstitute.ai;imperial.ac.uk;;lancaster.ac.uk;;;lmu.de;;uva.nl;nyu.edu;purdue.edu", "author_num": 25, "aff_unique_index": "0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17", "aff_unique_norm": "University of Manchester;Broad Institute;Spotify;University of Bristol;INRIA;Duke University;EURECOM;Helmholtz Association of German Research Centres;University of T\u00fcbingen;Pyramidal, Inc;RIKEN;Vector Institute;Imperial College London;Lancaster University;Ludwig Maximilian University of Munich;University of Amsterdam;New York University;Purdue University", "aff_unique_dep": ";;Spotify Research;;;;;Helmholtz AI;;;Center for AI Project;;;;;;;", "aff_unique_url": "https://www.manchester.ac.uk;https://www.broadinstitute.org;https://www.spotify.com/research;https://www.bristol.ac.uk;https://www.inria.fr;https://www.duke.edu;https://www.eurecom.fr;https://www.helmholtz-ai.de;https://www.uni-tuebingen.de/;;https://www.riken.jp/en/;https://vectorinstitute.ai/;https://www.imperial.ac.uk;https://www.lancaster.ac.uk;https://www.lmu.de;https://www.uva.nl;https://www.nyu.edu;https://www.purdue.edu", "aff_unique_abbr": "UoM;Broad;Spotify;Bristol;Inria;Duke;;Helmholtz AI;Uni T\u00fcbingen;;RIKEN;Vector Institute;ICL;Lancaster;LMU;UvA;NYU;Purdue", "aff_campus_unique_index": "1", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;1;2;0;3;1;3;4;4;1;5;6;0;0;4;7;1;1", "aff_country_unique": "United Kingdom;United States;Sweden;France;Germany;Japan;Canada;Netherlands" }, { "title": "Causal Representation Learning from Multiple Distributions: A General Setting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34105", "id": "Pte6iiXvpf", "proceeding": "https://proceedings.mlr.press/v235/zhang24br.html", "pdf": "https://openreview.net/pdf?id=Pte6iiXvpf", "openreview": "https://openreview.net/forum?id=Pte6iiXvpf", "author_site": "Kun Zhang, Shaoan Xie, Ignavier Ng, Yujia Zheng", "tldr": "", "abstract": "In many problems, the measured variables (e.g., image pixels) are just mathematical functions of the latent causal variables (e.g., the underlying concepts or objects). For the purpose of making predictions in changing environments or making proper changes to the system, it is helpful to recover the latent causal variables $Z_i$ and their causal relations represented by graph $\\mathcal{G}_Z$. This problem has recently been known as causal representation learning. This paper is concerned with a general, completely nonparametric setting of causal representation learning from multiple distributions (arising from heterogeneous data or nonstationary time series), without assuming hard interventions behind distribution changes. 
We aim to develop general solutions in this fundamental case; as a by-product, this helps reveal the unique benefit offered by other assumptions such as parametric causal models or hard interventions. We show that under the sparsity constraint on the recovered graph over the latent variables and suitable sufficient change conditions on the causal influences, interestingly, one can recover the moralized graph of the underlying directed acyclic graph, and the recovered latent variables and their relations are related to the underlying causal model in a specific, nontrivial way. In some cases, most latent variables can even be recovered up to component-wise transformations. Experimental results verify our theoretical claims.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kun Zhang;Shaoan Xie;Ignavier Ng;Yujia Zheng", "authorids": "~Kun_Zhang1;~Shaoan_Xie4;~Ignavier_Ng1;~Yujia_Zheng1", "gender": "M;;M;M", "homepage": "http://www.andrew.cmu.edu/user/kunz1/;https://shaoan.net;https://ignavierng.github.io/;https://yjzheng.com", "dblp": "96/3115-1;205/9276.html;251/3037;245/6109-1.html", "google_scholar": "RGoypN4AAAAJ;mChB-hQAAAAJ;;https://scholar.google.co.uk/citations?user=ioiW248AAAAJ", "orcid": ";;;0009-0003-5225-6366", "linkedin": ";;;", "or_profile": "~Kun_Zhang1;~Shaoan_Xie4;~Ignavier_Ng1;~Yujia_Zheng1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;cmu.edu", "position": "Associate Professor;PhD student;PhD student;PhD student", "bibtex": "@inproceedings{\nzhang2024causal,\ntitle={Causal Representation Learning from Multiple Distributions: A General Setting},\nauthor={Kun Zhang and Shaoan Xie and Ignavier Ng and Yujia Zheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Pte6iiXvpf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1743837, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=189618091708802019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cmu.edu;cmu.edu;cmu.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Building Socially-Equitable Public Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34104", "id": "PudBRuNa8r", "proceeding": "https://proceedings.mlr.press/v235/liu24bw.html", "pdf": "https://openreview.net/pdf?id=PudBRuNa8r", "openreview": "https://openreview.net/forum?id=PudBRuNa8r", "author_site": "Yejia Liu, Jianyi Yang, Pengfei Li, Tongxin Li, Shaolei Ren", "tldr": "", "abstract": "Public models offer predictions for a variety of downstream tasks and have played a crucial role in various AI applications, showcasing their proficiency in accurate predictions. However, the exclusive emphasis on prediction accuracy may not align with the diverse end objectives of downstream agents. Recognizing the public model's predictions as a service, we advocate for integrating the objectives of downstream agents into the optimization process.
Concretely, to address performance disparities and foster fairness among heterogeneous agents in training, we propose a novel Equitable Objective. This objective, coupled with a policy gradient algorithm, is crafted to train the public model to produce a more equitable/uniform performance distribution across downstream agents, each with their unique concerns. Both theoretical analysis and empirical case studies have proven the effectiveness of our method in advancing performance equity across diverse downstream agents utilizing the public model for their decision-making. Codes and datasets are released at https://github.com/Ren-Research/Socially-Equitable-Public-Models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yejia Liu;Jianyi Yang;Pengfei Li;Tongxin Li;Shaolei Ren", "authorids": "~Yejia_Liu1;~Jianyi_Yang1;~Pengfei_Li2;~Tongxin_Li1;~Shaolei_Ren1", "gender": "F;M;M;M;", "homepage": "https://liuyejia.github.io/;https://jyang-ai.github.io;https://www.cs.ucr.edu/~pli081/;https://tongxin.me/;", "dblp": "215/4938.html;124/1315;;140/7353;", "google_scholar": "https://scholar.google.ca/citations?user=bST-gYQAAAAJ;n7UUdJQAAAAJ;irA8gqoAAAAJ;qyNc3CkAAAAJ;", "orcid": ";;0000-0003-3257-9929;;", "linkedin": "yejia-martha-liu-159410113/?originalSubdomain=ca;jianyi-yang-b7a9181a6/;;;", "or_profile": "~Yejia_Liu1;~Jianyi_Yang1;~Pengfei_Li2;~Tongxin_Li1;~Shaolei_Ren1", "aff": "University of California, Riverside;University of California, Riverside;University of California, Riverside;The Chinese University of Hong Kong, Shenzhen;", "aff_domain": "cs.ucr.edu;ucr.edu;ucr.edu;cuhk.edu.cn;", "position": "PhD student;Researcher;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nliu2024building,\ntitle={Building Socially-Equitable Public Models},\nauthor={Yejia Liu and Jianyi Yang and Pengfei Li and Tongxin Li and Shaolei Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PudBRuNa8r}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1609315, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:V2uBsT9KDxgJ:scholar.google.com/&scioq=Building+Socially-Equitable+Public+Models&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "cs.ucr.edu;ucr.edu;ucr.edu;cuhk.edu.cn;", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Riverside;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucr.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "UCR;CUHK", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Riverside;Shenzhen", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Zeroth-Order Methods for Constrained Nonconvex Nonsmooth Stochastic Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34103", "id": "PxHmxoFOgI", "proceeding": "https://proceedings.mlr.press/v235/liu24j.html", "pdf": "https://openreview.net/pdf?id=PxHmxoFOgI", "openreview": "https://openreview.net/forum?id=PxHmxoFOgI", "author_site": "Zhuanghua Liu, Cheng Chen, Luo Luo, Bryan Kian Hsiang Low", "tldr": "", "abstract": "This paper studies the problem of solving nonconvex nonsmooth optimization over a closed convex set. 
Most previous works tackle such problems by transforming the constrained problem into an unconstrained one that can be solved by techniques developed in the unconstrained setting. However, they only provide asymptotic convergence analysis for their methods. In this work, we provide a non-asymptotic analysis for solving constrained nonconvex nonsmooth optimization. We first generalize the classical gradient mapping and the Frank\u2013Wolfe gap to the nonsmooth setting. Then we introduce novel notions of approximate stationarity concerning such generalized quantities. We also propose several stochastic zeroth-order algorithms for the problem, along with non-asymptotic convergence guarantees for attaining the proposed approximate stationarity. Finally, we conduct numerical experiments that demonstrate the effectiveness of our algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuanghua Liu;Cheng Chen;Luo Luo;Bryan Kian Hsiang Low", "authorids": "~Zhuanghua_Liu2;~Cheng_Chen9;~Luo_Luo1;~Bryan_Kian_Hsiang_Low1", "gender": "M;M;M;M", "homepage": ";https://chengchen8.github.io/;https://luoluo-sds.github.io/;http://www.comp.nus.edu.sg/~lowkh", "dblp": "195/8237.html;10/217-15;https://dblp.org/pers/hd/l/Luo:Luo;97/4877", "google_scholar": ";https://scholar.google.com/citations?hl=en;NggI9EsAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";0000-0002-9094-0869;;", "linkedin": ";;;", "or_profile": "~Zhuanghua_Liu2;~Cheng_Chen9;~Luo_Luo1;~Bryan_Kian_Hsiang_Low1", "aff": "National University of Singapore;East China Normal University;Fudan University;National University of Singapore", "aff_domain": "nus.edu.sg;ecnu.edu.cn;fudan.edu.cn;nus.edu.sg", "position": "PhD student;Associate Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024zerothorder,\ntitle={Zeroth-Order Methods for Constrained Nonconvex Nonsmooth Stochastic Optimization},\nauthor={Zhuanghua Liu and Cheng Chen and Luo Luo and Bryan Kian Hsiang Low},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PxHmxoFOgI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 479504, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17366147255544097421&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "nus.edu.sg;ecnu.edu.cn;fudan.edu.cn;nus.edu.sg", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "National University of Singapore;East China Normal University;Fudan University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;http://www.ecnu.edu.cn;https://www.fudan.edu.cn", "aff_unique_abbr": "NUS;ECNU;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Density Ratio Estimation with Doubly Strong Robustness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34102", "id": "PykISfqvet", "proceeding": "https://proceedings.mlr.press/v235/nagumo24a.html", "pdf": "https://openreview.net/pdf?id=PykISfqvet", "openreview": "https://openreview.net/forum?id=PykISfqvet", "author_site": "Ryosuke Nagumo, Hironori Fujisawa", "tldr": "", "abstract": "We develop two density ratio estimation (DRE) methods with robustness to outliers.
Both are based on divergences with a weight function that weakens the adverse effects of outliers. One is based on the Unnormalized Kullback-Leibler divergence, called Weighted DRE, and its optimization is a convex problem. The other is based on the \u03b3-divergence, called \u03b3-DRE, which remedies the normalizing-term problem of Weighted DRE. Its optimization is a DC (Difference of Convex functions) problem and needs more computation than a convex problem. These methods have doubly strong robustness, which means robustness to the heavy contamination of both the reference and target distributions. Numerical experiments show that our proposals are more robust than the previous methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ryosuke Nagumo;Hironori Fujisawa", "authorids": "~Ryosuke_Nagumo1;~Hironori_Fujisawa1", "gender": ";M", "homepage": ";https://sites.google.com/view/hironorifujisawa/home/english", "dblp": ";26/7233.html", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Ryosuke_Nagumo1;~Hironori_Fujisawa1", "aff": ";Graduate University for Advanced Studies", "aff_domain": ";soken.ac.jp", "position": ";Full Professor", "bibtex": "@inproceedings{\nnagumo2024density,\ntitle={Density Ratio Estimation with Doubly Strong Robustness},\nauthor={Ryosuke Nagumo and Hironori Fujisawa},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PykISfqvet}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1745211, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EFTXHQKdW4oJ:scholar.google.com/&scioq=Density+Ratio+Estimation+with+Doubly+Strong+Robustness&hl=en&as_sdt=0,14", "gs_version_total": 5, "email": ";soken.ac.jp", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Graduate University for Advanced Studies", "aff_unique_dep": "", "aff_unique_url": "http://www.gucas.ac.jp", "aff_unique_abbr": "GUAS", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Diagnosing the Compositional Knowledge of Vision Language Models from a Game-Theoretic View", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34101", "id": "PzjDsfYwLC", "proceeding": "https://proceedings.mlr.press/v235/wang24n.html", "pdf": "https://openreview.net/pdf?id=PzjDsfYwLC", "openreview": "https://openreview.net/forum?id=PzjDsfYwLC", "author_site": "Jin Wang, Shichao Dong, Yapeng Zhu, kelu Yao, Weidong Zhao, Chao Li, Ping Luo", "tldr": "", "abstract": "Compositional reasoning capabilities are usually considered fundamental skills that characterize human perception. Recent studies show that current Vision Language Models (VLMs) surprisingly lack sufficient knowledge with respect to such capabilities. To this end, we propose to thoroughly diagnose the composition representations encoded by VLMs, systematically revealing the potential cause for this weakness. Specifically, we propose evaluation methods from a novel game-theoretic view to assess the vulnerability of VLMs on different aspects of compositional understanding, e.g., relations and attributes. Extensive experimental results demonstrate and validate several insights into the limitations of VLMs in compositional reasoning, which provide useful and reliable guidance for future studies.
The deliverables will be updated [here](https://vlms-compositionality-gametheory.github.io/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jin Wang;Shichao Dong;Yapeng Zhu;kelu Yao;Weidong Zhao;Chao Li;Ping Luo", "authorids": "~Jin_Wang16;~Shichao_Dong3;~Yapeng_Zhu1;~kelu_Yao1;~Weidong_Zhao2;~Chao_Li22;~Ping_Luo2", "gender": "M;M;M;M;M;M;", "homepage": "https://jinjinw.com;;;https://scholar.google.com/citations?user=hX9B4O4AAAAJ&hl=zh-CN;https://github.com/Dd-12138;https://scholar.google.com/citations?user=gF8h0HMAAAAJ&hl=zh-CN&oi=sra;http://luoping.me/", "dblp": ";;;205/8747;;66/190-28;54/4989-2.html", "google_scholar": "https://scholar.google.com/citations?hl=en;A6dVGqEAAAAJ;https://scholar.google.co.uk/citations?hl=en;hX9B4O4AAAAJ;;https://scholar.google.com.hk/citations?user=gF8h0HMAAAAJ;https://scholar.google.com.hk/citations?hl=en", "orcid": "0000-0002-0533-4523;;;0000-0002-4891-3197;;0009-0009-5932-4409;0000-0002-6685-7950", "linkedin": ";;;;;;", "or_profile": "~Jin_Wang16;~Shichao_Dong3;~Yapeng_Zhu1;~kelu_Yao1;~Weidong_Zhao2;~Chao_Li22;~Luo_Ping2", "aff": "The University of Hong Kong;Baidu;;Zhejiang Lab, Zhejiang Lab;\u56db\u5ddd\u5e08\u8303\u5927\u5b66;zhejianglab;The University of Hong Kong", "aff_domain": "connect.hku.hk;baidu.com;;zhejianglab.com;sicnu.edu;zhejianglab.com;hku.hk", "position": "PhD student;Researcher;;Assistant Professor;MS student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024diagnosing,\ntitle={Diagnosing the Compositional Knowledge of Vision Language Models from a Game-Theoretic View},\nauthor={Jin Wang and Shichao Dong and Yapeng Zhu and kelu Yao and Weidong Zhao and Chao Li and Ping Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PzjDsfYwLC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7341810, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11234215117675248252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "connect.hku.hk;baidu.com;;zhejianglab.com;sicnu.edu;zhejianglab.com;hku.hk", "author_num": 7, "aff_unique_index": "0;1;2;3;2;0", "aff_unique_norm": "University of Hong Kong;Baidu;Zhejiang Lab;Sichuan Normal University", "aff_unique_dep": ";Baidu, Inc.;;", "aff_unique_url": "https://www.hku.hk;https://www.baidu.com;https://www.zhejianglab.com;http://www.sctu.edu.cn", "aff_unique_abbr": "HKU;Baidu;;SCTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "CogBench: a large language model walks into a psychology lab", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34100", "id": "Q3104y8djk", "proceeding": "https://proceedings.mlr.press/v235/coda-forno24a.html", "pdf": "https://openreview.net/pdf?id=Q3104y8djk", "openreview": "https://openreview.net/forum?id=Q3104y8djk", "author_site": "Julian Coda-Forno, Marcel Binz, Jane Wang, Eric Schulz", "tldr": "", "abstract": "Large language models (LLMs) have significantly advanced the field of artificial intelligence. Yet, evaluating them comprehensively remains challenging. We argue that this is partly due to the predominant focus on performance metrics in most benchmarks. 
This paper introduces *CogBench*, a benchmark that includes ten behavioral metrics derived from seven cognitive psychology experiments. This novel approach offers a toolkit for phenotyping LLMs\u2019 behavior. We apply *CogBench* to 40 LLMs, yielding a rich and diverse dataset. We analyze this data using statistical multilevel modeling techniques, accounting for the nested dependencies among fine-tuned versions of specific LLMs. Our study highlights the crucial role of model size and reinforcement learning from human feedback (RLHF) in improving performance and aligning with human behavior. Interestingly, we find that open-source models are less risk-prone than proprietary models and that fine-tuning on code does not necessarily enhance LLMs' behavior. Finally, we explore the effects of prompt-engineering techniques. We discover that chain-of-thought prompting improves probabilistic reasoning, while take-a-step-back prompting fosters model-based behaviors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Julian Coda-Forno;Marcel Binz;Jane X Wang;Eric Schulz", "authorids": "~Julian_Coda-Forno1;~Marcel_Binz1;~Jane_X_Wang1;~Eric_Schulz1", "gender": "M;M;M;F", "homepage": ";;https://cpilab.org;http://www.janexwang.com", "dblp": ";212/5102;124/0016;88/10757", "google_scholar": "beVJGycAAAAJ;https://scholar.google.de/citations?user=Lvm9Q8QAAAAJ;;https://scholar.google.co.uk/citations?user=YizAq4gAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Julian_Coda-Forno1;~Marcel_Binz1;~Eric_Schulz1;~Jane_Wang1", "aff": "Max Planck Institute for Biological Cybernetics, Max-Planck Institute;Helmholtz Zentrum M\u00fcnchen;Max Planck Institute for Biological Cybernetics;Google DeepMind", "aff_domain": "tuebingen.mpg.de;helmholtz-munich.de;tuebingen.mpg.de;google.com", "position": "PhD student;Postdoc;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\ncoda-forno2024cogbench,\ntitle={CogBench: a large language model walks into a psychology lab},\nauthor={Julian Coda-Forno and Marcel Binz and Jane X Wang and Eric Schulz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Q3104y8djk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1020752, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1579690979402162692&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 10, "email": "tuebingen.mpg.de;helmholtz-munich.de;tuebingen.mpg.de;google.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Max Planck Institute for Biological Cybernetics;Helmholtz Zentrum M\u00fcnchen;Google", "aff_unique_dep": "Biological Cybernetics;;Google DeepMind", "aff_unique_url": "https://www.biological-cybernetics.de;https://www.helmholtz-muenchen.de;https://deepmind.com", "aff_unique_abbr": "MPIBC;;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Scribble-Supervised Semantic Segmentation with Prototype-based Feature Augmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34099", "id": "Q8uJyOwOsd", "proceeding": "https://proceedings.mlr.press/v235/chan24b.html", "pdf": "https://openreview.net/pdf?id=Q8uJyOwOsd", "openreview": "https://openreview.net/forum?id=Q8uJyOwOsd", "author_site": "Guiyang Chan, Pengcheng Zhang, Hai 
Dong, Shunhui Ji, Bainian Chen", "tldr": "", "abstract": "Scribble-supervised semantic segmentation presents a cost-effective training method that utilizes annotations generated through scribbling. It is valued for attaining high performance while minimizing annotation costs, which has made it highly regarded among researchers. Scribble supervision propagates information from labeled pixels to the surrounding unlabeled pixels, enabling semantic segmentation for the entire image. However, existing methods often ignore the features of classified pixels during feature propagation. To address this limitation, this paper proposes a prototype-based feature augmentation method that leverages feature prototypes to augment scribble supervision. Experimental results demonstrate that our approach achieves state-of-the-art performance on the PASCAL VOC 2012 dataset in scribble-supervised semantic segmentation tasks. The code is available at https://github.com/TranquilChan/PFA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guiyang Chan;Pengcheng Zhang;Hai Dong;Shunhui Ji;Bainian Chen", "authorids": "~Guiyang_Chan1;~Pengcheng_Zhang4;~Hai_Dong1;~Shunhui_Ji1;~Bainian_Chen1", "gender": ";;;M;M", "homepage": "https://jszy.hhu.edu.cn/zpc/;http://titan.csit.rmit.edu.au/~e13322/hai_dong/;https://orcid.org/0000-0002-8584-5795;https://github.com/cbn001012;", "dblp": "84/6775.html;14/2764;;;", "google_scholar": "FWL9hEoAAAAJ;https://scholar.google.com.au/citations?user=ENWFU4gAAAAJ;;;", "orcid": ";0000-0002-7033-5688;;0000-0001-6035-1222;0009-0007-7268-6469", "linkedin": ";dr-hai-dong/;;;", "or_profile": "~Pengcheng_Zhang4;~Hai_Dong1;~Shunhui_Ji1;~Bainian_Chen1;~Chan_Guiyang1", "aff": "Hohai University;Royal Melbourne Institute of Technology;Hohai University;Hohai University;Hohai University", "aff_domain": "hhu.edu.cn;rmit.edu.au;hhu.edu.cn;hhu.edu.cn;hhu.edu.cn", "position": "Full Professor;Associate Professor;Assistant Professor;MS student;MS student", "bibtex": "@inproceedings{\nchan2024scribblesupervised,\ntitle={Scribble-Supervised Semantic Segmentation with Prototype-based Feature Augmentation},\nauthor={Guiyang Chan and Pengcheng Zhang and Hai Dong and Shunhui Ji and Bainian Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Q8uJyOwOsd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2524392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4748662774525750997&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "hhu.edu.cn;rmit.edu.au;hhu.edu.cn;hhu.edu.cn;hhu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Hohai University;Royal Melbourne Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.hohai.edu.cn;https://www.rmit.edu.au", "aff_unique_abbr": "Hohai;RMIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;Australia" }, { "title": "RigorLLM: Resilient Guardrails for Large Language Models against Undesired Content", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34098", "id": "QAGRPiC3FS", "proceeding": "https://proceedings.mlr.press/v235/yuan24f.html", "pdf": "https://openreview.net/pdf?id=QAGRPiC3FS", "openreview": "https://openreview.net/forum?id=QAGRPiC3FS", "author_site": "Zhuowen Yuan, Zidi
Xiong, Yi Zeng, Ning Yu, Ruoxi Jia, Dawn Song, Bo Li", "tldr": "", "abstract": "Recent advancements in Large Language Models (LLMs) have showcased remarkable capabilities across various tasks in different domains. However, the emergence of biases and the potential for generating harmful content in LLMs, particularly under malicious inputs, pose significant challenges. Current mitigation strategies, while effective, are not resilient under adversarial attacks. This paper introduces Resilient Guardrails for Large Language Models (RigorLLM), a novel framework designed to efficiently and effectively moderate harmful and unsafe inputs and outputs for LLMs. By employing a multi-faceted approach that includes energy-based training data augmentation through Langevin dynamics, optimizing a safe suffix for inputs via minimax optimization, and integrating a fusion-based model combining robust KNN with LLMs based on our data augmentation, RigorLLM offers a robust solution to harmful content moderation. Our experimental evaluations demonstrate that RigorLLM not only outperforms existing baselines like OpenAI API and Perspective API in detecting harmful content but also exhibits unparalleled resilience to jailbreaking attacks. The innovative use of constrained optimization and a fusion-based guardrail approach represents a significant step forward in developing more secure and reliable LLMs, setting a new standard for content moderation frameworks in the face of evolving digital threats.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuowen Yuan;Zidi Xiong;Yi Zeng;Ning Yu;Ruoxi Jia;Dawn Song;Bo Li", "authorids": "~Zhuowen_Yuan1;~Zidi_Xiong2;~Yi_Zeng3;~Ning_Yu2;~Ruoxi_Jia1;~Dawn_Song1;~Bo_Li19", "gender": "M;M;M;;;F;F", "homepage": ";https://polaris-73.github.io/;https://yizeng623.github.io/;;https://ruoxijia.info/;;http://boli.cs.illinois.edu/", "dblp": "304/3576;314/6808;75/148;;147/5355-1;s/DXSong;50/3402-26", "google_scholar": "F-r0bYQAAAAJ;XL6QafwAAAAJ;slUNmHQAAAAJ;;JCrug-YAAAAJ;;K8vJkTcAAAAJ", "orcid": ";;0000-0002-6901-9194;;;;", "linkedin": ";https://www.linkedin.com/public-profile/settings;chnyizeng/;;;;", "or_profile": "~Zhuowen_Yuan1;~Zidi_Xiong2;~Yi_Zeng3;~Ning_Yu2;~Ruoxi_Jia1;~Dawn_Song1;~Bo_Li19", "aff": "University of Illinois Urbana-Champaign;Harvard University;Virginia Tech;;Virginia Tech;University of California, Berkeley;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;harvard.edu;vt.edu;;vt.edu;berkeley.edu;illinois.edu", "position": "PhD student;PhD student;PhD student;;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nyuan2024rigorllm,\ntitle={Rigor{LLM}: Resilient Guardrails for Large Language Models against Undesired Content},\nauthor={Zhuowen Yuan and Zidi Xiong and Yi Zeng and Ning Yu and Ruoxi Jia and Dawn Song and Bo Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QAGRPiC3FS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1871607, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12189300064563248043&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "illinois.edu;harvard.edu;vt.edu;;vt.edu;berkeley.edu;illinois.edu", "author_num": 7, "aff_unique_index": "0;1;2;2;3;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Harvard University;Virginia Tech;University of 
California, Berkeley", "aff_unique_dep": ";;;", "aff_unique_url": "https://illinois.edu;https://www.harvard.edu;https://www.vt.edu;https://www.berkeley.edu", "aff_unique_abbr": "UIUC;Harvard;VT;UC Berkeley", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Urbana-Champaign;;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Useful Representations of Recurrent Neural Network Weight Matrices", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34097", "id": "QBj7Uurdwf", "proceeding": "https://proceedings.mlr.press/v235/herrmann24a.html", "pdf": "https://openreview.net/pdf?id=QBj7Uurdwf", "openreview": "https://openreview.net/forum?id=QBj7Uurdwf", "author_site": "Vincent Herrmann, Francesco Faccio, J\u00fcrgen Schmidhuber", "tldr": "", "abstract": "Recurrent Neural Networks (RNNs) are general-purpose parallel-sequential computers. The program of an RNN is its weight matrix. How to learn useful representations of RNN weights that facilitate RNN analysis as well as downstream tasks? While the _mechanistic approach_ directly looks at some RNN's weights to predict its behavior, the _functionalist approach_ analyzes its overall functionality\u2013specifically, its input-output mapping. We consider several mechanistic approaches for RNN weights and adapt the permutation equivariant Deep Weight Space layer for RNNs. Our two novel functionalist approaches extract information from RNN weights by 'interrogating' the RNN through probing inputs. We develop a theoretical framework that demonstrates conditions under which the functionalist approach can generate rich representations that help determine RNN behavior. We create and release the first two 'model zoo' datasets for RNN weight representation learning. One consists of generative models of a class of formal languages, and the other one of classifiers of sequentially processed MNIST digits. With the help of an emulation-based self-supervised learning technique we compare and evaluate the different RNN weight encoding techniques on multiple downstream applications. 
On the most challenging one, namely predicting which exact task the RNN was trained on, functionalist approaches show clear superiority.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Herrmann;Francesco Faccio;J\u00fcrgen Schmidhuber", "authorids": "~Vincent_Herrmann1;~Francesco_Faccio1;~J\u00fcrgen_Schmidhuber1", "gender": "M;M;M", "homepage": "https://vincentherrmann.github.io;;http://people.idsia.ch/~juergen/", "dblp": "248/8663;227/3214;s/JurgenSchmidhuber", "google_scholar": ";0z3DkrkAAAAJ;https://scholar.google.ch/citations?user=gLnCTgIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vincent_Herrmann1;~Francesco_Faccio1;~J\u00fcrgen_Schmidhuber1", "aff": "The Swiss AI Lab IDSIA;The Swiss AI Lab IDSIA - USI - SUPSI;IDSIA", "aff_domain": "idsia.ch;idsia.ch;idsia.ch", "position": "PhD student;PhD student;Scientific Director", "bibtex": "@inproceedings{\nherrmann2024learning,\ntitle={Learning Useful Representations of Recurrent Neural Network Weight Matrices},\nauthor={Vincent Herrmann and Francesco Faccio and J{\\\"u}rgen Schmidhuber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QBj7Uurdwf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7205628, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4918472442622728557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "idsia.ch;idsia.ch;idsia.ch", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "IDSIA;Swiss AI Lab IDSIA;Institute of Digital Technologies", "aff_unique_dep": "Swiss AI Lab;AI Lab;", "aff_unique_url": "https://www.idsia.ch/;https://www.idsia.ch/;https://www.idsia.ch", "aff_unique_abbr": "IDSIA;IDSIA;IDSIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Transformers, parallel computation, and logarithmic depth", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34096", "id": "QCZabhKQhB", "proceeding": "https://proceedings.mlr.press/v235/sanford24a.html", "pdf": "https://openreview.net/pdf?id=QCZabhKQhB", "openreview": "https://openreview.net/forum?id=QCZabhKQhB", "author_site": "Clayton Sanford, Daniel Hsu, Matus Telgarsky", "tldr": "", "abstract": "We show that a constant number of self-attention layers can efficiently simulate\u2014and be simulated by\u2014a constant number of communication rounds of *Massively Parallel Computation*. As a consequence, we show that logarithmic-depth is sufficient for transformers to solve basic computational tasks that cannot be efficiently solved by several other neural sequence models and sub-quadratic transformer approximations. 
We thus establish parallelism as a key distinguishing property of transformers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Clayton Sanford;Daniel Hsu;Matus Telgarsky", "authorids": "~Clayton_Sanford1;~Daniel_Hsu1;~Matus_Telgarsky1", "gender": "M;M;M", "homepage": "https://www.cs.columbia.edu/~djhsu/;https://cims.nyu.edu/~matus/;https://claytonsanford.com/", "dblp": "h/DanielHsu.html;05/9061;232/1797", "google_scholar": "Bp6tvy0AAAAJ;https://scholar.google.com/citations?hl=en;Qo18yHAAAAAJ", "orcid": "0000-0002-3495-7113;;", "linkedin": ";;claytonsanford/", "or_profile": "~Daniel_Hsu1;~Matus_Telgarsky1;~Clayton_Hendrick_Sanford1", "aff": "Columbia University;NYU, New York University;Columbia University", "aff_domain": "columbia.edu;cims.nyu.edu;columbia.edu", "position": "Associate Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nsanford2024transformers,\ntitle={Transformers, parallel computation, and logarithmic depth},\nauthor={Clayton Sanford and Daniel Hsu and Matus Telgarsky},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QCZabhKQhB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1533631, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2455483225130010259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "columbia.edu;cims.nyu.edu;columbia.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Columbia University;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.columbia.edu;https://www.nyu.edu", "aff_unique_abbr": "Columbia;NYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "The Merit of River Network Topology for Neural Flood Forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34095", "id": "QE6iC9s6vU", "proceeding": "https://proceedings.mlr.press/v235/kirschstein24a.html", "pdf": "https://openreview.net/pdf?id=QE6iC9s6vU", "openreview": "https://openreview.net/forum?id=QE6iC9s6vU", "author_site": "Nikolas Kirschstein, Yixuan Sun", "tldr": "", "abstract": "Climate change exacerbates riverine floods, which occur with higher frequency and intensity than ever. The much-needed forecasting systems typically rely on accurate river discharge predictions. To this end, the SOTA data-driven approaches treat forecasting at spatially distributed gauge stations as isolated problems, even within the same river network. However, incorporating the known topology of the river network into the prediction model has the potential to leverage the adjacency relationship between gauges. Thus, we model river discharge for a network of gauging stations with GNNs and compare the forecasting performance achieved by different adjacency definitions. Our results show that the model fails to benefit from the river network topology information, both on the entire network and small subgraphs. The learned edge weights correlate with neither of the static definitions and exhibit no regular pattern. Furthermore, the GNNs struggle to predict sudden, narrow discharge spikes. 
Our work hints at a more general underlying phenomenon of neural prediction not always benefitting from graphical structure and may inspire a systematic study of the conditions under which this happens.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikolas Kirschstein;Yixuan Sun", "authorids": "~Nikolas_Kirschstein1;~Yixuan_Sun3", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";0000-0002-7947-4369", "linkedin": ";", "or_profile": "~Nikolas_Kirschstein1;~Yixuan_Sun3", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkirschstein2024the,\ntitle={The Merit of River Network Topology for Neural Flood Forecasting},\nauthor={Nikolas Kirschstein and Yixuan Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QE6iC9s6vU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1028662, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8404080526834111154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";", "author_num": 2 }, { "title": "Lightweight Image Super-Resolution via Flexible Meta Pruning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34094", "id": "QFMcXz6e4Y", "proceeding": "https://proceedings.mlr.press/v235/zhang24cc.html", "pdf": "https://openreview.net/pdf?id=QFMcXz6e4Y", "openreview": "https://openreview.net/forum?id=QFMcXz6e4Y", "author_site": "Yulun Zhang, Kai Zhang, Luc Van Gool, Martin Danelljan, Fisher Yu", "tldr": "", "abstract": "Lightweight image super-resolution (SR) methods have obtained promising results with moderate model complexity. These approaches primarily focus on a lightweight architecture design, but neglect to further reduce network redundancy. While some model compression techniques try to achieve more lightweight SR models with neural architecture search, knowledge distillation, or channel pruning, they typically require considerable extra computational resources or neglect to prune weights. To address these issues, we propose a flexible meta pruning (FMP) for lightweight image SR, where the network channels and weights are pruned simultaneously. Specifically, we control the network sparsity via channel vectors and weight indicators. We feed them into a hypernetwork, whose parameters act as meta-data for the parameters of the SR backbone. Consequently, for each network layer, we conduct structured pruning with channel vectors, which control the output and input channels. Besides, we conduct unstructured pruning with weight indicators to influence the sparsity of kernel weights, resulting in flexible pruning. During pruning, the sparsity of both channel vectors and weight indicators are regularized. We optimize the channel vectors and weight indicators with proximal gradient and SGD. 
We conduct extensive experiments to investigate critical factors in the flexible channel and weight pruning for image SR, demonstrating the superiority of our FMP when applied to baseline image SR architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yulun Zhang;Kai Zhang;Luc Van Gool;Martin Danelljan;Fisher Yu", "authorids": "~Yulun_Zhang1;~Kai_Zhang8;~Luc_Van_Gool1;~Martin_Danelljan4;~Fisher_Yu2", "gender": "M;M;;M;M", "homepage": "http://yulunzhang.com/;https://github.com/cszn;;https://martin-danelljan.github.io/;https://www.yf.io/", "dblp": "166/2763-1.html;55/957-8;61/5017;151/8848;117/6314", "google_scholar": "ORmLjWoAAAAJ;0RycFIIAAAAJ;https://scholar.google.be/citations?user=TwMib_QAAAAJ;NCSSpMkAAAAJ;-XCiamcAAAAJ", "orcid": "0000-0002-2288-5079;0000-0002-6319-3722;;;", "linkedin": "yulun-zhang-1116b5b9/;;;;", "or_profile": "~Yulun_Zhang1;~Kai_Zhang8;~Luc_Van_Gool1;~Martin_Danelljan4;~Fisher_Yu2", "aff": "Swiss Federal Institute of Technology;ETH Zurich;KU Leuven;ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;vision.ee.ethz.ch;kuleuven.be;vision.ee.ethz.ch;ethz.ch", "position": "Postdoc;Postdoc;Emeritus;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024lightweight,\ntitle={Lightweight Image Super-Resolution via Flexible Meta Pruning},\nauthor={Yulun Zhang and Kai Zhang and Luc Van Gool and Martin Danelljan and Fisher Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QFMcXz6e4Y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2193143, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12790632853710065376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ethz.ch;vision.ee.ethz.ch;kuleuven.be;vision.ee.ethz.ch;ethz.ch", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich;Katholieke Universiteit Leuven", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "ETH Zurich;ETHZ;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Switzerland;Belgium" }, { "title": "Optimizing Watermarks for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34093", "id": "QGAeWRRe6e", "proceeding": "https://proceedings.mlr.press/v235/wouters24a.html", "pdf": "https://openreview.net/pdf?id=QGAeWRRe6e", "openreview": "https://openreview.net/forum?id=QGAeWRRe6e", "tldr": "", "abstract": "With the rise of large language models (LLMs) and concerns about potential misuse, watermarks for generative LLMs have recently attracted much attention. An important aspect of such watermarks is the trade-off between their identifiability and their impact on the quality of the generated text. This paper introduces a systematic approach to this trade-off in terms of a multi-objective optimization problem. 
For a large class of robust, efficient watermarks, the associated Pareto optimal solutions are identified and shown to outperform existing robust, efficient watermarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bram Wouters", "authorids": "~Bram_Wouters1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "https://scholar.google.nl/citations?user=_22XG4YAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Bram_Wouters1", "aff": "University of Amsterdam", "aff_domain": "uva.nl", "position": "Lecturer", "bibtex": "@inproceedings{\nwouters2024optimizing,\ntitle={Optimizing Watermarks for Large Language Models},\nauthor={Bram Wouters},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QGAeWRRe6e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 806569, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14203444169121888100&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": "uva.nl", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_country_unique_index": "0", "aff_country_unique": "Netherlands" }, { "title": "Diffusive Gibbs Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34092", "id": "QH4mXDEULp", "proceeding": "https://proceedings.mlr.press/v235/chen24be.html", "pdf": "https://openreview.net/pdf?id=QH4mXDEULp", "openreview": "https://openreview.net/forum?id=QH4mXDEULp", "author_site": "Wenlin Chen, Mingtian Zhang, Brooks Paige, Jose Miguel Hernandez-Lobato, David Barber", "tldr": "", "abstract": "The inadequate mixing of conventional Markov Chain Monte Carlo (MCMC) methods for multi-modal distributions presents a significant challenge in practical applications such as Bayesian inference and molecular dynamics. Addressing this, we propose Diffusive Gibbs Sampling (DiGS), an innovative family of sampling methods designed for effective sampling from distributions characterized by distant and disconnected modes. DiGS integrates recent developments in diffusion models, leveraging Gaussian convolution to create an auxiliary noisy distribution that bridges isolated modes in the original space and applying Gibbs sampling to alternately draw samples from both spaces. A novel Metropolis-within-Gibbs scheme is proposed to enhance mixing in the denoising sampling step. 
DiGS exhibits a better mixing property for sampling multi-modal distributions than state-of-the-art methods such as parallel tempering, attaining substantially improved performance across various tasks, including mixtures of Gaussians, Bayesian neural networks and molecular dynamics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenlin Chen;Mingtian Zhang;Brooks Paige;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;David Barber", "authorids": "~Wenlin_Chen2;~Mingtian_Zhang1;~Brooks_Paige1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1;~David_Barber2", "gender": ";M;M;M;", "homepage": "https://wenlin-chen.github.io/;http://tomo.wiki;https://tbrx.github.io;http://www.cs.ucl.ac.uk/staff/D.Barber/;http://jmhl.org", "dblp": ";230/8340;https://dblp.uni-trier.de/pers/p/Paige:Brooks;;40/6058", "google_scholar": "https://scholar.google.com/citations?hl=en;;JrFJmx0AAAAJ;https://scholar.google.com.tw/citations?user=Nej1FcgAAAAJ;BEBccCQAAAAJ", "orcid": ";;;;0000-0001-7610-949X", "linkedin": ";;;;", "or_profile": "~Wenlin_Chen2;~Mingtian_Zhang1;~Brooks_Paige1;~David_Barber1;~Jose_Miguel_Hernandez_Lobato1", "aff": "Microsoft Research;;University College London;University College London;University of Cambridge", "aff_domain": "microsoft.com;;ucl.ac.uk;;cam.ac.uk", "position": "Research Intern;;Associate Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2024diffusive,\ntitle={Diffusive Gibbs Sampling},\nauthor={Wenlin Chen and Mingtian Zhang and Brooks Paige and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato and David Barber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QH4mXDEULp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3610128, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4016715199546276556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "microsoft.com;;ucl.ac.uk;;cam.ac.uk", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Microsoft;University College London;University of Cambridge", "aff_unique_dep": "Microsoft Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ucl.ac.uk;https://www.cam.ac.uk", "aff_unique_abbr": "MSR;UCL;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "DPZero: Private Fine-Tuning of Language Models without Backpropagation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34091", "id": "QJkG8Mln72", "proceeding": "https://proceedings.mlr.press/v235/zhang24af.html", "pdf": "https://openreview.net/pdf?id=QJkG8Mln72", "openreview": "https://openreview.net/forum?id=QJkG8Mln72", "author_site": "Liang Zhang, Bingcong Li, Kiran Thekumparampil, Sewoong Oh, Niao He", "tldr": "", "abstract": "The widespread practice of fine-tuning large language models (LLMs) on domain-specific data faces two major challenges in memory and privacy. First, as the size of LLMs continues to grow, the memory demands of gradient-based training methods via backpropagation become prohibitively high. Second, given the tendency of LLMs to memorize training data, it is important to protect potentially sensitive information in the fine-tuning data from being regurgitated. 
Zeroth-order methods, which rely solely on forward passes, substantially reduce memory consumption during training. However, directly combining them with standard differentially private gradient descent suffers more as model size grows. To bridge this gap, we introduce DPZero, a novel private zeroth-order algorithm with nearly dimension-independent rates. The memory efficiency of DPZero is demonstrated in privately fine-tuning RoBERTa and OPT on several downstream tasks. Our code is available at https://github.com/Liang137/DPZero.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liang Zhang;Bingcong Li;Kiran Koshy Thekumparampil;Sewoong Oh;Niao He", "authorids": "~Liang_Zhang6;~Bingcong_Li1;~Kiran_Koshy_Thekumparampil1;~Sewoong_Oh3;~Niao_He3", "gender": "M;;M;;", "homepage": "https://liang137.github.io/;;http://thekump2.web.engr.illinois.edu;;", "dblp": "50/6759;;142/2840;;", "google_scholar": "OIgmMCkAAAAJ;;0gJQCIgAAAAJ;;", "orcid": "0009-0007-4012-8040;;;;", "linkedin": ";;;;", "or_profile": "~Liang_Zhang6;~Bingcong_Li1;~Kiran_Koshy_Thekumparampil1;~Sewoong_Oh3;~Niao_He3", "aff": "Department of Computer Science, ETHZ - ETH Zurich;;Amazon;;", "aff_domain": "inf.ethz.ch;;amazon.com;;", "position": "PhD student;;Researcher;;", "bibtex": "@inproceedings{\nzhang2024dpzero,\ntitle={{DPZ}ero: Private Fine-Tuning of Language Models without Backpropagation},\nauthor={Liang Zhang and Bingcong Li and Kiran Koshy Thekumparampil and Sewoong Oh and Niao He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QJkG8Mln72}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4274442, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5727291565833018304&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "inf.ethz.ch;;amazon.com;;", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "ETH Zurich;Amazon", "aff_unique_dep": "Department of Computer Science;Amazon.com, Inc.", "aff_unique_url": "https://www.ethz.ch;https://www.amazon.com", "aff_unique_abbr": "ETHZ;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Prediction-powered Generalization of Causal Inferences", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34090", "id": "QKnWXX3aVm", "proceeding": "https://proceedings.mlr.press/v235/demirel24a.html", "pdf": "https://openreview.net/pdf?id=QKnWXX3aVm", "openreview": "https://openreview.net/forum?id=QKnWXX3aVm", "author_site": "Ilker Demirel, Ahmed Alaa, Anthony Philippakis, David Sontag", "tldr": "", "abstract": "Causal inferences from a randomized controlled trial (RCT) may not pertain to a *target* population where some effect modifiers have a different distribution. Prior work studies *generalizing* the results of a trial to a target population with no outcome but covariate data available. We show how the limited size of trials makes generalization a statistically infeasible task, as it requires estimating complex nuisance functions. We develop generalization algorithms that supplement the trial data with a prediction model learned from an additional *observational* study (OS), without making *any* assumptions on the OS. 
We theoretically and empirically show that our methods facilitate better generalization when the OS is \"high-quality\", and remain robust when it is not and, *e.g.*, has unmeasured confounding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ilker Demirel;Ahmed Alaa;Anthony Philippakis;David Sontag", "authorids": "~Ilker_Demirel1;~Ahmed_Alaa1;~Anthony_Philippakis1;~David_Sontag1", "gender": "M;M;M;M", "homepage": "https://demireal.github.io;https://alaalab.berkeley.edu/;;http://people.csail.mit.edu/dsontag/", "dblp": "253/7352;140/7324;;12/673", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.eg/citations?user=_pv1sEcAAAAJ;Q-v0BgUAAAAJ;LfcroyAAAAAJ", "orcid": "0000-0003-1035-8500;;;0000-0002-5034-7796", "linkedin": "ilker-demirel-7ab01818a/;;;", "or_profile": "~Ilker_Demirel1;~Ahmed_Alaa1;~Anthony_Philippakis1;~David_Sontag1", "aff": "Massachusetts Institute of Technology;University of California, Berkeley;Broad Institute;Massachusetts Institute of Technology", "aff_domain": "mit.edu;berkeley.edu;broadinstitute.org;mit.edu", "position": "PhD student;Assistant Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\ndemirel2024predictionpowered,\ntitle={Prediction-powered Generalization of Causal Inferences},\nauthor={Ilker Demirel and Ahmed Alaa and Anthony Philippakis and David Sontag},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QKnWXX3aVm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8302997, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16352812440677992937&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "mit.edu;berkeley.edu;broadinstitute.org;mit.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of California, Berkeley;Broad Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.berkeley.edu;https://www.broadinstitute.org", "aff_unique_abbr": "MIT;UC Berkeley;Broad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Power Steering: An Effective Approach for Domain Adaptation of Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34089", "id": "QLOvxGwbIM", "proceeding": "https://proceedings.mlr.press/v235/huang24l.html", "pdf": "https://openreview.net/pdf?id=QLOvxGwbIM", "openreview": "https://openreview.net/forum?id=QLOvxGwbIM", "author_site": "Ding Huang, Ting Li, Jian Huang", "tldr": "", "abstract": "We propose a Bayesian framework for fine-tuning large diffusion models with a novel network structure called Bayesian Power Steering (BPS). We clarify the meaning behind adaptation from a large probability space to a small probability space and explore the task of fine-tuning pre-trained models using learnable modules from a Bayesian perspective. BPS extracts task-specific knowledge from a pre-trained model\u2019s learned prior distribution. It efficiently leverages large diffusion models, differentially intervening different hidden features with a head-heavy and foot-light configuration. Experiments highlight the superiority of BPS over contemporary methods across a range of tasks, even with a limited amount of data.
Notably, BPS attains an FID score of 10.49 under the sketch condition on the COCO17 dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ding Huang;Ting Li;Jian Huang", "authorids": "~Ding_Huang1;~Ting_Li4;~Jian_Huang5", "gender": ";M;M", "homepage": "https://www.polyu.edu.hk/ama/people/research-students/;https://sites.google.com/view/tinglipolyu;https://www.polyu.edu.hk/ama/people/academic-staff/prof-huang-jian/", "dblp": ";;", "google_scholar": ";Mkj1HHcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-3880-8609;0000-0002-5218-9269", "linkedin": ";;", "or_profile": "~Ding_Huang1;~Ting_Li4;~Jian_Huang5", "aff": "Hong Kong Polytechnic University;Hong Kong Polytechnic University;Hong Kong Polytechnic University", "aff_domain": "polyu.edu.hk;polyu.edu.hk;polyu.edu.hk", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024bayesian,\ntitle={Bayesian Power Steering: An Effective Approach for Domain Adaptation of Diffusion Models},\nauthor={Ding Huang and Ting Li and Jian Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QLOvxGwbIM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6480693, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:stx6vop9GnAJ:scholar.google.com/&scioq=Bayesian+Power+Steering:+An+Effective+Approach+for+Domain+Adaptation+of+Diffusion+Models&hl=en&as_sdt=0,34", "gs_version_total": 6, "email": "polyu.edu.hk;polyu.edu.hk;polyu.edu.hk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Hong Kong Polytechnic University", "aff_unique_dep": "", "aff_unique_url": "https://www.polyu.edu.hk", "aff_unique_abbr": "PolyU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Rewards-in-Context: Multi-objective Alignment of Foundation Models with Dynamic Preference Adjustment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34088", "id": "QLcBzRI3V3", "proceeding": "https://proceedings.mlr.press/v235/yang24q.html", "pdf": "https://openreview.net/pdf?id=QLcBzRI3V3", "openreview": "https://openreview.net/forum?id=QLcBzRI3V3", "author_site": "Rui Yang, Xiaoman Pan, Feng Luo, Shuang Qiu, Han Zhong, Dong Yu, Jianshu Chen", "tldr": "", "abstract": "We consider the problem of multi-objective alignment of foundation models with human preferences, which is a critical step towards helpful and harmless AI systems. However, it is generally costly and unstable to fine-tune large foundation models using reinforcement learning (RL), and the multi-dimensionality, heterogeneity, and conflicting nature of human preferences further complicate the alignment process. In this paper, we introduce Rewards-in-Context (RiC), which conditions the response of a foundation model on multiple rewards in its prompt context and applies supervised fine-tuning for alignment. The salient features of RiC are simplicity and adaptivity, as it only requires supervised fine-tuning of a single foundation model and supports dynamic adjustment for user preferences during inference time. Inspired by the analytical solution of an abstracted convex optimization problem, our dynamic inference-time adjustment method approaches the Pareto-optimal solution for multiple objectives. 
Empirical evidence demonstrates the efficacy of our method in aligning both Large Language Models (LLMs) and diffusion models to accommodate diverse rewards with only around 10% of the GPU hours compared with the multi-objective RL baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rui Yang;Xiaoman Pan;Feng Luo;Shuang Qiu;Han Zhong;Dong Yu;Jianshu Chen", "authorids": "~Rui_Yang8;~Xiaoman_Pan2;~Feng_Luo4;~Shuang_Qiu2;~Han_Zhong1;~Dong_Yu2;~Jianshu_Chen1", "gender": "M;F;M;;M;M;M", "homepage": "https://yangrui2015.github.io;;https://shq-ml.github.io/;https://hanzhong-ml.github.io/;https://sites.google.com/view/dongyu888/;https://chenjianshu.github.io/;https://panx27.github.io/homepage/", "dblp": "92/1942-10;;;137/8096.html;71/4598-1;11/3124;148/9210", "google_scholar": "QHSUy3MAAAAJ;TGd_-9UAAAAJ;-Z7fY00AAAAJ;Bk5q_pAAAAAJ;tMY31_gAAAAJ;jQeFWdoAAAAJ;tRPF03IAAAAJ", "orcid": "0000-0003-3525-1726;;;;0000-0003-0520-6844;;", "linkedin": ";;;;dongyu/;;", "or_profile": "~Rui_Yang8;~Feng_Luo4;~Shuang_Qiu2;~Han_Zhong1;~Dong_Yu2;~Jianshu_Chen1;~Xiaoman_Pan1", "aff": "Hong Kong University of Science and Technology;Tencent AI Lab;;Peking University;Tencent AI Lab;Amazon;Tencent AI Lab", "aff_domain": "ust.hk;tencent.com;;stu.pku.edu.cn;tencent.com;amazon.com;tencent.com", "position": "PhD student;Researcher;;PhD student;Distinguished Scientist;Principal Scientist;Researcher", "bibtex": "@inproceedings{\nyang2024rewardsincontext,\ntitle={Rewards-in-Context: Multi-objective Alignment of Foundation Models with Dynamic Preference Adjustment},\nauthor={Rui Yang and Xiaoman Pan and Feng Luo and Shuang Qiu and Han Zhong and Dong Yu and Jianshu Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QLcBzRI3V3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1429055, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1658060447251601132&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ust.hk;tencent.com;;stu.pku.edu.cn;tencent.com;amazon.com;tencent.com", "author_num": 7, "aff_unique_index": "0;1;2;1;3;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Tencent;Peking University;Amazon", "aff_unique_dep": ";Tencent AI Lab;;Amazon.com, Inc.", "aff_unique_url": "https://www.ust.hk;https://ai.tencent.com;http://www.pku.edu.cn;https://www.amazon.com", "aff_unique_abbr": "HKUST;Tencent AI Lab;Peking U;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Optimization without Retraction on the Random Generalized Stiefel Manifold", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34087", "id": "QLtxj3erlJ", "proceeding": "https://proceedings.mlr.press/v235/vary24a.html", "pdf": "https://openreview.net/pdf?id=QLtxj3erlJ", "openreview": "https://openreview.net/forum?id=QLtxj3erlJ", "author_site": "Simon Vary, Pierre Ablin, Bin Gao, P.-A. Absil", "tldr": "", "abstract": "Optimization over the set of matrices $X$ that satisfy $X^\\top B X = I_p$, referred to as the generalized Stiefel manifold, appears in many applications involving sampled covariance matrices such as the canonical correlation analysis (CCA), independent component analysis (ICA), and the generalized eigenvalue problem (GEVP).
Solving these problems is typically done by iterative methods that require a fully formed $B$. We propose a cheap stochastic iterative method that solves the optimization problem while having access only to random estimates of $B$. Our method does not enforce the constraint in every iteration; instead, it produces iterations that converge to critical points on the generalized Stiefel manifold defined in expectation. The method has lower per-iteration cost, requires only matrix multiplications, and has the same convergence rates as its Riemannian optimization counterparts that require the full matrix $B$. Experiments demonstrate its effectiveness in various machine learning applications involving generalized orthogonality constraints, including CCA, ICA, and the GEVP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simon Vary;Pierre Ablin;Bin Gao;Pierre-Antoine Absil", "authorids": "~Simon_Vary1;~Pierre_Ablin2;~Bin_Gao6;~Pierre-Antoine_Absil1", "gender": "M;M;M;", "homepage": "https://simonvary.github.io;https://pierreablin.com/;https://www.gaobin.cc/;https://sites.uclouvain.be/absil/", "dblp": "230/4630;174/0980.html;181/2330-7;08/1880", "google_scholar": "V6OqU-cAAAAJ;1ZsunaYAAAAJ;Q9uKXacAAAAJ;", "orcid": ";;0000-0001-5290-4675;", "linkedin": ";;;", "or_profile": "~Simon_Vary1;~Pierre_Ablin2;~Bin_Gao6;~Pierre-Antoine_Absil1", "aff": "University of Oxford;Apple;Academy of Mathematics and Systems Science;UCLouvain", "aff_domain": "stats.ox.ac.uk;apple.com;lsec.cc.ac.cn;uclouvain.be", "position": "Postdoc;Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nvary2024optimization,\ntitle={Optimization without Retraction on the Random Generalized Stiefel Manifold},\nauthor={Simon Vary and Pierre Ablin and Bin Gao and Pierre-Antoine Absil},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QLtxj3erlJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2082761, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3780782102866753838&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "email": "stats.ox.ac.uk;apple.com;lsec.cc.ac.cn;uclouvain.be", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Oxford;Apple;Academy of Mathematics and Systems Science;Universit\u00e9 catholique de Louvain", "aff_unique_dep": ";Apple Inc.;;", "aff_unique_url": "https://www.ox.ac.uk;https://www.apple.com;http://amss.cas.cn;https://www.uclouvain.be", "aff_unique_abbr": "Oxford;Apple;;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "United Kingdom;United States;China;Belgium" }, { "title": "DoraemonGPT: Toward Understanding Dynamic Scenes with Large Language Models (Exemplified as A Video Agent)", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34086", "id": "QMy2RLnxGN", "proceeding": "https://proceedings.mlr.press/v235/yang24d.html", "pdf": "https://openreview.net/pdf?id=QMy2RLnxGN", "openreview": "https://openreview.net/forum?id=QMy2RLnxGN", "author_site": "Zongxin Yang, Guikun Chen, Xiaodi Li, Wenguan Wang, Yi Yang", "tldr": "", "abstract": "Recent LLM-driven visual agents mainly focus on solving image-based tasks, which limits their ability to understand dynamic scenes, making it far from real-life applications like guiding students in
laboratory experiments and identifying their mistakes. Hence, this paper explores DoraemonGPT, a comprehensive and conceptually elegant system driven by LLMs to understand dynamic scenes. Considering the video modality better reflects the ever-changing nature of real-world scenarios, we exemplify DoraemonGPT as a video agent. Given a video with a question/task, DoraemonGPT begins by converting the input video into a symbolic memory that stores task-related attributes. This structured representation allows for spatial-temporal querying and reasoning by well-designed sub-task tools, resulting in concise intermediate results. Recognizing that LLMs have limited internal knowledge when it comes to specialized domains (e.g., analyzing the scientific principles underlying experiments), we incorporate plug-and-play tools to access external knowledge and address tasks across different domains. Moreover, a novel LLM-driven planner based on Monte Carlo Tree Search is introduced to explore the large planning space for scheduling various tools. The planner iteratively finds feasible solutions by backpropagating the result's reward, and multiple solutions can be summarized into an improved final answer. We extensively evaluate DoraemonGPT's effectiveness on three benchmarks and several in-the-wild scenarios. Project page: https://z-x-yang.github.io/doraemon-gpt.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zongxin Yang;Guikun Chen;Xiaodi Li;Wenguan Wang;Yi Yang", "authorids": "~Zongxin_Yang1;~Guikun_Chen1;~Xiaodi_Li2;~Wenguan_Wang4;~Yi_Yang4", "gender": "M;M;;M;M", "homepage": "https://z-x-yang.github.io/;https://guikunchen.github.io/;;https://sites.google.com/view/wenguanwang/;http://reler.net/", "dblp": ";342/9515;;145/1078;", "google_scholar": "8IE0CfwAAAAJ;I1TOdpkAAAAJ;;CqAQQkgAAAAJ;https://scholar.google.com.au/citations?user=RMSuNFwAAAAJ", "orcid": ";;;0000-0002-0802-9567;", "linkedin": ";;;wenguanwang;", "or_profile": "~Zongxin_Yang1;~Guikun_Chen1;~Xiaodi_Li2;~Wenguan_Wang4;~Yi_Yang4", "aff": "Zhejiang University;Zhejiang University;;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;;zju.edu.cn;zju.edu.cn", "position": "Postdoc;PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024doraemongpt,\ntitle={Doraemon{GPT}: Toward Understanding Dynamic Scenes with Large Language Models (Exemplified as A Video Agent)},\nauthor={Zongxin Yang and Guikun Chen and Xiaodi Li and Wenguan Wang and Yi Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QMy2RLnxGN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3437805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14846351798209922947&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;zju.edu.cn;;zju.edu.cn;zju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Delving into the Convergence of Generalized Smooth Minimax Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34085", "id": "QPsEPI9bvp", "proceeding":
"https://proceedings.mlr.press/v235/xian24a.html", "pdf": "https://openreview.net/pdf?id=QPsEPI9bvp", "openreview": "https://openreview.net/forum?id=QPsEPI9bvp", "author_site": "Wenhan Xian, Ziyi Chen, Heng Huang", "tldr": "", "abstract": "Minimax optimization is fundamental and important for enormous machine learning applications such as generative adversarial network, adversarial training, and robust optimization. Recently, a variety of minimax algorithms with theoretical guarantees based on Lipschitz smoothness have been proposed. However, these algorithms could fail to converge in practice because the requisite Lipschitz smooth condition may not hold even in some classic minimax problems. We will present some counterexamples to reveal this divergence issue. Thus, to fill this gap, we are motivated to delve into the convergence analysis of minimax algorithms under a relaxed Lipschitz smoothness condition, *i.e.*, generalized smoothness. We prove that variants of basic minimax optimization algorithms GDA, SGDA, GDmax and SGDmax can still converge in generalized smooth problems, and hence their theoretical guarantees can be extended to a wider range of applications. We also conduct a numerical experiment to validate the performance of our proposed algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenhan Xian;Ziyi Chen;Heng Huang", "authorids": "~Wenhan_Xian1;~Ziyi_Chen2;~Heng_Huang1", "gender": "M;M;M", "homepage": ";;https://www.cs.umd.edu/~heng/", "dblp": "246/3134;37/1439-2;03/281", "google_scholar": ";zjSBVOIAAAAJ;4OqLaDwAAAAJ", "orcid": ";;", "linkedin": "wenhan-xian-3392ba170;ziyi-chen-84616184/;", "or_profile": "~Wenhan_Xian1;~Ziyi_Chen2;~Heng_Huang1", "aff": "University of Maryland, College Park;University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;cs.umd.edu", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nxian2024delving,\ntitle={Delving into the Convergence of Generalized Smooth Minimax Optimization},\nauthor={Wenhan Xian and Ziyi Chen and Heng Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QPsEPI9bvp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 826109, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11193326436187631783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "umd.edu;umd.edu;cs.umd.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Interpretable Deep Clustering for Tabular Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34084", "id": "QPy7zLfvof", "proceeding": "https://proceedings.mlr.press/v235/svirsky24a.html", "pdf": "https://openreview.net/pdf?id=QPy7zLfvof", "openreview": "https://openreview.net/forum?id=QPy7zLfvof", "author_site": "Jonathan Svirsky, Ofir Lindenbaum", "tldr": "", "abstract": "Clustering is a fundamental learning task widely used as a first step in data 
analysis. For example, biologists use cluster assignments to analyze genome sequences, medical records, or images. Since downstream analysis is typically performed at the cluster level, practitioners seek reliable and interpretable clustering models. We propose a new deep-learning framework for general domain tabular data that predicts interpretable cluster assignments at the instance and cluster levels. First, we present a self-supervised procedure to identify the subset of the most informative features from each data point. Then, we design a model that predicts cluster assignments and a gate matrix that provides cluster-level feature selection. Overall, our model provides cluster assignments with an indication of the driving feature for each sample and each cluster. We show that the proposed method can reliably predict cluster assignments in biological, text, image, and physics tabular datasets. Furthermore, using previously proposed metrics, we verify that our model leads to interpretable results at a sample and cluster level. Our code is available on https://github.com/jsvir/idc.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonathan Svirsky;Ofir Lindenbaum", "authorids": "~Jonathan_Svirsky1;~Ofir_Lindenbaum1", "gender": "M;M", "homepage": ";https://www.eng.biu.ac.il/lindeno/", "dblp": "179/4135;142/4140", "google_scholar": "qRwzZmgAAAAJ;https://scholar.google.co.il/citations?user=jXxk6gcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jonathan_Svirsky1;~Ofir_Lindenbaum1", "aff": "Bar-Ilan University;Bar-Ilan University", "aff_domain": "biu.ac.il;biu.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsvirsky2024interpretable,\ntitle={Interpretable Deep Clustering for Tabular Data},\nauthor={Jonathan Svirsky and Ofir Lindenbaum},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QPy7zLfvof}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1615083, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14403360031182791394&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "biu.ac.il;biu.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Bar-Ilan University", "aff_unique_dep": "", "aff_unique_url": "https://www.biu.ac.il", "aff_unique_abbr": "BIU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Nash Incentive-compatible Online Mechanism Learning via Weakly Differentially Private Online Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34083", "id": "QQkK6YH0Th", "proceeding": "https://proceedings.mlr.press/v235/huh24b.html", "pdf": "https://openreview.net/pdf?id=QQkK6YH0Th", "openreview": "https://openreview.net/forum?id=QQkK6YH0Th", "author_site": "Joon Suk Huh, Kirthevasan Kandasamy", "tldr": "", "abstract": "We study a multi-round mechanism design problem, where we interact with a set of agents over a sequence of rounds. We wish to design an incentive-compatible (IC) online learning scheme to maximize an application-specific objective within a given class of mechanisms, without prior knowledge of the agents' type distributions. 
Even if each mechanism in this class is IC in a single round, if an algorithm naively chooses from this class on each round, the entire learning process may not be IC against non-myopic buyers who appear over multiple rounds. On each round, our method randomly chooses between the recommendation of a weakly differentially private online learning algorithm (e.g., Hedge), and a commitment mechanism which penalizes non-truthful behavior. Our method is IC and achieves $O(T^{\\frac{1+h}{2}})$ regret for the application-specific objective in an adversarial setting, where $h$ quantifies the long-sightedness of the agents. When compared to prior work, our approach is conceptually simpler, it applies to general mechanism design problems (beyond auctions), and its regret scales gracefully with the size of the mechanism class.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joon Suk Huh;Kirthevasan Kandasamy", "authorids": "~Joon_Suk_Huh1;~Kirthevasan_Kandasamy1", "gender": ";M", "homepage": ";https://people.eecs.berkeley.edu/~kandasamy/research.html", "dblp": ";128/3628", "google_scholar": ";kohOJPcAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Joon_Suk_Huh1;~Kirthevasan_Kandasamy1", "aff": ";Department of Computer Science, University of Wisconsin - Madison", "aff_domain": ";cs.wisc.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nhuh2024nash,\ntitle={Nash Incentive-compatible Online Mechanism Learning via Weakly Differentially Private Online Learning},\nauthor={Joon Suk Huh and Kirthevasan Kandasamy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QQkK6YH0Th}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 348206, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7110455693631596447&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 8, "email": ";cs.wisc.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Multi-Fidelity Residual Neural Processes for Scalable Surrogate Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34082", "id": "QRDfBIhrJq", "proceeding": "https://proceedings.mlr.press/v235/niu24d.html", "pdf": "https://openreview.net/pdf?id=QRDfBIhrJq", "openreview": "https://openreview.net/forum?id=QRDfBIhrJq", "author_site": "Brooks(Ruijia) Niu, Dongxia Wu, Kai Kim, Yian Ma, Duncan Watson-Parris, Rose Yu", "tldr": "", "abstract": "Multi-fidelity surrogate modeling aims to learn an accurate surrogate at the highest fidelity level by combining data from multiple sources. Traditional methods relying on Gaussian processes can hardly scale to high-dimensional data. Deep learning approaches utilize neural network based encoders and decoders to improve scalability. These approaches share encoded representations across fidelities without including corresponding decoder parameters. This hinders inference performance, especially in out-of-distribution scenarios when the highest fidelity data has limited domain coverage. 
To address these limitations, we propose Multi-fidelity Residual Neural Processes (MFRNP), a novel multi-fidelity surrogate modeling framework. MFRNP explicitly models the residual between the aggregated output from lower fidelities and ground truth at the highest fidelity. The aggregation introduces decoders into the information sharing step and optimizes lower fidelity decoders to accurately capture both in-fidelity and cross-fidelity information. We show that MFRNP significantly outperforms state-of-the-art in learning partial differential equations and a real-world climate modeling task. Our code is published at: https://github.com/Rose-STL-Lab/MFRNP", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruijia Niu;Dongxia Wu;Kai Kim;Yian Ma;Duncan Watson-Parris;Rose Yu", "authorids": "~Ruijia_Niu1;~Dongxia_Wu1;~Kai_Kim1;~Yian_Ma1;~Duncan_Watson-Parris1;~Rose_Yu1", "gender": "M;M;M;M;M;F", "homepage": "https://sites.google.com/view/niubrooks/about;https://dongxiaw.github.io/online-cv/;;https://sites.google.com/view/yianma;https://climate-analytics-lab.github.io;http://roseyu.com", "dblp": "346/7681;;;;254/3021;164/7314", "google_scholar": ";jZb2e8cAAAAJ;;A0TFlacAAAAJ;https://scholar.google.co.uk/citations?user=LK2IWlAAAAAJ;", "orcid": ";;;;0000-0002-5312-4950;", "linkedin": "rniu/;dongxia-wu-2021/;kai-kim-83780a202/;;;", "or_profile": "~Ruijia_Niu1;~Dongxia_Wu1;~Kai_Kim1;~Yian_Ma1;~Duncan_Watson-Parris1;~Rose_Yu1", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu", "position": "MS student;Ph.D student;Undergrad student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nniu2024multifidelity,\ntitle={Multi-Fidelity Residual Neural Processes for Scalable Surrogate Modeling},\nauthor={Ruijia Niu and Dongxia Wu and Kai Kim and Yian Ma and Duncan Watson-Parris and Rose Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QRDfBIhrJq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1653886, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17336762155020172278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Re-Dock: Towards Flexible and Realistic Molecular Docking with Diffusion Bridge", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34081", "id": "QRjTDhCIO8", "proceeding": "https://proceedings.mlr.press/v235/huang24ag.html", "pdf": "https://openreview.net/pdf?id=QRjTDhCIO8", "openreview": "https://openreview.net/forum?id=QRjTDhCIO8", "author_site": "Yufei Huang, Odin Zhang, Lirong Wu, Cheng Tan, Haitao Lin, Zhangyang Gao, Siyuan Li, Stan Z Li", "tldr": "", "abstract": "Accurate prediction of protein-ligand binding structures, a 
task known as molecular docking, is crucial for drug design but remains challenging. While deep learning has shown promise, existing methods often depend on holo-protein structures (docked, and not accessible in realistic tasks) or neglect pocket sidechain conformations, leading to limited practical utility and unrealistic conformation predictions. To fill these gaps, we introduce an under-explored task, named flexible docking, to predict poses of ligand and pocket sidechains simultaneously, and introduce Re-Dock, a novel diffusion bridge generative model extended to geometric manifolds. Specifically, we propose energy-to-geometry mapping inspired by the Newton-Euler equation to co-model the binding energy and conformations for reflecting the energy-constrained docking generative process. Comprehensive experiments on designed benchmark datasets including apo-dock and cross-dock demonstrate our model's superior effectiveness and efficiency over current methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yufei Huang;Odin Zhang;Lirong Wu;Cheng Tan;Haitao Lin;Zhangyang Gao;Siyuan Li;Stan Z. Li", "authorids": "~Yufei_Huang4;~Odin_Zhang1;~Lirong_Wu1;~Cheng_Tan1;~Haitao_Lin2;~Zhangyang_Gao1;~Siyuan_Li6;~Stan_Z._Li2", "gender": "M;;;M;M;M;M;M", "homepage": "https://2021.igem.org/Team:ZJU-China;https://haotianzhangai4science.github.io/;;https://chengtan9907.github.io/;;;https://lupin1998.github.io/;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "68/1946-2;;15/10330;70/1533-12.html;34/1040;275/3266;63/9705-2;l/StanZLi", "google_scholar": "qmTjdwIAAAAJ;ypnp3YwAAAAJ;Tk7TrCoAAAAJ;6kTV6aMAAAAJ;o5A23qIAAAAJ;4SclT-QAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0009-0007-8184-4529;;;;;0000-0003-1026-6083;0000-0001-6806-2468;", "linkedin": ";;;;;;https://www.linkedin.cn/incareer/in/siyuan-li-lupin1998/;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Yufei_Huang4;~Odin_Zhang1;~Lirong_Wu1;~Cheng_Tan1;~Haitao_Lin2;~Zhangyang_Gao1;~Siyuan_Li6;~Stan_Z._Li1", "aff": "Zhejiang University;;Westlake University;Zhejiang University & Westlake University;Westlake University;Westlake University, China;Alibaba Group;Westlake University", "aff_domain": "zju.edu.cn;;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;alibaba-inc.com;westlake.edu.cn", "position": "PhD student;;PhD student;PhD student;PhD student;PhD student;Intern;Chair Professor", "bibtex": "@inproceedings{\nhuang2024redock,\ntitle={Re-Dock: Towards Flexible and Realistic Molecular Docking with Diffusion Bridge},\nauthor={Yufei Huang and Odin Zhang and Lirong Wu and Cheng Tan and Haitao Lin and Zhangyang Gao and Siyuan Li and Stan Z.
Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QRjTDhCIO8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4275040, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8826258867770229894&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;alibaba-inc.com;westlake.edu.cn", "author_num": 8, "aff_unique_index": "0;1;0;1;1;2;1", "aff_unique_norm": "Zhejiang University;Westlake University;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.westlake.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "ZJU;WU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Reward-Free Kernel-Based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34080", "id": "QTt2xJI8vk", "proceeding": "https://proceedings.mlr.press/v235/vakili24a.html", "pdf": "https://openreview.net/pdf?id=QTt2xJI8vk", "openreview": "https://openreview.net/forum?id=QTt2xJI8vk", "author_site": "Sattar Vakili, Farhang Nabiei, Da-shan Shiu, Alberto Bernacchia", "tldr": "", "abstract": "Achieving sample efficiency in Reinforcement Learning (RL) hinges primarily on the efficient exploration of the underlying environment, but it is still unknown what the best exploration strategies are in different settings. We consider the *reward-free* RL problem, which operates in two phases: an exploration phase, where the agent gathers exploration trajectories over episodes irrespective of any predetermined reward function, and a subsequent planning phase, where a reward function is introduced. The agent then utilizes the episodes from the exploration phase to calculate a near-optimal policy. Existing algorithms and sample complexities for reward-free RL are limited to tabular, linear or very smooth function approximations, leaving the problem largely open for more general cases. We consider a broad range of kernel-based function approximations, including non-smooth kernels, and propose an algorithm based on adaptive domain partitioning.
We show that our algorithm achieves order-optimal sample complexity for a large class of common kernels, which includes Mat\u00e9rn and Neural Tangent kernels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sattar Vakili;Farhang Nabiei;Da-shan Shiu;Alberto Bernacchia", "authorids": "~Sattar_Vakili1;~Farhang_Nabiei1;~Da-shan_Shiu1;~Alberto_Bernacchia1", "gender": ";;M;", "homepage": "https://sattar-vakili.github.io/;;;", "dblp": "140/5473;;95/2355;68/9669", "google_scholar": "N9xs8w0AAAAJ;0xczfG8AAAAJ;https://scholar.google.com/citations?hl=en;n48pFqcAAAAJ", "orcid": ";;;", "linkedin": ";farhang-nabiei-b3576a60/;;", "or_profile": "~Sattar_Vakili1;~Farhang_Nabiei1;~Da-shan_Shiu1;~Alberto_Bernacchia1", "aff": "MediaTek Research;MediaTek Research;;MedaiTek Research", "aff_domain": "mtkresearch.com;mtkresearch.com;;mtkresearch.com", "position": "Principal AI Research Manager;Researcher;;Team Lead", "bibtex": "@inproceedings{\nvakili2024rewardfree,\ntitle={Reward-Free Kernel-Based Reinforcement Learning},\nauthor={Sattar Vakili and Farhang Nabiei and Da-shan Shiu and Alberto Bernacchia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QTt2xJI8vk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 472999, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10464834401921196694&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "mtkresearch.com;mtkresearch.com;;mtkresearch.com", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "MediaTek Inc.;MedaiTek", "aff_unique_dep": "Research;Research", "aff_unique_url": "https://www.mediatek.com/;", "aff_unique_abbr": "MediaTek;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0", "aff_country_unique": "China;" }, { "title": "Improving Adversarial Energy-Based Model via Diffusion Process", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34079", "id": "QXEx16jWdN", "proceeding": "https://proceedings.mlr.press/v235/geng24a.html", "pdf": "https://openreview.net/pdf?id=QXEx16jWdN", "openreview": "https://openreview.net/forum?id=QXEx16jWdN", "author_site": "Cong Geng, Tian Han, Peng-Tao Jiang, Hao Zhang, Jinwei Chen, S\u00f8ren Hauberg, Bo Li", "tldr": "", "abstract": "Generative models have shown strong generation ability while efficient likelihood estimation is less explored. Energy-based models (EBMs) define a flexible energy function to parameterize unnormalized densities efficiently but are notorious for being difficult to train. Adversarial EBMs introduce a generator to form a minimax training game to avoid expensive MCMC sampling used in traditional EBMs, but a noticeable gap between adversarial EBMs and other strong generative models still exists. Inspired by diffusion-based models, we embedded EBMs into each denoising step to split a long-generated process into several smaller steps. Besides, we employ a symmetric Jeffrey divergence and introduce a variational posterior distribution for the generator's training to address the main challenges that exist in adversarial EBMs. 
Our experiments show significant improvement in generation compared to existing adversarial EBMs, while also providing a useful energy function for efficient density estimation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cong Geng;Tian Han;Peng-Tao Jiang;Hao Zhang;Jinwei Chen;S\u00f8ren Hauberg;Bo Li", "authorids": "~Cong_Geng1;~Tian_Han1;~Peng-Tao_Jiang1;~Hao_Zhang52;~Jinwei_Chen3;~S\u00f8ren_Hauberg1;~Bo_Li20", "gender": "F;M;M;M;M;M;M", "homepage": "https://gengcong940126.github.io/;https://hthth0801.github.io/;https://pengtaojiang.github.io;;https://scholar.google.com/citations?hl=zh-CN&user=Pcsml4oAAAAJ;http://www2.compute.dtu.dk/~sohau/;https://libraboli.github.io/", "dblp": "61/8108;65/4065-1;218/5550;;;39/7226;50/3402-115", "google_scholar": "acHCsegAAAAJ;Qtvu5t4AAAAJ;85QJ_i4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?hl=zh-CN", "orcid": ";;;0009-0007-1175-5918;;;", "linkedin": ";;;https://www.linkedin.cn/incareer/in/ACoAABKDDJUB-t0pY6F0gGrP67GMU5igjH6519E;;;", "or_profile": "~Cong_Geng1;~Tian_Han1;~Peng-Tao_Jiang1;~Hao_Zhang52;~Jinwei_Chen3;~S\u00f8ren_Hauberg1;~Bo_Li20", "aff": "China Mobile Research Institute;Stevens Institute of Technology;vivo Mobile Communication Co., Ltd;vivo Mobile Communication Co., Ltd;vivo Mobile Communication Co., Ltd.;Technical University of Denmark;Tencent Youtu Lab", "aff_domain": "chinamobile.com;stevens.edu;vivo.com;vivo.com;vivo.com;dtu.dk;tencent.com", "position": "Researcher;Assistant Professor;Researcher;Researcher;Principal Researcher;Professor;Researcher", "bibtex": "@inproceedings{\ngeng2024improving,\ntitle={Improving Adversarial Energy-Based Model via Diffusion Process},\nauthor={Cong Geng and Tian Han and Peng-Tao Jiang and Hao Zhang and Jinwei Chen and S{\\o}ren Hauberg and Bo Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QXEx16jWdN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8609094, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6466173994009782997&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "chinamobile.com;stevens.edu;vivo.com;vivo.com;vivo.com;dtu.dk;tencent.com", "author_num": 7, "aff_unique_index": "0;1;2;2;3;4;5", "aff_unique_norm": "China Mobile;Stevens Institute of Technology;vivo Mobile Communication Co., Ltd;vivo Mobile Communication Co., Ltd.;Technical University of Denmark;Tencent", "aff_unique_dep": "Research Institute;;;;;Youtu Lab", "aff_unique_url": "https://www.chinamobile.com/;https://www.stevens.edu;https://www.vivo.com.cn;https://www.vivo.com.cn;https://www.tek.dk;https://www.tencent.com", "aff_unique_abbr": "CMRI;SIT;vivo;vivo;DTU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;2;0", "aff_country_unique": "China;United States;Denmark" }, { "title": "SleepFM: Multi-modal Representation Learning for Sleep Across Brain Activity, ECG and Respiratory Signals", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34078", "id": "QXqXGDapkQ", "proceeding": "https://proceedings.mlr.press/v235/thapa24a.html", "pdf": "https://openreview.net/pdf?id=QXqXGDapkQ", "openreview": "https://openreview.net/forum?id=QXqXGDapkQ", "author_site": "Rahul Thapa, Bryan 
He, Magnus Ruud Kjaer, Hyatt Moore, Gauri Ganjoo, Emmanuel Mignot, James Zou", "tldr": "", "abstract": "Sleep is a complex physiological process evaluated through various modalities recording electrical brain, cardiac, and respiratory activities. We curate a large polysomnography dataset from over 14,000 participants comprising over 100,000 hours of multi-modal sleep recordings. Leveraging this extensive dataset, we developed SleepFM, the first multi-modal foundation model for sleep analysis. We show that a novel leave-one-out approach for contrastive learning significantly improves downstream task performance compared to representations from standard pairwise contrastive learning. A logistic regression model trained on SleepFM's learned embeddings outperforms an end-to-end trained convolutional neural network (CNN) on sleep stage classification (macro AUROC 0.88 vs 0.72 and macro AUPRC 0.72 vs 0.48) and sleep disordered breathing detection (AUROC 0.85 vs 0.69 and AUPRC 0.77 vs 0.61). Notably, the learned embeddings achieve 48% top-1 average accuracy in retrieving modality clip pairs from 90,000 candidates. This work demonstrates the value of holistic multi-modal sleep modeling to fully capture the richness of sleep recordings. SleepFM is open source and available at https://anonymous.4open.science/r/sleepfm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rahul Thapa;Bryan He;Magnus Ruud Kjaer;Hyatt Moore IV;Gauri Ganjoo;Emmanuel Mignot;James Zou", "authorids": "~Rahul_Thapa1;~Bryan_He1;~Magnus_Ruud_Kjaer1;~Hyatt_Moore_IV1;~Gauri_Ganjoo1;~Emmanuel_Mignot1;~James_Zou1", "gender": "M;M;M;;M;;M", "homepage": "https://rthapa84.github.io/;;;;http://www.mignotlab.com;;", "dblp": ";;;;;;00/10545", "google_scholar": "H9FNWVcAAAAJ;;;;;23ZXZvEAAAAJ;", "orcid": ";;0000-0001-7866-2280;;;;", "linkedin": "rahul-thapa/;magnus-ruud-kjaer-7778ab222;;gauri-ganjoo/;;;", "or_profile": "~Rahul_Thapa1;~Magnus_Ruud_Kjaer1;~Hyatt_Moore_IV1;~Gauri_Ganjoo1;~Emmanuel_Mignot1;~James_Zou1;~Bryan_D._He1", "aff": "Stanford University;Technical University of Denmark;;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;dtu.dk;;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;;Researcher;Full Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nthapa2024sleepfm,\ntitle={Sleep{FM}: Multi-modal Representation Learning for Sleep Across Brain Activity, {ECG} and Respiratory Signals},\nauthor={Rahul Thapa and Bryan He and Magnus Ruud Kjaer and Hyatt Moore IV and Gauri Ganjoo and Emmanuel Mignot and James Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QXqXGDapkQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7623877, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8479016964109740785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "stanford.edu;dtu.dk;;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "author_num": 7, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Stanford University;Technical University of Denmark", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.tek.dk", "aff_unique_abbr": "Stanford;DTU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;0;0;0;0", 
"aff_country_unique": "United States;Denmark" }, { "title": "Differentiable Mapper for Topological Optimization of Data Representation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34077", "id": "QZ1DVzr6N9", "proceeding": "https://proceedings.mlr.press/v235/oulhaj24a.html", "pdf": "https://openreview.net/pdf?id=QZ1DVzr6N9", "openreview": "https://openreview.net/forum?id=QZ1DVzr6N9", "author_site": "Ziyad Oulhaj, Mathieu Carri\u00e8re, Bertrand Michel", "tldr": "", "abstract": "Unsupervised data representation and visualization using tools from topology is an active and growing field of Topological Data Analysis (TDA) and data science. Its most prominent line of work is based on the so-called Mapper graph, which is a combinatorial graph whose topological structures (connected components, branches, loops) are in correspondence with those of the data itself. While highly generic and applicable, its use has been hampered so far by the manual tuning of its many parameters\u2014among these, a crucial one is the so-called filter: it is a continuous function whose variations on the data set are the main ingredient for both building the Mapper representation and assessing the presence and sizes of its topological structures. However, while a few parameter tuning methods have already been investigated for the other Mapper parameters (i.e., resolution, gain, clustering), there is currently no method for tuning the filter itself. In this work, we build on a recently proposed optimization framework incorporating topology to provide the first filter optimization scheme for Mapper graphs. In order to achieve this, we propose a relaxed and more general version of the Mapper graph, whose convergence properties are investigated. Finally, we demonstrate the usefulness of our approach by optimizing Mapper graph representations on several datasets, and showcasing the superiority of the optimized representation over arbitrary ones.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyad Oulhaj;Mathieu Carri\u00e8re;Bertrand Michel", "authorids": "ziyad.oulhaj@ec-nantes.fr;~Mathieu_Carri\u00e8re1;~Bertrand_Michel2", "gender": ";;M", "homepage": ";https://mathieucarriere.github.io/website/;http://bertrand.michel.perso.math.cnrs.fr/", "dblp": ";167/1015;", "google_scholar": ";;https://scholar.google.fr/citations?user=QiI_EskAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "ziyad.oulhaj@ec-nantes.fr;~Mathieu_Carri\u00e8re1;~Bertrand_Michel2", "aff": ";INRIA;Ecole Centrale de Nantes", "aff_domain": ";inria.fr;ec-nantes.fr", "position": ";Researcher;Full Professor", "bibtex": "@inproceedings{\noulhaj2024differentiable,\ntitle={Differentiable Mapper for Topological Optimization of Data Representation},\nauthor={Ziyad Oulhaj and Mathieu Carri{\\`e}re and Bertrand Michel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QZ1DVzr6N9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2708496, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14863135380120509085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 18, "email": ";inria.fr;ec-nantes.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "INRIA;Ecole Centrale de Nantes", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.ec-nantes.fr", "aff_unique_abbr": "INRIA;ECNantes", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Polynomial-based Self-Attention for Table Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34076", "id": "QZd3rvlP76", "proceeding": "https://proceedings.mlr.press/v235/kim24ae.html", "pdf": "https://openreview.net/pdf?id=QZd3rvlP76", "openreview": "https://openreview.net/forum?id=QZd3rvlP76", "author_site": "Jayoung Kim, Yehjin Shin, Jeongwhan Choi, Hyowon Wi, Noseong Park", "tldr": "", "abstract": "Structured data, which constitutes a significant portion of existing data types, has been a long-standing research topic in the field of machine learning. Various representation learning methods for tabular data have been proposed, ranging from encoder-decoder structures to Transformers. Among these, Transformer-based methods have achieved state-of-the-art performance not only in tabular data but also in various other fields, including computer vision and natural language processing. However, recent studies have revealed that self-attention, a key component of Transformers, can lead to an oversmoothing issue. We show that Transformers for tabular data also face this problem. To tackle the problem, we suggest a novel self-attention layer for tabular data, leveraging matrix polynomials. This proposed layer serves as a replacement for the original self-attention layer, contributing to the improvement of model scalability. In our experiments with three representative table learning models equipped with our proposed layer, we illustrate that the layer effectively mitigates the oversmoothing problem and enhances the representation performance of the existing methods, outperforming the state-of-the-art table representation methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jayoung Kim;Yehjin Shin;Jeongwhan Choi;Hyowon Wi;Noseong Park", "authorids": "~Jayoung_Kim1;~Yehjin_Shin1;~Jeongwhan_Choi1;~Hyowon_Wi1;~Noseong_Park1", "gender": "F;F;M;;", "homepage": ";http://yehjin-shin.github.io/;https://www.jeongwhanchoi.com;;", "dblp": "26/9969-2;322/5257;39/11215-2;332/6458;", "google_scholar": "3qbSHGwAAAAJ;https://scholar.google.com/citations?view_op=list_works;3MNElkYAAAAJ;https://scholar.google.com/citations?view_op=list_works;", "orcid": ";0009-0001-7600-2585;0000-0002-6530-2662;;", "linkedin": ";yehjin-shin-528987217/;jeongwhanchoi/;;", "or_profile": "~Jayoung_Kim1;~Yehjin_Shin1;~Jeongwhan_Choi1;~Hyowon_Wi1;~Noseong_Park1", "aff": "Yonsei University;Yonsei University;Yonsei University;Yonsei University;", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;", "position": "MS student;MS student;PhD student;MS student;", "bibtex": "@inproceedings{\nkim2024polynomialbased,\ntitle={Polynomial-based Self-Attention for Table Representation Learning},\nauthor={Jayoung Kim and Yehjin Shin and Jeongwhan Choi and Hyowon Wi and Noseong Park},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QZd3rvlP76}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 438747, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ACU2lBgNl4IJ:scholar.google.com/&scioq=Polynomial-based+Self-Attention+for+Table+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 9, "email": 
"yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;yonsei.ac.kr;", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "The Illusion of State in State-Space Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34075", "id": "QZgo9JZpLq", "proceeding": "https://proceedings.mlr.press/v235/merrill24a.html", "pdf": "https://openreview.net/pdf?id=QZgo9JZpLq", "openreview": "https://openreview.net/forum?id=QZgo9JZpLq", "author_site": "William Merrill, Jackson Petty, Ashish Sabharwal", "tldr": "", "abstract": "State-space models (SSMs) have emerged as a potential alternative architecture for building large language models (LLMs) compared to the previously ubiquitous transformer architecture. One theoretical weakness of transformers is that they cannot express certain kinds of sequential computation and state tracking (Merrill & Sabharwal, 2023), which SSMs are explicitly designed to address via their close architectural similarity to recurrent neural networks (RNNs). *But do SSMs truly have an advantage (over transformers) in expressive power for state tracking?* Surprisingly, the answer is no. Our analysis reveals that the expressive power of SSMs is limited very similarly to transformers: SSMs cannot express computation outside the complexity class $\\mathsf{TC}^0$. In particular, this means they cannot solve simple state-tracking problems like permutation composition. It follows that SSMs are provably unable to accurately track chess moves with certain notation, evaluate code, or track entities in a long narrative. To supplement our formal analysis, we report experiments showing that Mamba-style SSMs indeed struggle with state tracking. 
Thus, despite its recurrent formulation, the \"state\" in an SSM is an illusion: SSMs have similar expressiveness limitations to non-recurrent models like transformers, which may fundamentally limit their ability to solve real-world state-tracking problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Merrill;Jackson Petty;Ashish Sabharwal", "authorids": "~William_Merrill1;~Jackson_Petty1;~Ashish_Sabharwal1", "gender": "M;;M", "homepage": "http://lambdaviking.com;https://jacksonpetty.org;", "dblp": "19/3512;;13/154", "google_scholar": "CyjChJQAAAAJ;hCYSiTgAAAAJ;7VspfeAAAAAJ", "orcid": ";0000-0002-9492-0144;", "linkedin": "william-merrill-15ab0743/;https://linkedin.com/in/jackson-petty;ashish-sabharwal-82a2b661", "or_profile": "~William_Merrill1;~Jackson_Petty1;~Ashish_Sabharwal1", "aff": "New York University;New York University;Allen Institute for AI", "aff_domain": "nyu.edu;nyu.edu;allenai.org", "position": "Graduate student;PhD student;Principal Researcher", "bibtex": "@inproceedings{\nmerrill2024the,\ntitle={The Illusion of State in State-Space Models},\nauthor={William Merrill and Jackson Petty and Ashish Sabharwal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QZgo9JZpLq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 490539, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8394274544057291655&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "nyu.edu;nyu.edu;allenai.org", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "New York University;Allen Institute for AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://allenai.org", "aff_unique_abbr": "NYU;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Potential Based Diffusion Motion Planning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34074", "id": "Qb68Rs0p9f", "proceeding": "https://proceedings.mlr.press/v235/luo24h.html", "pdf": "https://openreview.net/pdf?id=Qb68Rs0p9f", "openreview": "https://openreview.net/forum?id=Qb68Rs0p9f", "author_site": "Yunhao Luo, Chen Sun, Josh Tenenbaum, Yilun Du", "tldr": "", "abstract": "Effective motion planning in high-dimensional spaces is a long-standing open problem in robotics. One class of traditional motion planning algorithms corresponds to potential-based motion planning. An advantage of potential-based motion planning is composability -- different motion constraints can easily be combined by adding corresponding potentials. However, constructing motion paths from potentials requires solving a global optimization across the configuration-space potential landscape, which is often prone to local minima. We propose a new approach towards learning potential-based motion planning, where we train a neural network to capture and learn easily optimizable potentials over motion planning trajectories. We illustrate the effectiveness of such an approach, significantly outperforming both classical and recent learned motion planning approaches and avoiding issues with local minima. We further illustrate its inherent composability, enabling us to generalize to a multitude of different motion constraints.
Project website at https://energy-based-model.github.io/potential-motion-plan.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunhao Luo;Chen Sun;Joshua B. Tenenbaum;Yilun Du", "authorids": "~Yunhao_Luo1;~Chen_Sun1;~Joshua_B._Tenenbaum1;~Yilun_Du1", "gender": "M;M;;", "homepage": "https://devinluo27.github.io/;https://chensun.me;;https://yilundu.github.io", "dblp": ";01/6072-2;t/JoshuaBTenenbaum;204/4379", "google_scholar": ";vQa7heEAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yunhao_Luo1;~Chen_Sun1;~Joshua_B._Tenenbaum1;~Yilun_Du1", "aff": "Brown University;Google;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "brown.edu;google.com;mit.edu;mit.edu", "position": "MS student;Research Scientist;Professor;PhD student", "bibtex": "@inproceedings{\nluo2024potential,\ntitle={Potential Based Diffusion Motion Planning},\nauthor={Yunhao Luo and Chen Sun and Joshua B. Tenenbaum and Yilun Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Qb68Rs0p9f}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9814168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6157278685220882083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "brown.edu;google.com;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Brown University;Google;Massachusetts Institute of Technology", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.brown.edu;https://www.google.com;https://web.mit.edu", "aff_unique_abbr": "Brown;Google;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scalable Safe Policy Improvement for Factored Multi-Agent MDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34073", "id": "Qc5umSsUi8", "proceeding": "https://proceedings.mlr.press/v235/bianchi24b.html", "pdf": "https://openreview.net/pdf?id=Qc5umSsUi8", "openreview": "https://openreview.net/forum?id=Qc5umSsUi8", "author_site": "Federico Bianchi, Edoardo Zorzi, Alberto Castellini, Thiago Sim\u00e3o, Matthijs T. J. Spaan, Alessandro Farinelli", "tldr": "", "abstract": "In this work, we focus on safe policy improvement in multi-agent domains where current state-of-the-art methods cannot be effectively applied because of large state and action spaces. We consider recent results using Monte Carlo Tree Search for Safe Policy Improvement with Baseline Bootstrapping and propose a novel algorithm that scales this approach to multi-agent domains, exploiting the factorization of the transition model and value function. Given a centralized behavior policy and a dataset of trajectories, our algorithm generates an improved policy by selecting joint actions using a novel extension of Max-Plus (or Variable Elimination) that constrains local actions to guarantee safety criteria. An empirical evaluation on multi-agent SysAdmin and multi-UAV Delivery shows that the approach scales to very large domains where state-of-the-art methods cannot work.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Federico Bianchi;Edoardo Zorzi;Alberto Castellini;Thiago D. Sim\u00e3o;Matthijs T. J. 
Spaan;Alessandro Farinelli", "authorids": "~Federico_Bianchi2;~Edoardo_Zorzi1;~Alberto_Castellini1;~Thiago_D._Sim\u00e3o1;~Matthijs_T._J._Spaan1;~Alessandro_Farinelli1", "gender": ";Not Specified;M;;;M", "homepage": "https://www.di.univr.it/?ent=persona&id=24522;;http://www.di.univr.it/?ent=persona&id=4048&lang=it;https://tdsimao.github.io/;;http://profs.sci.univr.it/~farinelli/", "dblp": "122/8815-2;342/5435;79/6199;229/5739;;f/AlessandroFarinelli", "google_scholar": "ufkZ7JwAAAAJ;;https://scholar.google.it/citations?user=AploBScAAAAJ;uEq4AMUAAAAJ;;https://scholar.google.co.uk/citations?user=KHAIAA8AAAAJ", "orcid": "0000-0001-7773-3032;;0000-0001-8420-0699;;;0000-0002-2592-5814", "linkedin": "https://it.linkedin.com/in/federicobianchi-cs;;;;;alessandro-farinelli/", "or_profile": "~Federico_Bianchi2;~Edoardo_Zorzi1;~Alberto_Castellini1;~Thiago_D._Sim\u00e3o1;~Matthijs_T._J._Spaan1;~Alessandro_Farinelli1", "aff": "University of Verona;University of Verona;University of Verona;Eindhoven University of Technology;;Universit\u00e0 degli Studi di Verona", "aff_domain": "univr.it;univr.it;univr.it;tue.nl;;univr.it", "position": "PhD student;MS student;Assistant Professor;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nbianchi2024scalable,\ntitle={Scalable Safe Policy Improvement for Factored Multi-Agent {MDP}s},\nauthor={Federico Bianchi and Edoardo Zorzi and Alberto Castellini and Thiago D. Sim{\\~a}o and Matthijs T. J. Spaan and Alessandro Farinelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Qc5umSsUi8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1931753, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4524013909979358070&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": "univr.it;univr.it;univr.it;tue.nl;;univr.it", "author_num": 6, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Verona;Eindhoven University of Technology;Universit\u00e0 degli Studi di Verona", "aff_unique_dep": ";;", "aff_unique_url": "https://www.univr.it;https://www.tue.nl;https://www.univr.it", "aff_unique_abbr": "UniVR;TU/e;UniVR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Italy;Netherlands" }, { "title": "Exploring the Complexity of Deep Neural Networks through Functional Equivalence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34072", "id": "QgMqvxvWpX", "proceeding": "https://proceedings.mlr.press/v235/shen24a.html", "pdf": "https://openreview.net/pdf?id=QgMqvxvWpX", "openreview": "https://openreview.net/forum?id=QgMqvxvWpX", "tldr": "", "abstract": "We investigate the complexity of deep neural networks through the lens of functional equivalence, which posits that different parameterizations can yield the same network function. Leveraging the equivalence property, we present a novel bound on the covering number for deep neural networks, which reveals that the complexity of neural networks can be reduced. Additionally, we demonstrate that functional equivalence benefits optimization, as overparameterized networks tend to be easier to train since increasing network width leads to a diminishing volume of the effective parameter space. 
These findings can offer valuable insights into the phenomenon of overparameterization and have implications for understanding generalization and optimization in deep learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guohao Shen", "authorids": "~Guohao_Shen2", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nshen2024exploring,\ntitle={Exploring the Complexity of Deep Neural Networks through Functional Equivalence},\nauthor={Guohao Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QgMqvxvWpX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 556422, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13886801507699770504&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 12, "email": "", "author_num": 1 }, { "title": "Enhancing Storage and Computational Efficiency in Federated Multimodal Learning for Large-Scale Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34071", "id": "QgvBcOsF4B", "proceeding": "https://proceedings.mlr.press/v235/zhang24az.html", "pdf": "https://openreview.net/pdf?id=QgvBcOsF4B", "openreview": "https://openreview.net/forum?id=QgvBcOsF4B", "author_site": "Zixin Zhang, Fan Qi, Changsheng Xu", "tldr": "", "abstract": "The remarkable generalization of large-scale models has recently gained significant attention in multimodal research. However, deploying heterogeneous large-scale models with different modalities under Federated Learning (FL) to protect data privacy imposes tremendous challenges on clients' limited computation and storage. In this work, we propose M$^2$FedSA to address the above issue. We realize modularized decomposition of large-scale models via Split Learning (SL) and only retain privacy-sensitive modules on clients, alleviating storage overhead. By freezing large-scale models and introducing two specialized lightweight adapters, the models can better focus on task-specific knowledge and enhance modality-specific knowledge, improving the model's adaptability to different tasks while balancing efficiency. In addition, M$^2$FedSA further improves performance by transferring multimodal knowledge to unimodal clients at both the feature and decision levels, which leverages the complementarity of different modalities. Extensive experiments on various multimodal classification tasks validate the effectiveness of our proposed M$^2$FedSA. 
The code is made available publicly at https://github.com/M2FedSA/M-2FedSA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixin Zhang;Fan Qi;Changsheng Xu", "authorids": "~Zixin_Zhang5;~Fan_Qi1;~Changsheng_Xu1", "gender": "F;F;M", "homepage": ";;", "dblp": "33/1581-4.html;228/1390.html;85/1301", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;TlNvVKAAAAAJ;https://scholar.google.com.sg/citations?user=hI9NRDkAAAAJ", "orcid": "0009-0001-5522-7152;;", "linkedin": ";;", "or_profile": "~Zixin_Zhang5;~Fan_Qi1;~Changsheng_Xu1", "aff": "Tianjin University of Technology;Tianjin University of Technology;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "tjut.edu.cn;tjut.edu.cn;ia.ac.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024enhancing,\ntitle={Enhancing Storage and Computational Efficiency in Federated Multimodal Learning for Large-Scale Models},\nauthor={Zixin Zhang and Fan Qi and Changsheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QgvBcOsF4B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1743590, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17807880573774881060&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "tjut.edu.cn;tjut.edu.cn;ia.ac.cn", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Tianjin University of Technology;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.tjut.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "TUT;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Unleashing the Power of Meta-tuning for Few-shot Generalization Through Sparse Interpolated Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34070", "id": "QhHMx51ir6", "proceeding": "https://proceedings.mlr.press/v235/chen24ak.html", "pdf": "https://openreview.net/pdf?id=QhHMx51ir6", "openreview": "https://openreview.net/forum?id=QhHMx51ir6", "author_site": "Shengzhuang Chen, Jihoon Tack, Yunqiao Yang, Yee-Whye Teh, Jonathan Richard Schwarz, Ying WEI", "tldr": "", "abstract": "Recent successes suggest that parameter-efficient fine-tuning of foundation models is becoming the state-of-the-art method for transfer learning in vision, gradually replacing the rich literature of alternatives such as meta-learning. In trying to harness the best of both worlds, meta-tuning introduces a subsequent optimization stage of foundation models but has so far only shown limited success and crucially tends to underperform on out-of-distribution (OOD) tasks. In this paper, we introduce Sparse MetA-Tuning (SMAT), a method inspired by sparse mixture-of-experts approaches and trained to isolate subsets of pre-trained parameters automatically for meta-tuning on each task. SMAT successfully overcomes OOD sensitivity and delivers on the promise of enhancing the transfer abilities of vision foundation models beyond parameter-efficient finetuning. We establish new state-of-the-art results on a challenging combination of Meta-Dataset augmented with additional OOD tasks in both zero-shot and gradient-based adaptation settings. 
In addition, we provide a thorough analysis of the superiority of learned over hand-designed sparsity patterns for sparse expert methods and the pivotal importance of the sparsity level in balancing between in-distribution and out-of-distribution generalization. Our code and models are publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengzhuang Chen;Jihoon Tack;Yunqiao Yang;Yee Whye Teh;Jonathan Richard Schwarz;Ying Wei", "authorids": "~Shengzhuang_Chen1;~Jihoon_Tack1;~Yunqiao_Yang1;~Yee_Whye_Teh2;~Jonathan_Richard_Schwarz1;~Ying_Wei1", "gender": "M;M;;F;M;M", "homepage": ";https://jihoontack.github.io;;https://wei-ying.net/;https://jonathan-schwarz.github.io;http://csml.stats.ox.ac.uk/people/teh/", "dblp": ";267/5487;292/2104;14/4899-1;211/7673;88/2483", "google_scholar": "kpKst1UAAAAJ;eW8-OT4AAAAJ;;5UpFdKsAAAAJ;Efs3XxQAAAAJ;https://scholar.google.co.uk/citations?user=y-nUzMwAAAAJ", "orcid": ";;0000-0003-0109-8903;;;", "linkedin": "jerry-chen-45bb15156;;;;schwarzjonathan/;", "or_profile": "~Shengzhuang_Chen1;~Jihoon_Tack1;~Yunqiao_Yang1;~Ying_Wei1;~Jonathan_Schwarz1;~Yee_Whye_Teh1", "aff": "City University of Hong Kong;Meta FAIR;City University of Hong Kong;Nanyang Technological University;Harvard University;University of Oxford", "aff_domain": "cityu.edu.hk;meta.com;cityu.edu.hk;ntu.edu.sg;harvard.edu;ox.ac.uk", "position": "PhD student;Intern;PhD student;Assistant Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nchen2024unleashing,\ntitle={Unleashing the Power of Meta-tuning for Few-shot Generalization Through Sparse Interpolated Experts},\nauthor={Shengzhuang Chen and Jihoon Tack and Yunqiao Yang and Yee Whye Teh and Jonathan Richard Schwarz and Ying Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QhHMx51ir6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 883043, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=704589863360358303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cityu.edu.hk;meta.com;cityu.edu.hk;ntu.edu.sg;harvard.edu;ox.ac.uk", "author_num": 6, "aff_unique_index": "0;1;0;2;3;4", "aff_unique_norm": "City University of Hong Kong;Meta;Nanyang Technological University;Harvard University;University of Oxford", "aff_unique_dep": ";Meta Platforms, Inc.;;;", "aff_unique_url": "https://www.cityu.edu.hk;https://meta.com;https://www.ntu.edu.sg;https://www.harvard.edu;https://www.ox.ac.uk", "aff_unique_abbr": "CityU;Meta;NTU;Harvard;Oxford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;2;1;3", "aff_country_unique": "China;United States;Singapore;United Kingdom" }, { "title": "Naive Bayes Classifiers over Missing Data: Decision and Poisoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34069", "id": "QhKsE7YAJk", "proceeding": "https://proceedings.mlr.press/v235/bian24b.html", "pdf": "https://openreview.net/pdf?id=QhKsE7YAJk", "openreview": "https://openreview.net/forum?id=QhKsE7YAJk", "author_site": "Song Bian, Xiating Ouyang, ZHIWEI FAN, Paris Koutris", "tldr": "", "abstract": "We study the certifiable robustness of ML classifiers on dirty datasets that could contain missing values. 
A test point is certifiably robust for an ML classifier if the classifier returns the same prediction for that test point, regardless of which cleaned version (among exponentially many) of the dirty dataset the classifier is trained on. In this paper, we show theoretically that for Naive Bayes Classifiers (NBC) over dirty datasets with missing values: (i) there exists an efficient polynomial-time algorithm to decide whether multiple input test points are all certifiably robust over a dirty dataset; and (ii) the data poisoning attack, which aims to make all input test points certifiably non-robust by inserting missing cells into the clean dataset, is in polynomial time for single test points but NP-complete for multiple test points. Extensive experiments demonstrate that our algorithms are efficient and outperform existing baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Song Bian;Xiating Ouyang;ZHIWEI FAN;Paraschos Koutris", "authorids": "~Song_Bian4;~Xiating_Ouyang1;~ZHIWEI_FAN1;~Paraschos_Koutris1", "gender": ";M;M;M", "homepage": ";https://pages.cs.wisc.edu/~xouyang/;;http://pages.cs.wisc.edu/~paris/", "dblp": ";;156/3546;", "google_scholar": ";;GAyCZo8AAAAJ;https://scholar.google.com.tw/citations?user=CaO1wNAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Song_Bian4;~Xiating_Ouyang1;~ZHIWEI_FAN1;~Paraschos_Koutris1", "aff": ";University of Wisconsin-Madison;Meta;Department of Computer Science, University of Wisconsin - Madison", "aff_domain": ";cs.wisc.edu;meta.com;cs.wisc.edu", "position": ";PhD student;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nbian2024naive,\ntitle={Naive Bayes Classifiers over Missing Data: Decision and Poisoning},\nauthor={Song Bian and Xiating Ouyang and ZHIWEI FAN and Paraschos Koutris},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QhKsE7YAJk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 594765, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12437858628576967214&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": ";cs.wisc.edu;meta.com;cs.wisc.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Wisconsin-Madison;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.wisc.edu;https://meta.com", "aff_unique_abbr": "UW-Madison;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Score identity Distillation: Exponentially Fast Distillation of Pretrained Diffusion Models for One-Step Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34068", "id": "QhqQJqe0Wq", "proceeding": "https://proceedings.mlr.press/v235/zhou24x.html", "pdf": "https://openreview.net/pdf?id=QhqQJqe0Wq", "openreview": "https://openreview.net/forum?id=QhqQJqe0Wq", "author_site": "Mingyuan Zhou, Huangjie Zheng, Zhendong Wang, Mingzhang Yin, Hai Huang", "tldr": "", "abstract": "We introduce Score identity Distillation (SiD), an innovative data-free method that distills the generative capabilities of pretrained diffusion models into a single-step generator.
SiD not only facilitates an exponentially fast reduction in Fr\u00e9chet inception distance (FID) during distillation but also approaches or even exceeds the FID performance of the original teacher diffusion models. By reformulating forward diffusion processes as semi-implicit distributions, we leverage three score-related identities to create an innovative loss mechanism. This mechanism achieves rapid FID reduction by training the generator using its own synthesized images, eliminating the need for real data or reverse-diffusion-based generation, all accomplished within significantly shortened generation time. Upon evaluation across four benchmark datasets, the SiD algorithm demonstrates high iteration efficiency during distillation and surpasses competing distillation approaches, whether they are one-step or few-step, data-free, or dependent on training data, in terms of generation quality. This achievement redefines the benchmarks for efficiency and effectiveness not only in diffusion distillation but also in the broader field of diffusion-based generation. The PyTorch implementation is available at https://github.com/mingyuanzhou/SiD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingyuan Zhou;Huangjie Zheng;Zhendong Wang;Mingzhang Yin;Hai Huang", "authorids": "~Mingyuan_Zhou1;~Huangjie_Zheng1;~Zhendong_Wang1;~Mingzhang_Yin1;~Hai_Huang5", "gender": "M;M;M;M;M", "homepage": "http://mingyuanzhou.github.io;;https://zhendong-wang.github.io/;http://mingzhang-yin.github.io;https://github.com/haihuang-ml", "dblp": ";192/2170;;200/8662;51/944-11", "google_scholar": "LXwCIisAAAAJ;Vl5wCXsAAAAJ;lRiIjhcAAAAJ;oAEsILEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0003-0508-5034;;0000-0002-5216-2437;", "linkedin": ";;;mingzhang-yin-19930406/;hai-huang-ml/", "or_profile": "~Mingyuan_Zhou1;~Huangjie_Zheng1;~Zhendong_Wang1;~Mingzhang_Yin1;~Hai_Huang5", "aff": "Google;University of Texas, Austin;University of Texas at Austin;University of Florida;Google", "aff_domain": "google.com;utexas.edu;utexas.edu;ufl.edu;google.com", "position": "Researcher;PhD student;PhD student;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nzhou2024score,\ntitle={Score identity Distillation: Exponentially Fast Distillation of Pretrained Diffusion Models for One-Step Generation},\nauthor={Mingyuan Zhou and Huangjie Zheng and Zhendong Wang and Mingzhang Yin and Hai Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QhqQJqe0Wq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7669069, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7072813813159162585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "google.com;utexas.edu;utexas.edu;ufl.edu;google.com", "author_num": 5, "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "Google;University of Texas at Austin;University of Florida", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.utexas.edu;https://www.ufl.edu", "aff_unique_abbr": "Google;UT Austin;UF", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Mountain View;Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online Learning in CMDPs: Handling Stochastic and Adversarial Constraints", "status": "Poster", "track": "main", "site":
"https://icml.cc/virtual/2024/poster/34067", "id": "Qv5szC1zp7", "proceeding": "https://proceedings.mlr.press/v235/stradi24a.html", "pdf": "https://openreview.net/pdf?id=Qv5szC1zp7", "openreview": "https://openreview.net/forum?id=Qv5szC1zp7", "author_site": "Francesco Emanuele Stradi, Jacopo Germano, Gianmarco Genalti, Matteo Castiglioni, Alberto Marchesi, Nicola Gatti", "tldr": "", "abstract": "We study online learning in episodic constrained Markov decision processes (CMDPs), where the learner aims at collecting as much reward as possible over the episodes, while satisfying some long-term constraints during the learning process. Rewards and constraints can be selected either stochastically or adversarially, and the transition function is not known to the learner. While online learning in classical (unconstrained) MDPs has received considerable attention over the last years, the setting of CMDPs is still largely unexplored. This is surprising, since in real-world applications, such as, e.g., autonomous driving, automated bidding, and recommender systems, there are usually additional constraints and specifications that an agent has to obey during the learning process. In this paper, we provide the first best-of-both-worlds algorithm for CMDPs with long-term constraints, in the flavor of Balseiro et al. (2023). Our algorithm is capable of handling settings in which rewards and constraints are selected either stochastically or adversarially, without requiring any knowledge of the underling process. Moreover, our algorithm matches state-of-the-art regret and constraint violation bounds for settings in which constraints are selected stochastically, while it is the first to provide guarantees in the case in which they are chosen adversarially.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesco Emanuele Stradi;Jacopo Germano;Gianmarco Genalti;Matteo Castiglioni;Alberto Marchesi;Nicola Gatti", "authorids": "~Francesco_Emanuele_Stradi1;~Jacopo_Germano1;~Gianmarco_Genalti1;~Matteo_Castiglioni1;~Alberto_Marchesi1;~Nicola_Gatti1", "gender": "M;M;Not Specified;;M;M", "homepage": "https://francescoemanuelestradi.github.io;https://www.deib.polimi.it/eng/people/details/1132094;;https://castiglionimatteo.github.io;https://albymarke.github.io;https://www4.ceda.polimi.it/manifesti/manifesti/controller/ricerche/RicercaPerDocentiPublic.do?k_doc=75785&lang=EN&EVN_PRODOTTI=evento&__pj0=0&__pj1=d918ee8916afbd0005f5c0bc3c0ff350", "dblp": "345/9650;;;225/7720;204/1718;g/NicolaGatti", "google_scholar": "JYdi_FMAAAAJ;;b4UMI8kAAAAJ;https://scholar.google.it/citations?user=NPE3HAYAAAAJ;vXDtCzoAAAAJ;https://scholar.google.com.tw/citations?user=j-HrYREAAAAJ", "orcid": ";;;0000-0002-1070-6766;;0000-0001-7349-3932", "linkedin": "francesco-emanuele-stradi-bb35b0222/;;gianmarco-genalti-26328a1a4/;;;nicola-gatti-1284b21", "or_profile": "~Francesco_Emanuele_Stradi1;~Jacopo_Germano1;~Gianmarco_Genalti1;~Matteo_Castiglioni1;~Alberto_Marchesi1;~Nicola_Gatti1", "aff": "Polytechnic Institute of Milan;Polytechnic Institute of Milan;Polytechnic Institute of Milan;Politecnico di Milano;Politecnico di Milano;Polytechnic Institute of Milan", "aff_domain": "polimi.it;polimi.it;polimi.it;polimi.it;polimi.it;polimi.it", "position": "PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nstradi2024online,\ntitle={Online Learning in {CMDP}s: Handling Stochastic and Adversarial Constraints},\nauthor={Francesco Emanuele Stradi and Jacopo 
Germano and Gianmarco Genalti and Matteo Castiglioni and Alberto Marchesi and Nicola Gatti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Qv5szC1zp7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 427220, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5664595614106347430&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 7, "email": "polimi.it;polimi.it;polimi.it;polimi.it;polimi.it;polimi.it", "author_num": 6, "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Polytechnic Institute of Milan;Politecnico di Milano", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it/;https://www.polimi.it", "aff_unique_abbr": "Politecnico di Milano;Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Enhancing Adversarial Robustness in SNNs with Sparse Gradients", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34066", "id": "QvABoVGdRp", "proceeding": "https://proceedings.mlr.press/v235/liu24f.html", "pdf": "https://openreview.net/pdf?id=QvABoVGdRp", "openreview": "https://openreview.net/forum?id=QvABoVGdRp", "author_site": "Yujia Liu, Tong Bu, Ding Jianhao, Zecheng Hao, Tiejun Huang, Zhaofei Yu", "tldr": "", "abstract": "Spiking Neural Networks (SNNs) have attracted great attention for their energy-efficient operations and biologically inspired structures, offering potential advantages over Artificial Neural Networks (ANNs) in terms of energy efficiency and interpretability. Nonetheless, similar to ANNs, the robustness of SNNs remains a challenge, especially when facing adversarial attacks. Existing techniques, whether adapted from ANNs or specifically designed for SNNs, exhibit limitations in training SNNs or defending against strong attacks. In this paper, we propose a novel approach to enhance the robustness of SNNs through gradient sparsity regularization. We observe that SNNs exhibit greater resilience to random perturbations compared to adversarial perturbations, even at larger scales. Motivated by this, we aim to narrow the gap between SNNs under adversarial and random perturbations, thereby improving their overall robustness. To achieve this, we theoretically prove that this performance gap is upper bounded by the gradient sparsity of the probability associated with the true label with respect to the input image, laying the groundwork for a practical strategy to train robust SNNs by regularizing the gradient sparsity. We validate the effectiveness of our approach through extensive experiments on both image-based and event-based datasets. The results demonstrate notable improvements in the robustness of SNNs.
Our work highlights the importance of gradient sparsity in SNNs and its role in enhancing robustness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujia Liu;Tong Bu;Jianhao Ding;Zecheng Hao;Tiejun Huang;Zhaofei Yu", "authorids": "~Yujia_Liu1;~Tong_Bu1;~Jianhao_Ding1;~Zecheng_Hao1;~Tiejun_Huang1;~Zhaofei_Yu1", "gender": "F;;M;;M;M", "homepage": ";;https://dingjianhao.github.io/;https://hzc1208.github.io/;https://idm.pku.edu.cn/~tjhuang/;https://yuzhaofei.github.io", "dblp": "42/10221.html;;128/2534;339/6969;h/TiejunHuang;166/0573", "google_scholar": "iDyKEuwAAAAJ;;4rDfCSsAAAAJ;txTkX7YAAAAJ;https://scholar.google.com.tw/citations?user=knvEK4AAAAAJ;qaUgD50AAAAJ", "orcid": "0000-0001-7356-3937;;;0000-0001-9074-2857;0000-0002-4234-6099;", "linkedin": ";;;;;", "or_profile": "~Yujia_Liu1;~Tong_Bu1;~Jianhao_Ding1;~Zecheng_Hao1;~Tiejun_Huang1;~Zhaofei_Yu1", "aff": "Peking University;;Institute of Automation, Chinese Academy of Sciences;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;;ia.ac.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Researcher;;Intern;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2024enhancing,\ntitle={Enhancing Adversarial Robustness in {SNN}s with Sparse Gradients},\nauthor={Yujia Liu and Tong Bu and Jianhao Ding and Zecheng Hao and Tiejun Huang and Zhaofei Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QvABoVGdRp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2684635, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10851952377609035097&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "pku.edu.cn;;ia.ac.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Peking University;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.pku.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "Peking U;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Bias-Variance-Covariance Decomposition of Kernel Scores for Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34065", "id": "QwgSOwynxD", "proceeding": "https://proceedings.mlr.press/v235/gruber24a.html", "pdf": "https://openreview.net/pdf?id=QwgSOwynxD", "openreview": "https://openreview.net/forum?id=QwgSOwynxD", "author_site": "Sebastian Gregor Gruber, Florian Buettner", "tldr": "", "abstract": "Generative models, like large language models, are becoming increasingly relevant in our daily lives, yet a theoretical framework to assess their generalization behavior and uncertainty does not exist. Particularly, the problem of uncertainty estimation is commonly solved in an ad-hoc and task-dependent manner. For example, natural language approaches cannot be transferred to image generation. In this paper, we introduce the first bias-variance-covariance decomposition for kernel scores. This decomposition represents a theoretical framework from which we derive a kernel-based variance and entropy for uncertainty estimation. We propose unbiased and consistent estimators for each quantity which only require generated samples but not the underlying model itself. 
Based on the wide applicability of kernels, we demonstrate our framework via generalization and uncertainty experiments for image, audio, and language generation. Specifically, kernel entropy for uncertainty estimation is more predictive of performance on CoQA and TriviaQA question answering datasets than existing baselines and can also be applied to closed-source models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sebastian Gregor Gruber;Florian Buettner", "authorids": "~Sebastian_Gregor_Gruber1;~Florian_Buettner1", "gender": "M;", "homepage": "https://www.kuleuven.be/wieiswie/en/person/00178219;", "dblp": ";245/4220", "google_scholar": "_ThqALUAAAAJ;AaPKbPAAAAAJ", "orcid": "0000-0002-8544-3470;0000-0001-5587-6761", "linkedin": "sebastian-gruber-21b76813b/;", "or_profile": "~Sebastian_Gregor_Gruber1;~Florian_Buettner1", "aff": "Johann Wolfgang Goethe Universit\u00e4t Frankfurt am Main;Deutsches Krebsforschungszentrum", "aff_domain": "uni-frankfurt.de;dkfz.de", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\ngruber2024a,\ntitle={A Bias-Variance-Covariance Decomposition of Kernel Scores for Generative Models},\nauthor={Sebastian Gregor Gruber and Florian Buettner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QwgSOwynxD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1655047, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PWzJqZN1_e4J:scholar.google.com/&scioq=A+Bias-Variance-Covariance+Decomposition+of+Kernel+Scores+for+Generative+Models&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "uni-frankfurt.de;dkfz.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Johann Wolfgang Goethe University Frankfurt am Main;Deutsches Krebsforschungszentrum", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-frankfurt.de;https://www.dkfz.de", "aff_unique_abbr": "JWGU;DKFZ", "aff_campus_unique_index": "0", "aff_campus_unique": "Frankfurt am Main;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "MuxServe: Flexible Spatial-Temporal Multiplexing for Multiple LLM Serving", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34064", "id": "R0SoZvqXyQ", "proceeding": "https://proceedings.mlr.press/v235/duan24a.html", "pdf": "https://openreview.net/pdf?id=R0SoZvqXyQ", "openreview": "https://openreview.net/forum?id=R0SoZvqXyQ", "author_site": "Jiangfei Duan, Runyu Lu, Haojie Duanmu, Xiuhong Li, Xingcheng ZHANG, Dahua Lin, Ion Stoica, Hao Zhang", "tldr": "", "abstract": "Large language models (LLMs) have demonstrated remarkable performance, and organizations are racing to serve LLMs of varying sizes as endpoints for use-cases like chat, programming and search. However, efficiently serving multiple LLMs poses significant challenges for existing approaches due to varying popularity of LLMs. In this paper, we present MuxServe, a flexible spatial-temporal multiplexing system for efficient multiple LLM serving. The key insight behind MuxServe is to colocate LLMs considering their popularity to multiplex memory resources, and leverage the characteristics of prefill and decoding phases to separate and flexibly colocate them to multiplex computation resources.
MuxServe formally formulates the multiplexing problem, and proposes a novel placement algorithm and adaptive batch scheduling strategy to identify optimal colocations and maximize utilization. MuxServe designs a unified resource manager to enable flexible and efficient multiplexing. Evaluation results show that MuxServe can achieve up to $1.8\\times$ higher throughput or process $2.9\\times$ more requests within $99\\%$ SLO attainment. The code is available at: https://github.com/hao-ai-lab/MuxServe.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiangfei Duan;Runyu Lu;Haojie Duanmu;Xiuhong Li;Xingcheng ZHANG;Dahua Lin;Ion Stoica;Hao Zhang", "authorids": "~Jiangfei_Duan1;~Runyu_Lu2;~Haojie_Duanmu1;~Xiuhong_Li1;~Xingcheng_ZHANG2;~Dahua_Lin1;~Ion_Stoica1;~Hao_Zhang2", "gender": "Not Specified;M;M;M;M;M;M;M", "homepage": "https://jf-d.github.io/;https://lry89757.github.io/;https://github.com/cat538;;;http://dahua.site;http://people.eecs.berkeley.edu/~istoica/;https://cseweb.ucsd.edu/~haozhang/", "dblp": "348/8881.html;;;;190/7261;53/6088;s/IonStoica;55/2270-25", "google_scholar": "XC-pUfcAAAAJ;;U4a_Sr8AAAAJ;90eREm0AAAAJ;3L8CsIIAAAAJ;GMzzRRUAAAAJ;vN-is70AAAAJ;H1d4BS8AAAAJ", "orcid": "0000-0002-6327-2033;;;0000-0002-4896-121X;0009-0006-8525-0608;;;", "linkedin": ";;;;xingchengzhang/;;ionstoica;", "or_profile": "~Jiangfei_Duan1;~Runyu_Lu2;~Haojie_Duanmu1;~Xiuhong_Li1;~Xingcheng_ZHANG2;~Dahua_Lin1;~Ion_Stoica1;~Hao_Zhang2", "aff": "The Chinese University of Hong Kong;Huazhong University of Science and Technology;Shanghai Jiaotong University;Peking University;Sensetime;The Chinese University of Hong Kong;University of California, Berkeley;Carnegie Mellon University", "aff_domain": "cuhk.edu.hk;hust.edu.cn;sjtu.edu.cn;pku.edu.cn;sensetime.com;cuhk.edu.hk;berkeley.edu;cmu.edu", "position": "PhD student;Undergrad student;PhD student;Assistant Research Professor;Researcher;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nduan2024muxserve,\ntitle={MuxServe: Flexible Spatial-Temporal Multiplexing for Multiple {LLM} Serving},\nauthor={Jiangfei Duan and Runyu Lu and Haojie Duanmu and Xiuhong Li and Xingcheng ZHANG and Dahua Lin and Ion Stoica and Hao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=R0SoZvqXyQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1395437, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17587407023933410530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cuhk.edu.hk;hust.edu.cn;sjtu.edu.cn;pku.edu.cn;sensetime.com;cuhk.edu.hk;berkeley.edu;cmu.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;4;0;5;6", "aff_unique_norm": "Chinese University of Hong Kong;Huazhong University of Science and Technology;Shanghai Jiao Tong University;Peking University;SenseTime;University of California, Berkeley;Carnegie Mellon University", "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.hust.edu.cn;https://www.sjtu.edu.cn;http://www.pku.edu.cn;https://www.sensetime.com;https://www.berkeley.edu;https://www.cmu.edu", "aff_unique_abbr": "CUHK;HUST;SJTU;Peking U;SenseTime;UC Berkeley;CMU", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Hong Kong SAR;;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Efficient Online
Set-valued Classification with Bandit Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34063", "id": "R1auM3tLPE", "proceeding": "https://proceedings.mlr.press/v235/wang24bg.html", "pdf": "https://openreview.net/pdf?id=R1auM3tLPE", "openreview": "https://openreview.net/forum?id=R1auM3tLPE", "author_site": "Zhou Wang, Xingye Qiao", "tldr": "", "abstract": "Conformal prediction is a distribution-free method that wraps a given machine learning model and returns a set of plausible labels that contain the true label with a prescribed coverage rate. In practice, the empirical coverage achieved highly relies on fully observed label information from data both in the training phase for model fitting and the calibration phase for quantile estimation. This dependency poses a challenge in the context of online learning with bandit feedback, where a learner only has access to the correctness of actions (i.e., the pulled arm) but not the full information of the true label. In particular, when the pulled arm is incorrect, the learner only knows that the pulled one is not the true class label, but does not know which label is true. Additionally, bandit feedback further results in a smaller labeled dataset for calibration, limited to instances with correct actions, thereby affecting the accuracy of quantile estimation. To address these limitations, we propose Bandit Class-specific Conformal Prediction (BCCP), offering coverage guarantees on a class-specific granularity. Using an unbiased estimation of an estimand involving the true label, BCCP trains the model and makes set-valued inferences through stochastic gradient descent. Our approach overcomes the challenges of sparsely labeled data in each iteration and generalizes the reliability and applicability of conformal prediction to online decision-making environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhou Wang;Xingye Qiao", "authorids": "~Zhou_Wang3;~Xingye_Qiao1", "gender": "Not Specified;", "homepage": ";http://people.math.binghamton.edu/qiao/", "dblp": ";21/10859", "google_scholar": ";O8NqeoQAAAAJ", "orcid": ";0000-0003-0937-9822", "linkedin": "zhou-wang-75a269163/;", "or_profile": "~Zhou_Wang3;~Xingye_Qiao1", "aff": "State University of New York at Binghamton;State University of New York at Binghamton", "aff_domain": "binghamton.edu;binghamton.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwang2024efficient,\ntitle={Efficient Online Set-valued Classification with Bandit Feedback},\nauthor={Zhou Wang and Xingye Qiao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=R1auM3tLPE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8602788, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6702433891937353563&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "binghamton.edu;binghamton.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "State University of New York at Binghamton", "aff_unique_dep": "", "aff_unique_url": "https://www.binghamton.edu", "aff_unique_abbr": "SUNY Binghamton", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Binghamton", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "MMT-Bench: A Comprehensive Multimodal Benchmark for Evaluating Large Vision-Language Models
Towards Multitask AGI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34062", "id": "R4Ng8zYaiz", "proceeding": "https://proceedings.mlr.press/v235/ying24a.html", "pdf": "https://openreview.net/pdf?id=R4Ng8zYaiz", "openreview": "https://openreview.net/forum?id=R4Ng8zYaiz", "author_site": "Kaining Ying, Fanqing Meng, Jin Wang, Zhiqian Li, Han Lin, Yue Yang, Hao Zhang, Wenbo Zhang, Yuqi Lin, Shuo Liu, jiayi lei, Quanfeng Lu, Runjian Chen, Peng Xu, Renrui Zhang, Haozhe Zhang, Peng Gao, Yali Wang, Yu Qiao, Ping Luo, Kaipeng Zhang, WENQI SHAO", "tldr": "", "abstract": "Large Vision-Language Models (LVLMs) show significant strides in general-purpose multimodal applications such as visual dialogue and embodied navigation. However, existing multimodal evaluation benchmarks cover a limited number of multimodal tasks testing rudimentary capabilities, falling short in tracking LVLM development. In this study, we present MMT-Bench, a comprehensive benchmark designed to assess LVLMs across massive multimodal tasks requiring expert knowledge and deliberate visual recognition, localization, and reasoning. MMT-Bench comprises $31,325$ meticulously curated multi-choice visual questions from various multimodal scenarios such as vehicle driving and embodied navigation, covering $32$ core meta-tasks and $162$ subtasks in multimodal understanding. Due to its extensive task coverage, MMT-Bench enables the evaluation of LVLMs using a task map, facilitating the discovery of in- and out-of-domain tasks. Evaluation results involving $20$ publicly available LVLMs, such as the proprietary GeminiProVision model, underscore the significant challenges posed by MMT-Bench. We anticipate that MMT-Bench will inspire the community to develop next-generation multimodal foundation models aimed at achieving general-purpose multimodal intelligence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaining Ying;Fanqing Meng;Jin Wang;Zhiqian Li;Han Lin;Yue Yang;Hao Zhang;Wenbo Zhang;Yuqi Lin;Shuo Liu;jiayi lei;Quanfeng Lu;Runjian Chen;Peng Xu;Renrui Zhang;Haozhe Zhang;Peng Gao;Yali Wang;Yu Qiao;Ping Luo;Kaipeng Zhang;Wenqi Shao", "authorids": "~Kaining_Ying2;~Fanqing_Meng1;~Jin_Wang16;~Zhiqian_Li1;~Han_Lin3;~Yue_Yang6;~Hao_Zhang56;~Wenbo_Zhang9;~Yuqi_Lin1;~Shuo_Liu5;~jiayi_lei1;~Quanfeng_Lu1;~Runjian_Chen1;~Peng_Xu11;~Renrui_Zhang1;~Haozhe_Zhang4;~Peng_Gao3;~Yali_Wang1;~Yu_Qiao1;~Ping_Luo2;~Kaipeng_Zhang1;~Wenqi_Shao2", "gender": "M;M;M;F;F;;M;M;M;F;F;M;M;M;M;M;;M;;;M;M", "homepage": "https://github.com/Morgott-The-Omen-King;https://github.com/FanqingM;https://jinjinw.com;;https://github.com/linhan12;;;https://zwbx.github.io;http://wiki.zjulearning.org.cn/wiki/User:Linyuqi;;;https://lqf-hfnju.github.io/;https://runjian-chen.github.io;;;https://github.com/haozhezju;;;;;http://kpzhang93.github.io/;https://wqshao126.github.io/", "dblp": "291/9018;;;;;;;;117/7752;07/6773;;366/3877;257/4647;;244/1748;218/5378-2;;01/773-1;;;179/2126;227/3122", "google_scholar": "MDvaeqUAAAAJ;iUIC-JEAAAAJ;https://scholar.google.com/citations?hl=en;;;;hwZUvY0AAAAJ;A-qS5eYAAAAJ;5-jDh48AAAAJ;https://scholar.google.com.tw/citations?hl=zh-CN;7VD1YLMAAAAJ;NlBS3nMAAAAJ;_USUMdAAAAAJ;;YlL3xN4AAAAJ;;;https://scholar.google.com/citations?hl=en;;;4OqZBmYAAAAJ;Bs9mrwwAAAAJ", "orcid": ";0000-0002-0920-3539;0000-0002-0533-4523;;;;0000-0002-3572-7053;;;;;;0000-0003-0519-496X;;;0009-0003-5603-0727;;;;;;", "linkedin":
";;;zhiqian-li-a19727205/;;;;;;;;;;https://www.linkedin.cn/incareer/in/peng-xu-250466206;;;;;;;;", "or_profile": "~Kaining_Ying2;~Fanqing_Meng1;~Jin_Wang16;~Zhiqian_Li1;~Han_Lin3;~Yue_Yang6;~Hao_Zhang56;~Wenbo_Zhang9;~Yuqi_Lin1;~Shuo_Liu5;~jiayi_lei1;~Quanfeng_Lu1;~Runjian_Chen1;~Peng_Xu11;~Renrui_Zhang1;~Haozhe_Zhang4;~Peng_Gao3;~Yali_Wang1;~Yu_Qiao1;~Ping_Luo2;~Kaipeng_Zhang1;~Wenqi_Shao2", "aff": "Zhejiang University of Technology;Shanghai Jiaotong University;The University of Hong Kong;University of Hong Kong;Shanghai Jiaotong University;;Xi'an Jiaotong University;University of Adelaide;Zhejiang University;Shanghai AI lab;Shanghai Jiaotong University;Nanjing university;University of Hong Kong;University of Hong Kong;MMLab of CUHK & Shanghai AI Laboratory;Zhejiang University;;SIAT, Chinese Academy of Sciences;;;Shanghai AI Laboratory;Shanghai AI Laboratory", "aff_domain": "zjut.edu.cn;sjtu.edu.cn;connect.hku.hk;hku.hk;sjtu.edu.cn;;xjtu.edu.cn;adelaide.edu.au;zju.edu.cn;pjlab.org;sjtu.edu;nju.edu.cn;hku.hk;hku.hk;pjlab.org.cn;zju.edu.cn;;siat.ac.cn;;;pjlab.org.cn;pjlab.org.cn", "position": "MS student;PhD student;PhD student;Undergrad student;PhD student;;PhD student;PhD student;PhD student;Researcher;Undergrad student;Undergrad student;PhD student;PhD student;PhD student;MS student;;Full Professor;;;Researcher;Researcher", "bibtex": "@inproceedings{\nying2024mmtbench,\ntitle={{MMT}-Bench: A Comprehensive Multimodal Benchmark for Evaluating Large Vision-Language Models Towards Multitask {AGI}},\nauthor={Kaining Ying and Fanqing Meng and Jin Wang and Zhiqian Li and Han Lin and Yue Yang and Hao Zhang and Wenbo Zhang and Yuqi Lin and Shuo Liu and jiayi lei and Quanfeng Lu and Runjian Chen and Peng Xu and Renrui Zhang and Haozhe Zhang and Peng Gao and Yali Wang and Yu Qiao and Ping Luo and Kaipeng Zhang and Wenqi Shao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=R4Ng8zYaiz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9775285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 22, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1678940697530371926&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "zjut.edu.cn;sjtu.edu.cn;connect.hku.hk;hku.hk;sjtu.edu.cn;;xjtu.edu.cn;adelaide.edu.au;zju.edu.cn;pjlab.org;sjtu.edu;nju.edu.cn;hku.hk;hku.hk;pjlab.org.cn;zju.edu.cn;;siat.ac.cn;;;pjlab.org.cn;pjlab.org.cn", "author_num": 22, "aff_unique_index": "0;1;2;2;1;3;4;5;6;1;7;2;2;8;5;9;10;10", "aff_unique_norm": "Zhejiang University of Technology;Shanghai Jiao Tong University;University of Hong Kong;Xi'an Jiao Tong University;University of Adelaide;Zhejiang University;Shanghai AI Lab;Nanjing University;Chinese University of Hong Kong;Shenzhen Institute of Advanced Technology;Shanghai AI Laboratory", "aff_unique_dep": ";;;;;;AI Research;;MMLab;;", "aff_unique_url": "https://www.zjut.edu.cn;https://www.sjtu.edu.cn;https://www.hku.hk;https://www.xjtu.edu.cn;https://www.adelaide.edu.au;https://www.zju.edu.cn;https://www.shanghaiailab.com;https://www.nju.edu.cn;https://www.cuhk.edu.hk;http://www.siat.ac.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "ZJUT;SJTU;HKU;XJTU;Adelaide;ZJU;Shanghai AI Lab;Nanjing U;CUHK;SIAT;SAIL", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;1;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China;Australia" }, { "title": "Target Networks and 
Over-parameterization Stabilize Off-policy Bootstrapping with Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34061", "id": "R6GT1UDcOW", "proceeding": "https://proceedings.mlr.press/v235/che24a.html", "pdf": "https://openreview.net/pdf?id=R6GT1UDcOW", "openreview": "https://openreview.net/forum?id=R6GT1UDcOW", "author_site": "Fengdi Che, Chenjun Xiao, Jincheng Mei, Bo Dai, Ramki Gummadi, Oscar Ramirez, Christopher Harris, Rupam Mahmood, Dale Schuurmans", "tldr": "", "abstract": "We prove that the combination of a target network and over-parameterized linear function approximation establishes a weaker convergence condition for bootstrapped value estimation in certain cases, even with off-policy data. Our condition is naturally satisfied for expected updates over the entire state-action space or learning with a batch of complete trajectories from episodic Markov decision processes. Notably, using only a target network or an over-parameterized model does not provide such a convergence guarantee. Additionally, we extend our results to learning with truncated trajectories, showing that convergence is achievable for all tasks with minor modifications, akin to value truncation for the final states in trajectories. Our primary result focuses on temporal difference estimation for prediction, providing high-probability value estimation error bounds and empirical analysis on Baird's counterexample and a Four-room task. Furthermore, we explore the control setting, demonstrating that similar convergence conditions apply to Q-learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fengdi Che;Chenjun Xiao;Jincheng Mei;Bo Dai;Ramki Gummadi;Oscar A Ramirez;Christopher K Harris;A. Rupam Mahmood;Dale Schuurmans", "authorids": "~Fengdi_Che1;~Chenjun_Xiao1;~Jincheng_Mei1;~Bo_Dai1;~Ramki_Gummadi1;~Oscar_A_Ramirez1;~Christopher_K_Harris1;~A._Rupam_Mahmood1;~Dale_Schuurmans1", "gender": "F;;M;;;M;;;", "homepage": "https://github.com/FengdiC;https://chenjun-x.github.io/;https://jinchengmei.github.io;https://bo-dai.github.io/;;;;;", "dblp": ";178/8641;149/1408;64/2903;https://dblp.org/pers/hd/g/Gummadi:Ramki;145/7596;;;", "google_scholar": ";;;TIKl_foAAAAJ;2P8IbqoAAAAJ;LLnrH8IAAAAJ;;;", "orcid": ";0000-0002-5493-1500;;0009-0002-8070-574X;;;;;", "linkedin": ";;;;;oscar-ramirez-905913b9;;;", "or_profile": "~Fengdi_Che1;~Chenjun_Xiao1;~Jincheng_Mei1;~Bo_Dai1;~Ramki_Gummadi1;~Oscar_A_Ramirez1;~Christopher_K_Harris1;~A._Rupam_Mahmood1;~Dale_Schuurmans1", "aff": "University of Alberta;Huawei Technologies Ltd.;Google DeepMind;Google Brain;Google DeepMind;Google;;;", "aff_domain": "ualberta.ca;huawei.com;google.com;google.com;google.com;google.com;;;", "position": "PhD student;Researcher;Research Scientist;Research Scientist;Staff Research Scientist;Researcher;;;", "bibtex": "@inproceedings{\nche2024target,\ntitle={Target Networks and Over-parameterization Stabilize Off-policy Bootstrapping with Function Approximation},\nauthor={Fengdi Che and Chenjun Xiao and Jincheng Mei and Bo Dai and Ramki Gummadi and Oscar A Ramirez and Christopher K Harris and A. 
Rupam Mahmood and Dale Schuurmans},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=R6GT1UDcOW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5085817, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3260515796616757810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ualberta.ca;huawei.com;google.com;google.com;google.com;google.com;;;", "author_num": 9, "aff_unique_index": "0;1;2;2;2;2", "aff_unique_norm": "University of Alberta;Huawei;Google", "aff_unique_dep": ";Huawei Technologies;Google DeepMind", "aff_unique_url": "https://www.ualberta.ca;https://www.huawei.com;https://deepmind.com", "aff_unique_abbr": "UAlberta;Huawei;DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2;3;2;3", "aff_country_unique": "Canada;China;United Kingdom;United States" }, { "title": "OMPO: A Unified Framework for RL under Policy and Dynamics Shifts", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34060", "id": "R83VIZtHXA", "proceeding": "https://proceedings.mlr.press/v235/luo24d.html", "pdf": "https://openreview.net/pdf?id=R83VIZtHXA", "openreview": "https://openreview.net/forum?id=R83VIZtHXA", "author_site": "Yu Luo, Tianying Ji, Fuchun Sun, Jianwei Zhang, Huazhe Xu, Xianyuan Zhan", "tldr": "", "abstract": "Training reinforcement learning policies using environment interaction data collected from varying policies or dynamics presents a fundamental challenge. Existing works often overlook the distribution discrepancies induced by policy or dynamics shifts, or rely on specialized algorithms with task priors, thus often resulting in suboptimal policy performance and high learning variance. In this paper, we identify a unified strategy for online RL policy learning under diverse settings of policy and dynamics shifts: transition occupancy matching. In light of this, we introduce a surrogate policy learning objective by considering the transition occupancy discrepancies and then cast it into a tractable min-max optimization problem through dual reformulation. Our method, dubbed Occupancy-Matching Policy Optimization (OMPO), features a specialized actor-critic structure equipped with a distribution discriminator and a small-size local buffer. We conduct extensive experiments based on the OpenAI Gym, Meta-World, and Panda Robots environments, encompassing policy shifts under stationary and non-stationary dynamics, as well as domain adaptation. The results demonstrate that OMPO outperforms the specialized baselines from different categories in all settings. 
We also find that OMPO exhibits particularly strong performance when combined with domain randomization, highlighting its potential in RL-based robotics applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Luo;Tianying Ji;Fuchun Sun;Jianwei Zhang;Huazhe Xu;Xianyuan Zhan", "authorids": "~Yu_Luo5;~Tianying_Ji2;~Fuchun_Sun1;~Jianwei_Zhang2;~Huazhe_Xu1;~Xianyuan_Zhan1", "gender": "M;F;M;M;M;M", "homepage": ";;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;https://tams.informatik.uni-hamburg.de/people/zhang/;http://hxu.rocks;http://zhanxianyuan.xyz/", "dblp": ";124/2199.html;;z/JianweiZhang1;164/9006;181/5081", "google_scholar": "https://scholar.google.com.hk/citations?user=KQjoQOMAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;;t9HPFawAAAAJ;pDMnGloAAAAJ", "orcid": "0000-0001-6229-4639;;;;;0000-0002-3683-0554", "linkedin": ";;;;;", "or_profile": "~Yu_Luo5;~Tianying_Ji2;~Fuchun_Sun1;~Jianwei_Zhang2;~Huazhe_Xu1;~Xianyuan_Zhan1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Universit\u00e4t Hamburg;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;uni-hamburg.de;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Full Professor;Full Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nluo2024ompo,\ntitle={{OMPO}: A Unified Framework for {RL} under Policy and Dynamics Shifts},\nauthor={Yu Luo and Tianying Ji and Fuchun Sun and Jianwei Zhang and Huazhe Xu and Xianyuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=R83VIZtHXA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7958114, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1441603951177409073&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;tsinghua.edu.cn;cs.tsinghua.edu.cn;uni-hamburg.de;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Tsinghua University;University of Hamburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.uni-hamburg.de", "aff_unique_abbr": "THU;UHH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Germany" }, { "title": "ODIM: Outlier Detection via Likelihood of Under-Fitted Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34059", "id": "R8nbccD7kv", "proceeding": "https://proceedings.mlr.press/v235/kim24h.html", "pdf": "https://openreview.net/pdf?id=R8nbccD7kv", "openreview": "https://openreview.net/forum?id=R8nbccD7kv", "author_site": "Dongha Kim, Jaesung Hwang, Jongjin Lee, Kunwoong Kim, Yongdai Kim", "tldr": "", "abstract": "The unsupervised outlier detection (UOD) problem refers to the task of identifying inliers given training data that contain outliers as well as inliers, without any labeled information about inliers and outliers. It has been widely recognized that using fully-trained likelihood-based deep generative models (DGMs) often results in poor performance in distinguishing inliers from outliers. In this study, we claim that the likelihood itself could serve as powerful evidence for identifying inliers in UOD tasks, provided that DGMs are carefully under-fitted. 
Our approach begins with a novel observation called the inlier-memorization (IM) effect--when training a deep generative model with data including outliers, the model initially memorizes inliers before outliers. Based on this finding, we develop a new method called outlier detection via the IM effect (ODIM). Remarkably, ODIM requires only a few updates, making it computationally efficient--at least tens of times faster than other deep-learning-based algorithms. ODIM also filters out outliers effectively, regardless of the data type, including tabular, image, and text data. To validate the superiority and efficiency of our method, we provide extensive empirical analyses on close to 60 datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongha Kim;JaesungHwang;Jongjin Lee;Kunwoong Kim;Yongdai Kim", "authorids": "~Dongha_Kim1;~JaesungHwang1;~Jongjin_Lee1;~Kunwoong_Kim1;~Yongdai_Kim1", "gender": "M;M;M;M;M", "homepage": "https://sites.google.com/view/istat-lab;;;;", "dblp": ";;;93/734;296/1715", "google_scholar": ";;https://scholar.google.com/citations?view_op=list_works;;", "orcid": ";;;;", "linkedin": ";jaesung-hwang-b2955613b/;;;", "or_profile": "~Dongha_Kim1;~JaesungHwang1;~Jongjin_Lee1;~Yongdai_Kim1;~Kun_woong_Kim1", "aff": "Sungshin Women's University;SKTelecom;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "sungshin.ac.kr;sk.com;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "Assistant Professor;Researcher;PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\nkim2024odim,\ntitle={{ODIM}: Outlier Detection via Likelihood of Under-Fitted Generative Models},\nauthor={Dongha Kim and JaesungHwang and Jongjin Lee and Kunwoong Kim and Yongdai Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=R8nbccD7kv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 814159, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15381670644327699987&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sungshin.ac.kr;sk.com;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Sungshin Women's University;SK Telecom;Seoul National University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sungshin.ac.kr;https://www.sktelecom.com;https://www.snu.ac.kr", "aff_unique_abbr": "SWU;SKT;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Estimating Distributional Treatment Effects in Randomized Experiments: Machine Learning for Variance Reduction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34058", "id": "RDofzHLuX4", "proceeding": "https://proceedings.mlr.press/v235/byambadalai24a.html", "pdf": "https://openreview.net/pdf?id=RDofzHLuX4", "openreview": "https://openreview.net/forum?id=RDofzHLuX4", "author_site": "Undral Byambadalai, Tatsushi Oka, Shota Yasui", "tldr": "", "abstract": "We propose a novel regression adjustment method designed for estimating distributional treatment effect parameters in randomized experiments. Randomized experiments have been extensively used to estimate treatment effects in various scientific fields. 
However, to gain deeper insights, it is essential to estimate distributional treatment effects rather than relying solely on average effects. Our approach incorporates pre-treatment covariates into a distributional regression framework, utilizing machine learning techniques to improve the precision of distributional treatment effect estimators. The proposed approach can be readily implemented with off-the-shelf machine learning methods and remains valid as long as the nuisance components are reasonably well estimated. Also, we establish the asymptotic properties of the proposed estimator and present a uniformly valid inference method. Through simulation results and real data analysis, we demonstrate the effectiveness of integrating machine learning techniques in reducing the variance of distributional treatment effect estimators in finite samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Undral Byambadalai;Tatsushi Oka;Shota Yasui", "authorids": "~Undral_Byambadalai1;~Tatsushi_Oka1;~Shota_Yasui1", "gender": "F;;M", "homepage": "https://undara.github.io/;;https://yasui-salmon.github.io/", "dblp": "334/2219;;227/2734", "google_scholar": "Y1ghEW4AAAAJ;;https://scholar.google.co.jp/citations?user=47E8oVcAAAAJ", "orcid": "0000-0002-5794-4745;;", "linkedin": "undralbyambadalai/;;", "or_profile": "~Undral_Byambadalai1;~Tatsushi_Oka1;~Shota_Yasui1", "aff": "CyberAgent, Inc.;;", "aff_domain": "cyberagent.co.jp;;", "position": "Researcher;;", "bibtex": "@inproceedings{\nbyambadalai2024estimating,\ntitle={Estimating Distributional Treatment Effects in Randomized Experiments: Machine Learning for Variance Reduction},\nauthor={Undral Byambadalai and Tatsushi Oka and Shota Yasui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RDofzHLuX4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 654805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:K6HE8YyyTcQJ:scholar.google.com/&scioq=Estimating+Distributional+Treatment+Effects+in+Randomized+Experiments:+Machine+Learning+for+Variance+Reduction&hl=en&as_sdt=0,44", "gs_version_total": 11, "email": "cyberagent.co.jp;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "CyberAgent", "aff_unique_dep": "", "aff_unique_url": "https://www.cyberagent.co.jp", "aff_unique_abbr": "CyberAgent", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "$f$-Divergence Based Classification: Beyond the Use of Cross-Entropy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34057", "id": "RFhkcqRmTD", "proceeding": "https://proceedings.mlr.press/v235/novello24a.html", "pdf": "https://openreview.net/pdf?id=RFhkcqRmTD", "openreview": "https://openreview.net/forum?id=RFhkcqRmTD", "author_site": "Nicola Novello, Andrea Tonello", "tldr": "", "abstract": "In deep learning, classification tasks are formalized as optimization problems often solved via the minimization of the cross-entropy. However, recent advancements in the design of objective functions allow the usage of the $f$-divergence to generalize the formulation of the optimization problem for classification. We adopt a Bayesian perspective and formulate the classification task as a maximum a posteriori probability problem. We propose a class of objective functions based on the variational representation of the $f$-divergence. 
Furthermore, driven by the challenge of improving the state-of-the-art approach, we propose a bottom-up method that leads us to the formulation of an objective function corresponding to a novel $f$-divergence referred to as shifted log (SL). We theoretically analyze the objective functions proposed and numerically test them in three application scenarios: toy examples, image datasets, and signal detection/decoding problems. The analyzed scenarios demonstrate the effectiveness of the proposed approach and that the SL divergence achieves the highest classification accuracy in almost all the considered cases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicola Novello;Andrea M Tonello", "authorids": "~Nicola_Novello1;~Andrea_M_Tonello1", "gender": ";M", "homepage": ";http://www.andreatonello.com", "dblp": ";191/4511", "google_scholar": ";https://scholar.google.de/citations?user=qBiseEsAAAAJ", "orcid": ";0000-0002-9873-2407", "linkedin": ";", "or_profile": "~Nicola_Novello1;~Andrea_M_Tonello1", "aff": ";Alpen-Adria Universit\u00e4t Klagenfurt", "aff_domain": ";aau.at", "position": ";Full Professor", "bibtex": "@inproceedings{\nnovello2024fdivergence,\ntitle={\\$f\\$-Divergence Based Classification: Beyond the Use of Cross-Entropy},\nauthor={Nicola Novello and Andrea M Tonello},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RFhkcqRmTD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2415474, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6023505377106459260&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";aau.at", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Alpen-Adria-Universit\u00e4t Klagenfurt", "aff_unique_dep": "", "aff_unique_url": "https://www.aau.at", "aff_unique_abbr": "AAU", "aff_country_unique_index": "0", "aff_country_unique": "Austria" }, { "title": "Asymptotics of Learning with Deep Structured (Random) Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34056", "id": "RI4GA8amUI", "proceeding": "https://proceedings.mlr.press/v235/schroder24a.html", "pdf": "https://openreview.net/pdf?id=RI4GA8amUI", "openreview": "https://openreview.net/forum?id=RI4GA8amUI", "author_site": "Dominik Schr\u00f6der, Daniil Dmitriev, Hugo Cui, Bruno Loureiro", "tldr": "", "abstract": "For a large class of feature maps we provide a tight asymptotic characterisation of the test error associated with learning the readout layer, in the high-dimensional limit where the input dimension, hidden layer widths, and number of training samples are proportionally large. This characterization is formulated in terms of the population covariance of the features. Our work is partially motivated by the problem of learning with Gaussian rainbow neural networks, namely deep non-linear fully-connected networks with random but structured weights, whose row-wise covariances are further allowed to depend on the weights of previous layers. For such networks we also derive a closed-form formula for the feature covariance in terms of the weight matrices. 
We further find that in some cases our results can capture feature maps learned by deep, finite-width neural networks trained under gradient descent.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dominik Schr\u00f6der;Daniil Dmitriev;Hugo Cui;Bruno Loureiro", "authorids": "~Dominik_Schr\u00f6der1;~Daniil_Dmitriev2;~Hugo_Cui1;~Bruno_Loureiro1", "gender": "M;M;;M", "homepage": "https://n.ethz.ch/~dschroeder;;;https://brloureiro.github.io/", "dblp": ";;;207/1834", "google_scholar": "u3ilHrcAAAAJ;3_4gF8wAAAAJ;;DXl3ir8AAAAJ", "orcid": "0000-0002-2904-1856;;;0000-0002-6327-4688", "linkedin": ";;;bruno-loureiro-43183b14a/", "or_profile": "~Dominik_Schr\u00f6der1;~Daniil_Dmitriev2;~Hugo_Cui1;~Bruno_Loureiro1", "aff": "Swiss Federal Institute of Technology;ETHZ - ETH Zurich;;Ecole Normale Sup\u00e9rieure, Ecole Normale Sup\u00e9rieure de Paris", "aff_domain": "ethz.ch;ethz.ch;;di.ens.fr", "position": "Postdoc;PhD student;;Researcher", "bibtex": "@inproceedings{\nschr{\\\"o}der2024asymptotics,\ntitle={Asymptotics of Learning with Deep Structured (Random) Features},\nauthor={Dominik Schr{\\\"o}der and Daniil Dmitriev and Hugo Cui and Bruno Loureiro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RI4GA8amUI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1180008, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5291235798671266038&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ethz.ch;ethz.ch;;di.ens.fr", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich;Ecole Normale Sup\u00e9rieure de Paris", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch;https://www.ens.psl.eu", "aff_unique_abbr": "ETH Zurich;ETHZ;ENS Paris", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;France" }, { "title": "Understanding Retrieval-Augmented Task Adaptation for Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34055", "id": "RIMRKeeVsr", "proceeding": "https://proceedings.mlr.press/v235/ming24a.html", "pdf": "https://openreview.net/pdf?id=RIMRKeeVsr", "openreview": "https://openreview.net/forum?id=RIMRKeeVsr", "author_site": "Yifei Ming, Sharon Li", "tldr": "", "abstract": "Pre-trained contrastive vision-language models have demonstrated remarkable performance across a wide range of tasks. However, they often struggle on fine-grained datasets with categories not adequately represented during pre-training, which makes adaptation necessary. Recent works have shown promising results by utilizing samples from web-scale databases for retrieval-augmented adaptation, especially in low-data regimes. Despite the empirical success, understanding how retrieval impacts the adaptation of vision-language models remains an open research question. In this work, we adopt a reflective perspective by presenting a systematic study to understand the roles of key components in retrieval-augmented adaptation. We unveil new insights on uni-modal and cross-modal retrieval and highlight the critical role of logit ensemble for effective adaptation. 
We further present theoretical underpinnings that directly support our empirical observations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifei Ming;Yixuan Li", "authorids": "~Yifei_Ming1;~Yixuan_Li1", "gender": "M;F", "homepage": "https://alvinmingsf.github.io/;http://pages.cs.wisc.edu/~sharonli/", "dblp": "277/4125;144/6087-1", "google_scholar": "Dh_4cyQAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";liyixuan", "or_profile": "~Yifei_Ming1;~Yixuan_Li1", "aff": "University of Wisconsin - Madison;Cornell University", "aff_domain": "wisc.edu;cornell.edu", "position": "PhD student;Graduate Student", "bibtex": "@inproceedings{\nming2024understanding,\ntitle={Understanding Retrieval-Augmented Task Adaptation for Vision-Language Models},\nauthor={Yifei Ming and Yixuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RIMRKeeVsr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6907143, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17454894719995451683&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "wisc.edu;cornell.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.cornell.edu", "aff_unique_abbr": "UW-Madison;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Et Tu Certifications: Robustness Certificates Yield Better Adversarial Examples", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34054", "id": "RKlmOBFwAh", "proceeding": "https://proceedings.mlr.press/v235/cullen24a.html", "pdf": "https://openreview.net/pdf?id=RKlmOBFwAh", "openreview": "https://openreview.net/forum?id=RKlmOBFwAh", "author_site": "Andrew C. Cullen, Shijie Liu, Paul Montague, Sarah Erfani, Benjamin Rubinstein", "tldr": "", "abstract": "In guaranteeing the absence of adversarial examples in an instance's neighbourhood, certification mechanisms play an important role in demonstrating neural net robustness. In this paper, we ask whether these certifications can compromise the very models they help to protect. Our new *Certification Aware Attack* exploits certifications to produce computationally efficient norm-minimising adversarial examples $74$% more often than comparable attacks, while reducing the median perturbation norm by more than $10$%. While these attacks can be used to assess the tightness of certification bounds, they also highlight that releasing certifications can paradoxically reduce security.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Craig Cullen;Shijie Liu;Paul Montague;Sarah Monazam Erfani;Benjamin I. P. 
Rubinstein", "authorids": "~Andrew_Craig_Cullen1;~Shijie_Liu4;~Paul_Montague1;~Sarah_Monazam_Erfani1;~Benjamin_I._P._Rubinstein1", "gender": "M;M;M;;M", "homepage": "https://www.andrewcraigcullen.com;https://github.com/shijiel2;;https://people.eng.unimelb.edu.au/smonazam/;http://www.bipr.net/", "dblp": "238/6828;;50/805;136/0170;90/1092", "google_scholar": "BeXBviIAAAAJ;https://scholar.google.com.au/citations?user=lH5nxwMAAAAJ;;https://scholar.google.com.au/citations?user=Jq9ocx4AAAAJ;https://scholar.google.com.au/citations?user=hMG_gR4AAAAJ", "orcid": "0000-0001-8243-6470;0009-0008-2980-6266;0000-0001-9461-7471;;0000-0002-2947-6980", "linkedin": ";;;;benjaminrubinstein/", "or_profile": "~Andrew_Craig_Cullen1;~Shijie_Liu4;~Paul_Montague1;~Sarah_Monazam_Erfani1;~Benjamin_I._P._Rubinstein1", "aff": "The University of Melbourne;The University of Melbourne;Defence Science and Technology Group;The University of Melbourne;The University of Melbourne", "aff_domain": "unimelb.edu.au;unimelb.edu.au;dst.defence.gov.au;unimelb.edu.au;unimelb.edu.au", "position": "Postdoc;PhD student;Researcher;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\ncullen2024et,\ntitle={Et Tu Certifications: Robustness Certificates Yield Better Adversarial Examples},\nauthor={Andrew Craig Cullen and Shijie Liu and Paul Montague and Sarah Monazam Erfani and Benjamin I. P. Rubinstein},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RKlmOBFwAh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1269402, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1878612105809938428&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "unimelb.edu.au;unimelb.edu.au;dst.defence.gov.au;unimelb.edu.au;unimelb.edu.au", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Melbourne;Defence Science and Technology Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.unimelb.edu.au;https://www.dst.defence.gov.au/", "aff_unique_abbr": "UniMelb;DST Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Mitigating Oversmoothing Through Reverse Process of GNNs for Heterophilic Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34053", "id": "RLA4JTckXe", "proceeding": "https://proceedings.mlr.press/v235/park24d.html", "pdf": "https://openreview.net/pdf?id=RLA4JTckXe", "openreview": "https://openreview.net/forum?id=RLA4JTckXe", "author_site": "MoonJeong Park, Jaeseung Heo, Dongwoo Kim", "tldr": "", "abstract": "Graph Neural Networks (GNNs) resemble a diffusion process, leading to over-smoothing of learned representations as many layers are stacked. Hence, the reverse process of message passing can produce distinguishable node representations by inverting the forward message propagation. The distinguishable representations can help us better classify neighboring nodes with different labels, as in heterophilic graphs. In this work, we apply the design principle of the reverse process to three variants of GNNs. 
Through experiments on heterophilic graph data, where adjacent nodes need to have different representations for successful classification, we show that the reverse process significantly improves prediction performance in many cases. Additional analysis reveals that the reverse mechanism can mitigate over-smoothing across hundreds of layers. Our code is available at https://github.com/ml-postech/reverse-gnn.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "MoonJeong Park;Jaeseung Heo;Dongwoo Kim", "authorids": "~MoonJeong_Park1;~Jaeseung_Heo1;~Dongwoo_Kim1", "gender": ";M;M", "homepage": "https://jeong27.github.io/;https://ml.postech.ac.kr/;http://dongwookim-ml.github.io/", "dblp": "321/3773;348/9020;15/398-2", "google_scholar": "https://scholar.google.com/citations?hl=ko;;https://scholar.google.co.kr/citations?user=RkspD6IAAAAJ", "orcid": ";;0000-0002-6515-5260", "linkedin": "moonjeong-park-97ba85258/;;", "or_profile": "~MoonJeong_Park1;~Jaeseung_Heo1;~Dongwoo_Kim1", "aff": "Pohang University of Science and Technology;Pohang University of Science and Technology;POSTECH", "aff_domain": "postech.ac.kr;postech.edu;postech.ac.kr", "position": "PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\npark2024mitigating,\ntitle={Mitigating Oversmoothing Through Reverse Process of {GNN}s for Heterophilic Graphs},\nauthor={MoonJeong Park and Jaeseung Heo and Dongwoo Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RLA4JTckXe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1039971, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4913716333544861650&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "postech.ac.kr;postech.edu;postech.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Use Your INSTINCT: INSTruction optimization for LLMs usIng Neural bandits Coupled with Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34052", "id": "RLENZ8pNnn", "proceeding": "https://proceedings.mlr.press/v235/lin24r.html", "pdf": "https://openreview.net/pdf?id=RLENZ8pNnn", "openreview": "https://openreview.net/forum?id=RLENZ8pNnn", "author_site": "Xiaoqiang Lin, Zhaoxuan Wu, Zhongxiang Dai, Wenyang Hu, Yao Shu, See-Kiong Ng, Patrick Jaillet, Bryan Kian Hsiang Low", "tldr": "", "abstract": "Large language models (LLMs) have shown remarkable instruction-following capabilities and achieved impressive performance in various applications. However, the performance of LLMs depends heavily on the instructions given to them, which are typically tuned manually with substantial human effort. Recent work has used the query-efficient Bayesian optimization (BO) algorithm to automatically optimize the instructions given to black-box LLMs. However, BO usually falls short when optimizing highly sophisticated (e.g., high-dimensional) objective functions, such as the functions mapping an instruction to the performance of an LLM. 
This is mainly due to the limited expressive power of the Gaussian process (GP) which is used by BO as a surrogate to model the objective function. Meanwhile, it has been repeatedly shown that neural networks (NNs), especially pre-trained transformers, possess strong expressive power and can model highly complex functions. So, we adopt a neural bandit algorithm which replaces the GP in BO by an NN surrogate to optimize instructions for black-box LLMs. More importantly, the neural bandit algorithm allows us to naturally couple the NN surrogate with the hidden representation learned by a pre-trained transformer (i.e., an open-source LLM), which significantly boosts its performance. These motivate us to propose our INSTruction optimization usIng Neural bandits Coupled with Transformers (INSTINCT) algorithm. We perform instruction optimization for ChatGPT and use extensive experiments to show that INSTINCT consistently outperforms baselines in different tasks, e.g., various instruction induction tasks and the task of improving zero-shot chain-of-thought instructions. Our code is available at https://github.com/xqlin98/INSTINCT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoqiang Lin;Zhaoxuan Wu;Zhongxiang Dai;Wenyang Hu;Yao Shu;See-Kiong Ng;Patrick Jaillet;Bryan Kian Hsiang Low", "authorids": "~Xiaoqiang_Lin1;~Zhaoxuan_Wu1;~Zhongxiang_Dai1;~Wenyang_Hu1;~Yao_Shu1;~See-Kiong_Ng1;~Patrick_Jaillet1;~Bryan_Kian_Hsiang_Low1", "gender": "M;M;M;;M;M;M;M", "homepage": "https://xqlin98.github.io/;https://zhaoxuanwu.github.io/;https://daizhongxiang.github.io/;https://scholar.google.com/citations?user=EecZzYsAAAAJ;https://yao.notion.site;https://www.comp.nus.edu.sg/~ngsk/;http://web.mit.edu/jaillet/www/;http://www.comp.nus.edu.sg/~lowkh", "dblp": "269/4573;298/5083;172/4968;258/0545;44/1338;00/5480;https://dblp.uni-trier.de/pers/hd/j/Jaillet:Patrick;97/4877", "google_scholar": "nqKwA60AAAAJ;Th_mPm8AAAAJ;1v8xOIYAAAAJ;EecZzYsAAAAJ;https://scholar.google.com.au/citations?hl=en;https://scholar.google.com.tw/citations?user=_wsommYAAAAJ;ND0FM6EAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";0009-0002-5659-6387;;0009-0008-6189-7890;;0000-0001-6565-7511;0000-0002-8585-6566;", "linkedin": ";zhaoxuanwu/;;;yao-shu-a5640514b;seekiong/?originalSubdomain=sg;patrick-jaillet-1260445/;", "or_profile": "~Xiaoqiang_Lin1;~Zhaoxuan_Wu1;~Zhongxiang_Dai1;~Wenyang_Hu1;~Yao_Shu1;~See-Kiong_Ng1;~Patrick_Jaillet1;~Bryan_Kian_Hsiang_Low1", "aff": "National University of Singapore;National University of Singapore;Massachusetts Institute of Technology;National University of Singapore;Guangming Lab;National University of Singapore;Massachusetts Institute of Technology;National University of Singapore", "aff_domain": "u.nus.edu;u.nus.edu;mit.edu;u.nus.edu;gml.ac.cn;nus.edu.sg;mit.edu;nus.edu.sg", "position": "PhD student;PhD student;Postdoc;PhD student;Researcher;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nlin2024use,\ntitle={Use Your {INSTINCT}: {INST}ruction optimization for {LLM}s usIng Neural bandits Coupled with Transformers},\nauthor={Xiaoqiang Lin and Zhaoxuan Wu and Zhongxiang Dai and Wenyang Hu and Yao Shu and See-Kiong Ng and Patrick Jaillet and Bryan Kian Hsiang Low},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RLENZ8pNnn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1975554, "rating": "", "rating_avg": 0, "replies_avg": 0, 
"authors#_avg": 8, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2291276295759924013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "u.nus.edu;u.nus.edu;mit.edu;u.nus.edu;gml.ac.cn;nus.edu.sg;mit.edu;nus.edu.sg", "author_num": 8, "aff_unique_index": "0;0;1;0;2;0;1;0", "aff_unique_norm": "National University of Singapore;Massachusetts Institute of Technology;Guangming Lab", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://web.mit.edu;", "aff_unique_abbr": "NUS;MIT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;0;1;0", "aff_country_unique": "Singapore;United States;China" }, { "title": "Dealing With Unbounded Gradients in Stochastic Saddle-point Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34051", "id": "RPMTNGMq0O", "proceeding": "https://proceedings.mlr.press/v235/neu24a.html", "pdf": "https://openreview.net/pdf?id=RPMTNGMq0O", "openreview": "https://openreview.net/forum?id=RPMTNGMq0O", "author_site": "Gergely Neu, Nneka Okolo", "tldr": "", "abstract": "We study the performance of stochastic first-order methods for finding saddle points of convex-concave functions. A notorious challenge faced by such methods is that the gradients can grow arbitrarily large during optimization, which may result in instability and divergence. In this paper, we propose a simple and effective regularization technique that stabilizes the iterates and yields meaningful performance guarantees even if the domain and the gradient noise scales linearly with the size of the iterates (and is thus potentially unbounded). Besides providing a set of general results, we also apply our algorithm to a specific problem in reinforcement learning, where it leads to performance guarantees for finding near-optimal policies in an average-reward MDP without prior knowledge of the bias span.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gergely Neu;Nneka Okolo", "authorids": "~Gergely_Neu1;~Nneka_Okolo1", "gender": "M;F", "homepage": "http://cs.bme.hu/~gergo;", "dblp": "83/7606;331/5997", "google_scholar": "https://scholar.google.ch/citations?user=uz27G84AAAAJ;s8DIX2sAAAAJ", "orcid": ";0009-0004-0137-970X", "linkedin": ";nneka-okolo-876410134/", "or_profile": "~Gergely_Neu1;~Nneka_Okolo1", "aff": "Universitat Pompeu Fabra;Universitat Pompeu Fabra", "aff_domain": "upf.edu;upf.edu", "position": "Assistant Professor;PhD student", "bibtex": "@inproceedings{\nneu2024dealing,\ntitle={Dealing With Unbounded Gradients in Stochastic Saddle-point Optimization},\nauthor={Gergely Neu and Nneka Okolo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RPMTNGMq0O}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 415727, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12222260585525025147&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "upf.edu;upf.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Universitat Pompeu Fabra", "aff_unique_dep": "", "aff_unique_url": "https://www.upf.edu/", "aff_unique_abbr": "UPF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Spain" }, { "title": "Classification under Nuisance Parameters and Generalized Label Shift in 
Likelihood-Free Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34050", "id": "RXxTuxPopa", "proceeding": "https://proceedings.mlr.press/v235/masserano24a.html", "pdf": "https://openreview.net/pdf?id=RXxTuxPopa", "openreview": "https://openreview.net/forum?id=RXxTuxPopa", "author_site": "Luca Masserano, Alexander Shen, Michele Doro, Tommaso Dorigo, Rafael Izbicki, Ann Lee", "tldr": "", "abstract": "An open scientific challenge is how to classify events with reliable measures of uncertainty, when we have a mechanistic model of the data-generating process but the distribution over both labels and latent nuisance parameters is different between train and target data. We refer to this type of distributional shift as generalized label shift (GLS). Direct classification using observed data $\\mathbf{X}$ as covariates leads to biased predictions and invalid uncertainty estimates of labels $Y$. We overcome these biases by proposing a new method for robust uncertainty quantification that casts classification as a hypothesis testing problem under nuisance parameters. The key idea is to estimate the classifier's receiver operating characteristic (ROC) across the entire nuisance parameter space, which allows us to devise cutoffs that are invariant under GLS. Our method effectively endows a pre-trained classifier with domain adaptation capabilities and returns valid prediction sets while maintaining high power. We demonstrate its performance on two challenging scientific problems in biology and astroparticle physics with data from realistic mechanistic models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Masserano;Alexander Shen;Michele Doro;Tommaso Dorigo;Rafael Izbicki;Ann B. Lee", "authorids": "~Luca_Masserano1;~Alexander_Shen1;michele.doro@unipd.it;~Tommaso_Dorigo1;~Rafael_Izbicki1;~Ann_B._Lee1", "gender": "M;;;M;;F", "homepage": ";;;https://userswww.pd.infn.it/~dorigo/;https://rafaelizbicki.com;http://www.stat.cmu.edu/~annlee/", "dblp": ";;;;;03/3920", "google_scholar": "https://scholar.google.com/citations?hl=en;;;p5i5SrgAAAAJ;IldCv5AAAAAJ;pDQsaXgAAAAJ", "orcid": ";;;0000-0002-1659-8727;;", "linkedin": ";alexander-j-shen/;;;;", "or_profile": "~Luca_Masserano1;~Alexander_Shen1;michele.doro@unipd.it;~Tommaso_Dorigo1;~Rafael_Izbicki1;~Ann_B._Lee1", "aff": "Carnegie Mellon University;Carnegie Mellon University;;;Universidade Federal de Sao Carlos;", "aff_domain": "andrew.cmu.edu;cmu.edu;;;ufscar.br;", "position": "PhD student;PhD student;;;Assistant Professor;", "bibtex": "@inproceedings{\nmasserano2024classification,\ntitle={Classification under Nuisance Parameters and Generalized Label Shift in Likelihood-Free Inference},\nauthor={Luca Masserano and Alexander Shen and Michele Doro and Tommaso Dorigo and Rafael Izbicki and Ann B. 
Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RXxTuxPopa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5923688, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8667080985153810411&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "andrew.cmu.edu;cmu.edu;;;ufscar.br;", "author_num": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "Carnegie Mellon University;Universidade Federal de Sao Carlos", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;http://www.ufscar.br", "aff_unique_abbr": "CMU;UFSCar", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Brazil" }, { "title": "Learning and Forgetting Unsafe Examples in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34049", "id": "RYmmgedVjR", "proceeding": "https://proceedings.mlr.press/v235/zhao24e.html", "pdf": "https://openreview.net/pdf?id=RYmmgedVjR", "openreview": "https://openreview.net/forum?id=RYmmgedVjR", "author_site": "Jiachen Zhao, Zhun Deng, David Madras, James Zou, Mengye Ren", "tldr": "", "abstract": "As the number of large language models (LLMs) released to the public grows, there is a pressing need to understand the safety implications associated with these models learning from third-party custom finetuning data. We explore the behavior of LLMs finetuned on noisy custom data containing unsafe content, represented by datasets that contain biases, toxicity, and harmfulness, finding that while aligned LLMs can readily learn this unsafe content, they also tend to forget it more significantly than other examples when subsequently finetuned on safer content. Drawing inspiration from the discrepancies in forgetting, we introduce the \u201cForgetFilter\u201d algorithm, which filters unsafe data based on how strong the model's forgetting signal is for that data. We demonstrate that the ForgetFilter algorithm ensures safety in customized finetuning without compromising downstream task performance, unlike sequential safety finetuning. ForgetFilter outperforms alternative strategies like replay and moral self-correction in curbing LLMs\u2019 ability to assimilate unsafe content during custom finetuning, e.g. 
a toxicity score 75% lower than when applying no safety measures and 62% lower than when using self-correction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiachen Zhao;Zhun Deng;David Madras;James Zou;Mengye Ren", "authorids": "~Jiachen_Zhao4;~Zhun_Deng1;~David_Madras1;~James_Zou1;~Mengye_Ren1", "gender": "M;M;;;M", "homepage": "https://www.zhundeng.org/;http://www.cs.toronto.edu/~madras/;;http://www.cs.toronto.edu/~mren;", "dblp": "204/4353;188/6211;;163/1952;", "google_scholar": "nkmi-moAAAAJ;MgnNDpkAAAAJ;23ZXZvEAAAAJ;XcQ9WqMAAAAJ;9dFt9JAAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Zhun_Deng1;~David_Madras1;~James_Zou1;~Mengye_Ren1;~Jiachen_ZHAO1", "aff": "Columbia University;Google;Stanford University;New York University;University of Massachusetts Amherst", "aff_domain": "columbia.edu;google.com;stanford.edu;nyu.edu;umass.edu", "position": "Postdoc;Researcher;Assistant Professor;Assistant Professor;MS student", "bibtex": "@inproceedings{\nzhao2024learning,\ntitle={Learning and Forgetting Unsafe Examples in Large Language Models},\nauthor={Jiachen Zhao and Zhun Deng and David Madras and James Zou and Mengye Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RYmmgedVjR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 678194, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8289082060151570200&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "columbia.edu;google.com;stanford.edu;nyu.edu;umass.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Columbia University;Google;Stanford University;New York University;University of Massachusetts Amherst", "aff_unique_dep": ";Google;;;", "aff_unique_url": "https://www.columbia.edu;https://www.google.com;https://www.stanford.edu;https://www.nyu.edu;https://www.umass.edu", "aff_unique_abbr": "Columbia;Google;Stanford;NYU;UMass Amherst", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Mountain View;Stanford;Amherst", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Let Go of Your Labels with Unsupervised Transfer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34048", "id": "RZHRnnGcEx", "proceeding": "https://proceedings.mlr.press/v235/gadetsky24a.html", "pdf": "https://openreview.net/pdf?id=RZHRnnGcEx", "openreview": "https://openreview.net/forum?id=RZHRnnGcEx", "author_site": "Artyom Gadetsky, Yulun Jiang, Maria Brbic", "tldr": "", "abstract": "Foundation vision-language models have enabled remarkable zero-shot transferability of the pre-trained representations to a wide range of downstream tasks. However, to solve a new task, zero-shot transfer still necessitates human guidance to define visual categories that appear in the data. Here, we show that fully unsupervised transfer emerges when searching for the labeling of a dataset that induces maximal margin classifiers in representation spaces of different foundation models. We present TURTLE, a fully unsupervised method that effectively employs this guiding principle to uncover the underlying labeling of a downstream dataset without any supervision and task-specific representation learning. We evaluate TURTLE on a diverse benchmark suite of 26 datasets and show that it achieves new state-of-the-art unsupervised performance. 
Furthermore, TURTLE, although being fully unsupervised, outperforms zero-shot transfer baselines on a wide range of datasets. In particular, TURTLE matches the average performance of CLIP zero-shot on 26 datasets by employing the same representation space, spanning a wide range of architectures and model sizes. By guiding the search for the underlying labeling using the representation spaces of two foundation models, TURTLE surpasses zero-shot transfer and unsupervised prompt tuning baselines, demonstrating the surprising power and effectiveness of unsupervised transfer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Artyom Gadetsky;Yulun Jiang;Maria Brbic", "authorids": "~Artyom_Gadetsky1;~Yulun_Jiang1;~Maria_Brbic1", "gender": "M;M;F", "homepage": "https://agadetsky.github.io;https://yljblues.github.io/;https://brbiclab.epfl.ch/", "dblp": "222/2900;236/9267.html;130/3233", "google_scholar": "J48uBYgAAAAJ;C-dyKuwAAAAJ;ltxmeroAAAAJ", "orcid": ";;0000-0002-1120-1778", "linkedin": ";;", "or_profile": "~Artyom_Gadetsky1;~Yulun_Jiang1;~Maria_Brbic1", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;epfl.ch;epfl.ch", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ngadetsky2024let,\ntitle={Let Go of Your Labels with Unsupervised Transfer},\nauthor={Artyom Gadetsky and Yulun Jiang and Maria Brbic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RZHRnnGcEx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5456641, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12839820903509935815&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "epfl.ch;epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Improving Transformers with Dynamically Composable Multi-Head Attention", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34047", "id": "RbiBKPtuHp", "proceeding": "https://proceedings.mlr.press/v235/xiao24d.html", "pdf": "https://openreview.net/pdf?id=RbiBKPtuHp", "openreview": "https://openreview.net/forum?id=RbiBKPtuHp", "author_site": "Da Xiao, Qingye Meng, Shengping Li, xingyuan yuan", "tldr": "", "abstract": "Multi-Head Attention (MHA) is a key component of Transformer. In MHA, attention heads work independently, causing problems such as low-rank bottleneck of attention score matrices and head redundancy. We propose Dynamically Composable Multi-Head Attention (DCMHA), a parameter and computation efficient attention architecture that tackles the shortcomings of MHA and increases the expressive power of the model by dynamically composing attention heads. At the core of DCMHA is a Compose function that transforms the attention score and weight matrices in an input-dependent way. DCMHA can be used as a drop-in replacement of MHA in any transformer architecture to obtain the corresponding DCFormer. DCFormer significantly outperforms Transformer on different architectures and model scales in language modeling, matching the performance of models with 1.7x-2.0x compute. 
For example, DCPythia-6.9B outperforms open source Pythia-12B on both pretraining perplexity and downstream task evaluation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Da Xiao;Qingye Meng;Shengping Li;xingyuan yuan", "authorids": "~Da_Xiao1;~Qingye_Meng1;~Shengping_Li2;~xingyuan_yuan1", "gender": "M;M;M;M", "homepage": ";;https://github.com/Lisennlp;http://wiki.swarma.net/index.php/User:Chaosconst", "dblp": ";;;", "google_scholar": "xd--YPAAAAAJ;uFisAHgAAAAJ;;", "orcid": ";;;", "linkedin": ";%E5%BA%86%E4%B8%9A-%E5%AD%9F-0483b9b4/;;xingyuan-yuan-67035935/", "or_profile": "~Da_Xiao1;~Qingye_Meng1;~Shengping_Li2;~xingyuan_yuan1", "aff": "Beijing University of Posts and Telecommunications;ColorfulClouds Tech.;ColorfulClouds Tech.;ColorfulClouds Tech.", "aff_domain": "bupt.edu.cn;caiyunapp.com;caiyunapp.com;caiyunapp.com", "position": "Assistant Professor;MS student;Researcher;CEO", "bibtex": "@inproceedings{\nxiao2024improving,\ntitle={Improving Transformers with Dynamically Composable Multi-Head Attention},\nauthor={Da Xiao and Qingye Meng and Shengping Li and xingyuan yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RbiBKPtuHp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 637471, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9697142635808887706&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "bupt.edu.cn;caiyunapp.com;caiyunapp.com;caiyunapp.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Beijing University of Posts and Telecommunications;ColorfulClouds Tech", "aff_unique_dep": ";", "aff_unique_url": "http://www.bupt.edu.cn/;", "aff_unique_abbr": "BUPT;", "aff_campus_unique_index": "0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0", "aff_country_unique": "China;" }, { "title": "Ameliorate Spurious Correlations in Dataset Condensation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34046", "id": "RbnojVv4HK", "proceeding": "https://proceedings.mlr.press/v235/cui24e.html", "pdf": "https://openreview.net/pdf?id=RbnojVv4HK", "openreview": "https://openreview.net/forum?id=RbnojVv4HK", "author_site": "Jiaxing Cui, Ruochen Wang, Yuanhao Xiong, Cho-Jui Hsieh", "tldr": "", "abstract": "Dataset Condensation has emerged as a technique for compressing large datasets into smaller synthetic counterparts, facilitating downstream training tasks. In this paper, we study the impact of bias inside the original dataset on the performance of dataset condensation. With a comprehensive empirical evaluation on canonical datasets with color, corruption and background biases, we found that color and background biases in the original dataset will be amplified through the condensation process, resulting in a notable decline in the performance of models trained on the condensed dataset, while corruption bias is suppressed through the condensation process. To reduce bias amplification in dataset condensation, we introduce a simple yet highly effective approach based on a sample reweighting scheme utilizing kernel density estimation. Empirical results on multiple real-world and synthetic datasets demonstrate the effectiveness of the proposed method. 
Notably, on CMNIST with 5% bias-conflict ratio and IPC 50, our method achieves 91.5% test accuracy compared to 23.8% from vanilla DM, boosting performance by 67.7%, whereas applying a state-of-the-art debiasing method on the same dataset only achieves 53.7% accuracy. Our findings highlight the importance of addressing biases in dataset condensation and provide a promising avenue to address bias amplification in the process.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Justin Cui;Ruochen Wang;Yuanhao Xiong;Cho-Jui Hsieh", "authorids": "~Justin_Cui1;~Ruochen_Wang2;~Yuanhao_Xiong1;~Cho-Jui_Hsieh1", "gender": "M;M;M;M", "homepage": ";https://ruocwang.github.io/;https://xyh97.github.io/;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "324/7960;33/120;232/1248;14/2770", "google_scholar": "zel3jUcAAAAJ;8fXrlRAAAAAJ;DVKxiMkAAAAJ;Wy89g4IAAAAJ", "orcid": ";;;", "linkedin": ";ruochen-wang-1699b1113/;;", "or_profile": "~Justin_Cui1;~Ruochen_Wang2;~Yuanhao_Xiong1;~Cho-Jui_Hsieh1", "aff": ", University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "cs.ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\ncui2024ameliorate,\ntitle={Ameliorate Spurious Correlations in Dataset Condensation},\nauthor={Justin Cui and Ruochen Wang and Yuanhao Xiong and Cho-Jui Hsieh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RbnojVv4HK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4261580, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1518638189413435615&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "cs.ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "How Far Can Fairness Constraints Help Recover From Biased Data?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34045", "id": "RfQT6vJt8b", "proceeding": "https://proceedings.mlr.press/v235/sharma24a.html", "pdf": "https://openreview.net/pdf?id=RfQT6vJt8b", "openreview": "https://openreview.net/forum?id=RfQT6vJt8b", "author_site": "Mohit Sharma, Amit Jayant Deshpande", "tldr": "", "abstract": "A general belief in fair classification is that fairness constraints incur a trade-off with accuracy, which biased data may worsen. Contrary to this belief, Blum & Stangl (2019) show that fair classification with equal opportunity constraints even on extremely biased data can recover optimally accurate and fair classifiers on the original data distribution. Their result is interesting because it demonstrates that fairness constraints can implicitly rectify data bias and simultaneously overcome a perceived fairness-accuracy trade-off. Their data bias model simulates under-representation and label bias in the underprivileged population, and they show the above result on a stylized data distribution with i.i.d.
label noise, under simple conditions on the data distribution and bias parameters. We propose a general approach to extend the result of Blum & Stangl (2019) to different fairness constraints, data bias models, data distributions, and hypothesis classes. We strengthen their result, and extend it to the case when their stylized distribution has labels with Massart noise instead of i.i.d. noise. We prove a similar recovery result for arbitrary data distributions using fair reject option classifiers. We further generalize it to arbitrary data distributions and arbitrary hypothesis classes, i.e., we prove that for any data distribution, if the optimally accurate classifier in a given hypothesis class is fair and robust, then it can be recovered through fair classification with equal opportunity constraints on the biased distribution whenever the bias parameters satisfy certain simple conditions. Finally, we show applications of our technique to time-varying data bias in classification and fair machine learning pipelines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "mohit sharma;Amit Deshpande", "authorids": "~mohit_sharma5;~Amit_Deshpande1", "gender": "M;M", "homepage": ";", "dblp": ";28/6953-1", "google_scholar": "m74gCtYAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~mohit_sharma5;~Amit_Deshpande1", "aff": "Microsoft;Microsoft Research", "aff_domain": "microsoft.com;microsoft.com", "position": "Intern;Researcher", "bibtex": "@inproceedings{\nsharma2024how,\ntitle={How Far Can Fairness Constraints Help Recover From Biased Data?},\nauthor={mohit sharma and Amit Deshpande},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RfQT6vJt8b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 424581, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7971049443443388848&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "microsoft.com;microsoft.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the Second-Order Convergence of Biased Policy Gradient Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34044", "id": "RfsagmV1AG", "proceeding": "https://proceedings.mlr.press/v235/mu24b.html", "pdf": "https://openreview.net/pdf?id=RfsagmV1AG", "openreview": "https://openreview.net/forum?id=RfsagmV1AG", "author_site": "Siqiao Mu, Diego Klabjan", "tldr": "", "abstract": "Since the objective functions of reinforcement learning problems are typically highly nonconvex, it is desirable that policy gradient, the most popular algorithm, escapes saddle points and arrives at second-order stationary points. Existing results only consider vanilla policy gradient algorithms with unbiased gradient estimators, but practical implementations under the infinite-horizon discounted reward setting are biased due to finite-horizon sampling. Moreover, actor-critic methods, whose second-order convergence has not yet been established, are also biased due to the critic approximation of the value function. 
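For reference, the TD(0) update relied on by the critic in the inner loop is the classical one; a tabular sketch on a toy Markov chain (the chain, rewards, and step size are illustrative assumptions, not from the paper):

```python
import numpy as np

def td0(P, r, gamma=0.9, alpha=0.1, steps=10000, seed=0):
    """Tabular TD(0): V[s] <- V[s] + alpha * (r[s] + gamma * V[s'] - V[s]),
    run along a single trajectory of the Markov chain P."""
    rng = np.random.default_rng(seed)
    n = len(r)
    V = np.zeros(n)
    s = 0
    for _ in range(steps):
        s_next = rng.choice(n, p=P[s])
        V[s] += alpha * (r[s] + gamma * V[s_next] - V[s])
        s = s_next
    return V

P = np.array([[0.5, 0.5, 0.0],
              [0.1, 0.6, 0.3],
              [0.3, 0.0, 0.7]])
r = np.array([0.0, 1.0, -1.0])
print(td0(P, r))  # approximates (I - gamma * P)^{-1} r
```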
We provide a novel second-order analysis of biased policy gradient methods, including the vanilla gradient estimator computed from Monte-Carlo sampling of trajectories as well as the double-loop actor-critic algorithm, where in the inner loop the critic improves the approximation of the value function via TD(0) learning. Separately, we also establish the convergence of TD(0) on Markov chains irrespective of the initial state distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siqiao Mu;Diego Klabjan", "authorids": "~Siqiao_Mu1;~Diego_Klabjan1", "gender": "F;M", "homepage": "https://scholar.google.com/citations?user=HSiE3cIAAAAJ&hl=en;http://dynresmanagement.com/index.html", "dblp": ";17/105", "google_scholar": ";TaQZ_VUAAAAJ", "orcid": ";0000-0003-4213-9281", "linkedin": ";diegoklabjan", "or_profile": "~Siqiao_Mu1;~Diego_Klabjan1", "aff": "Northwestern University;Northwestern University", "aff_domain": "northwestern.edu;u.northwestern.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nmu2024on,\ntitle={On the Second-Order Convergence of Biased Policy Gradient Algorithms},\nauthor={Siqiao Mu and Diego Klabjan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RfsagmV1AG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 441432, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18370484199169874523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "northwestern.edu;u.northwestern.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Stay on Topic with Classifier-Free Guidance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34043", "id": "RiM3cl9MdK", "proceeding": "https://proceedings.mlr.press/v235/sanchez24a.html", "pdf": "https://openreview.net/pdf?id=RiM3cl9MdK", "openreview": "https://openreview.net/forum?id=RiM3cl9MdK", "author_site": "Guillaume Sanchez, Alexander Spangher, Honglu Fan, Elad Levi, Stella Biderman", "tldr": "", "abstract": "Classifier-Free Guidance (CFG) has recently emerged as a lightweight technique to encourage prompt-adherence in generations, but has not yet been successfully applied to language modeling. In this work, we demonstrate across a wide array of benchmarks that CFG can be used broadly as an inference-time technique in pure language modeling.
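At inference time, CFG for language models amounts to extrapolating between conditional and unconditional next-token logits; a minimal sketch using the standard CFG combination rule (the guidance strength and toy logits are placeholders, not values from the paper):

```python
import numpy as np

def cfg_logits(logits_cond, logits_uncond, gamma=1.5):
    """Classifier-free guidance on next-token logits:
    gamma = 1 recovers ordinary conditional sampling,
    gamma > 1 pushes the distribution toward the prompt."""
    return logits_uncond + gamma * (logits_cond - logits_uncond)

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

cond = np.array([2.0, 1.0, 0.5])    # logits given the full prompt
uncond = np.array([1.0, 1.0, 1.0])  # logits with the prompt dropped
print(softmax(cfg_logits(cond, uncond)))  # sharper than softmax(cond)
```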
We show that CFG (1) improves the performance of Pythia, GPT-2 and LLaMA-family models across: Q&A, reasoning, code generation, and machine translation, achieving SOTA on LAMBADA with LLaMA-7B over PaLM-540B; (2) brings improvements equivalent to a model with twice the parameter-count; (3) can stack alongside other inference-time methods like Chain-of-Thought and Self-Consistency, yielding further improvements in difficult tasks; (4) can be used to increase the faithfulness and coherence of assistants in challenging form-driven and content-driven prompts: in human evaluations we show a 75% preference for using CFG over baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guillaume Sanchez;Alexander Spangher;Honglu Fan;Elad Levi;Stella Biderman", "authorids": "~Guillaume_Sanchez1;~Alexander_Spangher2;~Honglu_Fan1;~Elad_Levi1;~Stella_Biderman1", "gender": "M;M;Not Specified;M;F", "homepage": ";http://alexander-spangher.com/;https://honglu.fan;;http://www.stellabiderman.com", "dblp": ";227/2512;;232/2420;239/5641", "google_scholar": "ke4fCUgAAAAJ;https://scholar.google.com/citations?hl=en;XqlOVeAAAAAJ;https://scholar.google.com/citations?hl=en;bO7H0DAAAAAJ", "orcid": ";;;;0000-0001-8228-1042", "linkedin": ";;;;stellabiderman", "or_profile": "~Guillaume_Sanchez1;~Alexander_Spangher2;~Honglu_Fan1;~Elad_Levi1;~Stella_Biderman1", "aff": ";University of Southern California;University of Geneva;;Booz Allen Hamilton", "aff_domain": ";usc.edu;unige.ch;;boozallen.com", "position": ";PhD student;Postdoc;;Industry researcher", "bibtex": "@inproceedings{\nsanchez2024stay,\ntitle={Stay on Topic with Classifier-Free Guidance},\nauthor={Guillaume Sanchez and Alexander Spangher and Honglu Fan and Elad Levi and Stella Biderman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RiM3cl9MdK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2037724, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11081534836631603439&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";usc.edu;unige.ch;;boozallen.com", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Southern California;University of Geneva;Booz Allen Hamilton", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usc.edu;https://www.unige.ch;https://www.boozallen.com", "aff_unique_abbr": "USC;UNIGE;BAH", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Beyond Implicit Bias: The Insignificance of SGD Noise in Online Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34042", "id": "RiQbe8RwCe", "proceeding": "https://proceedings.mlr.press/v235/vyas24a.html", "pdf": "https://openreview.net/pdf?id=RiQbe8RwCe", "openreview": "https://openreview.net/forum?id=RiQbe8RwCe", "author_site": "Nikhil Vyas, Depen Morwani, Rosie Zhao, Gal Kaplun, Sham Kakade, Boaz Barak", "tldr": "", "abstract": "The success of SGD in deep learning has been ascribed by prior works to the *implicit bias* induced by finite batch sizes (''SGD noise''). While prior works focused on *offline learning* (i.e., multiple-epoch training), we study the impact of SGD noise on *online* (i.e., single epoch) learning. 
Through an extensive empirical analysis of image and language data, we demonstrate that small batch sizes do *not* confer any implicit bias advantages in online learning. In contrast to offline learning, the benefits of SGD noise in online learning are strictly computational, facilitating more cost-effective gradient steps. This suggests that SGD in the online regime can be construed as taking noisy steps along the ''golden path'' of the noiseless *gradient descent* algorithm. We study this hypothesis and provide supporting evidence in loss and function space. Our findings challenge the prevailing understanding of SGD and offer novel insights into its role in online learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikhil Vyas;Depen Morwani;Rosie Zhao;Gal Kaplun;Sham M. Kakade;Boaz Barak", "authorids": "~Nikhil_Vyas1;~Depen_Morwani1;~Rosie_Zhao1;~Gal_Kaplun1;~Sham_M._Kakade1;~Boaz_Barak2", "gender": "M;M;F;M;M;M", "homepage": "https://nikhilvyas.github.io/;;https://rosieyzh.github.io/;http://www.galkaplun.com;https://shamulent.github.io;https://boazbarak.org", "dblp": "176/1074;277/5200;277/9223;237/9816;s/SMKakade;b/BBarak", "google_scholar": ";vOngxFUAAAAJ;rgwbR6wAAAAJ;y4BzFYsAAAAJ;https://scholar.google.com.tw/citations?user=wb-DKCIAAAAJ;I0fbJ6cAAAAJ", "orcid": ";;;;;0000-0002-4053-8927", "linkedin": ";depen-morwani-070298122/;https://linkedin.com/in/rosieyzh;gal-kaplun-865496151/;;", "or_profile": "~Nikhil_Vyas1;~Depen_Morwani1;~Rosie_Zhao1;~Gal_Kaplun1;~Sham_M._Kakade1;~Boaz_Barak2", "aff": "Harvard University;Harvard University, Harvard University;Harvard University, Harvard University;;Harvard University;Harvard University", "aff_domain": "harvard.edu;g.harvard.edu;g.harvard.edu;;harvard.edu;fas.harvard.edu", "position": "Postdoc;PhD student;PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nvyas2024beyond,\ntitle={Beyond Implicit Bias: The Insignificance of {SGD} Noise in Online Learning},\nauthor={Nikhil Vyas and Depen Morwani and Rosie Zhao and Gal Kaplun and Sham M. Kakade and Boaz Barak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RiQbe8RwCe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2121184, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7105605598769388633&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "harvard.edu;g.harvard.edu;g.harvard.edu;;harvard.edu;fas.harvard.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Open Ad Hoc Teamwork with Cooperative Game Theory", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34041", "id": "RlibRvH4B4", "proceeding": "https://proceedings.mlr.press/v235/wang24an.html", "pdf": "https://openreview.net/pdf?id=RlibRvH4B4", "openreview": "https://openreview.net/forum?id=RlibRvH4B4", "author_site": "Jianhong Wang, Yang Li, Yuan Zhang, Wei Pan, Samuel Kaski", "tldr": "", "abstract": "Ad hoc teamwork poses a challenging problem, requiring the design of an agent to collaborate with teammates without prior coordination or joint training. 
Open ad hoc teamwork (OAHT) further complicates this challenge by considering environments with a changing number of teammates, referred to as open teams. One promising solution in practice to this problem is leveraging the generalizability of graph neural networks to handle an unrestricted number of agents with various agent-types, named graph-based policy learning (GPL). However, its joint Q-value representation over a coordination graph lacks convincing explanations. In this paper, we establish a new theory to understand the representation of the joint Q-value for OAHT and its learning paradigm, through the lens of cooperative game theory. Building on our theory, we propose a novel algorithm named CIAO, based on GPL's framework, with additional provable implementation tricks that can facilitate learning. The demos of experimental results are available on https://sites.google.com/view/ciao2024, and the code of experiments is published on https://github.com/hsvgbkhgbv/CIAO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianhong Wang;Yang Li;Yuan Zhang;Wei Pan;Samuel Kaski", "authorids": "~Jianhong_Wang1;~Yang_Li40;~Yuan_Zhang8;~Wei_Pan2;~Samuel_Kaski1", "gender": "M;M;;M;M", "homepage": "https://hsvgbkhgbv.github.io/;https://liyang.page;;http://panweihit.github.io;https://people.aalto.fi/samuel.kaski", "dblp": ";;;;64/5826", "google_scholar": "K1FKF3IAAAAJ;msAmwaoAAAAJ;gMzGCV0AAAAJ;GqryWPsAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0003-1121-9879;0000-0003-1925-9154", "linkedin": "jianhong-wang-45995b100/;;;wei-pan-6b558b17/;samuel-kaski-27790/", "or_profile": "~Jianhong_Wang1;~Yang_Li40;~Yuan_Zhang8;~Wei_Pan2;~Samuel_Kaski1", "aff": "University of Manchester;University of Manchester;University of Freiburg;University of Manchester;Aalto University", "aff_domain": "manchester.ac.uk;cs.manchester.ac.uk;uni-freiburg.de;manchester.ac.uk;aalto.fi", "position": "Postdoc;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024open,\ntitle={Open Ad Hoc Teamwork with Cooperative Game Theory},\nauthor={Jianhong Wang and Yang Li and Yuan Zhang and Wei Pan and Samuel Kaski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RlibRvH4B4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3898679, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1872307303992794652&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "manchester.ac.uk;cs.manchester.ac.uk;uni-freiburg.de;manchester.ac.uk;aalto.fi", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of Manchester;University of Freiburg;Aalto University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.manchester.ac.uk;https://www.uni-freiburg.de;https://www.aalto.fi", "aff_unique_abbr": "UoM;UoF;Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2", "aff_country_unique": "United Kingdom;Germany;Finland" }, { "title": "Projection-Free Online Convex Optimization with Time-Varying Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34040", "id": "RnbobOgbn0", "proceeding": "https://proceedings.mlr.press/v235/garber24a.html", "pdf": "https://openreview.net/pdf?id=RnbobOgbn0", "openreview": "https://openreview.net/forum?id=RnbobOgbn0", 
"author_site": "Dan Garber, Ben Kretzu", "tldr": "", "abstract": "We consider the setting of online convex optimization with adversarial time-varying constraints in which actions must be feasible w.r.t. a fixed constraint set, and are also required on average to approximately satisfy additional time-varying constraints. Motivated by scenarios in which the fixed feasible set (hard constraint) is difficult to project on, we consider projection-free algorithms that access this set only through a linear optimization oracle (LOO). We present an algorithm that, on a sequence of length $T$ and using overall $T$ calls to the LOO, guarantees $\\tilde{O}(T^{3/4})$ regret w.r.t. the losses and $O(T^{7/8})$ constraints violation (ignoring all quantities except for $T$). In particular, these bounds hold w.r.t. any interval of the sequence. This algorithm however also requires access to an oracle for minimizing a strongly convex nonsmooth function over a Euclidean ball. We present a more efficient algorithm that does not require the latter optimization oracle but only first-order access to the time-varying constraints, and achieves similar bounds w.r.t. the entire sequence. We extend the latter to the setting of bandit feedback and obtain similar bounds (as a function of $T$) in expectation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Garber;Ben Kretzu", "authorids": "~Dan_Garber1;benkretzu@campus.technion.ac.il", "gender": ";", "homepage": "https://dangar.net.technion.ac.il/;", "dblp": ";", "google_scholar": "https://scholar.google.co.il/citations?user=kUe1sZEAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Dan_Garber1;benkretzu@campus.technion.ac.il", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ngarber2024projectionfree,\ntitle={Projection-Free Online Convex Optimization with Time-Varying Constraints},\nauthor={Dan Garber and Ben Kretzu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RnbobOgbn0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 401972, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15031369904105280047&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": ";", "author_num": 2 }, { "title": "AutoOS: Make Your OS More Powerful by Exploiting Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34039", "id": "Rp8R9C0Sth", "proceeding": "https://proceedings.mlr.press/v235/chen24at.html", "pdf": "https://openreview.net/pdf?id=Rp8R9C0Sth", "openreview": "https://openreview.net/forum?id=Rp8R9C0Sth", "author_site": "Huilai Chen, Yuanbo Wen, Limin Cheng, Shouxu Kuang, Yumeng Liu, Weijia Li, Ling Li, Rui Zhang, Xinkai Song, Wei Li, Qi Guo, Yunji Chen", "tldr": "", "abstract": "With the rapid development of Artificial Intelligence of Things (AIoT), customizing and optimizing operating system (OS) kernel configurations for various AIoT application scenarios is crucial for maximizing system performance. However, existing approaches falter due to the overwhelming problem complexity (i.e., over 15,000 configuration options in the Linux kernel), together with the huge evaluation costs and error-prone options that may result in OS boot-up failure, which all make it an unresolved problem to optimize the Linux kernel automatically. 
In this paper, we introduce AutoOS, a novel framework exploiting Large Language Models for customizing and optimizing OS kernel configurations automatically for various AIoT application scenarios. Inspired by the inherently directory-structured kernel configuration process, we first formulate our research problem as optimizing on a dynamic tree. We then propose a novel framework integrating a state machine-based traversal algorithm as the observe-prune-propose-act-correct loop, which can effectively refine the optimization space and ensure a successful OS boot-up. Experimental results show that AutoOS can automatically customize and optimize the OS kernel configurations without human effort. More importantly, AutoOS even achieves up to 25% better performance than the vendor-provided configuration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huilai Chen;Yuanbo Wen;Limin Cheng;Shouxu Kuang;Yumeng Liu;Weijia Li;Ling Li;Rui Zhang;Xinkai Song;Wei Li;Qi Guo;Yunji Chen", "authorids": "~Huilai_Chen1;~Yuanbo_Wen1;~Limin_Cheng1;~Shouxu_Kuang1;~Yumeng_Liu3;~Weijia_Li4;~Ling_Li6;~Rui_Zhang1;~Xinkai_Song1;~Wei_Li96;~Qi_Guo4;~Yunji_Chen1", "gender": "M;M;F;M;M;F;F;F;;;M;M", "homepage": ";;http://www.iscas.ac.cn/;https://github.com/k1366191024;;http://www.iscas.ac.cn/;;;;;http://novel.ict.ac.cn/qguo;", "dblp": ";262/3144;;;;;92/5001-1;60/2536-40;;;67/398-1;48/474", "google_scholar": ";;https://scholar.google.com.hk/citations?view_op=new_articles;;;;;dse6jAsAAAAJ;;;;", "orcid": "0009-0003-3379-3449;0000-0002-7775-2724;;;0000-0003-0927-9039;;0000-0001-8877-9052;;;;;", "linkedin": ";;;;;;;;;;;", "or_profile": "~Huilai_Chen1;~Yuanbo_Wen1;~Limin_Cheng1;~Shouxu_Kuang1;~Yumeng_Liu3;~Weijia_Li4;~Ling_Li6;~Rui_Zhang1;~Xinkai_Song1;~Wei_Li96;~Qi_Guo4;~Yunji_Chen1", "aff": "Cambricon lab\uff0cICT, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of South Carolina - Aiken;Institute of Software;Institute of Software, CAS;Institute of Computing Technology, CAS;;;Institute of Computing Technology, Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ict.ac.cn;ucas.edu.cn;ucas.edu.cn;usca.edu;iscas.ac.cn;iscas.ac.cn;ict.ac.cn;;;ict.ac.cn;ict.ac.cn", "position": "PhD student;Postdoc;PhD student;MS student;Full Professor;MS student;Full Professor;Assistant Professor;;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nchen2024autoos,\ntitle={Auto{OS}: Make Your {OS} More Powerful by Exploiting Large Language Models},\nauthor={Huilai Chen and Yuanbo Wen and Limin Cheng and Shouxu Kuang and Yumeng Liu and Weijia Li and Ling Li and Rui Zhang and Xinkai Song and Wei Li and Qi Guo and Yunji Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Rp8R9C0Sth}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 618430, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13070370855544247880&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ict.ac.cn;ict.ac.cn;ucas.edu.cn;ucas.edu.cn;usca.edu;iscas.ac.cn;iscas.ac.cn;ict.ac.cn;;;ict.ac.cn;ict.ac.cn", "author_num": 12, "aff_unique_index": "0;0;1;1;2;3;0;0;0;0", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of
Sciences;University of South Carolina;Institute of Software", "aff_unique_dep": "Institute of Computing Technology;;;", "aff_unique_url": "http://www.cas.cn/;http://www.ucas.ac.cn;https://www.sc.edu;", "aff_unique_abbr": "CAS;UCAS;USC;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Aiken", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0", "aff_country_unique": "China;United States;" }, { "title": "Online Resource Allocation with Non-Stationary Customers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34038", "id": "RsIMGYzBcv", "proceeding": "https://proceedings.mlr.press/v235/zhang24ba.html", "pdf": "https://openreview.net/pdf?id=RsIMGYzBcv", "openreview": "https://openreview.net/forum?id=RsIMGYzBcv", "author_site": "Xiaoyue Zhang, Hanzhang Qin, Mabel Chou", "tldr": "", "abstract": "We propose a novel algorithm for online resource allocation with non-stationary customer arrivals and unknown click-through rates. We assume multiple types of customers arriving in a nonstationary stochastic fashion, with unknown arrival rates in each period. Additionally, customers' click-through rates are assumed to be unknown and only learnable online. By leveraging results from the stochastic contextual bandit with knapsack and online matching with adversarial arrivals, we develop an online scheme to allocate the resources to nonstationary customers. We prove that under mild conditions, our scheme achieves a ``best-of-both-world'' result: the scheme has a sublinear regret when the customer arrivals are near-stationary, and enjoys an optimal competitive ratio under general (non-stationary) customer arrival distributions. Finally, we conduct extensive numerical experiments to show our approach generates near-optimal revenues for all different customer scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyue Zhang;Hanzhang Qin;Mabel Chou", "authorids": "~Xiaoyue_Zhang3;~Hanzhang_Qin1;mabelchou@nus.edu.sg", "gender": "F;M;", "homepage": "https://iora.nus.edu.sg/people-p/zhang-xiaoyue/;;", "dblp": ";174/9322;", "google_scholar": ";o1VZkPUAAAAJ;", "orcid": ";0000-0002-2787-0685;", "linkedin": "%E6%BD%87%E6%9C%88-%E5%BC%A0-470509180/?otpToken=MTUwNDE5ZTgxYjI4Y2NjN2I2MjQwNGVkNDYxYmUzYmM4NmNkZDM0MjkxYWI4ODYxNzljNzA4NmE0YjVkNTVmNWYzZDZkZjliNDlmMWU4Zjg0MDkyYzNiYTlhZTMyN2Y5MmExYTM3MDlhNTg5MDgzZWQxZTZmMCwxLDE%3D&midSig=2U1Lbaii9SKH41&eid=bvf0ic-lrhobc2p-5q&midToken=AQGD-0ltqnjJYw&trkEmail=eml-email_pymk_02-header-0-profile_glimmer-null-bvf0ic%7Elrhobc2p%7E5q-null-null&trk=eml-email_pymk_02-header-0-profile_glimmer&originalSubdomain=sg;hanzhang-qin-7b67b394/;", "or_profile": "~Xiaoyue_Zhang3;~Hanzhang_Qin1;mabelchou@nus.edu.sg", "aff": "National University of Singapore;National University of Singapore;", "aff_domain": "u.nus.edu;nus.edu.sg;", "position": "PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nzhang2024online,\ntitle={Online Resource Allocation with Non-Stationary Customers},\nauthor={Xiaoyue Zhang and Hanzhang Qin and Mabel Chou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RsIMGYzBcv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 652288, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16626942333334706117&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "u.nus.edu;nus.edu.sg;", "author_num": 3, 
"aff_unique_index": "0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "PAPM: A Physics-aware Proxy Model for Process Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34037", "id": "RtCmp5F9lN", "proceeding": "https://proceedings.mlr.press/v235/liu24v.html", "pdf": "https://openreview.net/pdf?id=RtCmp5F9lN", "openreview": "https://openreview.net/forum?id=RtCmp5F9lN", "author_site": "Pengwei Liu, Zhongkai Hao, Xingyu Ren, Hangjie Yuan, Jiayang Ren, Dong Ni", "tldr": "", "abstract": "In the context of proxy modeling for process systems, traditional data-driven deep learning approaches frequently encounter significant challenges, such as substantial training costs induced by large amounts of data, and limited generalization capabilities. As a promising alternative, physics-aware models incorporate partial physics knowledge to ameliorate these challenges. Although demonstrating efficacy, they fall short in terms of exploration depth and universality. To address these shortcomings, we introduce a **p**hysics-**a**ware **p**roxy **m**odel (**PAPM**) that fully incorporates partial prior physics of process systems, which includes multiple input conditions and the general form of conservation relations, resulting in better out-of-sample generalization. Additionally, PAPM contains a holistic temporal-spatial stepping module for flexible adaptation across various process systems. Through systematic comparisons with state-of-the-art pure data-driven and physics-aware models across five two-dimensional benchmarks in nine generalization tasks, PAPM notably achieves an average performance improvement of 6.7%, while requiring fewer FLOPs, and just 1% of the parameters compared to the prior leading method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengwei Liu;Zhongkai Hao;Xingyu Ren;Hangjie Yuan;Jiayang Ren;Dong Ni", "authorids": "~Pengwei_Liu1;~Zhongkai_Hao1;~Xingyu_Ren2;~Hangjie_Yuan1;~Jiayang_Ren1;~Dong_Ni3", "gender": ";M;M;M;M;M", "homepage": "https://github.com/pengwei07;https://github.com/small-dumpling;https://jacobyuan7.github.io/;https://jiayang.site;;https://haozhongkai.github.io/", "dblp": ";;293/9956;310/1496;;270/0220.html", "google_scholar": "https://scholar.google.com.hk/citations?user=mlcLvUEAAAAJ;;jQ3bFDMAAAAJ;V3QBv3cAAAAJ;;dfSzq27ZiVoC", "orcid": ";;;;0000-0002-2227-2555;", "linkedin": ";;;;;", "or_profile": "~Pengwei_Liu1;~Xingyu_Ren2;~Hangjie_Yuan1;~Jiayang_Ren1;~Dong_Ni3;~Hao_Zhongkai1", "aff": "Zhejiang University;Zhejiang University;Zhejiang University;University of British Columbia;Zhejiang University;Tsinghua University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn;ubc.ca;zju.edu.cn;mails.tsinghua.edu.cn", "position": "PhD student;Undergrad student;PhD student;PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\nliu2024papm,\ntitle={{PAPM}: A Physics-aware Proxy Model for Process Systems},\nauthor={Pengwei Liu and Zhongkai Hao and Xingyu Ren and Hangjie Yuan and Jiayang Ren and Dong Ni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RtCmp5F9lN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9588985, "rating": "", "rating_avg": 0, "replies_avg": 0, 
"authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1065915363133201743&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn;ubc.ca;zju.edu.cn;mails.tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;2", "aff_unique_norm": "Zhejiang University;University of British Columbia;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.ubc.ca;https://www.tsinghua.edu.cn", "aff_unique_abbr": "ZJU;UBC;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Canada" }, { "title": "Compositional Curvature Bounds for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34036", "id": "RtnGLJNtEG", "proceeding": "https://proceedings.mlr.press/v235/entesari24a.html", "pdf": "https://openreview.net/pdf?id=RtnGLJNtEG", "openreview": "https://openreview.net/forum?id=RtnGLJNtEG", "author_site": "Taha Entesari, Sina Sharifi, Mahyar Fazlyab", "tldr": "", "abstract": "A key challenge that threatens the widespread use of neural networks in safety-critical applications is their vulnerability to adversarial attacks. In this paper, we study the second-order behavior of continuously differentiable deep neural networks, focusing on robustness against adversarial perturbations. First, we provide a theoretical analysis of robustness and attack certificates for deep classifiers by leveraging local gradients and upper bounds on the second derivative (curvature constant). Next, we introduce a novel algorithm to analytically compute provable upper bounds on the second derivative of neural networks. This algorithm leverages the compositional structure of the model to propagate the curvature bound layer-by-layer, giving rise to a scalable and modular approach. The proposed bound can serve as a differentiable regularizer to control the curvature of neural networks during training, thereby enhancing robustness. 
Finally, we demonstrate the efficacy of our method on classification tasks using the MNIST and CIFAR-10 datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taha Entesari;Sina Sharifi;Mahyar Fazlyab", "authorids": "~Taha_Entesari1;~Sina_Sharifi1;~Mahyar_Fazlyab1", "gender": "M;M;M", "homepage": ";;https://www.ece.jhu.edu/mahyarfazlyab/", "dblp": "332/2244;;147/4846", "google_scholar": "5F1qfQ0AAAAJ;tZaIXKcAAAAJ;Y3bmjJwAAAAJ", "orcid": ";;", "linkedin": "tahaentesari/;;", "or_profile": "~Taha_Entesari1;~Sina_Sharifi1;~Mahyar_Fazlyab1", "aff": "Whiting School of Engineering, Johns Hopkins University;Johns Hopkins University;Johns Hopkins University", "aff_domain": "engineering.jhu.edu;jh.edu;jhu.edu", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nentesari2024compositional,\ntitle={Compositional Curvature Bounds for Deep Neural Networks},\nauthor={Taha Entesari and Sina Sharifi and Mahyar Fazlyab},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RtnGLJNtEG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 648776, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KDwK0XahQbAJ:scholar.google.com/&scioq=Compositional+Curvature+Bounds+for+Deep+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "engineering.jhu.edu;jh.edu;jhu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Whiting School of Engineering", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "0", "aff_campus_unique": "Baltimore;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Locally Differentially Private Decentralized Stochastic Bilevel Optimization with Guaranteed Convergence Accuracy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34035", "id": "RuH78kOcDi", "proceeding": "https://proceedings.mlr.press/v235/chen24ap.html", "pdf": "https://openreview.net/pdf?id=RuH78kOcDi", "openreview": "https://openreview.net/forum?id=RuH78kOcDi", "author_site": "Ziqin Chen, Yongqiang Wang", "tldr": "", "abstract": "Decentralized bilevel optimization based machine learning techniques are achieving remarkable success in a wide variety of domains. However, the intensive exchange of information (involving nested-loops of consensus or communication iterations) in existing decentralized bilevel optimization algorithms leads to a great challenge to ensure rigorous differential privacy, which, however, is necessary to bring the benefits of machine learning to domains where involved data are sensitive. By proposing a new decentralized stochastic bilevel-optimization algorithm which avoids nested-loops of information-exchange iterations, we achieve, for the first time, both differential privacy and accurate convergence in decentralized bilevel optimization. This is significant since even for single-level decentralized optimization and learning, existing differential-privacy solutions have to sacrifice convergence accuracy for privacy. Besides characterizing the convergence rate under nonconvex/convex/strongly convex conditions, we also rigorously quantify the price of differential privacy in the convergence rate. 
Experimental results on machine learning models confirm the efficacy of our algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziqin Chen;Yongqiang Wang", "authorids": "~Ziqin_Chen1;~Yongqiang_Wang1", "gender": "F;", "homepage": ";https://cecas.clemson.edu/ndcl/", "dblp": ";", "google_scholar": "i-IM2rIAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Ziqin_Chen1;~Yongqiang_Wang1", "aff": "Clemson University;Clemson University", "aff_domain": "clemson.edu;clemson.edu", "position": "Postdoc;Associate Professor", "bibtex": "@inproceedings{\nchen2024locally,\ntitle={Locally Differentially Private Decentralized Stochastic Bilevel Optimization with Guaranteed Convergence Accuracy},\nauthor={Ziqin Chen and Yongqiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RuH78kOcDi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1062821, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12967314906028798149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "clemson.edu;clemson.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Clemson University", "aff_unique_dep": "", "aff_unique_url": "https://www.clemson.edu", "aff_unique_abbr": "Clemson", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Finite Smoothing Algorithm for High-Dimensional Support Vector Machines and Quantile Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34034", "id": "RvwMTDYTOb", "proceeding": "https://proceedings.mlr.press/v235/tang24j.html", "pdf": "https://openreview.net/pdf?id=RvwMTDYTOb", "openreview": "https://openreview.net/forum?id=RvwMTDYTOb", "author_site": "Qian Tang, Yikai Zhang, Boxiang Wang", "tldr": "", "abstract": "This paper introduces a finite smoothing algorithm (FSA), a novel approach to tackle computational challenges in applying support vector machines (SVM) and quantile regression to high-dimensional data. The critical issue with these methods is the non-smooth nature of their loss functions, which traditionally limits the use of highly efficient coordinate descent techniques in high-dimensional settings. FSA innovatively addresses this issue by transforming these loss functions into their smooth counterparts, thereby facilitating more efficient computation. A distinctive feature of FSA is its theoretical foundation: FSA can yield exact solutions, not just approximations, despite the smoothing approach. Our simulation and benchmark tests demonstrate that FSA significantly outpaces its competitors in speed, often by orders of magnitude, while improving or at least maintaining precision. 
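One common way to smooth the hinge loss, shown here in Python for illustration (the exact surrogate in the paper and in the R packages named in the next sentence may differ), replaces the kink at margin 1 with a quadratic band of width delta:

```python
import numpy as np

def smoothed_hinge(t, delta=0.5):
    """Huberized hinge: equals the hinge outside a width-delta band
    around the kink, quadratic (hence differentiable) inside it.

    t: margin values y_i * (x_i @ beta)
    """
    return np.where(
        t >= 1.0, 0.0,
        np.where(t <= 1.0 - delta,
                 1.0 - t - delta / 2.0,            # linear tail, shifted
                 (1.0 - t) ** 2 / (2.0 * delta))   # quadratic smoothing
    )

t = np.array([-0.5, 0.6, 0.9, 1.2])
print(smoothed_hinge(t))          # smooth everywhere, hinge-like tails
print(np.maximum(0, 1 - t))       # ordinary hinge for comparison
```

A differentiable surrogate of this kind is what makes coordinate descent applicable; the paper's distinctive claim is that its smoothing still yields exact solutions.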
We have implemented FSA in two open-source R packages: hdsvm for high-dimensional SVM and hdqr for high-dimensional quantile regression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qian Tang;Yikai Zhang;Boxiang Wang", "authorids": "~Qian_Tang2;~Yikai_Zhang4;~Boxiang_Wang1", "gender": "F;M;", "homepage": ";;https://myweb.uiowa.edu/boxwang/", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": "tia-qian-tang-5a445a169/;yikai-zhang-66a01b160;", "or_profile": "~Qian_Tang2;~Yikai_Zhang4;~Boxiang_Wang1", "aff": "University of Iowa;University of Iowa;University of Iowa", "aff_domain": "uiowa.edu;uiowa.edu;uiowa.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ntang2024finite,\ntitle={Finite Smoothing Algorithm for High-Dimensional Support Vector Machines and Quantile Regression},\nauthor={Qian Tang and Yikai Zhang and Boxiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RvwMTDYTOb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 414005, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sRpQQ6fG8PsJ:scholar.google.com/&scioq=Finite+Smoothing+Algorithm+for+High-Dimensional+Support+Vector+Machines+and+Quantile+Regression&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "uiowa.edu;uiowa.edu;uiowa.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Iowa", "aff_unique_dep": "", "aff_unique_url": "https://www.uiowa.edu", "aff_unique_abbr": "UIowa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Multi-Patch Prediction: Adapting Language Models for Time Series Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34033", "id": "Rx9GMufByc", "proceeding": "https://proceedings.mlr.press/v235/bian24a.html", "pdf": "https://openreview.net/pdf?id=Rx9GMufByc", "openreview": "https://openreview.net/forum?id=Rx9GMufByc", "author_site": "Yuxuan Bian, Xuan Ju, Jiangtong Li, Zhijian Xu, Dawei Cheng, Qiang Xu", "tldr": "", "abstract": "In this study, we present $\\text{aL\\small{LM}4T\\small{S}}$, an innovative framework that adapts Large Language Models (LLMs) for time-series representation learning. Central to our approach is that we reconceive time-series forecasting as a self-supervised, multi-patch prediction task, which, compared to traditional mask-and-reconstruction methods, captures temporal dynamics in patch representations more effectively. Our strategy encompasses two-stage training: (i). a causal continual pre-training phase on various time-series datasets, anchored on next patch prediction, effectively syncing LLM capabilities with the intricacies of time-series data; (ii). fine-tuning for multi-patch prediction in the targeted time-series context. A distinctive element of our framework is the patch-wise decoding layer, which departs from previous methods reliant on sequence-level decoding. Such a design directly transposes individual patches into temporal sequences, thereby significantly bolstering the model's proficiency in mastering temporal patch-based representations. 
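The parameter argument for patch-wise decoding is easy to see at the shape level (a toy comparison; the dimensions and plain linear heads are assumptions, not the paper's configuration):

```python
import numpy as np

rng = np.random.default_rng(0)
n_patches, d_model, patch_len = 12, 64, 16
H = rng.normal(size=(n_patches, d_model))        # backbone output, one row per patch

# Sequence-level decoding: flatten everything into one big linear head.
W_seq = rng.normal(size=(n_patches * d_model, n_patches * patch_len))
y_seq = (H.reshape(-1) @ W_seq).reshape(n_patches, patch_len)

# Patch-wise decoding: one small head applied to each patch independently,
# so parameters do not grow with the number of patches.
W_patch = rng.normal(size=(d_model, patch_len))
y_patch = H @ W_patch

print(W_seq.size, W_patch.size)   # 147456 vs 1024 parameters
```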
$\\text{aL\\small{LM}4T\\small{S}}$ demonstrates superior performance in several downstream tasks, proving its effectiveness in deriving temporal representations with enhanced transferability and marking a pivotal advancement in the adaptation of LLMs for time-series analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxuan Bian;Xuan Ju;Jiangtong Li;Zhijian Xu;Dawei Cheng;Qiang Xu", "authorids": "~Yuxuan_Bian1;~Xuan_Ju1;~Jiangtong_Li2;~Zhijian_Xu1;~Dawei_Cheng1;~Qiang_Xu1", "gender": "M;F;M;M;M;M", "homepage": "https://yxbian23.github.io;https://juxuan27.github.io/;http://notfornow.com;http://cs1.tongji.edu.cn/~dawei/;https://github.com/cure-lab;https://www.jiangtongli.me", "dblp": "357/3628;34/8495;72/8350;135/6864;43/1230-1;220/0990", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=pWzvK20AAAAJ;;4UD20ukAAAAJ;https://scholar.google.com.tw/citations?user=eSiKPqUAAAAJ;https://scholar.google.com.hk/citations?user=bJc_mGMAAAAJ", "orcid": ";;;0000-0002-5877-7387;;0000-0003-3873-4053", "linkedin": "yuxuan-bian-433455268/;%E7%92%87-%E9%9E%A0-122070217/;;;;", "or_profile": "~Yuxuan_Bian1;~Xuan_Ju1;~Zhijian_Xu1;~Dawei_Cheng1;~Qiang_Xu1;~jiangtong_li1", "aff": "The Chinese University of Hong Kong;Chinese University of Hong Kong;The Chinese University of Hong Kong;Tongji University;The Chinese University of Hong Kong;Shanghai Jiaotong University", "aff_domain": "cse.cuhk.edu.hk;cuhk.hk;cuhk.edu.hk;tongji.edu.cn;cuhk.edu.hk;sjtu.edu.cn", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nbian2024multipatch,\ntitle={Multi-Patch Prediction: Adapting Language Models for Time Series Representation Learning},\nauthor={Yuxuan Bian and Xuan Ju and Jiangtong Li and Zhijian Xu and Dawei Cheng and Qiang Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Rx9GMufByc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7357694, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1833381520543069071&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "cse.cuhk.edu.hk;cuhk.hk;cuhk.edu.hk;tongji.edu.cn;cuhk.edu.hk;sjtu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;2", "aff_unique_norm": "Chinese University of Hong Kong;Tongji University;Shanghai Jiao Tong University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tongji.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "CUHK;Tongji;SJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "EvTexture: Event-driven Texture Enhancement for Video Super-Resolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34032", "id": "Ry4RAzdOWl", "proceeding": "https://proceedings.mlr.press/v235/kai24a.html", "pdf": "https://openreview.net/pdf?id=Ry4RAzdOWl", "openreview": "https://openreview.net/forum?id=Ry4RAzdOWl", "author_site": "Dachun Kai, Jiayao Lu, Yueyi Zhang, Xiaoyan Sun", "tldr": "", "abstract": "Event-based vision has drawn increasing attention due to its unique characteristics, such as high temporal resolution and high dynamic range. 
It has been used in video super-resolution (VSR) recently to enhance the flow estimation and temporal alignment. Rather than for motion learning, we propose in this paper the first VSR method that utilizes event signals for texture enhancement. Our method, called EvTexture, leverages high-frequency details of events to better recover texture regions in VSR. In our EvTexture, a new texture enhancement branch is presented. We further introduce an iterative texture enhancement module to progressively explore the high-temporal-resolution event information for texture restoration. This allows for gradual refinement of texture regions across multiple iterations, leading to more accurate and rich high-resolution details. Experimental results show that our EvTexture achieves state-of-the-art performance on four datasets. For the Vid4 dataset with rich textures, our method can get up to 4.67dB gain compared with recent event-based methods. Code: [https://github.com/DachunKai/EvTexture](https://github.com/DachunKai/EvTexture).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dachun Kai;Jiayao Lu;Yueyi Zhang;Xiaoyan Sun", "authorids": "~Dachun_Kai1;~Jiayao_Lu1;~Yueyi_Zhang2;~Xiaoyan_Sun1", "gender": "M;M;;F", "homepage": ";;;", "dblp": "361/0216;;;13/1574-1.html", "google_scholar": ";;LatWlFAAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;", "linkedin": ";%E5%AE%B6%E5%B0%A7-%E9%99%86-4556a7145/;;", "or_profile": "~Dachun_Kai1;~Jiayao_Lu1;~Yueyi_Zhang2;~Xiaoyan_Sun1", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;PhD student;Associate Researcher;Full Professor", "bibtex": "@inproceedings{\nkai2024evtexture,\ntitle={EvTexture: Event-driven Texture Enhancement for Video Super-Resolution},\nauthor={Dachun Kai and Jiayao Lu and Yueyi Zhang and Xiaoyan Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ry4RAzdOWl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7775669, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15860118274303209235&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Why Do Animals Need Shaping? A Theory of Task Composition and Curriculum Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34031", "id": "S0DPCE7tt4", "proceeding": "https://proceedings.mlr.press/v235/lee24r.html", "pdf": "https://openreview.net/pdf?id=S0DPCE7tt4", "openreview": "https://openreview.net/forum?id=S0DPCE7tt4", "author_site": "Jin Hwa Lee, Stefano Mannelli, Andrew Saxe", "tldr": "", "abstract": "Diverse studies in systems neuroscience begin with extended periods of curriculum training known as \u2018shaping\u2019 procedures. 
These involve progressively studying component parts of more complex tasks, and can make the difference between learning a task quickly, slowly or not at all. Despite the importance of shaping to the acquisition of complex tasks, there is as yet no theory that can help guide the design of shaping procedures, or more fundamentally, provide insight into its key role in learning. Modern deep reinforcement learning systems might implicitly learn compositional primitives within their multilayer policy networks. Inspired by these models, we propose and analyse a model of deep policy gradient learning of simple compositional reinforcement learning tasks. Using the tools of statistical physics, we solve for exact learning dynamics and characterise different learning strategies including primitives pre-training, in which task primitives are studied individually before learning compositional tasks. We find a complex interplay between task complexity and the efficacy of shaping strategies. Overall, our theory provides an analytical understanding of the benefits of shaping in a class of compositional tasks and a quantitative account of how training protocols can disclose useful task primitives, ultimately yielding faster and more robust learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jin Hwa Lee;Stefano Sarao Mannelli;Andrew M Saxe", "authorids": "~Jin_Hwa_Lee1;~Stefano_Sarao_Mannelli1;~Andrew_M_Saxe1", "gender": "F;M;M", "homepage": ";https://stefsmlab.github.io/;https://www.saxelab.org", "dblp": "277/8357;232/3343;39/6894", "google_scholar": "https://scholar.google.it/citations?view_op=list_works;https://scholar.google.it/citations?user=Kq272_MAAAAJ;h0Al1fcAAAAJ", "orcid": ";;0000-0002-9831-8812", "linkedin": "jin-hwa-lee-297149155/;;", "or_profile": "~Jin_Hwa_Lee1;~Stefano_Sarao_Mannelli1;~Andrew_M_Saxe1", "aff": "University College London, University of London;University College London;University College London, University of London", "aff_domain": "ucl.ac.uk;ucl.ac.uk;ucl.ac.uk", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nlee2024why,\ntitle={Why Do Animals Need Shaping? 
A Theory of Task Composition and Curriculum Learning},\nauthor={Jin Hwa Lee and Stefano Sarao Mannelli and Andrew M Saxe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S0DPCE7tt4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2252178, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3071619021600612012&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "ucl.ac.uk;ucl.ac.uk;ucl.ac.uk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "On the Independence Assumption in Neurosymbolic Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34030", "id": "S1gSrruVd4", "proceeding": "https://proceedings.mlr.press/v235/van-krieken24a.html", "pdf": "https://openreview.net/pdf?id=S1gSrruVd4", "openreview": "https://openreview.net/forum?id=S1gSrruVd4", "author_site": "Emile van Krieken, Pasquale Minervini, Edoardo Ponti, Antonio Vergari", "tldr": "", "abstract": "State-of-the-art neurosymbolic learning systems use probabilistic reasoning to guide neural networks towards predictions that conform to logical constraints. Many such systems assume that the probabilities of the considered symbols are conditionally independent given the input to simplify learning and reasoning. We study and criticise this assumption, highlighting how it can hinder optimisation and prevent uncertainty quantification. We prove that loss functions bias conditionally independent neural networks to become overconfident in their predictions. As a result, they are unable to represent uncertainty over multiple valid options. Furthermore, we prove that the minima of such loss functions are usually highly disconnected and non-convex, and thus difficult to optimise. 
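The overconfidence claim has a compact two-symbol illustration (a toy numeric check, not an excerpt from the paper): under the constraint "exactly one of two binary symbols is true", the best conditionally independent model collapses onto a single valid option instead of spreading mass over both.

```python
import numpy as np

def p_valid(pa, pb):
    """Probability that exactly one of two independent Bernoullis fires."""
    return pa * (1 - pb) + (1 - pa) * pb

# The true posterior may be genuinely uncertain: 50/50 over (1,0) and (0,1).
# Scan independent models for the one most consistent with the constraint.
grid = np.linspace(0, 1, 101)
PA, PB = np.meshgrid(grid, grid)
score = p_valid(PA, PB)
best = np.unravel_index(score.argmax(), score.shape)
print(grid[best[1]], grid[best[0]], score.max())
# -> pa = 1.0, pb = 0.0: the optimum is deterministic (overconfident);
#    the fair mixture over the two valid options is not representable
#    by any product distribution p(a) * p(b).
```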
Our theoretical analysis gives the foundation for replacing the conditional independence assumption and designing more expressive neurosymbolic probabilistic models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emile van Krieken;Pasquale Minervini;Edoardo Ponti;Antonio Vergari", "authorids": "~Emile_van_Krieken1;~Pasquale_Minervini4;~Edoardo_Ponti1;~Antonio_Vergari3", "gender": "M;;M;M", "homepage": "https://emilevankrieken.com;https://ducdauge.github.io/;http://nolovedeeplearning.com;https://www.neuralnoise.com", "dblp": "235/1698;178/8829;http://dblp.uni-trier.de/pers/hd/v/Vergari:Antonio;58/10142", "google_scholar": "https://scholar.google.nl/citations?user=il8Y0B4AAAAJ;https://scholar.google.ca/citations?user=tklL2q0AAAAJ;YK0NLaUAAAAJ;https://scholar.google.it/citations?user=9sk6CSgAAAA", "orcid": "0000-0001-5502-4817;0000-0002-6308-1050;0000-0003-0036-5678;0000-0002-8442-602X", "linkedin": "emile-van-krieken/;edoardo-maria-ponti/;;pasquale-mauro-minervini-47a08324/", "or_profile": "~Emile_van_Krieken1;~Edoardo_Ponti1;~antonio_vergari2;~Pasquale_Minervini1", "aff": "Vrije Universiteit Amsterdam;NVIDIA;University of Edinburgh, University of Edinburgh;University of Edinburgh, University of Edinburgh", "aff_domain": "vu.nl;nvidia.com;ed.ac.uk;ed.ac.uk", "position": "PhD student;Researcher;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nkrieken2024on,\ntitle={On the Independence Assumption in Neurosymbolic Learning},\nauthor={Emile van Krieken and Pasquale Minervini and Edoardo Ponti and Antonio Vergari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S1gSrruVd4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3683576, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7852001078043166934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "vu.nl;nvidia.com;ed.ac.uk;ed.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Vrije Universiteit Amsterdam;NVIDIA;University of Edinburgh", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.vu.nl;https://www.nvidia.com;https://www.ed.ac.uk", "aff_unique_abbr": "VU Amsterdam;NVIDIA;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Netherlands;United States;United Kingdom" }, { "title": "Equilibrium of Data Markets with Externality", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34029", "id": "S2XgbBCJy0", "proceeding": "https://proceedings.mlr.press/v235/hossain24a.html", "pdf": "https://openreview.net/pdf?id=S2XgbBCJy0", "openreview": "https://openreview.net/forum?id=S2XgbBCJy0", "author_site": "Safwan Hossain, Yiling Chen", "tldr": "", "abstract": "We model real-world data markets, where sellers post fixed prices and buyers are free to purchase from any set of sellers, as a simultaneous game. A key component here is the negative externality buyers induce on one another due to data purchases. Starting with a simple setting where buyers know their valuations a priori, we characterize both the existence and welfare properties of the pure Nash equilibrium in the presence of such externality. 
While the outcomes are bleak without any intervention, mirroring the limitations of current data markets, we prove that for a standard class of externality functions, platforms intervening through a transaction cost can lead to a pure equilibrium with strong welfare guarantees. We next consider a more realistic setting where buyers learn their valuations over time through market interactions. Our intervention is feasible here as well, and we consider learning algorithms to achieve low regret concerning both individual and cumulative utility metrics. Lastly, we analyze the promises of this intervention under a much richer externality model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Safwan Hossain;Yiling Chen", "authorids": "~Safwan_Hossain1;~Yiling_Chen1", "gender": "M;F", "homepage": "https://safwanhossain.github.io/;https://yiling.seas.harvard.edu/", "dblp": ";72/3762-1", "google_scholar": "https://scholar.google.ca/citations?user=gyCQnUAAAAAJ;x_7xA0UAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Safwan_Hossain1;~Yiling_Chen1", "aff": "Harvard University;Harvard University", "aff_domain": "harvard.edu;fas.harvard.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nhossain2024equilibrium,\ntitle={Equilibrium of Data Markets with Externality},\nauthor={Safwan Hossain and Yiling Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S2XgbBCJy0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 918544, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=54247217457251049&as_sdt=4000005&sciodt=0,18&hl=en", "gs_version_total": 7, "email": "harvard.edu;fas.harvard.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Practical Performance Guarantees for Pipelined DNN Inference", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34028", "id": "S3xqyEaST9", "proceeding": "https://proceedings.mlr.press/v235/archer24a.html", "pdf": "https://openreview.net/pdf?id=S3xqyEaST9", "openreview": "https://openreview.net/forum?id=S3xqyEaST9", "author_site": "Aaron Archer, Matthew Fahrbach, Kuikui Liu, Prakash Prabhu", "tldr": "", "abstract": "We optimize pipeline parallelism for deep neural network (DNN) inference by partitioning model graphs into $k$ stages and minimizing the running time of the bottleneck stage, including communication. We give practical and effective algorithms for this NP-hard problem, but our emphasis is on tackling the practitioner's dilemma of deciding when a solution is good enough. To this end, we design novel mixed integer programming (MIP) relaxations for proving lower bounds. Applying these methods to a diverse testbed of 369 production models, for $k \\in \\\\{2, 4, 8, 16, 32, 64\\\\}$, we empirically show that these lower bounds are strong enough to be useful in practice. Our lower bounds are substantially stronger than standard combinatorial bounds. 
For example, evaluated via geometric means across a production testbed with $k = 16$ pipeline stages, our MIP formulations raise the lower bound from 0.4598 to 0.9452, expressed as a fraction of the best partition found. In other words, our improved lower bounds close the optimality gap by a factor of 9.855x.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron Archer;Matthew Fahrbach;Kuikui Liu;Prakash Prabhu", "authorids": "~Aaron_Archer1;~Matthew_Fahrbach1;~Kuikui_Liu1;~Prakash_Prabhu1", "gender": "M;;;M", "homepage": "https://research.google/people/AaronArcher/;;https://toc.csail.mit.edu/user/461;", "dblp": "97/1504;;230/3619;32/5749", "google_scholar": "7-TwsxsAAAAJ;;;", "orcid": "0000-0002-7761-2000;;;", "linkedin": "aaron-archer-939014134/;;;", "or_profile": "~Aaron_Archer1;~Matthew_Fahrbach1;~Kuikui_Liu1;~Prakash_Prabhu1", "aff": "Google Research;;Massachusetts Institute of Technology;Google", "aff_domain": "research.google.com;;mit.edu;google.com", "position": "Researcher;;Assistant Professor;Software Engineer", "bibtex": "@inproceedings{\narcher2024practical,\ntitle={Practical Performance Guarantees for Pipelined {DNN} Inference},\nauthor={Aaron Archer and Matthew Fahrbach and Kuikui Liu and Prakash Prabhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S3xqyEaST9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2156245, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NHiPTjidBcwJ:scholar.google.com/&scioq=Practical+Performance+Guarantees+for+Pipelined+DNN+Inference&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "research.google.com;;mit.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://web.mit.edu", "aff_unique_abbr": "Google Research;MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Be Your Own Neighborhood: Detecting Adversarial Examples by the Neighborhood Relations Built on Self-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34027", "id": "S4LqI6CcJ3", "proceeding": "https://proceedings.mlr.press/v235/he24l.html", "pdf": "https://openreview.net/pdf?id=S4LqI6CcJ3", "openreview": "https://openreview.net/forum?id=S4LqI6CcJ3", "author_site": "Zhiyuan He, Yijun Yang, Pin-Yu Chen, Qiang Xu, Tsung-Yi Ho", "tldr": "", "abstract": "Deep Neural Networks (DNNs) are vulnerable to Adversarial Examples (AEs), hindering their use in safety-critical systems. In this paper, we present **BEYOND**, an innovative AE detection framework designed for reliable predictions. BEYOND identifies AEs by distinguishing the AE\u2019s abnormal relation with its augmented versions, i.e. neighbors, from two prospects: representation similarity and label consistency. An off-the-shelf Self-Supervised Learning (SSL) model is used to extract the representation and predict the label for its highly informative representation capacity compared to supervised learning models. 
We find that clean samples maintain a high degree of representation similarity and label consistency relative to their neighbors, in contrast to AEs, which exhibit significant discrepancies. We explain this observation and show that, by leveraging this discrepancy, BEYOND can accurately detect AEs. Additionally, we develop a rigorous justification for the effectiveness of BEYOND. Furthermore, as a plug-and-play model, BEYOND can easily cooperate with the Adversarial Trained Classifier (ATC), achieving state-of-the-art (SOTA) robustness accuracy. Experimental results show that BEYOND outperforms baselines by a large margin, especially under adaptive attacks. Empowered by the robust relationship built on SSL, we find that BEYOND outperforms baselines in terms of both detection ability and speed. Project page: https://huggingface.co/spaces/allenhzy/Be-Your-Own-Neighborhood.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyuan He;Yijun Yang;Pin-Yu Chen;Qiang Xu;Tsung-Yi Ho", "authorids": "~Zhiyuan_He2;~Yijun_Yang5;~Pin-Yu_Chen1;~Qiang_Xu1;~Tsung-Yi_Ho2", "gender": ";;M;M;M", "homepage": ";;http://www.pinyuchen.com;https://github.com/cure-lab;https://www.cse.cuhk.edu.hk/people/faculty/tsung-yi-ho/", "dblp": ";;39/8969;43/1230-1;63/4181.html", "google_scholar": ";;jxwlCUUAAAAJ;https://scholar.google.com.tw/citations?user=eSiKPqUAAAAJ;TRDUYkAAAAAJ", "orcid": ";;0000-0003-1039-8369;;0000-0001-7348-5625", "linkedin": ";;pin-yu-chen-940062a2;;", "or_profile": "~Zhiyuan_He2;~Yijun_Yang5;~Pin-Yu_Chen1;~Qiang_Xu1;~Tsung-Yi_Ho2", "aff": ";;International Business Machines;The Chinese University of Hong Kong;Department of Computer Science and Engineering, The Chinese University of Hong Kong", "aff_domain": ";;ibm.com;cuhk.edu.hk;cse.cuhk.edu.hk", "position": ";;Principal Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhe2024be,\ntitle={Be Your Own Neighborhood: Detecting Adversarial Examples by the Neighborhood Relations Built on Self-Supervised Learning},\nauthor={Zhiyuan He and Yijun Yang and Pin-Yu Chen and Qiang Xu and Tsung-Yi Ho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S4LqI6CcJ3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1395291, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14773029716196574449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": ";;ibm.com;cuhk.edu.hk;cse.cuhk.edu.hk", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "International Business Machines Corporation;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "IBM;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "title": "Position: Social Environment Design Should be Further Developed for AI-based Policy-Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34026", "id": "S6a6gHvMWx", "proceeding": "https://proceedings.mlr.press/v235/zhang24cl.html", "pdf": "https://openreview.net/pdf?id=S6a6gHvMWx", "openreview": "https://openreview.net/forum?id=S6a6gHvMWx", "author_site": "Edwin Zhang, Sadie Zhao, Tonghan Wang, Safwan Hossain, Henry Gasztowtt, Stephan Zheng, David Parkes, Milind Tambe, Yiling Chen", "tldr": 
"", "abstract": "Artificial Intelligence (AI) holds promise as a technology that can be used to improve government and economic policy-making. This paper proposes a new research agenda towards this end by introducing **Social Environment Design**, a general framework for the use of AI in automated policy-making that connects with the Reinforcement Learning, EconCS, and Computational Social Choice communities. The framework seeks to capture general economic environments, includes voting on policy objectives, and gives a direction for the systematic analysis of government and economic policy through AI simulation. We highlight key open problems for future research in AI-based policymaking. By solving these challenges, we hope to achieve various social welfare objectives, thereby promoting more ethical and responsible decision making.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edwin Zhang;Sadie Zhao;Tonghan Wang;Safwan Hossain;Henry Gasztowtt;Stephan Zheng;David C. Parkes;Milind Tambe;Yiling Chen", "authorids": "~Edwin_Zhang2;~Sadie_Zhao1;~Tonghan_Wang1;~Safwan_Hossain1;henry.gasztowtt@chch.ox.ac.uk;~Stephan_Zheng1;~David_C._Parkes1;~Milind_Tambe1;~Yiling_Chen1", "gender": ";;M;M;;M;M;;F", "homepage": "https://eddie.win;;https://tonghanwang.github.io/;https://safwanhossain.github.io/;;http://www.stephanzheng.com;https://parkes.seas.harvard.edu/;http://teamcore.seas.harvard.edu/tambe;https://yiling.seas.harvard.edu/", "dblp": ";;175/6039-1.html;;;https://dblp.org/pers/hd/z/Zheng:Stephan;p/DavidCParkes.html;67/2667;72/3762-1", "google_scholar": ";;-AR1yc4AAAAJ;https://scholar.google.ca/citations?user=gyCQnUAAAAAJ;;7mnKGGEAAAAJ;JUn8PgwAAAAJ;YOVZiJkAAAAJ;x_7xA0UAAAAJ", "orcid": ";;;;;;0000-0002-2701-3464;;", "linkedin": ";;;;;stephanzheng;;;", "or_profile": "~Edwin_Zhang2;~Sadie_Zhao1;~Tonghan_Wang1;~Safwan_Hossain1;henry.gasztowtt@chch.ox.ac.uk;~Stephan_Zheng1;~David_C._Parkes1;~Milind_Tambe1;~Yiling_Chen1", "aff": "Harvard University;;Tsinghua University;Harvard University;;;Harvard University;Google;Harvard University", "aff_domain": "harvard.edu;;tsinghua.edu.cn;harvard.edu;;;harvard.edu;google.com;fas.harvard.edu", "position": "PhD student;;MS student;PhD student;;;Full Professor;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nzhang2024position,\ntitle={Position: Social Environment Design Should be Further Developed for {AI}-based Policy-Making},\nauthor={Edwin Zhang and Sadie Zhao and Tonghan Wang and Safwan Hossain and Henry Gasztowtt and Stephan Zheng and David C. 
Parkes and Milind Tambe and Yiling Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S6a6gHvMWx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1611001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11932892051496618572&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "harvard.edu;;tsinghua.edu.cn;harvard.edu;;;harvard.edu;google.com;fas.harvard.edu", "author_num": 9, "aff_unique_index": "0;1;0;0;2;0", "aff_unique_norm": "Harvard University;Tsinghua University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.harvard.edu;https://www.tsinghua.edu.cn;https://www.google.com", "aff_unique_abbr": "Harvard;THU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "A Primal-Dual Algorithm for Offline Constrained Reinforcement Learning with Linear MDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34025", "id": "S80a4hJtuE", "proceeding": "https://proceedings.mlr.press/v235/hong24e.html", "pdf": "https://openreview.net/pdf?id=S80a4hJtuE", "openreview": "https://openreview.net/forum?id=S80a4hJtuE", "author_site": "Kihyuk Hong, Ambuj Tewari", "tldr": "", "abstract": "We study offline reinforcement learning (RL) with linear MDPs under the infinite-horizon discounted setting, which aims to learn a policy that maximizes the expected discounted cumulative reward using a pre-collected dataset. Existing algorithms for this setting either require a uniform data coverage assumption or are computationally inefficient for finding an $\\epsilon$-optimal policy with $\\mathcal{O}(\\epsilon^{-2})$ sample complexity. In this paper, we propose a primal-dual algorithm for offline RL with linear MDPs in the infinite-horizon discounted setting. Our algorithm is the first computationally efficient algorithm in this setting that achieves a sample complexity of $\\mathcal{O}(\\epsilon^{-2})$ under a partial data coverage assumption. Our work is an improvement upon a recent work that requires $\\mathcal{O}(\\epsilon^{-4})$ samples. 
Moreover, we extend our algorithm to work in the offline constrained RL setting that enforces constraints on additional reward signals.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kihyuk Hong;Ambuj Tewari", "authorids": "~Kihyuk_Hong1;~Ambuj_Tewari1", "gender": "M;M", "homepage": ";https://www.ambujtewari.com", "dblp": ";24/567", "google_scholar": ";ttbl4FsAAAAJ", "orcid": ";0000-0001-6969-7844", "linkedin": "hominot/;", "or_profile": "~Kihyuk_Hong1;~Ambuj_Tewari1", "aff": "University of Michigan - Ann Arbor;University of Michigan - Ann Arbor", "aff_domain": "umich.edu;umich.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nhong2024a,\ntitle={A Primal-Dual Algorithm for Offline Constrained Reinforcement Learning with Linear {MDP}s},\nauthor={Kihyuk Hong and Ambuj Tewari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S80a4hJtuE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 469467, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17289037158259775867&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "umich.edu;umich.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Adaptive-Gradient Policy Optimization: Enhancing Policy Learning in Non-Smooth Differentiable Simulations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34024", "id": "S9DV6ZP4eE", "proceeding": "https://proceedings.mlr.press/v235/gao24m.html", "pdf": "https://openreview.net/pdf?id=S9DV6ZP4eE", "openreview": "https://openreview.net/forum?id=S9DV6ZP4eE", "author_site": "Feng Gao, Liangzhi Shi, Shenao Zhang, Zhaoran Wang, Yi Wu", "tldr": "", "abstract": "Recent advancements in differentiable simulators highlight the potential of policy optimization using simulation gradients. Yet, these approaches are largely contingent on the continuity and smoothness of the simulation, which precludes the use of certain simulation engines, such as Mujoco. To tackle this challenge, we introduce the adaptive analytic gradient. This method views the Q function as a surrogate for future returns, consistent with the Bellman equation. By analyzing the variance of batched gradients, our method can autonomously opt for a more resilient Q function to compute the gradient when encountering rough simulation transitions. We also put forth the Adaptive-Gradient Policy Optimization (AGPO) algorithm, which leverages our proposed method for policy learning. On the theoretical side, we demonstrate AGPO's convergence, emphasizing its stable performance under non-smooth dynamics due to low variance. 
On the empirical side, our results show that AGPO effectively mitigates the challenges posed by non-smoothness in policy learning through differentiable simulation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feng Gao;Liangzhi Shi;Shenao Zhang;Zhaoran Wang;Yi Wu", "authorids": "~Feng_Gao5;~Liangzhi_Shi1;~Shenao_Zhang1;~Zhaoran_Wang1;~Yi_Wu1", "gender": "M;M;M;Not Specified;M", "homepage": ";https://github.com/slzhta;https://shenao-zhang.github.io/;https://zhaoranwang.github.io/;https://jxwuyi.weebly.com", "dblp": ";;253/4543.html;117/2756;", "google_scholar": "wzcIdLAAAAAJ;;8NamuusAAAAJ;https://scholar.google.com.tw/citations?user=HSx0BgQAAAAJ;dusV5HMAAAAJ", "orcid": ";;;;", "linkedin": ";;shenao-zhang-055a53178/;;", "or_profile": "~Feng_Gao5;~Liangzhi_Shi1;~Shenao_Zhang1;~Zhaoran_Wang1;~Yi_Wu1", "aff": "IIIS, Tsinghua University;University of California, San Diego;Georgia Institute of Technology;Northwestern University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;ucsd.edu;gatech.edu;northwestern.edu;tsinghua.edu.cn", "position": "PhD student;Intern;MS student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\ngao2024adaptivegradient,\ntitle={Adaptive-Gradient Policy Optimization: Enhancing Policy Learning in Non-Smooth Differentiable Simulations},\nauthor={Feng Gao and Liangzhi Shi and Shenao Zhang and Zhaoran Wang and Yi Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S9DV6ZP4eE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2078886, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17720820685731461562&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "tsinghua.edu.cn;ucsd.edu;gatech.edu;northwestern.edu;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Tsinghua University;University of California, San Diego;Georgia Institute of Technology;Northwestern University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucsd.edu;https://www.gatech.edu;https://www.northwestern.edu", "aff_unique_abbr": "THU;UCSD;Georgia Tech;NU", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34023", "id": "S9lk6dk4LL", "proceeding": "https://proceedings.mlr.press/v235/jin24f.html", "pdf": "https://openreview.net/pdf?id=S9lk6dk4LL", "openreview": "https://openreview.net/forum?id=S9lk6dk4LL", "author_site": "Yang Jin, Zhicheng Sun, Kun Xu, Kun Xu, Liwei Chen, Hao Jiang, Quzhe Huang, Chengru Song, Yuliang Liu, Di ZHANG, Yang Song, Kun Gai, Yadong Mu", "tldr": "", "abstract": "In light of recent advances in multimodal Large Language Models (LLMs), there is increasing attention to scaling them from image-text data to more informative real-world videos. Compared to static images, video poses unique challenges for effective large-scale pre-training due to the modeling of its spatiotemporal dynamics. 
In this paper, we address such limitations in video-language pre-training with an efficient video decomposition that represents each video as keyframes and temporal motions. These are then adapted to an LLM using well-designed tokenizers that discretize visual and temporal information as a few tokens, thus enabling unified generative pre-training of videos, images, and text. At inference, the generated tokens from the LLM are carefully recovered to the original continuous pixel space to create various video content. Our proposed framework is both capable of comprehending and generating image and video content, as demonstrated by its competitive performance across 13 multimodal benchmarks in image and video understanding and generation. Our code and models are available at https://video-lavit.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Jin;Zhicheng Sun;Kun Xu;Kun Xu;Liwei Chen;Hao Jiang;Quzhe Huang;Chengru Song;Yuliang Liu;Di ZHANG;Yang Song;Kun Gai;Yadong MU", "authorids": "~Yang_Jin1;~Zhicheng_Sun1;~Kun_Xu4;~Kun_Xu6;~Liwei_Chen3;~Hao_Jiang10;~Quzhe_Huang1;~Chengru_Song1;~Yuliang_Liu3;~Di_ZHANG3;~Yang_Song6;~Kun_Gai1;~Yadong_MU1", "gender": "M;M;;M;M;;;M;M;M;M;M;M", "homepage": ";https://feifeiobama.github.io;https://sites.google.com/view/kunxu2/home;;;https://jianghao.com;https://andrewzhe.github.io/;;https://github.com/YuliangLiu0306;;http://sonyis.me;;http://www.muyadong.com/", "dblp": "51/3584;331/1484-1;;;;;278/1884;144/1365;;;24/4470-8;59/2902;55/1817", "google_scholar": "O3NlYwEAAAAJ;Xa8dgkYAAAAJ;;MGTIEMIAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;;;;tvWB_yUAAAAJ;PXO4ygEAAAAJ;https://scholar.google.com.tw/citations?user=Fqqx4HsAAAAJ", "orcid": ";;;;;;;;;0009-0006-5475-2728;;;", "linkedin": ";zhicheng-sun;;;;;;;;;;;", "or_profile": "~Yang_Jin1;~Zhicheng_Sun1;~Kun_Xu4;~Kun_Xu6;~Liwei_Chen3;~Hao_Jiang10;~Quzhe_Huang1;~Chengru_Song1;~Yuliang_Liu3;~Di_ZHANG3;~Yang_Song6;~Kun_Gai1;~Yadong_MU1", "aff": "Peking University;Peking University;Tencent AI Lab;Kuaishou- \u5feb\u624b\u79d1\u6280;Kuaishou- \u5feb\u624b\u79d1\u6280;Peking University;Peking University;Kuaishou- \u5feb\u624b\u79d1\u6280;;Kuaishou Technology;Kuaishou Technology;Kuaishou- \u5feb\u624b\u79d1\u6280;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;tencent.com;kuaishou.com;kuaishou.com;pku.edu.cn;pku.edu.cn;kuaishou.com;;kuaishou.com;kuaishou.com;kuaishou.com;pku.edu.cn", "position": "PhD student;PhD student;Researcher;Researcher;Researcher;PhD student;PhD student;Principal Researcher;;VP;Researcher;Instructor;Associate Professor", "bibtex": "@inproceedings{\njin2024videolavit,\ntitle={Video-La{VIT}: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization},\nauthor={Yang Jin and Zhicheng Sun and Kun Xu and Kun Xu and Liwei Chen and Hao Jiang and Quzhe Huang and Chengru Song and Yuliang Liu and Di ZHANG and Yang Song and Kun Gai and Yadong MU},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S9lk6dk4LL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9460386, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3825542749986207979&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": 
"pku.edu.cn;pku.edu.cn;tencent.com;kuaishou.com;kuaishou.com;pku.edu.cn;pku.edu.cn;kuaishou.com;;kuaishou.com;kuaishou.com;kuaishou.com;pku.edu.cn", "author_num": 13, "aff_unique_index": "0;0;1;2;2;0;0;2;2;2;2;0", "aff_unique_norm": "Peking University;Tencent;Kuaishou Technology", "aff_unique_dep": ";Tencent AI Lab;", "aff_unique_url": "http://www.pku.edu.cn;https://ai.tencent.com;https://www.kuaishou.com", "aff_unique_abbr": "Peking U;Tencent AI Lab;Kuaishou", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Efficient Low-Rank Matrix Estimation, Experimental Design, and Arm-Set-Dependent Low-Rank Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34022", "id": "SAEUO7847g", "proceeding": "https://proceedings.mlr.press/v235/jang24e.html", "pdf": "https://openreview.net/pdf?id=SAEUO7847g", "openreview": "https://openreview.net/forum?id=SAEUO7847g", "author_site": "Kyoungseok Jang, Chicheng Zhang, Kwang-Sung Jun", "tldr": "", "abstract": "We study low-rank matrix trace regression and the related problem of low-rank matrix bandits. Assuming access to the distribution of the covariates, we propose a novel low-rank matrix estimation method called *LowPopArt* and provide its recovery guarantee that depends on a novel quantity denoted by $B(Q)$ that characterizes the hardness of the problem, where $Q$ is the covariance matrix of the measurement distribution. We show that our method can provide tighter recovery guarantees than classical nuclear norm penalized least squares (Koltchinskii et al., 2011) in several problems. To perform an efficient estimation with a limited number of measurements from an arbitrarily given measurement set $\\mathcal{A}$, we also propose a novel experimental design criterion that minimizes $B(Q)$ with computational efficiency. We leverage our novel estimator and design of experiments to derive two low-rank linear bandit algorithms for general arm sets that enjoy improved regret upper bounds. This improves over previous works on low-rank bandits, which make somewhat restrictive assumptions that the arm set is the unit ball or that an efficient exploration distribution is given. 
To our knowledge, our experimental design criterion is the first one tailored to low-rank matrix estimation beyond the naive reduction to linear regression, which can be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyoungseok Jang;Chicheng Zhang;Kwang-Sung Jun", "authorids": "~Kyoungseok_Jang1;~Chicheng_Zhang1;~Kwang-Sung_Jun1", "gender": "M;M;M", "homepage": "https://jajajang.github.io;http://zcc1307.github.io;http://kwangsungjun.github.io", "dblp": "296/8698;149/2402;88/8411", "google_scholar": "hDqIvzAAAAAJ;29B3BAgAAAAJ;VgvC7o8AAAAJ", "orcid": "0009-0002-6689-4601;;", "linkedin": "kyoungseok-jang-856440219/;;", "or_profile": "~Kyoungseok_Jang1;~Chicheng_Zhang1;~Kwang-Sung_Jun1", "aff": "New York University;University of Arizona;University of Arizona", "aff_domain": "nyu.edu;arizona.edu;cs.arizona.edu", "position": "Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njang2024efficient,\ntitle={Efficient Low-Rank Matrix Estimation, Experimental Design, and Arm-Set-Dependent Low-Rank Bandits},\nauthor={Kyoungseok Jang and Chicheng Zhang and Kwang-Sung Jun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SAEUO7847g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1484584, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9557030337144386027&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "email": "nyu.edu;arizona.edu;cs.arizona.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "New York University;University of Arizona", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.arizona.edu", "aff_unique_abbr": "NYU;UA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Parallel Affine Transformation Tuning of Markov Chain Monte Carlo", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34021", "id": "SAXp5dMYv7", "proceeding": "https://proceedings.mlr.press/v235/schar24a.html", "pdf": "https://openreview.net/pdf?id=SAXp5dMYv7", "openreview": "https://openreview.net/forum?id=SAXp5dMYv7", "author_site": "Philip Sch\u00e4r, Michael Habeck, Daniel Rudolf", "tldr": "", "abstract": "The performance of Markov chain Monte Carlo samplers strongly depends on the properties of the target distribution such as its covariance structure, the location of its probability mass and its tail behavior. We explore the use of bijective affine transformations of the sample space to improve the properties of the target distribution and thereby the performance of samplers running in the transformed space. In particular, we propose a flexible and user-friendly scheme for adaptively learning the affine transformation during sampling. 
Moreover, the combination of our scheme with Gibbsian polar slice sampling is shown to produce samples of high quality at comparatively low computational cost in several settings based on real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Philip Sch\u00e4r;Michael Habeck;Daniel Rudolf", "authorids": "~Philip_Sch\u00e4r1;~Michael_Habeck1;~Daniel_Rudolf1", "gender": "M;;M", "homepage": "https://microscopic-image-analysis.github.io/team/philip-schaer.html;;https://staff.fim.uni-passau.de/~rudolf/", "dblp": ";67/5618;", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?hl=de;", "orcid": ";0000-0002-2188-5667;", "linkedin": "philip-sch%C3%A4r-6a190b250/;;", "or_profile": "~Philip_Sch\u00e4r1;~Michael_Habeck1;~Daniel_Rudolf1", "aff": "Friedrich-Schiller Universit\u00e4t Jena;University of Jena;Universit\u00e4t Passau", "aff_domain": "uni-jena.de;uni-jena.de;uni-passau.de", "position": "PhD student;Professor;Full Professor", "bibtex": "@inproceedings{\nsch{\\\"a}r2024parallel,\ntitle={Parallel Affine Transformation Tuning of Markov Chain Monte Carlo},\nauthor={Philip Sch{\\\"a}r and Michael Habeck and Daniel Rudolf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SAXp5dMYv7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1910219, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3315724846471046677&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "uni-jena.de;uni-jena.de;uni-passau.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Friedrich-Schiller-Universit\u00e4t Jena;Friedrich Schiller University Jena;University of Passau", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-jena.de;https://www.uni-jena.de/;https://www.uni-passau.de", "aff_unique_abbr": "FSU Jena;FSU Jena;UP", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jena;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Winner-takes-all learners are geometry-aware conditional density estimators", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34020", "id": "SAbL40d8A4", "proceeding": "https://proceedings.mlr.press/v235/letzelter24a.html", "pdf": "https://openreview.net/pdf?id=SAbL40d8A4", "openreview": "https://openreview.net/forum?id=SAbL40d8A4", "author_site": "Victor Letzelter, David Perera, Cédric Rommel, Mathieu Fontaine, Slim Essid, Ga\u00ebl Richard, Patrick Perez", "tldr": "", "abstract": "Winner-takes-all training is a simple learning paradigm, which handles ambiguous tasks by predicting a set of plausible hypotheses. Recently, a connection was established between Winner-takes-all training and centroidal Voronoi tessellations, showing that, once trained, hypotheses should quantize optimally the shape of the conditional distribution to predict. However, the best use of these hypotheses for uncertainty quantification is still an open question. In this work, we show how to leverage the appealing geometric properties of the Winner-takes-all learners for conditional density estimation, without modifying its original training scheme. 
We theoretically establish the advantages of our novel estimator both in terms of quantization and density estimation, and we demonstrate its competitiveness on synthetic and real-world datasets, including audio data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Victor Letzelter;David Perera;C\u00e9dric Rommel;Mathieu Fontaine;Slim Essid;Ga\u00ebl Richard;Patrick Perez", "authorids": "~Victor_Letzelter1;david.perera@telecom-paris.fr;~C\u00e9dric_Rommel1;~Mathieu_Fontaine1;~Slim_Essid1;~Ga\u00ebl_Richard1;~Patrick_Perez1", "gender": ";;M;M;Not Specified;M;", "homepage": "https://victorletzelter.github.io;;https://cedricrommel.github.io/;https://matfontaine.github.io/;https://perso.telecom-paris.fr/essid/;https://perso.telecom-paristech.fr/grichard/;", "dblp": "360/0588;;295/9766;;53/6904;34/1310;", "google_scholar": "https://scholar.google.fr/citations?user=YhTdZh8AAAAJ;;GBv4KYwAAAAJ;https://scholar.google.fr/citations?user=xDMdhVgAAAAJ;5dP_Pv0AAAAJ;https://scholar.google.fr/citations?user=xn70tPIAAAAJ;", "orcid": ";;;;;;", "linkedin": "victor-letzelter-3b832219b;;cedric-rommel/;;;;", "or_profile": "~Victor_Letzelter1;david.perera@telecom-paris.fr;~C\u00e9dric_Rommel1;~Mathieu_Fontaine1;~Slim_Essid1;~Ga\u00ebl_Richard1;~Patrick_Perez1", "aff": "T\u00e9l\u00e9com ParisTech;;Valeo;T\u00e9l\u00e9com ParisTech;T\u00e9l\u00e9com ParisTech;Telecom Paris;", "aff_domain": "telecom-paristech.fr;;valeo.com;telecom-paristech.fr;telecom-paristech.fr;telecom-paris.fr;", "position": "PhD student;;Researcher;Associate Professor;Full Professor;Full Professor;", "bibtex": "@inproceedings{\nletzelter2024winnertakesall,\ntitle={Winner-takes-all learners are geometry-aware conditional density estimators},\nauthor={Victor Letzelter and David Perera and C{\\'e}dric Rommel and Mathieu Fontaine and Slim Essid and Ga{\\\"e}l Richard and Patrick Perez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SAbL40d8A4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4614683, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17257844895172219175&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "email": "telecom-paristech.fr;;valeo.com;telecom-paristech.fr;telecom-paristech.fr;telecom-paris.fr;", "author_num": 7, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;Valeo;Telecom Paris", "aff_unique_dep": ";;", "aff_unique_url": "https://www.telecom-paristech.fr;https://www.valeo.com;https://www.telecom-paris.fr", "aff_unique_abbr": "TP;;Telecom Paris", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "France" }, { "title": "Decomposable Submodular Maximization in Federated Setting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34019", "id": "SAbZExIIgG", "proceeding": "https://proceedings.mlr.press/v235/rafiey24a.html", "pdf": "https://openreview.net/pdf?id=SAbZExIIgG", "openreview": "https://openreview.net/forum?id=SAbZExIIgG", "tldr": "", "abstract": "Submodular functions, as well as the sub-class of decomposable submodular functions, and their optimization appear in a wide range of applications in machine learning, recommendation systems, and welfare maximization. 
However, optimization of decomposable submodular functions with millions of component functions is computationally prohibitive. Furthermore, the component functions may be private (they might represent user preference functions, for example) and cannot be widely shared. To address these issues, we propose a *federated optimization* setting for decomposable submodular optimization. In this setting, clients have their own preference functions, and a weighted sum of these preferences needs to be maximized. We implement the popular *continuous greedy* algorithm in this setting, where clients take small local steps in parallel towards the local solution and the local changes are then aggregated at a central server. To address the large number of clients, the aggregation is performed only on a subsampled set. Further, the aggregation is performed only intermittently between stretches of parallel local steps, which reduces communication cost significantly. We show that our federated algorithm is guaranteed to provide a good approximate solution, even in the presence of the above cost-cutting measures. Finally, we show how the federated setting can be incorporated into solving fundamental discrete submodular optimization problems such as Maximum Coverage and Facility Location.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Akbar Rafiey", "authorids": "~Akbar_Rafiey1", "gender": "", "homepage": "https://akbarrafiey.github.io", "dblp": "166/1694.html", "google_scholar": "", "orcid": "0000-0003-1619-3997", "linkedin": "akbar-rafiey-b42040114/", "or_profile": "~Akbar_Rafiey1", "aff": "University of California, San Diego", "aff_domain": "ucsd.edu", "position": "Postdoc", "bibtex": "@inproceedings{\nrafiey2024decomposable,\ntitle={Decomposable Submodular Maximization in Federated Setting},\nauthor={Akbar Rafiey},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SAbZExIIgG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 422170, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16848365224823001563&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ucsd.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "DetKDS: Knowledge Distillation Search for Object Detectors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34018", "id": "SBR8Gwe1E2", "proceeding": "https://proceedings.mlr.press/v235/li24c.html", "pdf": "https://openreview.net/pdf?id=SBR8Gwe1E2", "openreview": "https://openreview.net/forum?id=SBR8Gwe1E2", "author_site": "Lujun Li, Yufan Bao, Peijie Dong, Chuanguang Yang, Anggeng Li, Wenhan Luo, Qifeng Liu, Wei Xue, Yike Guo", "tldr": "", "abstract": "In this paper, we present DetKDS, the first framework that searches for optimal detection distillation policies. Manual design of detection distillers becomes challenging and time-consuming due to significant disparities in distillation behaviors between detectors with different backbones, paradigms, and label assignments. 
To tackle these challenges, we leverage search algorithms to discover optimal distillers for homogeneous and heterogeneous student-teacher pairs. Firstly, our search space encompasses global features, foreground-background features, instance features, logits response, and localization response as inputs. Then, we construct omni-directional cascaded transformations and obtain the distiller by selecting the advanced distance function and common weight value options. Finally, we present a divide-and-conquer evolutionary algorithm to handle the explosion of the search space. In this strategy, we first evolve the best distiller formulations of individual knowledge inputs and then optimize the combined weights of these multiple distillation losses. DetKDS automates the distillation process without requiring expert design or additional tuning, effectively reducing the teacher-student gap in various scenarios. Based on the analysis of our search results, we provide valuable guidance that contributes to detection distillation designs. Comprehensive experiments on different detectors demonstrate that DetKDS outperforms state-of-the-art methods in detection and instance segmentation tasks. For instance, DetKDS achieves significant gains over baseline detectors: $+3.7$, $+4.1$, $+4.0$, $+3.7$, and $+3.5$ AP on RetinaNet, Faster-RCNN, FCOS, RepPoints, and GFL, respectively. Code at: https://github.com/lliai/DetKDS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lujun Li;Yufan Bao;Peijie Dong;Chuanguang Yang;Anggeng Li;Wenhan Luo;Qifeng Liu;Wei Xue;Yike Guo", "authorids": "~Lujun_Li1;~Yufan_Bao1;~Peijie_Dong1;~Chuanguang_Yang1;~Anggeng_Li1;~Wenhan_Luo1;~Qifeng_Liu1;~Wei_Xue5;~Yike_Guo1", "gender": ";F;M;M;M;M;M;M;M", "homepage": ";https://github.com/Yufan-Bao;https://pprp.github.io;https://winycg.github.io/;https://github.com/AgL2;https://whluo.github.io/;;http://www.wei-xue.com;https://cse.hkust.edu.hk/admin/people/faculty/profile/yikeguo", "dblp": ";;315/4734;241/6325;354/8829.html;64/9877;23/992.html;;g/YikeGuo", "google_scholar": ";;TqS6s4gAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;g20Q12MAAAAJ;scR1CXcAAAAJ;77lSoywAAAAJ;https://scholar.google.com.tw/citations?user=-0q6cIYAAAAJ", "orcid": ";;0000-0003-1952-4544;0000-0001-5890-289X;;0000-0002-5697-4168;0000-0001-6191-076X;;0009-0005-8401-282X", "linkedin": ";;;;;wenhan-luo-a1843480/;qifeng-liu-483b3227/;;", "or_profile": "~Lujun_Li1;~Yufan_Bao1;~Peijie_Dong1;~Chuanguang_Yang1;~Anggeng_Li1;~Wenhan_Luo1;~Qifeng_Liu1;~Wei_Xue5;~Yike_Guo1", "aff": ";iFLYTEK Research;The Hong Kong University of Science and Technology (Guang Zhou);Institute of Computing Technology, Chinese Academy of Sciences;Huawei Technologies Ltd.;Sun Yat-sen University;Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;Imperial College London", "aff_domain": ";iflytek.com;connect.hkust-gz.edu.cn;ict.ac.cn;huawei.com;sysu.edu.cn;hkust.edu.hk;ust.hk;imperial.ac.uk", "position": ";Intern;Phd student;Assistant Professor;Intern;Associate Professor;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nli2024detkds,\ntitle={Det{KDS}: Knowledge Distillation Search for Object Detectors},\nauthor={Lujun Li and Yufan Bao and Peijie Dong and Chuanguang Yang and Anggeng Li and Wenhan Luo and Qifeng Liu and Wei Xue and Yike Guo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SBR8Gwe1E2}\n}", "github": "", "project": 
"", "reviewers": "", "pdf_size": 3635759, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=650238385278705556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";iflytek.com;connect.hkust-gz.edu.cn;ict.ac.cn;huawei.com;sysu.edu.cn;hkust.edu.hk;ust.hk;imperial.ac.uk", "author_num": 9, "aff_unique_index": "0;1;2;3;4;1;1;5", "aff_unique_norm": "iFLYTEK;Hong Kong University of Science and Technology;Chinese Academy of Sciences;Huawei;Sun Yat-sen University;Imperial College London", "aff_unique_dep": "Research;;Institute of Computing Technology;Huawei Technologies;;", "aff_unique_url": "https://www.iflytek.com;https://www.ust.hk;http://www.ict.ac.cn;https://www.huawei.com;http://www.sysu.edu.cn/;https://www.imperial.ac.uk", "aff_unique_abbr": "iFLYTEK;HKUST;CAS;Huawei;SYSU;ICL", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "China;United Kingdom" }, { "title": "Confidence-aware Contrastive Learning for Selective Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34017", "id": "SDCx6rQV2l", "proceeding": "https://proceedings.mlr.press/v235/wu24s.html", "pdf": "https://openreview.net/pdf?id=SDCx6rQV2l", "openreview": "https://openreview.net/forum?id=SDCx6rQV2l", "author_site": "Yu-Chang Wu, Shen-Huan Lyu, Haopu Shang, Xiangyu Wang, Chao Qian", "tldr": "", "abstract": "Selective classification enables models to make predictions only when they are sufficiently confident, aiming to enhance safety and reliability, which is important in high-stakes scenarios. Previous methods mainly use deep neural networks and focus on modifying the architecture of classification layers to enable the model to estimate the confidence of its prediction. This work provides a generalization bound for selective classification, disclosing that optimizing feature layers helps improve the performance of selective classification. Inspired by this theory, we propose to explicitly improve the selective classification model at the feature level for the first time, leading to a novel Confidence-aware Contrastive Learning method for Selective Classification, CCL-SC, which similarizes the features of homogeneous instances and differentiates the features of heterogeneous instances, with the strength controlled by the model's confidence. The experimental results on typical datasets, i.e., CIFAR-10, CIFAR-100, CelebA, and ImageNet, show that CCL-SC achieves significantly lower selective risk than state-of-the-art methods, across almost all coverage degrees. 
Moreover, it can be combined with existing methods to bring further improvement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu-Chang Wu;Shen-Huan Lyu;Haopu Shang;Xiangyu Wang;Chao Qian", "authorids": "~Yu-Chang_Wu1;~Shen-Huan_Lyu1;~Haopu_Shang1;~Xiangyu_Wang6;~Chao_Qian1", "gender": "M;M;M;M;M", "homepage": "https://www.lamda.nju.edu.cn/shanghp/;https://cs.nju.edu.cn/rinc/student/wangxiangyu/wangxiangyu.html;http://www.lamda.nju.edu.cn/qianc/;https://lyushenhuan.netlify.app;http://www.lamda.nju.edu.cn/wuyc/", "dblp": "318/3146;;84/8508-1;255/7033;141/8679", "google_scholar": "B5Bxh80AAAAJ;;;_NlDLQIAAAAJ;", "orcid": ";0000-0002-8466-4896;;0000-0002-0173-8408;", "linkedin": ";;;;", "or_profile": "~Haopu_Shang1;~Xiangyu_Wang6;~Chao_Qian1;~Shenhuan_Lv1;~Yuchang_Wu1", "aff": "Nanjing University;Nanjing University;Nanjing university;Hohai University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu;hhu.edu.cn;nju.edu.cn", "position": "PhD student;MS student;Full Professor;Researcher;MS student", "bibtex": "@inproceedings{\nwu2024confidenceaware,\ntitle={Confidence-aware Contrastive Learning for Selective Classification},\nauthor={Yu-Chang Wu and Shen-Huan Lyu and Haopu Shang and Xiangyu Wang and Chao Qian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SDCx6rQV2l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2048178, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4851598721206267981&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;nju.edu;hhu.edu.cn;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Nanjing University;Hohai University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.hohai.edu.cn", "aff_unique_abbr": "Nanjing U;Hohai", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "D-Flow: Differentiating through Flows for Controlled Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34016", "id": "SE20BFqj6J", "proceeding": "https://proceedings.mlr.press/v235/ben-hamu24a.html", "pdf": "https://openreview.net/pdf?id=SE20BFqj6J", "openreview": "https://openreview.net/forum?id=SE20BFqj6J", "author_site": "Heli Ben-Hamu, Omri Puny, Itai Gat, Brian Karrer, Uriel Singer, Yaron Lipman", "tldr": "", "abstract": "Taming the generation outcome of state of the art Diffusion and Flow-Matching (FM) models without having to re-train a task-specific model unlocks a powerful tool for solving inverse problems, conditional generation, and controlled generation in general. In this work we introduce *D-Flow*, a simple framework for controlling the generation process by differentiating through the flow, optimizing for the source (noise) point. We motivate this framework by our key observation stating that for Diffusion/FM models trained with Gaussian probability paths, differentiating through the generation process projects gradient on the data manifold, implicitly injecting the prior into the optimization process. 
We validate our framework on linear and non-linear controlled generation problems, including image and audio inverse problems and conditional molecule generation, reaching state-of-the-art performance across all.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Heli Ben-Hamu;Omri Puny;Itai Gat;Brian Karrer;Uriel Singer;Yaron Lipman", "authorids": "~Heli_Ben-Hamu1;~Omri_Puny1;~Itai_Gat1;~Brian_Karrer1;~Uriel_Singer1;~Yaron_Lipman1", "gender": ";M;M;M;;", "homepage": ";https://omri1348.github.io/;https://www.linkedin.com/in/itaigat/;;https://il.linkedin.com/in/urielsinger;", "dblp": ";267/5465;221/4128;27/7164;238/0243;", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;TnJqhXIAAAAJ;Wewcpo4AAAAJ;nIEep3cAAAAJ;", "orcid": ";;;;0000-0001-8451-8533;", "linkedin": ";omri-puny-0917771b2/;;;;", "or_profile": "~Heli_Ben-Hamu1;~Omri_Puny1;~Itai_Gat1;~Brian_Karrer1;~Uriel_Singer1;~Yaron_Lipman1", "aff": ";Weizmann Institute of Science;;Meta Fundamental AI Research (FAIR);Meta AI Research;", "aff_domain": ";weizmann.ac.il;;meta.com;meta.com;", "position": ";PhD student;;Researcher;Researcher;", "bibtex": "@inproceedings{\nben-hamu2024dflow,\ntitle={D-Flow: Differentiating through Flows for Controlled Generation},\nauthor={Heli Ben-Hamu and Omri Puny and Itai Gat and Brian Karrer and Uriel Singer and Yaron Lipman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SE20BFqj6J}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9672828, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2179784580606330040&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": ";weizmann.ac.il;;meta.com;meta.com;", "author_num": 6, "aff_unique_index": "0;1;1", "aff_unique_norm": "Weizmann Institute of Science;Meta", "aff_unique_dep": ";Fundamental AI Research", "aff_unique_url": "https://www.weizmann.org.il;https://meta.ai", "aff_unique_abbr": "Weizmann;Meta FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Israel;United States" }, { "title": "ESNet: Evolution and Succession Network for High-Resolution Salient Object Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34015", "id": "SERrqPDvoY", "proceeding": "https://proceedings.mlr.press/v235/liu24l.html", "pdf": "https://openreview.net/pdf?id=SERrqPDvoY", "openreview": "https://openreview.net/forum?id=SERrqPDvoY", "author_site": "Hongyu Liu, Runmin Cong, Hua Li, Qianqian Xu, Qingming Huang, Wei Zhang", "tldr": "", "abstract": "Preserving details and avoiding high computational costs are the two main challenges for the High-Resolution Salient Object Detection (HRSOD) task. In this paper, we propose a two-stage HRSOD model from the perspective of evolution and succession, including an evolution stage with a Low-resolution Location Model (LrLM) and a succession stage with a High-resolution Refinement Model (HrRM). The evolution stage achieves detail-preserving salient object localization on the low-resolution image through the evolution mechanisms on supervision and features; the succession stage utilizes the shallow high-resolution features to complement and enhance the features inherited from the first stage in a lightweight manner and generate the final high-resolution saliency prediction. 
In addition, a new metric named Boundary-Detail-aware Mean Absolute Error (${MAE}_{BD}$) is designed to evaluate the ability to detect details in high-resolution scenes. Extensive experiments on five datasets demonstrate that our network achieves superior performance at real-time speed (49 FPS) compared to state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongyu Liu;Runmin Cong;Hua Li;Qianqian Xu;Qingming Huang;Wei Zhang", "authorids": "~Hongyu_Liu4;~Runmin_Cong1;~Hua_Li8;~Qianqian_Xu2;~Qingming_Huang1;~Wei_Zhang7", "gender": "M;M;;F;;M", "homepage": "https://github.com/big-feather;https://rmcong.github.io/;;http://vipl.ict.ac.cn/people/~qianqianxu;;https://www.vsislab.com", "dblp": ";180/7852;;07/7627;;", "google_scholar": ";https://scholar.google.co.uk/citations?hl=en;;https://scholar.google.com.hk/citations?user=MjifS2MAAAAJ;;", "orcid": ";0000-0003-0972-4008;;;;", "linkedin": ";;;;;", "or_profile": "~Hongyu_Liu4;~Runmin_Cong1;~Hua_Li8;~Qianqian_Xu2;~Qingming_Huang1;~Wei_Zhang7", "aff": "Beijing Jiaotong University;Shandong University;;Institute of Computing Technology, Chinese Academy of Sciences;;Shandong University", "aff_domain": "bjtu.edu.cn;sdu.edu.cn;;ict.ac.cn;;sdu.edu.cn", "position": "MS student;Full Professor;;Full Professor;;Full Professor", "bibtex": "@inproceedings{\nliu2024esnet,\ntitle={{ESN}et: Evolution and Succession Network for High-Resolution Salient Object Detection},\nauthor={Hongyu Liu and Runmin Cong and Hua Li and Qianqian Xu and Qingming Huang and Wei Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SERrqPDvoY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3247647, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3990917450085232279&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "bjtu.edu.cn;sdu.edu.cn;;ict.ac.cn;;sdu.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Beijing Jiao Tong University;Shandong University;Chinese Academy of Sciences", "aff_unique_dep": ";;Institute of Computing Technology", "aff_unique_url": "http://www.njtu.edu.cn/en;http://www.sdu.edu.cn;http://www.ict.ac.cn", "aff_unique_abbr": "BJTU;SDU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Benchmarking Deletion Metrics with the Principled Explanations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34014", "id": "SKPhvzxO1g", "proceeding": "https://proceedings.mlr.press/v235/wang24br.html", "pdf": "https://openreview.net/pdf?id=SKPhvzxO1g", "openreview": "https://openreview.net/forum?id=SKPhvzxO1g", "author_site": "Yipei Wang, Xiaoqian Wang", "tldr": "", "abstract": "Insertion/deletion metrics and their variants have been extensively applied to evaluate attribution-based explanation methods. Such metrics measure the significance of features by observing changes in model predictions as features are incrementally inserted or deleted. Given the direct connection between the attribution values and model predictions that insertion/deletion metrics enable, they are commonly used as the decisive metrics for novel attribution methods. Such influential metrics for explanation methods should be handled with great scrutiny. 
However, contemporary research on insertion/deletion metrics falls short of a comprehensive analysis. To address this, we propose the TRAjectory importanCE (TRACE) framework, which achieves the best scores under the insertion/deletion metrics. Our contribution includes two aspects: 1) TRACE serves as a principled explanation of the influence of feature deletion on model predictions. We demonstrate that TRACE is guaranteed to achieve almost optimal results both theoretically and empirically. 2) Using TRACE, we benchmark insertion/deletion metrics across all possible settings, study critical problems such as the out-of-distribution (OOD) issue, and provide practical guidance on applying these metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yipei Wang;Xiaoqian Wang", "authorids": "~Yipei_Wang1;~Xiaoqian_Wang1", "gender": "M;F", "homepage": "https://yipei-wang.github.io;https://engineering.purdue.edu/~joywang/", "dblp": "140/2763;151/3215-1", "google_scholar": "NXENco8AAAAJ;I3tc214AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yipei_Wang1;~Xiaoqian_Wang1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024benchmarking,\ntitle={Benchmarking Deletion Metrics with the Principled Explanations},\nauthor={Yipei Wang and Xiaoqian Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SKPhvzxO1g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8274110, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ULWM0FjaposJ:scholar.google.com/&scioq=Benchmarking+Deletion+Metrics+with+the+Principled+Explanations&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "purdue.edu;purdue.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Causal Representation Learning Made Identifiable by Grouping of Observational Variables", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34013", "id": "SL6V527p1F", "proceeding": "https://proceedings.mlr.press/v235/morioka24a.html", "pdf": "https://openreview.net/pdf?id=SL6V527p1F", "openreview": "https://openreview.net/forum?id=SL6V527p1F", "author_site": "Hiroshi Morioka, Aapo Hyvarinen", "tldr": "", "abstract": "A topic of great current interest is Causal Representation Learning (CRL), whose goal is to learn a causal model for hidden features in a data-driven manner. Unfortunately, CRL is severely ill-posed since it is a combination of the two notoriously ill-posed problems of representation learning and causal discovery. Yet, finding practical identifiability conditions that guarantee a unique solution is crucial for its practical applicability. Most approaches so far have been based on assumptions about the latent causal mechanisms, such as temporal causality, or the existence of supervision or interventions; these can be too restrictive in actual applications. Here, we show identifiability based on novel, weak constraints, which require no temporal structure, interventions, or weak supervision. 
The approach is based on assuming that the observational mixing exhibits a suitable grouping of the observational variables. We also propose a novel self-supervised estimation framework consistent with the model, prove its statistical consistency, and experimentally show its superior CRL performance compared to state-of-the-art baselines. We further demonstrate its robustness against latent confounders and causal cycles.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hiroshi Morioka;Aapo Hyvarinen", "authorids": "~Hiroshi_Morioka1;~Aapo_Hyvarinen1", "gender": ";", "homepage": "https://sites.google.com/view/hiroshimorioka/;https://www.cs.helsinki.fi/u/ahyvarin/", "dblp": "67/10335;56/3623", "google_scholar": "https://scholar.google.co.jp/citations?user=pt03Xx8AAAAJ;https://scholar.google.co.jp/citations?user=UnrY-40AAAAJ", "orcid": ";0000-0002-5806-4432", "linkedin": ";", "or_profile": "~Hiroshi_Morioka1;~Aapo_Hyvarinen1", "aff": "RIKEN;University of Helsinki", "aff_domain": "riken.jp;helsinki.fi", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nmorioka2024causal,\ntitle={Causal Representation Learning Made Identifiable by Grouping of Observational Variables},\nauthor={Hiroshi Morioka and Aapo Hyvarinen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SL6V527p1F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8800410264008199114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "riken.jp;helsinki.fi", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "RIKEN;University of Helsinki", "aff_unique_dep": ";", "aff_unique_url": "https://www.riken.jp;https://www.helsinki.fi", "aff_unique_abbr": "RIKEN;UH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Japan;Finland" }, { "title": "Few-Shot Unsupervised Implicit Neural Shape Representation Learning with Spatial Adversaries", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34012", "id": "SLqdDWwibH", "proceeding": "https://proceedings.mlr.press/v235/ouasfi24a.html", "pdf": "https://openreview.net/pdf?id=SLqdDWwibH", "openreview": "https://openreview.net/forum?id=SLqdDWwibH", "author_site": "Amine Ouasfi, Adnane Boukhayma", "tldr": "", "abstract": "Implicit Neural Representations have gained prominence as a powerful framework for capturing complex data modalities, encompassing a wide range from 3D shapes to images and audio. Within the realm of 3D shape representation, Neural Signed Distance Functions (SDF) have demonstrated remarkable potential in faithfully encoding intricate shape geometry. However, learning SDFs from sparse 3D point clouds in the absence of ground truth supervision remains a very challenging task. While recent methods rely on smoothness priors to regularize the learning, our method introduces a regularization term that leverages adversarial samples around the shape to improve the learned SDFs. 
Through extensive experiments and evaluations, we illustrate the efficacy of our proposed method, highlighting its capacity to improve SDF learning with respect to baselines and the state-of-the-art, using synthetic and real data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amine Ouasfi;Adnane Boukhayma", "authorids": "~Amine_Ouasfi1;~Adnane_Boukhayma2", "gender": ";M", "homepage": ";https://boukhayma.github.io/", "dblp": "324/2085;172/2146", "google_scholar": "IdcK7TcAAAAJ;", "orcid": ";", "linkedin": "amine-ouasfi/;", "or_profile": "~Amine_Ouasfi1;~Adnane_Boukhayma2", "aff": "INRIA;INRIA", "aff_domain": "inria.fr;inria.fr", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nouasfi2024fewshot,\ntitle={Few-Shot Unsupervised Implicit Neural Shape Representation Learning with Spatial Adversaries},\nauthor={Amine Ouasfi and Adnane Boukhayma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SLqdDWwibH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9125186, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14285305077357609247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "inria.fr;inria.fr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Time-Series Forecasting for Out-of-Distribution Generalization Using Invariant Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34011", "id": "SMUXPVKUBg", "proceeding": "https://proceedings.mlr.press/v235/liu24ae.html", "pdf": "https://openreview.net/pdf?id=SMUXPVKUBg", "openreview": "https://openreview.net/forum?id=SMUXPVKUBg", "author_site": "Haoxin Liu, Harshavardhan Kamarthi, Lingkai Kong, Zhiyuan Zhao, Chao Zhang, B. Aditya Prakash", "tldr": "", "abstract": "Time-series forecasting (TSF) finds broad applications in real-world scenarios. Due to the dynamic nature of time-series data, it is crucial for TSF models to preserve out-of-distribution (OOD) generalization abilities, as training and test sets represent historical and future data respectively. In this paper, we aim to alleviate the inherent OOD problem in TSF via invariant learning. We identify fundamental challenges of invariant learning for TSF. First, the target variables in TSF may not be sufficiently determined by the input due to unobserved core variables, breaking the fundamental assumption of invariant learning. Second, time-series datasets lack adequate environment labels, while existing environmental inference methods are not suitable for TSF. To address these challenges, we propose FOIL, a model-agnostic framework that equips time-series forecasting for out-of-distribution generalization via invariant learning. Specifically, FOIL employs a novel surrogate loss to mitigate the impact of unobserved variables. Further, FOIL implements joint optimization by alternately inferring environments effectively with a multi-head network while preserving the temporal adjacency structure and learning invariant representations across inferred environments for OOD generalized TSF. 
Extensive experiments demonstrate that the proposed FOIL significantly and consistently improves the performance of various TSF models, achieving gains of up to 85%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoxin Liu;Harshavardhan Kamarthi;Lingkai Kong;Zhiyuan Zhao;Chao Zhang;B. Aditya Prakash", "authorids": "~Haoxin_Liu3;~Harshavardhan_Kamarthi1;~Lingkai_Kong1;~Zhiyuan_Zhao1;~Chao_Zhang15;~B._Aditya_Prakash2", "gender": ";M;M;M;;", "homepage": ";https://www.harsha-pk.com;https://lingkai-kong.com/;https://leozhao1997.github.io/;http://chaozhang.org/;https://www.cc.gatech.edu/~badityap/", "dblp": ";245/8927;20/10253;;94/3019-14;06/3956", "google_scholar": ";LNXEjT8AAAAJ;https://scholar.google.com/citations?hl=en;TzWPFmwAAAAJ;https://scholar.google.com/citations?hl=en;C-NftTgAAAAJ", "orcid": ";0000-0002-2901-7127;0000-0001-6480-513X;0009-0005-6671-705x;0000-0003-3009-598X;0000-0002-3252-455X", "linkedin": ";harshavardhan-kamarthi-462928112/;;;;", "or_profile": "~Haoxin_Liu3;~Harshavardhan_Kamarthi1;~Lingkai_Kong1;~Zhiyuan_Zhao1;~Chao_Zhang15;~B._Aditya_Prakash2", "aff": ";Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": ";gatech.edu;gatech.edu;gatech.edu;gatech.edu;gatech.edu", "position": ";PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024timeseries,\ntitle={Time-Series Forecasting for Out-of-Distribution Generalization Using Invariant Learning},\nauthor={Haoxin Liu and Harshavardhan Kamarthi and Lingkai Kong and Zhiyuan Zhao and Chao Zhang and B. Aditya Prakash},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SMUXPVKUBg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 649587, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15391506719812803568&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";gatech.edu;gatech.edu;gatech.edu;gatech.edu;gatech.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Mapping the Multiverse of Latent Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34010", "id": "SPBxFwIdMk", "proceeding": "https://proceedings.mlr.press/v235/wayland24a.html", "pdf": "https://openreview.net/pdf?id=SPBxFwIdMk", "openreview": "https://openreview.net/forum?id=SPBxFwIdMk", "author_site": "Jeremy Wayland, Corinna Coupette, Bastian Rieck", "tldr": "", "abstract": "Echoing recent calls to counter reliability and robustness concerns in machine learning via *multiverse analysis*, we present PRESTO, a principled framework for *mapping the multiverse* of machine-learning models that rely on *latent representations*. Although such models enjoy widespread adoption, the variability in their embeddings remains poorly understood, resulting in unnecessary complexity and untrustworthy representations. 
Our framework uses *persistent homology* to characterize the latent spaces arising from different combinations of diverse machine-learning methods, (hyper)parameter configurations, and datasets, allowing us to measure their pairwise *(dis)similarity* and statistically reason about their *distributions*. As we demonstrate both theoretically and empirically, our pipeline preserves desirable properties of collections of latent representations, and it can be leveraged to perform sensitivity analysis, detect anomalous embeddings, or efficiently and effectively navigate hyperparameter search spaces.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeremy Wayland;Corinna Coupette;Bastian Rieck", "authorids": "~Jeremy_Wayland1;~Corinna_Coupette1;~Bastian_Rieck1", "gender": "M;Non-Binary;M", "homepage": "https://jeremy-wayland.me/;https://coupette.io;https://bastian.rieck.me", "dblp": ";259/0911;119/8860", "google_scholar": "FucrCk8AAAAJ;gwtvRF4AAAAJ;https://scholar.google.ch/citations?user=La7zuKQAAAAJ", "orcid": "0000-0002-8766-8737;0000-0001-9151-2092;0000-0003-4335-0302", "linkedin": "jeremy-wayland/;;br-ml/", "or_profile": "~Jeremy_Wayland1;~Corinna_Coupette1;~Bastian_Rieck1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Bucerius Center for Legal Technology and Data Science;Helmholtz Zentrum M\u00fcnchen", "aff_domain": "tum.de;law-school.de;helmholtz-munich.de", "position": "PhD student;Researcher;Principal Investigator", "bibtex": "@inproceedings{\nwayland2024mapping,\ntitle={Mapping the Multiverse of Latent Representations},\nauthor={Jeremy Wayland and Corinna Coupette and Bastian Rieck},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SPBxFwIdMk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2233608, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10503172766269718596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "tum.de;law-school.de;helmholtz-munich.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Bucerius Law School;Helmholtz Zentrum M\u00fcnchen", "aff_unique_dep": ";Center for Legal Technology and Data Science;", "aff_unique_url": "https://www.tum.de;https://www.bucerius.de/en/;https://www.helmholtz-muenchen.de", "aff_unique_abbr": "TUM;Bucerius;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "A Provable Decision Rule for Out-of-Distribution Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34009", "id": "SPygKwms0X", "proceeding": "https://proceedings.mlr.press/v235/ma24t.html", "pdf": "https://openreview.net/pdf?id=SPygKwms0X", "openreview": "https://openreview.net/forum?id=SPygKwms0X", "author_site": "Xinsong Ma, Xin Zou, Weiwei Liu", "tldr": "", "abstract": "The out-of-distribution (OOD) detection task plays a key role in reliable and safety-critical applications. Existing research mainly focuses on designing or training powerful score functions but overlooks the decision rule based on the proposed score function. In contrast to previous work, this paper aims to design a decision rule with rigorous theoretical guarantees and strong empirical performance. 
Specifically, we provide a new insight into the OOD detection task from a hypothesis-testing perspective and propose a novel generalized Benjamini-Hochberg (g-BH) procedure with empirical p-values to solve the testing problem. Theoretically, the g-BH procedure controls the false discovery rate (FDR) at a pre-specified level. Furthermore, we derive an upper bound on the expected false positive rate (FPR) of the g-BH procedure based on the tailed generalized Gaussian distribution family, indicating that the FPR of the g-BH procedure converges to zero in probability. Finally, extensive experimental results verify the superiority of the g-BH procedure over the traditional threshold-based decision rule on several OOD detection benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinsong Ma;Xin Zou;Weiwei Liu", "authorids": "~Xinsong_Ma1;~Xin_Zou3;~Weiwei_Liu1", "gender": ";M;M", "homepage": ";https://zouxinn.github.io/;https://sites.google.com/site/weiweiliuhomepage/", "dblp": ";18/6081-2;54/6677-3.html", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;", "linkedin": ";;weiwei-liu-4a7849134/", "or_profile": "~Xinsong_Ma1;~Xin_Zou3;~Weiwei_Liu1", "aff": ";Wuhan University;Wuhan University", "aff_domain": ";whu.edu.cn;whu.edu.cn", "position": ";PhD student;Full Professor", "bibtex": "@inproceedings{\nma2024a,\ntitle={A Provable Decision Rule for Out-of-Distribution Detection},\nauthor={Xinsong Ma and Xin Zou and Weiwei Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SPygKwms0X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 472950, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8341735003267318972&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 4, "email": ";whu.edu.cn;whu.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "RoboGen: Towards Unleashing Infinite Data for Automated Robot Learning via Generative Simulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34008", "id": "SQIDlJd3hN", "proceeding": "https://proceedings.mlr.press/v235/wang24cc.html", "pdf": "https://openreview.net/pdf?id=SQIDlJd3hN", "openreview": "https://openreview.net/forum?id=SQIDlJd3hN", "author_site": "Yufei Wang, Zhou Xian, Feng Chen, Johnson Tsun-Hsuan Wang, Yian Wang, Katerina Fragkiadaki, Zackory Erickson, David Held, Chuang Gan", "tldr": "", "abstract": "We present RoboGen, a generative robotic agent that automatically learns diverse robotic skills at scale via generative simulation. RoboGen leverages the latest advancements in foundation and generative models. Instead of directly adapting these models to produce policies or low-level actions, we advocate for a generative scheme, which uses these models to automatically generate diversified tasks, scenes, and training supervisions, thereby scaling up robotic skill learning with minimal human supervision. 
Our approach equips a robotic agent with a self-guided propose-generate-learn cycle: the agent first proposes interesting tasks and skills to develop, and then generates simulation environments by populating pertinent assets with proper spatial configurations. Afterwards, the agent decomposes the proposed task into sub-tasks, selects the optimal learning approach (reinforcement learning, motion planning, or trajectory optimization), generates required training supervision, and then learns policies to acquire the proposed skill. Our fully generative pipeline can be queried repeatedly, producing an endless stream of skill demonstrations associated with diverse tasks and environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yufei Wang;Zhou Xian;Feng Chen;Tsun-Hsuan Wang;Yian Wang;Katerina Fragkiadaki;Zackory Erickson;David Held;Chuang Gan", "authorids": "~Yufei_Wang4;~Zhou_Xian1;~Feng_Chen16;~Tsun-Hsuan_Wang2;~Yian_Wang1;~Katerina_Fragkiadaki1;~Zackory_Erickson1;~David_Held1;~Chuang_Gan1", "gender": ";M;M;M;F;M;M;M;M", "homepage": "https://yufeiwang63.github.io/;;https://winniechen2002.github.io/;https://zswang666.github.io/;https://www.cs.cmu.edu/~katef/;https://zackory.com;http://davheld.github.io/;http://people.csail.mit.edu/ganchuang/;http://wangyian-me.github.io/", "dblp": ";258/5020;;217/1809.html;21/8780;;22/11147;139/6993;71/10046", "google_scholar": "HQl9718AAAAJ;;xuVkkKwAAAAJ;xE3WSuYAAAAJ;FWp7728AAAAJ;wElkTtIAAAAJ;0QtU-NsAAAAJ;PTeSCbIAAAAJ;dUf3wx4AAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;https://linkedin.com/in/\u67ab-\u9648-822809265;;;;;;", "or_profile": "~Yufei_Wang4;~Zhou_Xian1;~Feng_Chen16;~Tsun-Hsuan_Wang2;~Katerina_Fragkiadaki1;~Zackory_Erickson1;~David_Held1;~Chuang_Gan1;~\u9038\u5b89_\u738b1", "aff": "School of Computer Science, Carnegie Mellon University;Carnegie Mellon University;IIIS, Tsinghua University;Liquid AI;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;University of Massachusetts at Amherst;NVIDIA", "aff_domain": "cs.cmu.edu;cmu.edu;mails.tsinghua.edu.cn;liquid.ai;cmu.edu;cmu.edu;cmu.edu;umass.edu;nvidia.com", "position": "PhD student;PhD student;Undergrad student;Researcher;Associate Professor;Assistant Professor;Associate Professor;Assistant Professor;Intern", "bibtex": "@inproceedings{\nwang2024robogen,\ntitle={RoboGen: Towards Unleashing Infinite Data for Automated Robot Learning via Generative Simulation},\nauthor={Yufei Wang and Zhou Xian and Feng Chen and Tsun-Hsuan Wang and Yian Wang and Katerina Fragkiadaki and Zackory Erickson and David Held and Chuang Gan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SQIDlJd3hN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3250275, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13053780899187527920&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "cs.cmu.edu;cmu.edu;mails.tsinghua.edu.cn;liquid.ai;cmu.edu;cmu.edu;cmu.edu;umass.edu;nvidia.com", "author_num": 9, "aff_unique_index": "0;0;1;2;0;0;0;3;4", "aff_unique_norm": "Carnegie Mellon University;Tsinghua University;Liquid AI;University of Massachusetts Amherst;NVIDIA", "aff_unique_dep": "School of Computer Science;Institute for Interdisciplinary Information Sciences;;;NVIDIA Corporation", "aff_unique_url": 
"https://www.cmu.edu;https://www.tsinghua.edu.cn;;https://www.umass.edu;https://www.nvidia.com", "aff_unique_abbr": "CMU;THU;;UMass Amherst;NVIDIA", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Pittsburgh;;Amherst", "aff_country_unique_index": "0;0;1;2;0;0;0;0;0", "aff_country_unique": "United States;China;Unknown" }, { "title": "UniAudio: Towards Universal Audio Generation with Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34007", "id": "SRmZw7nEGW", "proceeding": "https://proceedings.mlr.press/v235/yang24x.html", "pdf": "https://openreview.net/pdf?id=SRmZw7nEGW", "openreview": "https://openreview.net/forum?id=SRmZw7nEGW", "author_site": "Dongchao Yang, Jinchuan Tian, Xu Tan, Rongjie Huang, Songxiang Liu, Haohan Guo, Xuankai Chang, Jiatong Shi, sheng zhao, Jiang Bian, Zhou Zhao, Xixin Wu, Helen M Meng", "tldr": "", "abstract": "Audio generation is a major branch of generative AI research. Compared with prior works in this area that are commonly task-specific with heavy domain knowledge, this paper advocates building universal audio generation models that can handle various tasks in a unified manner. As recent research on large language models (LLMs) has demonstrated their strong ability to handle multiple tasks, this work presents UniAudio, an LLM-based audio generation model that supports a wide range of audio generation tasks. Based on various input conditions, such as phoneme, text description, or audio itself, UniAudio can generate speech, sound, music, and singing voice. The proposed UniAudio is built with 100k hours of multi-source open-available audio data and is scaled to 1B parameters. The audio tokenization method and language model architecture are also specifically designed for both performance and efficiency. Experimentally, UniAuido supports 11 audio generation tasks and achieves competitive results on all tasks consistently. We also show that UniAudio can support new tasks seamlessly via simple fine-tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongchao Yang;Jinchuan Tian;Xu Tan;Rongjie Huang;Songxiang Liu;Haohan Guo;Xuankai Chang;Jiatong Shi;sheng zhao;Jiang Bian;Zhou Zhao;Xixin Wu;Helen M. 
Meng", "authorids": "~Dongchao_Yang1;~Jinchuan_Tian1;~Xu_Tan1;~Rongjie_Huang1;~Songxiang_Liu2;~Haohan_Guo1;~Xuankai_Chang1;~Jiatong_Shi1;~sheng_zhao1;~Jiang_Bian1;~Zhou_Zhao3;~Xixin_Wu1;~Helen_M._Meng1", "gender": "M;M;M;M;M;M;M;M;M;M;;F;M", "homepage": "http://dongchaoyang.top;;https://tan-xu.github.io/;;https://liusongxiang.github.io/;https://hhguo.github.io/;https://www.xuankaic.com;http://shijt.site;https://www.aaai.org/ojs/index.php/AAAI/article/view/4642;https://sites.google.com/view/jiangbian;https://www1.se.cuhk.edu.hk/~wuxx/;http://www.se.cuhk.edu.hk/people/academic-staff/prof-meng-mei-ling-helen/;https://dblp.uni-trier.de/pid/75/7785.html?", "dblp": ";;96/10484-3;212/8936-1;;;194/1149.html;229/3529.html;;09/851-2.html;125/2836;92/3270;75/7785", "google_scholar": "WNiojyAAAAAJ;https://scholar.google.com.hk/citations?user=KE5I4R0AAAAJ;tob-U1oAAAAJ;iRHBUsgAAAAJ;https://scholar.google.com.hk/citations?user=4fD1l28AAAAJ;;cIl2jpMAAAAJ;FEDNbgkAAAAJ;689bIIwAAAAJ;pZBEnY8AAAAJ;;;https://scholar.google.com.hk/citations?user=IIoFY90AAAAJ", "orcid": ";;0000-0001-5631-0639;;;;0000-0002-5221-5412;;;0000-0002-9472-600X;;;0000-0001-6121-0384", "linkedin": ";;;;;;;jiatong-shi-608b3016b/;;jbian/;;;", "or_profile": "~Dongchao_Yang1;~Jinchuan_Tian1;~Xu_Tan1;~Rongjie_Huang1;~Songxiang_Liu2;~Haohan_Guo1;~Xuankai_Chang1;~Jiatong_Shi1;~sheng_zhao1;~Jiang_Bian1;~Xixin_Wu1;~Helen_M._Meng1;~Zhou_Zhao2", "aff": "Chinese University of Hong Kong;Carnegie Mellon University;Microsoft;Zhejiang University;The Chinese University of Hong Kong;Chinese University of Hong Kong, The Chinese University of Hong Kong;Carnegie Mellon University;Carnegie Mellon University;Microsoft;Microsoft;The Chinese University of Hong Kong;The Chinese University of Hong Kong;Zhejiang University", "aff_domain": "cuhk.hk;andrew.cmu.edu;microsoft.com;zju.edu.cn;cuhk.edu.hk;se.cuhk.edu.hk;andrew.cmu.edu;andrew.cmu.edu;microsoft.com;microsoft.com;cuhk.edu.hk;cuhk.edu.hk;zju.edu.cn", "position": "PhD student;PhD student;Principal Researcher;MS student;Researcher;PhD student;PhD student;PhD student;Researcher;Partner Research Manager;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nyang2024uniaudio,\ntitle={UniAudio: Towards Universal Audio Generation with Large Language Models},\nauthor={Dongchao Yang and Jinchuan Tian and Xu Tan and Rongjie Huang and Songxiang Liu and Haohan Guo and Xuankai Chang and Jiatong Shi and sheng zhao and Jiang Bian and Zhou Zhao and Xixin Wu and Helen M. 
Meng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SRmZw7nEGW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1245872, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7553768456221728553&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "cuhk.hk;andrew.cmu.edu;microsoft.com;zju.edu.cn;cuhk.edu.hk;se.cuhk.edu.hk;andrew.cmu.edu;andrew.cmu.edu;microsoft.com;microsoft.com;cuhk.edu.hk;cuhk.edu.hk;zju.edu.cn", "author_num": 13, "aff_unique_index": "0;1;2;3;0;0;1;1;2;2;0;0;3", "aff_unique_norm": "Chinese University of Hong Kong;Carnegie Mellon University;Microsoft;Zhejiang University", "aff_unique_dep": ";;Microsoft Corporation;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.cmu.edu;https://www.microsoft.com;https://www.zju.edu.cn", "aff_unique_abbr": "CUHK;CMU;Microsoft;ZJU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0;0;0;1;1;1;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "PIDformer: Transformer Meets Control Theory", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34006", "id": "SRzb3QDjdV", "proceeding": "https://proceedings.mlr.press/v235/nguyen24i.html", "pdf": "https://openreview.net/pdf?id=SRzb3QDjdV", "openreview": "https://openreview.net/forum?id=SRzb3QDjdV", "author_site": "Tam Nguyen, Cesar Uribe, Tan Nguyen, Richard Baraniuk", "tldr": "", "abstract": "In this work, we address two main shortcomings of transformer architectures: input corruption and rank collapse in their output representation. We unveil self-attention as an autonomous state-space model that inherently promotes smoothness in its solutions, leading to lower-rank outputs and diminished representation capacity. Moreover, the steady-state solution of the model is sensitive to input perturbations. We incorporate a Proportional-Integral-Derivative (PID) closed-loop feedback control system with a reference point into the model to improve robustness and representation capacity. This integration aims to preserve high-frequency details while bolstering model stability, rendering it more noise-resilient. The resulting controlled state-space model is theoretically proven robust and adept at addressing the rank collapse. Motivated by this control framework, we derive a novel class of transformers, PID-controlled Transformer (PIDformer), aimed at improving robustness and mitigating the rank-collapse issue inherent in softmax transformers. 
We empirically evaluate the model's advantages and robustness against baseline transformers across various practical tasks, including object classification, image segmentation, and language modeling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tam Minh Nguyen;Cesar A Uribe;Tan Minh Nguyen;Richard Baraniuk", "authorids": "~Tam_Minh_Nguyen1;~Cesar_A_Uribe1;~Tan_Minh_Nguyen1;~Richard_Baraniuk1", "gender": "F;M;M;", "homepage": ";https://cauribe.rice.edu/;https://tanmnguyen89.github.io/;http://richb.rice.edu/", "dblp": "251/1464;143/6101;255/4725;32/2804", "google_scholar": ";b_uG-kEAAAAJ;OizOh88AAAAJ;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ", "orcid": ";0000-0002-7080-9724;;", "linkedin": "tam-nguyen-6a3935132/;cesarauribe/;;richard-baraniuk", "or_profile": "~Tam_Minh_Nguyen1;~Cesar_A_Uribe1;~Tan_Minh_Nguyen1;~Richard_Baraniuk1", "aff": "Rice University;Rice University;National University of Singapore;William Marsh Rice University", "aff_domain": "rice.edu;rice.edu;nus.edu.sg;rice.edu", "position": "PhD student;Assistant Professor;Assistant Professor;C. Sidney Burrus Professor", "bibtex": "@inproceedings{\nnguyen2024pidformer,\ntitle={{PID}former: Transformer Meets Control Theory},\nauthor={Tam Minh Nguyen and Cesar A Uribe and Tan Minh Nguyen and Richard Baraniuk},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SRzb3QDjdV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 578432, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1088937429672835840&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "rice.edu;rice.edu;nus.edu.sg;rice.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Rice University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.rice.edu;https://www.nus.edu.sg", "aff_unique_abbr": "Rice;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Singapore" }, { "title": "Adaptive Proximal Gradient Methods Are Universal Without Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34005", "id": "SUxarNgrUT", "proceeding": "https://proceedings.mlr.press/v235/oikonomidis24a.html", "pdf": "https://openreview.net/pdf?id=SUxarNgrUT", "openreview": "https://openreview.net/forum?id=SUxarNgrUT", "author_site": "Konstantinos Oikonomidis, Emanuel Laude, Puya Latafat, Andreas Themelis, Panagiotis Patrinos", "tldr": "", "abstract": "We show that adaptive proximal gradient methods for convex problems are not restricted to traditional Lipschitzian assumptions. Our analysis reveals that a class of linesearch-free methods is still convergent under mere local H\u00f6lder gradient continuity, covering in particular continuously differentiable semi-algebraic functions. To mitigate the lack of local Lipschitz continuity, popular approaches revolve around $\varepsilon$-oracles and/or linesearch procedures. In contrast, we exploit plain H\u00f6lder inequalities not entailing any approximation, all while retaining the linesearch-free nature of adaptive schemes. Furthermore, we prove full sequence convergence without prior knowledge of local H\u00f6lder constants nor of the order of H\u00f6lder continuity. 
Numerical experiments compare against baseline methods on diverse machine learning tasks, covering both the locally and the globally H\u00f6lder settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Konstantinos Oikonomidis;Emanuel Laude;Puya Latafat;Andreas Themelis;Panagiotis Patrinos", "authorids": "~Konstantinos_Oikonomidis1;~Emanuel_Laude1;~Puya_Latafat1;themelis.andreas.822@m.kyushu-u.ac.jp;~Panagiotis_Patrinos1", "gender": "M;M;M;;M", "homepage": ";;https://github.com/pylat;;https://homes.esat.kuleuven.be/~ppatrino/index.html", "dblp": "360/1105.html;173/5225;;;55/896", "google_scholar": "https://scholar.google.gr/citations?user=UqjW5AIAAAAJ;;TaTK05QAAAAJ;;Qiwt2t8AAAAJ", "orcid": "0000-0002-6828-1385;;0000-0002-7969-8565;;0000-0003-4824-7697", "linkedin": ";;;;", "or_profile": "~Konstantinos_Oikonomidis1;~Emanuel_Laude1;~Puya_Latafat1;themelis.andreas.822@m.kyushu-u.ac.jp;~Panagiotis_Patrinos1", "aff": "KU Leuven;Department of Electrical Engineering, KU Leuven, Belgium, KU Leuven;KU Leuven;;", "aff_domain": "kuleuven.be;esat.kuleuven.be;kuleuven.be;;", "position": "PhD student;Postdoc;Postdoc;;", "bibtex": "@inproceedings{\noikonomidis2024adaptive,\ntitle={Adaptive Proximal Gradient Methods Are Universal Without Approximation},\nauthor={Konstantinos Oikonomidis and Emanuel Laude and Puya Latafat and Andreas Themelis and Panagiotis Patrinos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SUxarNgrUT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1840034, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6434371544025311848&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "email": "kuleuven.be;esat.kuleuven.be;kuleuven.be;;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Katholieke Universiteit Leuven;KU Leuven", "aff_unique_dep": ";Department of Electrical Engineering", "aff_unique_url": "https://www.kuleuven.be;https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Belgium" }, { "title": "SiT: Symmetry-invariant Transformers for Generalisation in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34004", "id": "SWrwurHAeq", "proceeding": "https://proceedings.mlr.press/v235/weissenbacher24a.html", "pdf": "https://openreview.net/pdf?id=SWrwurHAeq", "openreview": "https://openreview.net/forum?id=SWrwurHAeq", "author_site": "Matthias Weissenbacher, Rishabh Agarwal, Yoshinobu Kawahara", "tldr": "", "abstract": "An open challenge in reinforcement learning (RL) is the effective deployment of a trained policy to new or slightly different situations as well as semantically similar environments. We introduce **S**ymmetry-**I**nvariant **T**ransformer (**SiT**), a scalable vision transformer (ViT) that leverages both local and global data patterns in a self-supervised manner to improve generalisation. Central to our approach is Graph Symmetric Attention, which refines the traditional self-attention mechanism to preserve graph symmetries, resulting in invariant and equivariant latent representations. 
We showcase SiT's superior generalization over ViTs on MiniGrid and Procgen RL benchmarks, and its sample efficiency on Atari 100k and CIFAR10.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthias Weissenbacher;Rishabh Agarwal;Yoshinobu Kawahara", "authorids": "~Matthias_Weissenbacher1;~Rishabh_Agarwal2;~Yoshinobu_Kawahara1", "gender": "M;M;M", "homepage": ";https://agarwl.github.io;https://mls.ist.osaka-u.ac.jp/en/~kawahara/", "dblp": ";;09/4700", "google_scholar": "xwo5JWgAAAAJ;https://scholar.google.ca/citations?user=aH8AJu4AAAAJ;B8sRETUAAAAJ", "orcid": ";;0000-0001-7789-4709", "linkedin": "matthias-weissenbacher-8bb850107/;;", "or_profile": "~Matthias_Weissenbacher1;~Rishabh_Agarwal2;~Yoshinobu_Kawahara1", "aff": ";Google DeepMind;RIKEN", "aff_domain": ";google.com;riken.jp", "position": ";Research Scientist;Team Director", "bibtex": "@inproceedings{\nweissenbacher2024sit,\ntitle={SiT: Symmetry-invariant Transformers for Generalisation in Reinforcement Learning},\nauthor={Matthias Weissenbacher and Rishabh Agarwal and Yoshinobu Kawahara},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SWrwurHAeq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3404044, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Mlq3of4JqigJ:scholar.google.com/&scioq=SiT:+Symmetry-invariant+Transformers+for+Generalisation+in+Reinforcement+Learning&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": ";google.com;riken.jp", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Google;RIKEN", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.riken.jp", "aff_unique_abbr": "DeepMind;RIKEN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Japan" }, { "title": "CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34003", "id": "SXVn5IFsrs", "proceeding": "https://proceedings.mlr.press/v235/butt24a.html", "pdf": "https://openreview.net/pdf?id=SXVn5IFsrs", "openreview": "https://openreview.net/forum?id=SXVn5IFsrs", "author_site": "Natasha Butt, Blazej Manczak, Auke Wiggers, Corrado Rainone, David Zhang, Micha\u00ebl Defferrard, Taco Cohen", "tldr": "", "abstract": "Large language models are increasingly solving tasks that are commonly believed to require human-level reasoning ability. However, these models still perform very poorly on benchmarks of general intelligence such as the Abstraction and Reasoning Corpus (ARC). In this paper, we approach the ARC as a programming-by-examples problem, and introduce a novel and scalable method for language model self-improvement called Code Iteration (CodeIt). Our method iterates between 1) program sampling and hindsight relabeling, and 2) learning from prioritized experience replay. By relabeling the goal of an episode (i.e., the program output given input) to the output actually produced by the sampled program, our method effectively deals with the extreme sparsity of rewards in program synthesis. Applying CodeIt to the ARC dataset, we demonstrate that prioritized hindsight replay, along with pre-training and data-augmentation, leads to successful inter-task generalization. 
CodeIt is the first neuro-symbolic approach that scales to the full ARC evaluation dataset. Our method solves 15% of ARC evaluation tasks, achieving state-of-the-art performance and outperforming existing neural and symbolic baselines. Our code is available at https://github.com/Qualcomm-AI-research/codeit.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Natasha Butt;Blazej Manczak;Auke Wiggers;Corrado Rainone;David W. Zhang;Micha\u00ebl Defferrard;Taco Cohen", "authorids": "~Natasha_Butt1;~Blazej_Manczak1;~Auke_Wiggers1;~Corrado_Rainone1;~David_W._Zhang1;~Micha\u00ebl_Defferrard1;~Taco_Cohen1", "gender": "F;M;M;M;M;M;M", "homepage": "https://ivi.fnwi.uva.nl/quva/people.html;;;https://deff.ch;http://www.ta.co.nl;https://aukejw.github.io/;https://davzha.netlify.app/", "dblp": ";;304/2695;182/2568.html;142/2903;182/2485;119/0960", "google_scholar": ";;ss6yaikAAAAJ;https://scholar.google.ch/citations?user=Ztj2-gUAAAAJ;a3q4YxEAAAAJ;https://scholar.google.nl/citations?user=rrwwB4cAAAAJ;https://scholar.google.nl/citations?user=MG3oLzUAAAAJ", "orcid": ";;0000-0003-0381-7254;0000-0002-6028-9024;;;0000-0002-2137-1738", "linkedin": ";blaise-blazej-manczak-10769b150/;corrado-rainone-2361641a1/;https://linkedin.com/in/mdeff;;;david-zhang-1b86b314a", "or_profile": "~Natasha_Butt1;~Blazej_Manczak1;~Corrado_Rainone1;~Micha\u00ebl_Defferrard1;~Taco_Cohen1;~Auke_J._Wiggers1;~David_W_Zhang1", "aff": "University of Amsterdam;Qualcomm Inc, QualComm;Qualcomm Inc, QualComm;Qualcomm;Meta;QualComm;Qualcomm Inc, QualComm", "aff_domain": "uva.nl;qti.qualcomm.com;qti.qualcomm.com;qualcomm.com;meta.com;qualcomm.com;qti.qualcomm.com", "position": "PhD student;Researcher;Researcher;Researcher;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nbutt2024codeit,\ntitle={CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay},\nauthor={Natasha Butt and Blazej Manczak and Auke Wiggers and Corrado Rainone and David W. 
Zhang and Micha{\\\"e}l Defferrard and Taco Cohen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SXVn5IFsrs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1304582, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18358320256362818385&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "uva.nl;qti.qualcomm.com;qti.qualcomm.com;qualcomm.com;meta.com;qualcomm.com;qti.qualcomm.com", "author_num": 7, "aff_unique_index": "0;1;1;1;2;1;1", "aff_unique_norm": "University of Amsterdam;Qualcomm Incorporated;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.uva.nl;https://www.qualcomm.com;https://meta.com", "aff_unique_abbr": "UvA;Qualcomm;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1;1", "aff_country_unique": "Netherlands;United States" }, { "title": "An Explicit Frame Construction for Normalizing 3D Point Clouds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34002", "id": "SZ0JnRxi0x", "proceeding": "https://proceedings.mlr.press/v235/baker24a.html", "pdf": "https://openreview.net/pdf?id=SZ0JnRxi0x", "openreview": "https://openreview.net/forum?id=SZ0JnRxi0x", "author_site": "Justin Baker, Shih-Hsin Wang, Tommaso de Fernex, Bao Wang", "tldr": "", "abstract": "Many real-world datasets are represented as 3D point clouds -- yet they often lack a predefined reference frame, posing a challenge for machine learning or general data analysis. Traditional methods for determining reference frames and normalizing 3D point clouds often struggle with specific inputs, lack theoretical guarantees, or require massive data. We introduce a new algorithm that overcomes these limitations and guarantees both universality and compatibility with any learnable framework for 3D point cloud analysis. Our algorithm works with any input point cloud and performs consistently regardless of input complexities, unlike data-driven methods that are susceptible to biases or limited training data. Empirically, our algorithm outperforms existing methods in effectiveness and generalizability across diverse benchmark datasets. 
Code is available at https://github.com/Utah-Math-Data-Science/alignment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Justin Baker;Shih-Hsin Wang;Tommaso de Fernex;Bao Wang", "authorids": "~Justin_Baker1;~Shih-Hsin_Wang1;~Tommaso_de_Fernex1;~Bao_Wang1", "gender": ";M;;M", "homepage": ";;https://www.math.utah.edu/~defernex/;https://www.math.utah.edu/~bwang/index.html", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";shih-hsin-sam-wang-9803671a5/;;", "or_profile": "~Justin_Baker1;~Shih-Hsin_Wang1;~Tommaso_de_Fernex1;~Bao_Wang1", "aff": ";University of Utah;University of Utah;University of Utah", "aff_domain": ";utah.edu;utah.edu;utah.edu", "position": ";PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nbaker2024an,\ntitle={An Explicit Frame Construction for Normalizing 3D Point Clouds},\nauthor={Justin Baker and Shih-Hsin Wang and Tommaso de Fernex and Bao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SZ0JnRxi0x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2853399, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17675315872373916324&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 6, "email": ";utah.edu;utah.edu;utah.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "Utah", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Representation Surgery for Multi-Task Model Merging", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34001", "id": "Sbl2keQEML", "proceeding": "https://proceedings.mlr.press/v235/yang24t.html", "pdf": "https://openreview.net/pdf?id=Sbl2keQEML", "openreview": "https://openreview.net/forum?id=Sbl2keQEML", "author_site": "Enneng Yang, Li Shen, Zhenyi Wang, Guibing Guo, Xiaojun Chen, Xingwei Wang, Dacheng Tao", "tldr": "", "abstract": "Multi-task learning (MTL) compresses the information from multiple tasks into a unified backbone to improve computational efficiency and generalization. Recent work directly merges multiple independently trained models to perform MTL instead of collecting their raw data for joint training, greatly expanding the application scenarios of MTL. However, by visualizing the representation distribution of existing model merging schemes, we find that the merged model often suffers from the dilemma of representation bias. That is, there is a significant discrepancy in the representation distribution between the merged and individual models, resulting in poor performance of merged MTL. In this paper, we propose a representation surgery solution called \"Surgery\" to reduce representation bias in the merged model. Specifically, Surgery is a lightweight task-specific plugin that takes the representation of the merged model as input and attempts to output the biases contained in the representation from the merged model. We then design an unsupervised optimization objective that updates the Surgery plugin by minimizing the distance between the merged model's representation and the individual model's representation. 
Extensive experiments demonstrate significant MTL performance improvements when our Surgery plugin is applied to state-of-the-art (SOTA) model merging schemes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Enneng Yang;Li Shen;Zhenyi Wang;Guibing Guo;Xiaojun Chen;Xingwei Wang;Dacheng Tao", "authorids": "~Enneng_Yang1;~Li_Shen1;~Zhenyi_Wang1;~Guibing_Guo2;~Xiaojun_Chen4;~Xingwei_Wang3;~Dacheng_Tao1", "gender": "M;M;;M;M;M;", "homepage": ";https://sites.google.com/site/mathshenli/home;;https://guoguibing.github.io/cn/;https://bruce-xjchen.github.io/HomePage/;https://www.neu.edu.cn/info/1012/3221.htm;", "dblp": "246/2889;91/3680-8;;84/10716;20/3215-6;99/4694-1;", "google_scholar": ";yVhgENIAAAAJ;;YMXJa2EAAAAJ;yAjyrwkAAAAJ;;", "orcid": "0000-0001-5419-5286;;;;0000-0002-2818-4652;0000-0003-2856-4716;", "linkedin": ";;;;;;", "or_profile": "~Enneng_Yang1;~Li_Shen1;~Zhenyi_Wang1;~Guibing_Guo2;~Xiaojun_Chen4;~Xingwei_Wang3;~Dacheng_Tao1", "aff": "Northeastern University;JD Explore Academy;;Northeastern University;Shenzhen University;Northeastern University;", "aff_domain": "neu.edu.cn;jd.com;;neu.edu.cn;szu.edu.cn;neu.edu;", "position": "PhD student;Researcher;;Full Professor;Full Professor;Full Professor;", "bibtex": "@inproceedings{\nyang2024representation,\ntitle={Representation Surgery for Multi-Task Model Merging},\nauthor={Enneng Yang and Li Shen and Zhenyi Wang and Guibing Guo and Xiaojun Chen and Xingwei Wang and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Sbl2keQEML}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3084006, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6453660338776961868&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "neu.edu.cn;jd.com;;neu.edu.cn;szu.edu.cn;neu.edu;", "author_num": 7, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Northeastern University;JD;Shenzhen University", "aff_unique_dep": ";JD Explore Academy;", "aff_unique_url": "https://www.northeastern.edu;;https://www.szu.edu.cn", "aff_unique_abbr": "NEU;;SZU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2;0", "aff_country_unique": "United States;;China" }, { "title": "Assessing Large Language Models on Climate Information", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/34000", "id": "ScIHQoTUjT", "proceeding": "https://proceedings.mlr.press/v235/bulian24a.html", "pdf": "https://openreview.net/pdf?id=ScIHQoTUjT", "openreview": "https://openreview.net/forum?id=ScIHQoTUjT", "author_site": "Jannis Bulian, Mike Sch\u00e4fer, Afra Amini, Heidi Lam, Massimiliano Ciaramita, Ben Gaiarin, Michelle Chen Huebscher, Christian Buck, Niels Mede, Markus Leippold, Nadine Strauss", "tldr": "", "abstract": "As Large Language Models (LLMs) rise in popularity, it is necessary to assess their capability in critically relevant domains. We present a comprehensive evaluation framework, grounded in science communication research, to assess LLM responses to questions about climate change. Our framework emphasizes both presentational and epistemological adequacy, offering a fine-grained analysis of LLM generations spanning 8 dimensions and 30 issues. Our evaluation task is a real-world example of a growing number of challenging problems where AI can complement and lift human performance. 
We introduce a novel protocol for scalable oversight that relies on AI Assistance and raters with relevant education. We evaluate several recent LLMs on a set of diverse climate questions. Our results point to a significant gap between surface and epistemological qualities of LLMs in the realm of climate communication.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jannis Bulian;Mike S. Sch\u00e4fer;Afra Amini;Heidi Lam;Massimiliano Ciaramita;Ben Gaiarin;Michelle Chen Huebscher;Christian Buck;Niels G. Mede;Markus Leippold;Nadine Strauss", "authorids": "~Jannis_Bulian1;~Mike_S._Sch\u00e4fer1;~Afra_Amini1;~Heidi_Lam1;~Massimiliano_Ciaramita2;~Ben_Gaiarin1;~Michelle_Chen_Huebscher1;~Christian_Buck1;~Niels_G._Mede1;~Markus_Leippold1;~Nadine_Strauss1", "gender": "M;M;F;;;M;F;M;M;M;F", "homepage": "http://bulian.org/;https://www.ikmz.uzh.ch/en/research/divisions/science-crisis-and-risk-communication/team/mike-s-schaefer.html;;http://google.com;;;https://arxiv.org/search/cs?searchtype=author&query=Huebscher%2C+M+C;;https://www.nielsmede.com/;https://www.bf.uzh.ch/de/persons/leippold-markus;https://www.nadinestrauss.com", "dblp": "09/10967;;270/4959;;31/916;;;;;;", "google_scholar": "https://scholar.google.co.uk/citations?user=Yq32OuIAAAAJ;;;;;btkFBuQAAAAJ;;DSb_wQ8AAAAJ;wzJ53tgAAAAJ;-Ta9boQAAAAJ;https://scholar.google.com/citations?hl=de", "orcid": "0000-0002-2908-4485;;;;;;;;0000-0001-5707-7568;;0000-0002-5050-7067", "linkedin": "jbulian/;;afraamini;;;bgaiarin/;;;ngmede/;markus-leippold-578bb95/;nadine-strau\u00df-bb129572/", "or_profile": "~Jannis_Bulian1;~Mike_S._Sch\u00e4fer1;~Afra_Amini1;~Heidi_Lam1;~Massimiliano_Ciaramita2;~Ben_Gaiarin1;~Michelle_Chen_Huebscher1;~Christian_Buck1;~Niels_G._Mede1;~Markus_Leippold1;~Nadine_Strauss1", "aff": "Google DeepMind;University of Zurich;ETHZ - ETH Zurich;;Google;Research, Google;;Google;University of Zurich;University of Zurich;University of Zurich", "aff_domain": "google.com;uzh.ch;ethz.ch;;google.com;research.google.com;;google.com;uzh.ch;uzh.ch;uzh.ch", "position": "Researcher;Full Professor;PhD student;;Research Scientist;Technical Program Manager;;Researcher;Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nbulian2024assessing,\ntitle={Assessing Large Language Models on Climate Information},\nauthor={Jannis Bulian and Mike S. Sch{\\\"a}fer and Afra Amini and Heidi Lam and Massimiliano Ciaramita and Ben Gaiarin and Michelle Chen Huebscher and Christian Buck and Niels G. 
Mede and Markus Leippold and Nadine Strauss},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ScIHQoTUjT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2947723, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9869895816938654350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "email": "google.com;uzh.ch;ethz.ch;;google.com;research.google.com;;google.com;uzh.ch;uzh.ch;uzh.ch", "author_num": 11, "aff_unique_index": "0;1;2;0;0;0;1;1;1", "aff_unique_norm": "Google;University of Zurich;ETH Zurich", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.unizh.ch;https://www.ethz.ch", "aff_unique_abbr": "DeepMind;UZH;ETHZ", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;2;2;2;1;1;1", "aff_country_unique": "United Kingdom;Switzerland;United States" }, { "title": "A Fresh Take on Stale Embeddings: Improving Dense Retriever Training with Corrector Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33999", "id": "ScRhEuj480", "proceeding": "https://proceedings.mlr.press/v235/monath24a.html", "pdf": "https://openreview.net/pdf?id=ScRhEuj480", "openreview": "https://openreview.net/forum?id=ScRhEuj480", "author_site": "Nicholas Monath, Will Grathwohl, Michael Boratko, Rob Fergus, Andrew McCallum, Manzil Zaheer", "tldr": "", "abstract": "In dense retrieval, deep encoders provide embeddings for both inputs and targets, and the softmax function is used to parameterize a distribution over a large number of candidate targets (e.g., textual passages for information retrieval). Significant challenges arise in training such encoders in the increasingly prevalent scenario of (1) a large number of targets, (2) a computationally expensive target encoder model, (3) cached target embeddings that are out-of-date due to ongoing training of target encoder parameters. This paper presents a simple and highly scalable response to these challenges by training a small parametric _corrector network_ that adjusts stale cached target embeddings, enabling an accurate softmax approximation and thereby sampling of up-to-date high scoring \"hard negatives.\" We theoretically investigate the generalization properties of our proposed target corrector, relating the complexity of the network, staleness of cached representations, and the amount of training data. We present experimental results on large benchmark dense retrieval datasets as well as on QA with retrieval augmented language models. 
Our approach matches state-of-the-art results even when no target embedding updates are made during training beyond an initial cache from the unsupervised pre-trained model, providing a 4-80x reduction in re-embedding computational cost.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicholas Monath;Will Sussman Grathwohl;Michael Boratko;Rob Fergus;Andrew McCallum;Manzil Zaheer", "authorids": "~Nicholas_Monath1;~Will_Sussman_Grathwohl2;~Michael_Boratko1;~Rob_Fergus1;~Andrew_McCallum1;~Manzil_Zaheer1", "gender": "M;M;M;M;M;M", "homepage": "https://nmonath.github.io/;http://www.cs.toronto.edu/~wgrathwohl/;https://people.cs.umass.edu/~mboratko/;http://cs.nyu.edu/fergus/;http://www.cs.umass.edu/~mccallum;https://www.aclweb.org/anthology/people/m/manzil-zaheer/", "dblp": "131/4309;192/1565;222/1939;77/3763;m/AndrewMcCallum;40/10701", "google_scholar": "PTfhfCQAAAAJ;;YKZGpnkAAAAJ;https://scholar.google.com.tw/citations?user=GgQ9GEkAAAAJ;yILa1y0AAAAJ;A33FhJMAAAAJ", "orcid": "0000-0002-5135-2423;;;;0009-0004-5487-2848;", "linkedin": "nicholas-monath-8627581aa/;will-grathwohl-b44a383b/;michaelboratko/;;andrew-mccallum-a412;", "or_profile": "~Nicholas_Monath1;~Will_Sussman_Grathwohl2;~Michael_Boratko1;~Rob_Fergus1;~Andrew_McCallum1;~Manzil_Zaheer1", "aff": "Google;Google DeepMind;Google;Google;University of Massachusetts Amherst;Google DeepMind", "aff_domain": "google.com;deepmind.com;google.com;google.com;cs.umass.edu;deepmind.com", "position": "Researcher;Senior Research Scientist;Researcher;Research scientist;Distinguished Professor;Researcher", "bibtex": "@inproceedings{\nmonath2024a,\ntitle={A Fresh Take on Stale Embeddings: Improving Dense Retriever Training with Corrector Networks},\nauthor={Nicholas Monath and Will Sussman Grathwohl and Michael Boratko and Rob Fergus and Andrew McCallum and Manzil Zaheer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ScRhEuj480}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 662367, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1WKup7HbbFEJ:scholar.google.com/&scioq=A+Fresh+Take+on+Stale+Embeddings:+Improving+Dense+Retriever+Training+with+Corrector+Networks&hl=en&as_sdt=0,23", "gs_version_total": 8, "email": "google.com;deepmind.com;google.com;google.com;cs.umass.edu;deepmind.com", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Google;University of Massachusetts Amherst", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.umass.edu", "aff_unique_abbr": "Google;UMass Amherst", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Mountain View;;Amherst", "aff_country_unique_index": "0;1;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Reflected Flow Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33998", "id": "Sf5KYznS2G", "proceeding": "https://proceedings.mlr.press/v235/xie24k.html", "pdf": "https://openreview.net/pdf?id=Sf5KYznS2G", "openreview": "https://openreview.net/forum?id=Sf5KYznS2G", "author_site": "Tianyu Xie, Yu Zhu, Longlin Yu, Tong Yang, Ziheng Cheng, Shiyue Zhang, Xiangyu Zhang, Cheng Zhang", "tldr": "", "abstract": "Continuous normalizing flows (CNFs) learn an ordinary differential equation to transform prior samples into data. 
Flow matching (FM) has recently emerged as a simulation-free approach for training CNFs by regressing a velocity model towards the conditional velocity field. However, on constrained domains, the learned velocity model may lead to undesirable flows that result in highly unnatural samples, e.g., oversaturated images, due to both flow matching error and simulation error. To address this, we add a boundary constraint term to CNFs, which leads to reflected CNFs that keep trajectories within the constrained domains. We propose reflected flow matching (RFM) to train the velocity model in reflected CNFs by matching the conditional velocity fields in a simulation-free manner, similar to the vanilla FM. Moreover, the analytical form of conditional velocity fields in RFM avoids potentially biased approximations, making it superior to existing score-based generative models on constrained domains. We demonstrate that RFM achieves comparable or better results on standard image benchmarks and produces high-quality class-conditioned samples under high guidance weight.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianyu Xie;Yu Zhu;Longlin Yu;Tong Yang;Ziheng Cheng;Shiyue Zhang;Xiangyu Zhang;Cheng Zhang", "authorids": "~Tianyu_Xie1;~Yu_Zhu13;~Longlin_Yu1;~Tong_Yang2;~Ziheng_Cheng4;~Shiyue_Zhang3;~Xiangyu_Zhang1;~Cheng_Zhang3", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://tyuxie.github.io;https://github.com/zhuyu-cs;https://github.com/longinYu;;https://alexczh1.github.io/;https://github.com/ShiyueZhang66;;https://zcrabbit.github.io", "dblp": "345/3987-1;;;;;;95/3760-5.html;", "google_scholar": "qbJJQ_AAAAAJ;;;yu7ijD0AAAAJ;M8Hz2NSNe3QC;nu6YfFkAAAAJ;yuB-cfoAAAAJ;PddDrLgAAAAJ", "orcid": ";0000-0001-8836-7939;;;;;0000-0003-2138-4608;", "linkedin": ";;;;;;;", "or_profile": "~Tianyu_Xie1;~Yu_Zhu13;~Longlin_Yu1;~Tong_Yang2;~Ziheng_Cheng4;~Shiyue_Zhang3;~Xiangyu_Zhang1;~Cheng_Zhang3", "aff": "Peking University;Institute of automation, Chinese academy of science;Peking University;Megvii Technology Inc.;Peking University;Peking University;MEGVII Technology;Peking University", "aff_domain": "pku.edu.cn;ia.ac.cn;pku.edu.cn;megvii.com;pku.edu.cn;pku.edu.cn;megvii.com;pku.edu.cn", "position": "PhD student;PhD student;PhD student;Researcher;Undergrad student;Undergrad student;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nxie2024reflected,\ntitle={Reflected Flow Matching},\nauthor={Tianyu Xie and Yu Zhu and Longlin Yu and Tong Yang and Ziheng Cheng and Shiyue Zhang and Xiangyu Zhang and Cheng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Sf5KYznS2G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8294183, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8104171696739423636&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;ia.ac.cn;pku.edu.cn;megvii.com;pku.edu.cn;pku.edu.cn;megvii.com;pku.edu.cn", "author_num": 8, "aff_unique_index": "0;1;0;2;0;0;2;0", "aff_unique_norm": "Peking University;Chinese Academy of Sciences;Megvii Technology", "aff_unique_dep": ";Institute of Automation;", "aff_unique_url": "http://www.pku.edu.cn;http://www.ia.cas.cn;https://www.megvii.com", "aff_unique_abbr": "Peking U;CAS;Megvii", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" 
}, { "title": "A Theoretical Analysis of Backdoor Poisoning Attacks in Convolutional Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33997", "id": "SfcB4cVvPz", "proceeding": "https://proceedings.mlr.press/v235/li24at.html", "pdf": "https://openreview.net/pdf?id=SfcB4cVvPz", "openreview": "https://openreview.net/forum?id=SfcB4cVvPz", "author_site": "Boqi Li, Weiwei Liu", "tldr": "", "abstract": "The rising threat of backdoor poisoning attacks (BPAs) on Deep Neural Networks (DNNs) has become a significant concern in recent years. In such attacks, the adversaries strategically target a specific class and generate a poisoned training set. The neural network (NN), well-trained on the poisoned training set, is able to predict any input with the trigger pattern as the targeted label, while maintaining accurate outputs for clean inputs. However, why the BPAs work remains less explored. To fill this gap, we employ a dirty-label attack and conduct a detailed analysis of BPAs in a two-layer convolutional neural network. We provide theoretical insights and results on the effectiveness of BPAs. Our experimental results on two real-world datasets validate our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boqi Li;Weiwei Liu", "authorids": "~Boqi_Li2;~Weiwei_Liu1", "gender": "M;M", "homepage": ";https://sites.google.com/site/weiweiliuhomepage/", "dblp": "149/1037-2.html;54/6677-3.html", "google_scholar": "rYtY_mwAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";", "linkedin": ";weiwei-liu-4a7849134/", "or_profile": "~Boqi_Li2;~Weiwei_Liu1", "aff": "Wuhan University;Wuhan University", "aff_domain": "whu.edu.cn;whu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nli2024a,\ntitle={A Theoretical Analysis of Backdoor Poisoning Attacks in Convolutional Neural Networks},\nauthor={Boqi Li and Weiwei Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SfcB4cVvPz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 832644, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17635775104408155886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "whu.edu.cn;whu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "OTMatch: Improving Semi-Supervised Learning with Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33996", "id": "ShkKSDrfG6", "proceeding": "https://proceedings.mlr.press/v235/tan24f.html", "pdf": "https://openreview.net/pdf?id=ShkKSDrfG6", "openreview": "https://openreview.net/forum?id=ShkKSDrfG6", "author_site": "Zhiquan Tan, Kaipeng Zheng, Weiran Huang", "tldr": "", "abstract": "Semi-supervised learning has made remarkable strides by effectively utilizing a limited amount of labeled data while capitalizing on the abundant information present in unlabeled data. 
However, current algorithms often prioritize aligning image predictions with specific classes generated through self-training techniques, thereby neglecting the inherent relationships that exist within these classes. In this paper, we present a new approach called OTMatch, which leverages semantic relationships among classes by employing an optimal transport loss function to match distributions. We conduct experiments on many standard vision and language datasets. The empirical results show improvements of our method over the baseline, demonstrating the effectiveness and superiority of our approach in harnessing semantic relationships to enhance learning performance in a semi-supervised setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiquan Tan;Kaipeng Zheng;Weiran Huang", "authorids": "~Zhiquan_Tan1;~Kaipeng_Zheng1;~Weiran_Huang1", "gender": "M;M;M", "homepage": ";https://github.com/uiiuiiuii;https://www.weiranhuang.com", "dblp": "326/0177;330/7352;170/0073-1", "google_scholar": ";;AjJ2rf8AAAAJ", "orcid": ";;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAAC1A8_QBFX8OlchWmVI_pNXN4zm_t6vPKCs;;", "or_profile": "~Zhiquan_Tan1;~Kaipeng_Zheng1;~Weiran_Huang1", "aff": "Tsinghua University;Shanghai Artificial Intelligence Laboratory;Shanghai AI Laboratory", "aff_domain": "tsinghua.edu.cn;pjlab.org.cn;pjlab.org.cn", "position": "PhD student;Intern;Consultant", "bibtex": "@inproceedings{\ntan2024otmatch,\ntitle={{OTM}atch: Improving Semi-Supervised Learning with Optimal Transport},\nauthor={Zhiquan Tan and Kaipeng Zheng and Weiran Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ShkKSDrfG6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 533141, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1650358935855985376&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;pjlab.org.cn;pjlab.org.cn", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Tsinghua University;Shanghai Artificial Intelligence Laboratory;Shanghai AI Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.shailab.org/;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "THU;Shanghai AI Lab;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "GATE: How to Keep Out Intrusive Neighbors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33995", "id": "Sjv5RcqfuH", "proceeding": "https://proceedings.mlr.press/v235/mustafa24a.html", "pdf": "https://openreview.net/pdf?id=Sjv5RcqfuH", "openreview": "https://openreview.net/forum?id=Sjv5RcqfuH", "author_site": "Nimrah Mustafa, Rebekka Burkholz", "tldr": "", "abstract": "Graph Attention Networks (GATs) are designed to provide flexible neighborhood aggregation that assigns weights to neighbors according to their importance. In practice, however, GATs are often unable to switch off task-irrelevant neighborhood aggregation, as we show experimentally and analytically. To address this challenge, we propose GATE, a GAT extension that holds three major advantages: i) It alleviates over-smoothing by addressing its root cause of unnecessary neighborhood aggregation. 
ii) Similarly to perceptrons, it benefits from higher depth as it can still utilize additional layers for (non-)linear feature transformations in case of (nearly) switched-off neighborhood aggregation. iii) By down-weighting connections to unrelated neighbors, it often outperforms GATs on real-world heterophilic datasets. To further validate our claims, we construct a synthetic test bed to analyze a model's ability to utilize the appropriate amount of neighborhood aggregation, which could be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nimrah Mustafa;Rebekka Burkholz", "authorids": "~Nimrah_Mustafa1;~Rebekka_Burkholz1", "gender": "F;F", "homepage": "https://cispa.de/en/people/c01nimu;https://sites.google.com/view/rebekkaburkholz/startseite", "dblp": ";194/3172", "google_scholar": ";https://scholar.google.ch/citations?user=vkWBb2wAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Nimrah_Mustafa1;~Rebekka_Burkholz1", "aff": "CISPA, saarland university, saarland informatics campus;Helmholtz Center CISPA for Information Security", "aff_domain": "cispa.saarland;cispa.saarland", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nmustafa2024gate,\ntitle={{GATE}: How to Keep Out Intrusive Neighbors},\nauthor={Nimrah Mustafa and Rebekka Burkholz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Sjv5RcqfuH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2461462, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14484222735322731887&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "cispa.saarland;cispa.saarland", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Saarland University;Helmholtz Center CISPA", "aff_unique_dep": "CISPA;Information Security", "aff_unique_url": "https://www.uni-saarland.de;https://www.cispa.de/", "aff_unique_abbr": "Saarland U;CISPA", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Efficient and Effective Time-Series Forecasting with Spiking Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33994", "id": "SkI6u81AkI", "proceeding": "https://proceedings.mlr.press/v235/lv24d.html", "pdf": "https://openreview.net/pdf?id=SkI6u81AkI", "openreview": "https://openreview.net/forum?id=SkI6u81AkI", "author_site": "Changze Lv, Yansen Wang, Dongqi Han, Xiaoqing Zheng, Xuanjing Huang, Dongsheng Li", "tldr": "", "abstract": "Spiking neural networks (SNNs), inspired by the spiking behavior of biological neurons, provide a unique pathway for capturing the intricacies of temporal data. However, applying SNNs to time-series forecasting is challenging due to difficulties in effective temporal alignment, complexities in encoding processes, and the absence of standardized guidelines for model selection. In this paper, we propose a framework for SNNs in time-series forecasting tasks, leveraging the efficiency of spiking neurons in processing temporal information. Through a series of experiments, we demonstrate that our proposed SNN-based approaches achieve comparable or superior results to traditional time-series forecasting methods on diverse benchmarks with much less energy consumption. 
Furthermore, we conduct detailed analysis experiments to assess the SNN's capacity to capture temporal dependencies within time-series data, offering valuable insights into its nuanced strengths and effectiveness in modeling the intricate dynamics of temporal data. Our study contributes to the expanding field of SNNs and offers a promising alternative for time-series forecasting tasks, presenting a pathway for the development of more biologically inspired and temporally aware forecasting models. Our code is available at https://github.com/microsoft/SeqSNN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Changze Lv;Yansen Wang;Dongqi Han;Xiaoqing Zheng;Xuanjing Huang;Dongsheng Li", "authorids": "~Changze_Lv1;~Yansen_Wang2;~Dongqi_Han1;~Xiaoqing_Zheng2;~Xuanjing_Huang1;~Dongsheng_Li2", "gender": "M;;M;;F;M", "homepage": "https://lvchangze.github.io;;https://frosthan.github.io/;;https://xuanjing-huang.github.io/;http://recmind.cn", "dblp": "350/4445;;;;05/6735-1;254/0830-2.html", "google_scholar": "t3-viUwAAAAJ;;3V_9fRUAAAAJ;;RGsMgZA4H78C;VNg5rA8AAAAJ", "orcid": ";;0000-0002-6872-7121;;0000-0001-9197-9426;0000-0003-3103-8442", "linkedin": ";;;;;", "or_profile": "~Changze_Lv1;~Yansen_Wang2;~Dongqi_Han1;~Xiaoqing_Zheng2;~Xuanjing_Huang1;~Dongsheng_Li2", "aff": "Fudan University;;Microsoft;;Fudan University;Microsoft Research Asia", "aff_domain": "fudan.edu.cn;;microsoft.com;;fudan.edu.cn;microsoft.com", "position": "PhD student;;Researcher;;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nlv2024efficient,\ntitle={Efficient and Effective Time-Series Forecasting with Spiking Neural Networks},\nauthor={Changze Lv and Yansen Wang and Dongqi Han and Xiaoqing Zheng and Xuanjing Huang and Dongsheng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SkI6u81AkI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 838696, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10288270577914747876&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "fudan.edu.cn;;microsoft.com;;fudan.edu.cn;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Fudan University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.fudan.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "Fudan;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Nonsmooth Implicit Differentiation: Deterministic and Stochastic Convergence Rates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33993", "id": "SlRcJvf1yd", "proceeding": "https://proceedings.mlr.press/v235/grazzi24a.html", "pdf": "https://openreview.net/pdf?id=SlRcJvf1yd", "openreview": "https://openreview.net/forum?id=SlRcJvf1yd", "author_site": "Riccardo Grazzi, Massimiliano Pontil, Saverio Salzo", "tldr": "", "abstract": "We study the problem of efficiently computing the derivative of the fixed-point of a parametric nondifferentiable contraction map. This problem has wide applications in machine learning, including hyperparameter optimization, meta-learning and data poisoning attacks. We analyze two popular approaches: iterative differentiation (ITD) and approximate implicit differentiation (AID). 
A key challenge behind the nonsmooth setting is that the chain rule does not hold anymore. We build upon the work by Bolte et al. (2022), who prove linear convergence of nonsmooth ITD under a piecewise Lipschitz smooth assumption. In the deterministic case, we provide a linear rate for AID and an improved linear rate for ITD which closely match the ones for the smooth setting. We further introduce NSID, a new stochastic method to compute the implicit derivative when the contraction map is defined as the composition of an outer map and an inner map which is accessible only through a stochastic unbiased estimator. We establish rates for the convergence of NSID, encompassing the best available rates in the smooth setting. We also present illustrative experiments confirming our analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Riccardo Grazzi;Massimiliano Pontil;Saverio Salzo", "authorids": "~Riccardo_Grazzi2;~Massimiliano_Pontil4;~Saverio_Salzo1", "gender": "Not Specified;;M", "homepage": "https://www.iit.it/web/computational-statistics-and-machine-learning;;", "dblp": ";60/9367;222/2069", "google_scholar": "lcOacs8AAAAJ;https://scholar.google.it/citations?user=zocrDQkAAAAJ;9Tlyx1IAAAAJ", "orcid": "0000-0001-9415-098X;0000-0003-0494-9101;", "linkedin": ";;", "or_profile": "~Massimiliano_Pontil4;~Saverio_Salzo1;~Riccardo_Grazzi1", "aff": "University College London, University of London;Istituto Italiano di Tecnologia;Istituto Italiano di Tecnologia", "aff_domain": "ucl.ac.uk;iit.it;iit.it", "position": "Full Professor;Researcher;Postdoc", "bibtex": "@inproceedings{\ngrazzi2024nonsmooth,\ntitle={Nonsmooth Implicit Differentiation: Deterministic and Stochastic Convergence Rates},\nauthor={Riccardo Grazzi and Massimiliano Pontil and Saverio Salzo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SlRcJvf1yd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 543061, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8578711368874501108&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "ucl.ac.uk;iit.it;iit.it", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University College London;Istituto Italiano di Tecnologia", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucl.ac.uk;https://www.iit.it", "aff_unique_abbr": "UCL;IIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;Italy" }, { "title": "Position: Compositional Generative Modeling: A Single Model is Not All You Need", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33992", "id": "SoNexFx8qz", "proceeding": "https://proceedings.mlr.press/v235/du24d.html", "pdf": "https://openreview.net/pdf?id=SoNexFx8qz", "openreview": "https://openreview.net/forum?id=SoNexFx8qz", "author_site": "Yilun Du, Leslie Kaelbling", "tldr": "", "abstract": "Large monolithic generative models trained on massive amounts of data have become an increasingly dominant approach in AI research. In this paper, we argue that we should instead construct large generative systems by composing smaller generative models together. 
We show how such a compositional generative approach enables us to learn distributions in a more data-efficient manner, enabling generalization to parts of the data distribution unseen at training time. We further show how this enables us to program and construct new generative models for tasks completely unseen at training. Finally, we show that in many cases, we can discover separate compositional components from data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yilun Du;Leslie Pack Kaelbling", "authorids": "~Yilun_Du1;~Leslie_Pack_Kaelbling1", "gender": ";F", "homepage": "https://yilundu.github.io;http://people.csail.mit.edu/lpk/", "dblp": "204/4379;k/LesliePackKaelbling", "google_scholar": ";IcasIiwAAAAJ", "orcid": ";0000-0001-6054-7145", "linkedin": ";", "or_profile": "~Yilun_Du1;~Leslie_Pack_Kaelbling1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\ndu2024position,\ntitle={Position: Compositional Generative Modeling: A Single Model is Not All You Need},\nauthor={Yilun Du and Leslie Pack Kaelbling},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SoNexFx8qz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2332285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16211327218673069379&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Mastering Zero-Shot Interactions in Cooperative and Competitive Simultaneous Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33991", "id": "SoqxSnEUi1", "proceeding": "https://proceedings.mlr.press/v235/mahlau24a.html", "pdf": "https://openreview.net/pdf?id=SoqxSnEUi1", "openreview": "https://openreview.net/forum?id=SoqxSnEUi1", "author_site": "Yannik Mahlau, Frederik Schubert, Bodo Rosenhahn", "tldr": "", "abstract": "The combination of self-play and planning has achieved great successes in sequential games, for instance in Chess and Go. However, adapting algorithms such as AlphaZero to simultaneous games poses a new challenge. In these games, missing information about concurrent actions of other agents is a limiting factor as they may select different Nash equilibria or do not play optimally at all. Thus, it is vital to model the behavior of the other agents when interacting with them in simultaneous games. To this end, we propose Albatross: AlphaZero for Learning Bounded-rational Agents and Temperature-based Response Optimization using Simulated Self-play. Albatross learns to play the novel equilibrium concept of a Smooth Best Response Logit Equilibrium (SBRLE), which enables cooperation and competition with agents of any playing strength. We perform an extensive evaluation of Albatross on a set of cooperative and competitive simultaneous perfect-information games. In contrast to AlphaZero, Albatross is able to exploit weak agents in the competitive game of Battlesnake. 
Additionally, it yields an improvement of 37.6% compared to previous state of the art in the cooperative Overcooked benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yannik Mahlau;Frederik Schubert;Bodo Rosenhahn", "authorids": "~Yannik_Mahlau1;~Frederik_Schubert1;~Bodo_Rosenhahn1", "gender": "M;M;M", "homepage": ";https://www.tnt.uni-hannover.de/de/staff/schubert/;http://www.tnt.uni-hannover.de/staff/rosenhahn/", "dblp": ";;09/2973", "google_scholar": "KU_OIyEAAAAJ;lWi5nEwAAAAJ;qq3TxtcAAAAJ", "orcid": "0000-0003-0425-5003;;", "linkedin": ";frederikschubert/;b-rosenhahn-a397b1183/", "or_profile": "~Yannik_Mahlau1;~Frederik_Schubert1;~Bodo_Rosenhahn1", "aff": "Universit\u00e4t Hannover;Leibniz Universit\u00e4t Hannover;Institut f\u00fcr Informationsverarbeitung", "aff_domain": "uni-hannover.de;uni-hannover.de;tnt.uni-hannover.de", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nmahlau2024mastering,\ntitle={Mastering Zero-Shot Interactions in Cooperative and Competitive Simultaneous Games},\nauthor={Yannik Mahlau and Frederik Schubert and Bodo Rosenhahn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SoqxSnEUi1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2073845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m3LN3Lc__XQJ:scholar.google.com/&scioq=Mastering+Zero-Shot+Interactions+in+Cooperative+and+Competitive+Simultaneous+Games&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "uni-hannover.de;uni-hannover.de;tnt.uni-hannover.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Hanover;Leibniz Universit\u00e4t Hannover;Institut f\u00fcr Informationsverarbeitung", "aff_unique_dep": ";;Department of Information Processing", "aff_unique_url": "https://www.uni-hannover.de;https://www.leibniz.uni-hannover.de/;", "aff_unique_abbr": "Uni Hanover;LUH;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "An Information Theoretic Approach to Interaction-Grounded Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33990", "id": "Sra298VMFM", "proceeding": "https://proceedings.mlr.press/v235/hu24e.html", "pdf": "https://openreview.net/pdf?id=Sra298VMFM", "openreview": "https://openreview.net/forum?id=Sra298VMFM", "author_site": "Xiaoyan Hu, Farzan Farnia, Ho-fung Leung", "tldr": "", "abstract": "Reinforcement learning (RL) problems where the learner attempts to infer an unobserved reward from some feedback variables have been studied in several recent papers. The setting of Interaction-Grounded Learning (IGL) is an example of such feedback-based reinforcement learning tasks where the learner optimizes the return by inferring latent binary rewards from the interaction with the environment. In the IGL setting, a relevant assumption used in the RL literature is that the feedback variable $Y$ is conditionally independent of the context-action $(X,A)$ given the latent reward $R$. In this work, we propose *Variational Information-based IGL (VI-IGL)* as an information-theoretic method to enforce the conditional independence assumption in the IGL-based RL problem. 
The VI-IGL framework learns a reward decoder using an information-based objective based on the conditional mutual information (MI) between the context-action $(X,A)$ and the feedback variable $Y$ observed from the environment. To estimate and optimize the information-based terms for the continuous random variables in the RL problem, VI-IGL leverages the variational representation of mutual information and results in a min-max optimization problem. Theoretical analysis shows that the optimization problem can be sample-efficiently solved. Furthermore, we extend the VI-IGL framework to general $f$-Information measures in the information theory literature, leading to the generalized $f$-VI-IGL framework to address the RL problem under the IGL condition. Finally, the empirical results on several reinforcement learning settings indicate an improved performance in comparison to the previous IGL-based RL algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyan Hu;Farzan Farnia;Ho-fung Leung", "authorids": "~Xiaoyan_Hu2;~Farzan_Farnia1;~Ho-fung_Leung1", "gender": "M;M;M", "homepage": "https://yannxiaoyanhu.github.io;https://www.cse.cuhk.edu.hk/~farnia/;http://www.cse.cuhk.edu.hk/~lhf/", "dblp": ";132/7757;l/HofungLeung", "google_scholar": "https://scholar.google.com/citations?hl=en;GYPCqcYAAAAJ;https://scholar.google.com.hk/citations?user=JDErdKcAAAAJ", "orcid": "0000-0002-5766-1059;0000-0002-6049-9232;0000-0003-4914-2934", "linkedin": "xiaoyan-hu-9a26661b9/;farzan-farnia-00798335;ho-fung-leung-1a73135/", "or_profile": "~Xiaoyan_Hu2;~Farzan_Farnia1;~Ho-fung_Leung1", "aff": "The Chinese University of Hong Kong;The Chinese University of Hong Kong; ", "aff_domain": "cse.cuhk.edu.hk;cuhk.edu.hk;outlook.com", "position": "PhD student;Assistant Professor;Independent Researcher", "bibtex": "@inproceedings{\nhu2024an,\ntitle={An Information Theoretic Approach to Interaction-Grounded Learning},\nauthor={Xiaoyan Hu and Farzan Farnia and Ho-fung Leung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Sra298VMFM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 547671, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16158058583373233653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cse.cuhk.edu.hk;cuhk.edu.hk;outlook.com", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Absolute Policy Optimization: Enhancing Lower Probability Bound of Performance with High Confidence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33989", "id": "Ss3h1ixJAU", "proceeding": "https://proceedings.mlr.press/v235/zhao24i.html", "pdf": "https://openreview.net/pdf?id=Ss3h1ixJAU", "openreview": "https://openreview.net/forum?id=Ss3h1ixJAU", "author_site": "Weiye Zhao, Feihan Li, Yifan Sun, Rui Chen, Tianhao Wei, Changliu Liu", "tldr": "", "abstract": "In recent years, trust region on-policy reinforcement learning has achieved impressive results in addressing complex control tasks and gaming scenarios. 
However, contemporary state-of-the-art algorithms within this category primarily emphasize improvement in expected performance, lacking the ability to control the worst-case performance outcomes. To address this limitation, we introduce a novel objective function, optimizing which leads to guaranteed monotonic improvement in the lower probability bound of performance with high confidence. Building upon this groundbreaking theoretical advancement, we further introduce a practical solution called Absolute Policy Optimization (APO). Our experiments demonstrate the effectiveness of our approach across challenging continuous control benchmark tasks and extend its applicability to mastering Atari games. Our findings reveal that APO as well as its efficient variant Proximal Absolute Policy Optimization (PAPO) significantly outperforms state-of-the-art policy gradient algorithms, resulting in substantial improvements in worst-case performance, as well as expected performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiye Zhao;Feihan Li;Yifan Sun;Rui Chen;Tianhao Wei;Changliu Liu", "authorids": "~Weiye_Zhao1;~Feihan_Li1;~Yifan_Sun9;~Rui_Chen11;~Tianhao_Wei1;~Changliu_Liu1", "gender": "M;M;M;M;M;F", "homepage": "https://github.com/CaesarAndylaw;;https://yifansun98.github.io/;https://ruichen.pub/;;http://www.cs.cmu.edu/~cliu6/index.html", "dblp": "228/6863;;99/10261-11;;222/5386;166/3563", "google_scholar": "P-79KOcAAAAJ;;DGhQSYUAAAAJ;XiUE0wMAAAAJ;V22j1C0AAAAJ;", "orcid": "0000-0002-8426-5238;0000-0003-1770-4664;0009-0007-2073-7789;0000-0002-8671-8771;;", "linkedin": ";;yifansun1/;;;", "or_profile": "~Weiye_Zhao1;~Feihan_Li1;~Yifan_Sun9;~Rui_Chen11;~Tianhao_Wei1;~Changliu_Liu1", "aff": "Carnegie Mellon University;Tsinghua University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;tsinghua.edu.cn;andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;cmu.edu", "position": "PhD student;Undergrad student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhao2024absolute,\ntitle={Absolute Policy Optimization: Enhancing Lower Probability Bound of Performance with High Confidence},\nauthor={Weiye Zhao and Feihan Li and Yifan Sun and Rui Chen and Tianhao Wei and Changliu Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ss3h1ixJAU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9248769, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=777739855918121689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "andrew.cmu.edu;tsinghua.edu.cn;andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;cmu.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Carnegie Mellon University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CMU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "ReMax: A Simple, Effective, and Efficient Reinforcement Learning Method for Aligning Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33988", "id": "Stn8hXkpe6", "proceeding": 
"https://proceedings.mlr.press/v235/li24cd.html", "pdf": "https://openreview.net/pdf?id=Stn8hXkpe6", "openreview": "https://openreview.net/forum?id=Stn8hXkpe6", "author_site": "Ziniu Li, Tian Xu, Yushun Zhang, Zhihang Lin, Yang Yu, Ruoyu Sun, Zhi-Quan Luo", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) is key to aligning Large Language Models (LLMs), typically paired with the Proximal Policy Optimization (PPO) algorithm. While PPO is a powerful method designed for general reinforcement learning tasks, it is overly sophisticated for LLMs, leading to laborious hyper-parameter tuning and significant computation burdens. To make RLHF efficient, we present ReMax, which leverages 3 properties of RLHF: fast simulation, deterministic transitions, and trajectory-level rewards. These properties are not exploited in PPO, making it less suitable for RLHF. Building on the renowned REINFORCE algorithm, ReMax does not require training an additional value model as in PPO and is further enhanced with a new variance reduction technique. ReMax offers several benefits over PPO: it is simpler to implement, eliminates more than 4 hyper-parameters in PPO, reduces GPU memory usage, and shortens training time. ReMax can save about 46% GPU memory than PPO when training a 7B model and enables training on A800-80GB GPUs without the memory-saving offloading technique needed by PPO. Applying ReMax to a Mistral-7B model resulted in a 94.78% win rate on the AlpacaEval leaderboard and a 7.739 score on MT-bench, setting a new SOTA for open-source 7B models. These results show the effectiveness of ReMax while addressing the limitations of PPO in LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziniu Li;Tian Xu;Yushun Zhang;Zhihang Lin;Yang Yu;Ruoyu Sun;Zhi-Quan Luo", "authorids": "~Ziniu_Li1;~Tian_Xu2;~Yushun_Zhang1;~Zhihang_Lin2;~Yang_Yu5;~Ruoyu_Sun1;~Zhi-Quan_Luo1", "gender": "M;M;M;M;;M;M", "homepage": "http://www.liziniu.org/;http://www.lamda.nju.edu.cn/xut/;https://zyushun.github.io/;https://github.com/diruoshui;https://ruoyus.github.io/;;http://www.lamda.nju.edu.cn/yuy", "dblp": "254/0986;07/2985-3;276/8662;;30/9879-1;;46/2181-1", "google_scholar": "80UnKQQAAAAJ;e5mnk1wAAAAJ;https://scholar.google.com/citations?hl=en;;PsfzbCMAAAAJ;dW3gcXoAAAAJ;PG2lDSwAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Ziniu_Li1;~Tian_Xu2;~Yushun_Zhang1;~Zhihang_Lin2;~Ruoyu_Sun1;~Zhi-Quan_Luo1;~Yang_Yu2", "aff": "The Chinese University of Hong Kong, Shenzhen;Nanjing University;The Chinese University of Hong Kong, Shenzhen;Westlake Scietrain;The Chinese University of Hong Kong;The Chinese University of Hong Kong, Shenzhen;Nanjing University", "aff_domain": "cuhk.edu.cn;nju.edu.cn;cuhk.edu.cn;xinchen-inc.com;cuhk.edu.cn;cuhk.edu.cn;nju.edu.cn", "position": "PhD student;PhD student;PhD student;Researcher;Associate Professor;Full Professor;Professor", "bibtex": "@inproceedings{\nli2024remax,\ntitle={ReMax: A Simple, Effective, and Efficient Reinforcement Learning Method for Aligning Large Language Models},\nauthor={Ziniu Li and Tian Xu and Yushun Zhang and Zhihang Lin and Yang Yu and Ruoyu Sun and Zhi-Quan Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Stn8hXkpe6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2350069, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 45, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10822526911171533520&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "cuhk.edu.cn;nju.edu.cn;cuhk.edu.cn;xinchen-inc.com;cuhk.edu.cn;cuhk.edu.cn;nju.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;2;0;0;1", "aff_unique_norm": "Chinese University of Hong Kong;Nanjing University;Westlake University", "aff_unique_dep": ";;Scietrain", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.nju.edu.cn;https://www.westlake.edu.cn", "aff_unique_abbr": "CUHK;Nanjing U;", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Wasserstein Wormhole: Scalable Optimal Transport Distance with Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33987", "id": "Su0qe33cWA", "proceeding": "https://proceedings.mlr.press/v235/haviv24a.html", "pdf": "https://openreview.net/pdf?id=Su0qe33cWA", "openreview": "https://openreview.net/forum?id=Su0qe33cWA", "author_site": "Doron Haviv, Russell Kunes, Thomas Dougherty, Cassandra Burdziak, Tal Nawy, Anna C. Gilbert, Dana Pe'er", "tldr": "", "abstract": "Optimal transport (OT) and the related Wasserstein metric ($W$) are powerful and ubiquitous tools for comparing distributions. However, computing pairwise Wasserstein distances rapidly becomes intractable as cohort size grows. An attractive alternative would be to find an embedding space in which pairwise Euclidean distances map to OT distances, akin to standard multidimensional scaling (MDS). We present Wasserstein Wormhole, a transformer-based autoencoder that embeds empirical distributions into a latent space wherein Euclidean distances approximate OT distances. Extending MDS theory, we show that our objective function implies a bound on the error incurred when embedding non-Euclidean distances. Empirically, distances between Wormhole embeddings closely match Wasserstein distances, enabling linear time computation of OT distances. Along with an encoder that maps distributions to embeddings, Wasserstein Wormhole includes a decoder that maps embeddings back to distributions, allowing for operations in the embedding space to generalize to OT spaces, such as Wasserstein barycenter estimation and OT interpolation. 
By lending scalability and interpretability to OT approaches, Wasserstein Wormhole unlocks new avenues for data analysis in the fields of computational geometry and single-cell biology.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Doron Haviv;Russell Zhang Kunes;Thomas Dougherty;Cassandra Burdziak;Tal Nawy;Anna Gilbert;Dana Pe'er", "authorids": "~Doron_Haviv1;rk3064@columbia.edu;~Thomas_Dougherty1;burdziac@mskcc.org;nawyt@mskcc.org;~Anna_Gilbert2;~Dana_Pe'er1", "gender": "M;;;;;F;F", "homepage": "https://doronhav.github.io/;;;;;https://annacgilbert.github.io/;http://datascience.columbia.edu/dana-peer", "dblp": "236/4996;;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": "doron-haviv-92ab63138/;;;;;;", "or_profile": "~Doron_Haviv1;rk3064@columbia.edu;~Thomas_Dougherty1;burdziac@mskcc.org;nawyt@mskcc.org;~Anna_Gilbert2;~Dana_Pe'er1", "aff": "Memorial Sloan Kettering Cancer Centre;;;;;Yale University;Columbia University", "aff_domain": "mskcc.org;;;;;yale.edu;", "position": "PhD student;;;;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nhaviv2024wasserstein,\ntitle={Wasserstein Wormhole: Scalable Optimal Transport Distance with Transformer},\nauthor={Doron Haviv and Russell Zhang Kunes and Thomas Dougherty and Cassandra Burdziak and Tal Nawy and Anna Gilbert and Dana Pe'er},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Su0qe33cWA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4397934, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15138459532140704990&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 10, "email": "mskcc.org;;;;;yale.edu;", "author_num": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "Memorial Sloan Kettering Cancer Center;Yale University;Columbia University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.mskcc.org;https://www.yale.edu;https://www.columbia.edu", "aff_unique_abbr": "MSKCC;Yale;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Generalizing Knowledge Graph Embedding with Universal Orthogonal Parameterization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33986", "id": "Sv4u9PtvT5", "proceeding": "https://proceedings.mlr.press/v235/li24ah.html", "pdf": "https://openreview.net/pdf?id=Sv4u9PtvT5", "openreview": "https://openreview.net/forum?id=Sv4u9PtvT5", "author_site": "Rui Li, Chaozhuo Li, Yanming Shen, Zeyu Zhang, Xu Chen", "tldr": "", "abstract": "Recent advances in knowledge graph embedding (KGE) rely on Euclidean/hyperbolic orthogonal relation transformations to model intrinsic logical patterns and topological structures. However, existing approaches are confined to rigid relational orthogonalization with restricted dimension and homogeneous geometry, leading to deficient modeling capability. In this work, we move beyond these approaches in terms of both dimension and geometry by introducing a powerful framework named GoldE, which features a universal orthogonal parameterization based on a generalized form of Householder reflection. 
Such parameterization can naturally achieve dimensional extension and geometric unification with theoretical guarantees, enabling our framework to simultaneously capture crucial logical patterns and inherent topological heterogeneity of knowledge graphs. Empirically, GoldE achieves state-of-the-art performance on three standard benchmarks. Codes are available at https://github.com/xxrep/GoldE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rui Li;Chaozhuo Li;Yanming Shen;Zeyu Zhang;Xu Chen", "authorids": "~Rui_Li16;~Chaozhuo_Li1;~Yanming_Shen1;~Zeyu_Zhang6;~Xu_Chen13", "gender": ";;M;M;M", "homepage": "https://github.com/rui9812;https://scss.bupt.edu.cn/info/1063/5534.htm;;https://zeyu-zhang.cn;https://gsai.ruc.edu.cn/chenxu", "dblp": "96/4282-86.html;316/1269.html;51/3800;44/8352-200.html;83/6331-17", "google_scholar": "UlIbtTkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;MvlgpWcAAAAJ;https://scholar.google.com/citations?hl=en;loPoqy0AAAAJ", "orcid": "0009-0005-0625-6802;0000-0002-8179-7503;;0000-0003-0048-1687;0000-0003-0144-1775", "linkedin": ";;;;", "or_profile": "~Rui_Li16;~Chaozhuo_Li1;~Yanming_Shen1;~Zeyu_Zhang6;~Xu_Chen13", "aff": "Renmin University of China;Beijing University of Posts and Telecommunications;Dalian University of Technology;Renmin University of China;Renmin University of China", "aff_domain": "ruc.edu.cn;bupt.edu.cn;dlut.edu.cn;ruc.edu.cn;ruc.edu.cn", "position": "PhD student;Associate Professor;Full Professor;MS student;Associate Professor", "bibtex": "@inproceedings{\nli2024generalizing,\ntitle={Generalizing Knowledge Graph Embedding with Universal Orthogonal Parameterization},\nauthor={Rui Li and Chaozhuo Li and Yanming Shen and Zeyu Zhang and Xu Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Sv4u9PtvT5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 595959, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4634810458331269235&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "ruc.edu.cn;bupt.edu.cn;dlut.edu.cn;ruc.edu.cn;ruc.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Renmin University of China;Beijing University of Posts and Telecommunications;Dalian University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ruc.edu.cn;http://www.bupt.edu.cn/;http://www.dlut.edu.cn/", "aff_unique_abbr": "RUC;BUPT;DUT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "MLI Formula: A Nearly Scale-Invariant Solution with Noise Perturbation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33985", "id": "SvBLKoBL4q", "proceeding": "https://proceedings.mlr.press/v235/tao24a.html", "pdf": "https://openreview.net/pdf?id=SvBLKoBL4q", "openreview": "https://openreview.net/forum?id=SvBLKoBL4q", "author_site": "Bowen Tao, Xin-Chun Li, De-Chuan Zhan", "tldr": "", "abstract": "Monotonic Linear Interpolation (MLI) refers to the peculiar phenomenon that the error between the initial and converged model monotonically decreases along the linear interpolation, i.e., $(1-\\alpha)\\boldsymbol{\\theta}_0 + \\alpha \\boldsymbol{\\theta}_F$. Previous works focus on paired initial and converged points, relating MLI to the smoothness of the optimization trajectory. 
In this paper, we find a shocking fact that the error curves still exhibit a monotonic decrease when $\\boldsymbol{\\theta}_0$ is replaced with noise or even zero values, implying that the decreasing curve may be primarily related to the property of the converged model rather than the optimization trajectory. We further explore the relationship between $\\alpha\\boldsymbol{\\theta}_F$ and $\\boldsymbol{\\theta}_F$ and propose scale invariance properties in various cases, including Generalized Scale Invariance (GSI), Rectified Scale Invariance (RSI), and Normalized Scale Invariance (NSI). From an inverse perspective, the MLI formula is essentially an equation that adds varying levels of noise (i.e., $(1-\\alpha)\\boldsymbol{\\epsilon}$) to a nearly scale-invariant network (i.e., $\\alpha \\boldsymbol{\\theta}_F$), resulting in a monotonically increasing error as the noise level rises. MLI is a special case where $\\boldsymbol{\\epsilon}$ is equal to $\\boldsymbol{\\theta}_0$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Tao;Xin-Chun Li;De-Chuan Zhan", "authorids": "~Bowen_Tao1;~Xin-Chun_Li1;~De-Chuan_Zhan1", "gender": ";M;M", "homepage": "http://www.lamda.nju.edu.cn/taobw/;http://www.lamda.nju.edu.cn/zhandc/;http://www.lamda.nju.edu.cn/lixc/", "dblp": ";74/498;https://dblp.uni-trier.de/pid/246/2947", "google_scholar": ";mYJf4TcAAAAJ;7WOxRe0AAAAJ", "orcid": ";0000-0002-3533-2078;", "linkedin": ";;", "or_profile": "~Bowen_Tao1;~De-Chuan_Zhan1;~Li_Xin-Chun1", "aff": "Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "MS student;Full Professor;PhD student", "bibtex": "@inproceedings{\ntao2024mli,\ntitle={{MLI} Formula: A Nearly Scale-Invariant Solution with Noise Perturbation},\nauthor={Bowen Tao and Xin-Chun Li and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SvBLKoBL4q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1011081, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9671089358770616770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Bridging Mini-Batch and Asymptotic Analysis in Contrastive Learning: From InfoNCE to Kernel-Based Losses", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33984", "id": "SvvvB5t5EW", "proceeding": "https://proceedings.mlr.press/v235/koromilas24a.html", "pdf": "https://openreview.net/pdf?id=SvvvB5t5EW", "openreview": "https://openreview.net/forum?id=SvvvB5t5EW", "author_site": "Panagiotis Koromilas, Giorgos Bouritsas, Theodoros Giannakopoulos, Mihalis Nicolaou, Yannis Panagakis", "tldr": "", "abstract": "What do different contrastive learning (CL) losses actually optimize for? Although multiple CL methods have demonstrated remarkable representation learning capabilities, the differences in their inner workings remain largely opaque. 
In this work, we analyse several CL families and prove that, under certain conditions, they admit the same minimisers when optimizing either their batch-level objectives or their expectations asymptotically. In both cases, an intimate connection with the hyperspherical energy minimisation (HEM) problem resurfaces. Drawing inspiration from this, we introduce a novel CL objective, coined Decoupled Hyperspherical Energy Loss (DHEL). DHEL simplifies the problem by decoupling the target hyperspherical energy from the alignment of positive examples while preserving the same theoretical guarantees. Going one step further, we show the same results hold for another relevant CL family, namely kernel contrastive learning (KCL), with the additional advantage of the expected loss being independent of batch size, thus identifying the minimisers in the non-asymptotic regime. Empirical results demonstrate improved downstream performance and robustness across combinations of different batch sizes and hyperparameters and reduced dimensionality collapse, on several computer vision datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Panagiotis Koromilas;Giorgos Bouritsas;Theodoros Giannakopoulos;Mihalis Nicolaou;Yannis Panagakis", "authorids": "~Panagiotis_Koromilas1;~Giorgos_Bouritsas1;~Theodoros_Giannakopoulos1;~Mihalis_Nicolaou1;~Yannis_Panagakis1", "gender": "M;;M;Not Specified;", "homepage": "https://scholar.google.com/citations?user=DMI5W9wAAAAJ&hl=el;http://users.uoa.gr/~gbouritsas/;http://tyiannak.github.io;https://mihalisan.github.io;", "dblp": ";190/1675;64/1130;32/8615;", "google_scholar": "DMI5W9wAAAAJ;eNUJDXUAAAAJ;BeIoqhwAAAAJ;R9x_bZ8AAAAJ;", "orcid": ";0000-0002-8476-4918;;;", "linkedin": ";giorgos-bouritsas;theodoros-giannakopoulos-0b626442;;", "or_profile": "~Panagiotis_Koromilas1;~Giorgos_Bouritsas1;~Theodoros_Giannakopoulos1;~Mihalis_Nicolaou1;~Yannis_Panagakis1", "aff": "NCSR Demokritos;University of Athens;Behavioral Signals;The Cyprus Institute;", "aff_domain": "iit.demokritos.gr;uoa.gr;behavioralsignals.com;cyi.ac.cy;", "position": "Researcher;Postdoc;Principal Researcher;Associate Professor;", "bibtex": "@inproceedings{\nkoromilas2024bridging,\ntitle={Bridging Mini-Batch and Asymptotic Analysis in Contrastive Learning: From Info{NCE} to Kernel-Based Losses},\nauthor={Panagiotis Koromilas and Giorgos Bouritsas and Theodoros Giannakopoulos and Mihalis Nicolaou and Yannis Panagakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SvvvB5t5EW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 682776, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8440605786275677348&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "email": "iit.demokritos.gr;uoa.gr;behavioralsignals.com;cyi.ac.cy;", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "National Centre for Scientific Research 'Demokritos';University of Athens;Behavioral Signals;Cyprus Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.demokritos.gr;https://www.uoa.gr;;https://www.cyi.ac.cy", "aff_unique_abbr": "NCSR Demokritos;UoA;;CyI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2", "aff_country_unique": "Greece;;Cyprus" }, { "title": "Rethinking Transformers in Solving POMDPs", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33983", "id": "SyY7ScNpGL", "proceeding": "https://proceedings.mlr.press/v235/lu24h.html", "pdf": "https://openreview.net/pdf?id=SyY7ScNpGL", "openreview": "https://openreview.net/forum?id=SyY7ScNpGL", "author_site": "Chenhao Lu, Ruizhe Shi, Yuyao Liu, Kaizhe Hu, Simon Du, Huazhe Xu", "tldr": "", "abstract": "Sequential decision-making algorithms such as reinforcement learning (RL) in real-world scenarios inevitably face environments with partial observability. This paper scrutinizes the effectiveness of a popular architecture, namely Transformers, in Partially Observable Markov Decision Processes (POMDPs) and reveals its theoretical limitations. We establish that regular languages, which Transformers struggle to model, are reducible to POMDPs. This poses a significant challenge for Transformers in learning POMDP-specific inductive biases, due to their lack of inherent recurrence found in other models like RNNs. This paper casts doubt on the prevalent belief in Transformers as sequence models for RL and proposes to introduce a point-wise recurrent structure. The Deep Linear Recurrent Unit (LRU) emerges as a well-suited alternative for Partially Observable RL, with empirical results highlighting the sub-optimal performance of the Transformer and considerable strength of LRU.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenhao Lu;Ruizhe Shi;Yuyao Liu;Kaizhe Hu;Simon Shaolei Du;Huazhe Xu", "authorids": "~Chenhao_Lu1;~Ruizhe_Shi1;~Yuyao_Liu1;~Kaizhe_Hu1;~Simon_Shaolei_Du1;~Huazhe_Xu1", "gender": ";M;M;M;M;M", "homepage": "https://ctp314.github.io/;http://srzer.github.io;;https://hukz18.github.io/;http://simonshaoleidu.com;http://hxu.rocks", "dblp": ";304/0634.html;;330/4940;176/5602;164/9006", "google_scholar": ";0tlXSPkAAAAJ;https://scholar.google.com/citations?hl=en;mPpYLhcAAAAJ;OttawxUAAAAJ;t9HPFawAAAAJ", "orcid": ";;;;;", "linkedin": ";;;%E5%BC%80%E5%93%B2-%E8%83%A1-40137718a/?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACyMbIEBJhMDJ4b7wLQyHotP_JGOnWDoEDU;;", "or_profile": "~Chenhao_Lu1;~Ruizhe_Shi1;~Yuyao_Liu1;~Kaizhe_Hu1;~Simon_Shaolei_Du1;~Huazhe_Xu1", "aff": "Tsinghua University;University of Washington;Massachusetts Institute of Technology;Stanford University;University of Washington;Tsinghua University", "aff_domain": "tsinghua.edu.cn;uw.edu;mit.edu;stanford.edu;washington.edu;tsinghua.edu.cn", "position": "Undergrad student;Intern;Visiting Student;Researcher;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nlu2024rethinking,\ntitle={Rethinking Transformers in Solving {POMDP}s},\nauthor={Chenhao Lu and Ruizhe Shi and Yuyao Liu and Kaizhe Hu and Simon Shaolei Du and Huazhe Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SyY7ScNpGL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5037874, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7936728632813554537&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;uw.edu;mit.edu;stanford.edu;washington.edu;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;1;0", "aff_unique_norm": "Tsinghua University;University of Washington;Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.tsinghua.edu.cn;https://www.washington.edu;https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "THU;UW;MIT;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Optimally Improving Cooperative Learning in a Social Setting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33982", "id": "Sz9mAYuqlE", "proceeding": "https://proceedings.mlr.press/v235/haddadan24a.html", "pdf": "https://openreview.net/pdf?id=Sz9mAYuqlE", "openreview": "https://openreview.net/forum?id=Sz9mAYuqlE", "author_site": "Shahrzad Haddadan, Cheng Xin, Jie Gao", "tldr": "", "abstract": "We consider a cooperative learning scenario where a collection of networked agents with individually owned classifiers dynamically update their predictions, for the same classification task, through communication or observations of each other\u2019s predictions. Clearly if highly influential vertices use erroneous classifiers, there will be a negative effect on the accuracy of all the agents in the network. We ask the following question: how can we optimally fix the prediction of a few classifiers so as maximize the overall accuracy in the entire network. To this end we consider an aggregate and an egalitarian objective function. We show a polynomial time algorithm for optimizing the aggregate objective function, and show that optimizing the egalitarian objective function is NP-hard. Furthermore, we develop approximation algorithms for the egalitarian improvement. The performance of all of our algorithms are guaranteed by mathematical analysis and backed by experiments on synthetic and real data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shahrzad Haddadan;Cheng Xin;Jie Gao", "authorids": "~Shahrzad_Haddadan1;~Cheng_Xin2;~Jie_Gao6", "gender": "F;Not Specified;", "homepage": "https://sites.google.com/view/shahrzadhaddadan;https://jackal092927.github.io/;https://sites.rutgers.edu/jie-gao/", "dblp": "179/2403;168/8972;g/JieGao", "google_scholar": "iIvAkVsAAAAJ;lQk90B0AAAAJ;P1CMmgEAAAAJ", "orcid": ";;0000-0001-5083-6082", "linkedin": ";;", "or_profile": "~Shahrzad_Haddadan1;~Cheng_Xin2;~Jie_Gao6", "aff": "Rutgers University;Rutgers University;Rutgers University", "aff_domain": "rutgers.edu;cs.rutgers.edu;rutgers.edu", "position": "Assistant Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nhaddadan2024optimally,\ntitle={Optimally Improving Cooperative Learning in a Social Setting},\nauthor={Shahrzad Haddadan and Cheng Xin and Jie Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Sz9mAYuqlE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 728258, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9708364023789439347&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 11, "email": "rutgers.edu;cs.rutgers.edu;rutgers.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Sparse Model Inversion: Efficient Inversion of Vision Transformers for Data-Free 
Applications", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33981", "id": "T0lFfO8HaK", "proceeding": "https://proceedings.mlr.press/v235/hu24o.html", "pdf": "https://openreview.net/pdf?id=T0lFfO8HaK", "openreview": "https://openreview.net/forum?id=T0lFfO8HaK", "author_site": "Zixuan Hu, Yongxian Wei, Li Shen, Zhenyi Wang, Lei Li, Chun Yuan, Dacheng Tao", "tldr": "", "abstract": "Model inversion, which aims to reconstruct the original training data from pre-trained discriminative models, is especially useful when the original training data is unavailable due to privacy, usage rights, or size constraints. However, existing dense inversion methods attempt to reconstruct the entire image area, making them extremely inefficient when inverting high-resolution images from large-scale Vision Transformers (ViTs). We further identify two underlying causes of this inefficiency: the redundant inversion of noisy backgrounds and the unintended inversion of spurious correlations\u2014a phenomenon we term ``hallucination'' in model inversion. To address these limitations, we propose a novel sparse model inversion strategy, as a plug-and-play extension to speed up existing dense inversion methods with no need for modifying their original loss functions. Specifically, we selectively invert semantic foregrounds while stopping the inversion of noisy backgrounds and potential spurious correlations. Through both theoretical and empirical studies, we validate the efficacy of our approach in achieving significant inversion acceleration (up to $\\times$3.79) while maintaining comparable or even enhanced downstream performance in data-free model quantization and data-free knowledge transfer. Code is available at https://github.com/Egg-Hu/SMI.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixuan Hu;Yongxian Wei;Li Shen;Zhenyi Wang;Lei Li;Chun Yuan;Dacheng Tao", "authorids": "~Zixuan_Hu1;~Yongxian_Wei1;~Li_Shen1;~Zhenyi_Wang1;~Lei_Li12;~Chun_Yuan1;~Dacheng_Tao1", "gender": "M;;M;;M;M;", "homepage": ";;https://sites.google.com/site/mathshenli/home;;;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;", "dblp": "332/4542;;91/3680-8;;https://dblp.uni-trier.de/pers/hd/l/Li:Lei;;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;yVhgENIAAAAJ;;;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Zixuan_Hu1;~Yongxian_Wei1;~Li_Shen1;~Zhenyi_Wang1;~Lei_Li12;~Chun_Yuan1;~Dacheng_Tao1", "aff": "Tsinghua University;;JD Explore Academy;;;Tsinghua University;", "aff_domain": "mails.tsinghua.edu.cn;;jd.com;;;tsinghua.edu.cn;", "position": "MS student;;Researcher;;;Full Professor;", "bibtex": "@inproceedings{\nhu2024sparse,\ntitle={Sparse Model Inversion: Efficient Inversion of Vision Transformers for Data-Free Applications},\nauthor={Zixuan Hu and Yongxian Wei and Li Shen and Zhenyi Wang and Lei Li and Chun Yuan and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=T0lFfO8HaK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6067490, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2172021972219562590&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mails.tsinghua.edu.cn;;jd.com;;;tsinghua.edu.cn;", "author_num": 7, "aff_unique_index": "0;1;0", 
"aff_unique_norm": "Tsinghua University;JD", "aff_unique_dep": ";JD Explore Academy", "aff_unique_url": "https://www.tsinghua.edu.cn;", "aff_unique_abbr": "THU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China;" }, { "title": "PARCv2: Physics-aware Recurrent Convolutional Neural Networks for Spatiotemporal Dynamics Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33980", "id": "T0zR4mdSce", "proceeding": "https://proceedings.mlr.press/v235/nguyen24c.html", "pdf": "https://openreview.net/pdf?id=T0zR4mdSce", "openreview": "https://openreview.net/forum?id=T0zR4mdSce", "author_site": "Phong Nguyen, Xinlun Cheng, Shahab Azarfar, Pradeep Seshadri, Yen Nguyen, Munho Kim, Sanghun Choi, H. Udaykumar, Stephen Baek", "tldr": "", "abstract": "Modeling unsteady, fast transient, and advection-dominated physics problems is a pressing challenge for physics-aware deep learning (PADL). The physics of complex systems is governed by large systems of partial differential equations (PDEs) and ancillary constitutive models with nonlinear structures, as well as evolving state fields exhibiting sharp gradients and rapidly deforming material interfaces. Here, we investigate an inductive bias approach that is versatile and generalizable to model generic nonlinear field evolution problems. Our study focuses on the recent physics-aware recurrent convolutions (PARC), which incorporates a differentiator-integrator architecture that inductively models the spatiotemporal dynamics of generic physical systems. We extend the capabilities of PARC to simulate unsteady, transient, and advection-dominant systems. The extended model, referred to as PARCv2, is equipped with differential operators to model advection-reaction-diffusion equations, as well as a hybrid integral solver for stable, long-time predictions. PARCv2 is tested on both standard benchmark problems in fluid dynamics, namely Burgers and Navier-Stokes equations, and then applied to more complex shock-induced reaction problems in energetic materials. We evaluate the behavior of PARCv2 in comparison to other physics-informed and learning bias models and demonstrate its potential to model unsteady and advection-dominant dynamics regimes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Phong C.H. Nguyen;Xinlun Cheng;Shahab Azarfar;Pradeep Seshadri;Yen T. Nguyen;Munho Kim;Sanghun Choi;H.S. Udaykumar;Stephen Baek", "authorids": "~Phong_C.H._Nguyen1;xc7ts@virginia.edu;jut6nm@virginia.edu;pradeep-seshadri@uiowa.edu;yenthi-nguyen@uiowa.edu;ansghwkd2@naver.com;s-choi@knu.ac.kr;hs-kumar@uiowa.edu;~Stephen_Baek1", "gender": "M;;;;;;;;", "homepage": ";;;;;;;;http://www.stephenbaek.com", "dblp": ";;;;;;;;", "google_scholar": "dTI5u5wAAAAJ;;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Phong_C.H._Nguyen1;xc7ts@virginia.edu;jut6nm@virginia.edu;pradeep-seshadri@uiowa.edu;yenthi-nguyen@uiowa.edu;ansghwkd2@naver.com;s-choi@knu.ac.kr;hs-kumar@uiowa.edu;~Stephen_Baek1", "aff": "University of Virginia, Charlottesville;;;;;;;;University of Virginia, Charlottesville", "aff_domain": "virginia.edu;;;;;;;;virginia.edu", "position": "Assistant Professor;;;;;;;;Associate Professor", "bibtex": "@inproceedings{\nnguyen2024parcv,\ntitle={{PARC}v2: Physics-aware Recurrent Convolutional Neural Networks for Spatiotemporal Dynamics Modeling},\nauthor={Phong C.H. 
Nguyen and Xinlun Cheng and Shahab Azarfar and Pradeep Seshadri and Yen T. Nguyen and Munho Kim and Sanghun Choi and H.S. Udaykumar and Stephen Baek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=T0zR4mdSce}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2814544, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4877511432796569410&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": "virginia.edu;;;;;;;;virginia.edu", "author_num": 9, "aff_unique_index": "0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Charlottesville", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Revisiting Zeroth-Order Optimization for Memory-Efficient LLM Fine-Tuning: A Benchmark", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33979", "id": "THPjMr2r0S", "proceeding": "https://proceedings.mlr.press/v235/zhang24ad.html", "pdf": "https://openreview.net/pdf?id=THPjMr2r0S", "openreview": "https://openreview.net/forum?id=THPjMr2r0S", "author_site": "Yihua Zhang, Pingzhi Li, Junyuan Hong, Jiaxiang Li, Yimeng Zhang, Wenqing Zheng, Pin-Yu Chen, Jason Lee, Wotao Yin, Mingyi Hong, Zhangyang \u201cAtlas\u201d Wang, Sijia Liu, Tianlong Chen", "tldr": "", "abstract": "In the evolving landscape of natural language processing (NLP), fine-tuning pre-trained Large Language Models (LLMs) with first-order (FO) optimizers like SGD and Adam has become standard. Yet, as LLMs grow in size, the substantial memory overhead from back-propagation (BP) for FO gradient computation presents a significant challenge. Addressing this issue is crucial, especially for applications like on-device training where memory efficiency is paramount. This paper proposes a shift towards BP-free, zeroth-order (ZO) optimization as a solution for reducing memory costs during LLM fine-tuning, building on the initial concept introduced by Malladi et al. (2023). Unlike traditional ZO-SGD methods, our work expands the exploration to a wider array of ZO optimization techniques, through a comprehensive, first-of-its-kind benchmarking study across five LLM families, three task complexities, and five fine-tuning schemes. Our study unveils previously overlooked optimization principles, highlighting the importance of task alignment, the role of the forward gradient method, and the balance between algorithm complexity and fine-tuning performance. We further introduce novel enhancements to ZO optimization, including block-wise descent, hybrid training, and gradient sparsity. Our study offers a promising direction for achieving further memory-efficient LLM fine-tuning. Codes to reproduce all our experiments will be made public.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihua Zhang;Pingzhi Li;Junyuan Hong;Jiaxiang Li;Yimeng Zhang;Wenqing Zheng;Pin-Yu Chen;Jason D. 
Lee;Wotao Yin;Mingyi Hong;Zhangyang Wang;Sijia Liu;Tianlong Chen", "authorids": "~Yihua_Zhang1;~Pingzhi_Li1;~Junyuan_Hong1;~Jiaxiang_Li1;~Yimeng_Zhang2;~Wenqing_Zheng1;~Pin-Yu_Chen1;~Jason_D._Lee1;~Wotao_Yin1;~Mingyi_Hong1;~Zhangyang_Wang1;~Sijia_Liu1;~Tianlong_Chen1", "gender": "M;M;M;M;M;M;M;M;M;M;M;M;M", "homepage": "https://yihua-zhang.com;https://pingzhili.github.io;https://jyhong.gitlab.io/;https://jasonjiaxiangli.github.io/;https://damon-demon.github.io;https://wenqing-zheng.github.io;http://www.pinyuchen.com;https://jasondlee88.github.io/;http://wotaoyin.com;http://people.ece.umn.edu/~mhong/mingyi.html;https://vita-group.github.io;https://lsjxjtu.github.io/;https://tianlong-chen.github.io", "dblp": ";358/4405;185/1316;;;;39/8969;88/3262;76/2265;57/8053;119/4026;128/6972-1;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;QUfhEyQAAAAJ;7Cbv6doAAAAJ;h5OWvc0AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;jxwlCUUAAAAJ;GR_DsT0AAAAJ;kpQGGFUAAAAJ;qRnP-p0AAAAJ;pxFyKAIAAAAJ;C7dO_UgAAAAJ;LE3ctn0AAAAJ", "orcid": ";;0000-0002-5718-5187;0009-0001-5555-6511;0000-0003-1608-2541;0000-0002-8283-7511;0000-0003-1039-8369;;0000-0001-6697-9731;;;;0000-0001-7774-8197", "linkedin": "zhangyihua/;pingzhili/;;jiaxiang-li-9aa485118/;;;pin-yu-chen-940062a2;;;;;;tianlong-chen-783862167/", "or_profile": "~Yihua_Zhang1;~Pingzhi_Li1;~Junyuan_Hong1;~Jiaxiang_Li1;~Yimeng_Zhang2;~Wenqing_Zheng1;~Pin-Yu_Chen1;~Jason_D._Lee1;~Wotao_Yin1;~Mingyi_Hong1;~Zhangyang_Wang1;~Sijia_Liu1;~Tianlong_Chen1", "aff": "Michigan State University;University of North Carolina at Chapel Hill;University of Texas at Austin;University of Minnesota - Twin Cities;ByteDance Inc.;University of Texas, Austin;International Business Machines;Princeton University;Alibaba Group US;University of Minnesota, Minneapolis;University of Texas at Austin;Michigan State University;Harvard University", "aff_domain": "msu.edu;unc.edu;utexas.edu;umn.edu;bytedance.com;utexas.edu;ibm.com;princeton.edu;alibaba-inc.com;umn.edu;utexas.edu;msu.edu;harvard.edu", "position": "PhD student;PhD student;Postdoc;Postdoc;Research Intern;PhD student;Principal Researcher;Assistant Professor;Principal Researcher;Associate Professor;Associate Professor;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nzhang2024revisiting,\ntitle={Revisiting Zeroth-Order Optimization for Memory-Efficient {LLM} Fine-Tuning: A Benchmark},\nauthor={Yihua Zhang and Pingzhi Li and Junyuan Hong and Jiaxiang Li and Yimeng Zhang and Wenqing Zheng and Pin-Yu Chen and Jason D. 
Lee and Wotao Yin and Mingyi Hong and Zhangyang Wang and Sijia Liu and Tianlong Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=THPjMr2r0S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 534532, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5799199012759155520&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 11, "email": "msu.edu;unc.edu;utexas.edu;umn.edu;bytedance.com;utexas.edu;ibm.com;princeton.edu;alibaba-inc.com;umn.edu;utexas.edu;msu.edu;harvard.edu", "author_num": 13, "aff_unique_index": "0;1;2;3;4;2;5;6;7;3;2;0;8", "aff_unique_norm": "Michigan State University;University of North Carolina;University of Texas at Austin;University of Minnesota;ByteDance;International Business Machines Corporation;Princeton University;Alibaba Group;Harvard University", "aff_unique_dep": ";;;;;;;;", "aff_unique_url": "https://www.msu.edu;https://www.unc.edu;https://www.utexas.edu;https://www.minnesota.edu;https://www.bytedance.com;https://www.ibm.com;https://www.princeton.edu;https://www.alibaba.com;https://www.harvard.edu", "aff_unique_abbr": "MSU;UNC;UT Austin;UMN;ByteDance;IBM;Princeton;Alibaba;Harvard", "aff_campus_unique_index": "1;2;3;2;4;2", "aff_campus_unique": ";Chapel Hill;Austin;Twin Cities;Minneapolis", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Energy-based Backdoor Defense without Task-Specific Samples and Model Retraining", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33978", "id": "TJ6tVNt6Y4", "proceeding": "https://proceedings.mlr.press/v235/gao24b.html", "pdf": "https://openreview.net/pdf?id=TJ6tVNt6Y4", "openreview": "https://openreview.net/forum?id=TJ6tVNt6Y4", "author_site": "Yudong Gao, Honglong Chen, Peng Sun, Zhe Li, Junjian Li, Huajie Shao", "tldr": "", "abstract": "Backdoor defense is crucial to ensure the safety and robustness of machine learning models when under attack. However, most existing methods specialize in either the detection or removal of backdoors, but seldom both. While few works have addressed both, these methods rely on strong assumptions or entail significant overhead costs, such as the need of task-specific samples for detection and model retraining for removal. Hence, the key challenge is how to reduce overhead and relax unrealistic assumptions. In this work, we propose two Energy-Based BAckdoor defense methods, called EBBA and EBBA+, that can achieve both backdoored model detection and backdoor removal with low overhead. Our contributions are twofold: First, we offer theoretical analysis for our observation that a predefined target label is more likely to occur among the top results for various samples. Inspired by this, we develop an enhanced energy-based technique, called EBBA, to detect backdoored models without task-specific samples (i.e., samples from any tasks). Secondly, we theoretically analyze that after data corruption, the original clean label of a poisoned sample is more likely to be predicted as a top output by the model, a sharp contrast to clean samples. Accordingly, we extend EBBA to develop EBBA+, a new transferred energy approach to efficiently detect poisoned images and remove backdoors without model retraining. 
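The paper's EBBA/EBBA+ statistics are defined in the text itself; for orientation only, a sketch of the standard free-energy score that energy-based detectors build on (the temperature is an illustrative parameter):

```python
import torch

def energy_score(logits: torch.Tensor, T: float = 1.0) -> torch.Tensor:
    # Free energy of a classifier's logits: E(x) = -T * logsumexp(f(x) / T).
    # Energy-based detectors compare such scores (e.g., across candidate target
    # labels, or before and after corrupting an input) against clean references.
    return -T * torch.logsumexp(logits / T, dim=-1)
```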
Extensive experiments on multiple benchmark datasets demonstrate the superior performance of our methods over baselines in both backdoor detection and removal. Notably, the proposed methods can effectively detect backdoored models and poisoned images as well as remove backdoors at the same time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yudong Gao;Honglong Chen;Peng Sun;Zhe Li;Junjian Li;Huajie Shao", "authorids": "~Yudong_Gao2;~Honglong_Chen1;~Peng_Sun9;~Zhe_Li14;~Junjian_Li1;~Huajie_Shao1", "gender": "M;M;M;F;M;M", "homepage": ";;https://pengsun6.github.io/;;;https://huajieshao.github.io/", "dblp": ";35/7221.html;88/619-3;;;179/4173", "google_scholar": ";;lpg_2i0AAAAJ;;;5-D7ZLsAAAAJ", "orcid": "0000-0003-0264-6545;;0000-0001-6221-8142;0000-0002-6979-3972;0000-0001-5319-517X;0000-0001-7627-5615", "linkedin": ";;;;;huajie-shao-508465113/", "or_profile": "~Yudong_Gao2;~Honglong_Chen1;~Peng_Sun9;~Zhe_Li14;~Junjian_Li1;~Huajie_Shao1", "aff": "China University of Petroleum;China University of Petroleum;Hunan University;China University of Petroleum;China University of Petroleum;College of William and Mary", "aff_domain": "upc.edu.cn;upc.edu.cn;hnu.edu.cn;upc.edu.cn;upc.edu.cn;wm.edu", "position": "MS student;Full Professor;Associate Professor;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ngao2024energybased,\ntitle={Energy-based Backdoor Defense without Task-Specific Samples and Model Retraining},\nauthor={Yudong Gao and Honglong Chen and Peng Sun and Zhe Li and Junjian Li and Huajie Shao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TJ6tVNt6Y4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2741029, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11046146924775877828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "upc.edu.cn;upc.edu.cn;hnu.edu.cn;upc.edu.cn;upc.edu.cn;wm.edu", "author_num": 6, "aff_unique_index": "0;0;1;0;0;2", "aff_unique_norm": "China University of Petroleum;Hunan University;College of William and Mary", "aff_unique_dep": ";;", "aff_unique_url": "http://www.cup.edu.cn;http://www.hunu.edu.cn/;https://www.wm.edu", "aff_unique_abbr": "CUP;HNU;WM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "diff History for Neural Language Agents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33977", "id": "TJCUrzhbiH", "proceeding": "https://proceedings.mlr.press/v235/piterbarg24a.html", "pdf": "https://openreview.net/pdf?id=TJCUrzhbiH", "openreview": "https://openreview.net/forum?id=TJCUrzhbiH", "author_site": "Ulyana Piterbarg, Lerrel Pinto, Rob Fergus", "tldr": "", "abstract": "Neural Language Models (LMs) offer an exciting solution for general-purpose embodied control. However, a key technical issue arises when using an LM-based controller: environment observations must be converted to text, which, coupled with history, results in long and verbose textual prompts. As a result, prior work in LM agents is limited to restricted domains with small observation size as well as minimal needs for interaction history or instruction finetuning. In this paper, we introduce diff history, a simple and highly effective solution to these issues. 
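A minimal sketch of this observation encoding, using Python's difflib as a stand-in for the Unix diff command described next (the NetHack-style strings are invented for illustration):

```python
import difflib

def diff_obs(prev_obs: str, obs: str) -> str:
    """Replace a full text observation with its line-level diff against the previous one."""
    delta = difflib.unified_diff(prev_obs.splitlines(), obs.splitlines(), lineterm="", n=0)
    return "\n".join(delta)

prev = "HP: 14/14\nYou see here a door.\nGold: 0"
curr = "HP: 12/14\nYou see here a door.\nGold: 0"
print(diff_obs(prev, curr))  # only the changed HP line (plus hunk headers) remains
```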
By applying the Unix diff command on consecutive text observations in the interaction histories used to prompt LM policies, we can both abstract away redundant information and focus the content of textual inputs on the salient changes in the environment. On NetHack, an unsolved video game that requires long-horizon reasoning for decision-making, LMs tuned with diff history match state-of-the-art performance for neural agents while needing 1800X fewer training examples compared to prior work. Even on the simpler BabyAI-Text environment with concise text observations, we find that although diff history increases the length of prompts, the representation it provides offers a 25% improvement in the efficiency of low-sample instruction finetuning. Further, we show that diff history scales favorably across different finetuning dataset sizes. We open-source our code and data to https://diffhistory.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ulyana Piterbarg;Lerrel Pinto;Rob Fergus", "authorids": "~Ulyana_Piterbarg1;~Lerrel_Pinto1;~Rob_Fergus1", "gender": "F;M;M", "homepage": "https://upiterbarg.github.io/;https://www.lerrelpinto.com/;http://cs.nyu.edu/fergus/", "dblp": "284/4477;168/8304;77/3763", "google_scholar": ";pmVPj94AAAAJ;https://scholar.google.com.tw/citations?user=GgQ9GEkAAAAJ", "orcid": "0000-0002-8363-9648;;", "linkedin": ";;", "or_profile": "~Ulyana_Piterbarg1;~Lerrel_Pinto1;~Rob_Fergus1", "aff": "Microsoft Research;New York University;Google", "aff_domain": "research.microsoft.com;cs.nyu.edu;google.com", "position": "Intern;Assistant Professor;Research scientist", "bibtex": "@inproceedings{\npiterbarg2024diff,\ntitle={diff History for Neural Language Agents},\nauthor={Ulyana Piterbarg and Lerrel Pinto and Rob Fergus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TJCUrzhbiH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2484858, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-p13jHwUgs0J:scholar.google.com/&scioq=diff+History+for+Neural+Language+Agents&hl=en&as_sdt=0,19", "gs_version_total": 7, "email": "research.microsoft.com;cs.nyu.edu;google.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;New York University;Google", "aff_unique_dep": "Microsoft Research;;Google", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.nyu.edu;https://www.google.com", "aff_unique_abbr": "MSR;NYU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Hierarchical State Space Models for Continuous Sequence-to-Sequence Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33976", "id": "TK7xkOsXDu", "proceeding": "https://proceedings.mlr.press/v235/bhirangi24a.html", "pdf": "https://openreview.net/pdf?id=TK7xkOsXDu", "openreview": "https://openreview.net/forum?id=TK7xkOsXDu", "author_site": "Raunaq Bhirangi, Chenyu Wang, Venkatesh Pattabiraman, Carmel Majidi, Abhinav Gupta, Tess Hellebrekers, Lerrel Pinto", "tldr": "", "abstract": "Reasoning from sequences of raw sensory data is a ubiquitous problem across fields ranging from medical devices to robotics. These problems often involve using long sequences of raw sensor data (e.g. 
magnetometers, piezoresistors) to predict sequences of desirable physical quantities (e.g. force, inertial measurements). While classical approaches are powerful for locally-linear prediction problems, they often fall short when using real-world sensors. These sensors are typically non-linear, are affected by extraneous variables (e.g. vibration), and exhibit data-dependent drift. For many problems, the prediction task is exacerbated by small labeled datasets since obtaining ground-truth labels requires expensive equipment. In this work, we present Hierarchical State-Space models (HiSS), a conceptually simple, new technique for continuous sequential prediction. HiSS stacks structured state-space models on top of each other to create a temporal hierarchy. Across six real-world sensor datasets, from tactile-based state prediction to accelerometer-based inertial measurement, HiSS outperforms state-of-the-art sequence models such as causal Transformers, LSTMs, S4, and Mamba by at least 23% on MSE. Our experiments further indicate that HiSS demonstrates efficient scaling to smaller datasets and is compatible with existing data-filtering techniques. Code, datasets and videos can be found on https://hiss-csp.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Raunaq Bhirangi;Chenyu Wang;Venkatesh Pattabiraman;Carmel Majidi;Abhinav Gupta;Tess Hellebrekers;Lerrel Pinto", "authorids": "~Raunaq_Bhirangi1;cw4030@nyu.edu;~Venkatesh_Pattabiraman1;~Carmel_Majidi1;~Abhinav_Gupta1;~Tess_Hellebrekers2;~Lerrel_Pinto1", "gender": "M;;M;;M;;M", "homepage": "https://raunaqbhirangi.github.io;;https://notvenky.github.io/;;http://www.cs.cmu.edu/~abhinavg;https://tesshellebrekers.com;https://www.lerrelpinto.com/", "dblp": "266/4528;;;;36/7024-1;;168/8304", "google_scholar": "LUy4hkcAAAAJ;;ZuZ84d0AAAAJ;1LyndUsAAAAJ;https://scholar.google.com.tw/citations?user=bqL73OkAAAAJ;;pmVPj94AAAAJ", "orcid": ";;;;;;", "linkedin": "raunaq-bhirangi/;;venkatesh-pattabiraman/;;;;", "or_profile": "~Raunaq_Bhirangi1;cw4030@nyu.edu;~Venkatesh_Pattabiraman1;~Carmel_Majidi1;~Abhinav_Gupta1;~Tess_Hellebrekers2;~Lerrel_Pinto1", "aff": "Carnegie Mellon University;;New York University;Carnegie Mellon University;Carnegie Mellon University;Meta;New York University", "aff_domain": "cmu.edu;;nyu.edu;cmu.edu;cmu.edu;ai.meta.com;cs.nyu.edu", "position": "PhD student;;MS student;Full Professor;Full Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nbhirangi2024hierarchical,\ntitle={Hierarchical State Space Models for Continuous Sequence-to-Sequence Modeling},\nauthor={Raunaq Bhirangi and Chenyu Wang and Venkatesh Pattabiraman and Carmel Majidi and Abhinav Gupta and Tess Hellebrekers and Lerrel Pinto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TK7xkOsXDu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6335416, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3374268239398117361&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cmu.edu;;nyu.edu;cmu.edu;cmu.edu;ai.meta.com;cs.nyu.edu", "author_num": 7, "aff_unique_index": "0;1;0;0;2;1", "aff_unique_norm": "Carnegie Mellon University;New York University;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://www.nyu.edu;https://meta.com", "aff_unique_abbr": "CMU;NYU;Meta", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Tandem Transformers for Inference Efficient LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33975", "id": "TN3fi7dwPo", "proceeding": "https://proceedings.mlr.press/v235/s24a.html", "pdf": "https://openreview.net/pdf?id=TN3fi7dwPo", "openreview": "https://openreview.net/forum?id=TN3fi7dwPo", "author_site": "Aishwarya P S, Pranav Nair, Yashas Samaga, Toby Boyd, Sanjiv Kumar, Prateek Jain, Praneeth Kumar Netrapalli", "tldr": "", "abstract": "The autoregressive nature of conventional large language models (LLMs) inherently limits inference speed, as tokens are generated sequentially. While speculative (Leviathan et al., 2023) and parallel (Stern et al., 2018) decoding techniques attempt to mitigate this, they face limitations: either relying on less accurate smaller models for generation or failing to fully leverage the base LLM's representations. We introduce a novel architecture, Tandem transformers, to address these issues. This architecture uniquely combines (1) a small autoregressive model and (2) a large model operating in block mode (processing multiple tokens simultaneously). The small model's predictive accuracy is substantially enhanced by granting it attention to the large model's richer representations. On the PaLM2 pretraining dataset, a tandem of PaLM2-Bison and PaLM2-Gecko demonstrates a 3.3% improvement in next-token prediction accuracy over a standalone PaLM2-Gecko, offering a 1.16x speedup compared to a PaLM2-Otter model with comparable downstream performance. We further incorporate the Tandem model within the speculative decoding (SPEED) framework where the large model validates tokens from the small model. 
This ensures that the tandem of PaLM2-Bison and PaLM2-Gecko achieves substantial speedup (around 1.14x faster than using vanilla PaLM2-Gecko in SPEED) while maintaining identical downstream task accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aishwarya P S;Pranav Ajit Nair;Yashas Samaga B L;Toby James Boyd;Sanjiv Kumar;Prateek Jain;Praneeth Netrapalli", "authorids": "~Aishwarya_P_S1;~Pranav_Ajit_Nair1;~Yashas_Samaga_B_L2;~Toby_James_Boyd1;~Sanjiv_Kumar1;~Prateek_Jain1;~Praneeth_Netrapalli1", "gender": "F;;M;M;;M;M", "homepage": ";;https://github.com/yashassamaga;https://www.linkedin.com/in/tobyjboyd/;http://www.sanjivk.com/;http://prateekjain.org;http://praneethnetrapalli.org/", "dblp": ";;;;;https://dblp.uni-trier.de/pers/j/Jain_0002:Prateek.html;http://dblp.uni-trier.de/pers/hd/n/Netrapalli:Praneeth", "google_scholar": ";;0Hq70tUAAAAJ;;https://scholar.google.com/citations?hl=en;qYhRbJoAAAAJ;https://scholar.google.co.in/citations?user=mim8FQkAAAAJ", "orcid": ";;;;;;", "linkedin": "aishwarya-ps/;;https://in.linkedin.com/in/yashassamaga;;;;", "or_profile": "~Aishwarya_P_S1;~Pranav_Ajit_Nair1;~Yashas_Samaga_B_L2;~Toby_James_Boyd1;~Sanjiv_Kumar1;~Prateek_Jain1;~Praneeth_Netrapalli1", "aff": "Google;;Google DeepMind;;Google;Google;Google", "aff_domain": "google.com;;google.com;;google.com;google.com;google.com", "position": "Researcher;;Researcher;;Research Scientist;Researcher;Research Scientist", "bibtex": "@inproceedings{\ns2024tandem,\ntitle={Tandem Transformers for Inference Efficient {LLM}s},\nauthor={Aishwarya P S and Pranav Ajit Nair and Yashas Samaga B L and Toby James Boyd and Sanjiv Kumar and Prateek Jain and Praneeth Netrapalli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TN3fi7dwPo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 412273, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4925206433264599427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "google.com;;google.com;;google.com;google.com;google.com", "author_num": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Short-Long Convolutions Help Hardware-Efficient Linear Attention to Focus on Long Sequences", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33974", "id": "TRrXkVdhwi", "proceeding": "https://proceedings.mlr.press/v235/liu24ak.html", "pdf": "https://openreview.net/pdf?id=TRrXkVdhwi", "openreview": "https://openreview.net/forum?id=TRrXkVdhwi", "author_site": "Zicheng Liu, Siyuan Li, Li Wang, Zedong Wang, Yunfan Liu, Stan Z Li", "tldr": "", "abstract": "To mitigate the computational complexity of the self-attention mechanism on long sequences, linear attention utilizes computation tricks to achieve linear complexity, while state space models (SSMs) popularize a favourable practice of using a non-data-dependent memory pattern, *i.e.,* emphasizing the near and neglecting the distant, to process sequences. Recent studies have shown the benefits of combining the two as one. 
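The linear-complexity "computation trick" referred to here is, in its simplest non-causal form, a reordering of the attention matmul (the feature map and sizes below are illustrative choices):

```python
import numpy as np

n, d = 4096, 64
rng = np.random.default_rng(0)
Q, K, V = rng.normal(size=(3, n, d))
phi = lambda X: np.maximum(X, 0.0) + 1e-6  # a positive feature map (one common choice)

# Softmax attention materializes an (n, n) matrix: O(n^2 d) time, O(n^2) memory.
# Associativity lets linear attention avoid it: phi(Q) @ (phi(K)^T V) is O(n d^2).
S = phi(K).T @ V               # (d, d) key-value summary
z = phi(K).sum(axis=0)         # (d,) normalizer accumulator
out = (phi(Q) @ S) / (phi(Q) @ z)[:, None]
```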
However, the efficiency of linear attention remains only at the theoretical level in a causal setting, and SSMs require various designed constraints to operate effectively on specific data. Therefore, in order to unveil the true power of the hybrid design, the following two issues need to be addressed: (1) hardware-efficient implementation for linear attention and (2) stabilization of SSMs. To achieve this, we leverage the thought of tiling and hierarchy to propose CHELA (short-long Convolutions with Hardware-Efficient Linear Attention), which replaces SSMs with short-long convolutions and implements linear attention in a divide-and-conquer manner. This approach enjoys global abstraction and data-dependent selection from stable SSM and linear attention while maintaining real linear complexity. Our comprehensive experiments on the Long Range Arena benchmark and language modeling tasks demonstrate the effectiveness of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zicheng Liu;Siyuan Li;Li Wang;Zedong Wang;Yunfan Liu;Stan Z. Li", "authorids": "~Zicheng_Liu2;~Siyuan_Li6;~Li_Wang34;~Zedong_Wang1;~Yunfan_Liu2;~Stan_Z._Li2", "gender": "M;M;M;M;M;M", "homepage": ";https://lupin1998.github.io/;https://github.com/lwang2070;https://jacky1128.github.io;https://github.com/XYxiyang;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "l/ZichengLiu-6;63/9705-2;;179/8811.html;170/8550-2;l/StanZLi", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0001-6806-2468;;0009-0000-0112-0491;0009-0002-1639-5855;", "linkedin": ";https://www.linkedin.cn/incareer/in/siyuan-li-lupin1998/;;;;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Zicheng_Liu2;~Siyuan_Li6;~Li_Wang34;~Zedong_Wang1;~Yunfan_Liu2;~Stan_Z._Li1", "aff": "Zhejiang University;Alibaba Group;IntSig;Westlake University;Zhejiang University;Westlake University", "aff_domain": "zju.edu.cn;alibaba-inc.com;intsig.net;westlake.edu;zju.edu.cn;westlake.edu.cn", "position": "PhD student;Intern;Researcher;Intern;PhD student;Chair Professor", "bibtex": "@inproceedings{\nliu2024shortlong,\ntitle={Short-Long Convolutions Help Hardware-Efficient Linear Attention to Focus on Long Sequences},\nauthor={Zicheng Liu and Siyuan Li and Li Wang and Zedong Wang and Yunfan Liu and Stan Z. 
Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TRrXkVdhwi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 441415, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5730061270701194740&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "zju.edu.cn;alibaba-inc.com;intsig.net;westlake.edu;zju.edu.cn;westlake.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;0;3", "aff_unique_norm": "Zhejiang University;Alibaba Group;INTSIG;Westlake University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.alibaba.com;;https://www.westlake.edu.cn", "aff_unique_abbr": "ZJU;Alibaba;;WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "OSN: Infinite Representations of Dynamic 3D Scenes from Monocular Videos", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33973", "id": "TTYVG17wfc", "proceeding": "https://proceedings.mlr.press/v235/song24d.html", "pdf": "https://openreview.net/pdf?id=TTYVG17wfc", "openreview": "https://openreview.net/forum?id=TTYVG17wfc", "author_site": "Ziyang Song, Jinxi Li, Bo Yang", "tldr": "", "abstract": "It has long been challenging to recover the underlying dynamic 3D scene representations from a monocular RGB video. Existing works formulate this problem into finding a single most plausible solution by adding various constraints such as depth priors and strong geometry constraints, ignoring the fact that there could be infinitely many 3D scene representations corresponding to a single dynamic video. In this paper, we aim to learn all plausible 3D scene configurations that match the input video, instead of just inferring a specific one. To achieve this ambitious goal, we introduce a new framework, called OSN. The key to our approach is a simple yet innovative object scale network together with a joint optimization module to learn an accurate scale range for every dynamic 3D object. This allows us to sample as many faithful 3D scene configurations as possible. Extensive experiments show that our method surpasses all baselines and achieves superior accuracy in dynamic novel view synthesis on multiple synthetic and real-world datasets. 
Most notably, our method demonstrates a clear advantage in learning fine-grained 3D scene geometry.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyang Song;Jinxi Li;Bo Yang", "authorids": "~Ziyang_Song1;~Jinxi_Li2;~Bo_Yang7", "gender": "M;M;M", "homepage": "https://szy-young.github.io/;;https://yang7879.github.io/", "dblp": ";198/4279,;46/999-27", "google_scholar": "7YcpCEwAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0002-2419-4140", "linkedin": ";jinxi-leo-li;", "or_profile": "~Ziyang_Song1;~Jinxi_Li2;~Bo_Yang7", "aff": "The Hong Kong Polytechnic University;Hong Kong Polytechnic University;The Hong Kong Polytechnic University", "aff_domain": "polyu.edu.hk;polyu.edu.hk;polyu.edu.hk", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsong2024osn,\ntitle={{OSN}: Infinite Representations of Dynamic 3D Scenes from Monocular Videos},\nauthor={Ziyang Song and Jinxi Li and Bo Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TTYVG17wfc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7826680, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Y_-d0aUlNrgJ:scholar.google.com/&scioq=OSN:+Infinite+Representations+of+Dynamic+3D+Scenes+from+Monocular+Videos&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "polyu.edu.hk;polyu.edu.hk;polyu.edu.hk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Hong Kong Polytechnic University", "aff_unique_dep": "", "aff_unique_url": "https://www.polyu.edu.hk", "aff_unique_abbr": "PolyU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Multi-Agent Reinforcement Learning with Hierarchical Coordination for Emergency Responder Stationing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33972", "id": "TTZXl9WYFF", "proceeding": "https://proceedings.mlr.press/v235/sivagnanam24a.html", "pdf": "https://openreview.net/pdf?id=TTZXl9WYFF", "openreview": "https://openreview.net/forum?id=TTZXl9WYFF", "author_site": "Amutheezan Sivagnanam, Ava Pettet, Hunter Lee, Ayan Mukhopadhyay, Abhishek Dubey, Aron Laszka", "tldr": "", "abstract": "An emergency responder management (ERM) system dispatches responders, such as ambulances, when it receives requests for medical aid. ERM systems can also proactively reposition responders between predesignated waiting locations to cover any gaps that arise due to the prior dispatch of responders or significant changes in the distribution of anticipated requests. Optimal repositioning is computationally challenging due to the exponential number of ways to allocate responders between locations and the uncertainty in future requests. The state-of-the-art approach in proactive repositioning is a hierarchical approach based on spatial decomposition and online Monte Carlo tree search, which may require minutes of computation for each decision in a domain where seconds can save lives. We address the issue of long decision times by introducing a novel reinforcement learning (RL) approach, based on the same hierarchical decomposition, but replacing online search with learning. 
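One simple instance of the continuous-to-discrete allocation mapping listed below as technique (3), using largest-remainder rounding (this particular scheme is an illustrative assumption, not necessarily the paper's):

```python
import math

def allocate(cont, total):
    """Round a continuous allocation over waiting locations to integers summing to `total`."""
    scaled = [c / sum(cont) * total for c in cont]
    base = [math.floor(s) for s in scaled]
    remainder = total - sum(base)
    # hand the leftover units to the locations with the largest fractional parts
    order = sorted(range(len(cont)), key=lambda i: scaled[i] - base[i], reverse=True)
    for i in order[:remainder]:
        base[i] += 1
    return base

print(allocate([0.5, 1.7, 2.8], total=5))  # -> [0, 2, 3]
```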
To address the computational challenges posed by large, variable-dimensional, and discrete state and action spaces, we propose: (1) actor-critic based agents that incorporate transformers to handle variable-dimensional states and actions, (2) projections to fixed-dimensional observations to handle complex states, and (3) combinatorial techniques to map continuous actions to discrete allocations. We evaluate our approach using real-world data from two U.S. cities, Nashville, TN and Seattle, WA. Our experiments show that compared to the state of the art, our approach reduces computation time per decision by three orders of magnitude, while also slightly reducing average ambulance response time by 5 seconds.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amutheezan Sivagnanam;Ava Pettet;Hunter Lee;Ayan Mukhopadhyay;Abhishek Dubey;Aron Laszka", "authorids": "~Amutheezan_Sivagnanam1;~Ava_Pettet1;~Hunter_Lee1;~Ayan_Mukhopadhyay1;~Abhishek_Dubey1;~Aron_Laszka1", "gender": "M;;;M;Not Specified;F", "homepage": "https://amutheezan.com;;https://ayanmukhopadhyay.github.io/;http://engineering.vanderbilt.edu/bio/abhishek-dubey;https://aronlaszka.com/;https://avapettet.com/", "dblp": "262/5961.html;;180/1429;67/525.html;54/10042;", "google_scholar": "1m6U1cQAAAAJ;;1lNZTdMAAAAJ;5J3w9OoAAAAJ;ckHkkjgAAAAJ;", "orcid": "0000-0002-4295-529X;;0000-0002-8355-0950;;0000-0001-7400-2357;", "linkedin": "amutheezansivagnanam/;hunter-lee-750937179/;;;;", "or_profile": "~Amutheezan_Sivagnanam1;~Hunter_Lee1;~Ayan_Mukhopadhyay1;~Abhishek_Dubey1;~Aron_Laszka1;~Geoffrey_Pettet1", "aff": "Pennsylvania State University;Vanderbilt University;Vanderbilt University;Vanderbilt University;Pennsylvania State University;", "aff_domain": "psu.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;psu.edu;", "position": "PhD student;Undergrad student;Researcher;Associate Professor;Assistant Professor;", "bibtex": "@inproceedings{\nsivagnanam2024multiagent,\ntitle={Multi-Agent Reinforcement Learning with Hierarchical Coordination for Emergency Responder Stationing},\nauthor={Amutheezan Sivagnanam and Ava Pettet and Hunter Lee and Ayan Mukhopadhyay and Abhishek Dubey and Aron Laszka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TTZXl9WYFF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1162215, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11619272503425937496&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "psu.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;psu.edu;", "author_num": 6, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Pennsylvania State University;Vanderbilt University", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.vanderbilt.edu", "aff_unique_abbr": "PSU;Vanderbilt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Inverse-Variance Weighting for Estimation of Heterogeneous Treatment Effects", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33971", "id": "TUKOklS3gg", "proceeding": "https://proceedings.mlr.press/v235/fisher24a.html", "pdf": "https://openreview.net/pdf?id=TUKOklS3gg", "openreview": "https://openreview.net/forum?id=TUKOklS3gg", "tldr": "", "abstract": "Many methods for estimating conditional average 
treatment effects (CATEs) can be expressed as weighted pseudo-outcome regressions (PORs). Previous comparisons of POR techniques have paid careful attention to the choice of pseudo-outcome transformation. However, we argue that the dominant driver of performance is actually the choice of weights. For example, we point out that R-Learning implicitly performs a POR with inverse-variance weights (IVWs). In the CATE setting, IVWs mitigate the instability associated with inverse-propensity weights, and lead to convenient simplifications of bias terms. We demonstrate the superior performance of IVWs in simulations, and derive convergence rates for IVWs that are, to our knowledge, the fastest yet shown without assuming knowledge of the covariate distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron Fisher", "authorids": "~Aaron_Fisher1", "gender": "M", "homepage": "https://aaronjfisher.github.io/", "dblp": "", "google_scholar": "Ubjcc-IAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Aaron_Fisher1", "aff": "Foundation Medicine Inc", "aff_domain": "foundationmedicine.com", "position": "Researcher", "bibtex": "@inproceedings{\nfisher2024inversevariance,\ntitle={Inverse-Variance Weighting for Estimation of Heterogeneous Treatment Effects},\nauthor={Aaron Fisher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TUKOklS3gg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 781675, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12448067921629111352&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "foundationmedicine.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Foundation Medicine", "aff_unique_dep": "", "aff_unique_url": "https://www.foundationmedicine.com", "aff_unique_abbr": "FMI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "A Theory of Non-Linear Feature Learning with One Gradient Step in Two-Layer Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33970", "id": "TWu1fzFJm0", "proceeding": "https://proceedings.mlr.press/v235/moniri24a.html", "pdf": "https://openreview.net/pdf?id=TWu1fzFJm0", "openreview": "https://openreview.net/forum?id=TWu1fzFJm0", "author_site": "Behrad Moniri, Donghwan Lee, Hamed Hassani, Edgar Dobriban", "tldr": "", "abstract": "Feature learning is thought to be one of the fundamental reasons for the success of deep neural networks. It is rigorously known that in two-layer fully-connected neural networks under certain conditions, one step of gradient descent on the first layer can lead to feature learning; characterized by the appearance of a separated rank-one component---spike---in the spectrum of the feature matrix. However, with a constant gradient descent step size, this spike only carries information from the linear component of the target function and therefore learning non-linear components is impossible. We show that with a learning rate that grows with the sample size, such training in fact introduces multiple rank-one components, each corresponding to a specific polynomial feature. We further prove that the limiting large-dimensional and large sample training and test errors of the updated neural networks are fully characterized by these spikes. 
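A rough numerical sketch of the phenomenon (the toy dimensions, target function, and sample-size-dependent learning rate are illustrative assumptions):

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, m = 4000, 500, 600                  # samples, input dim, hidden width
X = rng.normal(size=(n, d)) / np.sqrt(d)
y = X[:, 0] + X[:, 0] * X[:, 1]           # target with linear and quadratic parts

W = rng.normal(size=(m, d)) / np.sqrt(d)  # first-layer weights
a = rng.choice([-1.0, 1.0], size=m) / np.sqrt(m)

# One step of gradient descent on the first layer (squared loss, ReLU activations).
H = np.maximum(X @ W.T, 0.0)
err = H @ a - y
G = (err[:, None] * (X @ W.T > 0) * a).T @ X / n
W1 = W - np.sqrt(n) * G                   # learning rate growing with the sample size

s = np.linalg.svd(W1, compute_uv=False)
print(s[:6])  # look for leading singular values separated from the bulk (the spikes)
```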
By precisely analyzing the improvement in the training and test errors, we demonstrate that these non-linear features can enhance learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Behrad Moniri;Donghwan Lee;Hamed Hassani;Edgar Dobriban", "authorids": "~Behrad_Moniri1;~Donghwan_Lee5;~Hamed_Hassani2;~Edgar_Dobriban2", "gender": "M;M;M;", "homepage": "https://bemoniri.com/;;https://www.seas.upenn.edu/~hassani/;https://statistics.wharton.upenn.edu/profile/dobriban/", "dblp": ";;73/4984;99/11269", "google_scholar": "HvsqQ9kAAAAJ;https://scholar.google.com/citations?hl=en;;aGvH4yMAAAAJ", "orcid": ";;;", "linkedin": ";;;edgar-dobriban/", "or_profile": "~Behrad_Moniri1;~Donghwan_Lee5;~Hamed_Hassani2;~Edgar_Dobriban2", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;The Wharton School, University of Pennsylvania", "aff_domain": "wharton.upenn.edu;upenn.edu;upenn.edu;wharton.upenn.edu", "position": "MS student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nmoniri2024a,\ntitle={A Theory of Non-Linear Feature Learning with One Gradient Step in Two-Layer Neural Networks},\nauthor={Behrad Moniri and Donghwan Lee and Hamed Hassani and Edgar Dobriban},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TWu1fzFJm0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1819625, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=83613241973691338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "wharton.upenn.edu;upenn.edu;upenn.edu;wharton.upenn.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Data Engineering for Scaling Language Models to 128K Context", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33969", "id": "TaAqeo7lUh", "proceeding": "https://proceedings.mlr.press/v235/fu24d.html", "pdf": "https://openreview.net/pdf?id=TaAqeo7lUh", "openreview": "https://openreview.net/forum?id=TaAqeo7lUh", "author_site": "Yao Fu, Rameswar Panda, Xinyao Niu, Xiang Yue, Hannaneh Hajishirzi, Yoon Kim, Hao Peng", "tldr": "", "abstract": "We study the continual pretraining recipe for scaling language models' context lengths to 128K, with a focus on data engineering. We hypothesize that long context modeling, in particular *the ability to utilize information at arbitrary input locations*, is a capability that is mostly already acquired through large-scale pretraining, and that this capability can be readily extended to contexts substantially longer than seen during training (e.g., 4K to 128K) through lightweight continual pretraining on an appropriate data mixture. We investigate the *quantity* and *quality* of the data for continual pretraining: (1) for quantity, we show that 500 million to 5 billion tokens are enough to enable the model to retrieve information anywhere within the 128K context; (2) for quality, our results equally emphasize *domain balance* and *length upsampling*.
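The sketch below shows what these two levers can look like in a data pipeline; the function name, length threshold, and boost factor are assumptions for illustration, not the paper's recipe.

```python
# Hedged sketch of the two data-engineering ideas just named: preserve the
# source corpus's per-domain mixture ("domain balance") while upsampling long
# documents *within* each domain ("length upsampling"). All parameters here
# are illustrative assumptions.
import random
from collections import defaultdict

def build_mixture(docs, n_samples, long_threshold=32_000, long_boost=4.0, seed=0):
    """docs: dicts with 'domain' and 'length' (in tokens). Returns a sampled list."""
    rng = random.Random(seed)
    by_domain = defaultdict(list)
    for doc in docs:
        by_domain[doc["domain"]].append(doc)

    mixture = []
    for domain, pool in by_domain.items():
        # Domain balance: each domain keeps its share of the source corpus,
        # rather than letting one long-document domain (e.g. books) dominate.
        quota = round(n_samples * len(pool) / len(docs))
        # Length upsampling: long documents get a larger sampling weight.
        weights = [long_boost if d["length"] >= long_threshold else 1.0 for d in pool]
        mixture.extend(rng.choices(pool, weights=weights, k=quota))
    return mixture

corpus = (
    [{"domain": "web",   "length": random.randint(500, 8_000)}     for _ in range(700)]
    + [{"domain": "code",  "length": random.randint(500, 64_000)}   for _ in range(200)]
    + [{"domain": "books", "length": random.randint(8_000, 120_000)} for _ in range(100)]
)
mix = build_mixture(corpus, n_samples=500)
```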
Concretely, na\u00efvely upsampling longer data on certain domains like books, a common practice of existing work, gives suboptimal performance; a balanced domain mixture is equally important. We demonstrate that continual pretraining of the full model on 1B-5B tokens of such data is an effective and affordable strategy for scaling the context length of language models to 128K. Our recipe outperforms strong open-source long-context models and closes the gap to frontier models like GPT-4 128K.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yao Fu;Rameswar Panda;Xinyao Niu;Xiang Yue;Hannaneh Hajishirzi;Yoon Kim;Hao Peng", "authorids": "~Yao_Fu3;~Rameswar_Panda1;~Xinyao_Niu1;~Xiang_Yue1;~Hannaneh_Hajishirzi1;~Yoon_Kim1;~Hao_Peng4", "gender": "M;M;;;F;;", "homepage": "https://franxyao.github.io/;https://rpand002.github.io/;;;https://homes.cs.washington.edu/~hannaneh/;https://people.csail.mit.edu/yoonkim/;", "dblp": ";126/0986;;;52/1296;;", "google_scholar": "liSP4cEAAAAJ;_ySuu6gAAAAJ;;;LOV6_WIAAAAJ;n_ts4eYAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yao_Fu3;~Rameswar_Panda1;~Xinyao_Niu1;~Xiang_Yue1;~Hannaneh_Hajishirzi1;~Yoon_Kim1;~Hao_Peng4", "aff": "University of Edinburgh;MIT-IBM Watson AI Lab;;;University of Washington;Massachusetts Institute of Technology;", "aff_domain": "ed.ac.uk;ibm.com;;;uw.edu;mit.edu;", "position": "PhD student;Research Scientist;;;Associate Professor;Assistant Professor;", "bibtex": "@inproceedings{\nfu2024data,\ntitle={Data Engineering for Scaling Language Models to 128K Context},\nauthor={Yao Fu and Rameswar Panda and Xinyao Niu and Xiang Yue and Hannaneh Hajishirzi and Yoon Kim and Hao Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TaAqeo7lUh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2607529, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2830289917074254945&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "ed.ac.uk;ibm.com;;;uw.edu;mit.edu;", "author_num": 7, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Edinburgh;Massachusetts Institute of Technology;University of Washington", "aff_unique_dep": ";IBM Watson AI Lab;", "aff_unique_url": "https://www.ed.ac.uk;https://www.mitibmwatsonailab.org;https://www.washington.edu", "aff_unique_abbr": "Edinburgh;MIT-IBM AI Lab;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Generalization Bounds for Causal Regression: Insights, Guarantees and Sensitivity Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33968", "id": "TejqrQBvll", "proceeding": "https://proceedings.mlr.press/v235/csillag24a.html", "pdf": "https://openreview.net/pdf?id=TejqrQBvll", "openreview": "https://openreview.net/forum?id=TejqrQBvll", "author_site": "Daniel Csillag, Claudio Struchiner, Guilherme Goedert", "tldr": "", "abstract": "Many algorithms have been recently proposed for causal machine learning. Yet, there is little to no theory on their quality, especially considering finite samples. In this work, we propose a theory based on generalization bounds that provides such guarantees. 
By introducing a novel change-of-measure inequality, we are able to tightly bound the model loss in terms of the deviation of the treatment propensities over the population, which we show can be empirically limited. Our theory is fully rigorous and holds even in the face of hidden confounding and violations of positivity. We demonstrate our bounds on semi-synthetic and real data, showcasing their remarkable tightness and practical utility.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Csillag;Claudio Jose Struchiner;Guilherme Tegoni Goedert", "authorids": "~Daniel_Csillag1;~Claudio_Jose_Struchiner1;~Guilherme_Tegoni_Goedert1", "gender": "M;M;M", "homepage": "https://dccsillag.xyz;;https://gtgoedert.com", "dblp": "320/6802;;", "google_scholar": "8QEfviAAAAAJ;kVh9mCwAAAAJ;", "orcid": "0009-0009-9449-0496;0000-0003-2114-847X;0000-0002-4759-1296", "linkedin": ";;", "or_profile": "~Daniel_Csillag1;~Claudio_Jose_Struchiner1;~Guilherme_Tegoni_Goedert1", "aff": "Funda\u00e7\u00e3o Get\u00falio Vargas;Funda\u00e7\u00e3o Get\u00falio Vargas (FGV);Funda\u00e7\u00e3o Get\u00falio Vargas (FGV)", "aff_domain": "fgv.br;fgv.br;fgv.br", "position": "Undergrad student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\ncsillag2024generalization,\ntitle={Generalization Bounds for Causal Regression: Insights, Guarantees and Sensitivity Analysis},\nauthor={Daniel Csillag and Claudio Jose Struchiner and Guilherme Tegoni Goedert},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TejqrQBvll}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1014421, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14355712647754907926&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "fgv.br;fgv.br;fgv.br", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Funda\u00e7\u00e3o Get\u00falio Vargas", "aff_unique_dep": "", "aff_unique_url": "https://www.fgv.br", "aff_unique_abbr": "FGV", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Brazil" }, { "title": "LPGD: A General Framework for Backpropagation through Embedded Optimization Layers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33967", "id": "TfWKkSAziC", "proceeding": "https://proceedings.mlr.press/v235/paulus24a.html", "pdf": "https://openreview.net/pdf?id=TfWKkSAziC", "openreview": "https://openreview.net/forum?id=TfWKkSAziC", "author_site": "Anselm Paulus, Georg Martius, Vit Musil", "tldr": "", "abstract": "Embedding parameterized optimization problems as layers into machine learning architectures serves as a powerful inductive bias. Training such architectures with stochastic gradient descent requires care, as degenerate derivatives of the embedded optimization problem often render the gradients uninformative. We propose Lagrangian Proximal Gradient Descent (LPGD), a flexible framework for training architectures with embedded optimization layers that seamlessly integrates into automatic differentiation libraries. LPGD efficiently computes meaningful replacements of the degenerate optimization layer derivatives by re-running the forward solver oracle on a perturbed input. LPGD captures various previously proposed methods as special cases, while fostering deep links to traditional optimization methods. 
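In the spirit of the perturbed-forward-solve idea described above, here is a toy sketch of a surrogate gradient for a blackbox discrete solver layer; `solver` and `surrogate_grad` are hypothetical names, and this generic finite-difference construction is only an illustration of the idea, not the exact LPGD update.

```python
# Toy sketch: an argmax-style solver layer has zero derivative almost
# everywhere, so we replace the uninformative true derivative by a finite
# difference of two solver calls, the second on costs perturbed in the
# direction of the incoming gradient.
import numpy as np

def solver(c):
    """Forward oracle: select every item with a positive score (0/1 output)."""
    return (c > 0).astype(float)

def surrogate_grad(c, upstream_grad, tau=0.5):
    """Informative replacement for the degenerate derivative w.r.t. c."""
    y = solver(c)
    y_pert = solver(c - tau * upstream_grad)   # re-run the oracle on perturbed input
    return (y - y_pert) / tau

c = np.array([0.3, -0.2, 0.1])
target = np.array([1.0, 1.0, 0.0])             # desired solver output
g_out = solver(c) - target                     # d(0.5*||y - target||^2)/dy
print(surrogate_grad(c, g_out))                # descent direction that moves
                                               # the solver output toward target
```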
We theoretically analyze our method and demonstrate on historical and synthetic data that LPGD converges faster than gradient descent even in a differentiable setup.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anselm Paulus;Georg Martius;V\u00edt Musil", "authorids": "~Anselm_Paulus1;~Georg_Martius1;~V\u00edt_Musil1", "gender": ";M;M", "homepage": ";https://uni-tuebingen.de/de/264672;http://vejtek.matfyz.cz/", "dblp": "255/5245;47/2706;255/4994", "google_scholar": "njZL5CQAAAAJ;https://scholar.google.de/citations?user=b-JF-UIAAAAJ;https://scholar.google.cz/citations?user=hA1rlU4AAAAJ", "orcid": ";;0000-0001-6083-227X", "linkedin": ";;", "or_profile": "~Anselm_Paulus1;~Georg_Martius1;~V\u00edt_Musil1", "aff": "Facebook AI Research;Max Planck Institute for Intelligent Systems;Masaryk University", "aff_domain": "meta.com;tuebingen.mpg.de;muni.cz", "position": "Intern;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\npaulus2024lpgd,\ntitle={{LPGD}: A General Framework for Backpropagation through Embedded Optimization Layers},\nauthor={Anselm Paulus and Georg Martius and V{\\'\\i}t Musil},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TfWKkSAziC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1207624, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11549529648687780489&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "meta.com;tuebingen.mpg.de;muni.cz", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Meta;Max Planck Institute for Intelligent Systems;Masaryk University", "aff_unique_dep": "Facebook AI Research;Intelligent Systems;", "aff_unique_url": "https://research.facebook.com;https://www.mpi-is.mpg.de;https://www.muni.cz", "aff_unique_abbr": "FAIR;MPI-IS;MU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;Germany;Czech Republic" }, { "title": "Testing the Feasibility of Linear Programs with Bandit Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33966", "id": "TfwGtfPkhV", "proceeding": "https://proceedings.mlr.press/v235/gangrade24a.html", "pdf": "https://openreview.net/pdf?id=TfwGtfPkhV", "openreview": "https://openreview.net/forum?id=TfwGtfPkhV", "author_site": "Aditya Gangrade, Aditya Gopalan, Venkatesh Saligrama, Clay Scott", "tldr": "", "abstract": "While the recent literature has seen a surge in the study of constrained bandit problems, all existing methods for these begin by assuming the feasibility of the underlying problem. We initiate the study of testing such feasibility assumptions, and in particular address the problem in the linear bandit setting, thus characterising the costs of feasibility testing for an unknown linear program using bandit feedback. Concretely, we test if $\\exists x: Ax \\ge 0$ for an unknown $A \\in \\mathbb{R}^{m \\times d}$, by playing a sequence of actions $x_t\\in \\mathbb{R}^d$, and observing $Ax_t + \\mathrm{noise}$ in response. By identifying the hypothesis as determining the sign of the value of a minimax game, we construct a novel test based on low-regret algorithms and a nonasymptotic law of iterated logarithms. 
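A minimal simulation of the observation model just described; the coordinate-playing strategy and plug-in feasibility check below are naive stand-ins for illustration only, not the low-regret test constructed in the paper.

```python
# Simulate bandit feedback Ax_t + noise for an unknown A, then run a crude
# plug-in check of "exists x : A_hat x >= 0". The sign of the estimated game
# value min_i (A_hat x)_i over the unit sphere indicates feasibility.
import numpy as np

rng = np.random.default_rng(1)
m, d, T, sigma = 5, 3, 20_000, 1.0
A = rng.standard_normal((m, d))                     # unknown to the learner

def play(x):
    return A @ x + sigma * rng.standard_normal(m)   # noisy bandit feedback

# Naive design: estimate A column by column from coordinate actions.
A_hat = np.zeros((m, d))
for j in range(d):
    e = np.zeros(d)
    e[j] = 1.0
    A_hat[:, j] = np.mean([play(e) for _ in range(T // d)], axis=0)

# Maximize min_i (A_hat x)_i over the unit sphere by subgradient ascent;
# a clearly positive value certifies (approximate) strict feasibility.
x = rng.standard_normal(d)
x /= np.linalg.norm(x)
for _ in range(2_000):
    i = np.argmin(A_hat @ x)        # active (worst) constraint
    x += 0.01 * A_hat[i]            # ascend along its supergradient
    x /= np.linalg.norm(x)
print("estimated signal level:", (A_hat @ x).min())
```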
We prove that this test is reliable, and adapts to the `signal level,' $\\Gamma,$ of any instance, with mean sample costs scaling as $\\widetilde{O}(d^2/\\Gamma^2)$. We complement this by a minimax lower bound of $\\Omega(d/\\Gamma^2)$ for sample costs of reliable tests, dominating prior asymptotic lower bounds by capturing the dependence on $d$, and thus elucidating a basic insight missing in the extant literature on such problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aditya Gangrade;Aditya Gopalan;Venkatesh Saligrama;Clayton Scott", "authorids": "~Aditya_Gangrade1;~Aditya_Gopalan1;~Venkatesh_Saligrama1;~Clayton_Scott1", "gender": ";M;;", "homepage": ";https://ece.iisc.ac.in/~aditya/;https://venkatesh-saligrama.github.io/;", "dblp": ";90/9826;67/4721;", "google_scholar": ";dM5_1NsAAAAJ;S4z3uzMAAAAJ;", "orcid": ";;0000-0002-0675-2268;", "linkedin": ";;venkatesh-saligrama-91175a16/;", "or_profile": "~Aditya_Gangrade1;~Aditya_Gopalan1;~Venkatesh_Saligrama1;~Clayton_Scott1", "aff": ";Indian Institute of Science;Boston University;", "aff_domain": ";iisc.ac.in;bu.edu;", "position": ";Associate Professor;Full Professor;", "bibtex": "@inproceedings{\ngangrade2024testing,\ntitle={Testing the Feasibility of Linear Programs with Bandit Feedback},\nauthor={Aditya Gangrade and Aditya Gopalan and Venkatesh Saligrama and Clayton Scott},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TfwGtfPkhV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 660659, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lzhBUg0Xpq0J:scholar.google.com/&scioq=Testing+the+Feasibility+of+Linear+Programs+with+Bandit+Feedback&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": ";iisc.ac.in;bu.edu;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Science;Boston University", "aff_unique_dep": ";", "aff_unique_url": "https://www.iisc.ac.in;https://www.bu.edu", "aff_unique_abbr": "IISc;BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "India;United States" }, { "title": "Position: LLMs Can\u2019t Plan, But Can Help Planning in LLM-Modulo Frameworks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33965", "id": "Th8JPEmH4z", "proceeding": "https://proceedings.mlr.press/v235/kambhampati24a.html", "pdf": "https://openreview.net/pdf?id=Th8JPEmH4z", "openreview": "https://openreview.net/forum?id=Th8JPEmH4z", "author_site": "Subbarao Kambhampati, Karthik Valmeekam, Lin Guan, Mudit Verma, Kaya Stechly, Siddhant Bhambri, Lucas Saldyt, Anil B Murthy", "tldr": "", "abstract": "We argue that auto-regressive LLMs cannot, by themselves, do planning or self-verification (which is after all a form of reasoning), and shed some light on the reasons for misunderstandings in the literature. We will also argue that LLMs should be viewed as universal approximate knowledge sources that have much more meaningful roles to play in planning/reasoning tasks beyond simple front-end/back-end format translators. We present a vision of LLM-Modulo Frameworks that combine the strengths of LLMs with external model-based verifiers in a tighter bi-directional interaction regime. We will show how the models driving the external verifiers themselves can be acquired with the help of LLMs. 
We will also argue that rather than simply pipelining LLMs and symbolic components, this LLM-Modulo Framework provides a better neuro-symbolic approach that offers tighter integration between LLMs and symbolic components, and allows extending the scope of model-based planning/reasoning regimes towards more flexible knowledge, problem and preference specifications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Subbarao Kambhampati;Karthik Valmeekam;Lin Guan;Mudit Verma;Kaya Stechly;Siddhant Bhambri;Lucas Paul Saldyt;Anil B Murthy", "authorids": "~Subbarao_Kambhampati1;~Karthik_Valmeekam1;~Lin_Guan1;~Mudit_Verma2;~Kaya_Stechly1;~Siddhant_Bhambri1;~Lucas_Paul_Saldyt1;~Anil_B_Murthy1", "gender": "M;M;M;M;;M;Not Specified;M", "homepage": "http://rakaposhi.eas.asu.edu;;https://guansuns.github.io/;https://famishedrover.github.io/;https://kstechly.github.io/;https://sbhambr1.github.io;https://github.com/LSaldyt;", "dblp": "k/SKambhampati;279/2957;;192/7474;;;;", "google_scholar": "yl3L07sAAAAJ;CrYLDt4AAAAJ;c1L_gZoAAAAJ;8TtypKwAAAAJ;BUT7cR0AAAAJ;E9I6GbwAAAAJ;wyEpF4wAAAAJ;lPCPmgYAAAAJ", "orcid": ";;;;;0000-0003-1182-4999;;", "linkedin": ";;lin-guan/;;kaya-stechly-327729236;siddhant-bhambri/;lsaldyt/;anil-b-m", "or_profile": "~Subbarao_Kambhampati1;~Karthik_Valmeekam1;~Lin_Guan1;~Mudit_Verma2;~Kaya_Stechly1;~Siddhant_Bhambri1;~Lucas_Paul_Saldyt1;~Anil_B_Murthy1", "aff": "Arizona State University;Arizona State University;Arizona State University;Arizona State University;Arizona State University;Microsoft;Arizona State University;Arizona State University", "aff_domain": "asu.edu;asu.edu;asu.edu;asu.edu;asu.edu;microsoft.com;asu.edu;asu.edu", "position": "Full Professor;PhD student;PhD student;PhD student;MS student;Intern;PhD student;PhD student", "bibtex": "@inproceedings{\nkambhampati2024position,\ntitle={Position: {LLM}s Can{\\textquoteright}t Plan, But Can Help Planning in {LLM}-Modulo Frameworks},\nauthor={Subbarao Kambhampati and Karthik Valmeekam and Lin Guan and Mudit Verma and Kaya Stechly and Siddhant Bhambri and Lucas Paul Saldyt and Anil B Murthy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Th8JPEmH4z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1888249, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17467729488139106777&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "asu.edu;asu.edu;asu.edu;asu.edu;asu.edu;microsoft.com;asu.edu;asu.edu", "author_num": 8, "aff_unique_index": "0;0;0;0;0;1;0;0", "aff_unique_norm": "Arizona State University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.asu.edu;https://www.microsoft.com", "aff_unique_abbr": "ASU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Harnessing the Power of Neural Operators with Automatically Encoded Conservation Laws", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33964", "id": "ToHkAg936Y", "proceeding": "https://proceedings.mlr.press/v235/liu24p.html", "pdf": "https://openreview.net/pdf?id=ToHkAg936Y", "openreview": "https://openreview.net/forum?id=ToHkAg936Y", "author_site": "Ning Liu, Yiming Fan, Xianyi Zeng, Milan Kl\u00f6wer, LU ZHANG, Yue Yu", "tldr": "", "abstract": "Neural 
operators (NOs) have emerged as effective tools for modeling complex physical systems in scientific machine learning. In NOs, a central characteristic is to learn the governing physical laws directly from data. In contrast to other machine learning applications, partial knowledge is often known a priori about the physical system at hand whereby quantities such as mass, energy and momentum are exactly conserved. Currently, NOs have to learn these conservation laws from data and can only approximately satisfy them due to finite training data and random noise. In this work, we introduce conservation law-encoded neural operators (clawNOs), a suite of NOs that endow inference with automatic satisfaction of such conservation laws. ClawNOs are built with a divergence-free prediction of the solution field, with which the continuity equation is automatically guaranteed. As a consequence, clawNOs are compliant with the most fundamental and ubiquitous conservation laws essential for correct physical consistency. As demonstrations, we consider a wide variety of scientific applications ranging from constitutive modeling of material deformation, incompressible fluid dynamics, to atmospheric simulation. ClawNOs significantly outperform the state-of-the-art NOs in learning efficacy, especially in small-data regimes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ning Liu;Yiming Fan;Xianyi Zeng;Milan Kl\u00f6wer;LU ZHANG;Yue Yu", "authorids": "~Ning_Liu6;~Yiming_Fan1;~Xianyi_Zeng1;~Milan_Kl\u00f6wer1;~LU_ZHANG18;~Yue_Yu3", "gender": ";M;Not Specified;M;Not Specified;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";LNQDvbkAAAAJ;2ttEAj8AAAAJ;;;", "orcid": ";0000-0002-4783-0699;;0000-0002-3920-4356;;", "linkedin": ";;;;whichis/;", "or_profile": "~Ning_Liu6;~Yiming_Fan1;~Xianyi_Zeng1;~Milan_Kl\u00f6wer1;~LU_ZHANG18;~Yue_Yu3", "aff": ";Lehigh University;Lehigh University;Massachusetts Institute of Technology;Lehigh University;", "aff_domain": ";lehigh.edu;lehigh.edu;mit.edu;lehigh.edu;", "position": ";PhD student;Assistant Professor;Postdoc;PhD student;", "bibtex": "@inproceedings{\nliu2024harnessing,\ntitle={Harnessing the Power of Neural Operators with Automatically Encoded Conservation Laws},\nauthor={Ning Liu and Yiming Fan and Xianyi Zeng and Milan Kl{\\\"o}wer and LU ZHANG and Yue Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ToHkAg936Y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3636033, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2666518352325242566&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 11, "email": ";lehigh.edu;lehigh.edu;mit.edu;lehigh.edu;", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Lehigh University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.lehigh.edu;https://web.mit.edu", "aff_unique_abbr": "Lehigh;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "PinNet: Pinpoint Instructive Information for Retrieval Augmented Code-to-Text Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33963", "id": "TqcZfMZjgM", "proceeding": "https://proceedings.mlr.press/v235/fu24f.html", "pdf": "https://openreview.net/pdf?id=TqcZfMZjgM", 
"openreview": "https://openreview.net/forum?id=TqcZfMZjgM", "author_site": "Han Fu, Jian Tan, Pinhan Zhang, Feifei Li, Jianling Sun", "tldr": "", "abstract": "Automatically generating high-quality code descriptions greatly improves the readability and maintainability of the codebase. Recently, retrieval augmented code-to-text generation has proven to be an effective solution, which has achieved state-of-the-art results on various benchmarks. It brings out the potential to leverage large unlabeled code descriptions to further improve the generation quality. Despite the promising performance, retrieval-augmented models however suffer from being deluded by inconducive retrieved references, due to irrelevant or even misleading information contained therein. To this end, we design PinNet, a new framework for code-to-text generation. PinNet relies on a discriminator to measure how well the retrievals match the semantics of the input code. Remarkably, the hidden representation of the reference before the output layer of the discriminator can be leveraged to significantly improve the code-to-text generation by modifying the attention weights. It essentially pays high attention to valuable information and eliminates misleadingness. To effectively execute this idea, we also propose a novel contrastive learning method to quantify the semantical similarities between unlabeled references. Using extensive experiments on code summarization and SQL-to-text generation, we demonstrate that the proposed method can significantly outperform all of the baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Fu;Jian Tan;Pinhan Zhang;Feifei Li;Jianling Sun", "authorids": "~Han_Fu1;~Jian_Tan2;~Pinhan_Zhang1;~Feifei_Li3;~Jianling_Sun2", "gender": "M;M;M;M;", "homepage": ";;https://z666pr.github.io/;http://www.cs.utah.edu/~lifeifei/;", "dblp": ";51/2627-1.html;;l/FeifeiLi;", "google_scholar": ";m36rOvoAAAAJ;;qPDhlWkAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Han_Fu1;~Jian_Tan2;~Pinhan_Zhang1;~Feifei_Li3;~Jianling_Sun2", "aff": "Alibaba Group;Alibaba Group;Zhejiang University;;", "aff_domain": "alibaba-inc.com;alibaba-inc.com;zju.edu.cn;;", "position": "Alibaba Group;Researcher;MS student;;", "bibtex": "@inproceedings{\nfu2024pinnet,\ntitle={PinNet: Pinpoint Instructive Information for Retrieval Augmented Code-to-Text Generation},\nauthor={Han Fu and Jian Tan and Pinhan Zhang and Feifei Li and Jianling Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TqcZfMZjgM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 855570, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1570648588457083767&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "alibaba-inc.com;alibaba-inc.com;zju.edu.cn;;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Alibaba Group;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;https://www.zju.edu.cn", "aff_unique_abbr": "Alibaba;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Rethinking DP-SGD in Discrete Domain: Exploring Logistic Distribution in the Realm of signSGD", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33962", "id": "TtSFg4s3F0", "proceeding": 
"https://proceedings.mlr.press/v235/jang24a.html", "pdf": "https://openreview.net/pdf?id=TtSFg4s3F0", "openreview": "https://openreview.net/forum?id=TtSFg4s3F0", "author_site": "Jonggyu Jang, Seongjin Hwang, Hyun Jong Yang", "tldr": "", "abstract": "Deep neural networks (DNNs) have a risk of remembering sensitive data from their training datasets, inadvertently leading to substantial information leakage through privacy attacks like membership inference attacks. DP-SGD is a simple but effective defense method, incorporating Gaussian noise into gradient updates to safeguard sensitive information. With the prevalence of large neural networks, DP-signSGD, a variant of DP-SGD, has emerged, aiming to curtail memory usage while maintaining security. However, it is noteworthy that most DP-signSGD algorithms default to Gaussian noise, suitable only for DP-SGD, without scant discussion of its appropriateness for signSGD. Our study delves into an intriguing question: **\"Can we find a more efficient substitute for Gaussian noise to secure privacy in DP-signSGD?\"** We propose an answer with a Logistic mechanism, which conforms to signSGD principles and is interestingly evolved from an exponential mechanism. In this paper, we provide both theoretical and experimental evidence showing that our method surpasses DP-signSGD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonggyu Jang;Seongjin Hwang;Hyun Jong Yang", "authorids": "~Jonggyu_Jang1;~Seongjin_Hwang1;~Hyun_Jong_Yang2", "gender": "M;M;M", "homepage": ";https://sites.google.com/g.postech.edu/seongjin?usp=sharing;http://aislab.io", "dblp": "182/7311;;87/4972", "google_scholar": "https://scholar.google.co.kr/citations?hl=en;;https://scholar.google.co.kr/citations?user=zNBCcSgAAAAJ", "orcid": "0000-0001-9651-2227;;0000-0002-0717-3794", "linkedin": ";;hyun-jong-yang-549a2417", "or_profile": "~Jonggyu_Jang1;~Seongjin_Hwang1;~Hyun_Jong_Yang2", "aff": "Pohang University of Science and Technology;Pohang University of Science and Technology;POSTECH", "aff_domain": "postech.ac.kr;postech.ac.kr;postech.ac.kr", "position": "Postdoc;Combined M.S./Ph.D candidate;Associate Professor", "bibtex": "@inproceedings{\njang2024rethinking,\ntitle={Rethinking {DP}-{SGD} in Discrete Domain: Exploring Logistic Distribution in the Realm of sign{SGD}},\nauthor={Jonggyu Jang and Seongjin Hwang and Hyun Jong Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TtSFg4s3F0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1568178, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5964353307385529469&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "postech.ac.kr;postech.ac.kr;postech.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "MS$^3$D: A RG Flow-Based Regularization for GAN Training with Limited Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33961", "id": "TuALw8xVum", "proceeding": "https://proceedings.mlr.press/v235/wang24af.html", "pdf": "https://openreview.net/pdf?id=TuALw8xVum", 
"openreview": "https://openreview.net/forum?id=TuALw8xVum", "author_site": "Jian Wang, Xin Lan, Yuxin Tian, Jiancheng Lv", "tldr": "", "abstract": "Generative adversarial networks (GANs) have made impressive advances in image generation, but they often require large-scale training data to avoid degradation caused by discriminator overfitting. To tackle this issue, we investigate the challenge of training GANs with limited data, and propose a novel regularization method based on the idea of renormalization group (RG) in physics.We observe that in the limited data setting, the gradient pattern that the generator obtains from the discriminator becomes more aggregated over time. In RG context, this aggregated pattern exhibits a high discrepancy from its coarse-grained versions, which implies a high-capacity and sensitive system, prone to overfitting and collapse. To address this problem, we introduce a **m**ulti-**s**cale **s**tructural **s**elf-**d**issimilarity (MS$^3$D) regularization, which constrains the gradient field to have a consistent pattern across different scales, thereby fostering a more redundant and robust system. We show that our method can effectively enhance the performance and stability of GANs under limited data scenarios, and even allow them to generate high-quality images with very few data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jian Wang;Xin Lan;Yuxin Tian;Jiancheng Lv", "authorids": "~Jian_Wang12;~Xin_Lan1;~Yuxin_Tian3;~Jiancheng_Lv2", "gender": "M;M;;M", "homepage": "http://dicalab.cn/wangjian;https://dicalab.cn/author/xin-lan/;;https://cs.scu.edu.cn/info/1303/13767.htm", "dblp": ";;;", "google_scholar": "Irn-tV4AAAAJ;https://scholar.google.com/citations?hl=en;n36mg0QAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-9173-4979;0000-0002-3350-8578;0000-0003-3706-3462;", "linkedin": ";;;", "or_profile": "~Jian_Wang12;~Xin_Lan1;~Yuxin_Tian3;~Jiancheng_Lv2", "aff": "Sichuan University;Sichuan University;Ant Group;Sichuan University", "aff_domain": "scu.edu.cn;scu.edu.cn;antgroup.com;scu.edu.cn", "position": "PhD student;MS student;Intern;Full Professor", "bibtex": "@inproceedings{\nwang2024msd,\ntitle={{MS}\\${\\textasciicircum}3\\$D: A {RG} Flow-Based Regularization for {GAN} Training with Limited Data},\nauthor={Jian Wang and Xin Lan and Yuxin Tian and Jiancheng Lv},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TuALw8xVum}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8882267, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11725142564318256678&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": "scu.edu.cn;scu.edu.cn;antgroup.com;scu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Sichuan University;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.scu.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "SCU;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Online Matching with Stochastic Rewards: Provable Better Bound via Adversarial Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33960", "id": "TujtZgdRxB", "proceeding": "https://proceedings.mlr.press/v235/zhang24bf.html", "pdf": 
"https://openreview.net/pdf?id=TujtZgdRxB", "openreview": "https://openreview.net/forum?id=TujtZgdRxB", "author_site": "Qiankun Zhang, Aocheng Shen, Boyu Zhang, Hanrui Jiang, Bingqian Du", "tldr": "", "abstract": "For a specific online optimization problem, for example, online bipartite matching (OBM), research efforts could be made in two directions before it is finally closed, i.e., the optimal competitive online algorithm is found. One is to continuously design algorithms with better performance. To this end, reinforcement learning (RL) has demonstrated great success in literature. However, little is known on the other direction: whether RL helps explore how hard an online problem is. In this paper, we study a generalized model of OBM, named online matching with stochastic rewards (OMSR, FOCS 2012), for which the optimal competitive ratio is still unknown. We adopt an adversarial RL approach that trains two RL agents adversarially and iteratively: the algorithm agent learns for algorithms with larger competitive ratios, while the adversarial agent learns to produce a family of hard instances. Through such a framework, agents converge at the end with a robust algorithm, which empirically outperforms the state of the art (STOC 2020). Much more significantly, it allows to track how the hard instances are generated. We succeed in distilling two structural properties from the learned graph patterns, which remarkably reduce the action space, and further enable theoretical improvement on the best-known hardness result of OMSR, from $0.621$ (FOCS 2012) to $0.597$. To the best of our knowledge, this gives the first evidence that RL can help enhance the theoretical understanding of an online problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiankun Zhang;Aocheng Shen;Boyu Zhang;Hanrui Jiang;Bingqian Du", "authorids": "~Qiankun_Zhang1;~Aocheng_Shen1;~Boyu_Zhang6;~Hanrui_Jiang1;~Bingqian_Du1", "gender": ";M;;M;F", "homepage": ";https://navi-awson.github.io/;https://chamberqaq.github.io/;https://baidu.cc;", "dblp": ";;;;245/3556", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Qiankun_Zhang1;~Aocheng_Shen1;~Boyu_Zhang6;~Hanrui_Jiang1;~Bingqian_Du1", "aff": ";Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology;", "aff_domain": ";hust.edu.cn;hust.edu.cn;hust.edu.cn;", "position": ";Undergrad student;MS student;Undergrad student;", "bibtex": "@inproceedings{\nzhang2024online,\ntitle={Online Matching with Stochastic Rewards: Provable Better Bound via Adversarial Reinforcement Learning},\nauthor={Qiankun Zhang and Aocheng Shen and Boyu Zhang and Hanrui Jiang and Bingqian Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TujtZgdRxB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 705170, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17355541874739528753&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";hust.edu.cn;hust.edu.cn;hust.edu.cn;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huazhong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hust.edu.cn", "aff_unique_abbr": "HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
"aff_country_unique": "China" }, { "title": "Gaussian Plane-Wave Neural Operator for Electron Density Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33959", "id": "TvoG41N1Y3", "proceeding": "https://proceedings.mlr.press/v235/kim24b.html", "pdf": "https://openreview.net/pdf?id=TvoG41N1Y3", "openreview": "https://openreview.net/forum?id=TvoG41N1Y3", "author_site": "Seongsu Kim, Sungsoo Ahn", "tldr": "", "abstract": "This work studies machine learning for electron density prediction, which is fundamental for understanding chemical systems and density functional theory (DFT) simulations. To this end, we introduce the Gaussian plane-wave neural operator (GPWNO), which operates in the infinite-dimensional functional space using the plane-wave and Gaussian-type orbital bases, widely recognized in the context of DFT. In particular, both high- and low-frequency components of the density can be effectively represented due to the complementary nature of the two bases. Extensive experiments on QM9, MD, and material project datasets demonstrate GPWNO's superior performance over ten baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seongsu Kim;Sungsoo Ahn", "authorids": "~Seongsu_Kim2;~Sungsoo_Ahn1", "gender": "M;M", "homepage": ";https://sungsooahn.super.site/", "dblp": ";90/5164", "google_scholar": ";XTenHs0AAAAJ", "orcid": ";", "linkedin": "%EC%84%B1%EC%88%98-%EA%B9%80-8799bb248/;", "or_profile": "~Seongsu_Kim2;~Sungsoo_Ahn1", "aff": "Pohang University of Science and Technology;Pohang University of Science and Technology", "aff_domain": "postech.edu;postech.ac.kr", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkim2024gaussian,\ntitle={Gaussian Plane-Wave Neural Operator for Electron Density Estimation},\nauthor={Seongsu Kim and Sungsoo Ahn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TvoG41N1Y3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8663979, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8145448988165952044&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "postech.edu;postech.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "How to Trace Latent Generative Model Generated Images without Artificial Watermark?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33958", "id": "TwZ2sY6eJj", "proceeding": "https://proceedings.mlr.press/v235/wang24bj.html", "pdf": "https://openreview.net/pdf?id=TwZ2sY6eJj", "openreview": "https://openreview.net/forum?id=TwZ2sY6eJj", "author_site": "Zhenting Wang, Vikash Sehwag, Chen Chen, Lingjuan Lyu, Dimitris Metaxas, Shiqing Ma", "tldr": "", "abstract": "Latent generative models (e.g., Stable Diffusion) have become more and more popular, but concerns have arisen regarding potential misuse related to images generated by these models. It is, therefore, necessary to analyze the origin of images by inferring if a particular image was generated by a specific latent generative model. 
Most existing methods (e.g., image watermark and model fingerprinting) require extra steps during training or generation. These requirements restrict their usage on images generated without such extra operations, and the extra required operations might compromise the quality of the generated images. In this work, we ask whether it is possible to effectively and efficiently trace the images generated by a specific latent generative model without the aforementioned requirements. To study this problem, we design a latent inversion based method called LatentTracer to trace the generated images of the inspected model by checking if the examined images can be well-reconstructed with an inverted latent input. We leverage gradient-based latent inversion and identify an encoder-based initialization as critical to the success of our approach. Our experiments on the state-of-the-art latent generative models, such as Stable Diffusion, show that our method can distinguish the images generated by the inspected model and other images with high accuracy and efficiency. Our findings suggest the intriguing possibility that images generated by today's latent generative models are naturally watermarked by the decoder used in the source models. Code: https://github.com/ZhentingWang/LatentTracer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenting Wang;Vikash Sehwag;Chen Chen;Lingjuan Lyu;Dimitris N. Metaxas;Shiqing Ma", "authorids": "~Zhenting_Wang1;~Vikash_Sehwag1;~Chen_Chen20;~Lingjuan_Lyu1;~Dimitris_N._Metaxas1;~Shiqing_Ma2", "gender": "M;M;M;F;;M", "homepage": "https://zhentingwang.github.io/;https://vsehwag.github.io/;https://cc233.github.io/;https://sites.google.com/view/lingjuan-lyu;https://people.cs.umass.edu/~shiqingma/;https://www.cs.rutgers.edu/~dnm/", "dblp": "263/4521;187/5613;65/4423-43;178/9876;172/8745;m/DNMetaxas", "google_scholar": "QSYVbj8AAAAJ;JAkeEG8AAAAJ;;;X_mDnjkAAAAJ;https://scholar.google.com.tw/citations?user=a7VNhCIAAAAJ", "orcid": ";;0000-0001-7359-8515;;0000-0003-1551-8948;", "linkedin": ";;;;shiqing-ma-6590b086;dimitris-metaxas-1bb74914/", "or_profile": "~Zhenting_Wang1;~Vikash_Sehwag1;~Chen_Chen20;~Lingjuan_Lyu1;~Shiqing_Ma2;~Dimitris_Metaxas1", "aff": "Sony AI;Sony AI;Sony AI;Sony;University of Massachusetts at Amherst;Rutgers University", "aff_domain": "sony.com;sony.com;sony.com;sony.com;umass.edu;cs.rutgers.edu", "position": "Intern;Researcher;Researcher;scientist;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024how,\ntitle={How to Trace Latent Generative Model Generated Images without Artificial Watermark?},\nauthor={Zhenting Wang and Vikash Sehwag and Chen Chen and Lingjuan Lyu and Dimitris N.
Metaxas and Shiqing Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TwZ2sY6eJj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1846231, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6273326576124050125&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "sony.com;sony.com;sony.com;sony.com;umass.edu;cs.rutgers.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "Sony;Sony Corporation;University of Massachusetts Amherst;Rutgers University", "aff_unique_dep": "Sony AI;;;", "aff_unique_url": "https://www.sony.com;https://www.sony.com;https://www.umass.edu;https://www.rutgers.edu", "aff_unique_abbr": "Sony AI;Sony;UMass Amherst;Rutgers", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0;0;0;1;1", "aff_country_unique": "Japan;United States" }, { "title": "Can Machines Learn the True Probabilities?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33957", "id": "TzqmqZS0nj", "proceeding": "https://proceedings.mlr.press/v235/kim24a.html", "pdf": "https://openreview.net/pdf?id=TzqmqZS0nj", "openreview": "https://openreview.net/forum?id=TzqmqZS0nj", "tldr": "", "abstract": "When there exists uncertainty, AI machines are designed to make decisions so as to reach the best expected outcomes. Expectations are based on true facts about the objective environment the machines interact with, and those facts can be encoded into AI models in the form of true objective probability functions. Accordingly, AI models involve probabilistic machine learning in which the probabilities should be objectively interpreted. We prove under some basic assumptions when machines can learn the true objective probabilities, if any, and when machines cannot learn them.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinsook Kim", "authorids": "~Jinsook_Kim1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nkim2024can,\ntitle={Can Machines Learn the True Probabilities?},\nauthor={Jinsook Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TzqmqZS0nj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 387423, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9813284351558569652&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "", "author_num": 1 }, { "title": "Causal Bandits: The Pareto Optimal Frontier of Adaptivity, a Reduction to Linear Bandits, and Limitations around Unknown Marginals", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33956", "id": "U1uKihiG39", "proceeding": "https://proceedings.mlr.press/v235/liu24b.html", "pdf": "https://openreview.net/pdf?id=U1uKihiG39", "openreview": "https://openreview.net/forum?id=U1uKihiG39", "author_site": "Ziyi Liu, Idan Attias, Daniel Roy", "tldr": "", "abstract": "In this work, we investigate the problem of adapting to the presence or absence of causal structure in multi-armed bandit problems. 
In addition to the usual reward signal, we assume the learner has access to additional variables, observed in each round after acting. When these variables $d$-separate the action from the reward, existing work in causal bandits demonstrates that one can achieve strictly better (minimax) rates of regret (Lu et al., 2020). Our goal is to adapt to this favorable ``conditionally benign'' structure, if it is present in the environment, while simultaneously recovering worst-case minimax regret, if it is not. Notably, the learner has no prior knowledge of whether the favorable structure holds. In this paper, we establish the Pareto optimal frontier of adaptive rates. We prove upper and matching lower bounds on the possible trade-offs in the performance of learning in conditionally benign and arbitrary environments, resolving an open question raised by Bilodeau et al. (2022). Furthermore, we are the first to obtain instance-dependent bounds for causal bandits, by reducing the problem to the linear bandit setting. Finally, we examine the common assumption that the marginal distributions of the post-action contexts are known and show that a nontrivial estimate is necessary for better-than-worst-case minimax rates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyi Liu;Idan Attias;Daniel M. Roy", "authorids": "~Ziyi_Liu7;~Idan_Attias1;~Daniel_M._Roy1", "gender": ";M;M", "homepage": "https://www.statistics.utoronto.ca/people/directories/graduate-students/ziyi-liu;https://www.idanattias.com;http://danroy.org", "dblp": ";228/6803;04/2068", "google_scholar": ";-L6uUy0AAAAJ;https://scholar.google.ca/citations?user=vA6ZQ_AAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ziyi_Liu7;~Idan_Attias1;~Daniel_M_Roy1", "aff": "University of Toronto;Tel Aviv University;University of Toronto", "aff_domain": "utoronto.ca;tau.ac.il;utoronto.ca", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nliu2024causal,\ntitle={Causal Bandits: The Pareto Optimal Frontier of Adaptivity, a Reduction to Linear Bandits, and Limitations around Unknown Marginals},\nauthor={Ziyi Liu and Idan Attias and Daniel M. 
Roy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=U1uKihiG39}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1066474, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9365860197438235951&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "utoronto.ca;tau.ac.il;utoronto.ca", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Toronto;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://www.tau.ac.il", "aff_unique_abbr": "U of T;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;Israel" }, { "title": "Exponential Spectral Pursuit: An Effective Initialization Method for Sparse Phase Retrieval", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33955", "id": "U4Yvwu1RQY", "proceeding": "https://proceedings.mlr.press/v235/xu24ah.html", "pdf": "https://openreview.net/pdf?id=U4Yvwu1RQY", "openreview": "https://openreview.net/forum?id=U4Yvwu1RQY", "author_site": "Mengchu Xu, Zhang Yuxuan, Jian Wang", "tldr": "", "abstract": "Sparse phase retrieval aims to reconstruct an $n$-dimensional $k$-sparse signal from its phaseless measurements. For most of the existing reconstruction algorithms, their sampling complexity is known to be dominated by the initialization stage. In this paper, in order to improve the sampling complexity for initialization, we propose a novel method termed exponential spectral pursuit (ESP). Theoretically, our method offers a tighter bound of sampling complexity compared to the state-of-the-art ones, such as the truncated power method. 
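For context on the initialization stage this abstract discusses, the sketch below simulates the phaseless measurement model and runs the classical spectral initializer via power iteration, the kind of baseline ESP is compared against; it is illustrative only and does not implement ESP itself, whose construction is given in the paper.

```python
# Recover (up to sign) a k-sparse x from phaseless measurements
# y_i = |<a_i, x>|^2 using the classical spectral initialization: the
# leading eigenvector of (1/m) * sum_i y_i a_i a_i^T correlates with x.
import numpy as np

rng = np.random.default_rng(2)
n, k, m = 100, 5, 1200                          # signal dim, sparsity, measurements

x = np.zeros(n)
support = rng.choice(n, size=k, replace=False)
x[support] = rng.standard_normal(k)
x /= np.linalg.norm(x)

A = rng.standard_normal((m, n))                 # Gaussian sensing vectors as rows
y = (A @ x) ** 2                                # phaseless measurements

# Power iteration on Y = (1/m) sum_i y_i a_i a_i^T, without forming the
# n x n matrix explicitly: Y v = A^T (y * (A v)) / m.
v = rng.standard_normal(n)
for _ in range(100):
    v = A.T @ (y * (A @ v)) / m
    v /= np.linalg.norm(v)

print("|<v, x>| =", abs(v @ x))                 # close to 1 means a good init
```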
Moreover, it empirically outperforms the existing initialization methods for sparse phase retrieval.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengchu Xu;Yuxuan Zhang;Jian Wang", "authorids": "~Mengchu_Xu1;~Yuxuan_Zhang10;~Jian_Wang14", "gender": "Not Specified;M;M", "homepage": "https://github.com/mengchuxu97;https://github.com/xuanxuan202;http://homepage.fudan.edu.cn/jianwang/", "dblp": "322/0200;;", "google_scholar": ";;5i7YIEgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mengchu_Xu1;~Yuxuan_Zhang10;~Jian_Wang14", "aff": "Fudan University;Fudan University;Fudan University", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "MS student;MS student;Associate Professor", "bibtex": "@inproceedings{\nxu2024exponential,\ntitle={Exponential Spectral Pursuit: An Effective Initialization Method for Sparse Phase Retrieval},\nauthor={Mengchu Xu and Yuxuan Zhang and Jian Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=U4Yvwu1RQY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 576792, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16549499419916176751&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Configurable Mirror Descent: Towards a Unification of Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33954", "id": "U841CrDUx9", "proceeding": "https://proceedings.mlr.press/v235/li24an.html", "pdf": "https://openreview.net/pdf?id=U841CrDUx9", "openreview": "https://openreview.net/forum?id=U841CrDUx9", "author_site": "Pengdeng Li, Shuxin Li, Chang Yang, Xinrun Wang, Shuyue Hu, Xiao Huang, Hau Chan, Bo An", "tldr": "", "abstract": "Decision-making problems, categorized as single-agent, e.g., Atari, cooperative multi-agent, e.g., Hanabi, competitive multi-agent, e.g., Hold'em poker, and mixed cooperative and competitive, e.g., football, are ubiquitous in the real world. Although various methods have been proposed to address the specific decision-making categories, these methods typically evolve independently and cannot generalize to other categories. Therefore, a fundamental question for decision-making is: *Can we develop **a single algorithm** to tackle **ALL** categories of decision-making problems?* There are several main challenges to address this question: i) different decision-making categories involve different numbers of agents and different relationships between agents, ii) different categories have different solution concepts and evaluation measures, and iii) there lacks a comprehensive benchmark covering all the categories. This work presents a preliminary attempt to address the question with three main contributions. i) We propose the generalized mirror descent (GMD), a generalization of MD variants, which considers multiple historical policies and works with a broader class of Bregman divergences. 
ii) We propose the configurable mirror descent (CMD) where a meta-controller is introduced to dynamically adjust the hyper-parameters in GMD conditional on the evaluation measures. iii) We construct the GameBench with 15 academic-friendly games across different decision-making categories. Extensive experiments demonstrate that CMD achieves empirically competitive or better outcomes compared to baselines while providing the capability of exploring diverse dimensions of decision making.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengdeng Li;Shuxin Li;Chang Yang;Xinrun Wang;Shuyue Hu;Xiao Huang;Hau Chan;Bo An", "authorids": "~Pengdeng_Li1;~Shuxin_Li1;~Chang_Yang3;~Xinrun_Wang1;~Shuyue_Hu1;~Xiao_Huang1;~Hau_Chan1;~Bo_An2", "gender": "M;F;F;M;;M;M;M", "homepage": ";;http://none.com;https://rainwangphy.github.io/;;https://www4.comp.polyu.edu.hk/~xiaohuang/;https://cse.unl.edu/~hchan/;https://personal.ntu.edu.sg/boan/", "dblp": "199/2319;;;199/6413;;25/692-1.html;48/9282;42/6178-1.html", "google_scholar": "HY6ghxoAAAAJ;bSksaBYAAAAJ;;ROANfPUAAAAJ;;Be21PkYAAAAJ;R1w52RIAAAAJ;PEEpuNwAAAAJ", "orcid": "0000-0002-5549-3864;0009-0001-5748-2667;;;;0000-0002-3867-900X;;0000-0002-7064-7438", "linkedin": ";;;;;;;", "or_profile": "~Pengdeng_Li1;~Shuxin_Li1;~Chang_Yang3;~Xinrun_Wang1;~Shuyue_Hu1;~Xiao_Huang1;~Hau_Chan1;~Bo_An2", "aff": "Nanyang Technological University;Nanyang Technological University;The Hong Kong Polytechnic University;Nanyang Technological University;;The Hong Kong Polytechnic University;University of Nebraska, Lincoln;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;polyu.edu.hk;ntu.edu.sg;;polyu.edu.hk;unl.edu;ntu.edu.sg", "position": "Postdoc;PhD student;PhD student;Postdoc;;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nli2024configurable,\ntitle={Configurable Mirror Descent: Towards a Unification of Decision Making},\nauthor={Pengdeng Li and Shuxin Li and Chang Yang and Xinrun Wang and Shuyue Hu and Xiao Huang and Hau Chan and Bo An},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=U841CrDUx9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7854044, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1560333105612614692&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 8, "email": "ntu.edu.sg;ntu.edu.sg;polyu.edu.hk;ntu.edu.sg;;polyu.edu.hk;unl.edu;ntu.edu.sg", "author_num": 8, "aff_unique_index": "0;0;1;0;1;2;0", "aff_unique_norm": "Nanyang Technological University;Hong Kong Polytechnic University;University of Nebraska", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.polyu.edu.hk;https://www.unl.edu", "aff_unique_abbr": "NTU;PolyU;UNL", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Hong Kong SAR;Lincoln", "aff_country_unique_index": "0;0;1;0;1;2;0", "aff_country_unique": "Singapore;China;United States" }, { "title": "Auto-Encoding Morph-Tokens for Multimodal LLM", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33953", "id": "U97MIrs35l", "proceeding": "https://proceedings.mlr.press/v235/pan24h.html", "pdf": "https://openreview.net/pdf?id=U97MIrs35l", "openreview": "https://openreview.net/forum?id=U97MIrs35l", "author_site": "Kaihang Pan, Siliang Tang, Juncheng Li, Zhaoyu Fan, Wei Chow, Shuicheng YAN, Tat-Seng Chua, Yueting 
Zhuang, Hanwang Zhang", "tldr": "", "abstract": "For multimodal LLMs, the synergy of visual comprehension (textual output) and generation (visual output) presents an ongoing challenge. This is due to a conflicting objective: for comprehension, an MLLM needs to abstract the visuals; for generation, it needs to preserve the visuals as much as possible. Thus, the objective is a dilemma for visual-tokens. To resolve the conflict, we propose encoding images into morph-tokens to serve a dual purpose: for comprehension, they act as visual prompts instructing MLLM to generate texts; for generation, they take on a different, non-conflicting role as complete visual-tokens for image reconstruction, where the missing visual cues are recovered by the MLLM. Extensive experiments show that morph-tokens can achieve a new SOTA for multimodal comprehension and generation simultaneously. Our project is available at https://github.com/DCDmllm/MorphTokens.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaihang Pan;Siliang Tang;Juncheng Li;Zhaoyu Fan;Wei Chow;Shuicheng YAN;Tat-Seng Chua;Yueting Zhuang;Hanwang Zhang", "authorids": "~Kaihang_Pan1;~Siliang_Tang1;~Juncheng_Li3;~Zhaoyu_Fan1;~Wei_Chow1;~Shuicheng_YAN3;~Tat-Seng_Chua2;~Yueting_Zhuang1;~Hanwang_Zhang3", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "https://github.com/1308024507pkh;https://person.zju.edu.cn/en/siliang;;https://github.com/FanBB2333;http://none.com;https://yanshuicheng.ai/;https://person.zju.edu.cn/yzhuang;https://mreallab.github.io/index.html;http://www.comp.nus.edu.sg/~chuats/", "dblp": "344/0647.html;44/5693;182/7674-6;;;y/ShuichengYan;;79/8116.html;", "google_scholar": "https://scholar.google.com.hk/citations?user=lMQADDUAAAAJ;8e7H3PcAAAAJ;lm9s-QgAAAAJ;;;https://scholar.google.com.hk/citations?user=DNuiPHwAAAAJ;1RD7UJAAAAAJ;YG0DFyYAAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ", "orcid": ";0000-0002-7356-9711;0000-0003-2258-1291;;;;;;0000-0001-6097-7807", "linkedin": ";siliang-tang-4734272a/;;;;;;;", "or_profile": "~Kaihang_Pan1;~Siliang_Tang1;~Juncheng_Li3;~Zhaoyu_Fan1;~Wei_Chow1;~Shuicheng_YAN3;~Yueting_Zhuang1;~Hanwang_Zhang3;~Tat-seng_Chua1", "aff": "Zhejiang University;Zhejiang University;National University of Singapore;Zhejiang University;Zhejiang University;sea Group;Zhejiang University;Nanyang Technological University;National University of Singapore", "aff_domain": "zju.edu.cn;zju.edu.cn;nus.edu;zju.edu.cn;zju.edu.cn;sea.com;zju.edu.cn;ntu.edu.sg;nus.edu.sg", "position": "PhD student;Full Professor;Postdoc;MS student;Undergrad student;Researcher;Full Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\npan2024autoencoding,\ntitle={Auto-Encoding Morph-Tokens for Multimodal {LLM}},\nauthor={Kaihang Pan and Siliang Tang and Juncheng Li and Zhaoyu Fan and Wei Chow and Shuicheng YAN and Tat-Seng Chua and Yueting Zhuang and Hanwang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=U97MIrs35l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3645226, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4719089750159601645&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;zju.edu.cn;nus.edu;zju.edu.cn;zju.edu.cn;sea.com;zju.edu.cn;ntu.edu.sg;nus.edu.sg", "author_num": 9, "aff_unique_index": "0;0;1;0;0;2;0;3;1", "aff_unique_norm": "Zhejiang 
University;National University of Singapore;Sea Group;Nanyang Technological University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.nus.edu.sg;;https://www.ntu.edu.sg", "aff_unique_abbr": "ZJU;NUS;;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;1;1", "aff_country_unique": "China;Singapore;" }, { "title": "Provably Efficient Long-Horizon Exploration in Monte Carlo Tree Search through State Occupancy Regularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33952", "id": "UCKFhc9SFC", "proceeding": "https://proceedings.mlr.press/v235/schramm24a.html", "pdf": "https://openreview.net/pdf?id=UCKFhc9SFC", "openreview": "https://openreview.net/forum?id=UCKFhc9SFC", "author_site": "Liam Schramm, Abdeslam Boularias", "tldr": "", "abstract": "Monte Carlo tree search (MCTS) has been successful in a variety of domains, but faces challenges with long-horizon exploration when compared to sampling-based motion planning algorithms like Rapidly-Exploring Random Trees. To address these limitations of MCTS, we derive a tree search algorithm based on policy optimization with state-occupancy measure regularization, which we call *Volume-MCTS*. We show that count-based exploration and sampling-based motion planning can be derived as approximate solutions to this state-occupancy measure regularized objective. We test our method on several robot navigation problems, and find that Volume-MCTS outperforms AlphaZero and displays significantly better long-horizon exploration properties.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liam Schramm;Abdeslam Boularias", "authorids": "~Liam_Schramm2;~Abdeslam_Boularias1", "gender": ";M", "homepage": "https://liamschramm.com/;http://rl.cs.rutgers.edu/", "dblp": "https://dblp.org/pers/hd/s/Schramm:Liam;57/2269", "google_scholar": ";https://scholar.google.com.tw/citations?user=8AF3RCsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Liam_Schramm2;~Abdeslam_Boularias1", "aff": "Rutgers University;, Rutgers University", "aff_domain": "rutgers.edu;cs.rutgers.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nschramm2024provably,\ntitle={Provably Efficient Long-Horizon Exploration in Monte Carlo Tree Search through State Occupancy Regularization},\nauthor={Liam Schramm and Abdeslam Boularias},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UCKFhc9SFC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 752481, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6299761347312019419&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "rutgers.edu;cs.rutgers.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Is Kernel Prediction More Powerful than Gating in Convolutional Neural Networks?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33951", "id": "UE79AkNg60", "proceeding": "https://proceedings.mlr.press/v235/muller24a.html", "pdf": "https://openreview.net/pdf?id=UE79AkNg60", 
"openreview": "https://openreview.net/forum?id=UE79AkNg60", "author_site": "Lorenz K. Muller", "tldr": "", "abstract": "Neural networks whose weights are the output of a predictor (HyperNetworks) achieve excellent performance on many tasks. In ConvNets, kernel prediction layers are a popular type of HyperNetwork. Previous theoretical work has argued that a hierarchy of multiplicative interactions exists in which gating is at the bottom and full weight prediction, as in HyperNetworks, is at the top. In this paper, we constructively demonstrate an equivalence between gating combined with fixed weight layers and weight prediction, relativizing the notion of a hierarchy of multiplicative interactions. We further derive an equivalence between a restricted type of HyperNetwork and factorization machines. Finally, we find empirically that gating layers can learn to imitate weight prediction layers with an SGD variant and show a novel practical application in image denoising using kernel prediction networks. Our reformulation of predicted kernels, combining fixed layers and gating, reduces memory requirements.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lorenz K Muller", "authorids": "~Lorenz_K_Muller1", "gender": "M", "homepage": "", "dblp": "139/1372", "google_scholar": "https://scholar.google.ch/citations?user=DxppwfcAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Lorenz_K_Muller1", "aff": "Huawei Technologies Ltd.", "aff_domain": "huawei.com", "position": "Researcher", "bibtex": "@inproceedings{\nmuller2024is,\ntitle={Is Kernel Prediction More Powerful than Gating in Convolutional Neural Networks?},\nauthor={Lorenz K Muller},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UE79AkNg60}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 490175, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Dlpa5CityZAJ:scholar.google.com/&scioq=Is+Kernel+Prediction+More+Powerful+than+Gating+in+Convolutional+Neural+Networks%3F&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "huawei.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "The Linear Representation Hypothesis and the Geometry of Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33950", "id": "UGpGkLzwpP", "proceeding": "https://proceedings.mlr.press/v235/park24c.html", "pdf": "https://openreview.net/pdf?id=UGpGkLzwpP", "openreview": "https://openreview.net/forum?id=UGpGkLzwpP", "author_site": "Kiho Park, Yo Joong Choe, Victor Veitch", "tldr": "", "abstract": "Informally, the \"linear representation hypothesis\" is the idea that high-level concepts are represented linearly as directions in some representation space. In this paper, we address two closely related questions: What does \"linear representation\" actually mean? And, how do we make sense of geometric notions (e.g., cosine similarity and projection) in the representation space? To answer these, we use the language of counterfactuals to give two formalizations of linear representation, one in the output (word) representation space, and one in the input (context) space. 
We then prove that these connect to linear probing and model steering, respectively. To make sense of geometric notions, we use the formalization to identify a particular (non-Euclidean) inner product that respects language structure in a sense we make precise. Using this *causal inner product*, we show how to unify all notions of linear representation. In particular, this allows the construction of probes and steering vectors using counterfactual pairs. Experiments with LLaMA-2 demonstrate the existence of linear representations of concepts, the connection to interpretation and control, and the fundamental role of the choice of inner product.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kiho Park;Yo Joong Choe;Victor Veitch", "authorids": "~Kiho_Park1;~Yo_Joong_Choe1;~Victor_Veitch1", "gender": "M;M;", "homepage": "https://kihopark.github.io;https://yjchoe.github.io/;http://victorveitch.com", "dblp": "336/5337;234/0589;167/5650", "google_scholar": "f4HcMx8AAAAJ;71g2MrUAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0003-6800-3216;0000-0002-0614-9477;", "linkedin": ";;", "or_profile": "~Kiho_Park1;~Yo_Joong_Choe1;~Victor_Veitch1", "aff": "Netflix;University of Chicago;Google", "aff_domain": "netflix.com;uchicago.edu;google.com", "position": "Intern;Postdoc;Research Scientist", "bibtex": "@inproceedings{\npark2024the,\ntitle={The Linear Representation Hypothesis and the Geometry of Large Language Models},\nauthor={Kiho Park and Yo Joong Choe and Victor Veitch},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UGpGkLzwpP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2741897, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 155, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=830461866206681894&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "netflix.com;uchicago.edu;google.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Netflix;University of Chicago;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.netflix.com;https://www.uchicago.edu;https://www.google.com", "aff_unique_abbr": "Netflix;UChicago;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Persuasive Approach to Combating Misinformation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33949", "id": "UIxOkdBmxh", "proceeding": "https://proceedings.mlr.press/v235/hossain24b.html", "pdf": "https://openreview.net/pdf?id=UIxOkdBmxh", "openreview": "https://openreview.net/forum?id=UIxOkdBmxh", "author_site": "Safwan Hossain, Andjela Mladenovic, Yiling Chen, Gauthier Gidel", "tldr": "", "abstract": "Bayesian Persuasion is proposed as a tool for social media platforms to combat the spread of misinformation. Since platforms can use machine learning to predict the popularity and misinformation features of to-be-shared posts, and users are largely motivated to share popular content, platforms can strategically signal this informational advantage to change user beliefs and persuade them not to share misinformation. We characterize the optimal signaling scheme with imperfect predictions as a linear program and give sufficient and necessary conditions on the classifier to ensure optimal platform utility is non-decreasing and continuous. 
Next, this interaction is considered under a performative model, wherein platform intervention affects the user's future behaviour. The convergence and stability of optimal signaling under this performative process are fully characterized. Lastly, we experimentally validate that our approach significantly reduces misinformation in both the single round and performative setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Safwan Hossain;Andjela Mladenovic;Yiling Chen;Gauthier Gidel", "authorids": "~Safwan_Hossain1;~Andjela_Mladenovic1;~Yiling_Chen1;~Gauthier_Gidel1", "gender": "M;F;F;M", "homepage": "https://safwanhossain.github.io/;;https://yiling.seas.harvard.edu/;https://gauthiergidel.github.io/", "dblp": ";;72/3762-1;188/6326", "google_scholar": "https://scholar.google.ca/citations?user=gyCQnUAAAAAJ;ALrei20AAAAJ;x_7xA0UAAAAJ;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Safwan_Hossain1;~Andjela_Mladenovic1;~Yiling_Chen1;~Gauthier_Gidel1", "aff": "Harvard University;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;Harvard University;Mila - Quebec Artificial Intelligence Institute", "aff_domain": "harvard.edu;mila.umontreal.ca;fas.harvard.edu;mila.quebec", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nhossain2024a,\ntitle={A Persuasive Approach to Combating Misinformation},\nauthor={Safwan Hossain and Andjela Mladenovic and Yiling Chen and Gauthier Gidel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UIxOkdBmxh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2105083, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5165413603785750357&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 6, "email": "harvard.edu;mila.umontreal.ca;fas.harvard.edu;mila.quebec", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Harvard University;University of Montreal;Quebec Artificial Intelligence Institute", "aff_unique_dep": ";Montreal Institute for Learning Algorithms;Artificial Intelligence", "aff_unique_url": "https://www.harvard.edu;https://www.umontreal.ca;https://mila.quebec", "aff_unique_abbr": "Harvard;UM;Mila", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;Canada" }, { "title": "Collaborative Learning with Different Labeling Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33948", "id": "UKHfmzLR7P", "proceeding": "https://proceedings.mlr.press/v235/deng24d.html", "pdf": "https://openreview.net/pdf?id=UKHfmzLR7P", "openreview": "https://openreview.net/forum?id=UKHfmzLR7P", "author_site": "yuyang deng, Mingda Qiao", "tldr": "", "abstract": "We study a variant of Collaborative PAC Learning, in which we aim to learn an accurate classifier for each of the $n$ data distributions, while minimizing the number of samples drawn from them in total. Unlike in the usual collaborative learning setup, it is not assumed that there exists a single classifier that is simultaneously accurate for all distributions. 
We show that, when the data distributions satisfy a weaker realizability assumption, which appeared in (Crammer & Mansour, 2012) in the context of multi-task learning, sample-efficient learning is still feasible. We give a learning algorithm based on Empirical Risk Minimization (ERM) on a natural augmentation of the hypothesis class, and the analysis relies on an upper bound on the VC dimension of this augmented class. In terms of computational efficiency, we show that ERM on the augmented hypothesis class is $\\mathsf{NP}$-hard, which gives evidence against the existence of computationally efficient learners in general. On the positive side, for two special cases, we give learners that are both sample- and computationally-efficient.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuyang Deng;Mingda Qiao", "authorids": "~Yuyang_Deng3;~Mingda_Qiao1", "gender": "M;M", "homepage": "https://sites.psu.edu/yuyangdeng/;https://sites.google.com/site/acmonsterqiao/", "dblp": "261/9253;185/0592", "google_scholar": "bfV3XWUAAAAJ;mV9LQUoAAAAJ", "orcid": ";0000-0002-9182-6152", "linkedin": ";", "or_profile": "~Yuyang_Deng3;~Mingda_Qiao1", "aff": "Pennsylvania State University;University of California, Berkeley", "aff_domain": "psu.edu;berkeley.edu", "position": "PhD student;Postdoc", "bibtex": "@inproceedings{\ndeng2024collaborative,\ntitle={Collaborative Learning with Different Labeling Functions},\nauthor={Yuyang Deng and Mingda Qiao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UKHfmzLR7P}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 436571, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7874516261676035550&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 8, "email": "psu.edu;berkeley.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.berkeley.edu", "aff_unique_abbr": "PSU;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Convergence of Online Learning Algorithm for a Mixture of Multiple Linear Regressions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33947", "id": "ULKvSqmSgA", "proceeding": "https://proceedings.mlr.press/v235/liu24an.html", "pdf": "https://openreview.net/pdf?id=ULKvSqmSgA", "openreview": "https://openreview.net/forum?id=ULKvSqmSgA", "author_site": "Yujing Liu, Zhixin Liu, Lei Guo", "tldr": "", "abstract": "This paper considers the parameter learning and data clustering problem for the mixture of multiple linear regressions (MLR) with multiple sub-models and arbitrary mixing weights. To deal with the data streaming case, we propose an online learning algorithm to estimate the unknown parameters. By utilizing Ljung's ODE method, we establish the almost sure convergence results of this MLR problem without the traditional i.i.d. assumption on the input data for the first time. Based on the convergence property and using the classical stochastic Lyapunov function method, we also obtain the convergence rate analysis of the proposed algorithm for the first time. In addition, the data clustering can asymptotically achieve the same performance as the case with known parameters.
Future work will consider how to relax the asymptotically stationary and ergodic assumption on the input data, and how to design algorithms with global convergence performance for the MLR problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujing Liu;Zhixin Liu;Lei Guo", "authorids": "~Yujing_Liu2;~Zhixin_Liu2;~Lei_Guo11", "gender": "F;F;M", "homepage": ";https://people.ucas.edu.cn/~0017757;http://lsc.amss.cas.cn/guolei/english/jbxx/", "dblp": ";;", "google_scholar": "https://scholar.google.com.hk/citations?user=GPC_X54AAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yujing_Liu2;~Zhixin_Liu2;~Lei_Guo11", "aff": "Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences", "aff_domain": "amss.ac.cn;amss.ac.cn;amss.ac.cn", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nliu2024convergence,\ntitle={Convergence of Online Learning Algorithm for a Mixture of Multiple Linear Regressions},\nauthor={Yujing Liu and Zhixin Liu and Lei Guo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ULKvSqmSgA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 477354, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18385274239562455289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "amss.ac.cn;amss.ac.cn;amss.ac.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Academy of Mathematics and Systems Science", "aff_unique_url": "http://www.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Towards General Algorithm Discovery for Combinatorial Optimization: Learning Symbolic Branching Policy from Bipartite Graph", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33946", "id": "ULleq1Dtaw", "proceeding": "https://proceedings.mlr.press/v235/kuang24a.html", "pdf": "https://openreview.net/pdf?id=ULleq1Dtaw", "openreview": "https://openreview.net/forum?id=ULleq1Dtaw", "author_site": "Yufei Kuang, Jie Wang, Yuyan Zhou, Xijun Li, Fangzhou Zhu, Jianye Hao, Feng Wu", "tldr": "", "abstract": "Machine learning (ML) approaches have been successfully applied to accelerating exact combinatorial optimization (CO) solvers. However, many of them fail to explain what patterns they have learned that accelerate the CO algorithms due to the black-box nature of ML models like neural networks, and thus they prevent researchers from further understanding the tasks they are interested in. To tackle this problem, we propose the *first* graph-based algorithm discovery framework---namely, graph symbolic discovery for exact combinatorial optimization solver (GS4CO)---that learns interpretable branching policies directly from the *general* bipartite graph representation of CO problems. 
Specifically, we design a unified representation for symbolic policies with graph inputs, and then we employ a Transformer with multiple tree-structural encodings to generate symbolic trees end-to-end, which effectively reduces the cumulative error from iteratively distilling graph neural networks. Experiments show that the interpretable and lightweight policies learned by GS4CO outperform all the baselines on CPU machines, including both the human-designed and the learning-based ones. GS4CO marks an encouraging step towards general algorithm discovery on modern CO solvers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yufei Kuang;Jie Wang;Yuyan Zhou;Xijun Li;Fangzhou Zhu;Jianye HAO;Feng Wu", "authorids": "~Yufei_Kuang1;~Jie_Wang1;~Yuyan_Zhou1;~Xijun_Li1;~Fangzhou_Zhu1;~Jianye_HAO1;~Feng_Wu1", "gender": "M;M;M;M;M;M;M", "homepage": "https://miralab.ai/people/yufei-kuang/;http://staff.ustc.edu.cn/~jwangx;http://kym.nju.edu.cn;https://xijunlee.github.io/;;http://www.icdai.org/jianye.html;", "dblp": "280/1134;29/5259-5;;203/0784;74/8725;21/7664.html;25/3972-1", "google_scholar": "STN3F_oAAAAJ;OugG4dUAAAAJ;;QXU_QbMAAAAJ;5fTTRiwAAAAJ;;5bInRDEAAAAJ", "orcid": ";;;0000-0002-9013-1180;;0000-0002-0422-8235;", "linkedin": ";;;;;;", "or_profile": "~Yufei_Kuang1;~Jie_Wang1;~Yuyan_Zhou1;~Xijun_Li1;~Fangzhou_Zhu1;~Jianye_HAO1;~Feng_Wu1", "aff": "University of Science and Technology of China;University of Science and Technology of China;Nanjing University;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Tianjin University;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;nju.edu.cn;huawei.com;huawei.com;tju.edu.cn;ustc.edu.cn", "position": "PhD student;Full Professor;Undergrad student;Researcher;Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nkuang2024towards,\ntitle={Towards General Algorithm Discovery for Combinatorial Optimization: Learning Symbolic Branching Policy from Bipartite Graph},\nauthor={Yufei Kuang and Jie Wang and Yuyan Zhou and Xijun Li and Fangzhou Zhu and Jianye HAO and Feng Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ULleq1Dtaw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 608698, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9717930619631216026&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ustc.edu.cn;ustc.edu.cn;nju.edu.cn;huawei.com;huawei.com;tju.edu.cn;ustc.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;2;2;3;0", "aff_unique_norm": "University of Science and Technology of China;Nanjing University;Huawei;Tianjin University", "aff_unique_dep": ";;Huawei Technologies;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nju.edu.cn;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "USTC;Nanjing U;Huawei;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Probabilistic Forecasting with Stochastic Interpolants and F\u00f6llmer Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33945", "id": "UQYXZdca92", "proceeding": "https://proceedings.mlr.press/v235/chen24n.html", "pdf": "https://openreview.net/pdf?id=UQYXZdca92", "openreview": "https://openreview.net/forum?id=UQYXZdca92", "author_site": "Yifan Chen, Mark
Goldstein, Mengjian Hua, Michael Albergo, Nicholas Boffi, Eric Vanden-Eijnden", "tldr": "", "abstract": "We propose a framework for probabilistic forecasting of dynamical systems based on generative modeling. Given observations of the system state over time, we formulate the forecasting problem as sampling from the conditional distribution of the future system state given its current state. To this end, we leverage the framework of stochastic interpolants, which facilitates the construction of a generative model between an arbitrary base distribution and the target. We design a fictitious, non-physical stochastic dynamics that takes as initial condition the current system state and produces as output a sample from the target conditional distribution in finite time and without bias. This process therefore maps a point mass centered at the current state onto a probabilistic ensemble of forecasts. We prove that the drift coefficient entering the stochastic differential equation (SDE) achieving this task is non-singular, and that it can be learned efficiently by square loss regression over the time-series data. We show that the drift and the diffusion coefficients of this SDE can be adjusted after training, and that a specific choice that minimizes the impact of the estimation error gives a F\u00f6llmer process. We highlight the utility of our approach on several complex, high-dimensional forecasting problems, including stochastically forced Navier-Stokes and video prediction on the KTH and CLEVRER datasets. The code is available at https://github.com/interpolants/forecasting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Chen;Mark Goldstein;Mengjian Hua;Michael Samuel Albergo;Nicholas Matthew Boffi;Eric Vanden-Eijnden", "authorids": "~Yifan_Chen5;~Mark_Goldstein1;~Mengjian_Hua1;~Michael_Samuel_Albergo1;~Nicholas_Matthew_Boffi1;~Eric_Vanden-Eijnden1", "gender": ";M;M;M;M;M", "homepage": "https://yifanc96.github.io;https://cims.nyu.edu/~mg3479/;;http://malbergo.me;https://nmboffi.github.io;https://wp.nyu.edu/courantinstituteofmathematicalsciences-eve2/", "dblp": ";;;;;88/7927", "google_scholar": "GNiinUoAAAAJ;https://scholar.google.fr/citations?hl=en;llRFiBEAAAAJ;GQyCZ4kAAAAJ;_jkX2q0AAAAJ;A5Gx65gAAAAJ", "orcid": ";;0000-0003-0203-1356;0000-0001-9058-5943;;", "linkedin": ";;;;;", "or_profile": "~Yifan_Chen5;~Mark_Goldstein1;~Mengjian_Hua1;~Michael_Samuel_Albergo1;~Nicholas_Matthew_Boffi1;~Eric_Vanden-Eijnden1", "aff": "New York University;Google;Courant Institute of Mathematical Sciences, New York University;New York University;NYU, New York University;New York University", "aff_domain": "nyu.edu;google.com;nyu.edu;nyu.edu;cims.nyu.edu;nyu.edu", "position": "Postdoc;Intern;PhD student;PhD student;Instructor;Full Professor", "bibtex": "@inproceedings{\nchen2024probabilistic,\ntitle={Probabilistic Forecasting with Stochastic Interpolants and F\\\"ollmer Processes},\nauthor={Yifan Chen and Mark Goldstein and Mengjian Hua and Michael Samuel Albergo and Nicholas Matthew Boffi and Eric Vanden-Eijnden},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UQYXZdca92}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2029891, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6112206326186281669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": 
"nyu.edu;google.com;nyu.edu;nyu.edu;cims.nyu.edu;nyu.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "New York University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.nyu.edu;https://www.google.com", "aff_unique_abbr": "NYU;Google", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Mountain View;New York", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "WAVES: Benchmarking the Robustness of Image Watermarks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33944", "id": "URtUYfC3GA", "proceeding": "https://proceedings.mlr.press/v235/an24a.html", "pdf": "https://openreview.net/pdf?id=URtUYfC3GA", "openreview": "https://openreview.net/forum?id=URtUYfC3GA", "author_site": "Bang An, Mucong Ding, Tahseen Rabbani, Aakriti Agrawal, Yuancheng Xu, Chenghao Deng, Sicheng Zhu, Abdirisak Mohamed, Yuxin Wen, Tom Goldstein, Furong Huang", "tldr": "", "abstract": "In the burgeoning age of generative AI, watermarks act as identifiers of provenance and artificial content. We present WAVES (Watermark Analysis via Enhanced Stress-testing), a benchmark for assessing image watermark robustness, overcoming the limitations of current evaluation methods. WAVES integrates detection and identification tasks and establishes a standardized evaluation protocol comprised of a diverse range of stress tests. The attacks in WAVES range from traditional image distortions to advanced, novel variations of diffusive, and adversarial attacks. Our evaluation examines two pivotal dimensions: the degree of image quality degradation and the efficacy of watermark detection after attacks. Our novel, comprehensive evaluation reveals previously undetected vulnerabilities of several modern watermarking algorithms. 
We envision WAVES as a toolkit for the future development of robust watermarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bang An;Mucong Ding;Tahseen Rabbani;Aakriti Agrawal;Yuancheng Xu;Chenghao Deng;Sicheng Zhu;Abdirisak Mohamed;Yuxin Wen;Tom Goldstein;Furong Huang", "authorids": "~Bang_An1;~Mucong_Ding1;~Tahseen_Rabbani1;~Aakriti_Agrawal1;~Yuancheng_Xu1;~Chenghao_Deng1;~Sicheng_Zhu1;~Abdirisak_Mohamed1;~Yuxin_Wen2;~Tom_Goldstein1;~Furong_Huang1", "gender": ";M;M;F;M;M;M;M;;M;F", "homepage": "https://bangann.github.io/;http://www.cs.umd.edu/~mcding/;https://www.cs.umd.edu/people/trabbani;https://sites.google.com/umd.edu/aakriti-agrawal/;https://yuancheng-xu.github.io;https://deng-chenghao.com;https://schzhu.github.io/;;https://yuxinwenrick.github.io/;https://www.cs.umd.edu/~tomg/;https://furong-huang.com", "dblp": "188/0741;232/1754.html;280/2362;259/2330;;;;332/1919;;25/8184;72/8513", "google_scholar": "3ce6z_sAAAAJ;_bVao2MAAAAJ;;2RRnCRMAAAAJ;OPB0QgwAAAAJ;AcGw1hcAAAAJ;;IaxIgBsAAAAJ;oUYfjg0AAAAJ;KmSuVtgAAAAJ;13yyuCcAAAAJ", "orcid": ";0000-0002-6173-8055;;;;;;;;;", "linkedin": ";mucong-ding-489296104;;aakriti-agrawal05/;yuancheng-xu/;chenghao-deng-326444182/;;abdirisak-mohamed-0524981a/;;;", "or_profile": "~Bang_An1;~Mucong_Ding1;~Tahseen_Rabbani1;~Aakriti_Agrawal1;~Yuancheng_Xu1;~Chenghao_Deng1;~Sicheng_Zhu1;~Abdirisak_Mohamed1;~Yuxin_Wen2;~Tom_Goldstein1;~Furong_Huang1", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park;University of Maryland;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;Montgomery College;University of Maryland, College Park;University of Maryland, College Park;University of Maryland", "aff_domain": "umd.edu;cs.umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;montgomerycollege.edu;umd.edu;umd.edu;cs.umd.edu", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;Lecturer;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nan2024waves,\ntitle={{WAVES}: Benchmarking the Robustness of Image Watermarks},\nauthor={Bang An and Mucong Ding and Tahseen Rabbani and Aakriti Agrawal and Yuancheng Xu and Chenghao Deng and Sicheng Zhu and Abdirisak Mohamed and Yuxin Wen and Tom Goldstein and Furong Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=URtUYfC3GA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10147517, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=61115339966992246&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "umd.edu;cs.umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;montgomerycollege.edu;umd.edu;umd.edu;cs.umd.edu", "author_num": 11, "aff_unique_index": "0;1;0;0;0;0;0;2;0;0;0", "aff_unique_norm": "University of Maryland;University of Maryland, College Park;Montgomery College", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www.umd.edu;https://www.umd.edu;https://www.montgomerycollege.edu", "aff_unique_abbr": "UMD;UMD;", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Positional and Structural Encoder",
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33943", "id": "UTSCK582Yo", "proceeding": "https://proceedings.mlr.press/v235/canturk24a.html", "pdf": "https://openreview.net/pdf?id=UTSCK582Yo", "openreview": "https://openreview.net/forum?id=UTSCK582Yo", "author_site": "Semih Cant\u00fcrk, Renming Liu, Olivier Lapointe-Gagn\u00e9, Vincent L\u00e9tourneau, Guy Wolf, Dominique Beaini, Ladislav Rampasek", "tldr": "", "abstract": "Positional and structural encodings (PSE) enable better identifiability of nodes within a graph, rendering them essential tools for empowering modern GNNs, and in particular graph Transformers. However, designing PSEs that work optimally for all graph prediction tasks is a challenging and unsolved problem. Here, we present the Graph Positional and Structural Encoder (GPSE), the first-ever graph encoder designed to capture rich PSE representations for augmenting any GNN. GPSE learns an efficient common latent representation for multiple PSEs, and is highly transferable: The encoder trained on a particular graph dataset can be used effectively on datasets drawn from markedly different distributions and modalities. We show that across a wide range of benchmarks, GPSE-enhanced models can significantly outperform those that employ explicitly computed PSEs, and at least match their performance in others. Our results pave the way for the development of foundational pre-trained graph encoders for extracting positional and structural information, and highlight their potential as a more powerful and efficient alternative to explicitly computed PSEs and existing self-supervised pre-training approaches. Our framework and pre-trained models are publicly available at https://github.com/G-Taxonomy-Workgroup/GPSE. 
For convenience, GPSE has also been integrated into the PyG library to facilitate downstream applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Semih Cant\u00fcrk;Renming Liu;Olivier Lapointe-Gagn\u00e9;Vincent L\u00e9tourneau;Guy Wolf;Dominique Beaini;Ladislav Ramp\u00e1\u0161ek", "authorids": "~Semih_Cant\u00fcrk1;~Renming_Liu1;~Olivier_Lapointe-Gagn\u00e91;~Vincent_L\u00e9tourneau1;~Guy_Wolf1;~Dominique_Beaini1;~Ladislav_Ramp\u00e1\u0161ek1", "gender": "M;M;;;M;M;M", "homepage": ";https://github.com/RemyLau;https://github.com/OlivierLapointeG;;http://guywolf.org;;https://rampasek.github.io", "dblp": "268/5493;;;;120/1308;201/8526;146/9832", "google_scholar": "k_BbPa4AAAAJ;by5L-BcAAAAJ;;;g0k3SjcAAAAJ;https://scholar.google.ca/citations?hl=en;nqeYbJcAAAAJ", "orcid": "0000-0002-2365-9292;0000-0002-6025-6492;;;0000-0002-6740-059X;0000-0002-4613-9388;0000-0001-7527-1196", "linkedin": "semihcanturk/;remy-liu-a24780213/;;vincent-l%C3%A9tourneau-1b298570/;;dbeaini/;", "or_profile": "~Semih_Cant\u00fcrk1;~Renming_Liu1;~Olivier_Lapointe-Gagn\u00e91;~Vincent_L\u00e9tourneau1;~Guy_Wolf1;~Dominique_Beaini1;~Ladislav_Rampasek1", "aff": "Department of Computer Science and Operations Research, Universit\u00e9 de Montr\u00e9al;Michigan State University;Universit\u00e9 de Montr\u00e9al;;University of Montreal;Mila - Institut Qu\u00e9b\u00e9cois d'intelligence artificielle;Isomorphic Labs", "aff_domain": "diro.umontreal.ca;msu.edu;umontreal.ca;;umontreal.ca;mila.quebec;google.com", "position": "PhD student;PhD student;MS student;;Associate Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\ncant{\\\"u}rk2024graph,\ntitle={Graph Positional and Structural Encoder},\nauthor={Semih Cant{\\\"u}rk and Renming Liu and Olivier Lapointe-Gagn{\\'e} and Vincent L{\\'e}tourneau and Guy Wolf and Dominique Beaini and Ladislav Ramp{\\'a}{\\v{s}}ek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UTSCK582Yo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1484397, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17034865299338222140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "diro.umontreal.ca;msu.edu;umontreal.ca;;umontreal.ca;mila.quebec;google.com", "author_num": 7, "aff_unique_index": "0;1;0;2;3;4", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Michigan State University;University of Montreal;Mila - Quebec Artificial Intelligence Institute;Isomorphic Labs", "aff_unique_dep": "Department of Computer Science and Operations Research;;;Artificial Intelligence;", "aff_unique_url": "https://www.umontreal.ca;https://www.msu.edu;https://www.umontreal.ca;https://mila.quebec;", "aff_unique_abbr": "UdeM;MSU;UM;Mila;", "aff_campus_unique_index": "0", "aff_campus_unique": "Montr\u00e9al;", "aff_country_unique_index": "0;1;0;0;0;1", "aff_country_unique": "Canada;United States" }, { "title": "Rethinking Momentum Knowledge Distillation in Online Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33942", "id": "UW5nO9NGjt", "proceeding": "https://proceedings.mlr.press/v235/michel24a.html", "pdf": "https://openreview.net/pdf?id=UW5nO9NGjt", "openreview": "https://openreview.net/forum?id=UW5nO9NGjt", "author_site": "Nicolas MICHEL, Maorong Wang, Ling Xiao, Toshihiko Yamasaki", "tldr": "", "abstract": "Online
Continual Learning (OCL) addresses the problem of training neural networks on a continuous data stream where multiple classification tasks emerge in sequence. In contrast to offline Continual Learning, data can be seen only once in OCL, which is a very severe constraint. In this context, replay-based strategies have achieved impressive results and most state-of-the-art approaches heavily depend on them. While Knowledge Distillation (KD) has been extensively used in offline Continual Learning, it remains under-exploited in OCL, despite its high potential. In this paper, we analyze the challenges in applying KD to OCL and give empirical justifications. We introduce a direct yet effective methodology for applying Momentum Knowledge Distillation (MKD) to many flagship OCL methods and demonstrate its capability to enhance existing approaches. In addition to improving existing state-of-the-art accuracy by more than $10$ percentage points on ImageNet100, we shed light on MKD's internal mechanics and impacts during training in OCL. We argue that, similar to replay, MKD should be considered a central component of OCL. The code is available at https://github.com/Nicolas1203/mkd_ocl.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicolas Michel;Maorong Wang;Ling Xiao;Toshihiko Yamasaki", "authorids": "~Nicolas_Michel1;~Maorong_Wang1;~Ling_Xiao2;~Toshihiko_Yamasaki1", "gender": "M;M;F;M", "homepage": "https://nicolas1203.github.io/;https://maorong.wang;https://dr-lingxiao.github.io/;http://www.cvm.t.u-tokyo.ac.jp/en/", "dblp": "254/6754;282/7285;59/4568-1.html;81/881", "google_scholar": "OyXkV0QAAAAJ;ZaasPpgAAAAJ;https://scholar.google.com/citations?hl=en;rE9iY5MAAAAJ", "orcid": ";;0000-0002-4650-8841;0000-0002-1784-2314", "linkedin": "nicolas-michel-4166b7113/;;;", "or_profile": "~Nicolas_Michel1;~Maorong_Wang1;~Ling_Xiao2;~Toshihiko_Yamasaki1", "aff": "Ecole Sup\u00e9rieure d'Ing\u00e9nieurs en Electronique et Electrotechnique;The University of Tokyo;The University of Tokyo ;The University of Tokyo", "aff_domain": "esiee.fr;u-tokyo.ac.jp;u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nmichel2024rethinking,\ntitle={Rethinking Momentum Knowledge Distillation in Online Continual Learning},\nauthor={Nicolas Michel and Maorong Wang and Ling Xiao and Toshihiko Yamasaki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UW5nO9NGjt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3280943, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2590274080183476841&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "esiee.fr;u-tokyo.ac.jp;u-tokyo.ac.jp;u-tokyo.ac.jp", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Ecole Sup\u00e9rieure d'Ing\u00e9nieurs en Electronique et Electrotechnique;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.esiee.fr;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "ESIEE;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "France;Japan" }, { "title": "Active Ranking and Matchmaking, with Perfect Matchings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33941", "id": "UZZaWUR0n4", "proceeding":
"https://proceedings.mlr.press/v235/ferchichi24a.html", "pdf": "https://openreview.net/pdf?id=UZZaWUR0n4", "openreview": "https://openreview.net/forum?id=UZZaWUR0n4", "author_site": "Hafedh El Ferchichi, Matthieu LERASLE, Vianney Perchet", "tldr": "", "abstract": "We address the challenge of actively ranking a set of items/players with varying values/strengths. The comparison outcomes are random, with a greater noise the closer the values. A crucial requirement is that, at each iteration of the algorithm, all items must be compared once, i.e., an iteration is a perfect matching. Furthermore, we presume that comparing two players with closely matched strengths incurs no cost and, in contrast, a unit cost is associated with comparing players whose strength difference is more substantial. Our secondary objective is to determine an optimal matching between players based on this cost function: we propose and analyze an algorithm that draws on concepts from both AKS sorting networks and bandit theory. Our algorithm achieves both objectives with high probability, and the total cost is optimal (up to logarithmic terms).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hafedh El Ferchichi;Matthieu LERASLE;Vianney Perchet", "authorids": "~Hafedh_El_Ferchichi1;~Matthieu_LERASLE1;~Vianney_Perchet3", "gender": "M;M;M", "homepage": "https://crest.science/user/hefedh-el-ferchichi/;https://lerasle.perso.math.cnrs.fr/;", "dblp": ";119/5672;83/7398", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hafedh_El_Ferchichi1;~Matthieu_LERASLE1;~Vianney_Perchet1", "aff": "Ecole Nationale de la Statistique et de l'Administration Economique;Ecole Nationale de la Statistique et de l'Administration Economique;", "aff_domain": "ensae.fr;ensae.fr;", "position": "PhD student;Associate Professor;", "bibtex": "@inproceedings{\nferchichi2024active,\ntitle={Active Ranking and Matchmaking, with Perfect Matchings},\nauthor={Hafedh El Ferchichi and Matthieu LERASLE and Vianney Perchet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UZZaWUR0n4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 576252, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nMfe0Pc0mnkJ:scholar.google.com/&scioq=Active+Ranking+and+Matchmaking,+with+Perfect+Matchings&hl=en&as_sdt=0,10", "gs_version_total": 8, "email": "ensae.fr;ensae.fr;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Ecole Nationale de la Statistique et de l'Administration Economique", "aff_unique_dep": "", "aff_unique_url": "https://ensae.fr", "aff_unique_abbr": "ENSAE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Irregular Multivariate Time Series Forecasting: A Transformable Patching Graph Neural Networks Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33940", "id": "UZlMXUGI6e", "proceeding": "https://proceedings.mlr.press/v235/zhang24bw.html", "pdf": "https://openreview.net/pdf?id=UZlMXUGI6e", "openreview": "https://openreview.net/forum?id=UZlMXUGI6e", "author_site": "Weijia Zhang, Chenlong Yin, Hao Liu, Xiaofang Zhou, Hui Xiong", "tldr": "", "abstract": "Forecasting of Irregular Multivariate Time Series (IMTS) is critical for numerous areas, such as healthcare, biomechanics, climate 
science, and astronomy. Despite existing research addressing irregularities in time series through ordinary differential equations, the challenge of modeling correlations between asynchronous IMTS remains underexplored. To bridge this gap, this study proposes Transformable Patching Graph Neural Networks (t-PatchGNN), which transforms each univariate irregular time series into a series of transformable patches encompassing a varying number of observations with uniform temporal resolution. It seamlessly facilitates local semantics capture and inter-time series correlation modeling while avoiding sequence length explosion in aligned IMTS. Building on the aligned patching outcomes, we then present time-adaptive graph neural networks to model dynamic inter-time series correlation based on a series of learned time-varying adaptive graphs. We demonstrate the remarkable superiority of t-PatchGNN on a comprehensive IMTS forecasting benchmark we build, which contains four real-world scientific datasets covering healthcare, biomechanics and climate science, and seventeen competitive baselines adapted from relevant research fields.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weijia Zhang;Chenlong Yin;Hao Liu;Xiaofang Zhou;Hui Xiong", "authorids": "~Weijia_Zhang4;~Chenlong_Yin1;~Hao_Liu17;zxf@ust.hk;~Hui_Xiong1", "gender": ";M;;;M", "homepage": "https://www.researchgate.net/profile/Weijia-Zhang-6;https://scholar.google.cz/citations?user=1mwEE7QAAAAJ&hl=En;https://raymondhliu.github.io/;;https://www.hkust-gz.edu.cn/people/hui-xiong/", "dblp": "158/5387-3;384/4116;09/3214-26;;262/1686-1.html", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.cz/citations?user=1mwEE7QAAAAJ;;;cVDF1tkAAAAJ", "orcid": "0000-0001-5085-5216;0009-0005-1877-0542;0000-0003-4271-1567;;0000-0001-6016-6465", "linkedin": ";;;;", "or_profile": "~Weijia_Zhang4;~Chenlong_Yin1;~Hao_Liu17;zxf@ust.hk;~Hui_Xiong1", "aff": "HKUST(GZ);University of Science and Technology of China;The Hong Kong University of Science and Technology (Guangzhou);;Hong Kong University of Science and Technology (Guangzhou)", "aff_domain": "connect.hkust-gz.edu.cn;ustc.edu.cn;hkust-gz.edu.cn;;hkust.edu", "position": "PhD student;Undergrad student;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nzhang2024irregular,\ntitle={Irregular Multivariate Time Series Forecasting: A Transformable Patching Graph Neural Networks Approach},\nauthor={Weijia Zhang and Chenlong Yin and Hao Liu and Xiaofang Zhou and Hui Xiong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UZlMXUGI6e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3599525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12133064463454834568&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "connect.hkust-gz.edu.cn;ustc.edu.cn;hkust-gz.edu.cn;;hkust.edu", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Hong Kong University of Science and Technology;University of Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;http://www.ustc.edu.cn", "aff_unique_abbr": "HKUST;USTC", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Initial Guessing Bias:
How Untrained Networks Favor Some Classes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33939", "id": "UZstTlLq1E", "proceeding": "https://proceedings.mlr.press/v235/francazi24a.html", "pdf": "https://openreview.net/pdf?id=UZstTlLq1E", "openreview": "https://openreview.net/forum?id=UZstTlLq1E", "author_site": "Emanuele Francazi, Aurelien Lucchi, Marco Baity-Jesi", "tldr": "", "abstract": "Understanding and controlling biasing effects in neural networks is crucial for ensuring accurate and fair model performance. In the context of classification problems, we provide a theoretical analysis demonstrating that the structure of a deep neural network (DNN) can condition the model to assign all predictions to the same class, even before the beginning of training, and in the absence of explicit biases. We prove that, besides dataset properties, the presence of this phenomenon, which we call *Initial Guessing Bias* (IGB), is influenced by model choices including dataset preprocessing methods, and architectural decisions, such as activation functions, max-pooling layers, and network depth. Our analysis of IGB provides information for architecture selection and model initialization. We also highlight theoretical consequences, such as the breakdown of node-permutation symmetry, the violation of self-averaging and the non-trivial effects that depth has on the phenomenon.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emanuele Francazi;Aurelien Lucchi;Marco Baity-Jesi", "authorids": "~Emanuele_Francazi1;~Aurelien_Lucchi1;~Marco_Baity-Jesi1", "gender": ";M;M", "homepage": "https://sites.google.com/view/emanuelefrancazi/home-page;http://people.inf.ethz.ch/alucchi/;https://mbaityje.github.io/", "dblp": "323/8096;14/5780;52/11265", "google_scholar": "hf5I62kAAAAJ;https://scholar.google.ch/citations?user=V1ONSgIAAAAJ;", "orcid": ";;0000-0002-8723-906X", "linkedin": "emanuele-francazi-a71717238;;", "or_profile": "~Emanuele_Francazi1;~Aurelien_Lucchi1;~Marco_Baity-Jesi1", "aff": "EPFL - EPF Lausanne;University of Basel;Eawag", "aff_domain": "epfl.ch;unibas.ch;eawag.ch", "position": "PhD student;Assistant Professor;Group Leader", "bibtex": "@inproceedings{\nfrancazi2024initial,\ntitle={Initial Guessing Bias: How Untrained Networks Favor Some Classes},\nauthor={Emanuele Francazi and Aurelien Lucchi and Marco Baity-Jesi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UZstTlLq1E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2754487, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10935642443946451785&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "epfl.ch;unibas.ch;eawag.ch", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "EPFL;University of Basel;Eawag", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.unibas.ch;https://www.eawag.ch", "aff_unique_abbr": "EPFL;UniBas;", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Task Groupings Regularization: Data-Free Meta-Learning with Heterogeneous Pre-trained Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33938", "id": "UcOze9EXEc", "proceeding": "https://proceedings.mlr.press/v235/wei24e.html", "pdf": 
"https://openreview.net/pdf?id=UcOze9EXEc", "openreview": "https://openreview.net/forum?id=UcOze9EXEc", "author_site": "Yongxian Wei, Zixuan Hu, Li Shen, Zhenyi Wang, Yu Li, Chun Yuan, Dacheng Tao", "tldr": "", "abstract": "Data-Free Meta-Learning (DFML) aims to derive knowledge from a collection of pre-trained models without accessing their original data, enabling the rapid adaptation to new unseen tasks. Current methods often overlook the heterogeneity among pre-trained models, which leads to performance degradation due to task conflicts. In this paper, we empirically and theoretically identify and analyze the model heterogeneity in DFML. We find that model heterogeneity introduces a heterogeneity-homogeneity trade-off, where homogeneous models reduce task conflicts but also increase the overfitting risk. Balancing this trade-off is crucial for learning shared representations across tasks. Based on our findings, we propose Task Groupings Regularization, a novel approach that benefits from model heterogeneity by grouping and aligning conflicting tasks. Specifically, we embed pre-trained models into a task space to compute dissimilarity, and group heterogeneous models together based on this measure. Then, we introduce implicit gradient regularization within each group to mitigate potential conflicts. By encouraging a gradient direction suitable for all tasks, the meta-model captures shared representations that generalize across tasks. Comprehensive experiments showcase the superiority of our approach in multiple benchmarks, effectively tackling the model heterogeneity in challenging multi-domain and multi-architecture scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongxian Wei;Zixuan Hu;Li Shen;Zhenyi Wang;Yu Li;Chun Yuan;Dacheng Tao", "authorids": "~Yongxian_Wei1;~Zixuan_Hu1;~Li_Shen1;~Zhenyi_Wang1;~Yu_Li1;~Chun_Yuan1;~Dacheng_Tao1", "gender": ";M;M;;M;M;", "homepage": ";;https://sites.google.com/site/mathshenli/home;;https://sites.google.com/view/liyu1995;https://www.sigs.tsinghua.edu.cn/fg3/105064.jhtml;", "dblp": ";332/4542;91/3680-8;;;;", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;yVhgENIAAAAJ;;8YHZx-AAAAAJ;https://scholar.google.com.hk/citations?user=fYdxi2sAAAAJ;", "orcid": ";;;;0000-0002-3664-6722;;", "linkedin": ";;;;yuli1995/;;", "or_profile": "~Yongxian_Wei1;~Zixuan_Hu1;~Li_Shen1;~Zhenyi_Wang1;~Yu_Li1;~Chun_Yuan1;~Dacheng_Tao1", "aff": ";Tsinghua University;JD Explore Academy;;Department of Computer Science and Engineering, The Chinese University of Hong Kong;Tsinghua University;", "aff_domain": ";mails.tsinghua.edu.cn;jd.com;;cse.cuhk.edu.hk;tsinghua.edu.cn;", "position": ";MS student;Researcher;;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\nwei2024task,\ntitle={Task Groupings Regularization: Data-Free Meta-Learning with Heterogeneous Pre-trained Models},\nauthor={Yongxian Wei and Zixuan Hu and Li Shen and Zhenyi Wang and Yu Li and Chun Yuan and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UcOze9EXEc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10811392117612284914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";mails.tsinghua.edu.cn;jd.com;;cse.cuhk.edu.hk;tsinghua.edu.cn;", "author_num": 7, 
"aff_unique_index": "0;1;2;0", "aff_unique_norm": "Tsinghua University;JD;Chinese University of Hong Kong", "aff_unique_dep": ";JD Explore Academy;Department of Computer Science and Engineering", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://www.cuhk.edu.hk", "aff_unique_abbr": "THU;;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China;" }, { "title": "Zero-Sum Positional Differential Games as a Framework for Robust Reinforcement Learning: Deep Q-Learning Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33937", "id": "UdXDUDxq11", "proceeding": "https://proceedings.mlr.press/v235/plaksin24a.html", "pdf": "https://openreview.net/pdf?id=UdXDUDxq11", "openreview": "https://openreview.net/forum?id=UdXDUDxq11", "author_site": "Anton Plaksin, Vitaly Kalev", "tldr": "", "abstract": "Robust Reinforcement Learning (RRL) is a promising Reinforcement Learning (RL) paradigm aimed at training robust to uncertainty or disturbances models, making them more efficient for real-world applications. Following this paradigm, uncertainty or disturbances are interpreted as actions of a second adversarial agent, and thus, the problem is reduced to seeking the agents' policies robust to any opponent's actions. This paper is the first to propose considering the RRL problems within the positional differential game theory, which helps us to obtain theoretically justified intuition to develop a centralized Q-learning approach. Namely, we prove that under Isaacs's condition (sufficiently general for real-world dynamical systems), the same Q-function can be utilized as an approximate solution of both minimax and maximin Bellman equations. 
Based on these results, we present the Isaacs Deep Q-Network algorithms and demonstrate their superiority compared to other baseline RRL and Multi-Agent RL algorithms in various environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anton Plaksin;Vitaly Kalev", "authorids": "~Anton_Plaksin1;~Vitaly_Kalev1", "gender": "M;M", "homepage": "https://orcid.org/0000-0002-1468-201X;", "dblp": "276/9860;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;", "orcid": "0000-0002-1468-201X;0000-0002-8904-2930", "linkedin": ";", "or_profile": "~Anton_Plaksin1;~Vitaly_Kalev1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nplaksin2024zerosum,\ntitle={Zero-Sum Positional Differential Games as a Framework for Robust Reinforcement Learning: Deep Q-Learning Approach},\nauthor={Anton Plaksin and Vitaly Kalev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UdXDUDxq11}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1074335, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3663410815533770007&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";", "author_num": 2 }, { "title": "ELF: Encoding Speaker-Specific Latent Speech Feature for Speech Synthesis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33936", "id": "Ug1m4P4AKf", "proceeding": "https://proceedings.mlr.press/v235/kong24c.html", "pdf": "https://openreview.net/pdf?id=Ug1m4P4AKf", "openreview": "https://openreview.net/forum?id=Ug1m4P4AKf", "author_site": "Jungil Kong, Junmo Lee, Jeongmin Kim, Beomjeong Kim, JIHOON PARK, Dohee Kong, Changheon Lee, Sangjin Kim", "tldr": "", "abstract": "In this work, we propose a novel method for modeling numerous speakers, which enables expressing the overall characteristics of speakers in detail like a trained multi-speaker model without additional training on the target speaker's dataset. Although various works with similar purposes have been actively studied, their performance has not yet reached that of trained multi-speaker models due to their fundamental limitations. To overcome previous limitations, we propose effective methods for feature learning and representing target speakers' speech characteristics by discretizing the features and conditioning them to a speech synthesis model. Our method obtained a significantly higher similarity mean opinion score (SMOS) in subjective similarity evaluation than seen speakers of a high-performance multi-speaker model, even with unseen speakers. The proposed method also outperforms a zero-shot method by significant margins. Furthermore, our method shows remarkable performance in generating new artificial speakers. In addition, we demonstrate that the encoded latent features are sufficiently informative to reconstruct an original speaker's speech completely. 
It implies that our method can be used as a general methodology to encode and reconstruct speakers' characteristics in various tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jungil Kong;Junmo Lee;Jeongmin Kim;Beomjeong Kim;JIHOON PARK;Dohee Kong;Changheon Lee;Sangjin Kim", "authorids": "~Jungil_Kong3;~Junmo_Lee1;~Jeongmin_Kim2;~Beomjeong_Kim1;~JIHOON_PARK2;~Dohee_Kong1;~Changheon_Lee1;~Sangjin_Kim1", "gender": ";M;M;M;;M;M;M", "homepage": ";;;;;;;", "dblp": "266/1599;254/3810;;;;;;", "google_scholar": "-P4Jn3kAAAAJ;;;;;https://scholar.google.com/citations?hl=ko;;", "orcid": ";;;;;;;", "linkedin": ";ljun4121/;jeongmin-kim-683a30276/;beomjeong-kim-3714a0226;dohee-kong/;;sangjin-kim-samprate1st;https://www.linkedin.com/mwlite/in/jihoon-park-084151b5", "or_profile": "~Jungil_Kong3;~Junmo_Lee1;~Jeongmin_Kim2;~Beomjeong_Kim1;~Dohee_Kong1;~Changheon_Lee1;~Sangjin_Kim1;~JIHUN_PARK1", "aff": "SK Telecom AI&CO;;SK Telecom;;SK Telecom AI&CO;SK Telecom;;", "aff_domain": "sk.com;;sk.com;;sk.com;sktelecom.com;;", "position": "Researcher;;Researcher;;Researcher;Researcher;;", "bibtex": "@inproceedings{\nkong2024elf,\ntitle={{ELF}: Encoding Speaker-Specific Latent Speech Feature for Speech Synthesis},\nauthor={Jungil Kong and Junmo Lee and Jeongmin Kim and Beomjeong Kim and JIHOON PARK and Dohee Kong and Changheon Lee and Sangjin Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ug1m4P4AKf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2036116, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16998420983970854481&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sk.com;;sk.com;;sk.com;sktelecom.com;;", "author_num": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "SK Telecom", "aff_unique_dep": "AI&CO", "aff_unique_url": "https://www.sktelecom.com", "aff_unique_abbr": "SKT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Outlier-aware Slicing for Post-Training Quantization in Vision Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33935", "id": "Uh5XN9d2J4", "proceeding": "https://proceedings.mlr.press/v235/ma24f.html", "pdf": "https://openreview.net/pdf?id=Uh5XN9d2J4", "openreview": "https://openreview.net/forum?id=Uh5XN9d2J4", "author_site": "Yuexiao Ma, Huixia Li, Xiawu Zheng, Feng Ling, Xuefeng Xiao, Rui Wang, Shilei Wen, Fei Chao, Rongrong Ji", "tldr": "", "abstract": "Post-Training Quantization (PTQ) is a vital technique for network compression and acceleration, gaining prominence as model sizes increase. This paper addresses a critical challenge in PTQ: **the severe impact of outliers on the accuracy of quantized transformer architectures.** Specifically, we introduce the concept of `reconstruction granularity' as a novel solution to this issue, which has been overlooked in previous works. Our work provides theoretical insights into the role of reconstruction granularity in mitigating the outlier problem in transformer models. This theoretical framework is supported by empirical analysis, demonstrating that varying reconstruction granularities significantly influence quantization performance. 
Our findings indicate that different architectural designs necessitate distinct optimal reconstruction granularities. For instance, the multi-stage Swin Transformer architecture benefits from finer granularity, a deviation from the trends observed in ViT and DeiT models. We further develop an algorithm for determining the optimal reconstruction granularity for various ViT models, achieving state-of-the-art (SOTA) performance in PTQ. For example, applying our method to $4$-bit quantization, the Swin-Base model achieves a Top-1 accuracy of $82.24\\%$ on the ImageNet classification task. This result surpasses the RepQ-ViT by $3.92\\%$ ($82.24\\%$ VS $78.32\\%$). Similarly, our approach elevates the ViT-Small to a Top-1 accuracy of $80.50\\%$, outperforming NoisyQuant by $3.64\\%$ ($80.50\\%$ VS $76.86\\%$). Codes are available in Supplementary Materials.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuexiao Ma;Huixia Li;Xiawu Zheng;Feng Ling;Xuefeng Xiao;Rui Wang;Shilei Wen;Fei Chao;Rongrong Ji", "authorids": "~Yuexiao_Ma1;~Huixia_Li2;~Xiawu_Zheng1;~Feng_Ling1;~Xuefeng_Xiao1;~Rui_Wang32;~Shilei_Wen1;~Fei_Chao1;~Rongrong_Ji5", "gender": "M;F;M;M;M;;M;M;M", "homepage": ";https://github.com/ReKarma/ReKarma.github.io;https://sites.google.com/view/zhengxiawu/%E9%A6%96%E9%A1%B5;https://github.com/lingffff;;;;https://cogsci.xmu.edu.cn/info/1034/1249.htm;http://mac.xmu.edu.cn/rrji-en.html", "dblp": "299/8966.html;;222/7865;;245/9547;;159/2939;118/5221-1.html;86/5681", "google_scholar": "DVszibYAAAAJ;fcqN6RQAAAAJ;jBgXocYAAAAJ;;;;zKtYrHYAAAAJ;srS6rNMAAAAJ;", "orcid": ";;0000-0002-6855-5403;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Yuexiao_Ma1;~Huixia_Li2;~Xiawu_Zheng1;~Feng_Ling1;~Xuefeng_Xiao1;~Rui_Wang32;~Shilei_Wen1;~Fei_Chao1;~Rongrong_Ji5", "aff": "Xiamen University;ByteDance;PengCheng Lab;ByteDance Inc.;ByteDance;;bytedance;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;bytedance.inc;pcl.ac.cn;bytedance.com;bytedance.com;;bytedance.com;xmu.edu.cn;xmu.edu.cn", "position": "PhD student;Researcher;Postdoc;Researcher;Researcher;;Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nma2024outlieraware,\ntitle={Outlier-aware Slicing for Post-Training Quantization in Vision Transformer},\nauthor={Yuexiao Ma and Huixia Li and Xiawu Zheng and Feng Ling and Xuefeng Xiao and Rui Wang and Shilei Wen and Fei Chao and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Uh5XN9d2J4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 498302, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6087227739484039644&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 5, "email": "xmu.edu.cn;bytedance.inc;pcl.ac.cn;bytedance.com;bytedance.com;;bytedance.com;xmu.edu.cn;xmu.edu.cn", "author_num": 9, "aff_unique_index": "0;1;2;1;1;1;0;0", "aff_unique_norm": "Xiamen University;ByteDance;Pengcheng Lab", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xmu.edu.cn;https://www.bytedance.com;", "aff_unique_abbr": "XMU;ByteDance;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On dimensionality of feature vectors in MPNNs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33934", "id": "UjDp4Wkq2V", "proceeding": 
"https://proceedings.mlr.press/v235/bravo24a.html", "pdf": "https://openreview.net/pdf?id=UjDp4Wkq2V", "openreview": "https://openreview.net/forum?id=UjDp4Wkq2V", "author_site": "C\u00e9sar Bravo, Alexander Kozachinskiy, Cristobal Rojas", "tldr": "", "abstract": "We revisit the result of Morris et al. (AAAI'19) that message-passing graphs neural networks (MPNNs) are equal in their distinguishing power to the Weisfeiler--Leman (WL) isomorphism test. Morris et al. show their result with ReLU activation function and $O(n)$-dimensional feature vectors, where $n$ is the size of the graph. Recently, by introducing randomness into the architecture, Aamand et al. (NeurIPS'22) improved this bound to $O(\\log n)$-dimensional feature vectors, although at the expense of guaranteeing perfect simulation only with high probability. In all these constructions, to guarantee equivalence to the WL test, the dimension of feature vectors in the MPNN has to increase with the size of the graphs. However, architectures used in practice have feature vectors of constant dimension. Thus, there is a gap between the guarantees provided by these results and the actual characteristics of architectures used in practice. In this paper we close this gap by showing that, for *any* non-polynomial analytic (like the sigmoid) activation function, to guarantee that MPNNs are equivalent to the WL test, feature vectors of dimension $d=1$ is all we need, independently of the size of the graphs. Our main technical insight is that for simulating multi-sets in the WL-test, it is enough to use linear independence of feature vectors over rationals instead of reals. Countability of the set of rationals together with nice properties of analytic functions allow us to carry out the simulation invariant over the iterations of the WL test without increasing the dimension of the feature vectors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "C\u00e9sar Bravo;Alexander Kozachinskiy;Cristobal Rojas", "authorids": "~C\u00e9sar_Bravo1;~Alexander_Kozachinskiy1;~Cristobal_Rojas1", "gender": "M;M;M", "homepage": "https://github.com/CesarBravo99;https://kozlachinskiy.github.io/;", "dblp": ";164/0711;83/3605", "google_scholar": ";gAKBJ7kAAAAJ;https://scholar.google.cl/citations?user=Z1aTUGsAAAAJ", "orcid": ";0000-0002-9956-9023;", "linkedin": ";;", "or_profile": "~C\u00e9sar_Bravo1;~Alexander_Kozachinskiy1;~Cristobal_Rojas1", "aff": "Pontificia Universidad Catolica de Chile;Pontificia Universidad Catolica de Chile;Pontificia Universidad Catolica de Chile", "aff_domain": "uc.cl;puc.cl;uc.cl", "position": "MS student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nbravo2024on,\ntitle={On dimensionality of feature vectors in {MPNN}s},\nauthor={C{\\'e}sar Bravo and Alexander Kozachinskiy and Cristobal Rojas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UjDp4Wkq2V}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3849473, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14171532908111943599&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "uc.cl;puc.cl;uc.cl", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pontificia Universidad Catolica de Chile", "aff_unique_dep": "", "aff_unique_url": "https://www.puc.cl", "aff_unique_abbr": "PUC", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "Chile" }, { "title": "Explain Temporal Black-Box Models via Functional Decomposition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33933", "id": "Uo3LNg5SLY", "proceeding": "https://proceedings.mlr.press/v235/yang24y.html", "pdf": "https://openreview.net/pdf?id=Uo3LNg5SLY", "openreview": "https://openreview.net/forum?id=Uo3LNg5SLY", "author_site": "Linxiao Yang, Yunze Tong, Xinyue Gu, Liang Sun", "tldr": "", "abstract": "How to explain temporal models is a significant challenge due to the inherent characteristics of time series data, notably the strong temporal dependencies and interactions between observations. Unlike ordinary tabular data, data at different time steps in time series usually interact dynamically, forming influential patterns that shape the model\u2019s predictions, rather than only acting in isolation. Existing explanatory approaches for time series often overlook these crucial temporal interactions by treating time steps as separate entities, leading to a superficial understanding of model behavior. To address this challenge, we introduce FDTempExplainer, an innovative model-agnostic explanation method based on functional decomposition, tailored to unravel the complex interplay within black-box time series models. Our approach disentangles the individual contributions from each time step, as well as the aggregated influence of their interactions, in a rigorous framework. FDTempExplainer accurately measures the strength of interactions, yielding insights that surpass those from baseline models. We demonstrate the effectiveness of our approach in a wide range of time series applications, including anomaly detection, classification, and forecasting, showing its superior performance to the state-of-the-art algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Linxiao Yang;Yunze Tong;Xinyue Gu;Liang Sun", "authorids": "~Linxiao_Yang1;~Yunze_Tong1;~Xinyue_Gu1;~Liang_Sun2", "gender": ";M;;M", "homepage": "https://github.com/DAMO-DI-ML;;;https://www.linkedin.com/in/liang-sun-a0a87621/", "dblp": "160/8447;;;18/5837-1", "google_scholar": ";https://scholar.google.com.hk/citations?hl=zh-CN;XkgI14gAAAAJ;D_cOMBgAAAAJ", "orcid": "0000-0001-9558-7163;;0000-0002-6183-2295;0009-0002-5835-7259", "linkedin": ";;;", "or_profile": "~Linxiao_Yang1;~Yunze_Tong1;~Xinyue_Gu1;~Liang_Sun2", "aff": "Alibaba Group;Zhejiang University;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;zju.edu.cn;alibaba-inc.com;alibaba-inc.com", "position": "Engineer;PhD student;Researcher;Staff Software Engineer", "bibtex": "@inproceedings{\nyang2024explain,\ntitle={Explain Temporal Black-Box Models via Functional Decomposition},\nauthor={Linxiao Yang and Yunze Tong and Xinyue Gu and Liang Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Uo3LNg5SLY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4205647, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16254981210959899197&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "alibaba-inc.com;zju.edu.cn;alibaba-inc.com;alibaba-inc.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Alibaba Group;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.alibaba.com;https://www.zju.edu.cn", "aff_unique_abbr": "Alibaba;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Do Transformer World Models Give Better Policy Gradients?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33932", "id": "Uoved2xD81", "proceeding": "https://proceedings.mlr.press/v235/ma24i.html", "pdf": "https://openreview.net/pdf?id=Uoved2xD81", "openreview": "https://openreview.net/forum?id=Uoved2xD81", "author_site": "Michel Ma, Tianwei Ni, Clement Gehring, Pierluca D'Oro, Pierre-Luc Bacon", "tldr": "", "abstract": "A natural approach for reinforcement learning is to predict future rewards by unrolling a neural network world model, and to backpropagate through the resulting computational graph to learn a control policy. However, this method often becomes impractical for long horizons, since typical world models induce hard-to-optimize loss landscapes. Transformers are known to efficiently propagate gradients over long horizons: could they be the solution to this problem? Surprisingly, we show that commonly-used transformer world models produce circuitous gradient paths, which can be detrimental to long-range policy gradients. To tackle this challenge, we propose a class of world models called Action-conditioned World Models (AWMs), designed to provide more direct routes for gradient propagation. We integrate such AWMs into a policy gradient framework that underscores the relationship between network architectures and the policy gradient updates they inherently represent. We demonstrate that AWMs can generate optimization landscapes that are easier to navigate even when compared to those from the simulator itself. 
This property allows transformer AWMs to produce better policies than competitive baselines in realistic long-horizon tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michel Ma;Tianwei Ni;Clement Gehring;Pierluca D'Oro;Pierre-Luc Bacon", "authorids": "~Michel_Ma1;~Tianwei_Ni1;~Clement_Gehring1;~Pierluca_D'Oro1;~Pierre-Luc_Bacon1", "gender": "M;M;M;M;", "homepage": ";https://twni2016.github.io/;http://people.csail.mit.edu/gehring/;https://proceduralia.github.io;", "dblp": ";230/8153;131/5247;248/8326;", "google_scholar": ";njAD34UAAAAJ;KvX7mJUAAAAJ;https://scholar.google.it/citations?user=AuVp7pkAAAAJ;", "orcid": ";;;;", "linkedin": "michel-ma/;;;;", "or_profile": "~Michel_Ma1;~Tianwei_Ni1;~Clement_Gehring1;~Pierluca_D'Oro1;~Pierre-Luc_Bacon1", "aff": "University of Montreal;Amazon Web Services;Massachusetts Institute of Technology;Universit\u00e9 de Montr\u00e9al;", "aff_domain": "umontreal.ca;amazon.com;mit.edu;umontreal.ca;", "position": "PhD student;Intern;PhD student;PhD student;", "bibtex": "@inproceedings{\nma2024do,\ntitle={Do Transformer World Models Give Better Policy Gradients?},\nauthor={Michel Ma and Tianwei Ni and Clement Gehring and Pierluca D'Oro and Pierre-Luc Bacon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Uoved2xD81}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 841339, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1494770702983213775&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "umontreal.ca;amazon.com;mit.edu;umontreal.ca;", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Montreal;Amazon;Massachusetts Institute of Technology;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": ";Amazon Web Services;;", "aff_unique_url": "https://www.umontreal.ca;https://aws.amazon.com;https://web.mit.edu;https://www.umontreal.ca", "aff_unique_abbr": "UM;AWS;MIT;UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Canada;United States" }, { "title": "Arrows of Time for Large Language Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33931", "id": "UpSe7ag34v", "proceeding": "https://proceedings.mlr.press/v235/papadopoulos24a.html", "pdf": "https://openreview.net/pdf?id=UpSe7ag34v", "openreview": "https://openreview.net/forum?id=UpSe7ag34v", "author_site": "Vassilis Papadopoulos, J\u00e9r\u00e9mie Wenger, Clement Hongler", "tldr": "", "abstract": "We study the probabilistic modeling performed by Autoregressive Large Language Models (LLMs) through the angle of time directionality, addressing a question first raised in (Shannon, 1951). For large enough models, we empirically find a time asymmetry in their ability to learn natural language: a difference in the average log-perplexity when trying to predict the next token versus when trying to predict the previous one. This difference is at the same time subtle and very consistent across various modalities (language, model size, training time, ...). Theoretically, this is surprising: from an information-theoretic point of view, there should be no such difference.
We provide a theoretical framework to explain how such an asymmetry can appear from sparsity and computational complexity considerations, and outline a number of perspectives opened by our results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vassilis Papadopoulos;J\u00e9r\u00e9mie Wenger;Cl\u00e9ment Hongler", "authorids": "~Vassilis_Papadopoulos1;~J\u00e9r\u00e9mie_Wenger1;~Cl\u00e9ment_Hongler1", "gender": ";M;", "homepage": "https://www.vassi.life;;", "dblp": ";;222/3086", "google_scholar": "4o52I2oAAAAJ;;", "orcid": "0000-0003-3270-5219;0000-0002-4477-4118;", "linkedin": ";;", "or_profile": "~Vassilis_Papadopoulos1;~J\u00e9r\u00e9mie_Wenger1;~Cl\u00e9ment_Hongler1", "aff": "EPFL - EPF Lausanne;Goldsmiths College, University of London;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;gold.ac.uk;epfl.ch", "position": "Postdoc;Lecturer;Associate Professor", "bibtex": "@inproceedings{\npapadopoulos2024arrows,\ntitle={Arrows of Time for Large Language Models},\nauthor={Vassilis Papadopoulos and J{\\'e}r{\\'e}mie Wenger and Cl{\\'e}ment Hongler},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UpSe7ag34v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1438194, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7118639906397303747&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "epfl.ch;gold.ac.uk;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "EPFL;University of London;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";Goldsmiths College;", "aff_unique_url": "https://www.epfl.ch;https://www.gold.ac.uk;https://www.epfl.ch", "aff_unique_abbr": "EPFL;Goldsmiths;EPFL", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Lausanne;London", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "Bring Your Own (Non-Robust) Algorithm to Solve Robust MDPs by Estimating The Worst Kernel", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33930", "id": "UqoG0YRfQx", "proceeding": "https://proceedings.mlr.press/v235/gadot24a.html", "pdf": "https://openreview.net/pdf?id=UqoG0YRfQx", "openreview": "https://openreview.net/forum?id=UqoG0YRfQx", "author_site": "Uri Gadot, Kaixin Wang, Navdeep Kumar, Kfir Levy, Shie Mannor", "tldr": "", "abstract": "Robust Markov Decision Processes (RMDPs) provide a framework for sequential decision-making that is robust to perturbations on the transition kernel. However, current RMDP methods are often limited to small-scale problems, hindering their use in high-dimensional domains. To bridge this gap, we present EWoK, a novel online approach to solve RMDP that Estimates the Worst transition Kernel to learn robust policies. Unlike previous works that regularize the policy or value updates, EWoK achieves robustness by simulating the worst scenarios for the agent while retaining complete flexibility in the learning process. Notably, EWoK can be applied on top of any off-the-shelf non-robust RL algorithm, enabling easy scaling to high-dimensional domains. 
Our experiments, spanning from simple Cartpole to high-dimensional DeepMind Control Suite environments, demonstrate the effectiveness and applicability of the EWoK paradigm as a practical method for learning robust policies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Uri Gadot;Kaixin Wang;Navdeep Kumar;Kfir Yehuda Levy;Shie Mannor", "authorids": "~Uri_Gadot1;~Kaixin_Wang1;~Navdeep_Kumar1;~Kfir_Yehuda_Levy1;~Shie_Mannor2", "gender": "M;M;M;M;M", "homepage": ";https://kaixin96.github.io;;http://kfiryehud.wixsite.com/kfir-y-levy;https://shie.net.technion.ac.il", "dblp": "349/0367;;;83/11388;20/1669", "google_scholar": "XXolX3MAAAAJ;https://scholar.google.com.sg/citations?hl=en;;;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ", "orcid": ";0000-0001-8237-9285;;;", "linkedin": ";;navdeepsjb/;;", "or_profile": "~Uri_Gadot1;~Kaixin_Wang1;~Navdeep_Kumar1;~Kfir_Yehuda_Levy1;~Shie_Mannor2", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "campus.technion.ac.il;campus.technion.ac.il;campus.technion.ac.il;technion.ac.il;technion.il", "position": "PhD student;Postdoc;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ngadot2024bring,\ntitle={Bring Your Own (Non-Robust) Algorithm to Solve Robust {MDP}s by Estimating The Worst Kernel},\nauthor={Uri Gadot and Kaixin Wang and Navdeep Kumar and Kfir Yehuda Levy and Shie Mannor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UqoG0YRfQx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8764797, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14917537709480111497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "email": "campus.technion.ac.il;campus.technion.ac.il;campus.technion.ac.il;technion.ac.il;technion.il", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Connect Later: Improving Fine-tuning for Robustness with Targeted Augmentations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33929", "id": "Uz4Qr40Y3C", "proceeding": "https://proceedings.mlr.press/v235/qu24b.html", "pdf": "https://openreview.net/pdf?id=Uz4Qr40Y3C", "openreview": "https://openreview.net/forum?id=Uz4Qr40Y3C", "author_site": "Helen Qu, Sang Michael Xie", "tldr": "", "abstract": "Models trained on a labeled source domain often generalize poorly when deployed on an out-of-distribution (OOD) target domain. In the domain adaptation setting where unlabeled target data is available, self-supervised pretraining (e.g., contrastive learning or masked autoencoding) is a promising method to mitigate this performance drop. 
Pretraining depends on generic data augmentations (e.g., cropping or masking) to learn representations that generalize across domains, which may not work for all distribution shifts. In this paper, we show on real-world tasks that standard fine-tuning after pretraining does not consistently improve OOD error over simply training from scratch on labeled source data. To better leverage pretraining for distribution shifts, we propose the Connect Later framework, which fine-tunes the model with targeted augmentations designed with knowledge of the shift. Intuitively, pretraining learns good representations within the source and target domains, while fine-tuning with targeted augmentations improves generalization across domains. Connect Later achieves state-of-the-art OOD accuracy while maintaining comparable or better in-distribution accuracy on 4 real-world tasks in wildlife identification (iWildCam-WILDS), tumor detection (Camelyon17-WILDS), and astronomy (AstroClassification, Redshifts).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Helen Qu;Sang Michael Xie", "authorids": "~Helen_Qu1;~Sang_Michael_Xie1", "gender": ";", "homepage": "https://www.helenqu.com;https://cs.stanford.edu/~eix/", "dblp": "317/0339;220/3987", "google_scholar": ";EBNa5IEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Helen_Qu1;~Sang_Michael_Xie1", "aff": "University of Pennsylvania;Stanford University", "aff_domain": "upenn.edu;stanford.edu", "position": "PhD student;PhD student", "bibtex": "@inproceedings{\nqu2024connect,\ntitle={Connect Later: Improving Fine-tuning for Robustness with Targeted Augmentations},\nauthor={Helen Qu and Sang Michael Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Uz4Qr40Y3C}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1879740, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7541529667349864195&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "upenn.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Pennsylvania;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.upenn.edu;https://www.stanford.edu", "aff_unique_abbr": "UPenn;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "No Free Prune: Information-Theoretic Barriers to Pruning at Initialization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33928", "id": "Uzb45nolTb", "proceeding": "https://proceedings.mlr.press/v235/kumar24a.html", "pdf": "https://openreview.net/pdf?id=Uzb45nolTb", "openreview": "https://openreview.net/forum?id=Uzb45nolTb", "author_site": "Tanishq Kumar, Kevin Luo, Mark Sellke", "tldr": "", "abstract": "The existence of \u201clottery tickets\u201d (Frankle & Carbin, 2018) at or near initialization raises the tantalizing question of whether large models are necessary in deep learning, or whether sparse networks can be quickly identified and trained without ever training the dense models that contain them. However, efforts to find these sparse subnetworks without training the dense model (\u201cpruning at initialization\u201d) have been broadly unsuccessful (Frankle et al., 2020b). 
We put forward a theoretical explanation for this, based on the model\u2019s effective parameter count, $p_\\text{eff}$, given by the sum of the number of non-zero weights in the final network and the mutual information between the sparsity mask and the data. We show the Law of Robustness of (Bubeck & Sellke, 2023) extends to sparse networks with the usual parameter count replaced by $p_\\text{eff}$, meaning a sparse neural network which robustly interpolates noisy data requires a heavily data-dependent mask. We posit that pruning during and after training outputs masks with higher mutual information than those produced by pruning at initialization. Thus two networks may have the same sparsities, but differ in effective parameter count based on how they were trained. This suggests that pruning near initialization may be infeasible and explains why lottery tickets exist, but cannot be found fast (i.e. without training the full network). Experiments on neural networks confirm that information gained during training may indeed affect model capacity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tanishq Kumar;Kevin Luo;Mark Sellke", "authorids": "~Tanishq_Kumar1;~Kevin_Luo1;~Mark_Sellke1", "gender": "M;M;M", "homepage": "https://tanishqkumar.github.io/;https://kevinzluo.github.io/;https://msellke.com/", "dblp": ";;207/8338", "google_scholar": ";;lXCP2cMAAAAJ", "orcid": ";;0000-0001-9166-8185", "linkedin": ";kevinluo0/;mark-sellke-a40b19100/", "or_profile": "~Tanishq_Kumar1;~Kevin_Luo1;~Mark_Sellke1", "aff": "Harvard College;Harvard University;Harvard University", "aff_domain": "harvard.edu;harvard.edu;harvard.edu", "position": "Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nkumar2024no,\ntitle={No Free Prune: Information-Theoretic Barriers to Pruning at Initialization},\nauthor={Tanishq Kumar and Kevin Luo and Mark Sellke},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Uzb45nolTb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 623698, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12087378534534038736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "harvard.edu;harvard.edu;harvard.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Adapt and Diffuse: Sample-adaptive Reconstruction via Latent Diffusion Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33927", "id": "V3OpGwo68Z", "proceeding": "https://proceedings.mlr.press/v235/fabian24a.html", "pdf": "https://openreview.net/pdf?id=V3OpGwo68Z", "openreview": "https://openreview.net/forum?id=V3OpGwo68Z", "author_site": "Zalan Fabian, Berk Tinaz, Mahdi Soltanolkotabi", "tldr": "", "abstract": "Inverse problems arise in a multitude of applications, where the goal is to recover a clean signal from noisy and possibly (non)linear observations. The difficulty of a reconstruction problem depends on multiple factors, such as the ground truth signal structure, the severity of the degradation and the complex interactions between the above. 
This results in natural sample-by-sample variation in the difficulty of a reconstruction problem. Our key observation is that most existing inverse problem solvers lack the ability to adapt their compute power to the difficulty of the reconstruction task, resulting in subpar performance and wasteful resource allocation. We propose a novel method, *severity encoding*, to estimate the degradation severity of corrupted signals in the latent space of an autoencoder. We show that the estimated severity has strong correlation with the true corruption level and can provide useful hints on the difficulty of reconstruction problems on a sample-by-sample basis. Furthermore, we propose a reconstruction method based on latent diffusion models that leverages the predicted degradation severities to fine-tune the reverse diffusion sampling trajectory and thus achieve sample-adaptive inference times. Our framework, Flash-Diffusion, acts as a wrapper that can be combined with any latent diffusion-based baseline solver, imbuing it with sample-adaptivity and acceleration. We perform experiments on both linear and nonlinear inverse problems and demonstrate that our technique greatly improves the performance of the baseline solver and achieves up to $10\\times$ acceleration in mean sampling speed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zalan Fabian;Berk Tinaz;Mahdi Soltanolkotabi", "authorids": "~Zalan_Fabian1;~Berk_Tinaz1;~Mahdi_Soltanolkotabi1", "gender": "M;M;M", "homepage": "https://z-fabian.github.io/;https://berktinaz.github.io/;http://www-bcf.usc.edu/~soltanol/", "dblp": "192/2874;275/8488;75/6691", "google_scholar": "5EKjsXQAAAAJ;gzIzOtAAAAAJ;narJyMAAAAAJ", "orcid": ";;", "linkedin": ";berk-tinaz/;", "or_profile": "~Zalan_Fabian1;~Berk_Tinaz1;~Mahdi_Soltanolkotabi1", "aff": "University of Southern California;Amazon;University of Southern California", "aff_domain": "usc.edu;amazon.com;usc.edu", "position": "Postdoc;Intern;Associate Professor", "bibtex": "@inproceedings{\nfabian2024adapt,\ntitle={Adapt and Diffuse: Sample-adaptive Reconstruction via Latent Diffusion Models},\nauthor={Zalan Fabian and Berk Tinaz and Mahdi Soltanolkotabi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=V3OpGwo68Z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9619827, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4530409226247603787&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "usc.edu;amazon.com;usc.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Southern California;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.usc.edu;https://www.amazon.com", "aff_unique_abbr": "USC;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Contrastive Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33926", "id": "V3ya8RlbrW", "proceeding": "https://proceedings.mlr.press/v235/wen24f.html", "pdf": "https://openreview.net/pdf?id=V3ya8RlbrW", "openreview": "https://openreview.net/forum?id=V3ya8RlbrW", "author_site": "Yichen Wen, Zhiquan Tan, Kaipeng Zheng, Chuanlong Xie, Weiran Huang", "tldr": "", "abstract": "Continual learning requires learning 
incremental tasks with dynamic data distributions. So far, it has been observed that employing a combination of contrastive loss and distillation loss for training in continual learning yields strong performance. To the best of our knowledge, however, this contrastive continual learning framework lacks convincing theoretical explanations. In this work, we fill this gap by establishing theoretical performance guarantees, which reveal how the performance of the model is bounded by training losses of previous tasks in the contrastive continual learning framework. Our theoretical explanations further support the idea that pre-training can benefit continual learning. Inspired by our theoretical analysis of these guarantees, we propose a novel contrastive continual learning algorithm called CILA, which uses adaptive distillation coefficients for different tasks. These distillation coefficients are easily computed by the ratio between average distillation losses and average contrastive losses from previous tasks. Our method shows great improvement on standard benchmarks and achieves new state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yichen Wen;Zhiquan Tan;Kaipeng Zheng;Chuanlong Xie;Weiran Huang", "authorids": "~Yichen_Wen1;~Zhiquan_Tan1;~Kaipeng_Zheng1;~Chuanlong_Xie1;~Weiran_Huang1", "gender": "M;M;M;M;M", "homepage": "https://stat.bnu.edu.cn/;;https://github.com/uiiuiiuii;;https://www.weiranhuang.com", "dblp": ";326/0177;330/7352;;170/0073-1", "google_scholar": ";;;_fgE3u8AAAAJ;AjJ2rf8AAAAJ", "orcid": ";;;;", "linkedin": ";https://www.linkedin.cn/incareer/in/ACoAAC1A8_QBFX8OlchWmVI_pNXN4zm_t6vPKCs;;;", "or_profile": "~Yichen_Wen1;~Zhiquan_Tan1;~Kaipeng_Zheng1;~Chuanlong_Xie1;~Weiran_Huang1", "aff": "Beijing Normal University;Tsinghua University;Shanghai Artificial Intelligence Laboratory;Beijing Normal University;Shanghai AI Laboratory", "aff_domain": "bnu.edu.cn;tsinghua.edu.cn;pjlab.org.cn;bnu.edu.cn;pjlab.org.cn", "position": "MS student;PhD student;Intern;Associate Professor;Consultant", "bibtex": "@inproceedings{\nwen2024provable,\ntitle={Provable Contrastive Continual Learning},\nauthor={Yichen Wen and Zhiquan Tan and Kaipeng Zheng and Chuanlong Xie and Weiran Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=V3ya8RlbrW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 616191, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1828287020864456166&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "bnu.edu.cn;tsinghua.edu.cn;pjlab.org.cn;bnu.edu.cn;pjlab.org.cn", "author_num": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Beijing Normal University;Tsinghua University;Shanghai Artificial Intelligence Laboratory;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.bnu.edu.cn;https://www.tsinghua.edu.cn;http://www.shailab.org/;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "BNU;THU;Shanghai AI Lab;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "An Embodied Generalist Agent in 3D World", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33925", "id": "V4qV08Vk6S", "proceeding": "https://proceedings.mlr.press/v235/huang24ae.html", "pdf": 
"https://openreview.net/pdf?id=V4qV08Vk6S", "openreview": "https://openreview.net/forum?id=V4qV08Vk6S", "author_site": "Jiangyong Huang, Silong Yong, Xiaojian Ma, Xiongkun Linghu, Puhao Li, Yan Wang, Qing Li, Song-Chun Zhu, Baoxiong Jia, Siyuan Huang", "tldr": "", "abstract": "Leveraging massive knowledge from large language models (LLMs), recent machine learning models show notable successes in general-purpose task solving in diverse domains such as computer vision and robotics. However, several significant challenges remain: (i) most of these models rely on 2D images yet exhibit a limited capacity for 3D input; (ii) these models rarely explore the tasks inherently defined in 3D world, e.g., 3D grounding, embodied reasoning and acting. We argue these limitations significantly hinder current models from performing real-world tasks and approaching general intelligence. To this end, we introduce LEO, an embodied multi-modal generalist agent that excels in perceiving, grounding, reasoning, planning, and acting in the 3D world. LEO is trained with a unified task interface, model architecture, and objective in two stages: (i) 3D vision-language (VL) alignment and (ii) 3D vision-language-action (VLA) instruction tuning. We collect large-scale datasets comprising diverse object-level and scene-level tasks, which require considerable understanding of and interaction with the 3D world. Moreover, we meticulously design an LLM-assisted pipeline to produce high-quality 3D VL data. Through extensive experiments, we demonstrate LEO's remarkable proficiency across a wide spectrum of tasks, including 3D captioning, question answering, embodied reasoning, navigation and manipulation. Our ablative studies and scaling analyses further provide valuable insights for developing future embodied generalist agents. 
Code and data are available on [project page](https://embodied-generalist.github.io/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiangyong Huang;Silong Yong;Xiaojian Ma;Xiongkun Linghu;Puhao Li;Yan Wang;Qing Li;Song-Chun Zhu;Baoxiong Jia;Siyuan Huang", "authorids": "~Jiangyong_Huang1;~Silong_Yong1;~Xiaojian_Ma1;~Xiongkun_Linghu1;~Puhao_Li1;~Yan_Wang24;~Qing_Li1;~Song-Chun_Zhu1;~Baoxiong_Jia1;~Siyuan_Huang2", "gender": ";M;;;M;M;M;M;M;M", "homepage": "https://huangjy-pku.github.io/;https://github.com/SilongYong;;;https://xiaoyao-li.github.io/;https://github.com/jetpackfirstme;http://liqing-ustc.github.io/;https://zhusongchun.net/;https://buzz-beater.github.io/;https://siyuanhuang.com/", "dblp": "334/4572;;;;330/4121.html;;181/2689-3;10/10313;206/8738;62/885-1", "google_scholar": ";EitVAcwAAAAJ;;;https://scholar.google.at/citations?user=HTsO18AAAAAJ;;iwdFZBEAAAAJ;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ;qIBUK6sAAAAJ;1NN7Ee8AAAAJ", "orcid": "0000-0001-9125-6893;;;0000-0002-4393-4386;0009-0003-2696-9346;;;;0000-0002-4968-3290;", "linkedin": ";;;;;;;;baoxiong-jia-2b6094122?trk=public_post-text;", "or_profile": "~Jiangyong_Huang1;~Silong_Yong1;~Xiaojian_Ma1;~Xiongkun_Linghu1;~Puhao_Li1;~Yan_Wang24;~Qing_Li1;~Song-Chun_Zhu1;~Baoxiong_Jia1;~Siyuan_Huang2", "aff": "Peking University;Carnegie Mellon University;;Beijing Institute for General Artificial Intelligence;Tsinghua University;Beijing Institute for General Artificial Intelligence;Beijing Institute for General Artificial Intelligence (BIGAI);Peking University;Beijing Institute for General Artificial Intelligence;Beijing Institute for General Artificial Intelligence", "aff_domain": "pku.edu.cn;andrew.cmu.edu;;bigai.ai;tsinghua.edu.cn;bigai.ai;bigai.ai;pku.edu.cn;bigai.ai;bigai.ai", "position": "PhD student;MS student;;Researcher;PhD student;Researcher;Researcher;Full Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nhuang2024an,\ntitle={An Embodied Generalist Agent in 3D World},\nauthor={Jiangyong Huang and Silong Yong and Xiaojian Ma and Xiongkun Linghu and Puhao Li and Yan Wang and Qing Li and Song-Chun Zhu and Baoxiong Jia and Siyuan Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=V4qV08Vk6S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1275041, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 143, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8869821326961875122&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;andrew.cmu.edu;;bigai.ai;tsinghua.edu.cn;bigai.ai;bigai.ai;pku.edu.cn;bigai.ai;bigai.ai", "author_num": 10, "aff_unique_index": "0;1;2;3;2;2;0;2;2", "aff_unique_norm": "Peking University;Carnegie Mellon University;Beijing Institute for General Artificial Intelligence;Tsinghua University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.cmu.edu;http://www.bigaiai.org/;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Peking U;CMU;BIGAI;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "A Neural-Preconditioned Poisson Solver for Mixed Dirichlet and Neumann Boundary Conditions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33924", "id": "VAKkoJjVpn", "proceeding": "https://proceedings.mlr.press/v235/lan24a.html", 
"pdf": "https://openreview.net/pdf?id=VAKkoJjVpn", "openreview": "https://openreview.net/forum?id=VAKkoJjVpn", "author_site": "Kai Weixian Lan, Elias Gueidon, Ayano Kaneda, Julian Panetta, Joseph Teran", "tldr": "", "abstract": "We introduce a neural-preconditioned iterative solver for Poisson equations with mixed boundary conditions. Typical Poisson discretizations yield large, ill-conditioned linear systems. Iterative solvers can be effective for these problems, but only when equipped with powerful preconditioners. Unfortunately, effective preconditioners like multigrid require costly setup phases that must be re-executed every time domain shapes or boundary conditions change, forming a severe bottleneck for problems with evolving boundaries. In contrast, we present a neural preconditioner trained to efficiently approximate the inverse of the discrete Laplacian in the presence of such changes. Our approach generalizes to domain shapes, boundary conditions, and grid sizes outside the training set. The key to our preconditioner's success is a novel, lightweight neural network architecture featuring spatially varying convolution kernels and supporting fast inference. We demonstrate that our solver outperforms state-of-the-art methods like algebraic multigrid as well as recently proposed neural preconditioners on challenging test cases arising from incompressible fluid simulations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Weixian Lan;Elias Gueidon;Ayano Kaneda;Julian Panetta;Joseph Teran", "authorids": "~Kai_Weixian_Lan1;~Elias_Gueidon1;~Ayano_Kaneda1;~Julian_Panetta1;~Joseph_Teran1", "gender": "M;F;M;M;", "homepage": ";https://sites.google.com/view/pandako;http://julianpanetta.com;https://www.math.ucla.edu/~jteran/;https://www.weixianlan.com", "dblp": ";;;;358/4635", "google_scholar": ";YCs12mQAAAAJ;1rQ_OTEAAAAJ;ksjNjEwAAAAJ;1NVO-OIAAAAJ", "orcid": ";;;;", "linkedin": "egueidon/;kaneda-ayano-b75a49115/;;;", "or_profile": "~Elias_Gueidon1;~Ayano_Kaneda1;~Julian_Panetta1;~Joseph_Teran1;~Weixian_Lan1", "aff": "University of California, Los Angeles;Waseda University;University of California, Davis;;University of California, Davis", "aff_domain": "ucla.edu;waseda.jp;ucdavis.edu;;ucdavis.edu", "position": "PhD student;PhD student;Assistant Professor;;PhD student", "bibtex": "@inproceedings{\nlan2024a,\ntitle={A Neural-Preconditioned Poisson Solver for Mixed Dirichlet and Neumann Boundary Conditions},\nauthor={Kai Weixian Lan and Elias Gueidon and Ayano Kaneda and Julian Panetta and Joseph Teran},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VAKkoJjVpn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9310242, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MmQ3tzegVbMJ:scholar.google.com/&scioq=A+Neural-Preconditioned+Poisson+Solver+for+Mixed+Dirichlet+and+Neumann+Boundary+Conditions&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "ucla.edu;waseda.jp;ucdavis.edu;;ucdavis.edu", "author_num": 5, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of California, Los Angeles;Waseda University;University of California, Davis", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucla.edu;https://www.waseda.jp/top;https://www.ucdavis.edu", "aff_unique_abbr": "UCLA;Waseda;UC Davis", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Los 
Angeles;;Davis", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Japan" }, { "title": "On the Convergence of Projected Bures-Wasserstein Gradient Descent under Euclidean Strong Convexity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33923", "id": "VDgfJnOEMV", "proceeding": "https://proceedings.mlr.press/v235/fan24b.html", "pdf": "https://openreview.net/pdf?id=VDgfJnOEMV", "openreview": "https://openreview.net/forum?id=VDgfJnOEMV", "author_site": "Junyi FAN, Yuxuan Han, Zijian Liu, Jian-Feng Cai, Yang Wang, Zhengyuan Zhou", "tldr": "", "abstract": "The Bures-Wasserstein (BW) gradient descent method has gained considerable attention in various domains, including Gaussian barycenter, matrix recovery and variational inference problems, due to its alignment with the Wasserstein geometry of normal distributions. Despite its popularity, existing convergence analysis are often contingent upon specific loss functions, and the exploration of constrained settings within this framework remains limited. In this work, we make an attempt to bridge this gap by providing a general convergence rate guarantee for BW gradient descent when the Euclidean strong convexity of the loss and the constraints is assumed. In an effort to advance practical implementations, we also derive a closed-form solution for the projection onto BW distance-constrained sets, which enables the fast implementation of projected BW gradient descent for problems that arise in the constrained barycenter and distributionally robust optimization literature. Experimental results demonstrate significant improvements in computational efficiency and convergence speed, underscoring the efficacy of our method in practical scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junyi FAN;Yuxuan Han;Zijian Liu;Jian-Feng Cai;Yang Wang;Zhengyuan Zhou", "authorids": "~Junyi_FAN1;~Yuxuan_Han1;~Zijian_Liu1;~Jian-Feng_Cai4;~Yang_Wang25;~Zhengyuan_Zhou2", "gender": ";;;;M;M", "homepage": ";;;;http://www.math.ust.hk/~yangwang;https://scholar.google.com/citations?user=hiGI9v0AAAAJ&hl=en", "dblp": ";;;;;125/5270", "google_scholar": ";;;;;", "orcid": "0000-0001-5516-7471;;;;0000-0002-8903-2388;", "linkedin": ";;;;;", "or_profile": "~Junyi_FAN1;~Yuxuan_Han1;~Zijian_Liu1;~Jian-Feng_Cai4;~Yang_Wang25;~Zhengyuan_Zhou2", "aff": "Hong Kong University of Science and Technology;;;;Hong Kong University of Science and Technology;New York University", "aff_domain": "hkust.edu;;;;hkust.edu.hk;nyu.edu", "position": "PhD student;;;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nfan2024on,\ntitle={On the Convergence of Projected Bures-Wasserstein Gradient Descent under Euclidean Strong Convexity},\nauthor={Junyi FAN and Yuxuan Han and Zijian Liu and Jian-Feng Cai and Yang Wang and Zhengyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VDgfJnOEMV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 669976, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MAojLe3RadQJ:scholar.google.com/&scioq=On+the+Convergence+of+Projected+Bures-Wasserstein+Gradient+Descent+under+Euclidean+Strong+Convexity&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "hkust.edu;;;;hkust.edu.hk;nyu.edu", "author_num": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "Hong Kong 
University of Science and Technology;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.nyu.edu", "aff_unique_abbr": "HKUST;NYU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Stealing part of a production language model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33922", "id": "VE3yWXt3KB", "proceeding": "https://proceedings.mlr.press/v235/carlini24a.html", "pdf": "https://openreview.net/pdf?id=VE3yWXt3KB", "openreview": "https://openreview.net/forum?id=VE3yWXt3KB", "author_site": "Nicholas Carlini, Daniel Paleka, Krishnamurthy Dvijotham, Thomas Steinke, Jonathan Hayase, A. Feder Cooper, Katherine Lee, Matthew Jagielski, Milad Nasr, Arthur Conmy, Eric Wallace, David Rolnick, Florian Tramer", "tldr": "", "abstract": "We introduce the first model-stealing attack that extracts precise, nontrivial information from black-box production language models like OpenAI's ChatGPT or Google's PaLM-2. Specifically, our attack recovers the embedding projection layer (up to symmetries) of a transformer model, given typical API access. For under $20 USD, our attack extracts the entire projection matrix of OpenAI's Ada and Babbage language models. We thereby confirm, for the first time, that these black-box models have a hidden dimension of 1024 and 2048, respectively. We also recover the exact hidden dimension size of the GPT-3.5-turbo model, and estimate it would cost under \\\\$2,000 in queries to recover the entire projection matrix. We conclude with potential defenses and mitigations, and discuss the implications of possible future work that could extend our attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicholas Carlini;Daniel Paleka;Krishnamurthy Dj Dvijotham;Thomas Steinke;Jonathan Hayase;A. 
Feder Cooper;Katherine Lee;Matthew Jagielski;Milad Nasr;Arthur Conmy;Eric Wallace;David Rolnick;Florian Tram\u00e8r", "authorids": "~Nicholas_Carlini1;~Daniel_Paleka1;~Krishnamurthy_Dj_Dvijotham1;~Thomas_Steinke2;~Jonathan_Hayase2;~A._Feder_Cooper1;~Katherine_Lee1;~Matthew_Jagielski1;~Milad_Nasr2;~Arthur_Conmy1;~Eric_Wallace1;~David_Rolnick1;~Florian_Tram\u00e8r1", "gender": ";;M;M;;F;M;;M;M;M;M;M", "homepage": "http://nicholas.carlini.com;https://danielpaleka.com/;http://www.thomas-steinke.net/;https://jhayase.github.io/;https://afedercooper.info;https://katelee168.github.io/;https://jagielski.github.io/;https://people.cs.umass.edu/~milad/;https://arthurconmy.github.io/;http://www.ericswallace.com/;http://www.davidrolnick.com/;http://floriantramer.com;http://dvij.github.io", "dblp": "145/1806;324/2779;https://dblp.uni-trier.de/pid/73/4025-2.html;244/9599;260/0514;115/5082.html;218/5156;;;218/6165;37/10718;158/7224;16/8758", "google_scholar": ";;kwnwhrgAAAAJ;Zw-l1d8AAAAJ;https://scholar.google.ch/citations?hl=en;bjdB4K8AAAAJ;_8rw_GMAAAAJ;k6-nvDAAAAAJ;;SgST3LkAAAAJ;P_luG3cAAAAJ;https://scholar.google.ch/citations?user=ijH0-a8AAAAJ;BUtloecAAAAJ", "orcid": ";;;0000-0002-3757-6586;0000-0002-4892-681X;;;;;;;;", "linkedin": ";;thomas-steinke-2841248/;jonathan-hayase-5ab849128;;;;;;;;;", "or_profile": "~Nicholas_Carlini1;~Daniel_Paleka1;~Thomas_Steinke2;~Jonathan_Hayase2;~A._Feder_Cooper1;~Katherine_Lee1;~Matthew_Jagielski1;~Milad_Nasr2;~Arthur_Conmy1;~Eric_Wallace1;~David_Rolnick1;~Florian_Tramer1;~Krishnamurthy_Dvijotham2", "aff": "Google;Department of Computer Science, ETHZ - ETH Zurich;Google;University of Washington;Cornell University;Google;Google;Google;Google DeepMind;University of California, Berkeley;McGill University;ETHZ - ETH Zurich;Google DeepMind", "aff_domain": "google.com;inf.ethz.ch;google.com;washington.edu;cornell.edu;google.com;google.com;google.com;google.com;berkeley.edu;cs.mcgill.ca;ethz.ch;google.com", "position": "Researcher;PhD student;Research Scientist;PhD student;PhD student;Researcher;Researcher;Researcher;Researcher;PhD student;Assistant Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\ncarlini2024stealing,\ntitle={Stealing part of a production language model},\nauthor={Nicholas Carlini and Daniel Paleka and Krishnamurthy Dj Dvijotham and Thomas Steinke and Jonathan Hayase and A. 
Feder Cooper and Katherine Lee and Matthew Jagielski and Milad Nasr and Arthur Conmy and Eric Wallace and David Rolnick and Florian Tram{\\`e}r},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VE3yWXt3KB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 775928, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17318026270501704804&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "google.com;inf.ethz.ch;google.com;washington.edu;cornell.edu;google.com;google.com;google.com;google.com;berkeley.edu;cs.mcgill.ca;ethz.ch;google.com", "author_num": 13, "aff_unique_index": "0;1;0;2;3;0;0;0;0;4;5;1;0", "aff_unique_norm": "Google;ETH Zurich;University of Washington;Cornell University;University of California, Berkeley;McGill University", "aff_unique_dep": "Google;Department of Computer Science;;;;", "aff_unique_url": "https://www.google.com;https://www.ethz.ch;https://www.washington.edu;https://www.cornell.edu;https://www.berkeley.edu;https://www.mcgill.ca", "aff_unique_abbr": "Google;ETHZ;UW;Cornell;UC Berkeley;McGill", "aff_campus_unique_index": "0;1;0;0;0;0;3", "aff_campus_unique": "Mountain View;Zurich;;Berkeley", "aff_country_unique_index": "0;1;0;0;0;0;0;0;2;0;3;1;2", "aff_country_unique": "United States;Switzerland;United Kingdom;Canada" }, { "title": "Slow and Steady Wins the Race: Maintaining Plasticity with Hare and Tortoise Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33921", "id": "VF177x7Syw", "proceeding": "https://proceedings.mlr.press/v235/lee24d.html", "pdf": "https://openreview.net/pdf?id=VF177x7Syw", "openreview": "https://openreview.net/forum?id=VF177x7Syw", "author_site": "Hojoon Lee, Hyeonseo Cho, Hyunseung Kim, Donghu Kim, Dugki Min, Jaegul Choo, Clare Lyle", "tldr": "", "abstract": "This study investigates the loss of generalization ability in neural networks, revisiting warm-starting experiments from Ash & Adams. Our empirical analysis reveals that common methods designed to enhance plasticity by maintaining trainability provide limited benefits to generalization. While reinitializing the network can be effective, it also risks losing valuable prior knowledge. To this end, we introduce the Hare & Tortoise, inspired by the brain's complementary learning system. Hare & Tortoise consists of two components: the Hare network, which rapidly adapts to new information analogously to the hippocampus, and the Tortoise network, which gradually integrates knowledge akin to the neocortex. By periodically reinitializing the Hare network to the Tortoise's weights, our method preserves plasticity while retaining general knowledge. Hare & Tortoise can effectively maintain the network's ability to generalize, which improves advanced reinforcement learning algorithms on the Atari-100k benchmark. 
The code is available at https://github.com/dojeon-ai/hare-tortoise.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hojoon Lee;Hyeonseo Cho;Hyunseung Kim;Donghu Kim;Dugki Min;Jaegul Choo;Clare Lyle", "authorids": "~Hojoon_Lee1;~Hyeonseo_Cho1;~Hyunseung_Kim1;~Donghu_Kim1;~Dugki_Min1;~Jaegul_Choo1;~Clare_Lyle1", "gender": "M;;M;M;M;M;", "homepage": "https://joonleesky.github.io/;;;https://i-am-proto.github.io;http://dms.konkuk.ac.kr;https://sites.google.com/site/jaegulchoo/;", "dblp": ";;244/0949;379/3468;;07/2074;192/1910", "google_scholar": ";;https://scholar.google.com/citations?view_op=list_works;LcYjQYcAAAAJ;;GHJYsLEAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;donghu-kim-3b57972b6/;;;", "or_profile": "~Hojoon_Lee1;~Hyeonseo_Cho1;~Hyunseung_Kim1;~Donghu_Kim1;~Dugki_Min1;~Jaegul_Choo1;~Clare_Lyle1", "aff": "Sony AI;;Korea Advanced Institute of Science & Technology;Korea University;Konkuk University;Korea Advanced Institute of Science & Technology;Google DeepMind", "aff_domain": "sony.com;;kaist.ac.kr;korea.ac.kr;konkuk.ac.kr;kaist.ac.kr;google.com", "position": "Intern;;PhD student;Undergrad student;Full Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nlee2024slow,\ntitle={Slow and Steady Wins the Race: Maintaining Plasticity with Hare and Tortoise Networks},\nauthor={Hojoon Lee and Hyeonseo Cho and Hyunseung Kim and Donghu Kim and Dugki Min and Jaegul Choo and Clare Lyle},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VF177x7Syw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1416343, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7153219641892626390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "sony.com;;kaist.ac.kr;korea.ac.kr;konkuk.ac.kr;kaist.ac.kr;google.com", "author_num": 7, "aff_unique_index": "0;1;2;3;1;4", "aff_unique_norm": "Sony;Korea Advanced Institute of Science and Technology;Korea University;Konkuk University;Google", "aff_unique_dep": "Sony AI;;;;Google DeepMind", "aff_unique_url": "https://www.sony.com;https://www.kaist.ac.kr;https://www.korea.ac.kr;http://www.konkuk.edu;https://deepmind.com", "aff_unique_abbr": "Sony AI;KAIST;KU;KU;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;2", "aff_country_unique": "Japan;South Korea;United Kingdom" }, { "title": "Variance-reduced Zeroth-Order Methods for Fine-Tuning Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33920", "id": "VHO4nE7v41", "proceeding": "https://proceedings.mlr.press/v235/gautam24a.html", "pdf": "https://openreview.net/pdf?id=VHO4nE7v41", "openreview": "https://openreview.net/forum?id=VHO4nE7v41", "author_site": "Tanmay Gautam, Youngsuk Park, Hao Zhou, Parameswaran Raman, Wooseok Ha", "tldr": "", "abstract": "Fine-tuning language models (LMs) has demonstrated success in a wide array of downstream tasks. However, as LMs are scaled up, the memory requirements for backpropagation become prohibitively high. Zeroth-order (ZO) optimization methods can leverage memory-efficient forward passes to estimate gradients. More recently, MeZO, an adaptation of ZO-SGD, has been shown to consistently outperform zero-shot and in-context learning when combined with suitable task prompts. 
In this work, we couple ZO methods with variance reduction techniques to enhance stability and convergence for inference-based LM fine-tuning. We introduce Memory-Efficient Zeroth-Order Stochastic Variance-Reduced Gradient (MeZO-SVRG) and demonstrate its efficacy across multiple LM fine-tuning tasks, eliminating the reliance on task-specific prompts. Evaluated across a range of both masked and autoregressive LMs on benchmark GLUE tasks, MeZO-SVRG outperforms MeZO with up to 20% increase in test accuracies in both full- and partial-parameter fine-tuning settings. MeZO-SVRG benefits from reduced computation time as it often surpasses MeZO's peak test accuracy with a $2\\times$ reduction in GPU-hours. MeZO-SVRG significantly reduces the required memory footprint compared to first-order SGD, i.e. by $2\\times$ for autoregressive models. Our experiments highlight that MeZO-SVRG's memory savings progressively improve compared to SGD with larger batch sizes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tanmay Gautam;Youngsuk Park;Hao Zhou;Parameswaran Raman;Wooseok Ha", "authorids": "~Tanmay_Gautam1;~Youngsuk_Park1;~Hao_Zhou12;~Parameswaran_Raman1;~Wooseok_Ha1", "gender": "M;M;M;M;M", "homepage": "https://people.eecs.berkeley.edu/~tgautam23/;https://youngsuk0723.github.io/;;https://paramsraman.github.io/;https://haywse.github.io/", "dblp": ";88/11095;;142/2573;178/3275", "google_scholar": "oKeqnc0AAAAJ;jWROvQ0AAAAJ;8vaGcAcAAAAJ;amJUMFEAAAAJ;", "orcid": ";0000-0002-0970-9214;;;", "linkedin": ";y-park;hao-zhou-55697aa4/;;", "or_profile": "~Tanmay_Gautam1;~Youngsuk_Park1;~Hao_Zhou12;~Parameswaran_Raman1;~Wooseok_Ha1", "aff": "University of California, Berkeley;Amazon, AWS AI Labs;Amazon;Amazon;AWS AI Labs", "aff_domain": "berkeley.edu;amazon.com;amazon.com;amazon.com;amazon.com", "position": "MS student;Research;Researcher;Applied Scientist;Researcher", "bibtex": "@inproceedings{\ngautam2024variancereduced,\ntitle={Variance-reduced Zeroth-Order Methods for Fine-Tuning Language Models},\nauthor={Tanmay Gautam and Youngsuk Park and Hao Zhou and Parameswaran Raman and Wooseok Ha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VHO4nE7v41}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1565657, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12121127406468423574&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "berkeley.edu;amazon.com;amazon.com;amazon.com;amazon.com", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "University of California, Berkeley;Amazon", "aff_unique_dep": ";AWS AI Labs", "aff_unique_url": "https://www.berkeley.edu;https://www.amazon.com", "aff_unique_abbr": "UC Berkeley;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Mobile Attention: Mobile-Friendly Linear-Attention for Vision Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33919", "id": "VHtIDVaOKC", "proceeding": "https://proceedings.mlr.press/v235/yao24c.html", "pdf": "https://openreview.net/pdf?id=VHtIDVaOKC", "openreview": "https://openreview.net/forum?id=VHtIDVaOKC", "author_site": "Zhiyu Yao, Jian Wang, Haixu Wu, Jingdong Wang, Mingsheng Long", "tldr": "", "abstract": "Vision Transformers (ViTs) 
excel in computer vision tasks due to their ability to capture global context among tokens. However, their quadratic complexity $\\mathcal{O}(N^2D)$ in terms of token number $N$ and feature dimension $D$ limits practical use on mobile devices, necessitating more mobile-friendly ViTs with reduced latency. Multi-head linear-attention is emerging as a promising alternative with linear complexity $\\mathcal{O}(NDd)$, where $d$ is the per-head dimension. Still, more compute is needed as $d$ gets large for model accuracy. Reducing $d$ improves mobile friendliness at the expense of excessive small heads weak at learning valuable subspaces, ultimately impeding model capability. To overcome this efficiency-capability dilemma, we propose a novel Mobile-Attention design with a head-competition mechanism empowered by information flow, which prevents overemphasis on less important subspaces upon trivial heads while preserving essential subspaces to ensure Transformer's capability. It enables linear-time complexity on mobile devices by supporting a small per-head dimension $d$ for mobile efficiency. By replacing the standard attention of ViTs with Mobile-Attention, our optimized ViTs achieved enhanced model capacity and competitive performance in a range of computer vision tasks. Specifically, we have achieved remarkable reductions in latency on the iPhone 12. Code is available at https://github.com/thuml/MobileAttention.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyu Yao;Jian Wang;Haixu Wu;Jingdong Wang;Mingsheng Long", "authorids": "~Zhiyu_Yao2;~Jian_Wang11;~Haixu_Wu1;~Jingdong_Wang1;~Mingsheng_Long5", "gender": "M;M;M;M;M", "homepage": ";;;https://jingdongwang2017.github.io/;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": "230/4609;39/449-66;286/8115;49/3441;74/9023", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=hDPRTekAAAAJ;oLL_x0wAAAAJ;z5SPCmgAAAAJ;_MjXpXkAAAAJ", "orcid": "0000-0002-0887-8809;;;0000-0002-4888-4445;0000-0002-5412-9120", "linkedin": ";;;;", "or_profile": "~Zhiyu_Yao2;~Jian_Wang11;~Haixu_Wu1;~Jingdong_Wang1;~Mingsheng_Long2", "aff": "Tsinghua University;ByteDance Inc.;Tsinghua University;Baidu;Tsinghua University", "aff_domain": "tsinghua.edu.cn;bytedance.com;tsinghua.edu.cn;baidu.com;tsinghua.edu.cn", "position": "PhD student;Instructor;PhD student;Chief Scientist for Computer Vision;Associate Professor", "bibtex": "@inproceedings{\nyao2024mobile,\ntitle={Mobile Attention: Mobile-Friendly Linear-Attention for Vision Transformers},\nauthor={Zhiyu Yao and Jian Wang and Haixu Wu and Jingdong Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VHtIDVaOKC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1366761, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7791872035499382682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;bytedance.com;tsinghua.edu.cn;baidu.com;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Tsinghua University;ByteDance;Baidu", "aff_unique_dep": ";;Baidu, Inc.", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.bytedance.com;https://www.baidu.com", "aff_unique_abbr": "THU;ByteDance;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Second-Order Uncertainty Quantification: A Distance-Based Approach", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33918", "id": "VJjjNrUi8j", "proceeding": "https://proceedings.mlr.press/v235/sale24a.html", "pdf": "https://openreview.net/pdf?id=VJjjNrUi8j", "openreview": "https://openreview.net/forum?id=VJjjNrUi8j", "author_site": "Yusuf Sale, Viktor Bengs, Michele Caprio, Eyke H\u00fcllermeier", "tldr": "", "abstract": "In the past couple of years, various approaches to representing and quantifying different types of predictive uncertainty in machine learning, notably in the setting of classification, have been proposed on the basis of second-order probability distributions, i.e., predictions in the form of distributions on probability distributions. A completely conclusive solution has not yet been found, however, as shown by recent criticisms of commonly used uncertainty measures associated with second-order distributions, identifying undesirable theoretical properties of these measures. In light of these criticisms, we propose a set of formal criteria that meaningful uncertainty measures for predictive uncertainty based on second-order distributions should obey. Moreover, we provide a general framework for developing uncertainty measures to account for these criteria, and offer an instantiation based on the Wasserstein distance, for which we prove that all criteria are satisfied.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yusuf Sale;Viktor Bengs;Michele Caprio;Eyke H\u00fcllermeier", "authorids": "~Yusuf_Sale1;~Viktor_Bengs1;~Michele_Caprio1;~Eyke_H\u00fcllermeier1", "gender": "M;M;M;M", "homepage": ";https://www.kiml.ifi.lmu.de/;https://mc6034.wixsite.com/caprio;https://cs.uni-paderborn.de/index.php?id=60202", "dblp": ";244/9484;322/9067;h/EykeHullermeier", "google_scholar": "https://scholar.google.de/citations?user=yn3w9eoAAAAJ;J1eEtpwAAAAJ;6rngqVgAAAAJ;https://scholar.google.de/citations?user=usVJeNN3xFAC", "orcid": ";0000-0001-6988-6186;0000-0002-7569-097X;0000-0002-9944-4108", "linkedin": ";;michele-caprio-5866b162/;", "or_profile": "~Yusuf_Sale1;~Viktor_Bengs1;~Michele_Caprio1;~Eyke_H\u00fcllermeier1", "aff": "Institute of Computer Science, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;University of Pennsylvania;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", "aff_domain": "ifi.lmu.de;lmu.de;seas.upenn.edu;lmu.de", "position": "PhD student;Postdoc;Postdoc;Full Professor", "bibtex": "@inproceedings{\nsale2024secondorder,\ntitle={Second-Order Uncertainty Quantification: A Distance-Based Approach},\nauthor={Yusuf Sale and Viktor Bengs and Michele Caprio and Eyke H{\\\"u}llermeier},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VJjjNrUi8j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2090826, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12935211635031214816&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": "ifi.lmu.de;lmu.de;seas.upenn.edu;lmu.de", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;University of Pennsylvania", "aff_unique_dep": "Institute of Computer Science;", "aff_unique_url": 
"https://www.uni-muenchen.de;https://www.upenn.edu", "aff_unique_abbr": "LMU M\u00fcnchen;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Rate-Optimal Policy Optimization for Linear Markov Decision Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33917", "id": "VJwsDwuiuH", "proceeding": "https://proceedings.mlr.press/v235/sherman24a.html", "pdf": "https://openreview.net/pdf?id=VJwsDwuiuH", "openreview": "https://openreview.net/forum?id=VJwsDwuiuH", "author_site": "Uri Sherman, Alon Cohen, Tomer Koren, Yishay Mansour", "tldr": "", "abstract": "We study regret minimization in online episodic linear Markov Decision Processes, and propose a policy optimization algorithm that is computationally efficient, and obtains rate optimal $\\widetilde O (\\sqrt K)$ regret where $K$ denotes the number of episodes. Our work is the first to establish the optimal rate (in terms of $K$) of convergence in the stochastic setting with bandit feedback using a policy optimization based approach, and the first to establish the optimal rate in the adversarial setup with full information feedback, for which no algorithm with an optimal rate guarantee was previously known.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Uri Sherman;Alon Cohen;Tomer Koren;Yishay Mansour", "authorids": "~Uri_Sherman1;~Alon_Cohen1;~Tomer_Koren1;~Yishay_Mansour2", "gender": "M;M;M;M", "homepage": "https://urisherman.github.io/;https://sites.google.com/site/aloncohentechnion/;https://tomerkoren.github.io;https://www.cs.tau.ac.il/~mansour/", "dblp": "284/9712;133/2021;12/10044;m/YishayMansour", "google_scholar": "https://scholar.google.co.il/citations?hl=en;shoYR_AAAAAJ;wGG1voYAAAAJ;OEJUgwkAAAAJ", "orcid": ";;;0000-0001-6891-2645", "linkedin": "uri-sherman-a1b85924/;;;", "or_profile": "~Uri_Sherman1;~Alon_Cohen1;~Tomer_Koren1;~Yishay_Mansour1", "aff": "Meta Facebook;Google;Tel Aviv University;School of Computer Science, Tel Aviv University", "aff_domain": "meta.com;google.com;tau.ac.il;cs.tau.ac.il", "position": "Intern;Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nsherman2024rateoptimal,\ntitle={Rate-Optimal Policy Optimization for Linear Markov Decision Processes},\nauthor={Uri Sherman and Alon Cohen and Tomer Koren and Yishay Mansour},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VJwsDwuiuH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 401769, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8789807320973734802&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "meta.com;google.com;tau.ac.il;cs.tau.ac.il", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Meta;Google;Tel Aviv University", "aff_unique_dep": "Meta Platforms, Inc.;Google;", "aff_unique_url": "https://meta.com;https://www.google.com;https://www.tau.ac.il", "aff_unique_abbr": "Meta;Google;TAU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Tel Aviv", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Israel" }, { "title": "Positional Knowledge is All You Need: Position-induced Transformer (PiT) for Operator Learning", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33916", "id": "VOcsmIBiXE", "proceeding": "https://proceedings.mlr.press/v235/chen24au.html", "pdf": "https://openreview.net/pdf?id=VOcsmIBiXE", "openreview": "https://openreview.net/forum?id=VOcsmIBiXE", "author_site": "Junfeng CHEN, Kailiang Wu", "tldr": "", "abstract": "Operator learning for Partial Differential Equations (PDEs) is rapidly emerging as a promising approach for surrogate modeling of intricate systems. Transformers with the self-attention mechanism---a powerful tool originally designed for natural language processing---have recently been adapted for operator learning. However, they confront challenges, including high computational demands and limited interpretability. This raises a critical question: *Is there a more efficient attention mechanism for Transformer-based operator learning?* This paper proposes the Position-induced Transformer (PiT), built on an innovative position-attention mechanism, which demonstrates significant advantages over the classical self-attention in operator learning. Position-attention draws inspiration from numerical methods for PDEs. Different from self-attention, position-attention is induced by only the spatial interrelations of sampling positions for input functions of the operators, and does not rely on the input function values themselves, thereby greatly boosting efficiency. PiT exhibits superior performance over current state-of-the-art neural operators in a variety of complex operator learning tasks across diverse PDE benchmarks. Additionally, PiT possesses an enhanced discretization convergence feature, compared to the widely-used Fourier neural operator.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junfeng CHEN;Kailiang Wu", "authorids": "~Junfeng_CHEN2;~Kailiang_Wu1", "gender": "Not Specified;M", "homepage": ";https://sites.google.com/site/klwuhomepage/", "dblp": ";", "google_scholar": "HK-MQ-cAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Junfeng_CHEN2;~Kailiang_Wu1", "aff": "Southern University of Science and Technology;Southern University of Science and Technology", "aff_domain": "sustech.edu.cn;sustech.edu.cn", "position": "Postdoc;Associate Professor", "bibtex": "@inproceedings{\nchen2024positional,\ntitle={Positional Knowledge is All You Need: Position-induced Transformer (PiT) for Operator Learning},\nauthor={Junfeng CHEN and Kailiang Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VOcsmIBiXE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6556405, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12675684036219877113&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sustech.edu.cn;sustech.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Southern University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.sustech.edu.cn", "aff_unique_abbr": "SUSTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Equivariant Diffusion for Crystal Structure Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33915", "id": "VRv8KjJNuj", "proceeding": "https://proceedings.mlr.press/v235/lin24b.html", "pdf": "https://openreview.net/pdf?id=VRv8KjJNuj", "openreview": 
"https://openreview.net/forum?id=VRv8KjJNuj", "author_site": "Peijia Lin, Pin Chen, Rui Jiao, Qing Mo, Jianhuan Cen, Wenbing Huang, Yang Liu, Dan Huang, Yutong Lu", "tldr": "", "abstract": "In addressing the challenge of Crystal Structure Prediction (CSP), symmetry-aware deep learning models, particularly diffusion models, have been extensively studied, which treat CSP as a conditional generation task. However, ensuring permutation, rotation, and periodic translation equivariance during diffusion process remains incompletely addressed. In this work, we propose EquiCSP, a novel equivariant diffusion-based generative model. We not only address the overlooked issue of lattice permutation equivariance in existing models, but also develop a unique noising algorithm that rigorously maintains periodic translation equivariance throughout both training and inference processes. Our experiments indicate that EquiCSP significantly surpasses existing models in terms of generating accurate structures and demonstrates faster convergence during the training process.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peijia Lin;Pin Chen;Rui Jiao;Qing Mo;Cen Jianhuan;Wenbing Huang;Yang Liu;Dan Huang;Yutong Lu", "authorids": "~Peijia_Lin1;~Pin_Chen1;~Rui_Jiao1;~Qing_Mo1;~Cen_Jianhuan1;~Wenbing_Huang1;~Yang_Liu19;~Dan_Huang3;~Yutong_Lu1", "gender": "M;M;M;;M;M;M;M;F", "homepage": "https://github.com/EmperorJia;;https://jiaor17.github.io/;https://github.com/qingmo-nscc-gz;;https://gsai.ruc.edu.cn/english/wenbing_huang;http://nlp.csai.tsinghua.edu.cn/~ly/;https://cse.sysu.edu.cn/content/5266;http://www.sysu.edu.cn", "dblp": ";78/5412;223/1073;;344/1810.html;155/3181-1.html;51/3710-5;;", "google_scholar": ";;buW16-AAAAAJ;;FF3PSA4AAAAJ;0yNkmO4AAAAJ;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ;;", "orcid": ";0000-0001-8746-9917;;;0009-0001-0515-2455;;0000-0002-3087-242X;;", "linkedin": ";;;;;;;;", "or_profile": "~Peijia_Lin1;~Pin_Chen1;~Rui_Jiao1;~Qing_Mo1;~Cen_Jianhuan1;~Wenbing_Huang1;~Yang_Liu19;~Dan_Huang3;~Yutong_Lu1", "aff": "SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY;Tsinghua University;;SUN YAT-SEN UNIVERSITY;Renmin University of China;Tsinghua University;Sun Yat-Sen University;SUN YAT-SEN UNIVERSITY", "aff_domain": "sysu.edu.cn;sysu.edu.cn;tsinghua.edu.cn;;sysu.edu.cn;ruc.edu.cn;tsinghua.edu.cn;mail.sysu.edu.cn;sysu.edu.cn", "position": "MS student;Researcher;PhD student;;PhD student;Associate Professor;Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlin2024equivariant,\ntitle={Equivariant Diffusion for Crystal Structure Prediction},\nauthor={Peijia Lin and Pin Chen and Rui Jiao and Qing Mo and Cen Jianhuan and Wenbing Huang and Yang Liu and Dan Huang and Yutong Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VRv8KjJNuj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4952588, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3528753908469374994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "sysu.edu.cn;sysu.edu.cn;tsinghua.edu.cn;;sysu.edu.cn;ruc.edu.cn;tsinghua.edu.cn;mail.sysu.edu.cn;sysu.edu.cn", "author_num": 9, "aff_unique_index": "0;0;1;0;2;1;0;0", "aff_unique_norm": "Sun Yat-sen University;Tsinghua University;Renmin University of China", "aff_unique_dep": ";;", "aff_unique_url": 
"http://www.sysu.edu.cn;https://www.tsinghua.edu.cn;http://www.ruc.edu.cn", "aff_unique_abbr": "SYSU;THU;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Latent Optimal Paths by Gumbel Propagation for Variational Bayesian Dynamic Programming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33914", "id": "VSwrXRqD9o", "proceeding": "https://proceedings.mlr.press/v235/niu24b.html", "pdf": "https://openreview.net/pdf?id=VSwrXRqD9o", "openreview": "https://openreview.net/forum?id=VSwrXRqD9o", "author_site": "Xinlei Niu, Christian Walder, Jing Zhang, Charles Martin", "tldr": "", "abstract": "We propose the stochastic optimal path which solves the classical optimal path problem by a probability-softening solution. This unified approach transforms a wide range of DP problems into directed acyclic graphs in which all paths follow a Gibbs distribution. We show the equivalence of the Gibbs distribution to a message-passing algorithm by the properties of the Gumbel distribution and give all the ingredients required for variational Bayesian inference of a latent path, namely Bayesian dynamic programming (BDP). We demonstrate the usage of BDP in the latent space of variational autoencoders (VAEs) and propose the BDP-VAE which captures structured sparse optimal paths as latent variables. This enables end-to-end training for generative tasks in which models rely on unobserved structural information. At last, we validate the behavior of our approach and showcase its applicability in two real-world applications: text-to-speech and singing voice synthesis. Our implementation code is available at https://github.com/XinleiNIU/LatentOptimalPathsBayesianDP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinlei Niu;Christian Walder;Jing Zhang;Charles Patrick Martin", "authorids": "~Xinlei_Niu1;~Christian_Walder1;~Jing_Zhang23;~Charles_Patrick_Martin1", "gender": "F;;F;M", "homepage": ";;https://jingzhang617.github.io;https://charlesmartin.au", "dblp": "327/8666;;05/3499-52;208/2413", "google_scholar": "https://scholar.google.com.au/citations?view_op=list_works;;https://scholar.google.com.au/citations?user=Qa1DMv8AAAAJ;mTlH4G8AAAAJ", "orcid": "0009-0003-6407-8309;;;0000-0001-5683-7529", "linkedin": "xinlei-niu-7ab15a219/?originalSubdomain=au;;;charles-patrick-martin/", "or_profile": "~Xinlei_Niu1;~Christian_Walder1;~Jing_Zhang23;~Charles_Patrick_Martin1", "aff": "Australian National University;;Australian National University;Australian National University", "aff_domain": "anu.edu.au;;anu.edu.au;anu.edu.au", "position": "PhD student;;Lecturer;Senior Lecturer", "bibtex": "@inproceedings{\nniu2024latent,\ntitle={Latent Optimal Paths by Gumbel Propagation for Variational Bayesian Dynamic Programming},\nauthor={Xinlei Niu and Christian Walder and Jing Zhang and Charles Patrick Martin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VSwrXRqD9o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1953770, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SF88krEx0q8J:scholar.google.com/&scioq=Latent+Optimal+Paths+by+Gumbel+Propagation+for+Variational+Bayesian+Dynamic+Programming&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "anu.edu.au;;anu.edu.au;anu.edu.au", 
"author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Australian National University", "aff_unique_dep": "", "aff_unique_url": "https://www.anu.edu.au", "aff_unique_abbr": "ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Australia" }, { "title": "Simplicity Bias via Global Convergence of Sharpness Minimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33913", "id": "VUTyzH63Xa", "proceeding": "https://proceedings.mlr.press/v235/gatmiry24a.html", "pdf": "https://openreview.net/pdf?id=VUTyzH63Xa", "openreview": "https://openreview.net/forum?id=VUTyzH63Xa", "author_site": "Khashayar Gatmiry, Zhiyuan Li, Sashank J. Reddi, Stefanie Jegelka", "tldr": "", "abstract": "The remarkable generalization ability of neural networks is usually attributed to the implicit bias of SGD, which often yields models with lower complexity using simpler (e.g. linear) and low-rank features. Recent works have provided empirical and theoretical evidence for the bias of particular variants of SGD (such as label noise SGD) toward flatter regions of the loss landscape. Despite the folklore intuition that flat solutions are 'simple', the connection with the simplicity of the final trained model (e.g. low-rank) is not well understood. In this work, we take a step toward bridging this gap by studying the simplicity structure that arises from minimizers of the sharpness for a class of two-layer neural networks. We show that, for any high dimensional training data and certain activations, with small enough step size, label noise SGD always converges to a network that replicates a single linear feature across all neurons; thereby implying a simple rank one feature matrix. To obtain this result, our main technical contribution is to show that label noise SGD always minimizes the sharpness on the manifold of models with zero loss for two-layer networks. Along the way, we discover a novel property --- a local geodesic convexity --- of the trace of Hessian of the loss at approximate stationary points on the manifold of zero loss, which links sharpness to the geometry of the manifold. This tool may be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Khashayar Gatmiry;Zhiyuan Li;Sashank J. Reddi;Stefanie Jegelka", "authorids": "~Khashayar_Gatmiry1;~Zhiyuan_Li2;~Sashank_J._Reddi1;~Stefanie_Jegelka3", "gender": "M;M;M;F", "homepage": "http://ce.sharif.edu/~kgatmiry/;https://zhiyuanli.ttic.edu;;http://people.csail.mit.edu/stefje/", "dblp": ";l/ZhiyuanLi;50/10452;38/7003", "google_scholar": ";https://scholar.google.com/citations?hl=en;70lgwYwAAAAJ;gTWUZlsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Khashayar_Gatmiry1;~Zhiyuan_Li2;~Sashank_J._Reddi1;~Stefanie_Jegelka3", "aff": "Massachusetts Institute of Technology;Toyota Technological Institute at Chicago;Google;Massachusetts Institute of Technology", "aff_domain": "mit.edu;ttic.edu;google.com;mit.edu", "position": "PhD student;Assistant Professor;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\ngatmiry2024simplicity,\ntitle={Simplicity Bias via Global Convergence of Sharpness Minimization},\nauthor={Khashayar Gatmiry and Zhiyuan Li and Sashank J. 
Reddi and Stefanie Jegelka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VUTyzH63Xa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2034590, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13197113638455643103&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mit.edu;ttic.edu;google.com;mit.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Toyota Technological Institute at Chicago;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://web.mit.edu;https://www.tti-chicago.org;https://www.google.com", "aff_unique_abbr": "MIT;TTI Chicago;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Non-parametric Online Change Point Detection on Riemannian Manifolds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33912", "id": "VW7Jk8KhNC", "proceeding": "https://proceedings.mlr.press/v235/wang24f.html", "pdf": "https://openreview.net/pdf?id=VW7Jk8KhNC", "openreview": "https://openreview.net/forum?id=VW7Jk8KhNC", "author_site": "Xiuheng Wang, Ricardo Borsoi, C\u00e9dric Richard", "tldr": "", "abstract": "Non-parametric detection of change points in streaming time series data that belong to Euclidean spaces has been extensively studied in the literature. Nevertheless, when the data belongs to a Riemannian manifold, existing approaches are no longer applicable as they fail to account for the structure and geometry of the manifold. In this paper, we introduce a non-parametric algorithm for online change point detection in manifold-valued data streams. This algorithm monitors the generalized Karcher mean of the data, computed using stochastic Riemannian optimization. We provide theoretical bounds on the detection and false alarm rate performances of the algorithm, using a new result on the non-asymptotic convergence of the stochastic Riemannian gradient descent. We apply our algorithm to two different Riemannian manifolds. 
Experimental results with both synthetic and real data illustrate the performance of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiuheng Wang;Ricardo Augusto Borsoi;C\u00e9dric Richard", "authorids": "~Xiuheng_Wang1;~Ricardo_Augusto_Borsoi1;~C\u00e9dric_Richard1", "gender": "M;;M", "homepage": "https://xiuheng-wang.github.io/;https://ricardoborsoi.github.io/;http://www.cedric-richard.fr", "dblp": "270/4673.html;194/3132;", "google_scholar": "xyfMMGIAAAAJ;FeiFDgkAAAAJ;", "orcid": "0000-0002-0115-5034;;", "linkedin": ";;", "or_profile": "~Xiuheng_Wang1;~Ricardo_Augusto_Borsoi1;~C\u00e9dric_Richard1", "aff": "Universit\u00e9 C\u00f4te d'Azur;Universit\u00e9 de Lorraine;Universit\u00e9 C\u00f4te d'Azur", "aff_domain": "unice.fr;univ-lorraine.fr;unice.fr", "position": "PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nwang2024nonparametric,\ntitle={Non-parametric Online Change Point Detection on Riemannian Manifolds},\nauthor={Xiuheng Wang and Ricardo Augusto Borsoi and C{\\'e}dric Richard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VW7Jk8KhNC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 984658, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_5TiCDsqoRoJ:scholar.google.com/&scioq=Non-parametric+Online+Change+Point+Detection+on+Riemannian+Manifolds&hl=en&as_sdt=0,5", "gs_version_total": 15, "email": "unice.fr;univ-lorraine.fr;unice.fr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 C\u00f4te d'Azur;Universit\u00e9 de Lorraine", "aff_unique_dep": ";", "aff_unique_url": "https://www.univ-cotedazur.fr;https://www.univ-lorraine.fr", "aff_unique_abbr": "UCA;UL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Discrete Latent Perspective Learning for Segmentation and Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33911", "id": "VWCpm39peL", "proceeding": "https://proceedings.mlr.press/v235/ji24e.html", "pdf": "https://openreview.net/pdf?id=VWCpm39peL", "openreview": "https://openreview.net/forum?id=VWCpm39peL", "author_site": "Deyi Ji, Feng Zhao, Lanyun Zhu, Wenwei Jin, Hongtao Lu, Jieping Ye", "tldr": "", "abstract": "In this paper, we address the challenge of Perspective-Invariant Learning in machine learning and computer vision, which involves enabling a network to understand images from varying perspectives to achieve consistent semantic interpretation. While standard approaches rely on the labor-intensive collection of multi-view images or limited data augmentation techniques, we propose a novel framework, Discrete Latent Perspective Learning (DLPL), for latent multi-perspective fusion learning using conventional single-view images. DLPL comprises three main modules: Perspective Discrete Decomposition (PDD), Perspective Homography Transformation (PHT), and Perspective Invariant Attention (PIA), which work together to discretize visual features, transform perspectives, and fuse multi-perspective semantic information, respectively. DLPL is a universal perspective learning framework applicable to a variety of scenarios and vision tasks. 
Extensive experiments demonstrate that DLPL significantly enhances the network's capacity to depict images across diverse scenarios (daily photos, UAV, auto-driving) and tasks (detection, segmentation).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deyi Ji;Feng Zhao;Lanyun Zhu;Wenwei Jin;Hongtao Lu;Jieping Ye", "authorids": "~Deyi_Ji2;~Feng_Zhao6;~Lanyun_Zhu1;~Wenwei_Jin3;~Hongtao_Lu1;~Jieping_Ye4", "gender": ";M;M;;M;M", "homepage": "https://jankyee.github.io;https://bivlab123.github.io/;https://lanyunzhu.site;;https://www.cs.sjtu.edu.cn/en/PeopleDetail.aspx?id=156;http://yelabs.net/", "dblp": "230/2118;181/2734-4;245/2640;;;03/5454", "google_scholar": "r9-7am4AAAAJ;https://scholar.google.co.uk/citations?hl=en;urOSnlQAAAAJ;;https://scholar.google.com.tw/citations?user=GtNuBJcAAAAJ;T9AzhwcAAAAJ", "orcid": "0000-0001-7561-9789;0000-0001-6767-8105;;;0000-0003-2300-3039;0000-0001-8662-5818", "linkedin": ";;;;;", "or_profile": "~Deyi_Ji2;~Feng_Zhao6;~Lanyun_Zhu1;~Wenwei_Jin3;~Hongtao_Lu1;~Jieping_Ye4", "aff": "Alibaba Group;University of Science and Technology of China;Singapore University of Technology and Design;;Shanghai Jiaotong University;Alibaba Group", "aff_domain": "alibaba-inc.com;ustc.edu.cn;sutd.edu.sg;;sjtu.edu.cn;alibaba-inc.com", "position": "Researcher;Full Professor;PhD student;;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nji2024discrete,\ntitle={Discrete Latent Perspective Learning for Segmentation and Detection},\nauthor={Deyi Ji and Feng Zhao and Lanyun Zhu and Wenwei Jin and Hongtao Lu and Jieping Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VWCpm39peL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5017394, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10303357120666995468&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "alibaba-inc.com;ustc.edu.cn;sutd.edu.sg;;sjtu.edu.cn;alibaba-inc.com", "author_num": 6, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Alibaba Group;University of Science and Technology of China;Singapore University of Technology and Design;Shanghai Jiao Tong University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.alibaba.com;http://www.ustc.edu.cn;https://www.sutd.edu.sg;https://www.sjtu.edu.cn", "aff_unique_abbr": "Alibaba;USTC;SUTD;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Codebook Features: Sparse and Discrete Interpretability for Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33910", "id": "VZ5A0LPbnc", "proceeding": "https://proceedings.mlr.press/v235/tamkin24a.html", "pdf": "https://openreview.net/pdf?id=VZ5A0LPbnc", "openreview": "https://openreview.net/forum?id=VZ5A0LPbnc", "author_site": "Alex Tamkin, Mohammad Taufeeque, Noah Goodman", "tldr": "", "abstract": "Understanding neural networks is challenging in part because of the dense, continuous nature of their hidden states. We explore whether we can train neural networks to have hidden states that are sparse, discrete, and more interpretable by quantizing their continuous features into what we call codebook features. 
Codebook features are produced by finetuning neural networks with vector quantization bottlenecks at each layer, producing a network whose hidden features are the sum of a small number of discrete vector codes chosen from a larger codebook. Surprisingly, we find that neural networks can operate under this extreme bottleneck with only modest degradation in performance. In addition, we can control a model's behavior by finding codes that activate on a desired behavior, then activating those same codes during generation. We first validate codebook features on a finite state machine dataset with far more hidden states than neurons. In this setting, our approach overcomes the superposition problem by assigning states to distinct codes, and we find that we can make the neural network behave as if it is in a different state by activating the code for that state. We then train Transformer language models with up to 410M parameters on two natural language datasets. We identify codes in these models representing diverse, disentangled concepts (ranging from negative emotions to months of the year) and find that we can guide the model to generate different topics and pronoun genders by activating these codes during inference. Overall, codebook features appear to be a promising unit of analysis and control for neural networks and interpretability. Our codebase and models are open-sourced at [this URL](https://github.com/taufeeque9/codebook-features).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alex Tamkin;Mohammad Taufeeque;Noah Goodman", "authorids": "~Alex_Tamkin1;~Mohammad_Taufeeque2;~Noah_Goodman1", "gender": ";;", "homepage": ";https://taufeeque9.github.io/;https://cocolab.stanford.edu/", "dblp": ";286/7826;96/1216", "google_scholar": ";OKO5dwcAAAAJ;OUpIbcQAAAAJ", "orcid": ";;", "linkedin": ";mtaufeeque;", "or_profile": "~Alex_Tamkin1;~Mohammad_Taufeeque2;~Noah_Goodman1", "aff": ";FAR.AI;Stanford University", "aff_domain": ";far.ai;stanford.edu", "position": ";Researcher;Full Professor", "bibtex": "@inproceedings{\ntamkin2024codebook,\ntitle={Codebook Features: Sparse and Discrete Interpretability for Neural Networks},\nauthor={Alex Tamkin and Mohammad Taufeeque and Noah Goodman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VZ5A0LPbnc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 650642, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9460788710261450578&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";far.ai;stanford.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "FAR.AI;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.far.ai;https://www.stanford.edu", "aff_unique_abbr": "FAR.AI;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "R\u00e9nyi Pufferfish Privacy: General Additive Noise Mechanisms and Privacy Amplification by Iteration via Shift Reduction Lemmas", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33909", "id": "VZsxhPpu9T", "proceeding": "https://proceedings.mlr.press/v235/pierquin24a.html", "pdf": "https://openreview.net/pdf?id=VZsxhPpu9T", "openreview": "https://openreview.net/forum?id=VZsxhPpu9T", "author_site": "Cl\u00e9ment Pierquin, 
Aur\u00e9lien Bellet, Marc Tommasi, Matthieu Boussard", "tldr": "", "abstract": "Pufferfish privacy is a flexible generalization of differential privacy that allows to model arbitrary secrets and adversary's prior knowledge about the data. Unfortunately, designing general and tractable Pufferfish mechanisms that do not compromise utility is challenging. Furthermore, this framework does not provide the composition guarantees needed for a direct use in iterative machine learning algorithms. To mitigate these issues, we introduce a R\u00e9nyi divergence-based variant of Pufferfish and show that it allows us to extend the applicability of the Pufferfish framework. We first generalize the Wasserstein mechanism to cover a wide range of noise distributions and introduce several ways to improve its utility. Finally, as an alternative to composition, we prove privacy amplification results for contractive noisy iterations and showcase the first use of Pufferfish in private convex optimization. A common ingredient underlying our results is the use and extension of shift reduction lemmas.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cl\u00e9ment Pierquin;Aur\u00e9lien Bellet;Marc Tommasi;Matthieu Boussard", "authorids": "~Cl\u00e9ment_Pierquin1;~Aur\u00e9lien_Bellet1;~Marc_Tommasi1;~Matthieu_Boussard1", "gender": "M;;M;", "homepage": "https://github.com/clmpqncraft;http://researchers.lille.inria.fr/abellet/;https://www.cristal.univ-lille.fr/en/profil/tommasi/;https://www.craft.ai", "dblp": ";61/8017;t/MarcTommasi;", "google_scholar": ";https://scholar.google.fr/citations?user=j8svx3IAAAAJ;https://scholar.google.fr/citations?user=IRyM3b8AAAAJ;", "orcid": ";0000-0003-3440-1251;;", "linkedin": ";;;", "or_profile": "~Cl\u00e9ment_Pierquin1;~Aur\u00e9lien_Bellet1;~Marc_Tommasi1;~Matthieu_Boussard1", "aff": "Craft AI;INRIA;INRIA;Craft AI", "aff_domain": "craft.ai;inria.fr;inria.fr;craft-ai.fr", "position": "Intern;Tenured researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\npierquin2024rnyi,\ntitle={R\\'enyi Pufferfish Privacy: General Additive Noise Mechanisms and Privacy Amplification by Iteration via Shift Reduction Lemmas},\nauthor={Cl{\\'e}ment Pierquin and Aur{\\'e}lien Bellet and Marc Tommasi and Matthieu Boussard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VZsxhPpu9T}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1546893, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2713001579085574496&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "craft.ai;inria.fr;inria.fr;craft-ai.fr", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Craft AI;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.craft-ai.com;https://www.inria.fr", "aff_unique_abbr": "Craft AI;INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "VoroNav: Voronoi-based Zero-shot Object Navigation with Large Language Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33908", "id": "Va7mhTVy5s", "proceeding": "https://proceedings.mlr.press/v235/wu24u.html", "pdf": "https://openreview.net/pdf?id=Va7mhTVy5s", "openreview": "https://openreview.net/forum?id=Va7mhTVy5s", "author_site": "Pengying Wu, Yao Mu, Bingxian Wu, Yi Hou, 
Ji Ma, Shanghang Zhang, Chang Liu", "tldr": "", "abstract": "In the realm of household robotics, the Zero-Shot Object Navigation (ZSON) task empowers agents to adeptly traverse unfamiliar environments and locate objects from novel categories without prior explicit training. This paper introduces VoroNav, a novel semantic exploration framework that proposes the Reduced Voronoi Graph to extract exploratory paths and planning nodes from a semantic map constructed in real time. By harnessing topological and semantic information, VoroNav designs text-based descriptions of paths and images that are readily interpretable by a large language model (LLM). In particular, our approach presents a synergy of path and farsight descriptions to represent the environmental context, enabling LLM to apply commonsense reasoning to ascertain waypoints for navigation. Extensive evaluation on HM3D and HSSD validates VoroNav surpasses existing benchmarks in both success rate and exploration efficiency (absolute improvement: +2.8% Success and +3.7% SPL on HM3D, +2.6% Success and +3.8% SPL on HSSD). Additionally introduced metrics that evaluate obstacle avoidance proficiency and perceptual efficiency further corroborate the enhancements achieved by our method in ZSON planning. Project page: https://voro-nav.github.io", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengying Wu;Yao Mu;Bingxian Wu;Yi Hou;Ji Ma;Shanghang Zhang;Chang Liu", "authorids": "~Pengying_Wu1;~Yao_Mu1;~Bingxian_Wu1;~Yi_Hou1;~Ji_Ma5;~Shanghang_Zhang4;~Chang_Liu30", "gender": "M;M;M;M;;;M", "homepage": "https://github.com/LittleFive666;https://yaomarkmu.github.io/;https://github.com/davidwu2003;https://github.com/2460555471;;;http://www2.coe.pku.edu.cn/faculty/liuchang/index.html", "dblp": ";260/0674;;;;;52/5716-2", "google_scholar": ";;;;;;hhm6ZzUAAAAJ", "orcid": "0009-0005-5818-6818;;;;;;", "linkedin": ";;;;;;changliu89/", "or_profile": "~Pengying_Wu1;~Yao_Mu1;~Bingxian_Wu1;~Yi_Hou1;~Ji_Ma5;~Shanghang_Zhang4;~Chang_Liu30", "aff": "Peking University;The University of Hong Kong;Peking University;Peking University;;;Peking University", "aff_domain": "pku.edu.cn;hku.hk;pku.edu.cn;pku.edu.cn;;;pku.edu.cn", "position": "MS student;PhD student;Undergrad student;PhD student;;;Assistant Professor", "bibtex": "@inproceedings{\nwu2024voronav,\ntitle={VoroNav: Voronoi-based Zero-shot Object Navigation with Large Language Model},\nauthor={Pengying Wu and Yao Mu and Bingxian Wu and Yi Hou and Ji Ma and Shanghang Zhang and Chang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Va7mhTVy5s}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9353697, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12527858673486198523&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;hku.hk;pku.edu.cn;pku.edu.cn;;;pku.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Peking University;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.hku.hk", "aff_unique_abbr": "Peking U;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Single-Loop Robust Policy Gradient Method for Robust Markov Decision Processes", "status": "Poster", "track": "main", 
"site": "https://icml.cc/virtual/2024/poster/33907", "id": "VaZVZQSgTP", "proceeding": "https://proceedings.mlr.press/v235/lin24u.html", "pdf": "https://openreview.net/pdf?id=VaZVZQSgTP", "openreview": "https://openreview.net/forum?id=VaZVZQSgTP", "author_site": "Zhenwei Lin, Chenyu Xue, Qi Deng, Yinyu Ye", "tldr": "", "abstract": "Robust Markov Decision Processes (RMDPs) have recently been recognized as a valuable and promising approach to discovering a policy with creditable performance, particularly in the presence of a dynamic environment and estimation errors in the transition matrix due to limited data. Despite extensive exploration of dynamic programming algorithms for solving RMDPs, there has been a notable upswing in interest in developing efficient algorithms using the policy gradient method. In this paper, we propose the first single-loop robust policy gradient (SRPG) method with the global optimality guarantee for solving RMDPs through its minimax formulation. Moreover, we complement the convergence analysis of the nonconvex-nonconcave min-max optimization problem with the objective function's gradient dominance property, which is not explored in the prior literature. Numerical experiments validate the efficacy of SRPG, demonstrating its faster and more robust convergence behavior compared to its nested-loop counterpart.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenwei Lin;Chenyu Xue;Qi Deng;Yinyu Ye", "authorids": "~Zhenwei_Lin3;~Chenyu_Xue1;~Qi_Deng1;~Yinyu_Ye1", "gender": "M;M;M;M", "homepage": "https://github.com/zhenweilin;https://sites.google.com/view/chenyuxue/home;http://sime.shufe.edu.cn/teacher/show/225;https://web.stanford.edu/~yyye/", "dblp": "309/0132;240/6330;;42/1372", "google_scholar": "Bq-FHQsAAAAJ;;https://scholar.google.com/citations?hl=en;BgOXDogAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Zhenwei_Lin3;~Chenyu_Xue1;~Qi_Deng1;~Yinyu_Ye1", "aff": "Shanghai University of Finance and Economics;National University of Singapore;Shanghai University of Finance and Economics;", "aff_domain": "shufe.edu;nus.edu;sufe.edu.cn;", "position": "PhD student;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nlin2024a,\ntitle={A Single-Loop Robust Policy Gradient Method for Robust Markov Decision Processes},\nauthor={Zhenwei Lin and Chenyu Xue and Qi Deng and Yinyu Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VaZVZQSgTP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 939177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oV0fB6_891wJ:scholar.google.com/&scioq=A+Single-Loop+Robust+Policy+Gradient+Method+for+Robust+Markov+Decision+Processes&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "shufe.edu;nus.edu;sufe.edu.cn;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Shanghai University of Finance and Economics;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "http://www.sufe.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "SUFE;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Singapore" }, { "title": "Towards Neural Architecture Search through Hierarchical Generative Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33906", "id": "VdZfEMuoj2", 
"proceeding": "https://proceedings.mlr.press/v235/xiang24a.html", "pdf": "https://openreview.net/pdf?id=VdZfEMuoj2", "openreview": "https://openreview.net/forum?id=VdZfEMuoj2", "author_site": "Lichuan Xiang, \u0141ukasz Dudziak, Mohamed Abdelfattah, Abhinav Mehrotra, Nicholas Lane, Hongkai Wen", "tldr": "", "abstract": "Neural Architecture Search (NAS) aims to automate deep neural network design across various applications, while a good search space design is core to NAS performance. A too-narrow search space may fail to cover diverse task requirements, whereas a too-broad one can escalate computational expenses and reduce efficiency. %We propose automatically generating the search space to tailor it to specific task conditions, optimizing search costs and producing viable architectures. In this work, we aim to address this challenge by leaning on the recent advances in generative modelling -- we propose a novel method that can navigate through an extremely large, general-purpose initial search space efficiently by training a two-level generative model hierarchy. The first level uses Conditional Continuous Normalizing Flow (CCNF) for micro-cell design, while the second employs a transformer-based sequence generator to craft macro architectures aligned with task needs and architectural constraints. To ensure computational feasibility, we pretrain the generative models in a task-agnostic manner using a metric space of graph and zero-cost (ZC) similarities between architectures. We show our approach can achieve state-of-the-art performance among other low-cost NAS methods across different tasks on CIFAR-10/100, ImageNet and NAS-Bench-360.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lichuan Xiang;\u0141ukasz Dudziak;Mohamed S Abdelfattah;Abhinav Mehrotra;Nicholas Donald Lane;Hongkai Wen", "authorids": "~Lichuan_Xiang1;~\u0141ukasz_Dudziak1;~Mohamed_S_Abdelfattah1;~Abhinav_Mehrotra1;~Nicholas_Donald_Lane1;~Hongkai_Wen1", "gender": "M;M;M;M;;", "homepage": ";;https://mohsaied.github.io/;https://abhinavmehrotra.github.io/;;", "dblp": "294/8850;228/7987;124/7095;154/4273;;", "google_scholar": ";R47NvpoAAAAJ;https://scholar.google.ca/citations?user=q4wBpWAAAAAJ;https://scholar.google.co.uk/citations?user=AbeyFKwAAAAJ;;", "orcid": ";;;;;", "linkedin": "lichuan-xiang-17ab43101/;;mabdelfattah/;;;", "or_profile": "~Lichuan_Xiang1;~\u0141ukasz_Dudziak1;~Mohamed_S_Abdelfattah1;~Abhinav_Mehrotra1;~Nicholas_Donald_Lane1;~Hongkai_Wen1", "aff": "The university of Warwick;Samsung;Cornell University;Samsung AI Center;;", "aff_domain": "warwick.ac.uk;samsung.com;cornell.edu;samsung.com;;", "position": "PhD student;Software Engineer;Assistant Professor;Researcher;;", "bibtex": "@inproceedings{\nxiang2024towards,\ntitle={Towards Neural Architecture Search through Hierarchical Generative Modeling},\nauthor={Lichuan Xiang and {\\L}ukasz Dudziak and Mohamed S Abdelfattah and Abhinav Mehrotra and Nicholas Donald Lane and Hongkai Wen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VdZfEMuoj2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4750326, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UZqPfeT4XtIJ:scholar.google.com/&scioq=Towards+Neural+Architecture+Search+through+Hierarchical+Generative+Modeling&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": 
"warwick.ac.uk;samsung.com;cornell.edu;samsung.com;;", "author_num": 6, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Warwick;Samsung;Cornell University", "aff_unique_dep": ";Samsung;", "aff_unique_url": "https://warwick.ac.uk;https://www.samsung.com;https://www.cornell.edu", "aff_unique_abbr": "Warwick;Samsung;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "United Kingdom;South Korea;United States" }, { "title": "Reinforcement Learning and Regret Bounds for Admission Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33905", "id": "Vdr87ZUfnl", "proceeding": "https://proceedings.mlr.press/v235/weber24a.html", "pdf": "https://openreview.net/pdf?id=Vdr87ZUfnl", "openreview": "https://openreview.net/forum?id=Vdr87ZUfnl", "author_site": "Lucas Weber, Ana Busic, Jiamin ZHU", "tldr": "", "abstract": "The expected regret of any reinforcement learning algorithm is lower bounded by $\\Omega\\left(\\sqrt{DXAT}\\right)$ for undiscounted returns, where $D$ is the diameter of the Markov decision process, $X$ the size of the state space, $A$ the size of the action space and $T$ the number of time steps. However, this lower bound is general. A smaller regret can be obtained by taking into account some specific knowledge of the problem structure. In this article, we consider an admission control problem to an $M/M/c/S$ queue with $m$ job classes and class-dependent rewards and holding costs. Queuing systems often have a diameter that is exponential in the buffer size $S$, making the previous lower bound prohibitive for any practical use. We propose an algorithm inspired by UCRL2, and use the structure of the problem to upper bound the expected total regret by $O(S\\log T + \\sqrt{mT \\log T})$ in the finite server case. 
In the infinite server case, we prove that the dependence of the regret on $S$ disappears.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lucas Weber;Ana Busic;Jiamin Zhu", "authorids": "~Lucas_Weber2;~Ana_Busic1;jiamin.zhu@ifpen.fr", "gender": "M;F;", "homepage": "https://www.theses.fr/s343783;;", "dblp": ";57/3580;", "google_scholar": ";https://scholar.google.fr/citations?user=u-RXvmAAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Lucas_Weber2;~Ana_Busic1;jiamin.zhu@ifpen.fr", "aff": "INRIA;Ecole Normale Sup\u00e9rieure;", "aff_domain": "inria.fr;di.ens.fr;", "position": "PhD student;Researcher;", "bibtex": "@inproceedings{\nweber2024reinforcement,\ntitle={Reinforcement Learning and Regret Bounds for Admission Control},\nauthor={Lucas Weber and Ana Busic and Jiamin Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Vdr87ZUfnl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 787552, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:p9F26cw-foUJ:scholar.google.com/&scioq=Reinforcement+Learning+and+Regret+Bounds+for+Admission+Control&hl=en&as_sdt=0,33", "gs_version_total": 9, "email": "inria.fr;di.ens.fr;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "INRIA;Ecole Normale Sup\u00e9rieure", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.ens.fr", "aff_unique_abbr": "INRIA;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Improved Bounds for Pure Private Agnostic Learning: Item-Level and User-Level Privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33904", "id": "VfWrXJtLSL", "proceeding": "https://proceedings.mlr.press/v235/li24bq.html", "pdf": "https://openreview.net/pdf?id=VfWrXJtLSL", "openreview": "https://openreview.net/forum?id=VfWrXJtLSL", "author_site": "Bo Li, Wei Wang, Peng Ye", "tldr": "", "abstract": "Machine Learning has made remarkable progress in a wide range of fields. In many scenarios, learning is performed on datasets involving sensitive information, in which privacy protection is essential for learning algorithms. In this work, we study pure private learning in the agnostic model -- a framework reflecting the learning process in practice. We examine the number of users required under item-level (where each user contributes one example) and user-level (where each user contributes multiple examples) privacy and derive several improved upper bounds. For item-level privacy, our algorithm achieves a near optimal bound for general concept classes. We extend this to the user-level setting, rendering a tighter upper bound than the one proved by Ghazi et al. (2023). 
Lastly, we consider the problem of learning thresholds under user-level privacy and present an algorithm with a nearly tight user complexity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bo Li;Wei Wang;Peng Ye", "authorids": "~Bo_Li33;~Wei_Wang50;~Peng_Ye5", "gender": ";M;", "homepage": ";https://www.cse.ust.hk/~weiwa/;", "dblp": ";35/7092-30;53/930-5", "google_scholar": ";https://scholar.google.ca/citations?user=FeJrzPMAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Bo_Li33;~Wei_Wang50;~Peng_Ye5", "aff": ";HKUST;Hong Kong University of Science and Technology", "aff_domain": ";cse.ust.hk;hkust.edu", "position": ";Associate Professor;PhD student", "bibtex": "@inproceedings{\nli2024improved,\ntitle={Improved Bounds for Pure Private Agnostic Learning: Item-Level and User-Level Privacy},\nauthor={Bo Li and Wei Wang and Peng Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VfWrXJtLSL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 427588, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:i5rzbFZJDPQJ:scholar.google.com/&scioq=Improved+Bounds+for+Pure+Private+Agnostic+Learning:+Item-Level+and+User-Level+Privacy&hl=en&as_sdt=0,33", "gs_version_total": 10, "email": ";cse.ust.hk;hkust.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Position: Will we run out of data? Limits of LLM scaling based on human-generated data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33903", "id": "ViZcgDQjyG", "proceeding": "https://proceedings.mlr.press/v235/villalobos24a.html", "pdf": "https://openreview.net/pdf?id=ViZcgDQjyG", "openreview": "https://openreview.net/forum?id=ViZcgDQjyG", "author_site": "Pablo Villalobos, Anson Ho, Jaime Sevilla, Tamay Besiroglu, Lennart Heim, Marius Hobbhahn", "tldr": "", "abstract": "We investigate the potential constraints on LLM scaling posed by the availability of public human-generated text data. We forecast the growing demand for training data based on current trends and estimate the total stock of public human text data. Our findings indicate that if current LLM development trends continue, models will be trained on datasets roughly equal in size to the available stock of public human text data between 2026 and 2032, or slightly earlier if models are overtrained. We explore how progress in language modeling can continue when human-generated text datasets cannot be scaled any further. 
We argue that synthetic data generation, transfer learning from data-rich domains, and data efficiency improvements might support further progress.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pablo Villalobos;Anson Ho;Jaime Sevilla;Tamay Besiroglu;Lennart Heim;Marius Hobbhahn", "authorids": "~Pablo_Villalobos1;~Anson_Ho1;jaime@epochai.org;tamay@epochai.org;~Lennart_Heim1;~Marius_Hobbhahn1", "gender": "M;M;;;M;", "homepage": ";https://ansonwhho.github.io/;;;https://heim.xyz;http://www.mariushobbhahn.com", "dblp": ";;;;;260/0039", "google_scholar": ";;;;;SJ1y8o0AAAAJ", "orcid": "0009-0003-5216-0639;;;;;", "linkedin": "pablo-villalobos-sanchez/;;;;;", "or_profile": "~Pablo_Villalobos1;~Anson_Ho1;jaime@epochai.org;tamay@epochai.org;~Lennart_Heim1;~Marius_Hobbhahn1", "aff": "Epoch;Epoch;;;Centre for the Governance of AI;Max Planck Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "epochai.org;epochai.org;;;governance.ai;tue.mpg.de", "position": "Researcher;Researcher;;;Researcher;PhD student", "bibtex": "@inproceedings{\nvillalobos2024position,\ntitle={Position: Will we run out of data? Limits of {LLM} scaling based on human-generated data},\nauthor={Pablo Villalobos and Anson Ho and Jaime Sevilla and Tamay Besiroglu and Lennart Heim and Marius Hobbhahn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ViZcgDQjyG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 638503, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12752275066945387256&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 5, "email": "epochai.org;epochai.org;;;governance.ai;tue.mpg.de", "author_num": 6, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Epoch;Centre for the Governance of AI;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";;Intelligent Systems", "aff_unique_url": ";https://www.governanceofai.org;https://www.mpi-is.mpg.de", "aff_unique_abbr": ";CGAI;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2", "aff_country_unique": ";United Kingdom;Germany" }, { "title": "Auctionformer: A Unified Deep Learning Algorithm for Solving Equilibrium Strategies in Auction Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33902", "id": "VnI9200eeL", "proceeding": "https://proceedings.mlr.press/v235/huang24c.html", "pdf": "https://openreview.net/pdf?id=VnI9200eeL", "openreview": "https://openreview.net/forum?id=VnI9200eeL", "author_site": "Kexin Huang, Ziqian Chen, xue wang, Chongming Gao, Jinyang Gao, Bolin Ding, Xiang Wang", "tldr": "", "abstract": "Auction games have been widely used in plenty of trading environments such as online advertising and real estate. The complexity of real-world scenarios, characterized by diverse auction mechanisms and bidder asymmetries, poses significant challenges in efficiently solving for equilibria. Traditional learning approaches often face limitations due to their specificity to certain settings and high resource demands. Addressing this, we introduce *Auctionformer*, an efficient transformer-based method to solve equilibria of diverse auctions in a unified framework. Leveraging the flexible tokenization schemes, Auctionformer translates varying auction games into a standard token series, making use of renowned Transformer architectures. 
Moreover, we employ Nash error as the loss term, sidestepping the need for underlying equilibrium solutions and enabling efficient training and inference. Furthermore, a few-shot framework supports adaptability to new mechanisms, reinforced by a self-supervised fine-tuning approach. Extensive experimental results affirm the superior performance of Auctionformer over contemporary methods, heralding its potential for broad real-world applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kexin Huang;Ziqian Chen;Xue Wang;Chongming Gao;Jinyang Gao;Bolin Ding;Xiang Wang", "authorids": "~Kexin_Huang6;~Ziqian_Chen1;~Xue_Wang9;~Chongming_Gao1;~Jinyang_Gao1;~Bolin_Ding3;~Xiang_Wang6", "gender": ";M;M;M;M;M;M", "homepage": ";;https://chongminggao.me;;https://bolinding.github.io/;https://github.com/xiangwang1223;https://www.linkedin.com/in/xue-wang-98739572/", "dblp": ";168/3805;211/2856.html;131/4047;46/3522.html;31/2864-10;", "google_scholar": "ELa-ADAAAAAJ;;eaGLJ-UAAAAJ;;AjYkTi8AAAAJ;https://scholar.google.com.sg/citations?user=HdhaQB0AAAAJ;", "orcid": "0009-0001-4868-0952;;0000-0002-5187-9196;;;0000-0002-6148-6329;", "linkedin": ";;;;bolin-ding-50a0119/;;", "or_profile": "~Kexin_Huang6;~Ziqian_Chen1;~Chongming_Gao1;~Jinyang_Gao1;~Bolin_Ding3;~Xiang_Wang6;~xue_wang1", "aff": "University of Science and Technology of China;Alibaba Group;University of Science and Technology of China;Alibaba Group;Alibaba Group;University of Science and Technology of China;Alibaba Group US", "aff_domain": "ustc.edu.cn;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com", "position": "MS student;Staff Engineer;Postdoc;Researcher;Senior Director;Full Professor;Researcher", "bibtex": "@inproceedings{\nhuang2024auctionformer,\ntitle={Auctionformer: A Unified Deep Learning Algorithm for Solving Equilibrium Strategies in Auction Games},\nauthor={Kexin Huang and Ziqian Chen and Xue Wang and Chongming Gao and Jinyang Gao and Bolin Ding and Xiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VnI9200eeL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1674124, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xff4oZ4ymScJ:scholar.google.com/&scioq=Auctionformer:+A+Unified+Deep+Learning+Algorithm+for+Solving+Equilibrium+Strategies+in+Auction+Games&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "ustc.edu.cn;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com", "author_num": 7, "aff_unique_index": "0;1;0;1;1;0;1", "aff_unique_norm": "University of Science and Technology of China;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "USTC;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Towards Realistic Model Selection for Semi-supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33901", "id": "VoMPNYTZud", "proceeding": "https://proceedings.mlr.press/v235/li24bv.html", "pdf": "https://openreview.net/pdf?id=VoMPNYTZud", "openreview": "https://openreview.net/forum?id=VoMPNYTZud", "author_site": "Muyang Li, Xiaobo Xia, Runze Wu, Fengming Huang, Jun Yu, Bo Han, Tongliang Liu", "tldr": 
"", "abstract": "Semi-supervised Learning (SSL) has shown remarkable success in applications with limited supervision. However, due to the scarcity of labels in the training process, SSL algorithms are known to be impaired by the lack of proper model selection, as splitting a validation set will further reduce the limited labeled data, and the size of the validation set could be too small to provide a reliable indication to the generalization error. Therefore, we seek alternatives that do not rely on validation data to probe the generalization performance of SSL models. Specifically, we find that the distinct margin distribution in SSL can be effectively utilized in conjunction with the model's spectral complexity, to provide a non-vacuous indication of the generalization error. Built upon this, we propose a novel model selection method, specifically tailored for SSL, known as **S**pectral-normalized **La**beled-margin **M**inimization (SLAM). We prove that the model selected by SLAM has upper-bounded differences w.r.t. the best model within the search space. In addition, comprehensive experiments showcase that SLAM can achieve significant improvements compared to its counterparts, verifying its efficacy from both theoretical and empirical standpoints.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Muyang Li;Xiaobo Xia;Runze Wu;Fengming Huang;Jun Yu;Bo Han;Tongliang Liu", "authorids": "~Muyang_Li3;~Xiaobo_Xia1;~Runze_Wu1;~Fengming_Huang2;~Jun_Yu3;~Bo_Han1;~Tongliang_Liu1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://xiaoboxia.github.io/;https://wu-runze.github.io/;;https://faculty.ustc.edu.cn/yujun_AI/en/index.htm;https://tongliang-liu.github.io/;https://bhanml.github.io/", "dblp": "87/10837;242/8072;;;50/5754-1.html;150/6667;241/0472-3", "google_scholar": ";jRsugY0AAAAJ;8Uxbo9AAAAAJ;;efZyqyQAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ", "orcid": ";;0000-0002-6986-5825;;0000-0002-3197-8103;;", "linkedin": "%E6%B2%90%E9%98%B3-%E6%9D%8E-5b78a5163/;;;%E5%B3%B0%E6%98%8E-%E9%BB%84-48082a10a/;;;", "or_profile": "~Muyang_Li3;~Xiaobo_Xia1;~Runze_Wu1;~Fengming_Huang2;~Jun_Yu3;~Tongliang_Liu1;~bo_han2", "aff": "University of Sydney;The University of Sydney;NetEase Corp;University of Sydney, University of Sydney;University of Science and Technology of China;Mohamed bin Zayed University of Artificial Intelligence;MBZUAI", "aff_domain": "usyd.edu.au;sydney.edu.au;netease.com;usyd.edu.au;ustc.edu.cn;mbzuai.ac.ae;mbzuai.ac.ae", "position": "PhD student;PhD student;Principal Researcher;MS student;Associate Professor;Affiliated Associate Professor;Researcher", "bibtex": "@inproceedings{\nli2024towards,\ntitle={Towards Realistic Model Selection for Semi-supervised Learning},\nauthor={Muyang Li and Xiaobo Xia and Runze Wu and Fengming Huang and Jun Yu and Bo Han and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VoMPNYTZud}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 489414, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10981661647148400505&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "usyd.edu.au;sydney.edu.au;netease.com;usyd.edu.au;ustc.edu.cn;mbzuai.ac.ae;mbzuai.ac.ae", "author_num": 7, "aff_unique_index": "0;0;1;0;2;3;3", "aff_unique_norm": "University of Sydney;NetEase Corporation;University of Science and 
Technology of China;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.163.com;http://www.ustc.edu.cn;https://mbzuai.ac.ae", "aff_unique_abbr": "USYD;NetEase;USTC;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;2;2", "aff_country_unique": "Australia;China;United Arab Emirates" }, { "title": "Random Masking Finds Winning Tickets for Parameter Efficient Fine-tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33900", "id": "VrwIrAa1Lc", "proceeding": "https://proceedings.mlr.press/v235/xu24ag.html", "pdf": "https://openreview.net/pdf?id=VrwIrAa1Lc", "openreview": "https://openreview.net/forum?id=VrwIrAa1Lc", "author_site": "Jing Xu, Jingzhao Zhang", "tldr": "", "abstract": "Fine-tuning large language models (LLM) can be costly. Parameter-efficient fine-tuning (PEFT) addresses this problem by training only a fraction of the parameters, whose success reveals the expressiveness and flexibility of pretrained models. This paper studies the limit of PEFT by further simplifying its design and reducing the number of trainable parameters beyond standard setups. To this end, we use Random Masking to fine-tune the pretrained model. Despite its simplicity, we show that Random Masking is surprisingly effective: with a larger-than-expected learning rate, Random Masking can match the performance of standard PEFT algorithms such as LoRA on various tasks, using fewer trainable parameters. We provide both empirical and theoretical explorations into the success of Random Masking. We show that masking induces a flatter loss landscape and more distant solutions, which allows for and necessitates large learning rates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jing Xu;Jingzhao Zhang", "authorids": "~Jing_Xu4;~Jingzhao_Zhang2", "gender": "M;M", "homepage": "https://jingxuthu.github.io;https://sites.google.com/view/jingzhao/home", "dblp": "07/1951-27;220/5559", "google_scholar": "jlrroGQAAAAJ;8NudxYsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jing_Xu4;~Jingzhao_Zhang2", "aff": "Tsinghua University;Tsinghua University", "aff_domain": "thu.edu.cn;mail.tsinghua.edu.cn", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxu2024random,\ntitle={Random Masking Finds Winning Tickets for Parameter Efficient Fine-tuning},\nauthor={Jing Xu and Jingzhao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VrwIrAa1Lc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6011277, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14838510620305216840&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "thu.edu.cn;mail.tsinghua.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "BAGEL: Bootstrapping Agents by Guiding Exploration with Language", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33899", "id": "VsvfSMI5bs", "proceeding": "https://proceedings.mlr.press/v235/murty24a.html", 
"pdf": "https://openreview.net/pdf?id=VsvfSMI5bs", "openreview": "https://openreview.net/forum?id=VsvfSMI5bs", "author_site": "Shikhar Murty, Christopher Manning, Peter Shaw, Mandar Joshi, Kenton Lee", "tldr": "", "abstract": "Following natural language instructions by executing actions in digital environments (e.g. web-browsers and REST APIs) is a challenging task for language model (LM) agents. Unfortunately, LM agents often fail to generalize to new environments without human demonstrations. This work presents BAGEL, a method for bootstrapping LM agents without human supervision. BAGEL converts a seed set of randomly explored trajectories to synthetic demonstrations via round-trips between two noisy LM components: an LM labeler which converts a trajectory into a synthetic instruction, and a zero-shot LM agent which maps the synthetic instruction into a refined trajectory. By performing these round-trips iteratively, BAGEL quickly converts the initial distribution of trajectories towards those that are well-described by natural language. We adapt the base LM agent at test time with in-context learning by retrieving relevant BAGEL demonstrations based on the instruction, and find improvements of over 2-13% absolute on ToolQA and MiniWob++, with up to 13x reduction in execution failures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikhar Murty;Christopher D Manning;Peter Shaw;Mandar Joshi;Kenton Lee", "authorids": "~Shikhar_Murty1;~Christopher_D_Manning1;~Peter_Shaw1;~Mandar_Joshi1;~Kenton_Lee1", "gender": "M;M;M;;M", "homepage": "https://murtyshikhar.github.io/;https://nlp.stanford.edu/~manning/;http://www.ptshaw.com;https://homes.cs.washington.edu/~mandar90;https://kentonl.com/", "dblp": "202/2040;m/ChristopherDManning;217/1471;85/1261;121/7560", "google_scholar": "https://scholar.google.ca/citations?user=ubAcojQAAAAJ;1zmDOdwAAAAJ;SmGaQicAAAAJ;;qXwJkr8AAAAJ", "orcid": ";0000-0001-6155-649X;;;", "linkedin": ";christopher-manning-011575/;;;", "or_profile": "~Shikhar_Murty1;~Christopher_D_Manning1;~Peter_Shaw1;~Mandar_Joshi1;~Kenton_Lee1", "aff": "Stanford University;Computer Science Department, Stanford University;Google DeepMind;Google DeepMind;Google Research", "aff_domain": "cs.stanford.edu;cs.stanford.edu;google.com;google.com;google.com", "position": "PhD student;Full Professor;Research Scientist;Researcher;Research Scientist", "bibtex": "@inproceedings{\nmurty2024bagel,\ntitle={{BAGEL}: Bootstrapping Agents by Guiding Exploration with Language},\nauthor={Shikhar Murty and Christopher D Manning and Peter Shaw and Mandar Joshi and Kenton Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VsvfSMI5bs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1222318, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8527081720699102849&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "cs.stanford.edu;cs.stanford.edu;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.stanford.edu;https://deepmind.com", "aff_unique_abbr": "Stanford;DeepMind", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "United States;United 
Kingdom" }, { "title": "Guidance with Spherical Gaussian Constraint for Conditional Diffusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33898", "id": "VtqyurB4Af", "proceeding": "https://proceedings.mlr.press/v235/yang24h.html", "pdf": "https://openreview.net/pdf?id=VtqyurB4Af", "openreview": "https://openreview.net/forum?id=VtqyurB4Af", "author_site": "Lingxiao Yang, Shutong Ding, Yifan Cai, Jingyi Yu, Jingya Wang, Ye Shi", "tldr": "", "abstract": "Recent advances in diffusion models attempt to handle conditional generative tasks by utilizing a differentiable loss function for guidance without the need for additional training. While these methods achieved certain success, they often compromise on sample quality and require small guidance step sizes, leading to longer sampling processes. This paper reveals that the fundamental issue lies in the manifold deviation during the sampling process when loss guidance is employed. We theoretically show the existence of manifold deviation by establishing a certain lower bound for the estimation error of the loss guidance. To mitigate this problem, we propose Diffusion with Spherical Gaussian constraint (DSG), drawing inspiration from the concentration phenomenon in high-dimensional Gaussian distributions. DSG effectively constrains the guidance step within the intermediate data manifold through optimization and enables the use of larger guidance steps. Furthermore, we present a closed-form solution for DSG denoising with the Spherical Gaussian constraint. Notably, DSG can seamlessly integrate as a plugin module within existing training-free conditional diffusion methods. Implementing DSG merely involves a few lines of additional code with almost no extra computational overhead, yet it leads to significant performance improvements. 
Comprehensive experimental results in various conditional generation tasks validate the superiority and adaptability of DSG in terms of both sample quality and time efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lingxiao Yang;Shutong Ding;Yifan Cai;Jingyi Yu;Jingya Wang;Ye Shi", "authorids": "~Lingxiao_Yang3;~Shutong_Ding1;~Yifan_Cai1;~Jingyi_Yu5;~Jingya_Wang3;~Ye_Shi1", "gender": "M;M;;M;F;M", "homepage": "https://github.com/LingxiaoYang2023;https://dingsht.tech/;https://github.com/skpycyf;;https://faculty.sist.shanghaitech.edu.cn/faculty/wangjingya/;http://faculty.sist.shanghaitech.edu.cn/faculty/shiye", "dblp": ";;;;;34/11191-1", "google_scholar": ";https://scholar.google.com.hk/citations?user=qJyqm40AAAAJ;;R9L_AfQAAAAJ;https://scholar.google.com.au/citations?user=vmvJV_IAAAAJ;gMqbZPUAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Lingxiao_Yang3;~Shutong_Ding1;~Yifan_Cai1;~Jingyi_Yu5;~Jingya_Wang3;~Ye_Shi1", "aff": "ShanghaiTech University;ShanghaiTech University;ShanghaiTech University;ShanghaiTech University;ShanghaiTech University;ShanghaiTech University", "aff_domain": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "position": "MS student;MS student;Undergrad student;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024guidance,\ntitle={Guidance with Spherical Gaussian Constraint for Conditional Diffusion},\nauthor={Lingxiao Yang and Shutong Ding and Yifan Cai and Jingyi Yu and Jingya Wang and Ye Shi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VtqyurB4Af}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9529707, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11089646191818717708&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "ShanghaiTech University", "aff_unique_dep": "", "aff_unique_url": "https://www.shanghaitech.edu.cn", "aff_unique_abbr": "ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Automating the Selection of Proxy Variables of Unmeasured Confounders", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33897", "id": "VuoB86HiCL", "proceeding": "https://proceedings.mlr.press/v235/xie24b.html", "pdf": "https://openreview.net/pdf?id=VuoB86HiCL", "openreview": "https://openreview.net/forum?id=VuoB86HiCL", "author_site": "Feng Xie, Zhengming Chen, Shanshan Luo, Wang Miao, Ruichu Cai, zhi geng", "tldr": "", "abstract": "Recently, interest has grown in the use of proxy variables of unobserved confounding for inferring the causal effect in the presence of unmeasured confounders from observational data. One difficulty inhibiting the practical use is finding valid proxy variables of unobserved confounding to a target causal effect of interest. These proxy variables are typically justified by background knowledge. 
In this paper, we investigate the estimation of causal effects among multiple treatments and a single outcome, all of which are affected by unmeasured confounders, within a linear causal model, without prior knowledge of the validity of proxy variables. To be more specific, we first extend the existing proxy variable estimator, originally addressing a single unmeasured confounder, to accommodate scenarios where multiple unmeasured confounders exist between the treatments and the outcome. Subsequently, we present two different sets of precise identifiability conditions for selecting valid proxy variables of unmeasured confounders, based on the second-order statistics and higher-order statistics of the data, respectively. Moreover, we propose two data-driven methods for the selection of proxy variables and for the unbiased estimation of causal effects. Theoretical analysis demonstrates the correctness of our proposed algorithms. Experimental results on both synthetic and real-world data show the effectiveness of the proposed approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feng Xie;Zhengming Chen;Shanshan Luo;Wang Miao;Ruichu Cai;Zhi Geng", "authorids": "~Feng_Xie1;~Zhengming_Chen2;~Shanshan_Luo2;~Wang_Miao1;~Ruichu_Cai1;~Zhi_Geng1", "gender": "M;;;Not Specified;M;M", "homepage": "https://fengxie.site/;;;https://www.math.pku.edu.cn/teachers/mwfy;https://ruichucai.github.io/;https://stxy.btbu.edu.cn/szdw/bssds/34339356074b408c8650309f05f24558.htm", "dblp": "11/4605-2;;;;09/6889;", "google_scholar": "stLFCtQAAAAJ;;;;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0001-7229-3955;;;;;", "linkedin": ";;;;;", "or_profile": "~Feng_Xie1;~Zhengming_Chen2;~Shanshan_Luo2;~Wang_Miao1;~Ruichu_Cai1;~Zhi_Geng1", "aff": "Beijing Technology and Business University;;;Peking University;Guangdong University of Technology;School of mathematical Science, Peking University, Peking University", "aff_domain": "btbu.edu.cn;;;pku.edu.cn;gdut.edu.cn;math.pku.edu.cn", "position": "Associate Professor;;;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nxie2024automating,\ntitle={Automating the Selection of Proxy Variables of Unmeasured Confounders},\nauthor={Feng Xie and Zhengming Chen and Shanshan Luo and Wang Miao and Ruichu Cai and Zhi Geng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VuoB86HiCL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 600485, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8192583960025255973&as_sdt=8000005&sciodt=0,19&hl=en", "gs_version_total": 8, "email": "btbu.edu.cn;;;pku.edu.cn;gdut.edu.cn;math.pku.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Beijing Technology and Business University;Peking University;Guangdong University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "http://www.btbu.edu.cn;http://www.pku.edu.cn;http://www.gdut.edu.cn", "aff_unique_abbr": "BTBU;Peking U;GDUT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Peking", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Self-Consistency Training for Density-Functional-Theory Hamiltonian Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33896", "id": "Vw4Yar2fmW", "proceeding": 
"https://proceedings.mlr.press/v235/zhang24ak.html", "pdf": "https://openreview.net/pdf?id=Vw4Yar2fmW", "openreview": "https://openreview.net/forum?id=Vw4Yar2fmW", "author_site": "He Zhang, Chang Liu, wang, Xinran Wei, Siyuan Liu, Nanning Zheng, Bin Shao, Tie-Yan Liu", "tldr": "", "abstract": "Predicting the mean-field Hamiltonian matrix in density functional theory is a fundamental formulation to leverage machine learning for solving molecular science problems. Yet, its applicability is limited by insufficient labeled data for training. In this work, we highlight that Hamiltonian prediction possesses a self-consistency principle, based on which we propose self-consistency training, an exact training method that does not require labeled data. It distinguishes the task from predicting other molecular properties by the following benefits: (1) it enables the model to be trained on a large amount of unlabeled data, hence addresses the data scarcity challenge and enhances generalization; (2) it is more efficient than running DFT to generate labels for supervised training, since it amortizes DFT calculation over a set of queries. We empirically demonstrate the better generalization in data-scarce and out-of-distribution scenarios, and the better efficiency over DFT labeling. These benefits push forward the applicability of Hamiltonian prediction to an ever-larger scale.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "He Zhang;Chang Liu;Zun Wang;Xinran Wei;Siyuan Liu;Nanning Zheng;Bin Shao;Tie-Yan Liu", "authorids": "~He_Zhang1;~Chang_Liu10;~Zun_Wang2;~Xinran_Wei1;~Siyuan_Liu3;~Nanning_Zheng1;~Bin_Shao1;~Tie-Yan_Liu1", "gender": "M;M;M;F;M;M;;M", "homepage": ";https://changliu00.github.io/;;;;;https://www.binshao.info/;http://member.acm.org/~tieyanliu", "dblp": "24/2058;52/5716-30;44/8410;;;07/256-1;;l/TieYanLiu", "google_scholar": "https://scholar.google.com/citations?hl=en;rYd0GEsAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com/citations?hl=zh-CN;h9L4CgIAAAAJ;Nh832fgAAAAJ", "orcid": "0000-0003-4294-5697;0000-0001-5207-5440;0000-0002-8763-8327;;0000-0002-1318-6540;;;0000-0002-0476-8020", "linkedin": "%E8%B4%BA-%E5%BC%A0-8a592a16b/;chang-liu-9ab479168/;;;;;;", "or_profile": "~He_Zhang1;~Chang_Liu10;~Zun_Wang2;~Xinran_Wei1;~Siyuan_Liu3;~Nanning_Zheng1;~Bin_Shao1;~Tie-Yan_Liu1", "aff": "Xi'an Jiaotong University;Microsoft;Microsoft;Microsoft;DP Technology;Xi'an Jiaotong University;Microsoft;Microsoft", "aff_domain": "xjtu.edu;microsoft.com;microsoft.com;microsoft.com;dp.tech;xjtu.edu.cn;microsoft.com;microsoft.com", "position": "PhD student;Researcher;Researcher;Researcher;Researcher;Full Professor;Principal Research Manager;Distinguished Scientist", "bibtex": "@inproceedings{\nzhang2024selfconsistency,\ntitle={Self-Consistency Training for Density-Functional-Theory Hamiltonian Prediction},\nauthor={He Zhang and Chang Liu and Zun Wang and Xinran Wei and Siyuan Liu and Nanning Zheng and Bin Shao and Tie-Yan Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Vw4Yar2fmW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1055556, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17285964557150499544&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": 
"xjtu.edu;microsoft.com;microsoft.com;microsoft.com;dp.tech;xjtu.edu.cn;microsoft.com;microsoft.com", "author_num": 8, "aff_unique_index": "0;1;1;1;2;0;1;1", "aff_unique_norm": "Xi'an Jiao Tong University;Microsoft;DP Technology", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.microsoft.com;", "aff_unique_abbr": "XJTU;Microsoft;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;1;1", "aff_country_unique": "China;United States;" }, { "title": "Symmetric Matrix Completion with ReLU Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33895", "id": "VxI0gInNlh", "proceeding": "https://proceedings.mlr.press/v235/liu24bj.html", "pdf": "https://openreview.net/pdf?id=VxI0gInNlh", "openreview": "https://openreview.net/forum?id=VxI0gInNlh", "author_site": "Huikang Liu, Peng Wang, Longxiu Huang, Qing Qu, Laura Balzano", "tldr": "", "abstract": "We study the problem of symmetric positive semi-definite low-rank matrix completion (MC) with deterministic entry-dependent sampling. In particular, we consider rectified linear unit (ReLU) sampling, where only positive entries are observed, as well as a generalization to threshold-based sampling. We first empirically demonstrate that the landscape of this MC problem is not globally benign: Gradient descent (GD) with random initialization will generally converge to stationary points that are not globally optimal. Nevertheless, we prove that when the matrix factor with a small rank satisfies mild assumptions, the nonconvex objective function is geodesically strongly convex on the quotient manifold in a neighborhood of a planted low-rank matrix. Moreover, we show that our assumptions are satisfied by a matrix factor with i.i.d. Gaussian entries. Finally, we develop a tailor-designed initialization for GD to solve our studied formulation, which empirically always achieves convergence to the global minima. 
We also conduct extensive experiments and compare MC methods, investigating convergence and completion performance with respect to initialization, noise level, dimension, and rank.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huikang Liu;Peng Wang;Longxiu Huang;Qing Qu;Laura Balzano", "authorids": "~Huikang_Liu2;~Peng_Wang23;~Longxiu_Huang1;~Qing_Qu2;~Laura_Balzano1", "gender": "M;M;F;M;F", "homepage": "https://huikang2019.github.io;https://peng8wang.github.io/;http://longxiuhuang.com/;https://qingqu.engin.umich.edu/;http://web.eecs.umich.edu/~girasole/", "dblp": "62/8489;95/4442-98;;127/6874-1;25/6625", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-TW;baF3HKUAAAAJ;-QRD5VYAAAAJ;JfblW3MAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-6799-0745;0000-0002-6610-9653;0000-0001-9136-558X;0000-0003-2914-123X", "linkedin": ";;;qing-q-1a0b9746/;", "or_profile": "~Huikang_Liu2;~Peng_Wang23;~Longxiu_Huang1;~Qing_Qu2;~Laura_Balzano1", "aff": "Shanghai University of Finance and Economics;University of Michigan - Ann Arbor;Michigan State University;University of Michigan;University of Michigan - Ann Arbor", "aff_domain": "sufe.edu;umich.edu;msu.edu;umich.edu;umich.edu", "position": "Assistant Professor;Postdoc;Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024symmetric,\ntitle={Symmetric Matrix Completion with Re{LU} Sampling},\nauthor={Huikang Liu and Peng Wang and Longxiu Huang and Qing Qu and Laura Balzano},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VxI0gInNlh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 926678, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3129667710956575228&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sufe.edu;umich.edu;msu.edu;umich.edu;umich.edu", "author_num": 5, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Shanghai University of Finance and Economics;University of Michigan;Michigan State University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sufe.edu.cn;https://www.umich.edu;https://www.msu.edu", "aff_unique_abbr": "SUFE;UM;MSU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Prompting4Debugging: Red-Teaming Text-to-Image Diffusion Models by Finding Problematic Prompts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33894", "id": "VyGo1S5A6d", "proceeding": "https://proceedings.mlr.press/v235/chin24a.html", "pdf": "https://openreview.net/pdf?id=VyGo1S5A6d", "openreview": "https://openreview.net/forum?id=VyGo1S5A6d", "author_site": "Zhi-Yi Chin, Chieh Ming Jiang, Ching-Chun Huang, Pin-Yu Chen, Wei-Chen Chiu", "tldr": "", "abstract": "Text-to-image diffusion models, e.g. Stable Diffusion (SD), have lately shown remarkable ability in high-quality content generation, and have become one of the representatives of the recent wave of transformative AI. Nevertheless, such advances come with an intensifying concern about the misuse of this generative technology, especially for producing copyrighted or NSFW (i.e. not safe for work) images. 
Although efforts have been made to filter inappropriate images/prompts or remove undesirable concepts/styles via model fine-tuning, the reliability of these safety mechanisms against diverse problematic prompts remains largely unexplored. In this work, we propose **Prompting4Debugging (P4D)** as a debugging and red-teaming tool that automatically finds problematic prompts for diffusion models to test the reliability of a deployed safety mechanism. We demonstrate the efficacy of our P4D tool in uncovering new vulnerabilities of SD models with safety mechanisms. Particularly, our results show that around half of the prompts in existing safe prompting benchmarks that were originally considered \"safe\" can actually be manipulated to bypass many deployed safety mechanisms, including concept removal, negative prompt, and safety guidance. Our findings suggest that, without comprehensive testing, the evaluations on limited safe prompting benchmarks can lead to a false sense of safety for text-to-image models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhi-Yi Chin;Chieh Ming Jiang;Ching-Chun Huang;Pin-Yu Chen;Wei-Chen Chiu", "authorids": "~Zhi-Yi_Chin1;~Chieh_Ming_Jiang1;~Ching-Chun_Huang1;~Pin-Yu_Chen1;~Wei-Chen_Chiu3", "gender": "F;M;M;M;M", "homepage": "https://joycenerd.github.io/;;http://acm.cs.nctu.edu.tw/;http://www.pinyuchen.com;https://walonchiu.github.io/", "dblp": "300/7905;;;39/8969;148/9413", "google_scholar": "7VWH7r0AAAAJ;;xTdexhsAAAAJ;jxwlCUUAAAAJ;FiFOBS8AAAAJ", "orcid": ";;0000-0002-4382-5083;0000-0003-1039-8369;0000-0001-7715-8306", "linkedin": "zhi-yi-chin-b7927645/;%E5%82%91%E5%90%8D-%E8%94%A3-4166891a5?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BtGXLr1n4TMeTFJU9LTI0Zg%3D%3D;;pin-yu-chen-940062a2;", "or_profile": "~Zhi-Yi_Chin1;~Chieh_Ming_Jiang1;~Ching-Chun_Huang1;~Pin-Yu_Chen1;~Wei-chen_Chiu2", "aff": "National Yang Ming Chiao Tung University;;National Yang Ming Chiao Tung University;International Business Machines;National Chiao Tung University", "aff_domain": "cs.nycu.edu.tw;;nycu.edu.tw;ibm.com;nctu.edu.tw", "position": "Researcher;;Full Professor;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nchin2024promptingdebugging,\ntitle={Prompting4Debugging: Red-Teaming Text-to-Image Diffusion Models by Finding Problematic Prompts},\nauthor={Zhi-Yi Chin and Chieh Ming Jiang and Ching-Chun Huang and Pin-Yu Chen and Wei-Chen Chiu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VyGo1S5A6d}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7993393, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17234300437580455860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cs.nycu.edu.tw;;nycu.edu.tw;ibm.com;nctu.edu.tw", "author_num": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "National Yang Ming Chiao Tung University;International Business Machines Corporation;National Chiao Tung University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nycu.edu.tw;https://www.ibm.com;https://www.nctu.edu.tw", "aff_unique_abbr": "NYCU;IBM;NCTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Graph Neural Networks with a Distribution of Parametrized Graphs", "status": "Poster", 
"track": "main", "site": "https://icml.cc/virtual/2024/poster/33893", "id": "VyfEv6EjKR", "proceeding": "https://proceedings.mlr.press/v235/lee24k.html", "pdf": "https://openreview.net/pdf?id=VyfEv6EjKR", "openreview": "https://openreview.net/forum?id=VyfEv6EjKR", "author_site": "See Hian Lee, Feng Ji, Kelin Xia, Wee Peng Tay", "tldr": "", "abstract": "Traditionally, graph neural networks have been trained using a single observed graph. However, the observed graph represents only one possible realization. In many applications, the graph may encounter uncertainties, such as having erroneous or missing edges, as well as edge weights that provide little informative value. To address these challenges and capture additional information previously absent in the observed graph, we introduce latent variables to parameterize and generate multiple graphs. The parameters follow an unknown distribution to be estimated. We propose a formulation in terms of maximum likelihood estimation of the network parameters. Therefore, it is possible to devise an algorithm based on Expectation-Maximization (EM). Specifically, we iteratively determine the distribution of the graphs using a Markov Chain Monte Carlo (MCMC) method, incorporating the principles of PAC-Bayesian theory. Numerical experiments demonstrate improvements in performance against baseline models on node classification for both heterogeneous and homogeneous graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "See Hian Lee;Feng Ji;KELIN XIA;Wee Peng Tay", "authorids": "~See_Hian_Lee1;~Feng_Ji2;~KELIN_XIA1;~Wee_Peng_Tay1", "gender": "F;M;M;", "homepage": ";;https://personal.ntu.edu.sg/XIAKELIN/;https://personal.ntu.edu.sg/wptay/", "dblp": ";;67/9752;45/3753", "google_scholar": ";EA0VBD8AAAAJ;jZ0sEWYAAAAJ;BkCI7rEAAAAJ", "orcid": "0000-0002-1154-6354;0000-0003-3442-1471;0000-0003-4183-0943;0000-0002-1543-195X", "linkedin": "leeseehian/;;;", "or_profile": "~See_Hian_Lee1;~Feng_Ji2;~KELIN_XIA1;~Wee_Peng_Tay1", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "PhD student;Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nlee2024graph,\ntitle={Graph Neural Networks with a Distribution of Parametrized Graphs},\nauthor={See Hian Lee and Feng Ji and KELIN XIA and Wee Peng Tay},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VyfEv6EjKR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 860761, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10497875753385710133&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "In-Context Freeze-Thaw Bayesian Optimization for Hyperparameter Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33892", "id": "VyoY3Wh9Wd", "proceeding": 
"https://proceedings.mlr.press/v235/rakotoarison24a.html", "pdf": "https://openreview.net/pdf?id=VyoY3Wh9Wd", "openreview": "https://openreview.net/forum?id=VyoY3Wh9Wd", "author_site": "Herilalaina Rakotoarison, Steven Adriaensen, Neeratyoy Mallik, Samir Garibov, Edward Bergman, Frank Hutter", "tldr": "", "abstract": "With the increasing computational costs associated with deep learning, automated hyperparameter optimization methods, strongly relying on black-box Bayesian optimization (BO), face limitations. Freeze-thaw BO offers a promising grey-box alternative, strategically allocating scarce resources incrementally to different configurations. However, the frequent surrogate model updates inherent to this approach pose challenges for existing methods, requiring retraining or fine-tuning their neural network surrogates online, introducing overhead, instability, and hyper-hyperparameters. In this work, we propose FT-PFN, a novel surrogate for Freeze-thaw style BO. FT-PFN is a prior-data fitted network (PFN) that leverages the transformers' in-context learning ability to efficiently and reliably do Bayesian learning curve extrapolation in a single forward pass. Our empirical analysis across three benchmark suites shows that the predictions made by FT-PFN are more accurate and 10-100 times faster than those of the deep Gaussian process and deep ensemble surrogates used in previous work. Furthermore, we show that, when combined with our novel acquisition mechanism (MFPI-random), the resulting in-context freeze-thaw BO method (ifBO), yields new state-of-the-art performance in the same three families of deep learning HPO benchmarks considered in prior work.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Herilalaina Rakotoarison;Steven Adriaensen;Neeratyoy Mallik;Samir Garibov;Eddie Bergman;Frank Hutter", "authorids": "~Herilalaina_Rakotoarison1;~Steven_Adriaensen1;~Neeratyoy_Mallik1;~Samir_Garibov1;~Eddie_Bergman1;~Frank_Hutter1", "gender": "M;M;M;M;M;M", "homepage": "https://scholar.google.fr/citations?user=pyws4AQAAAAJ&hl=en;;https://ml.informatik.uni-freiburg.de/profile/mallik/;;https://github.com/eddiebergman;http://ml.informatik.uni-freiburg.de/~hutter/", "dblp": "242/7961;148/1033;178/9789;;;89/5383", "google_scholar": "https://scholar.google.fr/citations?user=pyws4AQAAAAJ;;https://scholar.google.de/citations?user=SGtKoyMAAAAJ;;;https://scholar.google.de/citations?user=YUrxwrkAAAAJ", "orcid": ";;;;;0000-0002-2037-3694", "linkedin": ";;neeratyoy/;samir-garibov-307558173;eddie-bergman-356736153/;frank-hutter-9190b24b/", "or_profile": "~Herilalaina_Rakotoarison1;~Steven_Adriaensen1;~Neeratyoy_Mallik1;~Samir_Garibov1;~Eddie_Bergman1;~Frank_Hutter1", "aff": "University of Freiburg;Universit\u00e4t Freiburg;University of Freiburg;;Albert-Ludwigs-Universit\u00e4t Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_domain": "cs.uni-freiburg.de;uni-freiburg.de;uni-freiburg.de;;uni-freiburg.de;uni-freiburg.de", "position": "Postdoc;Postdoc;PhD student;;Researcher;Full Professor", "bibtex": "@inproceedings{\nrakotoarison2024incontext,\ntitle={In-Context Freeze-Thaw Bayesian Optimization for Hyperparameter Optimization},\nauthor={Herilalaina Rakotoarison and Steven Adriaensen and Neeratyoy Mallik and Samir Garibov and Eddie Bergman and Frank Hutter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VyoY3Wh9Wd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6308491, "rating": 
"", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6556949515626508156&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "cs.uni-freiburg.de;uni-freiburg.de;uni-freiburg.de;;uni-freiburg.de;uni-freiburg.de", "author_num": 6, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "University of Freiburg;Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-freiburg.de;https://www.uni-freiburg.de", "aff_unique_abbr": "UoF;Albert-Ludwigs-Universit\u00e4t", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Freiburg", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Generalization Analysis for Multi-Label Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33891", "id": "W4mLp5KuKl", "proceeding": "https://proceedings.mlr.press/v235/zhang24by.html", "pdf": "https://openreview.net/pdf?id=W4mLp5KuKl", "openreview": "https://openreview.net/forum?id=W4mLp5KuKl", "author_site": "Yi-Fan Zhang, Min-Ling Zhang", "tldr": "", "abstract": "Despite great advances in algorithms for multi-label learning, research on the theoretical analysis of generalization is still in the early stage. Some recent theoretical results has investigated the generalization performance of multi-label learning under several evaluation metrics, however, how to reduce the dependency on the number of labels, explicitly introduce label correlations, and quantitatively analyze the impact of various inductive biases in the generalization analysis of multi-label learning is still a crucial and open problem. In an attempt to make up for the gap in the generalization theory of multi-label learning, we develop several novel vector-contraction inequalities, which exploit the Lipschitz continuity of loss functions, and derive generalization bounds with a weaker dependency on the number of labels than the state of the art in the case of decoupling the relationship among different components, which serves as theoretical guarantees for the generalization of multi-label learning. In addition, we derive the generalization bound for Macro-Averaged AUC and analyze its relationship with class-imbalance. 
The mild bounds without strong assumptions explain the good generalization ability of multi-label learning with first-order label correlations and high-order label correlations induced by norm regularizers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Zhang;Min-Ling Zhang", "authorids": "~Yifan_Zhang13;~Min-Ling_Zhang2", "gender": "M;M", "homepage": "http://palm.seu.edu.cn/homepage/zhangyifan/demo/demo/index.html;http://palm.seu.edu.cn/zhangml/", "dblp": "https://dblp.org/rec/conf/ictai/ZhangL20;84/271.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;uFHCIM0AAAAJ", "orcid": ";0000-0003-1880-5918", "linkedin": ";", "or_profile": "~Yifan_Zhang13;~Min-Ling_Zhang2", "aff": "Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2024generalization,\ntitle={Generalization Analysis for Multi-Label Learning},\nauthor={Yifan Zhang and Min-Ling Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=W4mLp5KuKl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 409341, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2126230951133852657&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5, "email": "seu.edu.cn;seu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "FlowMM: Generating Materials with Riemannian Flow Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33890", "id": "W4pB7VbzZI", "proceeding": "https://proceedings.mlr.press/v235/miller24a.html", "pdf": "https://openreview.net/pdf?id=W4pB7VbzZI", "openreview": "https://openreview.net/forum?id=W4pB7VbzZI", "author_site": "Benjamin Kurt Miller, Ricky T. Q. Chen, Anuroop Sriram, Brandon Wood", "tldr": "", "abstract": "Crystalline materials are a fundamental component in next-generation technologies, yet modeling their distribution presents unique computational challenges. Of the plausible arrangements of atoms in a periodic lattice only a vanishingly small percentage are thermodynamically stable, which is a key indicator of the materials that can be experimentally realized. Two fundamental tasks in this area are to (a) predict the stable crystal structure of a known composition of elements and (b) propose novel compositions along with their stable structures. We present FlowMM, a pair of generative models that achieve state-of-the-art performance on both tasks while being more efficient and more flexible than competing methods. We extend Riemannian Flow Matching to suit the symmetries inherent to crystals: translation, rotation, permutation, and periodic boundary conditions. Our framework enables the freedom to choose the flow base distributions, drastically simplifying the problem of learning crystal structures compared with diffusion models. 
In addition to standard benchmarks, we validate FlowMM's generated structures with quantum chemistry calculations, demonstrating that it is $\\sim$3x more efficient, in terms of integration steps, at finding stable materials compared to previous open methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin Kurt Miller;Ricky T. Q. Chen;Anuroop Sriram;Brandon M Wood", "authorids": "~Benjamin_Kurt_Miller1;~Ricky_T._Q._Chen1;~Anuroop_Sriram1;~Brandon_M_Wood1", "gender": "M;M;M;M", "homepage": "http://www.mathben.com/;https://anuroopsriram.com;https://www.bmwood.org;http://www.rtqichen.com", "dblp": "269/9572;200/7951;276/7546;228/6698", "google_scholar": "IrCdg_wAAAAJ;D4uRc_UAAAAJ;KbqboRgAAAAJ;7MxQd6UAAAAJ", "orcid": "0000-0003-0387-8727;;0000-0002-7251-337X;", "linkedin": "benjamin-k-miller/;anuroopsriram/;;", "or_profile": "~Benjamin_Kurt_Miller1;~Anuroop_Sriram1;~Brandon_M_Wood1;~Tian_Qi_Chen2", "aff": "University of Amsterdam;Meta Facebook;FAIR at Meta;FAIR Labs, Meta AI", "aff_domain": "uva.nl;meta.com;meta.com;meta.com", "position": "PhD student;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nmiller2024flowmm,\ntitle={Flow{MM}: Generating Materials with Riemannian Flow Matching},\nauthor={Benjamin Kurt Miller and Ricky T. Q. Chen and Anuroop Sriram and Brandon M Wood},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=W4pB7VbzZI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 618973, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3337542899660719700&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "uva.nl;meta.com;meta.com;meta.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Amsterdam;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.uva.nl;https://meta.com", "aff_unique_abbr": "UvA;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Netherlands;United States" }, { "title": "Position: Quo Vadis, Unsupervised Time Series Anomaly Detection?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33889", "id": "W7Vqx1Jvc2", "proceeding": "https://proceedings.mlr.press/v235/sarfraz24a.html", "pdf": "https://openreview.net/pdf?id=W7Vqx1Jvc2", "openreview": "https://openreview.net/forum?id=W7Vqx1Jvc2", "author_site": "M. Saquib Sarfraz, Mei-Yen Chen, Lukas Layer, Kunyu Peng, Marios Koulakis", "tldr": "", "abstract": "The current state of machine learning scholarship in Timeseries Anomaly Detection (TAD) is plagued by the persistent use of flawed evaluation metrics, inconsistent benchmarking practices, and a lack of proper justification for the choices made in novel deep learning-based model designs. Our paper presents a critical analysis of the status quo in TAD, revealing the misleading track of current research and highlighting problematic methods and evaluation practices. ***Our position advocates for a shift in focus from solely pursuing novel model designs to improving benchmarking practices, creating non-trivial datasets, and critically evaluating the utility of complex methods against simpler baselines***. 
Our findings demonstrate the need for rigorous evaluation protocols, the creation of simple baselines, and the revelation that state-of-the-art deep anomaly detection models effectively learn linear mappings. These findings suggest the need for more exploration and development of simple and interpretable TAD methods. The increase in model complexity of state-of-the-art deep-learning-based models unfortunately offers very little improvement. We offer insights and suggestions for the field to move forward.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "M. Saquib Sarfraz;Mei-Yen Chen;Lukas Layer;Kunyu Peng;Marios Koulakis", "authorids": "~M._Saquib_Sarfraz1;~Mei-Yen_Chen1;~Lukas_Layer1;~Kunyu_Peng1;~Marios_Koulakis1", "gender": "M;;M;F;M", "homepage": "https://ssarfraz.github.io/;;;;", "dblp": "12/1561;;;292/4197;249/6142", "google_scholar": "https://scholar.google.de/citations?user=4YLsmYIAAAAJ;;;pA9c0YsAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-1271-0005;;0000-0001-9416-9605;0000-0002-5419-9292;", "linkedin": "saquib-sarfraz-6395783a/;mei-yen-chen-22937787;;;marioskoulakis", "or_profile": "~M._Saquib_Sarfraz1;~Mei-Yen_Chen1;~Lukas_Layer1;~Kunyu_Peng1;~Marios_Koulakis1", "aff": "Karlsruher Institut f\u00fcr Technologie;Mercedes-Benz Tech Innovation GmbH;;Karlsruher Institut f\u00fcr Technologie;DeepHealth", "aff_domain": "kit.edu;mercedes-benz.com;;kit.edu;deephealth.com", "position": "Lecturer;Researcher;;Postdoc;Machine Learning Scientist", "bibtex": "@inproceedings{\nsarfraz2024position,\ntitle={Position: Quo Vadis, Unsupervised Time Series Anomaly Detection?},\nauthor={M. Saquib Sarfraz and Mei-Yen Chen and Lukas Layer and Kunyu Peng and Marios Koulakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=W7Vqx1Jvc2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2061669, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6310874184156449010&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "kit.edu;mercedes-benz.com;;kit.edu;deephealth.com", "author_num": 5, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie;Mercedes-Benz Tech Innovation GmbH;DeepHealth", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kit.edu;https://www.mercedes-benz.com;", "aff_unique_abbr": "KIT;MB Tech;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany;" }, { "title": "Noise-Adaptive Confidence Sets for Linear Bandits and Application to Bayesian Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33888", "id": "W8hBNk1FhQ", "proceeding": "https://proceedings.mlr.press/v235/jun24a.html", "pdf": "https://openreview.net/pdf?id=W8hBNk1FhQ", "openreview": "https://openreview.net/forum?id=W8hBNk1FhQ", "author_site": "Kwang-Sung Jun, Jungtaek Kim", "tldr": "", "abstract": "Adapting to a priori unknown noise level is a very important but challenging problem in sequential decision-making as efficient exploration typically requires knowledge of the noise level, which is often loosely specified. We report significant progress in addressing this issue in linear bandits in two respects. 
First, we propose a novel confidence set that is 'semi-adaptive' to the unknown sub-Gaussian parameter $\\sigma_*^2$ in the sense that the (normalized) confidence width scales with $\\sqrt{d\\sigma_*^2 + \\sigma_0^2}$ where $d$ is the dimension and $\\sigma_0^2$ is the specified sub-Gaussian parameter (known) that can be much larger than $\\sigma_*^2$. This is a significant improvement over $\\sqrt{d\\sigma_0^2}$ of the standard confidence set of Abbasi-Yadkori et al. (2011), especially when $d$ is large. We show that this leads to an improved regret bound in linear bandits. Second, for bounded rewards, we propose a novel variance-adaptive confidence set that has much improved numerical performance over prior art. We then apply this confidence set to develop, as we claim, the first practical variance-adaptive linear bandit algorithm via an optimistic approach, which is enabled by our novel regret analysis technique. Both of our confidence sets rely critically on 'regret equality' from online learning. Our empirical evaluation in Bayesian optimization tasks shows that our algorithms demonstrate better or comparable performance compared to existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kwang-Sung Jun;Jungtaek Kim", "authorids": "~Kwang-Sung_Jun1;~Jungtaek_Kim1", "gender": "M;M", "homepage": "http://kwangsungjun.github.io;https://jungtaekkim.github.io", "dblp": "88/8411;31/3193-1", "google_scholar": "VgvC7o8AAAAJ;KXNUYWgAAAAJ", "orcid": ";0000-0002-1905-1399", "linkedin": ";jungtaekkim", "or_profile": "~Kwang-Sung_Jun1;~Jungtaek_Kim1", "aff": "University of Arizona;University of Pittsburgh", "aff_domain": "cs.arizona.edu;pitt.edu", "position": "Assistant Professor;Postdoc", "bibtex": "@inproceedings{\njun2024noiseadaptive,\ntitle={Noise-Adaptive Confidence Sets for Linear Bandits and Application to Bayesian Optimization},\nauthor={Kwang-Sung Jun and Jungtaek Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=W8hBNk1FhQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6447379, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5712720731117007279&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": "cs.arizona.edu;pitt.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Arizona;University of Pittsburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.arizona.edu;https://www.pitt.edu", "aff_unique_abbr": "UA;Pitt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Spectral Phase Transition and Optimal PCA in Block-Structured Spiked Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33887", "id": "W97gFmrKe6", "proceeding": "https://proceedings.mlr.press/v235/mergny24a.html", "pdf": "https://openreview.net/pdf?id=W97gFmrKe6", "openreview": "https://openreview.net/forum?id=W97gFmrKe6", "author_site": "Pierre Mergny, Justin Ko, FLORENT KRZAKALA", "tldr": "", "abstract": "We discuss the inhomogeneous Wigner spike model, a theoretical framework recently introduced to study structured noise in various learning scenarios, through the prism of random matrix theory, with a specific focus on its spectral properties. 
Our primary objective is to find an optimal spectral method, and to extend the celebrated (BBP) phase transition criterion, well-known in the homogeneous case, to our inhomogeneous, block-structured, Wigner model. We provide a thorough rigorous analysis of a transformed matrix and show that the transition for the appearance of 1) an outlier outside the bulk of the limiting spectral distribution and 2) a positive overlap between the associated eigenvector and the signal, occurs precisely at the optimal threshold, making the proposed spectral method optimal within the class of iterative methods for the inhomogeneous Wigner problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pierre Mergny;Justin Ko;Florent Krzakala", "authorids": "~Pierre_Mergny1;~Justin_Ko2;~Florent_Krzakala1", "gender": ";;", "homepage": "https://people.epfl.ch/pierre.mergny/?lang=fr;;http://Krzakala.org", "dblp": ";;25/1282", "google_scholar": ";;https://scholar.google.fr/citations?user=3jDeUlMAAAAJ", "orcid": ";;0000-0003-2313-2578", "linkedin": ";;", "or_profile": "~Pierre_Mergny1;~Justin_Ko2;~Florent_Krzakala1", "aff": "EPFL - EPF Lausanne;;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;;epfl.ch", "position": "Postdoc;;Full Professor", "bibtex": "@inproceedings{\nmergny2024spectral,\ntitle={Spectral Phase Transition and Optimal {PCA} in Block-Structured Spiked Models},\nauthor={Pierre Mergny and Justin Ko and Florent Krzakala},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=W97gFmrKe6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 640177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6062814694621164544&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "epfl.ch;;epfl.ch", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Time Series Diffusion in the Frequency Domain", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33886", "id": "W9GaJUVLCT", "proceeding": "https://proceedings.mlr.press/v235/crabbe24a.html", "pdf": "https://openreview.net/pdf?id=W9GaJUVLCT", "openreview": "https://openreview.net/forum?id=W9GaJUVLCT", "author_site": "Jonathan Crabb\u00e9, Nicolas Huynh, Jan Stanczuk, M van der Schaar", "tldr": "", "abstract": "Fourier analysis has been an instrumental tool in the development of signal processing. This leads us to wonder whether this framework could similarly benefit generative modelling. In this paper, we explore this question through the scope of time series diffusion models. More specifically, we analyze whether representing time series in the frequency domain is a useful inductive bias for score-based diffusion models. By starting from the canonical SDE formulation of diffusion in the time domain, we show that a dual diffusion process occurs in the frequency domain with an important nuance: Brownian motions are replaced by what we call mirrored Brownian motions, characterized by mirror symmetries among their components. 
Building on this insight, we show how to adapt the denoising score matching approach to implement diffusion models in the frequency domain. This results in frequency diffusion models, which we compare to canonical time diffusion models. Our empirical evaluation on real-world datasets, covering various domains like healthcare and finance, shows that frequency diffusion models better capture the training distribution than time diffusion models. We explain this observation by showing that time series from these datasets tend to be more localized in the frequency domain than in the time domain, which makes them easier to model in the former case. All our observations point towards impactful synergies between Fourier analysis and diffusion models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonathan Crabb\u00e9;Nicolas Huynh;Jan Pawel Stanczuk;Mihaela van der Schaar", "authorids": "~Jonathan_Crabb\u00e91;~Nicolas_Huynh1;~Jan_Pawel_Stanczuk1;~Mihaela_van_der_Schaar2", "gender": "M;M;Not Specified;F", "homepage": "https://jonathancrabbe.github.io/;;;https://www.vanderschaar-lab.com", "dblp": "278/8353.html;134/9604;286/8660;", "google_scholar": "Y_Nmd2sAAAAJ;;Auwhh8sAAAAJ;DZ3S--MAAAAJ", "orcid": "0000-0002-0341-7712;;;", "linkedin": "jonathan-crabb%C3%A9-4ab5701a5/;;;", "or_profile": "~Jonathan_Crabb\u00e91;~Nicolas_Huynh1;~Jan_Pawel_Stanczuk1;~Mihaela_van_der_Schaar2", "aff": "University of Cambridge;University of Cambridge;;University of California, Los Angeles", "aff_domain": "cam.ac.uk;cam.ac.uk;;ucla.edu", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\ncrabb{\\'e}2024time,\ntitle={Time Series Diffusion in the Frequency Domain},\nauthor={Jonathan Crabb{\\'e} and Nicolas Huynh and Jan Pawel Stanczuk and Mihaela van der Schaar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=W9GaJUVLCT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1795946, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18102599187870191651&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cam.ac.uk;cam.ac.uk;;ucla.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Learning High-Order Relationships of Brain Regions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33885", "id": "WC14xZIaC2", "proceeding": "https://proceedings.mlr.press/v235/qiu24b.html", "pdf": "https://openreview.net/pdf?id=WC14xZIaC2", "openreview": "https://openreview.net/forum?id=WC14xZIaC2", "author_site": "Weikang Qiu, Huangrui Chu, Selena Wang, Haolan Zuo, Xiaoxiao Li, Yize Zhao, ZHITAO YING", "tldr": "", "abstract": "Discovering reliable and informative relationships among brain regions from functional magnetic resonance imaging (fMRI) signals is essential in phenotypic predictions in neuroscience. 
Most of the current methods fail to accurately characterize those interactions because they only focus on pairwise connections and overlook the high-order relationships of brain regions. We propose that these high-order relationships should be *maximally informative and minimally redundant* (MIMR). However, identifying such high-order relationships is challenging and under-explored due to the exponential search space and the absence of a tractable objective. In response to this gap, we propose a novel method named HyBRiD, which aims to extract MIMR high-order relationships from fMRI data. HyBRiD employs a Constructor to identify hyperedge structures, and a Weighter to compute a weight for each hyperedge, which avoids searching in exponential space. HyBRiD achieves the MIMR objective through an innovative information bottleneck framework named multi-head drop-bottleneck with theoretical guarantees. Our comprehensive experiments demonstrate the effectiveness of our model. Our model outperforms the state-of-the-art predictive model by an average of 11.2%, regarding the quality of hyperedges measured by CPM, a standard protocol for studying brain connections.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weikang Qiu;Huangrui Chu;Selena Wang;Haolan Zuo;Xiaoxiao Li;Yize Zhao;Rex Ying", "authorids": "~Weikang_Qiu1;~Huangrui_Chu1;~Selena_Wang1;~Haolan_Zuo1;~Xiaoxiao_Li1;~Yize_Zhao1;~Zhitao_Ying1", "gender": "M;M;F;M;Unspecified;;M", "homepage": "https://www.boltzmachine.com;https://huangruichu.github.io/;https://www.selenawangcv.com;;https://xxlya.github.io/;https://www.yizezhao.com/;https://www.cs.yale.edu/homes/ying-rex", "dblp": "336/1936;;;;71/8042;;209/4936", "google_scholar": "OLRjhHAAAAAJ;;;;sdENOQ4AAAAJ;;6fqNXooAAAAJ", "orcid": ";;;;;;", "linkedin": ";huangruichu/;;daniel-zuo-604619244/;;;rex-ying-92770148/", "or_profile": "~Weikang_Qiu1;~Huangrui_Chu1;~Selena_Wang1;~Haolan_Zuo1;~Xiaoxiao_Li1;~Yize_Zhao1;~Zhitao_Ying1", "aff": "Yale University;Yale University;Yale University;Yale University;University of British Columbia;;Yale University", "aff_domain": "yale.edu;yale.edu;yale.edu;yale.edu;ece.ubc.ca;;yale.edu", "position": "PhD student;MS student;Postdoc;MS student;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nqiu2024learning,\ntitle={Learning High-Order Relationships of Brain Regions},\nauthor={Weikang Qiu and Huangrui Chu and Selena Wang and Haolan Zuo and Xiaoxiao Li and Yize Zhao and Rex Ying},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WC14xZIaC2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2107669, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4767684268898720399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "yale.edu;yale.edu;yale.edu;yale.edu;ece.ubc.ca;;yale.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Yale University;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.ubc.ca", "aff_unique_abbr": "Yale;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "GistScore: Learning Better Representations for In-Context Example Selection with Gist Bottlenecks", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33884", "id": "WCVC5wGZyz", "proceeding": "https://proceedings.mlr.press/v235/gupta24c.html", "pdf": "https://openreview.net/pdf?id=WCVC5wGZyz", "openreview": "https://openreview.net/forum?id=WCVC5wGZyz", "author_site": "Shivanshu Gupta, Clemens Rosenbaum, Ethan R. Elenberg", "tldr": "", "abstract": "In-Context Learning (ICL) is the ability of Large Language Models (LLMs) to perform new tasks when conditioned on prompts comprising a few task examples. However, ICL performance can be critically sensitive to the choice of examples. To dynamically select the best examples for every test input, we propose Example Gisting, a novel approach for training example encoders through supervised finetuning with an attention bottleneck between the inputs and outputs. These gist models form the basis for GistScore, a novel metric for scoring and selecting informative examples. Further, we experiment with two variations: (1) finetuning gist models for each dataset and (2) multi-task training a single model on a large collection of datasets. The latter can be used for new tasks out-of-the-box, enabling a training-free ICL pipeline. Evaluations with 21 datasets spanning 9 tasks and 8 diverse LLMs show that our fine-tuned models get state-of-the-art ICL performance with over 20% absolute gain over off-the-shelf retrievers and 5% over the best prior methods. Further, our multi-task model generalizes well to new tasks, datasets, and prompt templates. Selection using this model matches or outperforms prior methods while being three orders of magnitude faster than the strongest training-free baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shivanshu Gupta;Clemens Rosenbaum;Ethan R. Elenberg", "authorids": "~Shivanshu_Gupta2;~Clemens_Rosenbaum1;~Ethan_R._Elenberg2", "gender": "M;M;", "homepage": "https://shivanshu-gupta.github.io;https://people.cs.umass.edu/~cgbr/;http://eelenberg.github.io/", "dblp": "302/4731;182/2594;150/5501", "google_scholar": "OtlUDs8AAAAJ;JkHX5H8AAAAJ;Kh-DC4IAAAAJ", "orcid": ";;", "linkedin": "shivanshu-gupta1995/;;", "or_profile": "~Shivanshu_Gupta2;~Clemens_Rosenbaum1;~Ethan_R_Elenberg1", "aff": "University of California, Irvine;ASAPP;ASAPP", "aff_domain": "uci.edu;asapp.com;asapp.com", "position": "PhD student;Researcher;Researcher", "bibtex": "@inproceedings{\ngupta2024gistscore,\ntitle={GistScore: Learning Better Representations for In-Context Example Selection with Gist Bottlenecks},\nauthor={Shivanshu Gupta and Clemens Rosenbaum and Ethan R. 
Elenberg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WCVC5wGZyz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2276261, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6081502775958142653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uci.edu;asapp.com;asapp.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Irvine;ASAPP", "aff_unique_dep": ";", "aff_unique_url": "https://www.uci.edu;https://www.asapp.com", "aff_unique_abbr": "UCI;ASAPP", "aff_campus_unique_index": "0", "aff_campus_unique": "Irvine;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Agnostic Interactive Imitation Learning: New Theory and Practical Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33883", "id": "WCwxFM7n5S", "proceeding": "https://proceedings.mlr.press/v235/li24ck.html", "pdf": "https://openreview.net/pdf?id=WCwxFM7n5S", "openreview": "https://openreview.net/forum?id=WCwxFM7n5S", "author_site": "Yichen Li, Chicheng Zhang", "tldr": "", "abstract": "We study interactive imitation learning, where a learner interactively queries a demonstrating expert for action annotations, aiming to learn a policy that has performance competitive with the expert, using as few annotations as possible. We focus on the general agnostic setting where the expert demonstration policy may not be contained in the policy class used by the learner. We propose a new oracle-efficient algorithm MFTPL-P (abbreviation for Mixed Follow the Perturbed Leader with Poisson perturbations) with provable finite-sample guarantees, under the assumption that the learner is given access to samples from some \"explorative\" distribution over states. Our guarantees hold for any policy class, which is considerably broader than prior state of the art. 
We further propose Bootstrap-DAgger, a more practical variant that does not require additional sample access.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yichen Li;Chicheng Zhang", "authorids": "~Yichen_Li3;~Chicheng_Zhang1", "gender": "M;M", "homepage": "https://www.cs.arizona.edu/person/yichen-li;http://zcc1307.github.io", "dblp": ";149/2402", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;29B3BAgAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yichen_Li3;~Chicheng_Zhang1", "aff": "University of Arizona;University of Arizona", "aff_domain": "cs.arizona.edu;arizona.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nli2024agnostic,\ntitle={Agnostic Interactive Imitation Learning: New Theory and Practical Algorithms},\nauthor={Yichen Li and Chicheng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WCwxFM7n5S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4120321, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VuxkKpHg3MgJ:scholar.google.com/&scioq=Agnostic+Interactive+Imitation+Learning:+New+Theory+and+Practical+Algorithms&hl=en&as_sdt=0,14", "gs_version_total": 7, "email": "cs.arizona.edu;arizona.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Arizona", "aff_unique_dep": "", "aff_unique_url": "https://www.arizona.edu", "aff_unique_abbr": "UA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Two Heads Are Better Than One: Boosting Graph Sparse Training via Semantic and Topological Awareness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33882", "id": "WDgV1BJEW0", "proceeding": "https://proceedings.mlr.press/v235/zhang24bx.html", "pdf": "https://openreview.net/pdf?id=WDgV1BJEW0", "openreview": "https://openreview.net/forum?id=WDgV1BJEW0", "author_site": "Guibin Zhang, Yanwei Yue, kun wang, Junfeng Fang, Yongduo Sui, Kai Wang, Yuxuan Liang, Dawei Cheng, Shirui Pan, Tianlong Chen", "tldr": "", "abstract": "Graph Neural Networks (GNNs) excel in various graph learning tasks but face computational challenges when applied to large-scale graphs. A promising solution is to remove non-essential edges to reduce the computational overheads in GNN. Previous literature generally falls into two categories: topology-guided and semantic-guided. The former maintains certain graph topological properties yet often underperforms on GNNs. The latter performs well at lower sparsity on GNNs but faces performance collapse at higher sparsity levels. With this in mind, we propose a new research line and concept termed **Graph Sparse Training** **(GST)**, which dynamically manipulates sparsity at the data level. Specifically, GST initially constructs a topology & semantic anchor at a low training cost, followed by performing dynamic sparse training to align the sparse graph with the anchor. We introduce the **Equilibria Sparsification Principle** to guide this process, balancing the preservation of both topological and semantic information. Ultimately, GST produces a sparse graph with maximum topological integrity and no performance degradation. 
Extensive experiments on 6 datasets and 5 backbones showcase that GST **(I)** identifies subgraphs at higher graph sparsity levels ($1.67\\%\\sim15.85\\%\\uparrow$) than state-of-the-art sparsification methods, **(II)** preserves more key spectral properties, **(III)** achieves $1.27-3.42\\times$ speedup in GNN inference, and **(IV)** successfully helps graph adversarial defense and graph lottery tickets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guibin Zhang;Yanwei Yue;Kun Wang;Junfeng Fang;Yongduo Sui;Kai Wang;Yuxuan Liang;Dawei Cheng;Shirui Pan;Tianlong Chen", "authorids": "~Guibin_Zhang1;~Yanwei_Yue1;~Kun_Wang15;~Junfeng_Fang1;~Yongduo_Sui1;~Kai_Wang8;~Yuxuan_Liang1;~Dawei_Cheng1;~Shirui_Pan1;~Tianlong_Chen1", "gender": ";M;M;M;M;M;M;M;;M", "homepage": ";https://yanweiyue.github.io/;http://home.ustc.edu.cn/~wk520529/#home;https://scholar.google.com/citations?user=beNNywsAAAAJ&hl=zh-CN;https://yongduosui.github.io/;https://kaiwang960112.github.io/;https://yuxuanliang.com;http://cs1.tongji.edu.cn/~dawei/;;https://tianlong-chen.github.io", "dblp": ";289/8664;;340/7929;277/5175;78/2022-36;183/0977;135/6864;91/8171;", "google_scholar": ";https://scholar.google.com.hk/citations?user=JaJm738AAAAJ;UnyqjWQAAAAJ;beNNywsAAAAJ;VD9g6ogAAAAJ;i2II0XIAAAAJ;n9cODgcAAAAJ;4UD20ukAAAAJ;https://scholar.google.com.au/citations?user=frWRJN4AAAAJ;LE3ctn0AAAAJ", "orcid": ";;0000-0003-0602-169X;;0000-0003-4492-147X;0000-0002-1154-5175;0000-0003-2817-7337;0000-0002-5877-7387;0000-0003-0794-527X;0000-0001-7774-8197", "linkedin": ";;;;yongduosui/;;yoshall/;;;tianlong-chen-783862167/", "or_profile": "~Guibin_Zhang1;~Yanwei_Yue1;~Kun_Wang15;~Junfeng_Fang1;~Yongduo_Sui1;~Kai_Wang8;~Yuxuan_Liang1;~Dawei_Cheng1;~Shirui_Pan1;~Tianlong_Chen1", "aff": ";Tongji University;University of Science and Technology of China;;University of Science and Technology of China;National University of Singapore;The Hong Kong University of Science and Technology (Guangzhou);Tongji University;Griffith University;Harvard University", "aff_domain": ";tongji.edu.cn;ustc.edu.cn;;ustc.edu.cn;u.nus.edu;hkust-gz.edu.cn;tongji.edu.cn;griffith.edu.au;harvard.edu", "position": ";Undergrad student;PhD student;;PhD student;PhD student;Assistant Professor;Associate Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\nzhang2024two,\ntitle={Two Heads Are Better Than One: Boosting Graph Sparse Training via Semantic and Topological Awareness},\nauthor={Guibin Zhang and Yanwei Yue and Kun Wang and Junfeng Fang and Yongduo Sui and Kai Wang and Yuxuan Liang and Dawei Cheng and Shirui Pan and Tianlong Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WDgV1BJEW0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7436013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2443199420420427740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";tongji.edu.cn;ustc.edu.cn;;ustc.edu.cn;u.nus.edu;hkust-gz.edu.cn;tongji.edu.cn;griffith.edu.au;harvard.edu", "author_num": 10, "aff_unique_index": "0;1;1;2;3;0;4;5", "aff_unique_norm": "Tongji University;University of Science and Technology of China;National University of Singapore;Hong Kong University of Science and Technology;Griffith University;Harvard University", "aff_unique_dep": ";;;;;", "aff_unique_url": 
"https://www.tongji.edu.cn;http://www.ustc.edu.cn;https://www.nus.edu.sg;https://www.ust.hk;https://www.griffith.edu.au;https://www.harvard.edu", "aff_unique_abbr": "Tongji;USTC;NUS;HKUST;Griffith;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0;0;0;1;0;0;2;3", "aff_country_unique": "China;Singapore;Australia;United States" }, { "title": "Language Models as Science Tutors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33881", "id": "WFyolnFZOR", "proceeding": "https://proceedings.mlr.press/v235/chevalier24a.html", "pdf": "https://openreview.net/pdf?id=WFyolnFZOR", "openreview": "https://openreview.net/forum?id=WFyolnFZOR", "author_site": "Alexis Chevalier, Jiayi Geng, Alexander Wettig, Howard Chen, Sebastian Mizera, Toni Annala, Max Aragon, Arturo Fanlo, Simon Frieder, Simon Machado, Akshara P, Ellie Thieu, Jiachen Wang, Zirui Wang, Xindi Wu, Mengzhou Xia, Wenhan Xia, Jiatong Yu, Junjie Zhu, Zhiyong Ren, Sanjeev Arora, Danqi Chen", "tldr": "", "abstract": "NLP has recently made exciting progress toward training language models (LMs) with strong scientific problem-solving skills. However, model development has not focused on real-life use-cases of LMs for science, including applications in education that require processing long scientific documents. To address this, we introduce TutorEval and TutorChat. TutorEval is a diverse question-answering benchmark consisting of questions about long chapters from STEM textbooks, written by experts. TutorEval helps measure real-life usability of LMs as scientific assistants, and it is the first benchmark combining long contexts, free-form generation, and multi-disciplinary scientific knowledge. Moreover, we show that fine-tuning base models with existing dialogue datasets leads to poor performance on TutorEval. Therefore, we create TutorChat, a dataset of 80,000 long synthetic dialogues about textbooks. We use TutorChat to fine-tune Llemma models with 7B and 34B parameters. These LM tutors specialized in math have a 32K-token context window, and they excel at TutorEval while performing strongly on GSM8K and MATH. Our datasets build on open-source materials, and we release our models, data, and evaluations publicly.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexis Chevalier;Jiayi Geng;Alexander Wettig;Howard Chen;Sebastian Mizera;Toni Annala;Max Aragon;Arturo Rodriguez Fanlo;Simon Frieder;Simon Machado;Akshara Prabhakar;Ellie Thieu;Jiachen T. 
Wang;Zirui Wang;Xindi Wu;Mengzhou Xia;Wenhan Xia;Jiatong Yu;Junjie Zhu;Zhiyong Ren;Sanjeev Arora;Danqi Chen", "authorids": "~Alexis_Chevalier1;~Jiayi_Geng1;~Alexander_Wettig1;~Howard_Chen1;~Sebastian_Mizera1;~Toni_Annala1;~Max_Aragon1;~Arturo_Rodriguez_Fanlo1;~Simon_Frieder1;~Simon_Machado1;~Akshara_Prabhakar1;elliethieu.amherst@gmail.com;~Jiachen_T._Wang1;~Zirui_Wang5;~Xindi_Wu1;~Mengzhou_Xia1;~Wenhan_Xia1;~Jiatong_Yu1;~Junjie_Zhu4;~Zhiyong_Ren1;~Sanjeev_Arora1;~Danqi_Chen1", "gender": "Not Specified;;;M;;M;M;M;;;F;;;M;F;F;F;F;M;;;F", "homepage": "https://www.ias.edu/scholars/alexis-chevalier;https://jiayigeng.github.io/;https://www.cs.princeton.edu/~awettig/;https://howard50b.github.io/;http://smizera.com;;;;;https://sites.google.com/view/simonmachado/home;;;;https://zwcolin.github.io;https://xindiwu.github.io/;https://xiamengzhou.github.io/;https://wenhanlunaxia.github.io/;https://www.cs.princeton.edu/~jiatongy/;https://junjiezhublog.wordpress.com/;;http://www.cs.princeton.edu/~arora/;https://www.cs.princeton.edu/~danqic/", "dblp": ";;302/0235;06/2061;;;;;;;307/3028;;;;235/0784;241/9329;;;;;a/SArora;87/7949", "google_scholar": ";VHpgr5IAAAAJ;N_jSE08AAAAJ;wsNa_W4AAAAJ;zoabscUAAAAJ;bo3IAlUAAAAJ;gM3o0i0AAAAJ;;;0IUG5DIAAAAJ;vvUmC_EAAAAJ;;;https://scholar.google.com/citations?hl=en;hvnUnrUAAAAJ;zyJn1IcAAAAJ;;;n9Zatu8AAAAJ;;RUP4S68AAAAJ;sVR8ktkAAAAJ", "orcid": ";;;;0000-0002-8066-5891;0000-0001-6419-0278;;0000-0002-9188-5128;;;0000-0001-6846-9153;;;0009-0005-1329-5607;;;;;0000-0002-7546-2870;;;", "linkedin": "alexchvl;;alexander-wettig/;;;toni-annala-7054b828a/;;;;;p-akshara/;;;zwcolin/;;;;;;;;", "or_profile": "~Alexis_Chevalier1;~Jiayi_Geng1;~Alexander_Wettig1;~Howard_Chen1;~Sebastian_Mizera1;~Toni_Annala1;~Max_Aragon1;~Arturo_Rodriguez_Fanlo1;~Simon_Frieder1;~Simon_Machado1;~Akshara_Prabhakar1;elliethieu.amherst@gmail.com;~Jiachen_T._Wang1;~Zirui_Wang5;~Xindi_Wu1;~Mengzhou_Xia1;~Wenhan_Xia1;~Jiatong_Yu1;~Junjie_Zhu4;~Zhiyong_Ren1;~Sanjeev_Arora1;~Danqi_Chen1", "aff": "Princeton University;Princeton University;Allen Institute for Artificial Intelligence;Princeton University;Institue for Advanced Study, Princeton;Institue for Advanced Study, Princeton;Princeton University;Hebrew University of Jerusalem;;ETHZ - ETH Zurich;Princeton University;;;Princeton University;Princeton University;Princeton University;Princeton University;Princeton University;Princeton University;University of Colorado at Denver-Downtown Campus;Princeton University;Princeton University", "aff_domain": "princeton.edu;cs.princeton.edu;allenai.org;princeton.edu;ias.edu;ias.edu;princeton.edu;huji.ac.il;;ethz.ch;princeton.edu;;;princeton.edu;cs.princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu;;princeton.edu;cs.princeton.edu", "position": "Postdoc;MS student;Intern;PhD student;Postdoc;Postdoc;PhD student;Postdoc;;Postdoc;MS student;;;MS student;PhD student;PhD student;PhD student;Undergrad student;Researcher;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nchevalier2024language,\ntitle={Language Models as Science Tutors},\nauthor={Alexis Chevalier and Jiayi Geng and Alexander Wettig and Howard Chen and Sebastian Mizera and Toni Annala and Max Aragon and Arturo Rodriguez Fanlo and Simon Frieder and Simon Machado and Akshara Prabhakar and Ellie Thieu and Jiachen T. 
Wang and Zirui Wang and Xindi Wu and Mengzhou Xia and Wenhan Xia and Jiatong Yu and Junjie Zhu and Zhiyong Ren and Sanjeev Arora and Danqi Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WFyolnFZOR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 850291, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 22, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3705590770722008462&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 9, "email": "princeton.edu;cs.princeton.edu;allenai.org;princeton.edu;ias.edu;ias.edu;princeton.edu;huji.ac.il;;ethz.ch;princeton.edu;;;princeton.edu;cs.princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu;;princeton.edu;cs.princeton.edu", "author_num": 22, "aff_unique_index": "0;0;1;0;2;2;0;3;4;0;0;0;0;0;0;0;5;0;0", "aff_unique_norm": "Princeton University;Allen Institute for Artificial Intelligence;Institute for Advanced Study;Hebrew University of Jerusalem;ETH Zurich;University of Colorado Denver", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.princeton.edu;https://allenai.org;https://www.ias.edu;https://www.huji.ac.il;https://www.ethz.ch;https://www.ucdenver.edu", "aff_unique_abbr": "Princeton;AI2;IAS;HUJI;ETHZ;UC Denver", "aff_campus_unique_index": "1;1;2;3", "aff_campus_unique": ";Princeton;Jerusalem;Downtown Campus", "aff_country_unique_index": "0;0;0;0;0;0;0;1;2;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;Israel;Switzerland" }, { "title": "An Empirical Study of Realized GNN Expressiveness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33880", "id": "WIaZFk02fI", "proceeding": "https://proceedings.mlr.press/v235/wang24cl.html", "pdf": "https://openreview.net/pdf?id=WIaZFk02fI", "openreview": "https://openreview.net/forum?id=WIaZFk02fI", "author_site": "Yanbo Wang, Muhan Zhang", "tldr": "", "abstract": "Research on the theoretical expressiveness of Graph Neural Networks (GNNs) has developed rapidly, and many methods have been proposed to enhance the expressiveness. However, most methods do not have a uniform expressiveness measure except for a few that strictly follow the $k$-dimensional Weisfeiler-Lehman ($k$-WL) test hierarchy, leading to difficulties in quantitatively comparing their expressiveness. Previous research has attempted to use datasets for measurement, but faces problems with difficulty (any model surpassing 1-WL has nearly 100% accuracy), granularity (models tend to be either 100% correct or near random guess), and scale (only several essentially different graphs involved). To address these limitations, we study the realized expressive power that a practical model instance can achieve using a novel expressiveness dataset, BREC, which poses greater difficulty (with up to 4-WL-indistinguishable graphs), finer granularity (enabling comparison of models between 1-WL and 3-WL), and a larger scale (consisting of 800 1-WL-indistinguishable graphs that are non-isomorphic to each other). We synthetically test 23 models with higher-than-1-WL expressiveness on BREC. Our experiment gives the first thorough measurement of the realized expressiveness of those state-of-the-art beyond-1-WL GNN models and reveals the gap between theoretical and realized expressiveness. 
Dataset and evaluation codes are released at: https://github.com/GraphPKU/BREC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanbo Wang;Muhan Zhang", "authorids": "~Yanbo_Wang2;~Muhan_Zhang1", "gender": "M;M", "homepage": "https://yanxwb.github.io/;https://muhanzhang.github.io/", "dblp": ";157/5518", "google_scholar": "Rmo7EXQAAAAJ;https://scholar.google.com.hk/citations?user=OBBqkosAAAAJ", "orcid": ";0000-0002-7680-6401", "linkedin": ";jerry-muhan-zhang-a33a1777/", "or_profile": "~Yanbo_Wang2;~Muhan_Zhang1", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024an,\ntitle={An Empirical Study of Realized {GNN} Expressiveness},\nauthor={Yanbo Wang and Muhan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WIaZFk02fI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1089543, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11872123270568217565&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "pku.edu.cn;pku.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Linear Explanations for Individual Neurons", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33879", "id": "WIbntm28cM", "proceeding": "https://proceedings.mlr.press/v235/oikarinen24a.html", "pdf": "https://openreview.net/pdf?id=WIbntm28cM", "openreview": "https://openreview.net/forum?id=WIbntm28cM", "author_site": "Tuomas Oikarinen, Lily Weng", "tldr": "", "abstract": "In recent years many methods have been developed to understand the internal workings of neural networks, often by describing the function of individual neurons in the model. However, these methods typically only focus on explaining the very highest activations of a neuron. In this paper we show this is not sufficient, and that the highest activation range is only responsible for a very small percentage of the neuron's causal effect. In addition, inputs causing lower activations are often very different and can't be reliably predicted by only looking at high activations. We propose that neurons should instead be understood as a linear combination of concepts, and develop an efficient method for producing these linear explanations. In addition, we show how to automatically evaluate description quality using simulation, i.e. 
predicting neuron activations on unseen inputs in the vision setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tuomas Oikarinen;Tsui-Wei Weng", "authorids": "~Tuomas_Oikarinen1;~Tsui-Wei_Weng1", "gender": "M;F", "homepage": "https://tuomaso.github.io/;https://lilywenglab.github.io", "dblp": "243/3532;177/9197", "google_scholar": "M3KZnPwAAAAJ;v8GM4xoAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Tuomas_Oikarinen1;~Tsui-Wei_Weng1", "aff": "Genentech;University of California, San Diego", "aff_domain": "gene.com;ucsd.edu", "position": "Intern;Assistant Professor", "bibtex": "@inproceedings{\noikarinen2024linear,\ntitle={Linear Explanations for Individual Neurons},\nauthor={Tuomas Oikarinen and Tsui-Wei Weng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WIbntm28cM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9800205, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11762235162028091045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "gene.com;ucsd.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Genentech;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.genentech.com;https://www.ucsd.edu", "aff_unique_abbr": "Genentech;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Breaking the Barrier: Enhanced Utility and Robustness in Smoothed DRL Agents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33878", "id": "WJ5fJhwvCl", "proceeding": "https://proceedings.mlr.press/v235/sun24b.html", "pdf": "https://openreview.net/pdf?id=WJ5fJhwvCl", "openreview": "https://openreview.net/forum?id=WJ5fJhwvCl", "author_site": "Chung-En Sun, Sicun Gao, Lily Weng", "tldr": "", "abstract": "Robustness remains a paramount concern in deep reinforcement learning (DRL), with randomized smoothing emerging as a key technique for enhancing this attribute. However, a notable gap exists in the performance of current smoothed DRL agents, often characterized by significantly low clean rewards and weak robustness. In response to this challenge, our study introduces innovative algorithms aimed at training effective smoothed robust DRL agents. We propose S-DQN and S-PPO, novel approaches that demonstrate remarkable improvements in clean rewards, empirical robustness, and robustness guarantee across standard RL benchmarks. Notably, our S-DQN and S-PPO agents not only significantly outperform existing smoothed agents by an average factor of $2.16\\times$ under the strongest attack, but also surpass previous robustly-trained agents by an average factor of $2.13\\times$. This represents a significant leap forward in the field. Furthermore, we introduce Smoothed Attack, which is $1.89\\times$ more effective in decreasing the rewards of smoothed agents than existing adversarial attacks. 
Our code is available at: [https://github.com/Trustworthy-ML-Lab/Robust_HighUtil_Smoothed_DRL](https://github.com/Trustworthy-ML-Lab/Robust_HighUtil_Smoothed_DRL)", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chung-En Sun;Sicun Gao;Tsui-Wei Weng", "authorids": "~Chung-En_Sun1;~Sicun_Gao1;~Tsui-Wei_Weng1", "gender": "M;M;F", "homepage": "https://sunchungen.github.io/;;https://lilywenglab.github.io", "dblp": "264/5788;22/8296;177/9197", "google_scholar": "https://scholar.google.com/citations?hl=en;;v8GM4xoAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chung-En_Sun1;~Sicun_Gao1;~Tsui-Wei_Weng1", "aff": "Microsoft;University of California, San Diego;University of California, San Diego", "aff_domain": "microsoft.com;ucsd.edu;ucsd.edu", "position": "Intern;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nsun2024breaking,\ntitle={Breaking the Barrier: Enhanced Utility and Robustness in Smoothed {DRL} Agents},\nauthor={Chung-En Sun and Sicun Gao and Tsui-Wei Weng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WJ5fJhwvCl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1531572, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16838820785238307811&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "microsoft.com;ucsd.edu;ucsd.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Microsoft;University of California, San Diego", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.ucsd.edu", "aff_unique_abbr": "Microsoft;UCSD", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Graph Matching when Nodes are Corrupt", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33877", "id": "WJn1BAx9aj", "proceeding": "https://proceedings.mlr.press/v235/ameen24a.html", "pdf": "https://openreview.net/pdf?id=WJn1BAx9aj", "openreview": "https://openreview.net/forum?id=WJn1BAx9aj", "author_site": "Taha Ameen Ur Rahman, Bruce Hajek", "tldr": "", "abstract": "Two models are introduced to study the problem of matching two correlated graphs when some of the nodes are corrupt. In the weak model, a random subset of nodes in one or both graphs can interact randomly with their network. For this model, it is shown that no estimator can correctly recover a positive fraction of the corrupt nodes. Necessary conditions for any estimator to correctly identify and match all the uncorrupt nodes are derived, and it is shown that these conditions are also sufficient for the k-core estimator. In the strong model, an adversarially selected subset of nodes in one or both graphs can interact arbitrarily with their network. For this model, detection of corrupt nodes is impossible. 
Even so, we show that if only one of the networks is compromised, then under appropriate conditions, the maximum overlap estimator can correctly match a positive fraction of nodes, albeit without explicitly identifying them.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taha Ameen;Bruce Hajek", "authorids": "~Taha_Ameen1;~Bruce_Hajek1", "gender": "M;", "homepage": "https://www.taha-ameen.com;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Taha_Ameen1;~Bruce_Hajek1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;", "position": "PhD student;", "bibtex": "@inproceedings{\nameen2024robust,\ntitle={Robust Graph Matching when Nodes are Corrupt},\nauthor={Taha Ameen and Bruce Hajek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WJn1BAx9aj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 491936, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8095619255259992074&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "illinois.edu;", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Tackling Non-Stationarity in Reinforcement Learning via Causal-Origin Representation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33876", "id": "WLGWMDtj8L", "proceeding": "https://proceedings.mlr.press/v235/zhang24ah.html", "pdf": "https://openreview.net/pdf?id=WLGWMDtj8L", "openreview": "https://openreview.net/forum?id=WLGWMDtj8L", "author_site": "Wanpeng Zhang, Yilin Li, Boyu Yang, Zongqing Lu", "tldr": "", "abstract": "In real-world scenarios, the application of reinforcement learning is significantly challenged by complex non-stationarity. Most existing methods attempt to model changes in the environment explicitly, often requiring impractical prior knowledge of environments. In this paper, we propose a new perspective, positing that non-stationarity can propagate and accumulate through complex causal relationships during state transitions, thereby compounding its sophistication and affecting policy learning. We believe that this challenge can be more effectively addressed by implicitly tracing the causal origin of non-stationarity. To this end, we introduce the Causal-Origin REPresentation (COREP) algorithm. COREP primarily employs a guided updating mechanism to learn a stable graph representation for the state, termed the causal-origin representation. By leveraging this representation, the learned policy exhibits impressive resilience to non-stationarity. We supplement our approach with a theoretical analysis grounded in the causal interpretation for non-stationary reinforcement learning, advocating for the validity of the causal-origin representation. Experimental results further demonstrate the superior performance of COREP over existing methods in tackling non-stationarity problems.
The code is available at https://github.com/PKU-RL/COREP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanpeng Zhang;Yilin Li;Boyu Yang;Zongqing Lu", "authorids": "~Wanpeng_Zhang1;~Yilin_Li1;~Boyu_Yang2;~Zongqing_Lu2", "gender": "M;M;;", "homepage": "https://zhangwp.com;;;", "dblp": "73/10693-2;;;", "google_scholar": "_IKNf9EAAAAJ;https://scholar.google.com/citations?hl=en;;", "orcid": "0000-0001-5351-3449;0000-0001-7124-9186;;", "linkedin": ";;;", "or_profile": "~Wanpeng_Zhang1;~Yilin_Li1;~Boyu_Yang2;~Zongqing_Lu2", "aff": "Peking University;Peking University;;", "aff_domain": "pku.edu.cn;pku.edu.cn;;", "position": "PhD student;PhD student;;", "bibtex": "@inproceedings{\nzhang2024tackling,\ntitle={Tackling Non-Stationarity in Reinforcement Learning via Causal-Origin Representation},\nauthor={Wanpeng Zhang and Yilin Li and Boyu Yang and Zongqing Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WLGWMDtj8L}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1947858, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dxVnbhmEw3MJ:scholar.google.com/&scioq=Tackling+Non-Stationarity+in+Reinforcement+Learning+via+Causal-Origin+Representation&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "pku.edu.cn;pku.edu.cn;;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings for Robust Large Vision-Language Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33875", "id": "WLPhywf1si", "proceeding": "https://proceedings.mlr.press/v235/schlarmann24a.html", "pdf": "https://openreview.net/pdf?id=WLPhywf1si", "openreview": "https://openreview.net/forum?id=WLPhywf1si", "author_site": "Christian Schlarmann, Naman Singh, Francesco Croce, Matthias Hein", "tldr": "", "abstract": "Multi-modal foundation models like OpenFlamingo, LLaVA, and GPT-4 are increasingly used for various real-world tasks. Prior work has shown that these models are highly vulnerable to adversarial attacks on the vision modality. These attacks can be leveraged to spread fake information or defraud users, and thus pose a significant risk, which makes the robustness of large multi-modal foundation models a pressing problem. The CLIP model, or one of its variants, is used as a frozen vision encoder in many large vision-language models (LVLMs), e.g. LLaVA and OpenFlamingo. We propose an unsupervised adversarial fine-tuning scheme to obtain a robust CLIP vision encoder, which yields robustness on all vision down-stream tasks (LVLMs, zero-shot classification) that rely on CLIP. In particular, we show that stealth-attacks on users of LVLMs by a malicious third party providing manipulated images are no longer possible once one replaces the original CLIP model with our robust one. No retraining or fine-tuning of the down-stream LVLMs is required. 
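[Editor's note: the following is a schematic sketch of unsupervised adversarial fine-tuning of a vision encoder, one plausible reading of the scheme just described — perturb images to push the trainable encoder's embedding away from the frozen original encoder's embedding, then train the encoder to match the clean embeddings. It is not the authors' released code; names and hyperparameters are assumptions.]

```python
# Schematic sketch: unsupervised adversarial fine-tuning of a CLIP-style
# vision encoder. `encoder` is the trainable copy, `frozen_encoder` the
# original; no labels or captions are needed.
import torch

def pgd_embedding_attack(encoder, frozen_encoder, x, eps=4/255, alpha=1/255, steps=10):
    """Find a bounded perturbation that maximizes the distance between the
    trainable encoder's embedding and the frozen clean embedding."""
    with torch.no_grad():
        target = frozen_encoder(x)
    delta = torch.zeros_like(x, requires_grad=True)
    for _ in range(steps):
        loss = ((encoder(x + delta) - target) ** 2).sum()
        loss.backward()
        with torch.no_grad():
            delta += alpha * delta.grad.sign()
            delta.clamp_(-eps, eps)
        delta.grad.zero_()
    return (x + delta).detach()

def finetune_step(encoder, frozen_encoder, x, opt):
    """Train the encoder so adversarial embeddings match the original clean
    embeddings, preserving CLIP's feature space for downstream LVLMs."""
    x_adv = pgd_embedding_attack(encoder, frozen_encoder, x)
    with torch.no_grad():
        target = frozen_encoder(x)
    loss = ((encoder(x_adv) - target) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()
```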
The code and robust models are available on GitHub.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christian Schlarmann;Naman Deep Singh;Francesco Croce;Matthias Hein", "authorids": "~Christian_Schlarmann1;~Naman_Deep_Singh1;~Francesco_Croce1;~Matthias_Hein2", "gender": ";M;M;M", "homepage": ";https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/maschinelles-lernen/team/;;https://uni-tuebingen.de/de/164260", "dblp": ";230/3694.html;52/4288;97/1213-1", "google_scholar": ";https://scholar.google.de/citations?user=zfObWM0AAAAJ;https://scholar.google.de/citations?view_op=list_works;0ZAb3tsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Christian_Schlarmann1;~Naman_Deep_Singh1;~Francesco_Croce1;~Matthias_Hein2", "aff": ";Eberhard-Karls-Universit\u00e4t T\u00fcbingen;EPFL - EPF Lausanne;University of T\u00fcbingen", "aff_domain": ";uni-tuebingen.de;epfl.ch;uni-tuebingen.de", "position": ";PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nschlarmann2024robust,\ntitle={Robust {CLIP}: Unsupervised Adversarial Fine-Tuning of Vision Embeddings for Robust Large Vision-Language Models},\nauthor={Christian Schlarmann and Naman Deep Singh and Francesco Croce and Matthias Hein},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WLPhywf1si}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4896304, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11316451338934221409&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": ";uni-tuebingen.de;epfl.ch;uni-tuebingen.de", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;EPFL;University of T\u00fcbingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.epfl.ch;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;EPFL;Uni T\u00fcbingen", "aff_campus_unique_index": "0;1", "aff_campus_unique": "T\u00fcbingen;Lausanne;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;Switzerland" }, { "title": "Why Larger Language Models Do In-context Learning Differently?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33874", "id": "WOa96EG26M", "proceeding": "https://proceedings.mlr.press/v235/shi24f.html", "pdf": "https://openreview.net/pdf?id=WOa96EG26M", "openreview": "https://openreview.net/forum?id=WOa96EG26M", "author_site": "Zhenmei Shi, Junyi Wei, Zhuoyan Xu, Yingyiu Liang", "tldr": "", "abstract": "Large language models (LLMs) have emerged as a powerful tool for AI, with the key ability of in-context learning (ICL), where they can perform well on unseen tasks based on a brief series of task examples without necessitating any adjustments to the model parameters. One recent and intriguing observation is that models of different scales may have different ICL behaviors: larger models tend to be more sensitive to noise in the test context. This work studies this observation theoretically aiming to improve the understanding of LLM and ICL. We analyze two stylized settings: (1) linear regression with one-layer single-head linear transformers and (2) parity classification with two-layer multiple attention heads transformers (non-linear data and non-linear model).
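[Editor's note: before the findings below, here is a toy numpy caricature of the first stylized setting — in-context linear regression where a "small" model emphasizes a few important hidden features and a "large" one covers all of them. It is an illustration of the flavor of the result, not the paper's transformer construction.]

```python
# Toy caricature: restricting the in-context fit to the top-k features stands
# in for a model that "emphasizes" k hidden directions. With noisy context
# labels, the k=16 ("larger") predictor degrades more than the k=2 one.
import numpy as np

rng = np.random.default_rng(0)
d, n = 16, 32
w_true = np.zeros(d); w_true[:2] = [3.0, 2.0]      # only 2 important features

def icl_predict(X, y, x_query, k):
    """Least-squares fit on the first k features, then predict the query."""
    w_hat = np.zeros(d)
    w_hat[:k] = np.linalg.lstsq(X[:, :k], y, rcond=None)[0]
    return w_hat @ x_query

for noise in [0.0, 2.0]:
    errs = {2: [], 16: []}
    for _ in range(200):
        X = rng.normal(size=(n, d))
        y = X @ w_true + noise * rng.normal(size=n)   # noisy in-context labels
        x_q = rng.normal(size=d)
        for k in errs:
            errs[k].append((icl_predict(X, y, x_q, k) - w_true @ x_q) ** 2)
    print(noise, {k: round(float(np.mean(v)), 3) for k, v in errs.items()})
# With clean labels both do well; with noise the wider fit picks up error
# along unimportant directions and is more easily "distracted".
```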
In both settings, we give closed-form optimal solutions and find that smaller models emphasize important hidden features while larger ones cover more hidden features; thus, smaller models are more robust to noise while larger ones are more easily distracted, leading to different ICL behaviors. This sheds light on what transformers pay attention to and how that affects ICL. Preliminary experimental results on large base and chat models provide positive support for our analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenmei Shi;Junyi Wei;Zhuoyan Xu;Yingyu Liang", "authorids": "~Zhenmei_Shi1;~Junyi_Wei1;~Zhuoyan_Xu1;~Yingyu_Liang1", "gender": "M;F;M;", "homepage": "http://zhmeishi.github.io/;;https://pages.cs.wisc.edu/~zxu444/;", "dblp": "246/5216;166/6146;126/2019;", "google_scholar": "0oeNnzMAAAAJ;Kb1GL40AAAAJ;uufndFAAAAAJ;", "orcid": ";;;", "linkedin": "zhenmei-shi-56408a113/;Junyi-Jenny-Wei-04ba979b/;zhuoyan-xu-0702301a2/;", "or_profile": "~Zhenmei_Shi1;~Junyi_Wei1;~Zhuoyan_Xu1;~Yingyu_Liang1", "aff": "University of Wisconsin - Madison;University of Wisconsin, Madison;University of Wisconsin - Madison;", "aff_domain": "wisc.edu;wisc.edu;wisc.edu;", "position": "PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\nshi2024why,\ntitle={Why Larger Language Models Do In-context Learning Differently?},\nauthor={Zhenmei Shi and Junyi Wei and Zhuoyan Xu and Yingyu Liang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WOa96EG26M}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 511856, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 348, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3276432233377164291&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 12, "email": "wisc.edu;wisc.edu;wisc.edu;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;UW", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Exploring the Robustness of Pipeline-Parallelism-Based Decentralized Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33873", "id": "WPfYVdJHPk", "proceeding": "https://proceedings.mlr.press/v235/lu24c.html", "pdf": "https://openreview.net/pdf?id=WPfYVdJHPk", "openreview": "https://openreview.net/forum?id=WPfYVdJHPk", "author_site": "Lin Lu, Chenxi Dai, Wangcheng Tao, Binhang Yuan, Yanan Sun, Pan Zhou", "tldr": "", "abstract": "Modern machine learning applications increasingly demand greater computational resources for training large models. Decentralized training has emerged as an effective means to democratize this technology. However, the potential threats associated with this approach remain inadequately discussed, posing a hurdle to the development of decentralized training infrastructures. This paper aims to initiate discussion towards this end by exploring the robustness of decentralized training from three primary perspectives. Firstly, we articulate our position on establishing robust decentralized training by outlining potential threats and the corresponding countermeasures.
Secondly, we illustrate a nascent poisoning attack targeting decentralized training frameworks, easily executable by malicious stages. To mitigate this security threat and ensure efficient training, we propose a robust training framework, integrating a 100% detection strategy and efficient training mechanisms. Finally, we demonstrate the severity of the proposed attack and the effectiveness of our robust training framework. This position paper emphasizes the urgency of exploring the robustness of decentralized training and proposes a feasible solution. The code is available at https://github.com/dcx001016/pipeline_attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lin Lu;Chenxi Dai;Wangcheng Tao;Binhang Yuan;Yanan Sun;Pan Zhou", "authorids": "~Lin_Lu3;~Chenxi_Dai1;~Wangcheng_Tao1;~Binhang_Yuan1;~Yanan_Sun4;~Pan_Zhou5", "gender": "M;M;;M;;M", "homepage": "https://github.com/loserlulin9;;https://taowangcheng.github.io;https://binhangyuan.github.io/site/;;http://faculty.hust.edu.cn/pzhou/zh_CN/index.htm", "dblp": ";;;141/0690.html;;84/6614-1", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;TflKxcIAAAAJ;;cTpFPJgAAAAJ", "orcid": ";;;0000-0002-3188-2769;;", "linkedin": ";;;;;", "or_profile": "~Lin_Lu3;~Chenxi_Dai1;~Wangcheng_Tao1;~Binhang_Yuan1;~Yanan_Sun4;~Pan_Zhou5", "aff": "Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Hong Kong University of Science and Technology;;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;hust.edu.cn;hust.edu.cn;ust.hk;;hust.edu.cn", "position": "MS student;MS student;Undergrad student;Assistant Professor;;Professor", "bibtex": "@inproceedings{\nlu2024position,\ntitle={Position: Exploring the Robustness of Pipeline-Parallelism-Based Decentralized Training},\nauthor={Lin Lu and Chenxi Dai and Wangcheng Tao and Binhang Yuan and Yanan Sun and Pan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WPfYVdJHPk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 603459, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10478603051640306012&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "hust.edu.cn;hust.edu.cn;hust.edu.cn;ust.hk;;hust.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Huazhong University of Science and Technology;Hong Kong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;https://www.ust.hk", "aff_unique_abbr": "HUST;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Active Label Correction for Semantic Segmentation with Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33872", "id": "WPt9HRmMrG", "proceeding": "https://proceedings.mlr.press/v235/kim24g.html", "pdf": "https://openreview.net/pdf?id=WPt9HRmMrG", "openreview": "https://openreview.net/forum?id=WPt9HRmMrG", "author_site": "Hoyoung Kim, SEHYUN HWANG, Suha Kwak, Jungseul Ok", "tldr": "", "abstract": "Training and validating models for semantic segmentation require datasets with pixel-wise annotations, which are notoriously labor-intensive. 
Although useful priors such as foundation models or crowdsourced datasets are available, they are error-prone. We hence propose an effective framework of active label correction (ALC) based on a correction-query design that rectifies the pseudo labels of pixels; according to our theoretical analysis and user study, this query is more annotator-friendly than the standard one that asks the annotator to classify a pixel directly. Specifically, leveraging foundation models providing useful zero-shot predictions on pseudo labels and superpixels, our method comprises two key techniques: (i) an annotator-friendly design of correction query with the pseudo labels, and (ii) an acquisition function that looks ahead to label expansions based on the superpixels. Experimental results on PASCAL, Cityscapes, and Kvasir-SEG datasets demonstrate the effectiveness of our ALC framework, outperforming prior methods for active semantic segmentation and label correction. Notably, utilizing our method, we obtained a revised dataset of PASCAL by rectifying errors in 2.6 million pixels in the PASCAL dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hoyoung Kim;Sehyun Hwang;Suha Kwak;Jungseul Ok", "authorids": "~Hoyoung_Kim1;~Sehyun_Hwang1;~Suha_Kwak3;~Jungseul_Ok2", "gender": "M;M;M;M", "homepage": "https://cskhy16.github.io;http://sehyun03.github.io/;https://suhakwak.github.io/;https://sites.google.com/view/jungseulok", "dblp": "05/5746;322/8982;65/6173;117/3448", "google_scholar": "tuVPLyIAAAAJ;GW4KY8IAAAAJ;-gscDIEAAAAJ;KWG3UUMAAAAJ", "orcid": ";0000-0002-8541-9403;;0000-0003-4742-2473", "linkedin": "hoyoung-kim-6142a6162/;sehyun-hwang-864690219;;", "or_profile": "~Hoyoung_Kim1;~Sehyun_Hwang1;~Suha_Kwak3;~Jungseul_Ok2", "aff": "POSTECH;Microsoft Research;POSTECH;POSTECH", "aff_domain": "postech.ac.kr;research.microsoft.com;postech.ac.kr;postech.ac.kr", "position": "PhD student;Intern;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nkim2024active,\ntitle={Active Label Correction for Semantic Segmentation with Foundation Models},\nauthor={Hoyoung Kim and Sehyun Hwang and Suha Kwak and Jungseul Ok},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WPt9HRmMrG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9309187, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10397429164177186990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "postech.ac.kr;research.microsoft.com;postech.ac.kr;postech.ac.kr", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Pohang University of Science and Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.postech.ac.kr;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "POSTECH;MSR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Memorization Through the Lens of Curvature of Loss Function Around Samples", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33871", "id": "WQbDS9RydY", "proceeding": "https://proceedings.mlr.press/v235/garg24a.html", "pdf": "https://openreview.net/pdf?id=WQbDS9RydY", "openreview": "https://openreview.net/forum?id=WQbDS9RydY", "author_site": "Isha Garg, Deepak Ravikumar, Kaushik Roy", "tldr": "",
"abstract": "Deep neural networks are over-parameterized and easily overfit to and memorize the datasets that they train on. In the extreme case, it has been shown that networks can memorize a randomly labeled dataset. In this paper, we propose using the curvature of the loss function around each training sample, averaged over training epochs, as a measure of memorization of a sample. We show that this curvature metric effectively captures memorization statistics, both qualitatively and quantitatively in popular image datasets. We provide quantitative validation of the proposed metric against memorization scores released by Feldman & Zhang (2020). Further, experiments on mislabeled data detection show that corrupted samples are learned with high curvature and using curvature for identifying mislabelled examples outperforms existing approaches. Qualitatively, we find that high curvature samples correspond to long-tailed, mislabeled, or conflicting instances, indicating a likelihood of memorization. Notably, this analysis helps us find, to the best of our knowledge, a novel failure mode on the CIFAR100 and ImageNet datasets: that of duplicated images with differing labels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Isha Garg;Deepak Ravikumar;Kaushik Roy", "authorids": "~Isha_Garg1;~Deepak_Ravikumar1;~Kaushik_Roy1", "gender": "F;;M", "homepage": ";;https://engineering.purdue.edu/NRL/Group", "dblp": ";;r/KaushikRoy", "google_scholar": ";;to4P8KgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Isha_Garg1;~Deepak_Ravikumar1;~Kaushik_Roy1", "aff": "Purdue University;;Purdue University", "aff_domain": "purdue.edu;;purdue.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\ngarg2024memorization,\ntitle={Memorization Through the Lens of Curvature of Loss Function Around Samples},\nauthor={Isha Garg and Deepak Ravikumar and Kaushik Roy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WQbDS9RydY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9485936, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10984568258835713207&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "purdue.edu;;purdue.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass Diffusion Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33870", "id": "WRIn2HmtBS", "proceeding": "https://proceedings.mlr.press/v235/crowson24a.html", "pdf": "https://openreview.net/pdf?id=WRIn2HmtBS", "openreview": "https://openreview.net/forum?id=WRIn2HmtBS", "author_site": "Katherine Crowson, Stefan Baumann, Alex Birch, Tanishq Abraham, Daniel Kaplan, Enrico Shippole", "tldr": "", "abstract": "We present the Hourglass Diffusion Transformer (HDiT), an image-generative model that exhibits linear scaling with pixel count, supporting training at high resolution (e.g. $1024 \\times 1024$) directly in pixel-space. 
Building on the Transformer architecture, which is known to scale to billions of parameters, it bridges the gap between the efficiency of convolutional U-Nets and the scalability of Transformers. HDiT trains successfully without typical high-resolution training techniques such as multiscale architectures, latent autoencoders or self-conditioning. We demonstrate that HDiT performs competitively with existing models on ImageNet $256^2$, and sets a new state-of-the-art for diffusion models on FFHQ-$1024^2$. Code is available at https://github.com/crowsonkb/k-diffusion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Katherine Crowson;Stefan Andreas Baumann;Alex Birch;Tanishq Mathew Abraham;Daniel Z Kaplan;Enrico Shippole", "authorids": "~Katherine_Crowson1;~Stefan_Andreas_Baumann1;~Alex_Birch1;~Tanishq_Mathew_Abraham1;~Daniel_Z_Kaplan2;~Enrico_Shippole1", "gender": "F;;;M;;M", "homepage": "https://kath.io;;;https://tanishq.ai;;https://github.com/conceptofmind", "dblp": ";;;;;356/2379", "google_scholar": ";;;hIyhkfQAAAAJ;;_mXePyUAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;enrico-shippole-495521b8", "or_profile": "~Katherine_Crowson1;~Stefan_Andreas_Baumann1;~Alex_Birch1;~Tanishq_Mathew_Abraham1;~Daniel_Z_Kaplan2;~Enrico_Shippole1", "aff": "EleutherAI;;;Stability AI;;Teraflop AI", "aff_domain": "eleuther.ai;;;stability.ai;;teraflop.ai", "position": "Researcher;;;Research Director;;CEO", "bibtex": "@inproceedings{\ncrowson2024scalable,\ntitle={Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass Diffusion Transformers},\nauthor={Katherine Crowson and Stefan Andreas Baumann and Alex Birch and Tanishq Mathew Abraham and Daniel Z Kaplan and Enrico Shippole},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WRIn2HmtBS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9706136, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14287713676117152529&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "eleuther.ai;;;stability.ai;;teraflop.ai", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "EleutherAI;Stability AI;Teraflop AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.eleuther.ai;https://stability.ai;", "aff_unique_abbr": "EleutherAI;Stability AI;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "title": "A Subquadratic Time Algorithm for Robust Sparse Mean Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33869", "id": "WSi4IiMaCx", "proceeding": "https://proceedings.mlr.press/v235/pensia24a.html", "pdf": "https://openreview.net/pdf?id=WSi4IiMaCx", "openreview": "https://openreview.net/forum?id=WSi4IiMaCx", "tldr": "", "abstract": "We study the algorithmic problem of sparse mean estimation in the presence of adversarial outliers. Specifically, the algorithm observes a *corrupted* set of samples from $\\mathcal{N}(\\mu,\\mathbf{I}_d)$, where the unknown mean $\\mu \\in \\mathbb{R}^d$ is constrained to be $k$-sparse. A series of prior works has developed efficient algorithms for robust sparse mean estimation with sample complexity $\\mathrm{poly}(k,\\log d, 1/\\epsilon)$ and runtime $d^2 \\mathrm{poly}(k,\\log d,1/\\epsilon)$, where $\\epsilon$ is the fraction of contamination. 
In particular, the fastest runtime of existing algorithms is quadratic in the dimension, which can be prohibitive in high dimensions. This quadratic barrier in the runtime stems from the reliance of these algorithms on the sample covariance matrix, which is of size $d^2$. Our main contribution is an algorithm for robust sparse mean estimation which runs in _subquadratic_ time using $\mathrm{poly}(k,\log d,1/\epsilon)$ samples. Our results build on algorithmic advances in detecting weak correlations, a generalized version of the light-bulb problem by Valiant (2015).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ankit Pensia", "authorids": "~Ankit_Pensia1", "gender": "M", "homepage": "https://ankitp.net/", "dblp": "213/7640", "google_scholar": "u1Qs7YIAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Ankit_Pensia1", "aff": "IBM Research", "aff_domain": "ibm.com", "position": "Postdoc", "bibtex": "@inproceedings{\npensia2024a,\ntitle={A Subquadratic Time Algorithm for Robust Sparse Mean Estimation},\nauthor={Ankit Pensia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WSi4IiMaCx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 538483, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LLpQjPtpn1AJ:scholar.google.com/&scioq=A+Subquadratic+Time+Algorithm+for+Robust+Sparse+Mean+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "ibm.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "IBM", "aff_unique_dep": "IBM Research", "aff_unique_url": "https://www.ibm.com/research", "aff_unique_abbr": "IBM", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Helpful or Harmful Data? Fine-tuning-free Shapley Attribution for Explaining Language Model Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33868", "id": "WSpPC1Jm0p", "proceeding": "https://proceedings.mlr.press/v235/wang24aq.html", "pdf": "https://openreview.net/pdf?id=WSpPC1Jm0p", "openreview": "https://openreview.net/forum?id=WSpPC1Jm0p", "author_site": "Jingtan Wang, Xiaoqiang Lin, Rui Qiao, Chuan-Sheng Foo, Bryan Kian Hsiang Low", "tldr": "", "abstract": "The increasing complexity of foundational models underscores the necessity for explainability, particularly for fine-tuning, the most widely used training method for adapting models to downstream tasks. Instance attribution, one type of explanation, attributes the model prediction to each training example by an instance score. However, the robustness of instance scores, specifically towards dataset resampling, has been overlooked. To bridge this gap, we propose a notion of robustness on the sign of the instance score. We theoretically and empirically demonstrate that the popular leave-one-out-based methods lack robustness, while the Shapley value behaves significantly better, but at a higher computational cost. Accordingly, we introduce an efficient fine-tuning-free approximation of the Shapley value (FreeShap) for instance attribution based on the neural tangent kernel. We empirically demonstrate that FreeShap outperforms other methods for instance attribution and other data-centric applications such as data removal, data selection, and wrong label detection, and further scale our approach to large language models (LLMs).
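[Editor's note: the sketch below conveys the spirit of fine-tuning-free Shapley attribution — permutation-sampling Monte Carlo Shapley values where the expensive "fine-tune and evaluate" utility is replaced by cheap kernel regression. The RBF-free kernel interface here is a stand-in for the paper's neural tangent kernel; labels are assumed to be ±1, and all names are hypothetical.]

```python
# Monte Carlo (permutation) Shapley values with a kernel-regression utility
# instead of repeated fine-tuning. K_train/K_test are precomputed kernel
# matrices (the NTK in the paper; any PSD kernel works for this sketch).
import numpy as np

def kernel_utility(idx, K_train, K_test, y_train, y_test, reg=1e-3):
    """Validation accuracy of kernel ridge regression trained on subset idx."""
    if len(idx) == 0:
        return 0.0
    K = K_train[np.ix_(idx, idx)] + reg * np.eye(len(idx))
    alpha = np.linalg.solve(K, y_train[idx])
    pred = np.sign(K_test[:, idx] @ alpha)   # binary labels in {-1, +1}
    return float((pred == y_test).mean())

def shapley_values(K_train, K_test, y_train, y_test, n_perms=50, seed=0):
    rng = np.random.default_rng(seed)
    n = len(y_train)
    phi = np.zeros(n)
    for _ in range(n_perms):
        perm = rng.permutation(n)
        prev, chosen = 0.0, []
        for i in perm:
            chosen.append(i)
            cur = kernel_utility(chosen, K_train, K_test, y_train, y_test)
            phi[i] += cur - prev     # marginal contribution of example i
            prev = cur
    return phi / n_perms
```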
Our code is available at https://github.com/JTWang2000/FreeShap.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingtan Wang;Xiaoqiang Lin;Rui Qiao;Chuan-Sheng Foo;Bryan Kian Hsiang Low", "authorids": "~Jingtan_Wang1;~Xiaoqiang_Lin1;~Rui_Qiao3;~Chuan-Sheng_Foo1;~Bryan_Kian_Hsiang_Low1", "gender": "F;M;M;M;M", "homepage": "https://jtwang2000.github.io/;https://xqlin98.github.io/;https://qiaoruiyt.github.io/;http://ai.stanford.edu/~csfoo;http://www.comp.nus.edu.sg/~lowkh", "dblp": ";269/4573;31/3517-6;73/1823;97/4877", "google_scholar": ";nqKwA60AAAAJ;Ox5Z9EwAAAAJ;AgbeqGkAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";;0000-0002-6719-4490;0000-0002-4748-5792;", "linkedin": "jingtan-wang-130643193/;;;;", "or_profile": "~Jingtan_Wang1;~Xiaoqiang_Lin1;~Rui_Qiao3;~Chuan-Sheng_Foo1;~Bryan_Kian_Hsiang_Low1", "aff": "National University of Singapore, National University of Singapore;National University of Singapore;University of Washington;Institute for Infocomm Research, A*STAR;National University of Singapore", "aff_domain": "u.nus.edu;u.nus.edu;uw.edu;i2r.a-star.edu.sg;nus.edu.sg", "position": "PhD student;PhD student;Intern;Principal Scientist;Associate Professor", "bibtex": "@inproceedings{\nwang2024helpful,\ntitle={Helpful or Harmful Data? Fine-tuning-free Shapley Attribution for Explaining Language Model Predictions},\nauthor={Jingtan Wang and Xiaoqiang Lin and Rui Qiao and Chuan-Sheng Foo and Bryan Kian Hsiang Low},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WSpPC1Jm0p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3782392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17250228767869404958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "u.nus.edu;u.nus.edu;uw.edu;i2r.a-star.edu.sg;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "National University of Singapore;University of Washington;Institute for Infocomm Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.washington.edu;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "NUS;UW;I2R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Singapore;United States" }, { "title": "Gradient-based Visual Explanation for Transformer-based CLIP", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33867", "id": "WT4X3QYopC", "proceeding": "https://proceedings.mlr.press/v235/zhao24p.html", "pdf": "https://openreview.net/pdf?id=WT4X3QYopC", "openreview": "https://openreview.net/forum?id=WT4X3QYopC", "author_site": "Chenyang ZHAO, Kun Wang, Xingyu Zeng, Rui Zhao, Antoni Chan", "tldr": "", "abstract": "Significant progress has been achieved on the improvement and downstream usages of the Contrastive Language-Image Pre-training (CLIP) vision-language model, while less attention is paid to the interpretation of CLIP. We propose a Gradient-based visual Explanation method for CLIP (Grad-ECLIP), which interprets the matching result of CLIP for a specific input image-text pair.
By decomposing the architecture of the encoder and discovering the relationship between the matching similarity and intermediate spatial features, Grad-ECLIP produces effective heat maps that show the influence of image regions or words on the CLIP results. Different from the previous Transformer interpretation methods that focus on the utilization of self-attention maps, which are typically extremely sparse in CLIP, we produce high-quality visual explanations by applying channel and spatial weights on token features. Qualitative and quantitative evaluations verify the superiority of Grad-ECLIP compared with the state-of-the-art methods. A series of analyses is conducted based on our visual explanation results, from which we explore the working mechanism of image-text matching, and the strengths and limitations in attribution identification of CLIP. Codes are available here: https://github.com/Cyang-Zhao/Grad-Eclip.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenyang ZHAO;Kun Wang;Xingyu Zeng;Rui Zhao;Antoni B. Chan", "authorids": "~Chenyang_ZHAO3;~Kun_Wang8;~Xingyu_Zeng1;~Rui_Zhao6;~Antoni_B._Chan1", "gender": "F;M;M;M;M", "homepage": ";https://twitter.com/wk910930;;http://zhaorui.xyz/;http://www.cs.cityu.edu.hk/~abchan/", "dblp": ";;;26/2578-1;55/5814", "google_scholar": "q5AJRJQAAAAJ;;https://scholar.google.com.hk/citations?user=4XyqsFwAAAAJ;1c9oQNMAAAAJ;j4vFSn8AAAAJ", "orcid": "0000-0003-2233-8494;;;;0000-0002-2886-2513", "linkedin": ";;;;", "or_profile": "~Chenyang_ZHAO3;~Kun_Wang8;~Xingyu_Zeng1;~Rui_Zhao6;~Antoni_B._Chan1", "aff": "City University of Hong Kong;SenseTime Group Ltd;SenseTime Group Limited;SenseTime Research;City University of Hong Kong", "aff_domain": "cityu.edu.hk;sensetime.com;sensetime.com;sensetime.com;cityu.edu.hk", "position": "PhD student;Researcher;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nzhao2024gradientbased,\ntitle={Gradient-based Visual Explanation for Transformer-based {CLIP}},\nauthor={Chenyang ZHAO and Kun Wang and Xingyu Zeng and Rui Zhao and Antoni B.
Chan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WT4X3QYopC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10161957, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9754920154856026187&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "cityu.edu.hk;sensetime.com;sensetime.com;sensetime.com;cityu.edu.hk", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "City University of Hong Kong;SenseTime Group;SenseTime Group Limited;SenseTime", "aff_unique_dep": ";;;SenseTime Research", "aff_unique_url": "https://www.cityu.edu.hk;https://www.sensetime.com;https://www.sensetime.com;https://www.sensetime.com", "aff_unique_abbr": "CityU;SenseTime;SenseTime;SenseTime", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Data-Efficient Learning via Clustering-Based Sensitivity Sampling: Foundation Models and Beyond", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33866", "id": "WUQ4YzIQt2", "proceeding": "https://proceedings.mlr.press/v235/axiotis24a.html", "pdf": "https://openreview.net/pdf?id=WUQ4YzIQt2", "openreview": "https://openreview.net/forum?id=WUQ4YzIQt2", "author_site": "Kyriakos Axiotis, Vincent Cohen-Addad, Monika Henzinger, Sammy Jerome, Vahab Mirrokni, David Saulpic, David Woodruff, Michael Wunder", "tldr": "", "abstract": "We study the data selection problem, whose aim is to select a small representative subset of data that can be used to efficiently train a machine learning model. We present a new data selection approach based on $k$-means clustering and sensitivity sampling. Assuming access to an embedding representation of the data with respect to which the model loss is Hölder continuous, our approach provably allows selecting a set of ``typical'' $k + 1/\varepsilon^2$ elements whose average loss corresponds to the average loss of the whole dataset, up to a multiplicative $(1\pm\varepsilon)$ factor and an additive $\varepsilon \lambda \Phi_k$, where $\Phi_k$ represents the $k$-means cost for the input embeddings and $\lambda$ is the Hölder constant. We furthermore demonstrate the performance and scalability of our approach on fine-tuning foundation models and show that it outperforms state-of-the-art methods.
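[Editor's note: a schematic sketch of the selection rule just described — cluster the embeddings, score each point by a sensitivity-style share of the clustering cost, then importance-sample with inverse-probability weights so the weighted subset loss is unbiased for the full dataset. This is an illustrative reading, not the authors' implementation; the one-pass k-means stand-in and the 50/50 score mix are assumptions.]

```python
# Clustering-based sensitivity sampling for data selection (sketch).
import numpy as np

def sensitivity_sample(emb, k, m, seed=0):
    rng = np.random.default_rng(seed)
    n = len(emb)
    # Cheap k-means stand-in: centroids from k random points, one assignment pass.
    centroids = emb[rng.choice(n, size=k, replace=False)]
    d2 = ((emb[:, None, :] - centroids[None, :, :]) ** 2).sum(-1).min(axis=1)
    sens = 0.5 * d2 / d2.sum() + 0.5 / n        # cost share plus uniform term
    p = sens / sens.sum()
    idx = rng.choice(n, size=m, replace=True, p=p)
    weights = 1.0 / (m * p[idx])                # unbiased reweighting
    return idx, weights

emb = np.random.default_rng(1).normal(size=(1000, 8))
idx, w = sensitivity_sample(emb, k=10, m=100)
print(idx[:5], w[:5])   # train on these examples with these loss weights
```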
We also show how it can be applied to linear regression, leading to a new sampling strategy that surprisingly matches the performance of leverage score sampling, while being conceptually simpler and more scalable.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyriakos Axiotis;Vincent Cohen-Addad;Monika Henzinger;Sammy Jerome;Vahab Mirrokni;David Saulpic;David Woodruff;Michael Wunder", "authorids": "~Kyriakos_Axiotis1;~Vincent_Cohen-Addad1;~Monika_Henzinger1;~Sammy_Jerome1;~Vahab_Mirrokni2;~David_Saulpic1;~David_Woodruff1;~Michael_Wunder2", "gender": ";;;;M;;M;", "homepage": ";;;;https://people.csail.mit.edu/mirrokni/Welcome.html;http://www.normalesup.org/~saulpic/;http://www.cs.cmu.edu/~dwoodruf/;", "dblp": "176/5139;136/5814;;;m/VahabSMirrokni;https://dblp.uni-trier.de/pers/hd/s/Saulpic:David;w/DPWoodruff;98/8411", "google_scholar": "Xhv2tkcAAAAJ;;NXbggxYAAAAJ;;opbZfw0AAAAJ;;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ;", "orcid": ";;;;;0000-0003-4208-8541;;", "linkedin": ";;;https://www.linkedin.com/samuel-jerome-6a190284;;;;", "or_profile": "~Kyriakos_Axiotis1;~Vincent_Cohen-Addad1;~Monika_Henzinger1;~Sammy_Jerome1;~Vahab_Mirrokni2;~David_Saulpic1;~David_Woodruff1;~Michael_Wunder2", "aff": "Google;Google;Institute of Science and Technology;Google;Google Research;Institute of Science and Technology;Carnegie Mellon University;Google", "aff_domain": "google.com;google.com;ist.ac.at;google.com;google.com;ist.ac.at;cmu.edu;google.com", "position": "Researcher;Researcher;Full Professor;Researcher;VP, Google Fellow;Postdoc;Full Professor;Researcher", "bibtex": "@inproceedings{\naxiotis2024dataefficient,\ntitle={Data-Efficient Learning via Clustering-Based Sensitivity Sampling: Foundation Models and Beyond},\nauthor={Kyriakos Axiotis and Vincent Cohen-Addad and Monika Henzinger and Sammy Jerome and Vahab Mirrokni and David Saulpic and David Woodruff and Michael Wunder},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WUQ4YzIQt2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 688190, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5227890396390458482&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "google.com;google.com;ist.ac.at;google.com;google.com;ist.ac.at;cmu.edu;google.com", "author_num": 8, "aff_unique_index": "0;0;1;0;0;1;2;0", "aff_unique_norm": "Google;Institute of Science and Technology;Carnegie Mellon University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;;https://www.cmu.edu", "aff_unique_abbr": "Google;;CMU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "Cascade-CLIP: Cascaded Vision-Language Embeddings Alignment for Zero-Shot Semantic Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33865", "id": "WUdq1WFUPr", "proceeding": "https://proceedings.mlr.press/v235/li24aq.html", "pdf": "https://openreview.net/pdf?id=WUdq1WFUPr", "openreview": "https://openreview.net/forum?id=WUdq1WFUPr", "author_site": "Yunheng Li, Zhong-Yu Li, Quan-Sheng Zeng, Qibin Hou, Ming-Ming Cheng", "tldr": "", "abstract": "Pre-trained vision-language models, e.g., CLIP, have been successfully applied to zero-shot semantic segmentation.
Existing CLIP-based approaches primarily utilize visual features from the last layer to align with text embeddings, while they neglect the crucial information in intermediate layers that contain rich object details. However, we find that directly aggregating the multi-level visual features weakens the zero-shot ability for novel classes. The large differences between the visual features from different layers make these features hard to align well with the text embeddings. We resolve this problem by introducing a series of independent decoders to align the multi-level visual features with the text embeddings in a cascaded way, forming a novel but simple framework named Cascade-CLIP. Our Cascade-CLIP is flexible and can be easily applied to existing zero-shot semantic segmentation methods. Experimental results show that our simple Cascade-CLIP achieves superior zero-shot performance on segmentation benchmarks, like COCO-Stuff, Pascal-VOC, and Pascal-Context. Our code is available at https://github.com/HVision-NKU/Cascade-CLIP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunheng Li;Zhong-Yu Li;Quan-Sheng Zeng;Qibin Hou;Ming-Ming Cheng", "authorids": "~Yunheng_Li1;~Zhong-Yu_Li1;~Quan-Sheng_Zeng1;~Qibin_Hou1;~Ming-Ming_Cheng3", "gender": "M;M;M;M;M", "homepage": "https://github.com/lyhisme;https://github.com/ashun989;https://houqb.github.io/;https://mmcheng.net;https://github.com/lzyhha", "dblp": "294/0060.html;364/7590;40/4112;45/7592;283/4860", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;;fF8OFV8AAAAJ;huWpVyEAAAAJ;g6WHXrgAAAAJ", "orcid": ";;;0000-0001-5550-8758;0000-0002-3682-4904", "linkedin": ";;;;", "or_profile": "~Yunheng_Li1;~Quan-Sheng_Zeng1;~Qibin_Hou1;~Ming-Ming_Cheng3;~ZhongYu_Li1", "aff": "Nankai University;Nankai University;Nankai University;Nankai University;Nankai University", "aff_domain": "nankai.edu.cn;nankai.edu.cn;nankai.edu.cn;nankai.edu.cn;nankai.edu.cn", "position": "PhD student;MS student;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nli2024cascadeclip,\ntitle={Cascade-{CLIP}: Cascaded Vision-Language Embeddings Alignment for Zero-Shot Semantic Segmentation},\nauthor={Yunheng Li and Zhong-Yu Li and Quan-Sheng Zeng and Qibin Hou and Ming-Ming Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WUdq1WFUPr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2478781, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8702583709389523552&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "nankai.edu.cn;nankai.edu.cn;nankai.edu.cn;nankai.edu.cn;nankai.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Nankai University", "aff_unique_dep": "", "aff_unique_url": "http://www.nankai.edu.cn", "aff_unique_abbr": "NKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "One Size Fits All for Semantic Shifts: Adaptive Prompt Tuning for Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33864", "id": "WUi1AqhKn5", "proceeding": "https://proceedings.mlr.press/v235/kim24ai.html", "pdf": "https://openreview.net/pdf?id=WUi1AqhKn5", "openreview": "https://openreview.net/forum?id=WUi1AqhKn5", "author_site": "Doyoung Kim, Susik Yoon, 
Dongmin Park, Youngjun Lee, Hwanjun Song, Jihwan Bang, Jae-Gil Lee", "tldr": "", "abstract": "In real-world continual learning (CL) scenarios, tasks often exhibit intricate and unpredictable semantic shifts, posing challenges for *fixed* prompt management strategies which are tailored to only handle semantic shifts of *uniform* degree (i.e., uniformly mild or uniformly abrupt). To address this limitation, we propose an *adaptive* prompting approach that effectively accommodates semantic shifts of *varying* degree where mild and abrupt shifts are mixed. AdaPromptCL employs the assign-and-refine semantic grouping mechanism that dynamically manages prompt groups in accordance with the semantic similarity between tasks, enhancing the quality of grouping through continuous refinement. Our experiment results demonstrate that AdaPromptCL outperforms existing prompting methods by up to 21.3%, especially in the benchmark datasets with diverse semantic shifts between tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Doyoung Kim;Susik Yoon;Dongmin Park;Youngjun Lee;Hwanjun Song;Jihwan Bang;Jae-Gil Lee", "authorids": "~Doyoung_Kim2;~Susik_Yoon1;~Dongmin_Park1;~Youngjun_Lee1;~Hwanjun_Song2;~Jihwan_Bang1;~Jae-Gil_Lee1", "gender": "M;;M;M;M;M;M", "homepage": ";http://www.susikyoon.com;https://dongmean.github.io/;https://github.com/e0jun;https://songhwanjun.github.io/;https://hwany-j.github.io/;https://dm.kaist.ac.kr/jaegil/", "dblp": ";179/5307;82/2651;74/11299;204/3381;221/4643;28/3904", "google_scholar": "vEAbNDYAAAAJ;tCJs1zEAAAAJ;https://scholar.google.co.kr/citations?user=4xXYQl0AAAAJ;https://scholar.google.com/citations?hl=ko;Ijzuc-8AAAAJ;molKYzwAAAAJ;https://scholar.google.com.tw/citations?user=h9mbv9MAAAAJ", "orcid": ";0000-0001-5596-4972;;;0000-0002-1105-0818;;0000-0002-8711-7732", "linkedin": ";;dongmin-park-82995613a/;;;jihwan-bang/;", "or_profile": "~Doyoung_Kim2;~Susik_Yoon1;~Dongmin_Park1;~Youngjun_Lee1;~Hwanjun_Song2;~Jihwan_Bang1;~Jae-Gil_Lee1", "aff": "Korea Advanced Institute of Science & Technology;Korea University;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Amazon Web Services;Qualcomm Inc, QualComm;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;korea.ac.kr;kaist.ac.kr;kaist.ac.kr;amazon.com;qti.qualcomm.com;kaist.ac.kr", "position": "Researcher;Assistant Professor;PhD student;PhD student;Research Scientist;Researcher;Full Professor", "bibtex": "@inproceedings{\nkim2024one,\ntitle={One Size Fits All for Semantic Shifts: Adaptive Prompt Tuning for Continual Learning},\nauthor={Doyoung Kim and Susik Yoon and Dongmin Park and Youngjun Lee and Hwanjun Song and Jihwan Bang and Jae-Gil Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WUi1AqhKn5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1394059, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4689291634515894096&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;korea.ac.kr;kaist.ac.kr;kaist.ac.kr;amazon.com;qti.qualcomm.com;kaist.ac.kr", "author_num": 7, "aff_unique_index": "0;1;0;0;2;3;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Korea University;Amazon;Qualcomm Incorporated", "aff_unique_dep": ";;Amazon Web Services;", "aff_unique_url": 
"https://www.kaist.ac.kr;https://www.korea.ac.kr;https://aws.amazon.com;https://www.qualcomm.com", "aff_unique_abbr": "KAIST;KU;AWS;Qualcomm", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "Allocation Requires Prediction Only if Inequality Is Low", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33863", "id": "WUicA0hOF9", "proceeding": "https://proceedings.mlr.press/v235/shirali24a.html", "pdf": "https://openreview.net/pdf?id=WUicA0hOF9", "openreview": "https://openreview.net/forum?id=WUicA0hOF9", "author_site": "Ali Shirali, Rediet Abebe, Moritz Hardt", "tldr": "", "abstract": "Algorithmic predictions are emerging as a promising solution concept for efficiently allocating societal resources. Fueling their use is an underlying assumption that such systems are necessary to identify individuals for interventions. We propose a principled framework for assessing this assumption: Using a simple mathematical model, we evaluate the efficacy of prediction-based allocations in settings where individuals belong to larger units such as hospitals, neighborhoods, or schools. We find that prediction-based allocations outperform baseline methods using aggregate unit-level statistics only when between-unit inequality is low and the intervention budget is high. Our results hold for a wide range of settings for the price of prediction, treatment effect heterogeneity, and unit-level statistics' learnability. Combined, we highlight the potential limits to improving the efficacy of interventions through prediction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ali Shirali;Rediet Abebe;Moritz Hardt", "authorids": "~Ali_Shirali1;~Rediet_Abebe2;~Moritz_Hardt1", "gender": "M;;Not Specified", "homepage": "https://sites.google.com/berkeley.edu/ali/home;;http://mrtz.org/", "dblp": "299/4983.html;;26/4683", "google_scholar": "FUSSkq0AAAAJ;;adnTgaAAAAAJ", "orcid": "0000-0003-3750-0159;;", "linkedin": ";;", "or_profile": "~Ali_Shirali1;~Rediet_Abebe2;~Moritz_Hardt1", "aff": "University of California, Berkeley;;Max-Planck-Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "berkeley.edu;;is.mpg.de", "position": "PhD Student;;Principal Researcher", "bibtex": "@inproceedings{\nshirali2024allocation,\ntitle={Allocation Requires Prediction Only if Inequality Is Low},\nauthor={Ali Shirali and Rediet Abebe and Moritz Hardt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WUicA0hOF9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 713041, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16628311051541634189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "berkeley.edu;;is.mpg.de", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Berkeley;Max-Planck-Institute for Intelligent Systems", "aff_unique_dep": ";Intelligent Systems", "aff_unique_url": "https://www.berkeley.edu;https://www.mpi-is.mpg.de", "aff_unique_abbr": "UC Berkeley;MPI-IS", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Germany" }, { "title": "Criterion Collapse and Loss Distribution Control", "status": "Poster", "track": 
"main", "site": "https://icml.cc/virtual/2024/poster/33862", "id": "WVORGH73Cg", "proceeding": "https://proceedings.mlr.press/v235/holland24a.html", "pdf": "https://openreview.net/pdf?id=WVORGH73Cg", "openreview": "https://openreview.net/forum?id=WVORGH73Cg", "tldr": "", "abstract": "In this work, we consider the notion of \"criterion collapse,\" in which optimization of one metric implies optimality in another, with a particular focus on conditions for collapse into error probability minimizers under a wide variety of learning criteria, ranging from DRO and OCE risks (CVaR, tilted ERM) to non-monotonic criteria underlying recent ascent-descent algorithms explored in the literature (Flooding, SoftAD). We show how collapse in the context of losses with a Bernoulli distribution goes far beyond existing results for CVaR and DRO, then expand our scope to include surrogate losses, showing conditions where monotonic criteria such as tilted ERM cannot avoid collapse, whereas non-monotonic alternatives can.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthew J. Holland", "authorids": "~Matthew_J._Holland1", "gender": "M", "homepage": "https://feedbackward.com/", "dblp": "148/9989", "google_scholar": "pQoH5uEAAAAJ", "orcid": "0000-0002-6704-1769", "linkedin": "", "or_profile": "~Matthew_J._Holland1", "aff": "Osaka University", "aff_domain": "osaka-u.ac.jp", "position": "Assistant Professor", "bibtex": "@inproceedings{\nholland2024criterion,\ntitle={Criterion Collapse and Loss Distribution Control},\nauthor={Matthew J. Holland},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WVORGH73Cg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 631777, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10637270332815160587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "osaka-u.ac.jp", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Osaka University", "aff_unique_dep": "", "aff_unique_url": "https://www.osaka-u.ac.jp", "aff_unique_abbr": "Osaka U", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "GeoReasoner: Geo-localization with Reasoning in Street Views using a Large Vision-Language Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33861", "id": "WWo9G5zyh0", "proceeding": "https://proceedings.mlr.press/v235/li24ch.html", "pdf": "https://openreview.net/pdf?id=WWo9G5zyh0", "openreview": "https://openreview.net/forum?id=WWo9G5zyh0", "author_site": "Ling Li, Yu Ye, Bingchuan Jiang, Wei Zeng", "tldr": "", "abstract": "This work tackles the problem of geo-localization with a new paradigm using a large vision-language model (LVLM) augmented with human inference knowledge. A primary challenge here is the scarcity of data for training the LVLM - existing street-view datasets often contain numerous low-quality images lacking visual clues, and lack any reasoning inference. To address the data-quality issue, we devise a CLIP-based network to quantify the degree of street-view images being locatable, leading to the creation of a new dataset comprising highly locatable street views. To enhance reasoning inference, we integrate external knowledge obtained from real geo-localization games, tapping into valuable human inference capabilities. 
The data are utilized to train GeoReasoner, which undergoes fine-tuning through dedicated reasoning and location-tuning stages. Qualitative and quantitative evaluations illustrate that GeoReasoner outperforms counterpart LVLMs by more than 25% at country-level and 38% at city-level geo-localization tasks, and surpasses StreetCLIP performance while requiring fewer training resources. The data and code are available at https://github.com/lingli1996/GeoReasoner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ling Li;Yu Ye;Bingchuan Jiang;Wei Zeng", "authorids": "~Ling_Li12;~Yu_Ye2;~Bingchuan_Jiang1;~Wei_Zeng7", "gender": "M;M;M;F", "homepage": ";;https://www.zeng-wei.com;https://lingli1996.github.io/", "dblp": ";;80/1961-4;", "google_scholar": "J857J64AAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0003-1073-094X;0000-0002-5600-8824;0009-0003-6899-1314", "linkedin": ";;;", "or_profile": "~Yu_Ye2;~Bingchuan_Jiang1;~Wei_Zeng7;~Ling.Li12", "aff": ";Information Engineering University;The Hong Kong University of Science and Technology (Guangzhou);The Hong Kong University of Science and Technology (Guangzhou)", "aff_domain": ";plaieu.edu;hkust-gz.edu.cn;hkust-gz.edu.cn", "position": ";Associate Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nli2024georeasoner,\ntitle={GeoReasoner: Geo-localization with Reasoning in Street Views using a Large Vision-Language Model},\nauthor={Ling Li and Yu Ye and Bingchuan Jiang and Wei Zeng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WWo9G5zyh0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7382813, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17967814895049459567&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";plaieu.edu;hkust-gz.edu.cn;hkust-gz.edu.cn", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Information Engineering University;Hong Kong University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://www.ust.hk", "aff_unique_abbr": ";HKUST", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Iterative Data Smoothing: Mitigating Reward Overfitting and Overoptimization in RLHF", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33860", "id": "WXg6MJo1FH", "proceeding": "https://proceedings.mlr.press/v235/zhu24e.html", "pdf": "https://openreview.net/pdf?id=WXg6MJo1FH", "openreview": "https://openreview.net/forum?id=WXg6MJo1FH", "author_site": "Banghua Zhu, Michael Jordan, Jiantao Jiao", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) is a pivotal technique that aligns language models closely with human-centric values. The initial phase of RLHF involves learning human values using a reward model from ranking data. It is observed that the performance of the reward model degrades after one epoch of training, and optimizing too much against the learned reward model eventually hinders the true objective. This paper analyzes potential reasons behind the issues, and designs an improved reward learning algorithm termed 'Iterative Data Smoothing' (IDS).
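[Editor's note: the record's next sentence states the core idea — each epoch, update the model on the data and the data with the model, replacing hard labels with soft labels. Below is a minimal sketch of that soft-label update for a Bradley-Terry-style reward model; names, the EMA smoothing rate, and the pairwise-data format are assumptions, not the paper's exact procedure.]

```python
# Iterative soft-label update sketch for reward learning on preference pairs.
# soft_labels: list of floats, one per pair, initialized to 1.0 (hard labels).
import torch

def ids_epoch(reward_model, pairs, soft_labels, opt, beta=0.3):
    """One epoch: train on the smoothed labels, then re-smooth the labels."""
    for i, (x_win, x_lose) in enumerate(pairs):
        p = torch.sigmoid(reward_model(x_win) - reward_model(x_lose))
        loss = -(soft_labels[i] * torch.log(p)
                 + (1 - soft_labels[i]) * torch.log(1 - p))
        opt.zero_grad(); loss.backward(); opt.step()
        with torch.no_grad():   # move the label toward the model's prediction
            p_new = torch.sigmoid(reward_model(x_win) - reward_model(x_lose))
            soft_labels[i] = (1 - beta) * soft_labels[i] + beta * p_new.item()
```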
The core idea is that during each training epoch, we not only update the model with the data, but also update the data using the model, replacing hard labels with soft labels. Our empirical findings highlight the superior performance of this approach over the traditional methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Banghua Zhu;Michael Jordan;Jiantao Jiao", "authorids": "~Banghua_Zhu1;~Michael_Jordan1;~Jiantao_Jiao1", "gender": "M;M;M", "homepage": "https://people.eecs.berkeley.edu/~banghua/;http://www.cs.berkeley.edu/~jordan/;https://scholar.google.com/citations?user=aO8KpGcAAAAJ&hl=en", "dblp": "204/5394;j/MichaelIJordan;43/8919", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=yxUduqMAAAAJ;aO8KpGcAAAAJ", "orcid": ";0000-0001-8935-817X;", "linkedin": ";;", "or_profile": "~Banghua_Zhu1;~Michael_Jordan1;~Jiantao_Jiao1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhu2024iterative,\ntitle={Iterative Data Smoothing: Mitigating Reward Overfitting and Overoptimization in {RLHF}},\nauthor={Banghua Zhu and Michael Jordan and Jiantao Jiao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WXg6MJo1FH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1031973, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15835007242568524733&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33859", "id": "WYi3WKZjYe", "proceeding": "https://proceedings.mlr.press/v235/kong24a.html", "pdf": "https://openreview.net/pdf?id=WYi3WKZjYe", "openreview": "https://openreview.net/forum?id=WYi3WKZjYe", "author_site": "Zhifeng Kong, ARUSHI GOEL, Rohan Badlani, Wei Ping, Rafael Valle, Bryan Catanzaro", "tldr": "", "abstract": "Augmenting large language models (LLMs) to understand audio \u2013 including non-speech sounds and non-verbal speech \u2013 is critically important for diverse real-world applications of LLMs. In this paper, we propose Audio Flamingo, a novel audio language model with 1) strong audio understanding abilities, 2) the ability to quickly adapt to unseen tasks via in-context learning and retrieval, and 3) strong multi-turn dialogue abilities. We introduce a series of training techniques, architecture design, and data strategies to enhance our model with these abilities. Extensive evaluations across various audio understanding tasks confirm the efficacy of our method, setting new state-of-the-art benchmarks.
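For the Iterative Data Smoothing entry above, a minimal sketch of the soft-label update on a toy logistic reward model; the pairwise-preference setup and the mixing rate `beta` are illustrative assumptions, not the paper's exact algorithm:

```python
import numpy as np

# Toy logistic "reward model" on pairwise feature gaps: each epoch takes a
# gradient step on the current labels, then softens the labels toward the
# model's own predictions (hard labels become soft labels).
rng = np.random.default_rng(0)
X = rng.normal(size=(256, 4))                 # feature gap between two responses
w_true = np.array([1.0, -2.0, 0.5, 0.0])
y = (X @ w_true + rng.normal(size=256) > 0).astype(float)  # hard preference labels

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

w = np.zeros(4)
lr, beta = 0.5, 0.3                           # beta: label-smoothing rate (assumed)

for epoch in range(20):
    p = sigmoid(X @ w)                        # model's preference probabilities
    w -= lr * X.T @ (p - y) / len(y)          # update the model with the data
    y = (1 - beta) * y + beta * p             # update the data with the model

print("learned weights:", np.round(w, 2))
```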
Our demo website is https://audioflamingo.github.io/ and the code is open-sourced at https://github.com/NVIDIA/audio-flamingo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhifeng Kong;Arushi Goel;Rohan Badlani;Wei Ping;Rafael Valle;Bryan Catanzaro", "authorids": "~Zhifeng_Kong1;~Arushi_Goel2;~Rohan_Badlani1;~Wei_Ping1;~Rafael_Valle1;~Bryan_Catanzaro1", "gender": "M;F;M;M;Not Specified;M", "homepage": "https://cseweb.ucsd.edu/~z4kong;https://goelarushi.github.io/;https://scholar.google.co.in/citations?user=sk-qH8wAAAAJ&hl=en;https://wpingnet.github.io/;http://rafaelvalle.github.io;https://ctnzr.io", "dblp": "206/7097;;;08/8399.html;;14/4826", "google_scholar": "jAOD1dsAAAAJ;tj08PZcAAAAJ;https://scholar.google.co.in/citations?user=sk-qH8wAAAAJ;6gKEYRgAAAAJ;SktxU8IAAAAJ;UZ6kI2AAAAAJ", "orcid": ";;;;;0000-0003-0034-7728", "linkedin": "zhifeng-kong-745605103/;;;wei-ping/;vallerafael/;bryancatanzaro/", "or_profile": "~Zhifeng_Kong1;~Arushi_Goel2;~Rohan_Badlani1;~Wei_Ping1;~Rafael_Valle1;~Bryan_Catanzaro1", "aff": "NVIDIA;NVIDIA Research;NVIDIA;NVIDIA;NVIDIA;NVIDIA", "aff_domain": "nvidia.com;nvidia.com;nvidia.com;nvidia.com;nvidia.com;nvidia.com", "position": "Research Scientist;Researcher;Researcher;Principal Researcher;Senior Research Scientist;Vice President", "bibtex": "@inproceedings{\nkong2024audio,\ntitle={Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities},\nauthor={Zhifeng Kong and Arushi Goel and Rohan Badlani and Wei Ping and Rafael Valle and Bryan Catanzaro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WYi3WKZjYe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1110855, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13588747752690497428&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 8, "email": "nvidia.com;nvidia.com;nvidia.com;nvidia.com;nvidia.com;nvidia.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Neural PDE Solvers with Conservation and Similarity-Equivariance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33858", "id": "WajJf47TUi", "proceeding": "https://proceedings.mlr.press/v235/horie24a.html", "pdf": "https://openreview.net/pdf?id=WajJf47TUi", "openreview": "https://openreview.net/forum?id=WajJf47TUi", "author_site": "Masanobu Horie, NAOTO MITSUME", "tldr": "", "abstract": "Utilizing machine learning to address partial differential equations (PDEs) presents significant challenges due to the diversity of spatial domains and their corresponding state configurations, which complicates the task of encompassing all potential scenarios through data-driven methodologies alone. Moreover, there are legitimate concerns regarding the generalization and reliability of such approaches, as they often overlook inherent physical constraints. In response to these challenges, this study introduces a novel machine-learning architecture that is highly generalizable and adheres to conservation laws and physical symmetries, thereby ensuring greater reliability. 
The foundation of this architecture is graph neural networks (GNNs), which are adept at accommodating a variety of shapes and forms. Additionally, we explore the parallels between GNNs and traditional numerical solvers, facilitating a seamless integration of conservative principles and symmetries into machine learning models. Our experimental findings demonstrate that the model's inclusion of physical laws significantly enhances its generalizability, i.e., it suffers no significant accuracy degradation on unseen spatial domains while other models degrade. The code is available at https://github.com/yellowshippo/fluxgnn-icml2024.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Masanobu Horie;NAOTO MITSUME", "authorids": "~Masanobu_Horie1;~NAOTO_MITSUME1", "gender": "M;M", "homepage": "https://yellowshippo.github.io/;", "dblp": "264/9957;", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.jp/citations?user=p1Uqlh0AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Masanobu_Horie1;~NAOTO_MITSUME1", "aff": "RICOS Co. Ltd.;University of Tsukuba", "aff_domain": "ricos.co.jp;tsukuba.ac.jp", "position": "Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nhorie2024graph,\ntitle={Graph Neural {PDE} Solvers with Conservation and Similarity-Equivariance},\nauthor={Masanobu Horie and NAOTO MITSUME},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WajJf47TUi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9128868, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13141013937255339798&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "ricos.co.jp;tsukuba.ac.jp", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "RICOS Co. Ltd.;University of Tsukuba", "aff_unique_dep": ";", "aff_unique_url": ";https://www.tsukuba.ac.jp", "aff_unique_abbr": ";UT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";Japan" }, { "title": "Decouple then Classify: A Dynamic Multi-view Labeling Strategy with Shared and Specific Information", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33857", "id": "WfJuiIiFzB", "proceeding": "https://proceedings.mlr.press/v235/wan24e.html", "pdf": "https://openreview.net/pdf?id=WfJuiIiFzB", "openreview": "https://openreview.net/forum?id=WfJuiIiFzB", "author_site": "Xinhang Wan, Jiyuan Liu, Xinwang Liu, Yi Wen, Hao Yu, Siwei Wang, Shengju Yu, Tianjiao Wan, Jun Wang, En Zhu", "tldr": "", "abstract": "Sample labeling is the most fundamental step of semi-supervised learning. In the literature, most existing methods randomly label samples with a given ratio, but achieve unpromising and unstable results due to the randomness, especially in multi-view settings. To address this issue, we propose a Dynamic Multi-view Labeling Strategy with Shared and Specific Information. In brief, we build two classifiers with existing labels to utilize decoupled shared and specific information, then select the samples with low classification confidence and label them with high priority. The newly generated labels are also integrated to update the classifiers adaptively. The two processes are executed alternately until a satisfactory classification performance is achieved.
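Stepping back to the Graph Neural PDE Solvers entry above, a minimal sketch of the numerical-solver parallel it draws: exchanging antisymmetric edge fluxes conserves the summed quantity exactly. The hand-written diffusion flux below stands in for the paper's learned model and is purely illustrative:

```python
import numpy as np

# Flux-form message passing on a tiny graph: every edge moves an amount
# -dt*f out of node i and +dt*f into node j, so the total of `u` is
# preserved exactly (up to floating-point error), mirroring conservative
# finite-volume updates.
edges = [(0, 1), (1, 2), (2, 0)]          # directed edge list of a small graph
u = np.array([1.0, 0.0, 0.0])             # conserved quantity per node
dt = 0.1

for _ in range(50):
    flux = np.zeros_like(u)
    for i, j in edges:
        f = u[i] - u[j]                   # illustrative diffusion flux from i to j
        flux[i] -= dt * f                 # what leaves node i ...
        flux[j] += dt * f                 # ... arrives at node j: antisymmetric
    u = u + flux

print("state:", np.round(u, 3), "total:", round(float(u.sum()), 6))
```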
To validate the effectiveness of the proposed method, we conduct extensive experiments on popular benchmarks, achieving promising performance. The code is publicly available at https://github.com/wanxinhang/ICML2024_decouple_then_classify.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinhang Wan;Jiyuan Liu;Xinwang Liu;Yi Wen;Hao Yu;Siwei Wang;Shengju Yu;Tianjiao Wan;Jun Wang;En Zhu", "authorids": "~Xinhang_Wan1;~Jiyuan_Liu1;~Xinwang_Liu1;~Yi_Wen1;~Hao_Yu13;~Siwei_Wang4;~Shengju_Yu1;~Tianjiao_Wan1;wang_jun@nudt.edu.cn;~En_Zhu1", "gender": "M;M;M;;M;M;;F;;M", "homepage": "https://wanxinhang.github.io/;https://liujiyuan13.github.io/;https://xinwangliu.github.io/;https://wenyiwy99.github.io/;https://csyuhao.github.io/;https://wangsiwei2010.github.io/;;https://github.com/6020662832;;https://www.researchgate.net/profile/En_Zhu", "dblp": "331/1513;18/798-3;45/6569-2.html;10/761-1;64/4832-5.html;51/8279-1;;348/9607;;30/1307", "google_scholar": "4CxuLpsAAAAJ;;A56vWC4AAAAJ;lrwiztgAAAAJ;ygRUerEAAAAJ;5o9hK3EAAAAJ;;;;", "orcid": "0000-0001-8749-2869;0000-0001-5702-4941;;0000-0002-5924-1429;0000-0001-9044-4841;0000-0001-9517-262X;;0000-0002-2423-4982;;", "linkedin": ";;;;;;;;;", "or_profile": "~Xinhang_Wan1;~Jiyuan_Liu1;~Xinwang_Liu1;~Yi_Wen1;~Hao_Yu13;~Siwei_Wang4;~Shengju_Yu1;~Tianjiao_Wan1;wang_jun@nudt.edu.cn;~En_Zhu1", "aff": "National University of Defense Technology;National University of Defense Technology;National University of Defense Technology;National University of Defense Technology;National University of Defense Technology;Intelligent Game and Decision Lab;;National University of Defense Technology;;National University of Defense Technology", "aff_domain": "nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;;nudt.edu.cn;;nudt.edu.cn", "position": "PhD student;Lecturer;Full Professor;MS student;PhD student;Assistant Professor;;PhD student;;Full Professor", "bibtex": "@inproceedings{\nwan2024decouple,\ntitle={Decouple then Classify: A Dynamic Multi-view Labeling Strategy with Shared and Specific Information},\nauthor={Xinhang Wan and Jiyuan Liu and Xinwang Liu and Yi Wen and Hao Yu and Siwei Wang and Shengju Yu and Tianjiao Wan and Jun Wang and En Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WfJuiIiFzB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1425117, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9040361115552531064&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 6, "email": "nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;nudt.edu.cn;;nudt.edu.cn;;nudt.edu.cn", "author_num": 10, "aff_unique_index": "0;0;0;0;0;1;0;0", "aff_unique_norm": "National University of Defense Technology;Intelligent Game and Decision Lab", "aff_unique_dep": ";Intelligent Game and Decision Lab", "aff_unique_url": "http://www.nudt.edu.cn/;", "aff_unique_abbr": "NUDT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "Dissecting Multimodality in VideoQA Transformer Models by Impairing Modality Fusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33856", "id": "Wj5wm3Os5v", "proceeding": "https://proceedings.mlr.press/v235/rawal24a.html", "pdf": "https://openreview.net/pdf?id=Wj5wm3Os5v", "openreview": 
"https://openreview.net/forum?id=Wj5wm3Os5v", "author_site": "Ishaan Rawal, Alexander Matyasko, Shantanu Jaiswal, Basura Fernando, Cheston Tan", "tldr": "", "abstract": "While VideoQA Transformer models demonstrate competitive performance on standard benchmarks, the reasons behind their success are not fully understood. Do these models capture the rich multimodal structures and dynamics from video and text jointly? Or are they achieving high scores by exploiting biases and spurious features? Hence, to provide insights, we design *QUAG* (QUadrant AveraGe), a lightweight and non-parametric probe, to conduct dataset-model combined representation analysis by impairing modality fusion. We find that the models achieve high performance on many datasets without leveraging multimodal representations. To validate QUAG further, we design *QUAG-attention*, a less-expressive replacement of self-attention with restricted token interactions. Models with QUAG-attention achieve similar performance with significantly fewer multiplication operations without any finetuning. Our findings raise doubts about the current models' abilities to learn highly-coupled multimodal representations. Hence, we design the *CLAVI* (Complements in LAnguage and VIdeo) dataset, a stress-test dataset curated by augmenting real-world videos to have high modality coupling. Consistent with the findings of QUAG, we find that most of the models achieve near-trivial performance on CLAVI. This reasserts the limitations of current models for learning highly-coupled multimodal representations, that is not evaluated by the current datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ishaan Singh Rawal;Alexander Matyasko;Shantanu Jaiswal;Basura Fernando;Cheston Tan", "authorids": "~Ishaan_Singh_Rawal1;~Alexander_Matyasko2;~Shantanu_Jaiswal1;~Basura_Fernando1;~Cheston_Tan1", "gender": "M;;M;M;M", "homepage": ";;https://basurafernando.github.io/;;", "dblp": ";203/9071;01/9558;136/9366;", "google_scholar": "2qv2QwcAAAAJ;GmGNq2MAAAAJ;https://scholar.google.com.au/citations?user=GyvseMkAAAAJ;Up0UYEYAAAAJ;", "orcid": ";;0000-0002-6920-9916;;", "linkedin": "ishaan-s-rawal;;;cheston-tan/;alexander-matyasko/", "or_profile": "~Ishaan_Singh_Rawal1;~Shantanu_Jaiswal1;~Basura_Fernando1;~Cheston_Tan1;~Alexander_Matyasko1", "aff": "Centre For Frontier AI Research, A*STAR. 
Singapore;Center for Frontier AI Research, A*STAR Singapore;A*STAR;Singapore University of Technology and Design;A*STAR", "aff_domain": "cfar.a-star.edu.sg;ihpc.a-star.edu.sg;astar.edu.sg;sutd.edu.sg;a-star.edu.sg", "position": "Research Engineer;Research Engineer;Principal Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nrawal2024dissecting,\ntitle={Dissecting Multimodality in Video{QA} Transformer Models by Impairing Modality Fusion},\nauthor={Ishaan Singh Rawal and Alexander Matyasko and Shantanu Jaiswal and Basura Fernando and Cheston Tan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wj5wm3Os5v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4370295, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9664228973710650436&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cfar.a-star.edu.sg;ihpc.a-star.edu.sg;astar.edu.sg;sutd.edu.sg;a-star.edu.sg", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "A*STAR;A*STAR Singapore;Agency for Science, Technology and Research;Singapore University of Technology and Design", "aff_unique_dep": "Centre For Frontier AI Research;Center for Frontier AI Research;;", "aff_unique_url": "https://www.a-star.edu.sg;https://www.a-star.edu.sg;https://www.a-star.edu.sg;https://www.sutd.edu.sg", "aff_unique_abbr": "A*STAR;A*STAR;A*STAR;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "From Generalization Analysis to Optimization Designs for State Space Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33855", "id": "WjNzXeiOSL", "proceeding": "https://proceedings.mlr.press/v235/liu24ah.html", "pdf": "https://openreview.net/pdf?id=WjNzXeiOSL", "openreview": "https://openreview.net/forum?id=WjNzXeiOSL", "author_site": "Fusheng Liu, Qianxiao Li", "tldr": "", "abstract": "A State Space Model (SSM) is a foundation model in time series analysis, which has recently been shown as an alternative to transformers in sequence modeling. In this paper, we theoretically study the generalization of SSMs and propose improvements to training algorithms based on the generalization results. Specifically, we give a *data-dependent* generalization bound for SSMs, showing an interplay between the SSM parameters and the temporal dependencies of the training sequences. Leveraging the generalization bound, we (1) set up a scaling rule for model initialization based on the proposed generalization measure, which significantly improves the robustness of the output value scales on SSMs to different temporal patterns in the sequence data; (2) introduce a new regularization method for training SSMs to enhance the generalization performance. 
Numerical experiments are conducted to validate our results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fusheng Liu;Qianxiao Li", "authorids": "~Fusheng_Liu1;~Qianxiao_Li1", "gender": ";M", "homepage": "https://mathematicallfs.github.io;https://blog.nus.edu.sg/qianxiaoli/", "dblp": ";172/0930.html", "google_scholar": ";https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";0000-0002-3903-3737", "linkedin": ";", "or_profile": "~Fusheng_Liu1;~Qianxiao_Li1", "aff": "National University of Singapore;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu.sg", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024from,\ntitle={From Generalization Analysis to Optimization Designs for State Space Models},\nauthor={Fusheng Liu and Qianxiao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WjNzXeiOSL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1111864, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17064670728480213744&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "u.nus.edu;nus.edu.sg", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "FedREDefense: Defending against Model Poisoning Attacks for Federated Learning using Model Update Reconstruction Error", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33854", "id": "Wjq2bS7fTK", "proceeding": "https://proceedings.mlr.press/v235/xie24c.html", "pdf": "https://openreview.net/pdf?id=Wjq2bS7fTK", "openreview": "https://openreview.net/forum?id=Wjq2bS7fTK", "author_site": "Yueqi Xie, Minghong Fang, Neil Gong", "tldr": "", "abstract": "Federated Learning (FL) faces threats from model poisoning attacks. Existing defenses, typically relying on cross-client/global information to mitigate these attacks, fall short when faced with non-IID data distributions and/or a large number of malicious clients. To address these challenges, we present FedREDefense. Unlike existing methods, it doesn't hinge on similar distributions across clients or a predominant presence of benign clients. Instead, it assesses the likelihood that a client's model update is a product of genuine training, solely based on the characteristics of the model update itself. Our key finding is that model updates stemming from genuine training can be approximately reconstructed with some distilled local knowledge, while those from deliberate handcrafted model poisoning attacks cannot. Drawing on this distinction, FedREDefense identifies and filters out malicious clients based on the discrepancies in their model update Reconstruction Errors.
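A sketch of the filtering logic just described, assuming a hypothetical reconstruction step; in the paper each update is reconstructed from distilled local knowledge, whereas here a fixed low-dimensional projection stands in, so only the score-and-filter structure is illustrated:

```python
import numpy as np

def reconstruct(update, basis):
    """Hypothetical stand-in for knowledge-based reconstruction:
    project the update onto a shared low-dimensional subspace."""
    return basis.T @ (basis @ update)

rng = np.random.default_rng(1)
Q, _ = np.linalg.qr(rng.normal(size=(64, 8)))       # shared 8-dim structure
basis = Q.T

# Benign updates lie near the subspace; poisoned ones are unstructured.
benign = [basis.T @ rng.normal(size=8) + 0.05 * rng.normal(size=64)
          for _ in range(18)]
malicious = [rng.normal(size=64) for _ in range(2)]
updates = benign + malicious

errors = np.array([np.linalg.norm(u - reconstruct(u, basis)) for u in updates])
threshold = np.median(errors) * 3                   # illustrative cutoff
print("flagged clients:", np.where(errors > threshold)[0])
```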
Empirical tests on three benchmark datasets confirm that FedREDefense successfully filters model poisoning attacks in FL\u2014even in scenarios with high non-IID degrees and large numbers of malicious clients.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yueqi XIE;Minghong Fang;Neil Zhenqiang Gong", "authorids": "~Yueqi_XIE1;~Minghong_Fang1;~Neil_Zhenqiang_Gong1", "gender": "F;M;M", "homepage": "https://xyq7.github.io/;https://minghongfang.com/;http://people.duke.edu/~zg70/", "dblp": "239/5986;157/0863;03/9437", "google_scholar": "XB8oP_gAAAAJ;L6vkkC8AAAAJ;t6uCsYoAAAAJ", "orcid": "0000-0002-5169-3180;0000-0002-1365-3911;0000-0002-9900-9309", "linkedin": ";;", "or_profile": "~Yueqi_XIE1;~Minghong_Fang1;~Neil_Gong2", "aff": "Hong Kong University of Science and Technology;Duke University;Duke University", "aff_domain": "hkust.edu;duke.edu;duke.edu", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nxie2024fedredefense,\ntitle={Fed{RED}efense: Defending against Model Poisoning Attacks for Federated Learning using Model Update Reconstruction Error},\nauthor={Yueqi XIE and Minghong Fang and Neil Zhenqiang Gong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wjq2bS7fTK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 571908, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14353398340016134125&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "hkust.edu;duke.edu;duke.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Duke University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.duke.edu", "aff_unique_abbr": "HKUST;Duke", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Distilling Morphology-Conditioned Hypernetworks for Efficient Universal Morphology Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33853", "id": "WjvEvYTy3w", "proceeding": "https://proceedings.mlr.press/v235/xiong24c.html", "pdf": "https://openreview.net/pdf?id=WjvEvYTy3w", "openreview": "https://openreview.net/forum?id=WjvEvYTy3w", "author_site": "Zheng Xiong, Risto Vuorio, Jacob Beck, Matthieu Zimmer, Kun Shao, Shimon Whiteson", "tldr": "", "abstract": "Learning a universal policy across different robot morphologies can significantly improve learning efficiency and enable zero-shot generalization to unseen morphologies. However, learning a highly performant universal policy requires sophisticated architectures like transformers (TF) that have larger memory and computational cost than simpler multi-layer perceptrons (MLP). To achieve both good performance like TF and high efficiency like MLP at inference time, we propose HyperDistill, which consists of: (1) A morphology-conditioned hypernetwork (HN) that generates robot-wise MLP policies, and (2) A policy distillation approach that is essential for successful training. We show that on UNIMAL, a benchmark with hundreds of diverse morphologies, HyperDistill performs as well as a universal TF teacher policy on both training and unseen test robots, but reduces model size by 6-14 times, and computational cost by 67-160 times in different environments. 
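The generate-weights-then-run-cheaply structure of a morphology-conditioned hypernetwork can be sketched as below; all sizes, the morphology embedding, and the flat weight layout are illustrative assumptions, not HyperDistill's actual architecture:

```python
import torch
import torch.nn as nn

# A hypernetwork maps a morphology embedding to the weights of a small
# per-robot MLP policy, so inference only pays the MLP's cost.
OBS, ACT, HID, EMB = 16, 4, 32, 8

class HyperNet(nn.Module):
    def __init__(self):
        super().__init__()
        n_params = OBS * HID + HID + HID * ACT + ACT   # flat MLP parameter count
        self.gen = nn.Sequential(nn.Linear(EMB, 64), nn.ReLU(),
                                 nn.Linear(64, n_params))

    def forward(self, morph_emb, obs):
        p = self.gen(morph_emb)                        # generate MLP weights
        i = 0
        W1 = p[i:i + OBS * HID].view(OBS, HID); i += OBS * HID
        b1 = p[i:i + HID]; i += HID
        W2 = p[i:i + HID * ACT].view(HID, ACT); i += HID * ACT
        b2 = p[i:i + ACT]
        return torch.tanh(obs @ W1 + b1) @ W2 + b2     # run the generated policy

net = HyperNet()
action = net(torch.randn(EMB), torch.randn(OBS))
print(action.shape)                                    # torch.Size([4])
```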
Our analysis attributes the efficiency advantage of HyperDistill at inference time to knowledge decoupling, i.e., the ability to decouple inter-task and intra-task knowledge, a general principle that could also be applied to improve inference efficiency in other domains. The code is publicly available at https://github.com/MasterXiong/Universal-Morphology-Control.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zheng Xiong;Risto Vuorio;Jacob Beck;Matthieu Zimmer;Kun Shao;Shimon Whiteson", "authorids": "~Zheng_Xiong1;~Risto_Vuorio1;~Jacob_Beck1;~Matthieu_Zimmer1;~Kun_Shao1;~Shimon_Whiteson1", "gender": ";;;M;;", "homepage": ";;;https://matthieu-zimmer.net/;;", "dblp": "217/1483;;;216/6664;;https://dblp.uni-trier.de/pers/w/Whiteson:Shimon.html", "google_scholar": "F5bted4AAAAJ;;;https://scholar.google.fr/citations?user=6z-GF2sAAAAJ;;", "orcid": ";;;0000-0002-8029-308X;;", "linkedin": ";;;;;", "or_profile": "~Zheng_Xiong1;~Risto_Vuorio1;~Jacob_Beck1;~Matthieu_Zimmer1;~Kun_Shao1;~Shimon_Whiteson1", "aff": "University of Oxford;;;Huawei Technologies Ltd.;;University of Oxford", "aff_domain": "ox.ac.uk;;;huawei.com;;ox.ac.uk", "position": "PhD student;;;Researcher;;Professor", "bibtex": "@inproceedings{\nxiong2024distilling,\ntitle={Distilling Morphology-Conditioned Hypernetworks for Efficient Universal Morphology Control},\nauthor={Zheng Xiong and Risto Vuorio and Jacob Beck and Matthieu Zimmer and Kun Shao and Shimon Whiteson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WjvEvYTy3w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 640817, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13956128168720896147&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ox.ac.uk;;;huawei.com;;ox.ac.uk", "author_num": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Oxford;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.ox.ac.uk;https://www.huawei.com", "aff_unique_abbr": "Oxford;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;China" }, { "title": "Variational Inference with Coverage Guarantees in Simulation-Based Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33852", "id": "Wn4QwCrDvH", "proceeding": "https://proceedings.mlr.press/v235/patel24a.html", "pdf": "https://openreview.net/pdf?id=Wn4QwCrDvH", "openreview": "https://openreview.net/forum?id=Wn4QwCrDvH", "author_site": "Yash Patel, Declan McNamara, Jackson Loper, Jeffrey Regier, Ambuj Tewari", "tldr": "", "abstract": "Amortized variational inference is an often employed framework in simulation-based inference that produces a posterior approximation that can be rapidly computed given any new observation. Unfortunately, there are few guarantees about the quality of these approximate posteriors. We propose Conformalized Amortized Neural Variational Inference (CANVI), a procedure that is scalable, easily implemented, and provides guaranteed marginal coverage. Given a collection of candidate amortized posterior approximators, CANVI constructs conformalized predictors based on each candidate, compares the predictors using a metric known as predictive efficiency, and returns the most efficient predictor. 
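A minimal sketch of the conformalize-compare-select loop just described, with toy point predictors standing in for amortized posterior approximators; the candidates, the conformal score, and the efficiency measure (interval width) are illustrative assumptions:

```python
import numpy as np

# Split-conformal calibration of each candidate, then selection by
# predictive efficiency (smaller average interval width is better).
rng = np.random.default_rng(0)
alpha = 0.1
theta = rng.normal(size=500)                         # calibration truths
x = theta + rng.normal(scale=0.5, size=500)          # simulated observations

candidates = {
    "informed": lambda obs: obs,                     # uses the observation
    "prior_mean": lambda obs: np.zeros_like(obs),    # ignores it
}

level = np.ceil((1 - alpha) * (len(theta) + 1)) / len(theta)
widths = {}
for name, approx in candidates.items():
    scores = np.abs(theta - approx(x))               # conformal scores
    widths[name] = 2 * np.quantile(scores, level)    # 90% interval width

best = min(widths, key=widths.get)
print(widths, "-> selected:", best)
```

Both candidates end up with valid marginal coverage after conformalization; the informed one wins because its intervals are tighter, which is exactly the role predictive efficiency plays in the selection.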
CANVI ensures that the resulting predictor constructs regions that contain the truth with a user-specified level of probability. CANVI is agnostic to design decisions in formulating the candidate approximators and only requires access to samples from the forward model, permitting its use in likelihood-free settings. We prove lower bounds on the predictive efficiency of the regions produced by CANVI and explore how the quality of a posterior approximation relates to the predictive efficiency of prediction regions based on that approximation. Finally, we demonstrate the accurate calibration and high predictive efficiency of CANVI on a suite of simulation-based inference benchmark tasks and an important scientific task: analyzing galaxy emission spectra.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yash Patel;Declan McNamara;Jackson Loper;Jeffrey Regier;Ambuj Tewari", "authorids": "~Yash_Patel3;~Declan_McNamara1;~Jackson_Loper1;~Jeffrey_Regier1;~Ambuj_Tewari1", "gender": "M;M;;M;M", "homepage": "https://yashpatel5400.github.io/;;;https://regier.stat.lsa.umich.edu;https://www.ambujtewari.com", "dblp": ";347/8126;https://dblp.uni-trier.de/pers/hd/l/Loper:Jackson;164/7281;24/567", "google_scholar": "_BQwMtgAAAAJ;ploel6YAAAAJ;4JmKgfkAAAAJ;q-J0TmgAAAAJ;ttbl4FsAAAAJ", "orcid": ";;;0000-0002-1472-5235;0000-0001-6969-7844", "linkedin": ";dmmcnamara/;;;", "or_profile": "~Yash_Patel3;~Declan_McNamara1;~Jackson_Loper1;~Jeffrey_Regier1;~Ambuj_Tewari1", "aff": "University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan;University of Michigan - Ann Arbor", "aff_domain": "umich.edu;umich.edu;umich.edu;umich.edu;umich.edu", "position": "PhD student;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\npatel2024variational,\ntitle={Variational Inference with Coverage Guarantees in Simulation-Based Inference},\nauthor={Yash Patel and Declan McNamara and Jackson Loper and Jeffrey Regier and Ambuj Tewari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wn4QwCrDvH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8830230, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11647668878359276820&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5, "email": "umich.edu;umich.edu;umich.edu;umich.edu;umich.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Universal Gradient Methods for Stochastic Convex Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33851", "id": "Wnhp34K5jR", "proceeding": "https://proceedings.mlr.press/v235/rodomanov24a.html", "pdf": "https://openreview.net/pdf?id=Wnhp34K5jR", "openreview": "https://openreview.net/forum?id=Wnhp34K5jR", "author_site": "Anton Rodomanov, Ali Kavis, Yongtao Wu, Kimon Antonakopoulos, Volkan Cevher", "tldr": "", "abstract": "We develop universal gradient methods for Stochastic Convex Optimization (SCO). 
Our algorithms automatically adapt not only to the oracle's noise but also to the H\u00f6lder smoothness of the objective function without a priori knowledge of the particular setting. The key ingredient is a novel strategy for adjusting step-size coefficients in the Stochastic Gradient Method (SGD). Unlike AdaGrad, which accumulates gradient norms, our Universal Gradient Method accumulates appropriate combinations of gradient and iterate differences. The resulting algorithm has state-of-the-art worst-case convergence rate guarantees for the entire H\u00f6lder class including, in particular, both nonsmooth functions and those with Lipschitz continuous gradient. We also present the Universal Fast Gradient Method for SCO enjoying optimal efficiency estimates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anton Rodomanov;Ali Kavis;Yongtao Wu;Kimon Antonakopoulos;Volkan Cevher", "authorids": "~Anton_Rodomanov1;~Ali_Kavis1;~Yongtao_Wu1;~Kimon_Antonakopoulos1;~Volkan_Cevher1", "gender": ";;M;M;M", "homepage": ";https://alikavis.github.io;https://www.epfl.ch/labs/lions/people/phds/yongtao-wu/;;http://lions.epfl.ch", "dblp": "153/5453;231/7697;322/3726;https://dblp.org/pers/hd/a/Antonakopoulos:Kimon;70/5301", "google_scholar": "u95GRZQAAAAJ;sPrPq6oAAAAJ;rLgDE9AAAAAJ;;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Anton_Rodomanov1;~Ali_Kavis1;~Yongtao_Wu1;~Kimon_Antonakopoulos1;~Volkan_Cevher1", "aff": "CISPA;University of Texas at Austin;Swiss Federal Institute of Technology Lausanne;EPFL - EPF Lausanne;Amazon Development Center Germany", "aff_domain": "cispa.de;utexas.edu;epfl.ch;epfl.ch;amazon.de", "position": "Postdoc;Postdoc;PhD student;Postdoc;Amazon Scholar", "bibtex": "@inproceedings{\nrodomanov2024universal,\ntitle={Universal Gradient Methods for Stochastic Convex Optimization},\nauthor={Anton Rodomanov and Ali Kavis and Yongtao Wu and Kimon Antonakopoulos and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wnhp34K5jR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 709630, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11996586632773965743&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "email": "cispa.de;utexas.edu;epfl.ch;epfl.ch;amazon.de", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "CISPA Helmholtz Center for Information Security;University of Texas at Austin;Swiss Federal Institute of Technology Lausanne;EPFL;Amazon", "aff_unique_dep": ";;;;Development Center", "aff_unique_url": "https://www.cispa.de/;https://www.utexas.edu;https://www.epfl.ch;https://www.epfl.ch;https://www.amazon.de", "aff_unique_abbr": "CISPA;UT Austin;EPFL;EPFL;Amazon", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Austin;Lausanne", "aff_country_unique_index": "0;1;2;2;0", "aff_country_unique": "Germany;United States;Switzerland" }, { "title": "Conditional Common Entropy for Instrumental Variable Testing and Partial Identification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33850", "id": "Wnni3cu39x", "proceeding": "https://proceedings.mlr.press/v235/jiang24b.html", "pdf": "https://openreview.net/pdf?id=Wnni3cu39x", "openreview": "https://openreview.net/forum?id=Wnni3cu39x", "author_site": "Ziwei Jiang, Murat Kocaoglu", "tldr": "",
"abstract": "Instrumental variables (IVs) are widely used for estimating causal effects. There are two main challenges when using instrumental variables. First of all, using IV without additional assumptions such as linearity, the causal effect may still not be identifiable. Second, when selecting an IV, the validity of the selected IV is typically not testable since the causal graph is not identifiable from observational data. In this paper, we propose a method for bounding the causal effect with instrumental variables under weak confounding. In addition, we present a novel criterion to falsify the IV with side information about the confounder. We demonstrate the utility of the proposed method with simulated and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziwei Jiang;Murat Kocaoglu", "authorids": "~Ziwei_Jiang1;~Murat_Kocaoglu1", "gender": "M;M", "homepage": "https://ziwei-jiang.github.io/;https://www.muratkocaoglu.com", "dblp": "https://dblp.org/rec/conf/icml/JiangWK23.html;74/11343", "google_scholar": ";7N7bzdwAAAAJ", "orcid": ";", "linkedin": ";mkocaoglu/", "or_profile": "~Ziwei_Jiang1;~Murat_Kocaoglu1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\njiang2024conditional,\ntitle={Conditional Common Entropy for Instrumental Variable Testing and Partial Identification},\nauthor={Ziwei Jiang and Murat Kocaoglu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wnni3cu39x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1258770, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3950351812149593458&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "purdue.edu;purdue.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Cross-domain Open-world Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33849", "id": "WofwaWjIf7", "proceeding": "https://proceedings.mlr.press/v235/wen24b.html", "pdf": "https://openreview.net/pdf?id=WofwaWjIf7", "openreview": "https://openreview.net/forum?id=WofwaWjIf7", "author_site": "Shuo Wen, Maria Brbic", "tldr": "", "abstract": "In many real-world applications, test data may commonly exhibit categorical shifts, characterized by the emergence of novel classes, as well as distribution shifts arising from feature distributions different from the ones the model was trained on. However, existing methods either discover novel classes in the open-world setting or assume domain shifts without the ability to discover novel classes. In this work, we consider a cross-domain open-world discovery setting, where the goal is to assign samples to seen classes and discover unseen classes under a domain shift. To address this challenging problem, we present CROW, a prototype-based approach that introduces a cluster-then-match strategy enabled by a well-structured representation space of foundation models. 
In this way, CROW discovers novel classes by robustly matching clusters with previously seen classes, followed by fine-tuning the representation space using an objective designed for cross-domain open-world discovery. Extensive experimental results on image classification benchmark datasets demonstrate that CROW outperforms alternative baselines, achieving an 8% average performance improvement across 75 experimental settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuo Wen;Maria Brbic", "authorids": "~Shuo_Wen2;~Maria_Brbic1", "gender": ";F", "homepage": "https://wenshuo128.github.io/;https://brbiclab.epfl.ch/", "dblp": ";130/3233", "google_scholar": ";ltxmeroAAAAJ", "orcid": ";0000-0002-1120-1778", "linkedin": ";", "or_profile": "~Shuo_Wen2;~Maria_Brbic1", "aff": "School of Computer and Communication Sciences, EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": "ic.epfl.ch;epfl.ch", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwen2024crossdomain,\ntitle={Cross-domain Open-world Discovery},\nauthor={Shuo Wen and Maria Brbic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WofwaWjIf7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10139014, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4707525781663529875&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ic.epfl.ch;epfl.ch", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "School of Computer and Communication Sciences", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Watermark Stealing in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33848", "id": "Wp054bnPq9", "proceeding": "https://proceedings.mlr.press/v235/jovanovic24a.html", "pdf": "https://openreview.net/pdf?id=Wp054bnPq9", "openreview": "https://openreview.net/forum?id=Wp054bnPq9", "author_site": "Nikola Jovanovi\u0107, Robin Staab, Martin Vechev", "tldr": "", "abstract": "LLM watermarking has attracted attention as a promising way to detect AI-generated content, with some works suggesting that current schemes may already be fit for deployment. In this work we dispute this claim, identifying *watermark stealing* (WS) as a fundamental vulnerability of these schemes. We show that querying the API of the watermarked LLM to approximately reverse-engineer a watermark enables practical *spoofing attacks*, as hypothesized in prior work, but also greatly boosts *scrubbing* attacks, which was previously unnoticed. We are the first to propose an automated WS algorithm and use it in the first comprehensive study of spoofing and scrubbing in realistic settings. We show that for under $50 an attacker can both spoof and scrub state-of-the-art schemes previously considered safe, with an average success rate of over 80\%. Our findings challenge common beliefs about LLM watermarking, stressing the need for more robust schemes.
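The core of such a stealing attack can be sketched as a frequency comparison against an unwatermarked reference corpus; real KGW-style schemes condition the green list on context and the paper's attack estimates conditional statistics, so this unconditional toy with a made-up boost factor is only illustrative:

```python
from collections import Counter

# Estimate which tokens a watermark "boosts" (the green list) by comparing
# token frequencies in watermarked outputs against a reference corpus.
def estimate_green(watermarked_tokens, reference_tokens, boost=1.5):
    wm, ref = Counter(watermarked_tokens), Counter(reference_tokens)
    n_wm, n_ref = len(watermarked_tokens), len(reference_tokens)
    return {t for t, c in wm.items()
            if (c / n_wm) > boost * (ref.get(t, 0) + 1) / (n_ref + 1)}

wm = "the green cat sat on the green mat near the green door".split()
ref = "the cat sat on the mat near the door".split()
print(estimate_green(wm, ref))   # flags 'green' as over-represented
```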
We make all our code and additional examples available at https://watermark-stealing.org.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikola Jovanovi\u0107;Robin Staab;Martin Vechev", "authorids": "~Nikola_Jovanovi\u01071;~Robin_Staab1;~Martin_Vechev1", "gender": "M;M;M", "homepage": "https://www.sri.inf.ethz.ch/people/nikola;;https://www.sri.inf.ethz.ch/people/martin", "dblp": "230/4424-1;304/3512;93/2189.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";;", "linkedin": "nikola-jovanovi%C4%87-9b599b105/;robin-staab-b778a51a6/;", "or_profile": "~Nikola_Jovanovi\u01071;~Robin_Staab1;~Martin_Vechev1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\njovanovi{\\'c}2024watermark,\ntitle={Watermark Stealing in Large Language Models},\nauthor={Nikola Jovanovi{\\'c} and Robin Staab and Martin Vechev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wp054bnPq9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 535943, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6737928476072006007&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9, "email": "ethz.ch;ethz.ch;ethz.ch", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Time Weaver: A Conditional Time Series Generation Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33847", "id": "WpKDeixmFr", "proceeding": "https://proceedings.mlr.press/v235/narasimhan24a.html", "pdf": "https://openreview.net/pdf?id=WpKDeixmFr", "openreview": "https://openreview.net/forum?id=WpKDeixmFr", "author_site": "Sai Shankar Narasimhan, Shubhankar Agarwal, Oguzhan Akcin, Sujay Sanghavi, Sandeep Chinchali", "tldr": "", "abstract": "Imagine generating a city\u2019s electricity demand pattern based on weather, the presence of an electric vehicle, and location, which could be used for capacity planning during a winter freeze. Such real-world time series are often enriched with paired heterogeneous contextual metadata (e.g., weather and location). Current approaches to time series generation often ignore this paired metadata. Additionally, the heterogeneity in metadata poses several practical challenges in adapting existing conditional generation approaches from the image, audio, and video domains to the time series domain. To address this gap, we introduce TIME WEAVER, a novel diffusion-based model that leverages the heterogeneous metadata in the form of categorical, continuous, and even time-variant variables to significantly improve time series generation. Additionally, we show that naive extensions of standard evaluation metrics from the image to the time series domain are insufficient. 
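One way to picture heterogeneous-metadata conditioning is an encoder that embeds categorical fields and projects continuous ones, concatenating everything into the conditioning vector fed to the (not shown) diffusion denoiser; the field names and sizes below are hypothetical, not Time Weaver's actual design:

```python
import torch
import torch.nn as nn

# Categorical metadata gets embeddings, continuous metadata a linear map;
# the concatenation forms the conditioning vector for a diffusion model.
class MetadataEncoder(nn.Module):
    def __init__(self, n_locations=100, n_cont=3, dim=16):
        super().__init__()
        self.loc_emb = nn.Embedding(n_locations, dim)     # categorical: location
        self.ev_emb = nn.Embedding(2, dim)                # categorical: EV present?
        self.cont = nn.Linear(n_cont, dim)                # continuous: weather etc.

    def forward(self, loc, ev, cont):
        return torch.cat([self.loc_emb(loc), self.ev_emb(ev), self.cont(cont)],
                         dim=-1)

enc = MetadataEncoder()
cond = enc(torch.tensor([7]), torch.tensor([1]), torch.randn(1, 3))
print(cond.shape)                                         # torch.Size([1, 48])
```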
These metrics do not penalize conditional generation approaches for their poor specificity in reproducing the metadata-specific features in the generated time series. Thus, we propose a novel evaluation metric that accurately captures the specificity of conditional generation and the realism of the generated time series. We show that TIME WEAVER outperforms state-of-the-art benchmarks, such as Generative Adversarial Networks (GANs), by up to 30% in downstream classification tasks on real-world energy, medical, air quality, and traffic datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sai Shankar Narasimhan;Shubhankar Agarwal;Oguzhan Akcin;sujay sanghavi;Sandeep P. Chinchali", "authorids": "~Sai_Shankar_Narasimhan1;~Shubhankar_Agarwal1;~Oguzhan_Akcin2;~sujay_sanghavi1;~Sandeep_P._Chinchali1", "gender": "M;;M;M;", "homepage": "https://saishankarn.github.io;;;https://sites.utexas.edu/sanghavi;", "dblp": "340/7508;;311/3023;69/4911.html;", "google_scholar": "y0jHk04AAAAJ;;2elIEXoAAAAJ;O-DazBUAAAAJ;", "orcid": "0000-0001-9714-3865;;;;", "linkedin": ";;oguzhan-akcin-0907/;;", "or_profile": "~Sai_Shankar_Narasimhan1;~Shubhankar_Agarwal1;~Oguzhan_Akcin2;~sujay_sanghavi1;~Sandeep_P._Chinchali1", "aff": "University of Texas at Austin;;The University of Texas at Austin;University of Texas, Austin;", "aff_domain": "utexas.edu;;utexas.edu;utexas.edu;", "position": "PhD student;;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nnarasimhan2024time,\ntitle={Time Weaver: A Conditional Time Series Generation Model},\nauthor={Sai Shankar Narasimhan and Shubhankar Agarwal and Oguzhan Akcin and sujay sanghavi and Sandeep P. Chinchali},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WpKDeixmFr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8626942, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7296022982002440036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "utexas.edu;;utexas.edu;utexas.edu;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Rethinking Decision Transformer via Hierarchical Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33846", "id": "WsM4TVsZpJ", "proceeding": "https://proceedings.mlr.press/v235/ma24b.html", "pdf": "https://openreview.net/pdf?id=WsM4TVsZpJ", "openreview": "https://openreview.net/forum?id=WsM4TVsZpJ", "author_site": "Yi Ma, Jianye Hao, Hebin Liang, Chenjun Xiao", "tldr": "", "abstract": "Decision Transformer (DT) is an innovative algorithm leveraging recent advances of the transformer architecture in reinforcement learning (RL). However, a notable limitation of DT is its reliance on recalling trajectories from datasets, losing the capability to seamlessly stitch sub-optimal trajectories together. In this work we introduce a general sequence modeling framework for studying sequential decision making through the lens of *Hierarchical RL*.
At the time of making decisions, a *high-level* policy first proposes an ideal *prompt* for the current state, and a *low-level* policy subsequently generates an action conditioned on the given prompt. We show that DT emerges as a special case of this framework with certain choices of high-level and low-level policies, and discuss the potential failure of these choices. Inspired by these observations, we study how to jointly optimize the high-level and low-level policies to enable the stitching ability, which further leads to the development of new offline RL algorithms. Our empirical results clearly show that the proposed algorithms significantly surpass DT on several control and navigation benchmarks. We hope our contributions can inspire the integration of transformer architectures within the field of RL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Ma;Jianye HAO;Hebin Liang;Chenjun Xiao", "authorids": "~Yi_Ma5;~Jianye_HAO1;~Hebin_Liang2;~Chenjun_Xiao1", "gender": ";M;;M", "homepage": "https://mayi1996.top/;http://www.icdai.org/jianye.html;https://chenjun-x.github.io/;https://github.com/superCat-star", "dblp": "69/1112-5.html;21/7664.html;178/8641;352/9378.html", "google_scholar": "TdVWzqgAAAAJ;;;", "orcid": "0000-0001-9375-6605;0000-0002-0422-8235;0000-0002-5493-1500;0009-0000-8371-2297", "linkedin": ";;;", "or_profile": "~Yi_Ma5;~Jianye_HAO1;~Chenjun_Xiao1;~hebin_liang1", "aff": "Tianjin University;Tianjin University;Huawei Technologies Ltd.;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn;huawei.com;tju.edu.cn", "position": "PhD student;Associate Professor;Researcher;MS student", "bibtex": "@inproceedings{\nma2024rethinking,\ntitle={Rethinking Decision Transformer via Hierarchical Reinforcement Learning},\nauthor={Yi Ma and Jianye HAO and Hebin Liang and Chenjun Xiao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WsM4TVsZpJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1699531, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=871146290267306675&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tju.edu.cn;tju.edu.cn;huawei.com;tju.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tianjin University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "http://www.tju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "TJU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Position: Do pretrained Transformers Learn In-Context by Gradient Descent?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33845", "id": "WsawczEqO6", "proceeding": "https://proceedings.mlr.press/v235/shen24d.html", "pdf": "https://openreview.net/pdf?id=WsawczEqO6", "openreview": "https://openreview.net/forum?id=WsawczEqO6", "author_site": "Lingfeng Shen, Aayush Mishra, Daniel Khashabi", "tldr": "", "abstract": "The emergence of In-Context Learning (ICL) in LLMs remains a remarkable phenomenon that is partially understood. To explain ICL, recent studies have created theoretical connections to Gradient Descent (GD). We ask, do such connections hold up in actual pre-trained language models?
We highlight the limiting assumptions in prior works that make their setup considerably different from the practical setup in which language models are trained. For example, their experimental verification uses *ICL objective* (training models explicitly for ICL), which differs from the emergent ICL in the wild. Furthermore, the theoretical hand-constructed weights used in these studies have properties that don't match those of real LLMs. We also look for evidence in real models. We observe that ICL and GD have different sensitivity to the order in which they observe demonstrations. Finally, we probe and compare the ICL vs. GD hypothesis in a natural setting. We conduct comprehensive empirical analyses on language models pre-trained on natural data (LLaMa-7B). Our comparisons of three performance metrics highlight the inconsistent behavior of ICL and GD as a function of various factors such as datasets, models, and the number of demonstrations. We observe that ICL and GD modify the output distribution of language models differently. These results indicate that *the equivalence between ICL and GD remains an open hypothesis* and calls for further studies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lingfeng Shen;Aayush Mishra;Daniel Khashabi", "authorids": "~Lingfeng_Shen1;~Aayush_Mishra1;~Daniel_Khashabi2", "gender": "M;M;M", "homepage": ";https://aamixsh.github.io;http://danielkhashabi.com/", "dblp": "240/5490.html;263/3200;71/10515", "google_scholar": "PoSTdLAAAAAJ;https://scholar.google.com/citations?hl=en;pK2kQvgAAAAJ", "orcid": ";;", "linkedin": ";aamixsh/;", "or_profile": "~Lingfeng_Shen1;~Aayush_Mishra1;~Daniel_Khashabi2", "aff": "Johns Hopkins University;Adobe Systems;Johns Hopkins University", "aff_domain": "jh.edu;adobe.com;jhu.edu", "position": "MS student;Intern;Assistant Professor", "bibtex": "@inproceedings{\nshen2024position,\ntitle={Position: Do pretrained Transformers Learn In-Context by Gradient Descent?},\nauthor={Lingfeng Shen and Aayush Mishra and Daniel Khashabi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WsawczEqO6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2839937, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13290472683521802124&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "jh.edu;adobe.com;jhu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Johns Hopkins University;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": "https://www.jhu.edu;https://www.adobe.com", "aff_unique_abbr": "JHU;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Exploring the LLM Journey from Cognition to Expression with Linear Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33844", "id": "WtvI3QijEF", "proceeding": "https://proceedings.mlr.press/v235/yan24c.html", "pdf": "https://openreview.net/pdf?id=WtvI3QijEF", "openreview": "https://openreview.net/forum?id=WtvI3QijEF", "author_site": "Yuzi Yan, Jialian Li, YipinZhang, Dong Yan", "tldr": "", "abstract": "This paper presents an in-depth examination of the evolution and interplay of cognitive and expressive capabilities in large language models (LLMs), with a specific focus on Baichuan-7B 
and Baichuan-33B, an advanced bilingual (Chinese and English) LLM series. We define and explore the model's cognitive and expressive capabilities through linear representations across three critical phases: Pretraining, Supervised Fine-Tuning (SFT), and Reinforcement Learning from Human Feedback (RLHF). Cognitive capability is defined as the quantity and quality of information conveyed by the neuron output vectors within the network, similar to the neural signal processing in human cognition. Expressive capability is defined as the model\u2019s capability to produce word-level output. Our findings unveil a sequential development pattern, where cognitive abilities are largely established during Pretraining, whereas expressive abilities predominantly advance during SFT and RLHF. Statistical analyses confirm a significant correlation between the two capabilities, suggesting that cognitive capacity may limit expressive potential. The paper also explores the theoretical underpinnings of these divergent developmental trajectories and their connection to the LLMs' architectural design. Moreover, we evaluate various optimization-independent strategies, such as few-shot learning and repeated sampling, which bridge the gap between cognitive and expressive capabilities. This research reveals the potential connection between the hidden space and the output space, contributing valuable insights into the interpretability and controllability of their training processes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuzi Yan;Jialian Li;Yipin Zhang;Dong Yan", "authorids": "~Yuzi_Yan1;~Jialian_Li2;zypzyp665@gmail.com;~Dong_Yan1", "gender": "M;F;;M", "homepage": ";;;", "dblp": ";182/2475;;20/7834", "google_scholar": "FBoRIz8AAAAJ;;;lvztRUkAAAAJ", "orcid": ";;;0000-0003-0641-8988", "linkedin": ";;;", "or_profile": "~Yuzi_Yan1;~Jialian_Li2;zypzyp665@gmail.com;~Dong_Yan1", "aff": "Tsinghua University;Tsinghua University;;Baichuan Intelligent Technology", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;;baichuan-ai.com", "position": "PhD student;PhD student;;Researcher", "bibtex": "@inproceedings{\nyan2024exploring,\ntitle={Exploring the {LLM} Journey from Cognition to Expression with Linear Representations},\nauthor={Yuzi Yan and Jialian Li and Yipin Zhang and Dong Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WtvI3QijEF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3159895, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4899057268355722446&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;tsinghua.edu.cn;;baichuan-ai.com", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Tsinghua University;Baichuan Intelligent Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;", "aff_unique_abbr": "THU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Leveraging (Biased) Information: Multi-armed Bandits with Offline Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33843", "id": "WvIHbQhrTq", "proceeding": "https://proceedings.mlr.press/v235/cheung24a.html", "pdf": "https://openreview.net/pdf?id=WvIHbQhrTq", "openreview": "https://openreview.net/forum?id=WvIHbQhrTq", "author_site": 
"Wang Chi Cheung, Lixing Lyu", "tldr": "", "abstract": "We leverage offline data to facilitate online learning in stochastic multi-armed bandits. The probability distributions that govern the offline data and the online rewards can be different. Without any non-trival upper bound on their difference, we show that no non-anticipatory policy can out-perform the UCB policy by (Auer et al. 2002), even in the presence of offline data. In complement, we propose an online policy MIN-UCB, which outperforms UCB when a non-trivial upper bound is given. MIN-UCB adaptively chooses to utilize the offline data when they are deemed informative, and to ignore them otherwise. MIN-UCB is shown to be tight in terms of both instance indepedent and dependent regret bounds. Finally, we corroborate the theoretical results with numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wang Chi Cheung;Lixing Lyu", "authorids": "~Wang_Chi_Cheung1;~Lixing_Lyu1", "gender": ";M", "homepage": ";https://lyulixing.github.io", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Wang_Chi_Cheung1;~Lixing_Lyu1", "aff": ";National University of Singapore", "aff_domain": ";u.nus.edu", "position": ";PhD student", "bibtex": "@inproceedings{\ncheung2024leveraging,\ntitle={Leveraging (Biased) Information: Multi-armed Bandits with Offline Data},\nauthor={Wang Chi Cheung and Lixing Lyu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WvIHbQhrTq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 590739, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4571267383814388983&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";u.nus.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_country_unique_index": "0", "aff_country_unique": "Singapore" }, { "title": "MoMo: Momentum Models for Adaptive Learning Rates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33842", "id": "WvvkbWD1vL", "proceeding": "https://proceedings.mlr.press/v235/schaipp24a.html", "pdf": "https://openreview.net/pdf?id=WvvkbWD1vL", "openreview": "https://openreview.net/forum?id=WvvkbWD1vL", "author_site": "Fabian Schaipp, Ruben Ohana, Michael Eickenberg, Aaron Defazio, Robert Gower", "tldr": "", "abstract": "Training a modern machine learning architecture on a new task requires extensive learning-rate tuning, which comes at a high computational cost. Here we develop new Polyak-type adaptive learning rates that can be used on top of any momentum method, and require less tuning to perform well. We first develop MoMo, a **Mo**mentum **Mo**del based adaptive learning rate for SGD-M (stochastic gradient descent with momentum). MoMo uses momentum estimates of the batch losses and gradients sampled at each iteration to build a model of the loss function. Our model also makes use of any known lower bound of the loss function by using truncation, e.g. most losses are lower-bounded by zero. The models is then approximately minimized at each iteration to compute the next step. 
We show how MoMo can be used in combination with any momentum-based method, and showcase this by developing MoMo-Adam - which is Adam with our new model-based adaptive learning rate. We show that MoMo attains a $\\mathcal{O}(1/\\sqrt{K})$ convergence rate for convex problems with interpolation, needing knowledge of no problem-specific quantities other than the optimal value. Additionally, for losses with unknown lower bounds, we develop on-the-fly estimates of a lower bound, that are incorporated in our model. We demonstrate that MoMo and MoMo-Adam improve over SGD-M and Adam in terms of robustness to hyperparameter tuning for training image classifiers on MNIST, CIFAR, and Imagenet, for recommender systems on the Criteo dataset, for a transformer model on the translation task IWSLT14, and for a diffusion model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fabian Schaipp;Ruben Ohana;Michael Eickenberg;Aaron Defazio;Robert M. Gower", "authorids": "~Fabian_Schaipp1;~Ruben_Ohana1;~Michael_Eickenberg5;~Aaron_Defazio1;~Robert_M._Gower1", "gender": ";;M;M;M", "homepage": ";https://rubenohana.github.io/;http://eickenberg.github.io;https://www.aarondefazio.com/;https://gowerrobert.github.io/", "dblp": "276/2086;251/5608;117/7268;116/2969;143/0056", "google_scholar": ";https://scholar.google.fr/citations?user=F9qNg2wAAAAJ;GW0werQAAAAJ;KEzJsdkAAAAJ;okKw87MAAAAJ", "orcid": "0000-0002-0673-9944;0000-0002-8493-1210;;;", "linkedin": ";rubenohana/;;;", "or_profile": "~Fabian_Schaipp1;~Ruben_Ohana1;~Michael_Eickenberg5;~Aaron_Defazio1;~Robert_M._Gower1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Flatiron Institute;Flatiron Institute;Meta;Flatiron Institute", "aff_domain": "tum.de;flatironinstitute.org;flatironinstitute.org;meta.com;simonsfoundation.org", "position": "PhD student;Postdoc;Researcher;Research Scientist;Researcher", "bibtex": "@inproceedings{\nschaipp2024momo,\ntitle={MoMo: Momentum Models for Adaptive Learning Rates},\nauthor={Fabian Schaipp and Ruben Ohana and Michael Eickenberg and Aaron Defazio and Robert M. 
Gower},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WvvkbWD1vL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3975863, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4379861052436102614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tum.de;flatironinstitute.org;flatironinstitute.org;meta.com;simonsfoundation.org", "author_num": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Flatiron Institute;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.tum.de;https://flatironinstitute.org;https://meta.com", "aff_unique_abbr": "TUM;Flatiron;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Germany;United States" }, { "title": "Best Arm Identification for Stochastic Rising Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33841", "id": "WwLtwPHmSM", "proceeding": "https://proceedings.mlr.press/v235/mussi24b.html", "pdf": "https://openreview.net/pdf?id=WwLtwPHmSM", "openreview": "https://openreview.net/forum?id=WwLtwPHmSM", "author_site": "Marco Mussi, Alessandro Montenegro, Francesco Trov\u00f2, Marcello Restelli, Alberto Maria Metelli", "tldr": "", "abstract": "Stochastic Rising Bandits (SRBs) model sequential decision-making problems in which the expected reward of the available options increases every time they are selected. This setting captures a wide range of scenarios in which the available options are learning entities whose performance improves (in expectation) over time (e.g., online best model selection). While previous works addressed the regret minimization problem, this paper focuses on the fixed-budget Best Arm Identification (BAI) problem for SRBs. In this scenario, given a fixed budget of rounds, we are asked to provide a recommendation about the best option at the end of the identification process. We propose two algorithms to tackle the above-mentioned setting, namely R-UCBE, which resorts to a UCB-like approach, and R-SR, which employs a successive reject procedure. Then, we prove that, with a sufficiently large budget, they provide guarantees on the probability of properly identifying the optimal option at the end of the learning process and on the simple regret. Furthermore, we derive a lower bound on the error probability, matched by our R-SR (up to constants), and illustrate how the need for a sufficiently large budget is unavoidable in the SRB setting. 
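For the fixed-budget identification problem in the Stochastic Rising Bandits abstract above, the sketch below runs a classical successive-rejects skeleton in the spirit of R-SR: arms are pulled in phases and the worst-looking arm is rejected after each phase until one recommendation remains. The phase schedule is the standard successive-rejects one, and plain empirical means stand in for the paper's rising-reward estimators, so this is an illustrative simplification rather than the paper's algorithm.

```python
# Fixed-budget successive rejects on arms whose expected reward rises with
# the number of pulls (saturating curves); illustrative toy, not R-SR itself.
import math
import random

random.seed(0)
K, budget = 4, 2000
caps = [0.5, 0.6, 0.7, 0.9]                    # asymptotic means of the arms

def pull(arm, n_pulls):
    mean = caps[arm] * (1 - math.exp(-0.01 * (n_pulls + 1)))
    return mean + random.gauss(0, 0.1)

log_bar = 0.5 + sum(1.0 / i for i in range(2, K + 1))
active = list(range(K))
counts = [0] * K
sums = [0.0] * K
n_prev = 0
for phase in range(1, K):                      # K - 1 rejection phases
    n_phase = math.ceil((budget - K) / (log_bar * (K + 1 - phase)))
    for arm in active:
        for _ in range(n_phase - n_prev):      # top up each surviving arm
            sums[arm] += pull(arm, counts[arm])
            counts[arm] += 1
    n_prev = n_phase
    worst = min(active, key=lambda a: sums[a] / counts[a])
    active.remove(worst)                       # reject the worst-looking arm

print("recommended arm:", active[0])
```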
Finally, we numerically validate the proposed algorithms in both synthetic and realistic environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marco Mussi;Alessandro Montenegro;Francesco Trov\u00f2;Marcello Restelli;Alberto Maria Metelli", "authorids": "~Marco_Mussi1;~Alessandro_Montenegro1;~Francesco_Trov\u00f21;~Marcello_Restelli1;~Alberto_Maria_Metelli2", "gender": "M;M;M;M;M", "homepage": "https://marcomussi.github.io/;;https://trovo.faculty.polimi.it/;http://home.deib.polimi.it/restelli/;https://albertometelli.github.io/", "dblp": "321/0756;;69/11487;64/1011;209/4941", "google_scholar": "3gca-JUAAAAJ;CugD-ogAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ;R31IsPwAAAAJ", "orcid": "0000-0001-8356-6744;;0000-0001-5796-7667;0000-0002-6322-1076;0000-0002-3424-5212", "linkedin": "marcomussi95/;alessandro-montenegro-3266291b7/;;;", "or_profile": "~Marco_Mussi1;~Alessandro_Montenegro1;~Francesco_Trov\u00f21;~Marcello_Restelli1;~Alberto_Maria_Metelli2", "aff": "Politecnico di Milano;Politecnico di Milano;Politecnico di Milano;Politecnico di Milano;Politecnico di Milano", "aff_domain": "polimi.it;polimi.it;polimi.it;polimi.it;polimi.it", "position": "PhD student;PhD student;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nmussi2024best,\ntitle={Best Arm Identification for Stochastic Rising Bandits},\nauthor={Marco Mussi and Alessandro Montenegro and Francesco Trov{\\`o} and Marcello Restelli and Alberto Maria Metelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WwLtwPHmSM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1387869, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17019165035024854401&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 11, "email": "polimi.it;polimi.it;polimi.it;polimi.it;polimi.it", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Online Cascade Learning for Efficient Inference over Streams", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33840", "id": "Wz4lgc8dsN", "proceeding": "https://proceedings.mlr.press/v235/nie24a.html", "pdf": "https://openreview.net/pdf?id=Wz4lgc8dsN", "openreview": "https://openreview.net/forum?id=Wz4lgc8dsN", "author_site": "Lunyiu Nie, Zhimin Ding, Erdong Hu, Christopher Jermaine, Swarat Chaudhuri", "tldr": "", "abstract": "Large Language Models (LLMs) have a natural role in answering complex queries about data streams, but the high computational cost of LLM inference makes them infeasible in many such tasks. We propose *online cascade learning*, the first approach to address this challenge. The objective here is to learn a ``cascade'' of models, starting with lower-capacity models (such as logistic regression) and ending with a powerful LLM, along with a *deferral policy* that determines the model to be used on a given input. 
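The deferral idea in the online-cascade abstract above can be pictured with a toy two-model cascade (all names, thresholds, and the oracle are our assumptions, not the released code): a cheap logistic-regression front end answers when confident, defers low-confidence inputs to an expensive "LLM" oracle, and is updated online to imitate the oracle's labels, so the deferral rate falls as the stream progresses.

```python
# Toy online cascade with a confidence-based deferral policy; illustrative.
import numpy as np

rng = np.random.default_rng(0)
w = np.zeros(3)                                  # cheap model: logistic regression

def cheap_predict(x):
    p = 1.0 / (1.0 + np.exp(-(w @ x)))
    return p > 0.5, max(p, 1 - p)                # label, confidence

def llm_oracle(x):                               # expensive-model stand-in
    return (x @ np.array([1.0, -2.0, 0.5])) > 0  # hidden "true" concept

deferred = 0
for t in range(500):                             # stream of inputs
    x = rng.normal(size=3)
    label, conf = cheap_predict(x)
    if conf < 0.9:                               # deferral policy: low confidence
        deferred += 1
        label = llm_oracle(x)                    # pay for the oracle call...
        grad = (1.0 / (1.0 + np.exp(-(w @ x))) - float(label)) * x
        w -= 0.1 * grad                          # ...and imitate its output
print(f"deferral rate: {deferred / 500:.2f}")
```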
We formulate the task of learning cascades online as an imitation-learning problem, where smaller models are updated over time imitating the collected LLM demonstrations, and give a no-regret algorithm for the problem. Experimental results across four benchmarks show that our method parallels LLMs in accuracy while cutting down inference costs by as much as 90% with strong robustness against input distribution shifts, underscoring its efficacy and adaptability in stream processing. Our source code is available at https://github.com/flitternie/online_cascade_learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lunyiu Nie;Zhimin Ding;Erdong Hu;Christopher Jermaine;Swarat Chaudhuri", "authorids": "~Lunyiu_Nie1;~Zhimin_Ding1;~Erdong_Hu1;~Christopher_Jermaine1;~Swarat_Chaudhuri1", "gender": "M;M;M;M;M", "homepage": "https://flitternie.github.io/;;;https://www.cs.rice.edu/~cmj4/;http://www.cs.utexas.edu/~swarat", "dblp": "320/5241;;;j/ChrisJermaine.html;37/6100", "google_scholar": "IwXgGTsAAAAJ;MZWEfNgAAAAJ;;D2P2B0MAAAAJ;9j6RBYQAAAAJ", "orcid": ";;0000-0002-1979-0386;;0000-0002-6859-1391", "linkedin": ";;;;swarat-chaudhuri-609b3092/", "or_profile": "~Lunyiu_Nie1;~Zhimin_Ding1;~Erdong_Hu1;~Christopher_Jermaine1;~Swarat_Chaudhuri1", "aff": "University of Texas at Austin;Rice University;Rice University ;Rice University;University of Texas at Austin", "aff_domain": "utexas.edu;rice.edu;rice.edu;rice.edu;utexas.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nnie2024online,\ntitle={Online Cascade Learning for Efficient Inference over Streams},\nauthor={Lunyiu Nie and Zhimin Ding and Erdong Hu and Christopher Jermaine and Swarat Chaudhuri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Wz4lgc8dsN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2090227, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9826386894881104340&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 9, "email": "utexas.edu;rice.edu;rice.edu;rice.edu;utexas.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "University of Texas at Austin;Rice University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.rice.edu", "aff_unique_abbr": "UT Austin;Rice", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Finite Volume Features, Global Geometry Representations, and Residual Training for Deep Learning-based CFD Simulation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33839", "id": "WzD4a5ufN8", "proceeding": "https://proceedings.mlr.press/v235/jessica24a.html", "pdf": "https://openreview.net/pdf?id=WzD4a5ufN8", "openreview": "https://openreview.net/forum?id=WzD4a5ufN8", "author_site": "Loh S.E. Jessica, Naheed Anjum Arafat, Wei Xian Lim, Wai Lee Chan, Adams Wai Kin Kong", "tldr": "", "abstract": "Computational fluid dynamics (CFD) simulation is an irreplaceable modelling step in many engineering designs, but it is often computationally expensive. Some graph neural network (GNN)-based CFD methods have been proposed. 
However, the current methods inherit the weakness of traditional numerical simulators, as well as ignore the cell characteristics in the mesh used in the finite volume method, a common method in practical CFD applications. Specifically, the input nodes in these GNN methods have very limited information about any object immersed in the simulation domain and its surrounding environment. Also, the cell characteristics of the mesh such as cell volume, face surface area, and face centroid are not included in the message-passing operations in the GNN methods. To address these weaknesses, this work proposes two novel geometric representations: Shortest Vector (SV) and Directional Integrated Distance (DID). Extracted from the mesh, the SV and DID provide global geometry perspective to each input node, thus removing the need to collect this information through message-passing. This work also introduces the use of Finite Volume Features (FVF) in the graph convolutions as node and edge attributes, enabling its message-passing operations to adjust to different nodes. Finally, this work is the first to demonstrate how residual training, with the availability of low-resolution data, can be adopted to improve the flow field prediction accuracy. Experimental results on two datasets with five different state-of-the-art GNN methods for CFD indicate that SV, DID, FVF and residual training can effectively reduce the predictive error of current GNN-based methods by as much as 41%. Our codes and datasets are available at https://github.com/toggled/FvFGeo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Loh Sher En Jessica;Naheed Anjum Arafat;Wei Xian Lim;Wai Lee Chan;Adams Wai-Kin Kong", "authorids": "~Loh_Sher_En_Jessica1;~Naheed_Anjum_Arafat1;~Wei_Xian_Lim2;~Wai_Lee_Chan1;~Adams_Wai-Kin_Kong1", "gender": "M;M;M;Not Specified;M", "homepage": "https://toggled.github.io/naheed/;https://dr.ntu.edu.sg/cris/rp/rp00198;https://personal.ntu.edu.sg/AdamsKong/;;", "dblp": "204/2502;;16/3792;;", "google_scholar": "qWrpqBEAAAAJ;REl0cgwAAAAJ;2GfXvbUAAAAJ;;", "orcid": ";0000-0002-3692-7604;;0000-0002-0223-6768;0000-0001-6676-3447", "linkedin": ";;;jessica-loh-a54a5157/;", "or_profile": "~Naheed_Anjum_Arafat1;~Wai_Lee_Chan1;~Adams_Wai-Kin_Kong1;~Jessica_Sher_En_Loh1;~WEI_XIAN_LIM1", "aff": "Nanyang Technological University;Nanyang Technological University, Singapore;Nanyang Technological University;Nanyang Technological University;Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "position": "Postdoc;Assistant Professor;Associate Professor;PhD student;Postdoc", "bibtex": "@inproceedings{\njessica2024finite,\ntitle={Finite Volume Features, Global Geometry Representations, and Residual Training for Deep Learning-based {CFD} Simulation},\nauthor={Loh Sher En Jessica and Naheed Anjum Arafat and Wei Xian Lim and Wai Lee Chan and Adams Wai-Kin Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WzD4a5ufN8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2360037, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17253511673713261892&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Nanyang Technological University", 
"aff_unique_dep": "", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "DPOT: Auto-Regressive Denoising Operator Transformer for Large-Scale PDE Pre-Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33838", "id": "X7UnDevHOM", "proceeding": "https://proceedings.mlr.press/v235/hao24d.html", "pdf": "https://openreview.net/pdf?id=X7UnDevHOM", "openreview": "https://openreview.net/forum?id=X7UnDevHOM", "author_site": "Zhongkai Hao, Chang Su, LIU SONGMING, Julius Berner, Chengyang Ying, Hang Su, Anima Anandkumar, Jian Song, Jun Zhu", "tldr": "", "abstract": "Pre-training has been investigated to improve the efficiency and performance of training neural operators in data-scarce settings. However, it is largely in its infancy due to the inherent complexity and diversity, such as long trajectories, multiple scales and varying dimensions of partial differential equations (PDEs) data. In this paper, we present a new auto-regressive denoising pre-training strategy, which allows for more stable and efficient pre-training on PDE data and generalizes to various downstream tasks. Moreover, by designing a flexible and scalable model architecture based on Fourier attention, we can easily scale up the model for large-scale pre-training. We train our PDE foundation model with up to 0.5B parameters on 10+ PDE datasets with more than 100k trajectories. Extensive experiments show that we achieve SOTA on these benchmarks and validate the strong generalizability of our model to significantly enhance performance on diverse downstream PDE tasks like 3D data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhongkai Hao;Chang Su;Songming Liu;Julius Berner;Chengyang Ying;Hang Su;Anima Anandkumar;Jian Song;Jun Zhu", "authorids": "~Zhongkai_Hao1;~Chang_Su7;~Songming_Liu1;~Julius_Berner1;~Chengyang_Ying1;~Hang_Su3;~Anima_Anandkumar1;~Jian_Song3;~Jun_Zhu2", "gender": "M;M;M;M;M;M;F;M;M", "homepage": "https://github.com/EdwardIX;;https://jberner.info/;https://yingchengyang.github.io/;https://www.sigs.tsinghua.edu.cn/sj_6973/main.htm;http://ml.cs.tsinghua.edu.cn/~jun;http://tensorlab.cms.caltech.edu/users/anima/;;https://haozhongkai.github.io/", "dblp": ";285/4585;227/2217;296/2065;;50/2644-1;;26/5371-6;270/0220.html", "google_scholar": ";6urFg8kAAAAJ;73-D2jgAAAAJ;vM6KE18AAAAJ;;axsP38wAAAAJ;bEcLezcAAAAJ;dxN1_X0AAAAJ;dfSzq27ZiVoC", "orcid": ";;0000-0002-5648-648X;;;;;;", "linkedin": ";%E6%9D%BE%E9%93%AD-%E5%88%98-7b8339254/;julius-berner/;%E9%93%96%E9%98%B3-%E5%BA%94-9b682a203/;;;anima-anandkumar-35171b1/;;", "or_profile": "~Chang_Su7;~Songming_Liu1;~Julius_Berner1;~Chengyang_Ying1;~Jian_Song3;~Jun_Zhu2;~anima_anandkumar1;~Hang_Su2;~Hao_Zhongkai1", "aff": "Tsinghua University;Tsinghua University;California Institute of Technology;Tsinghua University;Tsinghua University;Tsinghua University;California Institute of Technology;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mails.tsinghua.edu.cn;caltech.edu;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;caltech.edu;tsinghua.edu.cn;mails.tsinghua.edu.cn", "position": "Undergrad student;PhD student;Postdoc;PhD student;Full Professor;Professor;Full Professor;Associate Professor;PhD student", "bibtex": "@inproceedings{\nhao2024dpot,\ntitle={{DPOT}: Auto-Regressive Denoising Operator Transformer for Large-Scale {PDE} 
Pre-Training},\nauthor={Zhongkai Hao and Chang Su and Songming Liu and Julius Berner and Chengyang Ying and Hang Su and Anima Anandkumar and Jian Song and Jun Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=X7UnDevHOM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1162905, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7545027446523951020&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;mails.tsinghua.edu.cn;caltech.edu;tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;caltech.edu;tsinghua.edu.cn;mails.tsinghua.edu.cn", "author_num": 9, "aff_unique_index": "0;0;1;0;0;0;1;0;0", "aff_unique_norm": "Tsinghua University;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.caltech.edu", "aff_unique_abbr": "THU;Caltech", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;1;0;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Sparse-IFT: Sparse Iso-FLOP Transformations for Maximizing Training Efficiency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33837", "id": "X8Ha2NiQcy", "proceeding": "https://proceedings.mlr.press/v235/thangarasa24a.html", "pdf": "https://openreview.net/pdf?id=X8Ha2NiQcy", "openreview": "https://openreview.net/forum?id=X8Ha2NiQcy", "author_site": "Vithursan Thangarasa, Shreyas Saxena, Abhay Gupta, Sean Lie", "tldr": "", "abstract": "Recent research has focused on weight sparsity in deep neural network training to reduce FLOPs, aiming for improved efficiency (test accuracy w.r.t training FLOPs). However, sparse weight training often compromises accuracy, requiring extended training schedules to attain the accuracy of dense models. In contrast, our approach, Sparse Iso-FLOP Transformations (Sparse-IFT), uses sparsity to improve accuracy while maintaining dense model FLOPs. Using a single hyperparameter (i.e., the sparsity level), Sparse-IFTs efficiently replace dense layers, expanding the search space for optimal sparse masks. In addition, dynamic sparse training (DST) with Sparse-IFT models effectively navigate this larger sparse mask-weight space, which is evidenced by a spectral analysis using Ramanujan graph properties. Our study reveals a robust correlation among mask topology, weights, and final performance. Notably, without adjusting any training hyperparameters, replacing dense layers with Sparse-IFT yields significant improvements, such as a +3.5% boost for ResNet-18 on ImageNet and +0.9% for GPT-3 Small on the Open LLM leaderboard. To the best of our knowledge, this is the first work to demonstrate the use of sparsity for improving the accuracy of dense models through a set of simple-to-use sparse transformations. 
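One way to picture the iso-FLOP transformation described in the Sparse-IFT abstract above is the torch sketch below (a simplified, hypothetical instantiation, not the released code): a dense layer is widened by a factor of 1/(1 - sparsity) and masked, so the count of active weights roughly matches the dense layer while the mask search space grows. The mask here is random and static and a projection maps back to the dense output width, whereas the paper evolves masks with dynamic sparse training; the elementwise mask also only models sparsity logically rather than saving wall-clock FLOPs.

```python
# Simplified Sparse-IFT-style layer: iso-parameter wide sparse replacement
# for a dense nn.Linear. Illustrative assumptions throughout.
import torch
import torch.nn as nn

class SparseWideLinear(nn.Module):
    def __init__(self, d_in: int, d_out: int, sparsity: float):
        super().__init__()
        width = int(d_out / (1.0 - sparsity))     # iso-FLOP widening factor
        self.lin = nn.Linear(d_in, width)
        mask = (torch.rand(width, d_in) > sparsity).float()
        self.register_buffer("mask", mask)        # static mask; DST would evolve it
        self.proj = nn.Linear(width, d_out)       # map back to the dense shape

    def forward(self, x):
        hidden = torch.nn.functional.linear(x, self.lin.weight * self.mask,
                                            self.lin.bias)
        return self.proj(hidden)

dense = nn.Linear(64, 64)
sparse_ift = SparseWideLinear(64, 64, sparsity=0.75)
x = torch.randn(8, 64)
print(dense(x).shape, sparse_ift(x).shape)        # both: torch.Size([8, 64])
```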
Code is available at: https://github.com/CerebrasResearch/Sparse-IFT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vithursan Thangarasa;Shreyas Saxena;Abhay Gupta;Sean Lie", "authorids": "~Vithursan_Thangarasa1;~Shreyas_Saxena1;~Abhay_Gupta1;~Sean_Lie1", "gender": "M;M;M;Not Specified", "homepage": "https://vithursant.com;;;", "dblp": "223/9965;96/10387;;", "google_scholar": "UUKxm4gAAAAJ;ePbtJPEAAAAJ;Pae7GxYAAAAJ;", "orcid": ";;;", "linkedin": "vithursant/;;gupta-abhay;sean-lie-4a80097/", "or_profile": "~Vithursan_Thangarasa1;~Shreyas_Saxena1;~Abhay_Gupta1;~Sean_Lie1", "aff": "Cerebras Systems, Inc;Cerebras Systems, Inc;Cerebras Systems, Inc;Cerebras Systems, Inc", "aff_domain": "cerebras.net;cerebras.net;cerebras.net;cerebras.net", "position": "Researcher;Principal Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nthangarasa2024sparseift,\ntitle={Sparse-{IFT}: Sparse Iso-{FLOP} Transformations for Maximizing Training Efficiency},\nauthor={Vithursan Thangarasa and Shreyas Saxena and Abhay Gupta and Sean Lie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=X8Ha2NiQcy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 846373, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5797139692757100030&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "cerebras.net;cerebras.net;cerebras.net;cerebras.net", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Cerebras Systems", "aff_unique_dep": "", "aff_unique_url": "https://www.cerebras.com", "aff_unique_abbr": "Cerebras", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "OT-CLIP: Understanding and Generalizing CLIP via Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33836", "id": "X8uQ1TslUc", "proceeding": "https://proceedings.mlr.press/v235/shi24b.html", "pdf": "https://openreview.net/pdf?id=X8uQ1TslUc", "openreview": "https://openreview.net/forum?id=X8uQ1TslUc", "author_site": "Liangliang Shi, Jack Fan, Junchi Yan", "tldr": "", "abstract": "We propose to understand Contrastive Language-Image Pretraining model (CLIP) from the Optimal Transport (OT) perspective. Specifically, we show that training of CLIP is an embodiment of inverse OT and the adopted two InfoNCE losses in CLIP correspond to a special case of bilevel optimization of modified entropic OT. We then generalize the original CLIP loss to an OT-based loss family using variants of Regularized OT (e.g. Fused Gromov OT, unbalanced OT, etc.), and demonstrate their superior performance on public datasets for both image and text downstream tasks. We also rethink the inference stage of CLIP by using the tool of OT, and propose to adopt the fused Gromov OT for (zero-shot) classification, in which the prediction is based on the graph representation whereby images and texts are nodes for graph matching. By our new technique, we show how to generalize zero-shot classification to other more flexible zero-shot tasks with competitive performance: long-tailed classification and selective classification. 
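The entropic-OT view of CLIP in the abstract above can be reproduced in a few lines of numpy (an illustrative Sinkhorn computation under our own toy setup, not the paper's OT-CLIP loss): with uniform marginals, Sinkhorn iterations on the cosine-cost matrix between image and text embeddings yield a transport plan that, for matched pairs and a small regularizer, concentrates near the identity matching that InfoNCE encourages.

```python
# Sinkhorn iterations for entropic OT between paired embeddings; toy sketch.
import numpy as np

rng = np.random.default_rng(0)
n, d, eps = 4, 8, 0.1
img = rng.normal(size=(n, d))
img /= np.linalg.norm(img, axis=1, keepdims=True)
txt = img + 0.05 * rng.normal(size=(n, d))       # matched pairs, small noise
txt /= np.linalg.norm(txt, axis=1, keepdims=True)
C = 1.0 - img @ txt.T                            # cosine cost between pairs

K = np.exp(-C / eps)                             # Gibbs kernel
a = b = np.ones(n) / n                           # uniform marginals
u = np.ones(n) / n
for _ in range(200):                             # Sinkhorn fixed-point updates
    v = b / (K.T @ u)
    u = a / (K @ v)
P = u[:, None] * K * v[None, :]                  # entropic-OT transport plan

print(np.round(P * n, 2))                        # near-identity for matched pairs
print("row sums:", np.round(P.sum(1), 3), "col sums:", np.round(P.sum(0), 3))
```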
The former assumes the known prior distribution of labels, while in the latter case, only a subset of samples are asked to predict, yet with high prediction confidence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liangliang Shi;Jack Fan;Junchi Yan", "authorids": "~Liangliang_Shi1;~Jack_Fan2;~Junchi_Yan2", "gender": "M;M;M", "homepage": ";;http://thinklab.sjtu.edu.cn/", "dblp": "89/8730;;60/7949.html", "google_scholar": "Qf1k8lUAAAAJ;;ga230VoAAAAJ", "orcid": "0000-0001-7033-4207;0009-0003-2804-7687;0000-0001-9639-7679", "linkedin": ";fan-jack/;", "or_profile": "~Liangliang_Shi1;~Jack_Fan2;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;;sjtu.edu.cn", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nshi2024otclip,\ntitle={{OT}-{CLIP}: Understanding and Generalizing {CLIP} via Optimal Transport},\nauthor={Liangliang Shi and Jack Fan and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=X8uQ1TslUc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1930064, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5179281089376935426&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 5, "email": "sjtu.edu.cn;;sjtu.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Mixtures of Experts Unlock Parameter Scaling for Deep RL", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33835", "id": "X9VMhfFxwn", "proceeding": "https://proceedings.mlr.press/v235/obando-ceron24b.html", "pdf": "https://openreview.net/pdf?id=X9VMhfFxwn", "openreview": "https://openreview.net/forum?id=X9VMhfFxwn", "author_site": "Johan Obando Ceron, Ghada Sokar, Timon Willi, Clare Lyle, Jesse Farebrother, Jakob Foerster, Gintare Karolina Dziugaite, Doina Precup, Pablo Samuel Castro", "tldr": "", "abstract": "The recent rapid progress in (self) supervised learning models is in large part predicted by empirical scaling laws: a model's performance scales proportionally to its size. Analogous scaling laws remain elusive for reinforcement learning domains, however, where increasing the parameter count of a model often hurts its final performance. In this paper, we demonstrate that incorporating Mixture-of-Expert (MoE) modules, and in particular Soft MoEs (Puigcerver et al., 2023), into value-based networks results in more parameter-scalable models, evidenced by substantial performance increases across a variety of training regimes and model sizes. 
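The Soft MoE module cited in the abstract above (Puigcerver et al., 2023) can be sketched compactly; the layer below is our simplified reading for a value network, not the paper's code: tokens are softly dispatched into per-expert slots, each expert processes its slots, and the outputs are softly combined back per token, so the whole layer stays dense and differentiable.

```python
# Compact Soft-MoE-style layer; simplified illustrative reading.
import torch
import torch.nn as nn

class SoftMoE(nn.Module):
    def __init__(self, dim: int, n_experts: int, n_slots: int):
        super().__init__()
        self.phi = nn.Parameter(torch.randn(dim, n_experts * n_slots) * 0.02)
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))
             for _ in range(n_experts)])
        self.n_experts, self.n_slots = n_experts, n_slots

    def forward(self, x):                    # x: (batch, tokens, dim)
        logits = x @ self.phi                # (batch, tokens, experts*slots)
        dispatch = logits.softmax(dim=1)     # normalize over tokens per slot
        combine = logits.softmax(dim=2)      # normalize over slots per token
        slots = torch.einsum("btd,bts->bsd", x, dispatch)
        slots = slots.view(x.size(0), self.n_experts, self.n_slots, -1)
        outs = torch.stack([e(slots[:, i])
                            for i, e in enumerate(self.experts)], dim=1)
        outs = outs.flatten(1, 2)            # (batch, experts*slots, dim)
        return torch.einsum("bsd,bts->btd", outs, combine)

layer = SoftMoE(dim=32, n_experts=4, n_slots=2)
print(layer(torch.randn(5, 10, 32)).shape)   # torch.Size([5, 10, 32])
```

In a value-based agent, a layer like this would replace the penultimate dense layer of the Q-network, which is where the abstract reports the parameter-scaling benefit.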
This work thus provides strong empirical evidence towards developing scaling laws for reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johan Samir Obando Ceron;Ghada Sokar;Timon Willi;Clare Lyle;Jesse Farebrother;Jakob Nicolaus Foerster;Gintare Karolina Dziugaite;Doina Precup;Pablo Samuel Castro", "authorids": "~Johan_Samir_Obando_Ceron1;~Ghada_Sokar1;~Timon_Willi1;~Clare_Lyle1;~Jesse_Farebrother1;~Jakob_Nicolaus_Foerster1;~Gintare_Karolina_Dziugaite1;~Doina_Precup1;~Pablo_Samuel_Castro1", "gender": "M;;;;M;M;F;F;M", "homepage": "https://johanobandoc.github.io;https://research.tue.nl/en/persons/ghada-sokar;https://www.timonwilli.com;;https://brosa.ca;https://www.jakobfoerster.com;http://gkdz.org/;http://cs.mcgill.ca/~dprecup/;https://psc-g.github.io/", "dblp": ";244/7833;243/3437;192/1910;228/6862;176/5095;163/1774;p/DoinaPrecup;05/5455", "google_scholar": "KViAb3EAAAAJ;https://scholar.google.nl/citations?user=0e6fdZsAAAAJ;Dn-udzAAAAAJ;;cA12XHcAAAAJ;6z4lQzMAAAAJ;5K1QB_8AAAAJ;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;https://scholar.google.ca/citations?user=jn5r6TsAAAAJ", "orcid": ";;0000-0003-4405-5700;;0000-0002-5178-4947;;;;", "linkedin": "johan-obando/;;;;jessefarebro/;;;;pablo-samuel-castro-2113641b/", "or_profile": "~Johan_Samir_Obando_Ceron1;~Ghada_Sokar1;~Timon_Willi1;~Clare_Lyle1;~Jesse_Farebrother1;~Jakob_Nicolaus_Foerster1;~Gintare_Karolina_Dziugaite1;~Doina_Precup1;~Pablo_Samuel_Castro1", "aff": "Mila - Quebec AI Institute, Universit\u00e9 de Montr\u00e9al;Google DeepMind;University of Oxford, University of Oxford;Google DeepMind;Google DeepMind;University of Oxford, University of Oxford;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;McGill University;Google", "aff_domain": "mila.umontreal.ca;google.com;eng.ox.ac.uk;google.com;google.com;eng.ox.ac.uk;mila.umontreal.ca;mcgill.ca;google.com", "position": "PhD student;Researcher;PhD student;Researcher;Student Researcher;Associate Professor;Member;Associate Professor;Researcher", "bibtex": "@inproceedings{\nceron2024mixtures,\ntitle={Mixtures of Experts Unlock Parameter Scaling for Deep {RL}},\nauthor={Johan Samir Obando Ceron and Ghada Sokar and Timon Willi and Clare Lyle and Jesse Farebrother and Jakob Nicolaus Foerster and Gintare Karolina Dziugaite and Doina Precup and Pablo Samuel Castro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=X9VMhfFxwn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6381965, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1177995408752571622&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mila.umontreal.ca;google.com;eng.ox.ac.uk;google.com;google.com;eng.ox.ac.uk;mila.umontreal.ca;mcgill.ca;google.com", "author_num": 9, "aff_unique_index": "0;1;2;1;1;2;3;4;1", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Google;University of Oxford;University of Montreal;McGill University", "aff_unique_dep": "Mila - Quebec AI Institute;Google DeepMind;;Montreal Institute for Learning Algorithms;", "aff_unique_url": "https://www.mila.quebec/;https://deepmind.com;https://www.ox.ac.uk;https://www.umontreal.ca;https://www.mcgill.ca", "aff_unique_abbr": "Mila;DeepMind;Oxford;UM;McGill", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "Montr\u00e9al;;Montreal;Mountain View", 
"aff_country_unique_index": "0;1;1;1;1;1;0;0;2", "aff_country_unique": "Canada;United Kingdom;United States" }, { "title": "Enhancing Implicit Shape Generators Using Topological Regularizations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33834", "id": "XBNhJQU84y", "proceeding": "https://proceedings.mlr.press/v235/chen24bk.html", "pdf": "https://openreview.net/pdf?id=XBNhJQU84y", "openreview": "https://openreview.net/forum?id=XBNhJQU84y", "author_site": "Liyan Chen, Yan Zheng, Yang Li, Lohit A. Jagarapu, Haoxiang Li, Hao Kang, Gang Hua, Qixing Huang", "tldr": "", "abstract": "A fundamental problem in learning 3D shapes generative models is that when the generative model is simply fitted to the training data, the resulting synthetic 3D models can present various artifacts. Many of these artifacts are topological in nature, e.g., broken legs, unrealistic thin structures, and small holes. In this paper, we introduce a principled approach that utilizes topological regularization losses on an implicit shape generator to rectify topological artifacts. The objectives are two-fold. The first is to align the persistent diagram (PD) distribution of the training shapes with that of synthetic shapes. The second ensures that the PDs are smooth among adjacent synthetic shapes. We show how to achieve these two objectives using two simple but effective formulations. Specifically, distribution alignment is achieved to learn a generative model of PDs and align this generator with PDs of synthetic shapes. We show how to handle discrete and continuous variabilities of PDs by using a shape-regularization term when performing PD alignment. Moreover, we enforce the smoothness of the PDs using a smoothness loss on the PD generator, which further improves the behavior of PD distribution alignment. 
Experimental results on ShapeNet show that our approach leads to much better generalization behavior than state-of-the-art implicit shape generators.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liyan Chen;Yan Zheng;Yang Li;Lohit Anirudh Jagarapu;Haoxiang Li;Hao Kang;Gang Hua;Qixing Huang", "authorids": "~Liyan_Chen1;~Yan_Zheng4;~Yang_Li4;~Lohit_Anirudh_Jagarapu1;~Haoxiang_Li1;~Hao_Kang1;~Gang_Hua3;~Qixing_Huang1", "gender": "M;F;F;Non-Binary;M;M;M;M", "homepage": "https://cs.utexas.edu/~liyanc;https://sites.google.com/view/yan-zheng-ut/home;http://yangli-feasibility.com;https://www.lohitjagarapu.com;https://resume.haoxiang.org;http://www.ganghua.org;https://www.cs.utexas.edu/~huangqx/;https://www.linkedin.com/in/haokang2017", "dblp": ";;37/4190-104;;;75/5209.html;82/241;", "google_scholar": "ppaEV-8AAAAJ;;_qMiOloAAAAJ;;Fu6aoXAAAAAJ;7SgUlggAAAAJ;https://scholar.google.com.tw/citations?user=pamL_rIAAAAJ;VeTCSyEAAAAJ", "orcid": "0009-0005-9517-6643;;;0009-0009-9683-5865;;0000-0001-9522-6157;;", "linkedin": ";;;;haoxiangli/;ganghua/;;", "or_profile": "~Liyan_Chen1;~Yan_Zheng4;~Yang_Li4;~Lohit_Anirudh_Jagarapu1;~Haoxiang_Li1;~Gang_Hua3;~Qixing_Huang1;~Hao_Kang2", "aff": "University of Texas, Austin;University of Texas at Austin;Tsinghua Shenzhen International Graduate School;University of Texas at Austin;Wormpex AI Research;Wormpex AI Research;University of Texas at Austin;Wormpex AI Research", "aff_domain": "cs.utexas.edu;cs.utexas.edu;sz.tsinghua.edu.cn;utexas.edu;wormpexai.com;bianlifeng.com;utexas.edu;wormpex.com", "position": "PhD student;PhD student;Associate Professor;Undergrad student;Principal Researcher;Chief Scientist and Managing Director;Associate Professor;Researcher", "bibtex": "@inproceedings{\nchen2024enhancing,\ntitle={Enhancing Implicit Shape Generators Using Topological Regularizations},\nauthor={Liyan Chen and Yan Zheng and Yang Li and Lohit Anirudh Jagarapu and Haoxiang Li and Hao Kang and Gang Hua and Qixing Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XBNhJQU84y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5258485, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3568948415438736852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.utexas.edu;cs.utexas.edu;sz.tsinghua.edu.cn;utexas.edu;wormpexai.com;bianlifeng.com;utexas.edu;wormpex.com", "author_num": 8, "aff_unique_index": "0;0;1;0;2;2;0;2", "aff_unique_norm": "University of Texas at Austin;Tsinghua University;Wormpex AI Research", "aff_unique_dep": ";International Graduate School;AI Research", "aff_unique_url": "https://www.utexas.edu;https://www.tsinghua.edu.cn;", "aff_unique_abbr": "UT Austin;THU;Wormpex AI", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Austin;Shenzhen;", "aff_country_unique_index": "0;0;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Position: Cracking the Code of Cascading Disparity Towards Marginalized Communities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33833", "id": "XDz9leJ9iK", "proceeding": "https://proceedings.mlr.press/v235/farnadi24a.html", "pdf": "https://openreview.net/pdf?id=XDz9leJ9iK", "openreview": "https://openreview.net/forum?id=XDz9leJ9iK", "author_site": "Golnoosh Farnadi, Mohammad Havaei, Negar Rostamzadeh", "tldr": "", "abstract": "The rise 
of foundation models holds immense promise for advancing AI, but this progress may amplify existing risks and inequalities, leaving marginalized communities behind. In this position paper, we discuss that disparities towards marginalized communities \u2013 performance, representation, privacy, robustness, interpretability and safety \u2013 are not isolated concerns but rather interconnected elements of a cascading disparity phenomenon. We contrast foundation models with traditional models and highlight the potential for exacerbated disparity against marginalized communities. Moreover, we emphasize the unique threat of cascading impacts in foundation models, where interconnected disparities can trigger long-lasting negative consequences, specifically to the people on the margin. We define marginalized communities within the machine learning context and explore the multifaceted nature of disparities. We analyze the sources of these disparities, tracing them from data creation, training and deployment procedures to highlight the complex technical and socio-technical landscape. To mitigate the pressing crisis, we conclude with a set of calls to action to mitigate disparity at its source.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Golnoosh Farnadi;Mohammad Havaei;Negar Rostamzadeh", "authorids": "~Golnoosh_Farnadi1;~Mohammad_Havaei2;~Negar_Rostamzadeh1", "gender": "F;F;M", "homepage": "http://www.cwi.ugent.be/people.php?userid=golnoosh;;https://havaeimo.github.io/about", "dblp": "148/1397;126/0982;117/9116.html", "google_scholar": "https://scholar.google.nl/citations?user=4Vjp6hwAAAAJ;https://scholar.google.ca/citations?user=t5ak3j0AAAAJ;https://scholar.google.ca/citations?user=LAoMyyoAAAAJ", "orcid": ";;", "linkedin": "gfarnadi/;;mohammad-havaei-52066415/", "or_profile": "~Golnoosh_Farnadi1;~Negar_Rostamzadeh1;~Mohammad_Havaei1", "aff": "McGill University;Google;Google", "aff_domain": "mcgill.ca;google.com;google.com", "position": "Assistant Professor;Research Scientist;Reseach scientist ", "bibtex": "@inproceedings{\nfarnadi2024position,\ntitle={Position: Cracking the Code of Cascading Disparity Towards Marginalized Communities},\nauthor={Golnoosh Farnadi and Mohammad Havaei and Negar Rostamzadeh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XDz9leJ9iK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 300524, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9712358188867322594&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mcgill.ca;google.com;google.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "McGill University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.mcgill.ca;https://www.google.com", "aff_unique_abbr": "McGill;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Enforcing Constraints in RNA Secondary Structure Predictions: A Post-Processing Framework Based on the Assignment Problem", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33832", "id": "XGGcnKelda", "proceeding": "https://proceedings.mlr.press/v235/suh24a.html", "pdf": "https://openreview.net/pdf?id=XGGcnKelda", "openreview": "https://openreview.net/forum?id=XGGcnKelda", 
"author_site": "Geewon Suh, Gyeongjo Hwang, SeokjunKang, Doojin Baek, Mingeun Kang", "tldr": "", "abstract": "RNA properties, such as function and stability, are intricately tied to their two-dimensional conformations. This has spurred the development of computational models for predicting the RNA secondary structures, leveraging dynamic programming or machine learning (ML) techniques. These structures are governed by specific rules; for example, only Watson-Crick and Wobble pairs are allowed, and sequences must not form sharp bends. Recent efforts introduced a systematic approach to post-process the predictions made by ML algorithms, aiming to modify them to respect the constraints. However, we still observe instances violating the requirements, significantly reducing biological relevance. To address this challenge, we present a novel post-processing framework for ML-based predictions on RNA secondary structures, inspired by the assignment problem in integer linear programming. Our algorithm offers a theoretical guarantee, ensuring that the resulting predictions adhere to the fundamental constraints of RNAs. Empirical evidence supports the efficacy of our approach, demonstrating improved predictive performance with no constraint violation, while requiring less running time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Geewon Suh;Gyeongjo Hwang;Seokjun Kang;Doojin Baek;Mingeun Kang", "authorids": "~Geewon_Suh1;~Gyeongjo_Hwang1;~Seokjun_Kang1;djinb00@kaist.ac.kr;~Mingeun_Kang2", "gender": ";M;M;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": "5-eNIpgAAAAJ;;;;", "orcid": ";;;;", "linkedin": ";;\uc11d\uc900-\uac15-2bb3472b6;;", "or_profile": "~Geewon_Suh1;~Gyeongjo_Hwang1;~Seokjun_Kang1;djinb00@kaist.ac.kr;~Mingeun_Kang2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;spidercore;;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;spidercore.io;;", "position": "PhD student;MS student;Researcher;;", "bibtex": "@inproceedings{\nsuh2024enforcing,\ntitle={Enforcing Constraints in {RNA} Secondary Structure Predictions: A Post-Processing Framework Based on the Assignment Problem},\nauthor={Geewon Suh and Gyeongjo Hwang and Seokjun Kang and Doojin Baek and Mingeun Kang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XGGcnKelda}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 927560, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4sfEdJApVm8J:scholar.google.com/&scioq=Enforcing+Constraints+in+RNA+Secondary+Structure+Predictions:+A+Post-Processing+Framework+Based+on+the+Assignment+Problem&hl=en&as_sdt=0,31", "gs_version_total": 4, "email": "kaist.ac.kr;kaist.ac.kr;spidercore.io;;", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;SpiderCore", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;", "aff_unique_abbr": "KAIST;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea;" }, { "title": "Risk-Sensitive Reward-Free Reinforcement Learning with CVaR", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33831", "id": "XGq30hC5MW", "proceeding": "https://proceedings.mlr.press/v235/ni24c.html", "pdf": 
"https://openreview.net/pdf?id=XGq30hC5MW", "openreview": "https://openreview.net/forum?id=XGq30hC5MW", "author_site": "Xinyi Ni, Guanlin Liu, Lifeng Lai", "tldr": "", "abstract": "Exploration is a crucial phase in reinforcement learning (RL). The reward-free RL paradigm, as explored by (Jin et al., 2020), offers an efficient method to design exploration algorithms for risk-neutral RL across various reward functions with a single exploration phase. However, as RL applications in safety critical settings grow, there's an increasing need for risk-sensitive RL, which considers potential risks in decision-making. Yet, efficient exploration strategies for risk-sensitive RL remain underdeveloped. This study presents a novel risk-sensitive reward-free framework based on Conditional Value-at-Risk (CVaR), designed to effectively address CVaR RL for any given reward function through a single exploration phase. We introduce the CVaR-RF-UCRL algorithm, which is shown to be $(\\epsilon,p)$-PAC, with a sample complexity upper bounded by $\\tilde{\\mathcal{O}}\\left(\\frac{S^2AH^4}{\\epsilon^2\\tau^2}\\right)$ with $\\tau$ being the risk tolerance parameter. We also prove a $\\Omega\\left(\\frac{S^2AH^2}{\\epsilon^2\\tau}\\right)$ lower bound for any CVaR-RF exploration algorithm, demonstrating the near-optimality of our algorithm. Additionally, we propose the planning algorithms: CVaR-VI and its more practical variant, CVaR-VI-DISC. The effectiveness and practicality of our CVaR reward-free approach are further validated through numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyi Ni;Guanlin Liu;Lifeng Lai", "authorids": "~Xinyi_Ni1;~Guanlin_Liu1;~Lifeng_Lai1", "gender": "F;M;", "homepage": "https://nxyni.github.io/;;", "dblp": "332/9339;224/9954;12/4889", "google_scholar": "30opUTcAAAAJ;a7eYJk4AAAAJ;gOhaCfUAAAAJ", "orcid": ";0000-0002-0595-9398;", "linkedin": "xinyi-ni;;", "or_profile": "~Xinyi_Ni1;~Guanlin_Liu1;~Lifeng_Lai1", "aff": "University of California, Davis;University of California, Davis;University of California, Davis", "aff_domain": "ucdavis.edu;ucdavis.edu;ucdavis.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nni2024risksensitive,\ntitle={Risk-Sensitive Reward-Free Reinforcement Learning with {CV}aR},\nauthor={Xinyi Ni and Guanlin Liu and Lifeng Lai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XGq30hC5MW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 416522, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12845827483354280028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ucdavis.edu;ucdavis.edu;ucdavis.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Davis", "aff_unique_dep": "", "aff_unique_url": "https://www.ucdavis.edu", "aff_unique_abbr": "UC Davis", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bounding the Excess Risk for Linear Models Trained on Marginal-Preserving, Differentially-Private, Synthetic Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33830", "id": "XKxuTZRCXq", "proceeding": "https://proceedings.mlr.press/v235/zhou24k.html", "pdf": 
"https://openreview.net/pdf?id=XKxuTZRCXq", "openreview": "https://openreview.net/forum?id=XKxuTZRCXq", "author_site": "Yvonne Zhou, Mingyu Liang, Ivan Brugere, Danial Dervovic, Antigoni Polychroniadou, Min Wu, Dana Dachman-Soled", "tldr": "", "abstract": "The growing use of machine learning (ML) has raised concerns that an ML model may reveal private information about an individual who has contributed to the training dataset. To prevent leakage of sensitive data, we consider using differentially- private (DP), synthetic training data instead of real training data to train an ML model. A key desirable property of synthetic data is its ability to preserve the low-order marginals of the original distribution. Our main contribution comprises novel upper and lower bounds on the excess empirical risk of linear models trained on such synthetic data, for continuous and Lipschitz loss functions. We perform extensive experimentation alongside our theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yvonne Zhou;Mingyu Liang;Ivan Brugere;Danial Dervovic;Antigoni Polychroniadou;Min Wu;Dana Dachman-Soled", "authorids": "~Yvonne_Zhou1;~Mingyu_Liang1;~Ivan_Brugere1;~Danial_Dervovic1;~Antigoni_Polychroniadou1;~Min_Wu1;~Dana_Dachman-Soled1", "gender": "F;M;M;M;F;F;", "homepage": "https://www.cs.umd.edu/people/skyzhou;;;https://www.danialdervovic.com;https://antigonip.github.io/work.html;https://user.eng.umd.edu/~minwu/bio.html;https://user.eng.umd.edu/~danadach/", "dblp": ";156/9748;50/10346;203/8299.html;40/11429;16/0-1;38/6981.html", "google_scholar": ";;JGlGUcsAAAAJ;ttWrIOcAAAAJ;https://scholar.google.dk/citations?user=5e-gHjMAAAAJ;https://scholar.google.com/citations?hl=en;Ss009KUAAAAJ", "orcid": ";;0000-0002-2953-3746;0000-0002-6135-561X;;0000-0001-7672-9357;", "linkedin": ";mingyu-liang/;ivanbrugere/;https://uk.linkedin.com/in/danial-dervovic;;minwu/;", "or_profile": "~Yvonne_Zhou1;~Mingyu_Liang1;~Ivan_Brugere1;~Danial_Dervovic1;~Antigoni_Polychroniadou1;~Min_Wu1;~Dana_Dachman-Soled1", "aff": "University of Maryland, College Park;George Washington University;J.P. Morgan;J.P. Morgan Chase;J.P. Morgan AI Research ;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;gwu.edu;jpmchase.com;jpmorgan.com;jpmorgan.com;umd.edu;umd.edu", "position": "PhD student;Postdoc;Researcher;Researcher;Principal Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nzhou2024bounding,\ntitle={Bounding the Excess Risk for Linear Models Trained on Marginal-Preserving, Differentially-Private, Synthetic Data},\nauthor={Yvonne Zhou and Mingyu Liang and Ivan Brugere and Danial Dervovic and Antigoni Polychroniadou and Min Wu and Dana Dachman-Soled},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XKxuTZRCXq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1079191, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15382952360516984888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "umd.edu;gwu.edu;jpmchase.com;jpmorgan.com;jpmorgan.com;umd.edu;umd.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;2;0;0", "aff_unique_norm": "University of Maryland;George Washington University;J.P. 
Morgan;JPMorgan Chase & Co.", "aff_unique_dep": ";;;", "aff_unique_url": "https://www/umd.edu;https://www.gwu.edu;https://www.jpmorganchase.com;https://www.jpmorganchase.com", "aff_unique_abbr": "UMD;GWU;JPM;JPM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Test-Time Degradation Adaptation for Open-Set Image Restoration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33829", "id": "XLlQb24X2o", "proceeding": "https://proceedings.mlr.press/v235/gou24a.html", "pdf": "https://openreview.net/pdf?id=XLlQb24X2o", "openreview": "https://openreview.net/forum?id=XLlQb24X2o", "author_site": "Yuanbiao Gou, Haiyu Zhao, Boyun Li, Xinyan Xiao, Xi Peng", "tldr": "", "abstract": "In contrast to close-set scenarios that restore images from a predefined set of degradations, open-set image restoration aims to handle the unknown degradations that were unforeseen during the pretraining phase, which is less-touched as far as we know. This work study this challenging problem and reveal its essence as unidentified distribution shifts between the test and training data. Recently, test-time adaptation has emerged as a fundamental method to address this inherent disparities. Inspired by it, we propose a test-time degradation adaptation framework for open-set image restoration, which consists of three components, *i.e.*, i) a pre-trained and degradation-agnostic diffusion model for generating clean images, ii) a test-time degradation adapter adapts the unknown degradations based on the input image during the testing phase, and iii) the adapter-guided image restoration guides the model through the adapter to produce the corresponding clean image. Through experiments on multiple degradations, we show that our method achieves comparable even better performance than those task-specific methods. 
The code is available at https://github.com/XLearning-SCU/2024-ICML-TAO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuanbiao Gou;Haiyu Zhao;Boyun Li;Xinyan Xiao;Xi Peng", "authorids": "~Yuanbiao_Gou1;~Haiyu_Zhao2;~Boyun_Li1;~Xinyan_Xiao1;~Xi_Peng3", "gender": "M;M;M;;", "homepage": "https://ybgou.github.io/;https://pandint.github.io/about/;https://liboyun.github.io/;;", "dblp": "268/6723;203/8513;268/6988;;", "google_scholar": "o5OcgLcAAAAJ;vBsI10YAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;", "orcid": ";0009-0003-5201-7904;;;", "linkedin": ";;;;", "or_profile": "~Yuanbiao_Gou1;~Haiyu_Zhao2;~Boyun_Li1;~Xinyan_Xiao1;~Xi_Peng3", "aff": "Sichuan University;Sichuan University;Sichuan University;;", "aff_domain": "scu.edu.cn;scu.edu.cn;scu.edu.cn;;", "position": "PhD student;PhD student;PhD student;;", "bibtex": "@inproceedings{\ngou2024testtime,\ntitle={Test-Time Degradation Adaptation for Open-Set Image Restoration},\nauthor={Yuanbiao Gou and Haiyu Zhao and Boyun Li and Xinyan Xiao and Xi Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XLlQb24X2o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2369565, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3507077199500507238&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 8, "email": "scu.edu.cn;scu.edu.cn;scu.edu.cn;;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Sichuan University", "aff_unique_dep": "", "aff_unique_url": "https://www.scu.edu.cn", "aff_unique_abbr": "SCU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "From Neurons to Neutrons: A Case Study in Interpretability", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33828", "id": "XMlUlY7ONf", "proceeding": "https://proceedings.mlr.press/v235/kitouni24a.html", "pdf": "https://openreview.net/pdf?id=XMlUlY7ONf", "openreview": "https://openreview.net/forum?id=XMlUlY7ONf", "author_site": "Ouail Kitouni, Niklas Nolte, V\u00edctor Samuel P\u00e9rez-D\u00edaz, Sokratis Trifinopoulos, Mike Williams", "tldr": "", "abstract": "Mechanistic Interpretability (MI) proposes a path toward fully understanding how neural networks make their predictions. Prior work demonstrates that even when trained to perform simple arithmetic, models can implement a variety of algorithms (sometimes concurrently) depending on initialization and hyperparameters. Does this mean neuron-level interpretability techniques have limited applicability? Here, we argue that high-dimensional neural networks can learn *useful* low-dimensional representations of the data they were trained on, going beyond simply making good predictions: Such representations can be understood with the MI lens and provide insights that are surprisingly faithful to human-derived domain knowledge. This indicates that such approaches to interpretability can be useful for deriving a new understanding of a problem from models trained to solve it. 
As a case study, we extract nuclear physics concepts by studying models trained to reproduce nuclear data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ouail Kitouni;Niklas Nolte;V\u00edctor Samuel P\u00e9rez-D\u00edaz;Sokratis Trifinopoulos;Mike Williams", "authorids": "~Ouail_Kitouni1;~Niklas_Nolte1;~V\u00edctor_Samuel_P\u00e9rez-D\u00edaz1;~Sokratis_Trifinopoulos1;~Mike_Williams1", "gender": "M;Not Specified;M;M;M", "homepage": "https://okitouni.github.io/;https://nolte.dev/about;https://www.samuelperezdi.com/;;https://physics.mit.edu/faculty/michael-williams/", "dblp": "285/7983.html;;;;90/379", "google_scholar": ";5elJ_uIAAAAJ;Vg1b-tkAAAAJ;;", "orcid": ";0000-0003-2536-4209;0009-0000-5483-2652;0000-0002-0492-1144;", "linkedin": "ouail-kitouni-645804187/;;;;", "or_profile": "~Ouail_Kitouni1;~Niklas_Nolte1;~V\u00edctor_Samuel_P\u00e9rez-D\u00edaz1;~Sokratis_Trifinopoulos1;~Mike_Williams1", "aff": "Massachusetts Institute of Technology;Meta Facebook;Universidad del Rosario;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;meta.com;urosario.edu.co;mit.edu;mit.edu", "position": "PhD student;Researcher;Lecturer;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nkitouni2024from,\ntitle={From Neurons to Neutrons: A Case Study in Interpretability},\nauthor={Ouail Kitouni and Niklas Nolte and V{\\'\\i}ctor Samuel P{\\'e}rez-D{\\'\\i}az and Sokratis Trifinopoulos and Mike Williams},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XMlUlY7ONf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3082864, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11785484939862980018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mit.edu;meta.com;urosario.edu.co;mit.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Meta;Universidad del Rosario", "aff_unique_dep": ";Meta Platforms, Inc.;", "aff_unique_url": "https://web.mit.edu;https://meta.com;https://www.urosario.edu.co", "aff_unique_abbr": "MIT;Meta;UdelR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Colombia" }, { "title": "Stability Evaluation through Distributional Perturbation Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33827", "id": "XPP6K57bop", "proceeding": "https://proceedings.mlr.press/v235/blanchet24a.html", "pdf": "https://openreview.net/pdf?id=XPP6K57bop", "openreview": "https://openreview.net/forum?id=XPP6K57bop", "author_site": "Jose Blanchet, Peng Cui, Jiajin Li, Jiashuo Liu", "tldr": "", "abstract": "The performance of learning models often deteriorates when deployed in out-of-sample environments. To ensure reliable deployment, we propose a stability evaluation criterion based on distributional perturbations. Conceptually, our stability evaluation criterion is defined as the minimal perturbation required on our observed dataset to induce a prescribed deterioration in risk evaluation. In this paper, we utilize the optimal transport (OT) discrepancy with moment constraints on the (sample, density) space to quantify this perturbation. 
Therefore, our stability evaluation criterion can address both data corruptions and sub-population shifts\u2014the two most common types of distribution shifts in real-world scenarios. To further realize practical benefits, we present a series of tractable convex formulations and computational methods tailored to different classes of loss functions. The key technical tool to achieve this is the strong duality theorem provided in this paper. Empirically, we validate the practical utility of our stability evaluation criterion across a host of real-world applications. These empirical studies showcase the criterion's ability not only to compare the stability of different learning models and features but also to provide valuable guidelines and strategies to further improve models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jose Blanchet;Peng Cui;Jiajin Li;Jiashuo Liu", "authorids": "~Jose_Blanchet1;~Peng_Cui1;~Jiajin_Li2;~Jiashuo_Liu1", "gender": "M;M;F;M", "homepage": "https://web.stanford.edu/~jblanche/;http://pengcui.thumedialab.com/;https://gerrili1996.github.io/;https://ljsthu.github.io", "dblp": "75/5093.html;31/891-1;;180/2823", "google_scholar": "https://scholar.google.co.in/citations?user=O24CcQQAAAAJ;https://scholar.google.com.tw/citations?user=G8x97ZgAAAAJ;;b7bpt5MAAAAJ", "orcid": ";0000-0003-2957-8511;;", "linkedin": "jose-blanchet;;;jiashuo-liu-244a6b1a4", "or_profile": "~Jose_Blanchet1;~Peng_Cui1;~Jiajin_Li2;~Jiashuo_Liu1", "aff": "Stanford University;Tsinghua University;Stanford University;University of Cambridge", "aff_domain": "stanford.edu;tsinghua.edu.cn;stanford.edu;cam.ac.uk", "position": "Professor;Associate Professor;Postdoc;Researcher", "bibtex": "@inproceedings{\nblanchet2024stability,\ntitle={Stability Evaluation through Distributional Perturbation Analysis},\nauthor={Jose Blanchet and Peng Cui and Jiajin Li and Jiashuo Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XPP6K57bop}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3616088, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2QXGQpltsGkJ:scholar.google.com/&scioq=Stability+Evaluation+through+Distributional+Perturbation+Analysis&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "stanford.edu;tsinghua.edu.cn;stanford.edu;cam.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Stanford University;Tsinghua University;University of Cambridge", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.tsinghua.edu.cn;https://www.cam.ac.uk", "aff_unique_abbr": "Stanford;THU;Cambridge", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Stanford;;Cambridge", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "United States;China;United Kingdom" }, { "title": "Network Tight Community Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33826", "id": "XQz7ytgETQ", "proceeding": "https://proceedings.mlr.press/v235/deng24f.html", "pdf": "https://openreview.net/pdf?id=XQz7ytgETQ", "openreview": "https://openreview.net/forum?id=XQz7ytgETQ", "author_site": "Jiayi Deng, Xiaodong Yang, Jun Yu, Jun Liu, Zhaiming Shen, Danyang Huang, Huimin Cheng", "tldr": "", "abstract": "Conventional community detection methods often categorize all nodes into clusters. 
However, the presumed community structure of interest may only be valid for a subset of nodes (termed 'tight nodes'), while the rest of the network may consist of noninformative 'scattered nodes'. For example, a protein-protein network often contains proteins that do not belong to specific biological functional modules but are involved in more general processes, or act as bridges between different functional modules. Forcing each of these proteins into a single cluster introduces unwanted biases and obscures the underlying biological implications. To address this issue, we propose a tight community detection (TCD) method to identify tight communities excluding scattered nodes. The algorithm enjoys a strong theoretical guarantee of tight node identification accuracy and is scalable for large networks. The superiority of the proposed method is demonstrated in various synthetic and real-data experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiayi Deng;Xiaodong Yang;Jun Yu;Jun Liu;Zhaiming Shen;Danyang Huang;Huimin Cheng", "authorids": "~Jiayi_Deng1;~Xiaodong_Yang7;~Jun_Yu4;~Jun_Liu3;~Zhaiming_Shen1;dyhuang89@126.com;~Huimin_Cheng1", "gender": ";M;M;M;M;;F", "homepage": ";;https://math.bit.edu.cn/szdw/azcpl/tbfyjy/yj/index.htm;https://sites.harvard.edu/junliu/;https://sites.google.com/view/zhaiming-shen;;", "dblp": ";;;;149/2522.html;;", "google_scholar": ";;;-bHzVq8AAAAJ;ksQav6cAAAAJ;;ILbnB1QAAAAJ", "orcid": ";0000-0002-1152-9407;;0000-0002-4450-7239;0000-0003-3604-0015;;", "linkedin": ";;;;zhaiming-shen/;;", "or_profile": "~Jiayi_Deng1;~Xiaodong_Yang7;~Jun_Yu4;~Jun_Liu3;~Zhaiming_Shen1;dyhuang89@126.com;~Huimin_Cheng1", "aff": ";Harvard University, Harvard University;;Harvard University;University of Georgia;;Boston University, Boston University", "aff_domain": ";g.harvard.edu;;fas.harvard.edu;uga.edu;;bu.edu", "position": ";PhD student;;Full Professor;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\ndeng2024network,\ntitle={Network Tight Community Detection},\nauthor={Jiayi Deng and Xiaodong Yang and Jun Yu and Jun Liu and Zhaiming Shen and Danyang Huang and Huimin Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XQz7ytgETQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5527370, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13831711477632520327&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": ";g.harvard.edu;;fas.harvard.edu;uga.edu;;bu.edu", "author_num": 7, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Harvard University;University of Georgia;Boston University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.harvard.edu;https://www.uga.edu;https://www.bu.edu", "aff_unique_abbr": "Harvard;UGA;BU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boston", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fair Classification with Partial Feedback: An Exploration-Based Data Collection Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33825", "id": "XSsoggg8pz", "proceeding": "https://proceedings.mlr.press/v235/keswani24a.html", "pdf": "https://openreview.net/pdf?id=XSsoggg8pz", "openreview": "https://openreview.net/forum?id=XSsoggg8pz", "author_site": "Vijay Keswani, Anay Mehrotra, L. 
Elisa Celis", "tldr": "", "abstract": "In many predictive contexts (e.g., credit lending), true outcomes are only observed for samples that were positively classified in the past. These past observations, in turn, form training datasets for classifiers that make future predictions. However, such training datasets lack information about the outcomes of samples that were (incorrectly) negatively classified in the past and can lead to erroneous classifiers. We present an approach that trains a classifier using available data and comes with a family of exploration strategies to collect outcome data about subpopulations that otherwise would have been ignored. For any exploration strategy, the approach comes with guarantees that (1) all sub-populations are explored, (2) the fraction of false positives is bounded, and (3) the trained classifier converges to a \"desired\" classifier. The right exploration strategy is context-dependent; it can be chosen to improve learning guarantees and encode context-specific group fairness properties. Evaluation on real-world datasets shows that this approach consistently boosts the quality of collected outcome data and improves the fraction of true positives for all groups, with only a small reduction in predictive utility.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vijay Keswani;Anay Mehrotra;L. Elisa Celis", "authorids": "~Vijay_Keswani1;~Anay_Mehrotra1;~L._Elisa_Celis2", "gender": ";;", "homepage": ";;", "dblp": ";234/8808;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Vijay_Keswani1;~Anay_Mehrotra1;~L._Elisa_Celis2", "aff": ";Yale University;", "aff_domain": ";yale.edu;", "position": ";PhD student;", "bibtex": "@inproceedings{\nkeswani2024fair,\ntitle={Fair Classification with Partial Feedback: An Exploration-Based Data Collection Approach},\nauthor={Vijay Keswani and Anay Mehrotra and L. Elisa Celis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XSsoggg8pz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1552631, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e8DPv_34asYJ:scholar.google.com/&scioq=Fair+Classification+with+Partial+Feedback:+An+Exploration-Based+Data+Collection+Approach&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": ";yale.edu;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Yale University", "aff_unique_dep": "", "aff_unique_url": "https://www.yale.edu", "aff_unique_abbr": "Yale", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Implicit Bias of Policy Gradient in Linear Quadratic Control: Extrapolation to Unseen Initial States", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33824", "id": "XT6iF8FDZx", "proceeding": "https://proceedings.mlr.press/v235/razin24a.html", "pdf": "https://openreview.net/pdf?id=XT6iF8FDZx", "openreview": "https://openreview.net/forum?id=XT6iF8FDZx", "author_site": "Noam Razin, Yotam Alexander, Edo Cohen-Karlik, Raja Giryes, Amir Globerson, Nadav Cohen", "tldr": "", "abstract": "In modern machine learning, models can often fit training data in numerous ways, some of which perform well on unseen (test) data, while others do not. Remarkably, in such cases gradient descent frequently exhibits an implicit bias that leads to excellent performance on unseen data. 
This implicit bias was extensively studied in supervised learning, but is far less understood in optimal control (reinforcement learning). There, learning a controller applied to a system via gradient descent is known as policy gradient, and a question of prime importance is the extent to which a learned controller extrapolates to unseen initial states. This paper theoretically studies the implicit bias of policy gradient in terms of extrapolation to unseen initial states. Focusing on the fundamental Linear Quadratic Regulator (LQR) problem, we establish that the extent of extrapolation depends on the degree of exploration induced by the system when commencing from initial states included in training. Experiments corroborate our theory, and demonstrate its conclusions on problems beyond LQR, where systems are non-linear and controllers are neural networks. We hypothesize that real-world optimal control may be greatly improved by developing methods for informed selection of initial states to train on.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Noam Razin;Yotam Alexander;Edo Cohen-Karlik;Raja Giryes;Amir Globerson;Nadav Cohen", "authorids": "~Noam_Razin1;~Yotam_Alexander1;~Edo_Cohen-Karlik1;~Raja_Giryes1;~Amir_Globerson1;~Nadav_Cohen1", "gender": "M;M;M;M;M;M", "homepage": "https://noamrazin.github.io/;https://www.cohennadav.com/group.html;;https://www.giryes.sites.tau.ac.il/;http://www.cs.tau.ac.il/~gamir/;http://www.cohennadav.com", "dblp": "247/1241;342/9141.html;242/8962.html;50/7998;08/4162.html;119/7155", "google_scholar": "tDsd50oAAAAJ;;tWI9Pw8AAAAJ;https://scholar.google.co.il/citations?user=9aQUYVQAAAAJ;https://scholar.google.com.tw/citations?user=5JserkUAAAAJ;AfLwLQ0AAAAJ", "orcid": ";;;0000-0002-2830-0297;;", "linkedin": ";;;raja-giryes-0818935/;;cohennadav/", "or_profile": "~Noam_Razin1;~Yotam_Alexander1;~Edo_Cohen-Karlik1;~Raja_Giryes1;~Amir_Globerson1;~Nadav_Cohen1", "aff": "Tel Aviv University;Tel Aviv University, Tel Aviv University;;Tel Aviv University;Tel Aviv University;School of Computer Science, Tel Aviv University", "aff_domain": "tau.ac.il;tauex.tau.ac.il;;tauex.tau.ac.il;tau.ac.il;cs.tau.ac.il", "position": "PhD student;PhD student;;Associate Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nrazin2024implicit,\ntitle={Implicit Bias of Policy Gradient in Linear Quadratic Control: Extrapolation to Unseen Initial States},\nauthor={Noam Razin and Yotam Alexander and Edo Cohen-Karlik and Raja Giryes and Amir Globerson and Nadav Cohen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XT6iF8FDZx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3570170, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3326032025638209174&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "tau.ac.il;tauex.tau.ac.il;;tauex.tau.ac.il;tau.ac.il;cs.tau.ac.il", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Tel Aviv", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Clifford-Steerable Convolutional Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33823", 
"id": "XTglHJjzQI", "proceeding": "https://proceedings.mlr.press/v235/zhdanov24a.html", "pdf": "https://openreview.net/pdf?id=XTglHJjzQI", "openreview": "https://openreview.net/forum?id=XTglHJjzQI", "author_site": "Maksim Zhdanov, David Ruhe, Maurice Weiler, Ana Lucic, Johannes Brandstetter, Patrick Forr\u00e9", "tldr": "", "abstract": "We present Clifford-Steerable Convolutional Neural Networks (CS-CNNs), a novel class of ${\\operatorname{E}}(p, q)$-equivariant CNNs. CS-CNNs process multivector fields on pseudo-Euclidean spaces $\\mathbb{R}^{p,q}$. They specialize, for instance, to ${\\operatorname{E}}(3)$-equivariance on $\\mathbb{R}^3$ and Poincar\u00e9-equivariance on Minkowski spacetime $\\mathbb{R}^{1,3}$. Our approach is based on an implicit parametrization of ${\\operatorname{O}}(p,q)$-steerable kernels via Clifford group equivariant neural networks. We significantly and consistently outperform baseline methods on fluid dynamics as well as relativistic electrodynamics forecasting tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maksim Zhdanov;David Ruhe;Maurice Weiler;Ana Lucic;Johannes Brandstetter;Patrick Forr\u00e9", "authorids": "~Maksim_Zhdanov1;~David_Ruhe1;~Maurice_Weiler1;~Ana_Lucic1;~Johannes_Brandstetter1;~Patrick_Forr\u00e91", "gender": "M;;;F;M;", "homepage": "https://maxxxzdn.github.io/;;https://maurice-weiler.gitlab.io/;https://a-lucic.github.io;;", "dblp": "322/0190;243/3507;210/0855;;251/8691;", "google_scholar": "Llnm6XgAAAAJ;;uQePx6EAAAAJ;https://scholar.google.nl/citations?user=tQPUCysAAAAJ;KiRvOHcAAAAJ;", "orcid": ";;;;;", "linkedin": ";;maurice-weiler-78b6931a6/;ana-lucic-98581188/;;", "or_profile": "~Maksim_Zhdanov1;~David_Ruhe1;~Maurice_Weiler1;~Ana_Lucic1;~Johannes_Brandstetter1;~Patrick_Forr\u00e91", "aff": "University of Amsterdam;University of Amsterdam;University of Amsterdam;Microsoft;Microsoft;", "aff_domain": "uva.nl;uva.nl;uva.nl;microsoft.com;microsoft.com;", "position": "PhD student;PhD student;PhD student;Postdoc;Researcher;", "bibtex": "@inproceedings{\nzhdanov2024cliffordsteerable,\ntitle={Clifford-Steerable Convolutional Neural Networks},\nauthor={Maksim Zhdanov and David Ruhe and Maurice Weiler and Ana Lucic and Johannes Brandstetter and Patrick Forr{\\'e}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XTglHJjzQI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3289063, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6906786070973199716&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "uva.nl;uva.nl;uva.nl;microsoft.com;microsoft.com;", "author_num": 6, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "University of Amsterdam;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.uva.nl;https://www.microsoft.com", "aff_unique_abbr": "UvA;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "Netherlands;United States" }, { "title": "Harmonic Self-Conditioned Flow Matching for joint Multi-Ligand Docking and Binding Site Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33822", "id": "XTrMY9sHKF", "proceeding": "https://proceedings.mlr.press/v235/stark24a.html", "pdf": "https://openreview.net/pdf?id=XTrMY9sHKF", "openreview": 
"https://openreview.net/forum?id=XTrMY9sHKF", "author_site": "Hannes St\u00e4rk, Bowen Jing, Regina Barzilay, Tommi Jaakkola", "tldr": "", "abstract": "A significant amount of protein function requires binding small molecules, including enzymatic catalysis. As such, designing binding pockets for small molecules has several impactful applications ranging from drug synthesis to energy storage. Towards this goal, we first develop HarmonicFlow, an improved generative process over 3D protein-ligand binding structures based on our self-conditioned flow matching objective. FlowSite extends this flow model to jointly generate a protein pocket's discrete residue types and the molecule's binding 3D structure. We show that HarmonicFlow improves upon state-of-the-art generative processes for docking in simplicity, generality, and average sample quality in pocket-level docking. Enabled by this structure modeling, FlowSite designs binding sites substantially better than baseline approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hannes Stark;Bowen Jing;Regina Barzilay;Tommi Jaakkola", "authorids": "~Hannes_Stark1;~Bowen_Jing1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "gender": ";;female;", "homepage": ";;https://www.regina.csail.mit.edu/;", "dblp": ";;b/ReginaBarzilay;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hannes_Stark1;~Bowen_Jing1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "aff": ";;Massachusetts Institute of Technology;", "aff_domain": ";;mit.edu;", "position": ";;Professor;", "bibtex": "@inproceedings{\nstark2024harmonic,\ntitle={Harmonic Self-Conditioned Flow Matching for joint Multi-Ligand Docking and Binding Site Design},\nauthor={Hannes Stark and Bowen Jing and Regina Barzilay and Tommi Jaakkola},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XTrMY9sHKF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5443243, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11139800986304074137&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 5, "email": ";;mit.edu;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Parameter-Efficient Fine-Tuning with Discrete Fourier Transform", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33821", "id": "XUOHKSsurt", "proceeding": "https://proceedings.mlr.press/v235/gao24o.html", "pdf": "https://openreview.net/pdf?id=XUOHKSsurt", "openreview": "https://openreview.net/forum?id=XUOHKSsurt", "author_site": "Ziqi Gao, Qichao Wang, Aochuan Chen, Zijing Liu, Bingzhe Wu, Liang Chen, Jia Li", "tldr": "", "abstract": "Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices $A$ and $B$ to represent the weight change, i.e., $\\Delta W=BA$. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or larger base models. In this work, we aim to further compress trainable parameters by enjoying the powerful expressiveness of the Fourier transform. 
Specifically, we introduce FourierFT, which treats $\\Delta W$ as a matrix in the spatial domain and learns only a small fraction of its spectral coefficients. With the trained spectral coefficients, we implement the inverse discrete Fourier transform to recover $\\Delta W$. Empirically, our FourierFT method shows comparable or better performance with fewer parameters than LoRA on various tasks, including natural language understanding, natural language generation, instruction tuning, and image classification. For example, when performing instruction tuning on the LLaMA2-7B model, FourierFT surpasses LoRA with only 0.064M trainable parameters, compared to LoRA's 33.5M. Our code is released at [this link](https://github.com/Chaos96/fourierft).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziqi Gao;Qichao Wang;Aochuan Chen;Zijing Liu;Bingzhe Wu;Liang Chen;Jia Li", "authorids": "~Ziqi_Gao1;~Qichao_Wang1;~Aochuan_Chen1;~Zijing_Liu1;~Bingzhe_Wu1;~Liang_Chen17;~Jia_Li4", "gender": ";M;M;;M;M;M", "homepage": ";https://qichaos-wang.github.io/;https://scholar.google.com/citations?hl=en&view_op=list_works&gmla=AJsN-F6N4cEX-_kViGgRpnUVo_iBHlVXwMpnhlyB-Cdrndwj6B0jaDy088r7K9gHPGqSwsQ9tNxpijGpb1IoIB2B5KVS3Scvtdz9Mt_WR9GSou_saurFpSA&user=7pY-Ie8AAAAJ;https://github.com/zj-liu;;https://sites.google.com/view/lijia;https://chenliang.tech/", "dblp": ";;331/2356;205/3211;207/4843;23/6950-9;https://dblp.uni-trier.de/pid/01/5394-1", "google_scholar": "https://scholar.google.com.hk/citations?user=UHwNFy8AAAAJ;;https://scholar.google.com/citations?hl=en;;_3hgtf8AAAAJ;1gSbcYoAAAAJ;pGZtPjcAAAAJ", "orcid": ";;0009-0002-2300-1498;0000-0002-0189-7409;;0000-0002-6362-4385;", "linkedin": ";;;;;;", "or_profile": "~Ziqi_Gao1;~Qichao_Wang1;~Aochuan_Chen1;~Zijing_Liu1;~Bingzhe_Wu1;~Jia_Li4;~Liang_Chen7", "aff": "Hong Kong University of Science and Technology;SUN YAT-SEN UNIVERSITY;Hong Kong University of Science and Technology;International Digital Economy Academy;Tencent AI Lab;Hong Kong University of Science and Technology (Guangzhou);", "aff_domain": "ust.hk;sysu.edu.cn;ust.hk;idea.edu.cn;tencent.com;ust.hk;", "position": "PhD student;MS student;PhD student;Researcher;Researcher;Assistant Professor;", "bibtex": "@inproceedings{\ngao2024parameterefficient,\ntitle={Parameter-Efficient Fine-Tuning with Discrete Fourier Transform},\nauthor={Ziqi Gao and Qichao Wang and Aochuan Chen and Zijing Liu and Bingzhe Wu and Liang Chen and Jia Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XUOHKSsurt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8555742, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9524360008038380215&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "ust.hk;sysu.edu.cn;ust.hk;idea.edu.cn;tencent.com;ust.hk;", "author_num": 7, "aff_unique_index": "0;1;0;2;3;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Sun Yat-sen University;International Digital Economy Academy;Tencent", "aff_unique_dep": ";;;Tencent AI Lab", "aff_unique_url": "https://www.ust.hk;http://www.sysu.edu.cn;;https://ai.tencent.com", "aff_unique_abbr": "HKUST;SYSU;;Tencent AI Lab", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "Towards a Better Theoretical Understanding 
of Independent Subnetwork Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33820", "id": "XUc29ydmLX", "proceeding": "https://proceedings.mlr.press/v235/shulgin24a.html", "pdf": "https://openreview.net/pdf?id=XUc29ydmLX", "openreview": "https://openreview.net/forum?id=XUc29ydmLX", "author_site": "Egor Shulgin, Peter Richtarik", "tldr": "", "abstract": "Modern advancements in large-scale machine learning would be impossible without the paradigm of data-parallel distributed computing. Since distributed computing with large-scale models imparts excessive pressure on communication channels, significant recent research has been directed toward co-designing communication compression strategies and training algorithms with the goal of reducing communication costs. While pure data parallelism allows better data scaling, it suffers from poor model scaling properties. Indeed, compute nodes are severely limited by memory constraints, preventing further increases in model size. For this reason, the latest achievements in training giant neural network models also rely on some form of model parallelism. In this work, we take a closer theoretical look at Independent Subnetwork Training (IST), which is a recently proposed and highly effective technique for solving the aforementioned problems. We identify fundamental differences between IST and alternative approaches, such as distributed methods with compressed communication, and provide a precise analysis of its optimization performance on a quadratic model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Egor Shulgin;Peter Richt\u00e1rik", "authorids": "~Egor_Shulgin1;~Peter_Richt\u00e1rik1", "gender": ";M", "homepage": "https://shulgin-egor.github.io/;https://richtarik.org", "dblp": "234/8612;62/8001", "google_scholar": "cND99UYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-6500-7746;0000-0003-4380-5848", "linkedin": "egor-shulgin-a34373127/;richtarik/", "or_profile": "~Egor_Shulgin1;~Peter_Richtarik1", "aff": "KAUST;King Abdullah University of Science and Technology (KAUST)", "aff_domain": "kaust.edu.sa;kaust.edu.sa", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nshulgin2024towards,\ntitle={Towards a Better Theoretical Understanding of Independent Subnetwork Training},\nauthor={Egor Shulgin and Peter Richt{\\'a}rik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XUc29ydmLX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 969346, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17068243367403856419&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 14, "email": "kaust.edu.sa;kaust.edu.sa", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "King Abdullah University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaust.edu.sa", "aff_unique_abbr": "KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Saudi Arabia" }, { "title": "Magicoder: Empowering Code Generation with OSS-Instruct", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33819", "id": "XUeoOBid3x", "proceeding": "https://proceedings.mlr.press/v235/wei24h.html", "pdf": "https://openreview.net/pdf?id=XUeoOBid3x", "openreview": 
"https://openreview.net/forum?id=XUeoOBid3x", "author_site": "Yuxiang Wei, Zhe Wang, Jiawei Liu, Yifeng Ding, LINGMING ZHANG", "tldr": "", "abstract": "We introduce Magicoder, a series of fully open-source (code, weights, and data) Large Language Models (LLMs) for code that significantly closes the gap with top code models while having no more than 7B parameters. Magicoder models are trained on 75K synthetic instruction data using **OSS-Instruct**, a novel approach to enlightening LLMs with open-source code snippets to generate diverse instruction data for code. Our main motivation is to mitigate the inherent bias of the synthetic data generated by LLMs through the wealth of open-source references for the production of more realistic and controllable data. The orthogonality of OSS-Instruct and other data generation methods like Evol-Instruct further enables us to build an enhanced MagicoderS. Both Magicoder and MagicoderS substantially outperform state-of-the-art code models with similar or even larger sizes on a wide range of coding benchmarks. Notably, MagicoderS-CL-7B based on CodeLlama even surpasses the prominent ChatGPT on HumanEval+ (66.5 vs. 65.9 in pass@1 ). Overall, OSS-Instruct opens a new direction for crafting diverse synthetic instruction data for code using abundant open-source references.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxiang Wei;Zhe Wang;Jiawei Liu;Yifeng Ding;LINGMING ZHANG", "authorids": "~Yuxiang_Wei2;~Zhe_Wang41;~Jiawei_Liu11;~Yifeng_Ding2;~LINGMING_ZHANG2", "gender": "M;M;M;M;M", "homepage": "https://yuxiang.cs.illinois.edu;https://zhewang2001.github.io;https://jiawei-site.github.io/;https://yifeng-ding.com/;http://lingming.cs.illinois.edu/", "dblp": "301/1212;;12/8228-4;;27/7057-1", "google_scholar": "Clrvw6kAAAAJ;daDPx9AAAAAJ;Vw6el1AAAAAJ;ipXUDHgAAAAJ;zzbWQE4AAAAJ", "orcid": "0000-0002-4391-3753;;0000-0001-7122-8625;;", "linkedin": "yuxiang-wei-a94a63205/;https://linkedin.com/in/zhe-wang-67aa502a7;jiawei-liu-uiuc/;;", "or_profile": "~Yuxiang_Wei2;~Zhe_Wang41;~Jiawei_Liu11;~Yifeng_Ding2;~LINGMING_ZHANG2", "aff": "Snowflake;Tsinghua University;Amazon;Amazon;University of Illinois Urbana-Champaign", "aff_domain": "snowflake.com;mail.tsinghua.edu.cn;amazon.com;amazon.com;cs.illinois.edu", "position": "Intern;Undergrad student;Intern;Intern;Associate Professor", "bibtex": "@inproceedings{\nwei2024magicoder,\ntitle={Magicoder: Empowering Code Generation with {OSS}-Instruct},\nauthor={Yuxiang Wei and Zhe Wang and Jiawei Liu and Yifeng Ding and LINGMING ZHANG},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XUeoOBid3x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1024240, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7310588387624823058&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 8, "email": "snowflake.com;mail.tsinghua.edu.cn;amazon.com;amazon.com;cs.illinois.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "Snowflake Inc.;Tsinghua University;Amazon;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;Amazon.com, Inc.;", "aff_unique_url": "https://www.snowflake.com;https://www.tsinghua.edu.cn;https://www.amazon.com;https://illinois.edu", "aff_unique_abbr": "Snowflake;THU;Amazon;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": 
"0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Stability and Generalization of Stochastic Compositional Gradient Descent Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33818", "id": "XWkRyIjYDp", "proceeding": "https://proceedings.mlr.press/v235/yang24ad.html", "pdf": "https://openreview.net/pdf?id=XWkRyIjYDp", "openreview": "https://openreview.net/forum?id=XWkRyIjYDp", "author_site": "Ming Yang, Xiyuan Wei, Tianbao Yang, Yiming Ying", "tldr": "", "abstract": "Many machine learning tasks can be formulated as a stochastic compositional optimization (SCO) problem such as reinforcement learning, AUC maximization and meta-learning, where the objective function involves a nested composition associated with an expectation. Although many studies have been devoted to studying the convergence behavior of SCO algorithms, there is little work on understanding their generalization, that is, how these learning algorithms built from training data would behave on future test examples. In this paper, we provide the stability and generalization analysis of stochastic compositional gradient descent algorithms in the framework of statistical learning theory. Firstly, we introduce a stability concept called *compositional uniform stability* and establish its quantitative relation with generalization for SCO problems. Then, we establish the compositional uniform stability results for two notable stochastic compositional gradient descent algorithms, namely SCGD and SCSC. Finally, we derive *dimension-independent* excess risk bounds for SCGD and SCSC by balancing stability results and optimization errors. To the best of our knowledge, these are the first-ever known results on stability and generalization analysis of stochastic compositional gradient descent algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ming Yang;Xiyuan Wei;Tianbao Yang;Yiming Ying", "authorids": "~Ming_Yang17;~Xiyuan_Wei1;~Tianbao_Yang1;~Yiming_Ying1", "gender": "M;M;M;M", "homepage": "https://github.com/;https://xywei00.github.io/;https://people.tamu.edu/~tianbao-yang/publications.html;https://www.sydney.edu.au/science/about/our-people/academic-staff/yiming-ying.html", "dblp": ";203/0497;56/7047;41/2012", "google_scholar": ";7iGaeB0AAAAJ;https://scholar.google.com.tw/citations?user=BCxFU0EAAAAJ;xnA_lMMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ming_Yang17;~Xiyuan_Wei1;~Tianbao_Yang1;~Yiming_Ying1", "aff": "State University of New York at Albany;Texas A&M University - College Station;Texas A&M University - College Station;University of Sydney", "aff_domain": "albany.edu;tamu.edu;tamu.edu;sydney.edu.au", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024stability,\ntitle={Stability and Generalization of Stochastic Compositional Gradient Descent Algorithms},\nauthor={Ming Yang and Xiyuan Wei and Tianbao Yang and Yiming Ying},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XWkRyIjYDp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 588706, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1319961893903820056&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "albany.edu;tamu.edu;tamu.edu;sydney.edu.au", "author_num": 4, "aff_unique_index": "0;1;1;2", 
"aff_unique_norm": "State University of New York;Texas A&M University;University of Sydney", "aff_unique_dep": ";;", "aff_unique_url": "https://www.albany.edu;https://www.tamu.edu;https://www.sydney.edu.au", "aff_unique_abbr": "SUNY Albany;TAMU;USYD", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Albany;College Station;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Australia" }, { "title": "CW Complex Hypothesis for Image Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33817", "id": "XXioxiADDC", "proceeding": "https://proceedings.mlr.press/v235/wang24bs.html", "pdf": "https://openreview.net/pdf?id=XXioxiADDC", "openreview": "https://openreview.net/forum?id=XXioxiADDC", "author_site": "Yi Wang, Zhiren Wang", "tldr": "", "abstract": "We examine both the manifold hypothesis (Bengio et al., 2013) and the union of manifold hypothesis (Brown et al., 2023), and argue that, in contrast to these hypotheses, the local intrinsic dimension varies from point to point even in the same connected component. We propose an alternative CW complex hypothesis that image data is distributed in ``manifolds with skeletons\". We support the hypothesis through visualization of distributions of image data of random geometric objects, as well as by introducing and testing a criterion on natural image datasets. One motivation of our work is to explain why diffusion models have difficulty generating accurate higher dimensional details such as human hands. Under the CW complex hypothesis and with both theoretical and empirical evidences, we provide an interpretation that the mixture of higher and lower dimensional components in data obstructs diffusion models from efficient learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Wang;Zhiren Wang", "authorids": "~Yi_Wang22;~Zhiren_Wang1", "gender": ";", "homepage": "http://www.personal.psu.edu/zxw14/;http://www.math.jhu.edu/~ywang", "dblp": ";conf/iclr/LiuWW24", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Zhiren_Wang1;~Yi_Wang8", "aff": "Pennsylvania State University;Johns Hopkins Univ", "aff_domain": "psu.edu;jh.edu", "position": "Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024cw,\ntitle={{CW} Complex Hypothesis for Image Data},\nauthor={Yi Wang and Zhiren Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XXioxiADDC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5800473, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17029818853598452915&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "psu.edu;jh.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;Johns Hopkins University", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.jhu.edu", "aff_unique_abbr": "PSU;JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Principled Penalty-based Methods for Bilevel Reinforcement Learning and RLHF", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33816", "id": "Xb3IXEBYuw", "proceeding": "https://proceedings.mlr.press/v235/shen24g.html", "pdf": "https://openreview.net/pdf?id=Xb3IXEBYuw", "openreview": 
"https://openreview.net/forum?id=Xb3IXEBYuw", "author_site": "Han Shen, Zhuoran Yang, Tianyi Chen", "tldr": "", "abstract": "Bilevel optimization has been recently applied to many machine learning tasks. However, their applications have been restricted to the supervised learning setting, where static objective functions with benign structures are considered. But bilevel problems such as incentive design, inverse reinforcement learning (RL), and RL from human feedback (RLHF) are often modeled as dynamic objective functions that go beyond the simple static objective structures, which pose significant challenges of using existing bilevel solutions. To tackle this new class of bilevel problems, we introduce the first principled algorithmic framework for solving bilevel RL problems through the lens of penalty formulation. We provide theoretical studies of the problem landscape and its penalty-based (policy) gradient algorithms. We demonstrate the effectiveness of our algorithms via simulations in the Stackelberg game and RLHF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Shen;Zhuoran Yang;Tianyi Chen", "authorids": "~Han_Shen3;~Zhuoran_Yang1;~Tianyi_Chen5", "gender": "M;M;M", "homepage": "https://hanshen95.github.io/;https://zhuoranyang.github.io/;https://chentianyi1991.github.io/", "dblp": ";;", "google_scholar": "UeWSr6oAAAAJ;;kFwvv38AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Han_Shen3;~Zhuoran_Yang1;~Tianyi_Chen5", "aff": "Rensselaer Polytechnic Institute;Yale University;Rensselaer Polytechnic Institute", "aff_domain": "rpi.edu;yale.edu;rpi.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nshen2024principled,\ntitle={Principled Penalty-based Methods for Bilevel Reinforcement Learning and {RLHF}},\nauthor={Han Shen and Zhuoran Yang and Tianyi Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Xb3IXEBYuw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 700133, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6555566214593375278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "rpi.edu;yale.edu;rpi.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Rensselaer Polytechnic Institute;Yale University", "aff_unique_dep": ";", "aff_unique_url": "https://www.rpi.edu;https://www.yale.edu", "aff_unique_abbr": "RPI;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On the Last-Iterate Convergence of Shuffling Gradient Methods", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33815", "id": "Xdy9bjwHDu", "proceeding": "https://proceedings.mlr.press/v235/liu24cg.html", "pdf": "https://openreview.net/pdf?id=Xdy9bjwHDu", "openreview": "https://openreview.net/forum?id=Xdy9bjwHDu", "author_site": "Zijian Liu, Zhengyuan Zhou", "tldr": "", "abstract": "Shuffling gradient methods are widely used in modern machine learning tasks and include three popular implementations: Random Reshuffle (RR), Shuffle Once (SO), and Incremental Gradient (IG). Compared to the empirical success, the theoretical guarantee of shuffling gradient methods was not well-understood for a long time. 
Until recently, convergence rates had been established only for the average iterate on convex functions and for the last iterate on strongly convex problems (using the squared distance as the metric). However, when using the function value gap as the convergence criterion, existing theories cannot explain the good performance of the last iterate in different settings (e.g., constrained optimization). To bridge this gap between practice and theory, we prove the first last-iterate convergence rates for shuffling gradient methods with respect to the objective value even without strong convexity. Our new results either (nearly) match the existing last-iterate lower bounds or are as fast as the previous best upper bounds for the average iterate.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zijian Liu;Zhengyuan Zhou", "authorids": "~Zijian_Liu1;~Zhengyuan_Zhou2", "gender": ";M", "homepage": ";https://scholar.google.com/citations?user=hiGI9v0AAAAJ&hl=en", "dblp": ";125/5270", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Zijian_Liu1;~Zhengyuan_Zhou2", "aff": ";New York University", "aff_domain": ";nyu.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nliu2024on,\ntitle={On the Last-Iterate Convergence of Shuffling Gradient Methods},\nauthor={Zijian Liu and Zhengyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Xdy9bjwHDu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 613860, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12700226424542928712&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": ";nyu.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Position: Benchmarking is Limited in Reinforcement Learning Research", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33814", "id": "Xe7n2ZqpBP", "proceeding": "https://proceedings.mlr.press/v235/jordan24a.html", "pdf": "https://openreview.net/pdf?id=Xe7n2ZqpBP", "openreview": "https://openreview.net/forum?id=Xe7n2ZqpBP", "author_site": "Scott Jordan, Adam White, Bruno da Silva, Martha White, Philip Thomas", "tldr": "", "abstract": "Novel reinforcement learning algorithms, or improvements on existing ones, are commonly justified by evaluating their performance on benchmark environments and comparing them to an ever-changing set of standard algorithms. However, despite numerous calls for improvements, experimental practices continue to produce misleading or unsupported claims. One reason for the ongoing substandard practices is that conducting rigorous benchmarking experiments requires substantial computational time. This work investigates the sources of increased computation costs in rigorous experiment designs. We show that conducting rigorous performance benchmarks will likely have computational costs that are often prohibitive. As a result, we argue for using an additional experimentation paradigm to overcome the limitations of benchmarking.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Scott M. Jordan;Adam White;Bruno Castro da Silva;Martha White;Philip S. 
Thomas", "authorids": "~Scott_M._Jordan1;~Adam_White1;~Bruno_Castro_da_Silva1;~Martha_White1;~Philip_S._Thomas1", "gender": "M;M;F;M;M", "homepage": "https://scottjordan.github.io/scottjordan/;https://people.cs.umass.edu/~bsilva/;http://marthawhite.ca;http://psthomas.com;http://adamwhite.ca", "dblp": "222/1982;75/3139;60/7057;46/11107;91/10481", "google_scholar": "qg8AOdgAAAAJ;eskJDVUAAAAJ;t5zdD_IAAAAJ;e8Gzgo4AAAAJ;https://scholar.google.ca/citations?user=1GqGhcsAAAAJ", "orcid": "0000-0003-4567-8627;;0000-0002-5356-2950;;", "linkedin": ";;;;", "or_profile": "~Scott_M._Jordan1;~Bruno_Castro_da_Silva1;~Martha_White1;~Philip_S._Thomas1;~Adam_M_White1", "aff": "University of Alberta;University of Massachusetts, Amherst;University of Alberta;College of Information and Computer Science, University of Massachusetts, Amherst;University of Alberta", "aff_domain": "ualberta.ca;umass.edu;ualberta.ca;cs.umass.edu;ualberta.ca", "position": "Postdoc;Assistant Professor;Associate Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\njordan2024position,\ntitle={Position: Benchmarking is Limited in Reinforcement Learning Research},\nauthor={Scott M. Jordan and Adam White and Bruno Castro da Silva and Martha White and Philip S. Thomas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Xe7n2ZqpBP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1182771, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2444194218186661914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ualberta.ca;umass.edu;ualberta.ca;cs.umass.edu;ualberta.ca", "author_num": 5, "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "University of Alberta;University of Massachusetts Amherst", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://www.umass.edu", "aff_unique_abbr": "UAlberta;UMass Amherst", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;0;1;0", "aff_country_unique": "Canada;United States" }, { "title": "FedCal: Achieving Local and Global Calibration in Federated Learning via Aggregated Parameterized Scaler", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33813", "id": "XecUTmB9yD", "proceeding": "https://proceedings.mlr.press/v235/peng24g.html", "pdf": "https://openreview.net/pdf?id=XecUTmB9yD", "openreview": "https://openreview.net/forum?id=XecUTmB9yD", "author_site": "Hongyi Peng, Han Yu, Xiaoli Tang, Xiaoxiao Li", "tldr": "", "abstract": "Federated learning (FL) enables collaborative machine learning across distributed data owners, but data heterogeneity poses a challenge for model calibration. While prior work focused on improving accuracy for non-iid data, calibration remains under-explored. This study reveals existing FL aggregation approaches lead to sub-optimal calibration, and theoretical analysis shows despite constraining variance in clients\u2019 label distributions, global calibration error is still asymptotically lower bounded. To address this, we propose a novel Federated Calibration (FedCal) approach, emphasizing both local and global calibration. It leverages client-specific scalers for local calibration to effectively correct output misalignment without sacrificing prediction accuracy. 
These scalers are then aggregated via weight averaging to generate a global scaler, minimizing the global calibration error. Extensive experiments demonstrate that FedCal significantly outperforms the best-performing baseline, reducing global calibration error by 47.66% on average.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongyi Peng;Han Yu;Xiaoli Tang;Xiaoxiao Li", "authorids": "~Hongyi_Peng1;~Han_Yu1;~Xiaoli_Tang1;~Xiaoxiao_Li1", "gender": "M;M;F;Unspecified", "homepage": ";https://sites.google.com/site/hanyushomepage/home;https://scholar.google.com.sg/citations?user=Azooe2AAAAAJ&hl=en&oi=ao;https://xxlya.github.io/", "dblp": ";35/1096-1;;71/8042", "google_scholar": ";https://scholar.google.com.sg/citations?hl=en;https://scholar.google.com.sg/citations?user=Azooe2AAAAAJ;sdENOQ4AAAAJ", "orcid": ";0000-0001-6893-8650;;", "linkedin": "hongyi-peng-63774a119/?originalSubdomain=sg;;;", "or_profile": "~Hongyi_Peng1;~Han_Yu1;~Xiaoli_Tang1;~Xiaoxiao_Li1", "aff": "Nanyang Technological University;Nanyang Technological University;Nanyang Technological University;University of British Columbia", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ece.ubc.ca", "position": "PhD student;Associate Professor;PhD student;Assistant Professor", "bibtex": "@inproceedings{\npeng2024fedcal,\ntitle={FedCal: Achieving Local and Global Calibration in Federated Learning via Aggregated Parameterized Scaler},\nauthor={Hongyi Peng and Han Yu and Xiaoli Tang and Xiaoxiao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XecUTmB9yD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3030930, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12971552358847514872&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg;ece.ubc.ca", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Nanyang Technological University;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.ubc.ca", "aff_unique_abbr": "NTU;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Singapore;Canada" }, { "title": "On the Minimal Degree Bias in Generalization on the Unseen for non-Boolean Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33812", "id": "Xeh8171Fce", "proceeding": "https://proceedings.mlr.press/v235/pushkin24a.html", "pdf": "https://openreview.net/pdf?id=Xeh8171Fce", "openreview": "https://openreview.net/forum?id=Xeh8171Fce", "author_site": "Denys Pushkin, Rapha\u00ebl Berthier, Emmanuel Abbe", "tldr": "", "abstract": "We investigate the out-of-domain generalization of random feature (RF) models and Transformers. We first prove that in the `generalization on the unseen (GOTU)' setting, where training data is fully seen in some part of the domain but testing is made on another part, and for RF models in the small feature regime, the convergence takes place to interpolators of minimal degree as in the Boolean case (Abbe et al., 2023). We then consider the sparse target regime and explain how this regime relates to the small feature regime, but with a different regularization term that can alter the picture in the non-Boolean case. 
We show two different outcomes for the sparse regime with q-ary data tokens: (1) if the data is embedded with roots of unities, then a min-degree interpolator is learned like in the Boolean case for RF models, (2) if the data is not embedded as such, e.g., simply as integers, then RF models and Transformers may not learn minimal degree interpolators. This shows that the Boolean setting and its roots of unities generalization are special cases where the minimal degree interpolator offers a rare characterization of how learning takes place. For more general integer and real-valued settings, a more nuanced picture remains to be fully characterized.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Denys Pushkin;Rapha\u00ebl Berthier;Emmanuel Abbe", "authorids": "~Denys_Pushkin1;~Rapha\u00ebl_Berthier1;~Emmanuel_Abbe1", "gender": "M;;", "homepage": ";https://raphael-berthier.github.io/;", "dblp": "304/8769;205/3030;84/5016", "google_scholar": ";ZLCLbSQAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Denys_Pushkin1;~Rapha\u00ebl_Berthier1;~Emmanuel_Abbe1", "aff": "School of Computer and Communication Sciences, EPFL - EPF Lausanne;;Swiss Federal Institute of Technology Lausanne", "aff_domain": "ic.epfl.ch;;epfl.ch", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\npushkin2024on,\ntitle={On the Minimal Degree Bias in Generalization on the Unseen for non-Boolean Functions},\nauthor={Denys Pushkin and Rapha{\\\"e}l Berthier and Emmanuel Abbe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Xeh8171Fce}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 668727, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NUo1RP8MjhsJ:scholar.google.com/&scioq=On+the+Minimal+Degree+Bias+in+Generalization+on+the+Unseen+for+non-Boolean+Functions&hl=en&as_sdt=0,33", "gs_version_total": 11, "email": "ic.epfl.ch;;epfl.ch", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": "School of Computer and Communication Sciences;", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Graph Structure Extrapolation for Out-of-Distribution Generalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33811", "id": "Xgrey8uQhr", "proceeding": "https://proceedings.mlr.press/v235/li24y.html", "pdf": "https://openreview.net/pdf?id=Xgrey8uQhr", "openreview": "https://openreview.net/forum?id=Xgrey8uQhr", "author_site": "Xiner Li, Shurui Gui, Youzhi Luo, Shuiwang Ji", "tldr": "", "abstract": "Out-of-distribution (OOD) generalization deals with the prevalent learning scenario where test distribution shifts from training distribution. With rising application demands and inherent complexity, graph OOD problems call for specialized solutions. While data-centric methods exhibit performance enhancements on many generic machine learning tasks, there is a notable absence of data augmentation methods tailored for graph OOD generalization. In this work, we propose to achieve graph OOD generalization with the novel design of non-Euclidean-space linear extrapolation. 
The proposed augmentation strategy extrapolates structure spaces to generate OOD graph data. Our design tailors OOD samples for specific shifts without corrupting underlying causal mechanisms. Theoretical analysis and empirical results evidence the effectiveness of our method in solving target shifts, showing substantial and constant improvements across various graph OOD tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiner Li;Shurui Gui;Youzhi Luo;Shuiwang Ji", "authorids": "~Xiner_Li1;~Shurui_Gui1;~Youzhi_Luo1;~Shuiwang_Ji1", "gender": "F;M;M;M", "homepage": ";https://cm-bf.github.io;https://lyzustc.github.io/;http://people.tamu.edu/~sji", "dblp": "267/6459;272/0674.html;280/0590;84/6405", "google_scholar": "bBQx_5MAAAAJ;U4AjtOkAAAAJ;3lqQFIoAAAAJ;BZGj6sAAAAAJ", "orcid": ";;0000-0002-3763-0239;0000-0002-4205-4563", "linkedin": ";;youzhi-luo-139981172/;shuiwang-ji-9a040715/", "or_profile": "~Xiner_Li1;~Shurui_Gui1;~Youzhi_Luo1;~Shuiwang_Ji1", "aff": "Texas A&M University - College Station;Texas A&M University;Texas A&M University;Texas A&M University", "aff_domain": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "position": "PhD student;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nli2024graph,\ntitle={Graph Structure Extrapolation for Out-of-Distribution Generalization},\nauthor={Xiner Li and Shurui Gui and Youzhi Luo and Shuiwang Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Xgrey8uQhr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3252222, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16926063966415278679&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 4, "email": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0", "aff_campus_unique": "College Station;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "LeaPformer: Enabling Linear Transformers for Autoregressive and Simultaneous Tasks via Learned Proportions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33810", "id": "XhH1OKLANY", "proceeding": "https://proceedings.mlr.press/v235/agostinelli-iii24a.html", "pdf": "https://openreview.net/pdf?id=XhH1OKLANY", "openreview": "https://openreview.net/forum?id=XhH1OKLANY", "author_site": "Victor Agostinelli III, Sanghyun Hong, Lizhong Chen", "tldr": "", "abstract": "A promising approach to preserving model performance in linearized transformers is to employ position-based re-weighting functions. However, state-of-the-art re-weighting functions rely heavily on target sequence lengths, making it difficult or impossible to apply them to autoregressive and simultaneous tasks, where the target and sometimes even the input sequence length are unknown. To address this issue, we propose Learned Proportions (LeaP) and LeaPformers. Our contribution is built on two major components. First, we generalize the dependence on explicit positional representations and sequence lengths into dependence on sequence proportions for re-weighting. 
Second, we replace static positional representations with dynamic proportions derived via a compact module, enabling more flexible attention concentration patterns. We evaluate LeaPformer against eight representative efficient transformers on the Long-Range Arena benchmark, where we show that LeaPformer achieves the best quality-throughput trade-off, as well as apply LeaPformer to Wikitext-103b autoregressive language modeling and simultaneous speech-to-text translation for two language pairs, achieving competitive results in both tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Victor Agostinelli III;Sanghyun Hong;Lizhong Chen", "authorids": "~Victor_Agostinelli_III1;~Sanghyun_Hong1;~Lizhong_Chen2", "gender": ";M;M", "homepage": ";http://www.sanghyun-hong.com;https://web.engr.oregonstate.edu/~chenliz/", "dblp": ";135/8991;78/4756", "google_scholar": "1mrboE4AAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;", "linkedin": "victor-agostinelli/;;", "or_profile": "~Victor_Agostinelli_III1;~Sanghyun_Hong1;~Lizhong_Chen2", "aff": ", Oregon State University;Oregon State University;Oregon State University", "aff_domain": "eecs.oregonstate.edu;oregonstate.edu;oregonstate.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\niii2024leapformer,\ntitle={LeaPformer: Enabling Linear Transformers for Autoregressive and Simultaneous Tasks via Learned Proportions},\nauthor={Victor Agostinelli III and Sanghyun Hong and Lizhong Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XhH1OKLANY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 839567, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DYbgu_a7cQ4J:scholar.google.com/&scioq=LeaPformer:+Enabling+Linear+Transformers+for+Autoregressive+and+Simultaneous+Tasks+via+Learned+Proportions&hl=en&as_sdt=0,33", "gs_version_total": 9, "email": "eecs.oregonstate.edu;oregonstate.edu;oregonstate.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Oregon State University", "aff_unique_dep": "", "aff_unique_url": "https://oregonstate.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Neuro-Visualizer: A Novel Auto-Encoder-Based Loss Landscape Visualization Method With an Application in Knowledge-Guided Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33809", "id": "XiemSZpvh0", "proceeding": "https://proceedings.mlr.press/v235/elhamod24a.html", "pdf": "https://openreview.net/pdf?id=XiemSZpvh0", "openreview": "https://openreview.net/forum?id=XiemSZpvh0", "author_site": "Mohannad Elhamod, Anuj Karpatne", "tldr": "", "abstract": "In recent years, there has been a growing interest in visualizing the loss landscape of neural networks. Linear landscape visualization methods, such as principal component analysis, have become widely used as they intuitively help researchers study neural networks and their training process. However, these linear methods suffer from limitations and drawbacks due to their lack of flexibility and low fidelity at representing the high dimensional landscape. 
In this paper, we present a novel auto-encoder-based non-linear landscape visualization method called Neuro-Visualizer that addresses these shortcomings and provides useful insights about neural network loss landscapes. To demonstrate its potential, we run experiments on a variety of problems in two separate applications of knowledge-guided machine learning (KGML). Our findings show that Neuro-Visualizer outperforms other linear and non-linear baselines and helps corroborate, and sometimes challenge, claims proposed by the machine learning community. All code and data used in the experiments of this paper can be found at the link below.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohannad Elhamod;Anuj Karpatne", "authorids": "~Mohannad_Elhamod1;~Anuj_Karpatne1", "gender": "Not Specified;", "homepage": "https://wordpress.cs.vt.edu/elhamod/;http://people.cs.vt.edu/karpatne/", "dblp": ";09/9720", "google_scholar": "6xcT3TUAAAAJ;", "orcid": "0000-0002-2383-947X;", "linkedin": "mohannadelhamod/;", "or_profile": "~Mohannad_Elhamod1;~Anuj_Karpatne1", "aff": ";Virginia Polytechnic Institute and State University", "aff_domain": ";vt.edu", "position": ";Associate Professor", "bibtex": "@inproceedings{\nelhamod2024neurovisualizer,\ntitle={Neuro-Visualizer: A Novel Auto-Encoder-Based Loss Landscape Visualization Method With an Application in Knowledge-Guided Machine Learning},\nauthor={Mohannad Elhamod and Anuj Karpatne},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XiemSZpvh0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2429023, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wdON5ACYZSwJ:scholar.google.com/&scioq=Neuro-Visualizer:+A+Novel+Auto-Encoder-Based+Loss+Landscape+Visualization+Method+With+an+Application+in+Knowledge-Guided+Machine+Learning&hl=en&as_sdt=0,47", "gs_version_total": 4, "email": ";vt.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Virginia Tech", "aff_unique_dep": "", "aff_unique_url": "https://www.vt.edu", "aff_unique_abbr": "VT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "A Closer Look at the Limitations of Instruction Tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33808", "id": "XkHJo8iXGQ", "proceeding": "https://proceedings.mlr.press/v235/ghosh24a.html", "pdf": "https://openreview.net/pdf?id=XkHJo8iXGQ", "openreview": "https://openreview.net/forum?id=XkHJo8iXGQ", "author_site": "Sreyan Ghosh, Chandra Kiran Evuru, Sonal Kumar, Ramaneswaran S, Deepali Aneja, Zeyu Jin, Ramani Duraiswami, Dinesh Manocha", "tldr": "", "abstract": "Instruction Tuning (IT), the process of training large language models (LLMs) using instruction-response pairs, has emerged as the predominant method for transforming base pre-trained LLMs into open-domain conversational agents. While IT has achieved notable success and widespread adoption, its limitations and shortcomings remain underexplored. In this paper, through rigorous experiments and an in-depth analysis of the changes LLMs undergo through IT, we reveal various limitations of IT. In particular, we show that (1) IT fails to enhance knowledge or skills in LLMs. LoRA fine-tuning is limited to learning response initiation and style tokens, and full-parameter fine-tuning leads to knowledge degradation. 
(2) Copying response patterns from IT datasets derived from knowledgeable sources leads to a decline in response quality. (3) Full-parameter fine-tuning increases hallucination by inaccurately borrowing tokens from conceptually similar instances in the IT dataset for generating responses. (4) Popular methods to improve IT do not lead to performance improvements over a simple LoRA fine-tuned model. Our findings reveal that responses generated solely from pre-trained knowledge consistently outperform responses by models that learn any form of new knowledge from IT on open-source datasets. We hope the insights and challenges revealed in this paper inspire future work in related directions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sreyan Ghosh;Chandra Kiran Reddy Evuru;Sonal Kumar;Ramaneswaran S;Deepali Aneja;Zeyu Jin;Ramani Duraiswami;Dinesh Manocha", "authorids": "~Sreyan_Ghosh1;~Chandra_Kiran_Reddy_Evuru1;~Sonal_Kumar1;~Ramaneswaran_S1;~Deepali_Aneja2;~Zeyu_Jin2;~Ramani_Duraiswami1;~Dinesh_Manocha3", "gender": "M;M;M;M;F;M;M;M", "homepage": "https://sreyan88.github.io/;;https://sonalkum.github.io;;https://research.adobe.com/person/deepali-aneja/;https://research.adobe.com/person/zeyu-jin/;http://www.umiacs.umd.edu/~ramani/;https://www.cs.umd.edu/people/dmanocha", "dblp": "173/5626;355/1221;;;;;d/RamaniDuraiswami;m/DineshManocha", "google_scholar": "5HKZJHAAAAAJ;;jiJ2DcEAAAAJ;YIhHxbwAAAAJ;cq2iIuUAAAAJ;R-PFLHMAAAAJ;GNEcpkAAAAAJ;X08l_4IAAAAJ", "orcid": ";;;;;;0000-0002-5596-8460;0000-0001-7047-9801", "linkedin": ";ckevuru/;realsonalkumar/;;;;ramani-duraiswami-32a50b1/;dinesh-manocha-2311846", "or_profile": "~Sreyan_Ghosh1;~Chandra_Kiran_Reddy_Evuru1;~Sonal_Kumar1;~Ramaneswaran_S1;~Deepali_Aneja2;~Zeyu_Jin2;~Ramani_Duraiswami1;~Dinesh_Manocha3", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;NVIDIA;;Adobe Systems;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umd.edu;nvidia.com;;adobe.com;umd.edu;umd.edu", "position": "PhD student;MS student;PhD student;Researcher;;Researcher;Full Professor;Professor", "bibtex": "@inproceedings{\nghosh2024a,\ntitle={A Closer Look at the Limitations of Instruction Tuning},\nauthor={Sreyan Ghosh and Chandra Kiran Reddy Evuru and Sonal Kumar and Ramaneswaran S and Deepali Aneja and Zeyu Jin and Ramani Duraiswami and Dinesh Manocha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XkHJo8iXGQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2770724, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=278858099066880908&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "umd.edu;umd.edu;umd.edu;nvidia.com;;adobe.com;umd.edu;umd.edu", "author_num": 8, "aff_unique_index": "0;0;0;1;2;0;0", "aff_unique_norm": "University of Maryland;NVIDIA;Adobe", "aff_unique_dep": ";NVIDIA Corporation;Adobe Systems Incorporated", "aff_unique_url": "https://www/umd.edu;https://www.nvidia.com;https://www.adobe.com", "aff_unique_abbr": "UMD;NVIDIA;Adobe", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "MAGNOLIA: Matching Algorithms via GNNs for Online Value-to-go Approximation", "status": "Poster", 
"track": "main", "site": "https://icml.cc/virtual/2024/poster/33807", "id": "XlgeQ47Ra9", "proceeding": "https://proceedings.mlr.press/v235/hayderi24a.html", "pdf": "https://openreview.net/pdf?id=XlgeQ47Ra9", "openreview": "https://openreview.net/forum?id=XlgeQ47Ra9", "author_site": "Alexandre Hayderi, Amin Saberi, Ellen Vitercik, Anders Wikum", "tldr": "", "abstract": "Online Bayesian bipartite matching is a central problem in digital marketplaces and exchanges, including advertising, crowdsourcing, ridesharing, and kidney exchange. We introduce a graph neural network (GNN) approach that emulates the problem's combinatorially-complex optimal online algorithm, which selects actions (e.g., which nodes to match) by computing each action's *value-to-go (VTG)*\u2014the expected weight of the final matching if the algorithm takes that action, then acts optimally in the future. We train a GNN to estimate VTG and show empirically that this GNN returns high-weight matchings across a variety of tasks. Moreover, we identify a common family of graph distributions in spatial crowdsourcing applications, such as rideshare, under which VTG can be efficiently approximated by aggregating information within local neighborhoods in the graphs. This structure matches the local behavior of GNNs, providing theoretical justification for our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexandre Hayderi;Amin Saberi;Ellen Vitercik;Anders Wikum", "authorids": "alexhay@stanford.edu;~Amin_Saberi1;~Ellen_Vitercik1;~Anders_Wikum1", "gender": ";;F;M", "homepage": ";https://www.stanford.edu/~saberi;https://vitercik.github.io/;", "dblp": ";28/4017;160/8900;", "google_scholar": ";;6iUjvyMAAAAJ;", "orcid": ";;;", "linkedin": ";;;anders-wikum", "or_profile": "alexhay@stanford.edu;~Amin_Saberi1;~Ellen_Vitercik1;~Anders_Wikum1", "aff": ";Stanford University;Stanford University;Stanford University", "aff_domain": ";stanford.edu;stanford.edu;stanford.edu", "position": ";Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nhayderi2024magnolia,\ntitle={{MAGNOLIA}: Matching Algorithms via {GNN}s for Online Value-to-go Approximation},\nauthor={Alexandre Hayderi and Amin Saberi and Ellen Vitercik and Anders Wikum},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XlgeQ47Ra9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 777864, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5hPMbREty-gJ:scholar.google.com/&scioq=MAGNOLIA:+Matching+Algorithms+via+GNNs+for+Online+Value-to-go+Approximation&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": ";stanford.edu;stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Generative Marginalization Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33806", "id": "XmLNDlQuzO", "proceeding": "https://proceedings.mlr.press/v235/liu24az.html", "pdf": "https://openreview.net/pdf?id=XmLNDlQuzO", "openreview": "https://openreview.net/forum?id=XmLNDlQuzO", "author_site": "Sulin Liu, Peter Ramadge, 
Ryan P. Adams", "tldr": "", "abstract": "We introduce *marginalization models* (MAMs), a new family of generative models for high-dimensional discrete data. They offer scalable and flexible generative modeling by explicitly modeling all induced marginal distributions. Marginalization models enable fast approximation of arbitrary marginal probabilities with a single forward pass of the neural network, which overcomes a major limitation of arbitrary marginal inference models, such as any-order autoregressive models. MAMs also address the scalability bottleneck encountered in training any-order generative models for high-dimensional problems under the context of *energy-based training*, where the goal is to match the learned distribution to a given desired probability (specified by an unnormalized log-probability function such as energy or reward function). We propose scalable methods for learning the marginals, grounded in the concept of \"*marginalization self-consistency*\". We demonstrate the effectiveness of the proposed model on a variety of discrete data distributions, including images, text, physical systems, and molecules, for *maximum likelihood* and *energy-based training* settings. MAMs achieve orders of magnitude speedup in evaluating the marginal probabilities on both settings. For energy-based training tasks, MAMs enable any-order generative modeling of high-dimensional problems beyond the scale of previous methods. Code is available at github.com/PrincetonLIPS/MaM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sulin Liu;Peter Ramadge;Ryan P Adams", "authorids": "~Sulin_Liu1;~Peter_Ramadge1;~Ryan_P_Adams1", "gender": "M;M;M", "homepage": "https://liusulin.github.io/;http://ee.princeton.edu/people/faculty/peter-j-ramadge;http://www.cs.princeton.edu/~rpa/", "dblp": "192/1289;77/3256;32/909", "google_scholar": "s3NlgA4AAAAJ;BOMboVoAAAAJ;grQ_GBgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sulin_Liu1;~Peter_Ramadge1;~Ryan_P_Adams1", "aff": "Massachusetts Institute of Technology;Princeton University;Princeton University", "aff_domain": "mit.edu;princeton.edu;princeton.edu", "position": "Postdoc;Full Professor;Professor", "bibtex": "@inproceedings{\nliu2024generative,\ntitle={Generative Marginalization Models},\nauthor={Sulin Liu and Peter Ramadge and Ryan P Adams},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XmLNDlQuzO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4563379, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18323067373203796909&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 10, "email": "mit.edu;princeton.edu;princeton.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.princeton.edu", "aff_unique_abbr": "MIT;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Pseudo-Calibration: Improving Predictive Uncertainty Estimation in Unsupervised Domain Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33805", "id": "XnsI1HKAKC", "proceeding": "https://proceedings.mlr.press/v235/hu24i.html", "pdf": 
"https://openreview.net/pdf?id=XnsI1HKAKC", "openreview": "https://openreview.net/forum?id=XnsI1HKAKC", "author_site": "Dapeng Hu, Jian Liang, Xinchao Wang, Chuan-Sheng Foo", "tldr": "", "abstract": "Unsupervised domain adaptation (UDA) has seen substantial efforts to improve model accuracy for an unlabeled target domain with the help of a labeled source domain. However, UDA models often exhibit poorly calibrated predictive uncertainty on target data, a problem that remains under-explored and poses risks in safety-critical UDA applications. The calibration problem in UDA is particularly challenging due to the absence of labeled target data and severe distribution shifts between domains. In this paper, we approach UDA calibration as a target-domain-specific unsupervised problem, different from mainstream solutions based on *covariate shift*. We introduce Pseudo-Calibration (PseudoCal), a novel post-hoc calibration framework. Our innovative use of inference-stage *mixup* synthesizes a labeled pseudo-target set capturing the structure of the real unlabeled target data. This turns the unsupervised calibration problem into a supervised one, easily solvable with *temperature scaling*. Extensive empirical evaluations across 5 diverse UDA scenarios involving 10 UDA methods consistently demonstrate the superior performance and versatility of PseudoCal over existing solutions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dapeng Hu;Jian Liang;Xinchao Wang;Chuan-Sheng Foo", "authorids": "~Dapeng_Hu2;~Jian_Liang1;~Xinchao_Wang1;~Chuan-Sheng_Foo1", "gender": "M;M;M;M", "homepage": "https://lhxxhb.github.io/;https://liangjian.xyz;http://ai.stanford.edu/~csfoo;https://sites.google.com/site/sitexinchaowang/", "dblp": "247/3382;19/2208-1;73/1823;", "google_scholar": "wv9HjA0AAAAJ;https://scholar.google.com/citations?hl=en;AgbeqGkAAAAJ;https://scholar.google.com.tw/citations?user=w69Buq0AAAAJ", "orcid": ";0000-0003-3890-1894;0000-0002-4748-5792;", "linkedin": ";;;", "or_profile": "~Dapeng_Hu2;~Jian_Liang1;~Chuan-Sheng_Foo1;~Xinchao_WANG3", "aff": "Apple;Institute of Automation, Chinese Academy of Sciences;Institute for Infocomm Research, A*STAR;National University of Singapore", "aff_domain": "apple.com;ia.ac.cn;i2r.a-star.edu.sg;nus.edu", "position": "Siri Engineer;Associate Professor;Principal Scientist;Assistant Professor", "bibtex": "@inproceedings{\nhu2024pseudocalibration,\ntitle={Pseudo-Calibration: Improving Predictive Uncertainty Estimation in Unsupervised Domain Adaptation},\nauthor={Dapeng Hu and Jian Liang and Xinchao Wang and Chuan-Sheng Foo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XnsI1HKAKC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2397238, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12813593997776972723&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "apple.com;ia.ac.cn;i2r.a-star.edu.sg;nus.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Apple;Chinese Academy of Sciences;Institute for Infocomm Research;National University of Singapore", "aff_unique_dep": "Apple Inc.;Institute of Automation;;", "aff_unique_url": "https://www.apple.com;http://www.ia.cas.cn;https://www.i2r.a-star.edu.sg;https://www.nus.edu.sg", "aff_unique_abbr": "Apple;CAS;I2R;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;2;2", "aff_country_unique": "United States;China;Singapore" }, { "title": "How to Make the Gradients Small Privately: Improved Rates for Differentially Private Non-Convex Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33804", "id": "XoSF46Pc2e", "proceeding": "https://proceedings.mlr.press/v235/lowy24b.html", "pdf": "https://openreview.net/pdf?id=XoSF46Pc2e", "openreview": "https://openreview.net/forum?id=XoSF46Pc2e", "author_site": "Andrew Lowy, Jonathan Ullman, Stephen Wright", "tldr": "", "abstract": "We provide a simple and flexible framework for designing differentially private algorithms to find approximate stationary points of non-convex loss functions. Our framework is based on using a private approximate risk minimizer to \"warm start\" another private algorithm for finding stationary points. We use this framework to obtain improved, and sometimes optimal, rates for several classes of non-convex loss functions. First, we obtain improved rates for finding stationary points of smooth non-convex empirical loss functions. Second, we specialize to quasar-convex functions, which generalize star-convex functions and arise in learning dynamical systems and training some neural nets. We achieve the optimal rate for this class. Third, we give an optimal algorithm for finding stationary points of functions satisfying the Kurdyka-Lojasiewicz (KL) condition. For example, over-parameterized neural networks often satisfy this condition. Fourth, we provide new state-of-the-art rates for stationary points of non-convex population loss functions. Fifth, we obtain improved rates for non-convex generalized linear models. A modification of our algorithm achieves nearly the same rates for second-order stationary points of functions with Lipschitz Hessian, improving over the previous state-of-the-art for each of the above problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Lowy;Jonathan Ullman;Stephen Wright", "authorids": "~Andrew_Lowy1;~Jonathan_Ullman1;~Stephen_Wright1", "gender": ";M;M", "homepage": "https://sites.google.com/view/andrewlowy;https://jonathan-ullman.github.io/;https://wrightstephen.github.io/sw_proj/", "dblp": "285/5314;02/8164;75/2677", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=WfS41RAAAAAJ;VFQRIOwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Andrew_Lowy1;~Jonathan_Ullman1;~Stephen_Wright1", "aff": "University of Wisconsin - Madison;Northeastern University;University of Wisconsin, Madison", "aff_domain": "wisc.edu;northeastern.edu;wisc.edu", "position": "Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nlowy2024how,\ntitle={How to Make the Gradients Small Privately: Improved Rates for Differentially Private Non-Convex Optimization},\nauthor={Andrew Lowy and Jonathan Ullman and Stephen Wright},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XoSF46Pc2e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1248391, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3939322659380279306&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "wisc.edu;northeastern.edu;wisc.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of 
Wisconsin-Madison;Northeastern University;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.northeastern.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;NEU;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Value-Evolutionary-Based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33803", "id": "XobPpcN4yZ", "proceeding": "https://proceedings.mlr.press/v235/li24z.html", "pdf": "https://openreview.net/pdf?id=XobPpcN4yZ", "openreview": "https://openreview.net/forum?id=XobPpcN4yZ", "author_site": "Pengyi Li, Jianye Hao, Hongyao Tang, Yan Zheng, Fazl Barez", "tldr": "", "abstract": "Combining Evolutionary Algorithms (EAs) and Reinforcement Learning (RL) for policy search has been proven to improve RL performance. However, previous works largely overlook value-based RL in favor of merging EAs with policy-based RL. This paper introduces Value-Evolutionary-Based Reinforcement Learning (VEB-RL) that focuses on the integration of EAs with value-based RL. The framework maintains a population of value functions instead of policies and leverages negative Temporal Difference error as the fitness metric for evolution. The metric is more sample-efficient for population evaluation than cumulative rewards and is closely associated with the accuracy of the value function approximation. Additionally, VEB-RL enables elites of the population to interact with the environment to offer high-quality samples for RL optimization, whereas the RL value function participates in the population's evolution in each generation. Experiments on MinAtar and Atari demonstrate the superiority of VEB-RL in significantly improving DQN, Rainbow, and SPR. 
Our code is available at https://github.com/yeshenpy/VEB-RL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengyi Li;Jianye HAO;Hongyao Tang;YAN ZHENG;Fazl Barez", "authorids": "~Pengyi_Li1;~Jianye_HAO1;~Hongyao_Tang1;~YAN_ZHENG1;~Fazl_Barez1", "gender": "M;M;M;M;", "homepage": "https://yeshenpy.github.io/;http://www.icdai.org/jianye.html;https://bluecontra.github.io/;https://yanzzzzz.github.io;", "dblp": "195/6948;21/7664.html;220/4275;10/2381-2;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;yIqzRH4AAAAJ;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;", "orcid": "0009-0009-8546-2346;0000-0002-0422-8235;;;", "linkedin": ";;;;", "or_profile": "~Pengyi_Li1;~Jianye_HAO1;~Hongyao_Tang1;~YAN_ZHENG1;~Fazl_Barez1", "aff": "Tianjin University;Tianjin University;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Tianjin University, China;", "aff_domain": "tju.edu.cn;tju.edu.cn;mila.umontreal.ca;tju.edu.cn;", "position": "PhD student;Associate Professor;Postdoc;Associate Professor;", "bibtex": "@inproceedings{\nli2024valueevolutionarybased,\ntitle={Value-Evolutionary-Based Reinforcement Learning},\nauthor={Pengyi Li and Jianye HAO and Hongyao Tang and YAN ZHENG and Fazl Barez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XobPpcN4yZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1246167, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9541402201241565429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tju.edu.cn;tju.edu.cn;mila.umontreal.ca;tju.edu.cn;", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tianjin University;University of Montreal", "aff_unique_dep": ";Montreal Institute for Learning Algorithms", "aff_unique_url": "http://www.tju.edu.cn;https://www.mila.quebec", "aff_unique_abbr": "TJU;MILA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;Canada" }, { "title": "Non-confusing Generation of Customized Concepts in Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33802", "id": "XoencoHWy7", "proceeding": "https://proceedings.mlr.press/v235/lin24d.html", "pdf": "https://openreview.net/pdf?id=XoencoHWy7", "openreview": "https://openreview.net/forum?id=XoencoHWy7", "author_site": "Wang Lin, Jingyuan CHEN, Jiaxin Shi, Yichen Zhu, Chen Liang, Junzhong Miao, Tao Jin, Zhou Zhao, Fei Wu, Shuicheng YAN, Hanwang Zhang", "tldr": "", "abstract": "We tackle the common challenge of inter-concept visual confusion in compositional concept generation using text-guided diffusion models (TGDMs). It becomes even more pronounced in the generation of customized concepts, due to the scarcity of user-provided concept visual examples. By revisiting the two major stages leading to the success of TGDMs---1) contrastive image-language pre-training (CLIP) for the text encoder that encodes visual semantics, and 2) training TGDM that decodes the textual embeddings into pixels---we point out that existing customized generation methods only focus on fine-tuning the second stage while overlooking the first one. To this end, we propose a simple yet effective solution called CLIF: contrastive image-language fine-tuning. 
Specifically, given a few samples of customized concepts, we obtain non-confusing textual embeddings of a concept by fine-tuning CLIP via contrasting a concept and the over-segmented visual regions of other concepts. Experimental results demonstrate the effectiveness of CLIF in preventing the confusion of multi-customized concept generation. Project page: https://clif-official.github.io/clif.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wang Lin;Jingyuan Chen;Jiaxin Shi;Yichen Zhu;Chen Liang;Junzhong Miao;Tao Jin;Zhou Zhao;Fei Wu;Shuicheng YAN;Hanwang Zhang", "authorids": "~Wang_Lin2;~Jingyuan_Chen3;~Jiaxin_Shi3;~Yichen_Zhu2;~Chen_Liang18;~Junzhong_Miao1;~Tao_Jin2;~Zhou_Zhao3;~Fei_Wu2;~Shuicheng_YAN3;~Hanwang_Zhang3", "gender": ";;M;M;M;M;M;;;M;M", "homepage": ";;;https://github.com/Echen-Zhu;https://homepage.lliangchenc.com;;https://hugddygff.github.io/;;https://person.zju.edu.cn/wufei;https://yanshuicheng.ai/;https://mreallab.github.io/index.html", "dblp": ";;;;;274/2119.html;88/4850-4.html;;84/3254-1;y/ShuichengYan;79/8116.html", "google_scholar": ";;8XcQHUEAAAAJ;9K3a7T8AAAAJ;;;;;XJLn4MYAAAAJ;https://scholar.google.com.hk/citations?user=DNuiPHwAAAAJ;YG0DFyYAAAAJ", "orcid": ";;;;0000-0003-0579-2716;;0000-0003-3564-1628;;;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Wang_Lin2;~Jingyuan_Chen3;~Jiaxin_Shi3;~Yichen_Zhu2;~Chen_Liang18;~Junzhong_Miao1;~Tao_Jin2;~Zhou_Zhao3;~Fei_Wu2;~Shuicheng_YAN3;~Hanwang_Zhang3", "aff": ";;Huawei Technologies Ltd.;Zhejiang University;Tsinghua University;Harbin Institute of Technology;Zhejiang University;;Zhejiang University;sea Group;Nanyang Technological University", "aff_domain": ";;huawei.com;zju.edu.cn;mail.tsinghua.edu.cn;stu.hit.edu.cn;zju.edu.cn;;zju.edu.cn;sea.com;ntu.edu.sg", "position": ";;Researcher;MS student;PhD student;PhD student;Assistant Professor;;Full Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\nlin2024nonconfusing,\ntitle={Non-confusing Generation of Customized Concepts in Diffusion Models},\nauthor={Wang Lin and Jingyuan Chen and Jiaxin Shi and Yichen Zhu and Chen Liang and Junzhong Miao and Tao Jin and Zhou Zhao and Fei Wu and Shuicheng YAN and Hanwang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XoencoHWy7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9579634, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5815267691396980463&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;huawei.com;zju.edu.cn;mail.tsinghua.edu.cn;stu.hit.edu.cn;zju.edu.cn;;zju.edu.cn;sea.com;ntu.edu.sg", "author_num": 11, "aff_unique_index": "0;1;2;3;1;1;4;5", "aff_unique_norm": "Huawei;Zhejiang University;Tsinghua University;Harbin Institute of Technology;Sea Group;Nanyang Technological University", "aff_unique_dep": "Huawei Technologies;;;;;", "aff_unique_url": "https://www.huawei.com;https://www.zju.edu.cn;https://www.tsinghua.edu.cn;http://www.hit.edu.cn/;;https://www.ntu.edu.sg", "aff_unique_abbr": "Huawei;ZJU;THU;HIT;;NTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0;0;0;0;0;2", "aff_country_unique": "China;;Singapore" }, { "title": "How Learning by Reconstruction Produces Uninformative Features For Perception", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33801", "id": "XsDWw1Mn2p", 
"proceeding": "https://proceedings.mlr.press/v235/balestriero24b.html", "pdf": "https://openreview.net/pdf?id=XsDWw1Mn2p", "openreview": "https://openreview.net/forum?id=XsDWw1Mn2p", "author_site": "Randall Balestriero, Yann LeCun", "tldr": "", "abstract": "Input space reconstruction is an attractive representation learning paradigm. Despite interpretability benefit of reconstruction and generation, we identify a misalignment between learning to reconstruct, and learning for perception. We show that the former allocates a model's capacity towards a subspace of the data explaining the observed variance--a subspace with uninformative features for the latter. For example, the supervised TinyImagenet task with images projected onto the top subspace explaining 90% of the pixel variance can be solved with 45% test accuracy. Using the bottom subspace instead, accounting for only 20% of the pixel variance, reaches 55% test accuracy. Learning by reconstruction is also wasteful as the features for perception are learned last, pushing the need for long training schedules. We finally prove that learning by denoising can alleviate that misalignment for some noise strategies, e.g., masking. While tuning the noise strategy without knowledge of the perception task seems challenging, we provide a solution to detect if a noise strategy is never beneficial regardless of the perception task, e.g., additive Gaussian noise.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Randall Balestriero;Yann LeCun", "authorids": "~Randall_Balestriero1;~Yann_LeCun1", "gender": "M;M", "homepage": "https://randallbalestriero.github.io/;http://yann.lecun.com", "dblp": "175/5364;l/YannLeCun", "google_scholar": "S1x_xqcAAAAJ;WLN3QrAAAAAJ", "orcid": ";", "linkedin": "randallbalestriero/;", "or_profile": "~Randall_Balestriero1;~Yann_LeCun1", "aff": "Citadel;New York University", "aff_domain": "citadel.com;nyu.edu", "position": "Researcher;Full Professor", "bibtex": "@inproceedings{\nbalestriero2024how,\ntitle={How Learning by Reconstruction Produces Uninformative Features For Perception},\nauthor={Randall Balestriero and Yann LeCun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XsDWw1Mn2p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4064610, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5384539738238772343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "citadel.com;nyu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Citadel;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.citadel.edu;https://www.nyu.edu", "aff_unique_abbr": "Citadel;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Transferring Knowledge From Large Foundation Models to Small Downstream Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33800", "id": "XtDJaSe8jE", "proceeding": "https://proceedings.mlr.press/v235/qiu24d.html", "pdf": "https://openreview.net/pdf?id=XtDJaSe8jE", "openreview": "https://openreview.net/forum?id=XtDJaSe8jE", "author_site": "Shikai Qiu, Boran Han, Danielle Robinson, Shuai Zhang, Yuyang Wang, Andrew Wilson", "tldr": "", "abstract": "How do we transfer the relevant knowledge from ever larger foundation models into 
small, task-specific downstream models that can run at much lower costs? Standard transfer learning using pre-trained weights as the initialization transfers limited information and commits us to often massive pre-trained architectures. This procedure also precludes combining multiple pre-trained models that learn complementary information. To address these shortcomings, we introduce Adaptive Feature Transfer (AFT). Instead of transferring weights, AFT operates purely on features, thereby decoupling the choice of the pre-trained model from the smaller downstream model. Rather than indiscriminately compressing all pre-trained features, AFT adaptively transfers pre-trained features that are most useful for performing the downstream task, using a simple regularization that adds minimal overhead. Across multiple vision, language, and multi-modal datasets, AFT achieves significantly better downstream performance compared to alternatives with a similar computational cost. Furthermore, AFT reliably translates improvement in pre-trained models into improvement in downstream performance, even if the downstream model is over $50\\times$ smaller, and can effectively transfer complementary information learned by multiple pre-trained models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikai Qiu;Boran Han;Danielle C. Maddix;Shuai Zhang;Bernie Wang;Andrew Gordon Wilson", "authorids": "~Shikai_Qiu1;~Boran_Han1;~Danielle_C._Maddix1;~Shuai_Zhang7;~Bernie_Wang1;~Andrew_Gordon_Wilson1", "gender": "M;;;;M;Not Specified", "homepage": "https://shikaiqiu.github.io/;;https://dcmaddix.github.io/;;http://web.mit.edu/~ywang02/www/;https://cims.nyu.edu/~andrewgw", "dblp": ";;216/8804;;43/8355-1;65/10453", "google_scholar": "pK0OAsQAAAAJ;;IPDByA8AAAAJ;;IKUm624AAAAJ;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ", "orcid": ";;;;0000-0002-0291-7184;", "linkedin": ";;danielle-maddix-robinson/;;;", "or_profile": "~Shikai_Qiu1;~Boran_Han1;~Danielle_C._Maddix1;~Shuai_Zhang7;~Bernie_Wang1;~Andrew_Gordon_Wilson1", "aff": "New York University;;AWS AI Labs;;Amazon;New York University", "aff_domain": "nyu.edu;;amazon.com;;amazon.com;nyu.edu", "position": "PhD student;;Applied Scientist;;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nqiu2024transferring,\ntitle={Transferring Knowledge From Large Foundation Models to Small Downstream Models},\nauthor={Shikai Qiu and Boran Han and Danielle C. 
Maddix and Shuai Zhang and Bernie Wang and Andrew Gordon Wilson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XtDJaSe8jE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 570268, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=142512641903873876&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "nyu.edu;;amazon.com;;amazon.com;nyu.edu", "author_num": 6, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "New York University;Amazon", "aff_unique_dep": ";AWS AI Labs", "aff_unique_url": "https://www.nyu.edu;https://aws.amazon.com", "aff_unique_abbr": "NYU;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Autoencoding Conditional Neural Processes for Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33799", "id": "XuQPA4D396", "proceeding": "https://proceedings.mlr.press/v235/prokhorov24a.html", "pdf": "https://openreview.net/pdf?id=XuQPA4D396", "openreview": "https://openreview.net/forum?id=XuQPA4D396", "author_site": "Victor Prokhorov, Ivan Titov, Siddharth N", "tldr": "", "abstract": "Conditional neural processes (CNPs) are a flexible and efficient family of models that learn to learn a stochastic process from data. They have seen particular application in contextual image completion - observing pixel values at some locations to predict a distribution over values at other unobserved locations. However, the choice of pixels in learning CNPs is typically either random or derived from a simple statistical measure (e.g. pixel variance). Here, we turn the problem on its head and ask: which pixels would a CNP like to observe - do they facilitate fitting better CNPs, and do such pixels tell us something meaningful about the underlying image? To this end we develop the Partial Pixel Space Variational Autoencoder (PPS-VAE), an amortised variational framework that casts CNP context as latent variables learnt simultaneously with the CNP. We evaluate PPS-VAE over a number of tasks across different visual data, and find that not only can it facilitate better-fit CNPs, but also that the spatial arrangement and values meaningfully characterise image information - evaluated through the lens of classification on both within and out-of-data distributions. 
Our model additionally allows for dynamic adaptation of context-set size and the ability to scale up to larger images, providing a promising avenue to explore learning meaningful and effective visual representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Victor Prokhorov;Ivan Titov;Siddharth N", "authorids": "~Victor_Prokhorov1;~Ivan_Titov1;~Siddharth_N1", "gender": ";;M", "homepage": "https://victorprokhorov.github.io/;http://ivan-titov.org;https://homepages.inf.ed.ac.uk/snaraya3/", "dblp": "203/8964;08/5391;67/8366", "google_scholar": "https://scholar.google.co.uk/citations?user=IQlUyHEAAAAJ;https://scholar.google.nl/citations?user=FKUc3vsAAAAJ;V7D7hxMAAAAJ", "orcid": ";;0000-0003-4911-7333", "linkedin": ";;", "or_profile": "~Victor_Prokhorov1;~Ivan_Titov1;~Siddharth_N1", "aff": "University of Edinburgh, University of Edinburgh;University of Amsterdam;University of Edinburgh", "aff_domain": "ed.ac.uk;uva.nl;ed.ac.uk", "position": "Postdoc;Associate Professor;Reader (Associate Professor)", "bibtex": "@inproceedings{\nprokhorov2024autoencoding,\ntitle={Autoencoding Conditional Neural Processes for Representation Learning},\nauthor={Victor Prokhorov and Ivan Titov and Siddharth N},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XuQPA4D396}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9257375, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z-lUxrnbCV8J:scholar.google.com/&scioq=Autoencoding+Conditional+Neural+Processes+for+Representation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "ed.ac.uk;uva.nl;ed.ac.uk", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Edinburgh;University of Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://www.uva.nl", "aff_unique_abbr": "Edinburgh;UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;Netherlands" }, { "title": "Probability Distribution of Hypervolume Improvement in Bi-objective Bayesian Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33798", "id": "XvmooikuHE", "proceeding": "https://proceedings.mlr.press/v235/wang24ce.html", "pdf": "https://openreview.net/pdf?id=XvmooikuHE", "openreview": "https://openreview.net/forum?id=XvmooikuHE", "author_site": "Hao Wang, Kaifeng Yang, Michael Affenzeller", "tldr": "", "abstract": "Hypervolume improvement (HVI) is commonly employed in multi-objective Bayesian optimization algorithms to define acquisition functions due to its Pareto-compliant property. Rather than focusing on specific statistical moments of HVI, this work aims to provide the exact expression of HVI's probability distribution for bi-objective problems. Considering a bi-variate Gaussian random variable resulting from Gaussian process (GP) modeling, we derive the probability distribution of its hypervolume improvement via a cell partition-based method. Our exact expression is superior in numerical accuracy and computation efficiency compared to the Monte Carlo approximation of HVI's distribution. Utilizing this distribution, we propose a novel acquisition function, the $\\varepsilon$-probability of hypervolume improvement ($\\varepsilon$-PoHVI). 
Experimentally, we show that on many widely-applied bi-objective test problems, $\\varepsilon$-PoHVI significantly outperforms other related acquisition functions, e.g., $\\varepsilon$-PoI and expected hypervolume improvement, when the GP model exhibits large prediction uncertainty.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Wang;Kaifeng Yang;Michael Affenzeller", "authorids": "~Hao_Wang48;~Kaifeng_Yang1;~Michael_Affenzeller1", "gender": "M;M;M", "homepage": "https://www.universiteitleiden.nl/en/staffmembers/hao-wang#tab-1;https://heal.heuristiclab.com/team/yang;https://heal.heuristiclab.com/team/affenzeller", "dblp": "w/HaoWang-25;;79/3445", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.nl/citations?user=7WkpjGwAAAAJ;https://scholar.google.at/citations?user=ODP92iYAAAAJ", "orcid": "0000-0002-4933-5181;0000-0002-3353-3298;0000-0001-5692-5940", "linkedin": "hao-wang-leidenuniv/?original_referer=;;michael-affenzeller-b523058/", "or_profile": "~Hao_Wang48;~Kaifeng_Yang1;~Michael_Affenzeller1", "aff": "Leiden University, Leiden University;University of Applied Sciences Upper Austria;Fachhochschulstudieng\u00e4nge Hagenberg", "aff_domain": "liacs.leidenuniv.nl;fh-ooe.at;fh-hagenberg.at", "position": "Assistant Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nwang2024probability,\ntitle={Probability Distribution of Hypervolume Improvement in Bi-objective Bayesian Optimization},\nauthor={Hao Wang and Kaifeng Yang and Michael Affenzeller},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XvmooikuHE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1185699, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10768561851711283551&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "liacs.leidenuniv.nl;fh-ooe.at;fh-hagenberg.at", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Leiden University;University of Applied Sciences Upper Austria;Hagenberg Study Programs", "aff_unique_dep": ";;", "aff_unique_url": "https://www.universiteitleiden.nl;https://www.fh-ooe.at;https://www.fh-hagenberg.at", "aff_unique_abbr": "LU;FH Upper Austria;", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Leiden;;Hagenberg", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Netherlands;Austria" }, { "title": "Generalization Analysis of Stochastic Weight Averaging with General Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33797", "id": "XwVkqvyziD", "proceeding": "https://proceedings.mlr.press/v235/wang24bl.html", "pdf": "https://openreview.net/pdf?id=XwVkqvyziD", "openreview": "https://openreview.net/forum?id=XwVkqvyziD", "author_site": "Wang Peng, Li Shen, Zerui Tao, Shuaida He, Dacheng Tao", "tldr": "", "abstract": "The stochastic weight averaging (SWA) method has empirically proven its advantages compared to stochastic gradient descent (SGD). Despite its widespread use, theoretical investigations have been limited, particularly in scenarios beyond the ideal setting of convexity and sampling with replacement. However, non-convex cases and sampling without replacement are very practical in real-world applications. 
The main challenges under the above settings are two-fold: (i) all the historical gradient information introduced by SWA must be considered, while the analysis of SGD using the tool of uniform stability requires bounding only the current gradient; (ii) the $(1+\\alpha\\beta)$-expansion property makes the bound on each gradient step depend on the previous step, rendering the bounds on the historical gradients in SWA nested and the theoretical analysis even harder. To address these theoretical challenges, we adopt mathematical induction to find a recursive representation that bounds the gradient at each step. Based on this, we establish stability bounds supporting sampling with and without replacement in the non-convex setting. Furthermore, the derived generalization bounds of SWA are sharper than those of SGD. Finally, experimental results on several benchmarks verify our theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peng Wang;Li Shen;Zerui Tao;Shuaida He;Dacheng Tao", "authorids": "~Peng_Wang44;~Li_Shen1;~Zerui_Tao1;~Shuaida_He1;~Dacheng_Tao1", "gender": "M;M;;M;", "homepage": "https://www.researchgate.net/profile/Wang-Peng-20;https://sites.google.com/site/mathshenli/home;;;", "dblp": ";91/3680-8;296/4527;;", "google_scholar": ";yVhgENIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;", "orcid": ";;0009-0003-9230-721X;0000-0001-9821-0492;", "linkedin": ";;zerui-tao-9a3093117;;", "or_profile": "~Peng_Wang44;~Li_Shen1;~Zerui_Tao1;~Shuaida_He1;~Dacheng_Tao1", "aff": "Huazhong University of Science and Technology;JD Explore Academy;Tokyo University of Agriculture and Technology;Southern University of Science and Technology;", "aff_domain": "hust.edu.cn;jd.com;tuat.ac.jp;mail.sustech.edu.cn;", "position": "PhD student;Researcher;PhD student;PhD student;", "bibtex": "@inproceedings{\nwang2024generalization,\ntitle={Generalization Analysis of Stochastic Weight Averaging with General Sampling},\nauthor={Peng Wang and Li Shen and Zerui Tao and Shuaida He and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XwVkqvyziD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2472234, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14197315985767887744&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5, "email": "hust.edu.cn;jd.com;tuat.ac.jp;mail.sustech.edu.cn;", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Huazhong University of Science and Technology;JD;Tokyo University of Agriculture and Technology;Southern University of Science and Technology", "aff_unique_dep": ";JD Explore Academy;;", "aff_unique_url": "http://www.hust.edu.cn;;https://www.tuat.ac.jp;https://www.sustech.edu.cn", "aff_unique_abbr": "HUST;;TUAT;SUSTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0", "aff_country_unique": "China;;Japan" }, { "title": "Trustworthy Alignment of Retrieval-Augmented Large Language Models via Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33796", "id": "XwnABAdH5y", "proceeding": "https://proceedings.mlr.press/v235/zhang24bg.html", "pdf": "https://openreview.net/pdf?id=XwnABAdH5y", "openreview": "https://openreview.net/forum?id=XwnABAdH5y", "author_site": "Zongmeng Zhang, Yufeng Shi, Jinhua Zhu, Wengang Zhou, Xiang Qi, peng zhang, 
Houqiang Li", "tldr": "", "abstract": "Trustworthiness is an essential prerequisite for the real-world application of large language models. In this paper, we focus on the trustworthiness of language models with respect to retrieval augmentation. Despite being supported with external evidence, retrieval-augmented generation still suffers from hallucinations, one primary cause of which is the conflict between contextual and parametric knowledge. We deem that retrieval-augmented language models have the inherent capabilities of supplying response according to both contextual and parametric knowledge. Inspired by aligning language models with human preference, we take the first step towards aligning retrieval-augmented language models to a status where it responds relying merely on the external evidence and disregards the interference of parametric knowledge. Specifically, we propose a reinforcement learning based algorithm Trustworthy-Alignment, theoretically and experimentally demonstrating large language models' capability of reaching a trustworthy status without explicit supervision on how to respond. Our work highlights the potential of large language models on exploring its intrinsic abilities by its own and expands the application scenarios of alignment from fulfilling human preference to creating trustworthy agents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zongmeng Zhang;Yufeng Shi;Jinhua Zhu;Wengang Zhou;Xiang Qi;peng zhang;Houqiang Li", "authorids": "~Zongmeng_Zhang1;~Yufeng_Shi2;~Jinhua_Zhu1;~Wengang_Zhou1;~Xiang_Qi2;~peng_zhang42;~Houqiang_Li1", "gender": "M;;M;M;M;M;M", "homepage": "https://zmzhang2000.github.io/;https://github.com/LOSEREVER;https://github.com/teslacool;http://staff.ustc.edu.cn/~zhwg/index.html;https://ant-work.antgroup-inc.cn/u/131585;;https://staff.ustc.edu.cn/~lihq/", "dblp": "303/1427;;18/1965-1;22/4544-1;;;59/7017.html", "google_scholar": "yKVZMKMAAAAJ;;https://scholar.google.com.hk/citations?user=FvGy0LQAAAAJ;8s1JF8YAAAAJ;;;7sFMIKoAAAAJ", "orcid": "0000-0003-3880-8913;;0000-0003-2157-9077;0000-0003-1690-9836;;;0000-0003-2188-3028", "linkedin": ";;;;;peng-zhang-b2616b3b/;", "or_profile": "~Zongmeng_Zhang1;~Yufeng_Shi2;~Jinhua_Zhu1;~Wengang_Zhou1;~Xiang_Qi2;~peng_zhang42;~Houqiang_Li1", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;Alibaba Group;Alibaba Group;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;mail.ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;antgroup.com;antfin.com;ustc.edu.cn", "position": "PhD student;PhD student;PhD student;Full Professor;Researcher;Researcher;Professor", "bibtex": "@inproceedings{\nzhang2024trustworthy,\ntitle={Trustworthy Alignment of Retrieval-Augmented Large Language Models via Reinforcement Learning},\nauthor={Zongmeng Zhang and Yufeng Shi and Jinhua Zhu and Wengang Zhou and Xiang Qi and peng zhang and Houqiang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XwnABAdH5y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1619534, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11121487829771671134&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": 
"ustc.edu.cn;mail.ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;antgroup.com;antfin.com;ustc.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;1;1;0", "aff_unique_norm": "University of Science and Technology of China;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "USTC;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Realistic Unsupervised CLIP Fine-tuning with Universal Entropy Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33795", "id": "XxCfToC9pJ", "proceeding": "https://proceedings.mlr.press/v235/liang24e.html", "pdf": "https://openreview.net/pdf?id=XxCfToC9pJ", "openreview": "https://openreview.net/forum?id=XxCfToC9pJ", "author_site": "Jian Liang, Sheng, Zhengbo Wang, Ran He, Tieniu Tan", "tldr": "", "abstract": "The emergence of vision-language models, such as CLIP, has spurred a significant research effort towards their application for downstream supervised learning tasks. Although some previous studies have explored the unsupervised fine-tuning of CLIP, they often rely on prior knowledge in the form of class names associated with ground truth labels. This paper explores a realistic unsupervised fine-tuning scenario, considering the presence of out-of-distribution samples from unknown classes within the unlabeled data. In particular, we focus on simultaneously enhancing out-of-distribution detection and the recognition of instances associated with known classes. To tackle this problem, we present a simple, efficient, and effective approach called Universal Entropy Optimization (UEO). UEO leverages sample-level confidence to approximately minimize the conditional entropy of confident instances and maximize the marginal entropy of less confident instances. Apart from optimizing the textual prompt, UEO incorporates optimization of channel-wise affine transformations within the visual branch of CLIP. Extensive experiments across 15 domains and 4 different types of prior knowledge validate the effectiveness of UEO compared to baseline methods. 
The code is at https://github.com/tim-learn/UEO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jian Liang;Lijun Sheng;Zhengbo Wang;Ran He;Tieniu Tan", "authorids": "~Jian_Liang1;~Lijun_Sheng1;~Zhengbo_Wang1;~Ran_He1;~Tieniu_Tan1", "gender": "M;M;;M;", "homepage": "https://liangjian.xyz;https://tomsheng21.github.io/;https://github.com/mrflogs;https://rhe-web.github.io/;", "dblp": "19/2208-1;321/3477;193/0358;61/6198-1;", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.sg/citations?user=1sM6ZrcAAAAJ;;ayrg9AUAAAAJ;", "orcid": "0000-0003-3890-1894;;;0000-0002-3807-991X;", "linkedin": ";;;;", "or_profile": "~Jian_Liang1;~Lijun_Sheng1;~Zhengbo_Wang1;~Ran_He1;~Tieniu_Tan1", "aff": "Institute of Automation, Chinese Academy of Sciences;University of Science and Technology of China;University of Science and Technology of China;Institute of Automation, Chinese Academy of Sciences;", "aff_domain": "ia.ac.cn;ustc.edu.cn;ustc.edu.cn;ia.ac.cn;", "position": "Associate Professor;PhD student;PhD student;Full Professor;", "bibtex": "@inproceedings{\nliang2024realistic,\ntitle={Realistic Unsupervised {CLIP} Fine-tuning with Universal Entropy Optimization},\nauthor={Jian Liang and Lijun Sheng and Zhengbo Wang and Ran He and Tieniu Tan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XxCfToC9pJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3620728, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6389971524903108845&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ia.ac.cn;ustc.edu.cn;ustc.edu.cn;ia.ac.cn;", "author_num": 5, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Chinese Academy of Sciences;University of Science and Technology of China", "aff_unique_dep": "Institute of Automation;", "aff_unique_url": "http://www.ia.cas.cn;http://www.ustc.edu.cn", "aff_unique_abbr": "CAS;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Robust Learning-Augmented Dictionaries", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33794", "id": "XyhgssAo5b", "proceeding": "https://proceedings.mlr.press/v235/zeynali24a.html", "pdf": "https://openreview.net/pdf?id=XyhgssAo5b", "openreview": "https://openreview.net/forum?id=XyhgssAo5b", "author_site": "Ali Zeynali, Shahin Kamali, Mohammad Hajiesmaili", "tldr": "", "abstract": "We present the first learning-augmented data structure for implementing dictionaries with optimal consistency and robustness. Our data structure, named RobustSL, is a Skip list augmented by predictions of access frequencies of elements in a data sequence. With proper predictions, RobustSL has optimal consistency (achieves static optimality). At the same time, it maintains a logarithmic running time for each operation, ensuring optimal robustness, even if predictions are generated adversarially. Therefore, RobustSL has all the advantages of the recent learning-augmented data structures of Lin, Luo, and Woodruff (ICML 2022) and Cao et al. (arXiv 2023), while providing robustness guarantees that are absent in the previous work. 
Numerical experiments show that RobustSL outperforms alternative data structures using both synthetic and real datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ali Zeynali;Shahin Kamali;Mohammad Hajiesmaili", "authorids": "~Ali_Zeynali1;~Shahin_Kamali1;~Mohammad_Hajiesmaili1", "gender": "M;M;M", "homepage": "https://ali-zeynali.github.io/home/;https://www.eecs.yorku.ca/~kamalis/;https://groups.cs.umass.edu/hajiesmaili/", "dblp": "276/0289;59/577.html;49/7911", "google_scholar": "WpCU1L4AAAAJ;hQXlVLsAAAAJ;XCGuYKIAAAAJ", "orcid": ";0000-0003-1404-2212;", "linkedin": "ali-zeynali/;shahin-kamali-4a3b376?originalSubdomain=ca;", "or_profile": "~Ali_Zeynali1;~Shahin_Kamali1;~Mohammad_Hajiesmaili1", "aff": "Department of Computer Science, University of Massachusetts at Amherst;York University;College of Information and Computer Science, University of Massachusetts, Amherst", "aff_domain": "cs.umass.edu;yorku.ca;cics.umass.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzeynali2024robust,\ntitle={Robust Learning-Augmented Dictionaries},\nauthor={Ali Zeynali and Shahin Kamali and Mohammad Hajiesmaili},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XyhgssAo5b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5700801, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11408625235610286106&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "cs.umass.edu;yorku.ca;cics.umass.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Massachusetts Amherst;York University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.umass.edu;https://www.yorku.ca", "aff_unique_abbr": "UMass Amherst;York U", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Sliced Wasserstein with Random-Path Projecting Directions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33793", "id": "XyxuhLtFA2", "proceeding": "https://proceedings.mlr.press/v235/nguyen24l.html", "pdf": "https://openreview.net/pdf?id=XyxuhLtFA2", "openreview": "https://openreview.net/forum?id=XyxuhLtFA2", "author_site": "Khai Nguyen, Shujian Zhang, Tam Le, Nhat Ho", "tldr": "", "abstract": "Slicing distribution selection has been used as an effective technique to improve the performance of parameter estimators based on minimizing sliced Wasserstein distance in applications. Previous works either utilize expensive optimization to select the slicing distribution or use slicing distributions that require expensive sampling methods. In this work, we propose an optimization-free slicing distribution that provides fast sampling for the Monte Carlo estimation of expectations. In particular, we introduce the random-path projecting direction (RPD), which is constructed by leveraging the normalized difference between two random vectors following the two input measures. From the RPD, we derive the random-path slicing distribution (RPSD) and two variants of sliced Wasserstein, i.e., the Random-Path Projection Sliced Wasserstein (RPSW) and the Importance Weighted Random-Path Projection Sliced Wasserstein (IWRPSW). 
We then discuss the topological, statistical, and computational properties of RPSW and IWRPSW. Finally, we showcase the favorable performance of RPSW and IWRPSW in gradient flow and the training of denoising diffusion generative models on images.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Khai Nguyen;Shujian Zhang;Tam Le;Nhat Ho", "authorids": "~Khai_Nguyen1;~Shujian_Zhang1;~Tam_Le2;~Nhat_Ho1", "gender": "M;;M;M", "homepage": "https://khainb.com;https://www.utexas.edu/;https://tamle-ml.github.io/;https://nhatptnk8912.github.io/", "dblp": "120/4308;84/3190.html;137/4218;203/4479", "google_scholar": "im5fNaQAAAAJ;7RmLVQkAAAAJ;ZyrRB_8AAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;;", "linkedin": ";;lttam;nhat-pham-minh-ho-267b8164/", "or_profile": "~Khai_Nguyen1;~Shujian_Zhang1;~Tam_Le2;~Nhat_Ho1", "aff": "University of Texas, Austin;University of Texas, Austin;The Institute of Statistical Mathematics (ISM);University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;ism.ac.jp;utexas.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2024sliced,\ntitle={Sliced Wasserstein with Random-Path Projecting Directions},\nauthor={Khai Nguyen and Shujian Zhang and Tam Le and Nhat Ho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XyxuhLtFA2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3154344, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14729917140715756645&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 9, "email": "utexas.edu;utexas.edu;ism.ac.jp;utexas.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Texas at Austin;Institute of Statistical Mathematics", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.ism.ac.jp", "aff_unique_abbr": "UT Austin;ISM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Japan" }, { "title": "Prediction Accuracy of Learning in Games : Follow-the-Regularized-Leader meets Heisenberg", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33792", "id": "Y0sH9HGMwq", "proceeding": "https://proceedings.mlr.press/v235/feng24g.html", "pdf": "https://openreview.net/pdf?id=Y0sH9HGMwq", "openreview": "https://openreview.net/forum?id=Y0sH9HGMwq", "author_site": "Yi Feng, Georgios Piliouras, Xiao Wang", "tldr": "", "abstract": "We investigate the accuracy of prediction in deterministic learning dynamics of zero-sum games with random initializations, specifically focusing on observer uncertainty and its relationship to the evolution of covariances. Zero-sum games are a prominent field of interest in machine learning due to their various applications. Concurrently, the accuracy of prediction in dynamical systems from mechanics has long been a classic subject of investigation since the discovery of the Heisenberg Uncertainty Principle. This principle employs covariance and standard deviation of particle states to measure prediction accuracy. In this study, we bring these two approaches together to analyze the Follow-the-Regularized-Leader (FTRL) algorithm in two-player zero-sum games. 
We provide growth rates of covariance information for continuous-time FTRL, as well as its two canonical discretization methods (Euler and Symplectic). A Heisenberg-type inequality is established for FTRL. Our analysis and experiments also show that employing Symplectic discretization enhances the accuracy of prediction in learning dynamics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Feng;Georgios Piliouras;Xiao Wang", "authorids": "~Yi_Feng3;~Georgios_Piliouras1;~Xiao_Wang4", "gender": "M;;", "homepage": "https://sites.google.com/view/yifeng95524/home;;", "dblp": ";62/1236;", "google_scholar": "https://scholar.google.com/citations?hl=en;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yi_Feng3;~Georgios_Piliouras1;~Xiao_Wang4", "aff": "Shanghai University of Finance and Economics;Singapore University of Technology and Design;", "aff_domain": "shufe.edu;sutd.edu.sg;", "position": "PhD student;Associate Professor;", "bibtex": "@inproceedings{\nfeng2024prediction,\ntitle={Prediction Accuracy of Learning in Games : Follow-the-Regularized-Leader meets Heisenberg},\nauthor={Yi Feng and Georgios Piliouras and Xiao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y0sH9HGMwq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4450551, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2328645260211471912&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "shufe.edu;sutd.edu.sg;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Shanghai University of Finance and Economics;Singapore University of Technology and Design", "aff_unique_dep": ";", "aff_unique_url": "http://www.sufe.edu.cn;https://www.sutd.edu.sg", "aff_unique_abbr": "SUFE;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;Singapore" }, { "title": "Towards a Self-contained Data-driven Global Weather Forecasting Framework", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33791", "id": "Y2WorV5ag6", "proceeding": "https://proceedings.mlr.press/v235/xiao24a.html", "pdf": "https://openreview.net/pdf?id=Y2WorV5ag6", "openreview": "https://openreview.net/forum?id=Y2WorV5ag6", "author_site": "Yi Xiao, LEI BAI, Wei Xue, Hao Chen, Kun Chen, kang chen, Tao Han, Wanli Ouyang", "tldr": "", "abstract": "Data-driven weather forecasting models are advancing rapidly, yet they rely on initial states (i.e., analysis states) typically produced by traditional data assimilation algorithms. Four-dimensional variational assimilation (4DVar) is one of the most widely adopted data assimilation algorithms in numerical weather prediction centers; it is accurate but computationally expensive. In this paper, we aim to couple the AI forecasting model, FengWu, with 4DVar to build a self-contained data-driven global weather forecasting framework, FengWu-4DVar. 
To achieve this, we propose an *AI-embedded* 4DVar algorithm that includes three components: (1) a 4DVar objective function embedded with the FengWu forecasting model and its error representation to enhance efficiency and accuracy; (2) a spherical-harmonic-transform-based (SHT-based) approximation strategy for capturing the horizontal correlation of background error; and (3) an auto-differentiation (AD) scheme for determining the optimal analysis fields. Experimental results show that under the ERA5 simulated observational data with varying proportions and noise levels, FengWu-4DVar can generate accurate analysis fields; remarkably, it has achieved stable self-contained global weather forecasts for an entire year for the first time, demonstrating its potential for real-world applications. Additionally, our framework is approximately 100 times faster than the traditional 4DVar algorithm under similar experimental conditions, highlighting its significant computational efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Xiao;LEI BAI;Wei Xue;Hao Chen;Kun Chen;kang chen;Tao Han;Wanli Ouyang", "authorids": "~Yi_Xiao4;~LEI_BAI1;~Wei_Xue1;~Hao_Chen14;~Kun_Chen5;~kang_chen3;~Tao_Han4;~Wanli_Ouyang1", "gender": "Non-Binary;M;M;;M;M;M;", "homepage": "https://github.com/xiaoyi-jason;http://leibai.site/;http://www.cs.tsinghua.edu.cn/publish/csen/4623/2010/20101224235122610366982/20101224235122610366982_.html;;https://github.com/kunc3301/;https://github.com/yuchendoudou;https://taohan10200.github.io/;", "dblp": ";119/1223-1;;;;;78/744-3;", "google_scholar": ";https://scholar.google.com.au/citations?user=sakOO04AAAAJ;https://scholar.google.com.tw/citations?user=iaziYXMAAAAJ;;;;a3OxwlMAAAAJ;", "orcid": ";0000-0003-3378-7201;;;;;;", "linkedin": ";lei-bai-641370153/;;;;;;", "or_profile": "~Yi_Xiao4;~LEI_BAI1;~Wei_Xue1;~Hao_Chen14;~Kun_Chen5;~kang_chen3;~Tao_Han4;~Wanli_Ouyang1", "aff": "Tsinghua University;Shanghai AI Laboratory;;;Fudan University;University of Science and Technology of China;Department of Computer Science and Engineering, Hong Kong University of Science and Technology;", "aff_domain": "tsinghua.edu.cn;pjlab.org.cn;;;fudan.edu.cn;ustc.edu.cn;cse.ust.hk;", "position": "PhD student;Researcher;;;MS student;PhD student;PhD student;", "bibtex": "@inproceedings{\nxiao2024towards,\ntitle={Towards a Self-contained Data-driven Global Weather Forecasting Framework},\nauthor={Yi Xiao and LEI BAI and Wei Xue and Hao Chen and Kun Chen and kang chen and Tao Han and Wanli Ouyang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y2WorV5ag6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9516177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2546814134974923994&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3, "email": "tsinghua.edu.cn;pjlab.org.cn;;;fudan.edu.cn;ustc.edu.cn;cse.ust.hk;", "author_num": 8, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Tsinghua University;Shanghai AI Laboratory;Fudan University;University of Science and Technology of China;Hong Kong University of Science and Technology", "aff_unique_dep": ";;;;Department of Computer Science and Engineering", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.shanghai-ai-lab.com;https://www.fudan.edu.cn;http://www.ustc.edu.cn;https://www.ust.hk", "aff_unique_abbr": "THU;SAIL;Fudan;USTC;HKUST", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Structured Inverse-Free Natural Gradient Descent: Memory-Efficient & Numerically-Stable KFAC", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33790", "id": "Y2wRKE0Qor", "proceeding": "https://proceedings.mlr.press/v235/lin24f.html", "pdf": "https://openreview.net/pdf?id=Y2wRKE0Qor", "openreview": "https://openreview.net/forum?id=Y2wRKE0Qor", "author_site": "Wu Lin, Felix Dangel, Runa Eschenhagen, Kirill Neklyudov, Agustinus Kristiadi, Richard E Turner, Alireza Makhzani", "tldr": "", "abstract": "Second-order methods such as KFAC can be useful for neural net training. However, they are often memory-inefficient since their preconditioning Kronecker factors are dense, and numerically unstable in low precision as they require matrix inversion or decomposition. These limitations render such methods unpopular for modern mixed-precision training. We address them by (i) formulating an inverse-free KFAC update and (ii) imposing structures in the Kronecker factors, resulting in structured inverse-free natural gradient descent (SINGD). On modern neural networks, we show that SINGD is memory-efficient and numerically robust, in contrast to KFAC, and often outperforms AdamW even in half precision. Our work closes a gap between first- and second-order methods in modern low-precision training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wu Lin;Felix Dangel;Runa Eschenhagen;Kirill Neklyudov;Agustinus Kristiadi;Richard E. Turner;Alireza Makhzani", "authorids": "~Wu_Lin2;~Felix_Dangel1;~Runa_Eschenhagen1;~Kirill_Neklyudov1;~Agustinus_Kristiadi1;~Richard_E_Turner1;~Alireza_Makhzani1", "gender": "M;;M;;M;;M", "homepage": "https://f-dangel.com;https://runame.github.io;https://necludov.github.io/;https://agustinus.kristia.de;https://rich-turner-group.github.io/;http://www.alireza.ai/;https://yorkerlin.github.io/", "dblp": "236/4218;242/9235;195/1093;215/3954;40/5352;122/5126.html;70/10338", "google_scholar": "9hlJ9W0AAAAJ;Ribmq4oAAAAJ;https://scholar.google.ru/citations?user=eOttYWgAAAAJ;_1qe2mYAAAAJ;https://scholar.google.co.uk/citations?user=DgLEyZgAAAAJ;B0KVWJEAAAAJ;https://scholar.google.ca/citations?user=sGl6muoAAAAJ", "orcid": "0000-0002-1414-8554;;;0000-0003-1615-1121;;;", "linkedin": ";;;agustinus-kristiadi/;;;", "or_profile": "~Felix_Dangel1;~Runa_Eschenhagen1;~Kirill_Neklyudov1;~Agustinus_Kristiadi1;~Richard_E_Turner1;~Alireza_Makhzani1;~Wu_Lin1", "aff": "Vector Institute, Toronto;University of Cambridge;Vector Institute;Vector Institute;Microsoft Research;Vector Institute;Vector Institute", "aff_domain": "vectorinstitute.ai;cam.ac.uk;vectorinstitute.ai;vectorinstitute.ai;research.microsoft.com;vectorinstitute.ai;vectorinstitute.ai", "position": "Postdoc;PhD student;Postdoc;Postdoc;Researcher;Researcher;Postdoc", "bibtex": "@inproceedings{\nlin2024structured,\ntitle={Structured Inverse-Free Natural Gradient Descent: Memory-Efficient \\& Numerically-Stable {KFAC}},\nauthor={Wu Lin and Felix Dangel and Runa Eschenhagen and Kirill Neklyudov and Agustinus Kristiadi and Richard E. 
Turner and Alireza Makhzani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y2wRKE0Qor}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 728563, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12201556095761769181&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "vectorinstitute.ai;cam.ac.uk;vectorinstitute.ai;vectorinstitute.ai;research.microsoft.com;vectorinstitute.ai;vectorinstitute.ai", "author_num": 7, "aff_unique_index": "0;1;0;0;2;0;0", "aff_unique_norm": "Vector Institute;University of Cambridge;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://vectorinstitute.ai;https://www.cam.ac.uk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Vector Institute;Cambridge;MSR", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Toronto;Cambridge;", "aff_country_unique_index": "0;1;0;0;2;0;0", "aff_country_unique": "Canada;United Kingdom;United States" }, { "title": "CuTS: Customizable Tabular Synthetic Data Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33789", "id": "Y4VgJfbjfl", "proceeding": "https://proceedings.mlr.press/v235/vero24a.html", "pdf": "https://openreview.net/pdf?id=Y4VgJfbjfl", "openreview": "https://openreview.net/forum?id=Y4VgJfbjfl", "author_site": "Mark Vero, Mislav Balunovic, Martin Vechev", "tldr": "", "abstract": "Privacy, data quality, and data sharing concerns pose a key limitation for tabular data applications. While generating synthetic data resembling the original distribution addresses some of these issues, most applications would benefit from additional customization on the generated data. However, existing synthetic data approaches are limited to particular constraints, e.g., differential privacy (DP) or fairness. In this work, we introduce CuTS, the first customizable synthetic tabular data generation framework. Customization in CuTS is achieved via declarative statistical and logical expressions, supporting a wide range of requirements (e.g., DP or fairness, among others). To ensure high synthetic data quality in the presence of custom specifications, CuTS is pre-trained on the original dataset and fine-tuned on a differentiable loss automatically derived from the provided specifications using novel relaxations. We evaluate CuTS over four datasets and on numerous custom specifications, outperforming state-of-the-art specialized approaches on several tasks while being more general. 
In particular, at the same fairness level, we achieve 2.3% higher downstream accuracy than the state-of-the-art in fair synthetic data generation on the Adult dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mark Vero;Mislav Balunovic;Martin Vechev", "authorids": "~Mark_Vero1;~Mislav_Balunovic1;~Martin_Vechev1", "gender": "M;M;M", "homepage": "https://www.sri.inf.ethz.ch/people/markvero;https://www.sri.inf.ethz.ch/people/mislav;https://www.sri.inf.ethz.ch/people/martin", "dblp": "319/4985;231/7686;93/2189.html", "google_scholar": "vguDYtQAAAAJ;fxkgmGwAAAAJ;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";;", "linkedin": "https://linkedin.com/in/mark-vero-9a32bb17a;;", "or_profile": "~Mark_Vero1;~Mislav_Balunovic1;~Martin_Vechev1", "aff": "ETHZ-ETH Zurich;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "inf.ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nvero2024cuts,\ntitle={Cu{TS}: Customizable Tabular Synthetic Data Generation},\nauthor={Mark Vero and Mislav Balunovic and Martin Vechev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y4VgJfbjfl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 760511, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2739270242486808570&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "inf.ethz.ch;ethz.ch;ethz.ch", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Linear Alignment: A Closed-form Solution for Aligning Human Preferences without Tuning and Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33788", "id": "Y4wxCICbD0", "proceeding": "https://proceedings.mlr.press/v235/gao24f.html", "pdf": "https://openreview.net/pdf?id=Y4wxCICbD0", "openreview": "https://openreview.net/forum?id=Y4wxCICbD0", "author_site": "songyang gao, Qiming Ge, Wei Shen, Shihan Dou, Junjie Ye, Xiao Wang, Rui Zheng, Yicheng Zou, Zhi Chen, Hang Yan, Qi Zhang, Dahua Lin", "tldr": "", "abstract": "The success of AI assistants based on Large Language Models (LLMs) hinges on Reinforcement Learning from Human Feedback (RLHF) to comprehend and align with user intentions. However, traditional alignment algorithms, such as PPO, are hampered by complex annotation and training requirements. This reliance limits the applicability of RLHF and hinders the development of professional assistants tailored to diverse human preferences. In this work, we introduce *Linear Alignment*, a novel algorithm that aligns language models with human preferences in a single inference step, eliminating the reliance on data annotation and model training. Linear alignment incorporates a new parameterization for policy optimization under divergence constraints, which enables the extraction of the optimal policy in closed form and facilitates the direct estimation of the aligned response. 
Extensive experiments on both general and personalized preference datasets demonstrate that linear alignment significantly enhances the performance and efficiency of LLM alignment across diverse scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songyang Gao;Qiming Ge;Wei Shen;Shihan Dou;Junjie Ye;Xiao Wang;Rui Zheng;Yicheng Zou;Zhi Chen;Hang Yan;Qi Zhang;Dahua Lin", "authorids": "~Songyang_Gao1;~Qiming_Ge2;~Wei_Shen12;~Shihan_Dou1;~Junjie_Ye4;~Xiao_Wang12;~Rui_Zheng1;~Yicheng_Zou1;~Zhi_Chen1;~Hang_Yan2;~Qi_Zhang8;~Dahua_Lin1", "gender": "M;M;;;M;M;M;M;M;;M;M", "homepage": ";https://icesolitary.github.io/;http://github.com/fakerbaby;;;https://xiaowangnlp.github.io/;https://github.com/ruizheng20;;https://donmaclean7.github.io/;;http://qizhang.info;http://dahua.site", "dblp": "314/6067;;;;;;;224/6030.html;05/1539-6;;52/323-1;53/6088", "google_scholar": "O42mLrsAAAAJ;;-DlGT8IAAAAJ;;https://scholar.google.com.hk/citations?user=4uSMG9kAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=7Z0V_SoAAAAJ;X_nKjOYAAAAJ;5pPm6FEAAAAJ;;XfqR3yYAAAAJ;GMzzRRUAAAAJ", "orcid": ";;;;0009-0004-0921-6323;;;;0000-0003-4180-8455;;;", "linkedin": ";;;;;;;;;;;", "or_profile": "~Songyang_Gao1;~Qiming_Ge2;~Wei_Shen12;~Shihan_Dou1;~Junjie_Ye4;~Xiao_Wang12;~Rui_Zheng1;~Yicheng_Zou1;~Zhi_Chen1;~Hang_Yan2;~Qi_Zhang8;~Dahua_Lin1", "aff": "Fudan University;Fudan University;Fudan University;;Fudan University;Fudan University;Fudan University;;Shanghai AI Laboratory;;Fudan University;The Chinese University of Hong Kong", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;pjlab.org.cn;;fudan.edu.cn;cuhk.edu.hk", "position": "MS student;MS student;MS student;;PhD student;PhD student;PhD student;;Researcher;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\ngao2024linear,\ntitle={Linear Alignment: A Closed-form Solution for Aligning Human Preferences without Tuning and Feedback},\nauthor={Songyang Gao and Qiming Ge and Wei Shen and Shihan Dou and Junjie Ye and Xiao Wang and Rui Zheng and Yicheng Zou and Zhi Chen and Hang Yan and Qi Zhang and Dahua Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y4wxCICbD0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 863503, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10957228960676933942&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9, "email": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;pjlab.org.cn;;fudan.edu.cn;cuhk.edu.hk", "author_num": 12, "aff_unique_index": "0;0;0;0;0;0;1;0;2", "aff_unique_norm": "Fudan University;Shanghai AI Laboratory;Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.fudan.edu.cn;https://www.shanghai-ai-lab.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Fudan;SAIL;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Using Uncertainty Quantification to Characterize and Improve Out-of-Domain Learning for PDEs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33787", "id": "Y50K6DSrWo", "proceeding": "https://proceedings.mlr.press/v235/mouli24a.html", "pdf": 
"https://openreview.net/pdf?id=Y50K6DSrWo", "openreview": "https://openreview.net/forum?id=Y50K6DSrWo", "author_site": "Chandra Mouli Sekar, Danielle Robinson, Shima Alizadeh, Gaurav Gupta, Andrew Stuart, Michael Mahoney, Yuyang Wang", "tldr": "", "abstract": "Existing work in scientific machine learning (SciML) has shown that data-driven learning of solution operators can provide a fast approximate alternative to classical numerical partial differential equation (PDE) solvers. Of these, Neural Operators (NOs) have emerged as particularly promising. We observe that several uncertainty quantification (UQ) methods for NOs fail for test inputs that are even moderately out-of-domain (OOD), even when the model approximates the solution well for in-domain tasks. To address this limitation, we show that ensembling several NOs can identify high-error regions and provide good uncertainty estimates that are well-correlated with prediction errors. Based on this, we propose a cost-effective alternative, DiverseNO, that mimics the properties of the ensemble by encouraging diverse predictions from its multiple heads in the last feed-forward layer. We then introduce Operator-ProbConserv, a method that uses these well-calibrated UQ estimates within the ProbConserv framework to update the model. Our empirical results show that Operator-ProbConserv enhances OOD model performance for a variety of challenging PDE problems and satisfies physical constraints such as conservation laws.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "S Chandra Mouli;Danielle C. Maddix;Shima Alizadeh;Gaurav Gupta;Andrew Stuart;Michael W. Mahoney;Bernie Wang", "authorids": "~S_Chandra_Mouli1;~Danielle_C._Maddix1;~Shima_Alizadeh1;~Gaurav_Gupta2;~Andrew_Stuart1;~Michael_W._Mahoney1;~Bernie_Wang1", "gender": "M;;F;M;M;;M", "homepage": "https://www.cs.purdue.edu/homes/chandr/;https://dcmaddix.github.io/;;http://guptagaurav.me/;http://www.cms.caltech.edu/people;;http://web.mit.edu/~ywang02/www/", "dblp": "167/6021;216/8804;;;;;43/8355-1", "google_scholar": "https://scholar.google.com/citations?hl=en;IPDByA8AAAAJ;;Maqaq6MAAAAJ;;;IKUm624AAAAJ", "orcid": ";;;;;;0000-0002-0291-7184", "linkedin": ";danielle-maddix-robinson/;shima-alizadeh-4853b340/;gaurav71531/;;;", "or_profile": "~S_Chandra_Mouli1;~Danielle_C._Maddix1;~Shima_Alizadeh1;~Gaurav_Gupta2;~Andrew_Stuart1;~Michael_W._Mahoney1;~Bernie_Wang1", "aff": ";AWS AI Labs;Amazon;Amazon;California Institute of Technology;;Amazon", "aff_domain": ";amazon.com;amazon.com;amazon.com;;;amazon.com", "position": ";Applied Scientist;Researcher;Applied Scientist-III;Full Professor;;Principal Researcher", "bibtex": "@inproceedings{\nmouli2024using,\ntitle={Using Uncertainty Quantification to Characterize and Improve Out-of-Domain Learning for {PDE}s},\nauthor={S Chandra Mouli and Danielle C. Maddix and Shima Alizadeh and Gaurav Gupta and Andrew Stuart and Michael W. 
Mahoney and Bernie Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y50K6DSrWo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2907757, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8559657176235706504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";amazon.com;amazon.com;amazon.com;;;amazon.com", "author_num": 7, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Amazon;California Institute of Technology", "aff_unique_dep": "AWS AI Labs;", "aff_unique_url": "https://aws.amazon.com;https://www.caltech.edu", "aff_unique_abbr": "AWS;Caltech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Nash Learning from Human Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33786", "id": "Y5AmNYiyCQ", "proceeding": "https://proceedings.mlr.press/v235/munos24a.html", "pdf": "https://openreview.net/pdf?id=Y5AmNYiyCQ", "openreview": "https://openreview.net/forum?id=Y5AmNYiyCQ", "author_site": "REMI MUNOS, Michal Valko, Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Zhaohan Guo, Yunhao Tang, Matthieu Geist, Thomas Mesnard, C\u00f4me Fiegel, Andrea Michi, Marco Selvi, Sertan Girgin, Nikola Momchev, Olivier Bachem, Daniel Mankowitz, Doina Precup, Bilal Piot", "tldr": "", "abstract": "Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Traditionally, RLHF involves the initial step of learning a reward model from pairwise human feedback, i.e., expressed as preferences between pairs of text generations. Subsequently, the LLM's policy is fine-tuned to maximize the reward through a reinforcement learning algorithm. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a pairwise preference model, which is conditioned on two inputs (instead of a single input in the case of a reward model) given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. We illustrate the effectiveness of our approach by presenting experimental results on a text summarization task. 
We believe NLHF offers a compelling avenue for fine-tuning LLMs and enhancing the alignment of LLMs with human preferences.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Remi Munos;Michal Valko;Daniele Calandriello;Mohammad Gheshlaghi Azar;Mark Rowland;Zhaohan Daniel Guo;Yunhao Tang;Matthieu Geist;Thomas Mesnard;C\u00f4me Fiegel;Andrea Michi;Marco Selvi;Sertan Girgin;Nikola Momchev;Olivier Bachem;Daniel J Mankowitz;Doina Precup;Bilal Piot", "authorids": "~Remi_Munos1;~Michal_Valko1;~Daniele_Calandriello1;~Mohammad_Gheshlaghi_Azar1;~Mark_Rowland1;~Zhaohan_Daniel_Guo1;~Yunhao_Tang1;~Matthieu_Geist1;~Thomas_Mesnard2;~C\u00f4me_Fiegel1;~Andrea_Michi1;~Marco_Selvi1;~Sertan_Girgin1;~Nikola_Momchev1;~Olivier_Bachem1;~Daniel_J_Mankowitz2;~Doina_Precup1;~Bilal_Piot1", "gender": "M;M;M;M;M;M;M;M;;M;;;;M;M;;F;M", "homepage": "http://researchers.lille.inria.fr/~munos/;https://misovalko.github.io/research.html;;http://mgazar.net;http://sites.google.com/view/markrowland;;https://robintyh1.github.io;;https://thomasmesnard.github.io/;;;;;;http://www.olivierbachem.ch/;;http://cs.mcgill.ca/~dprecup/;", "dblp": "69/6815;03/5455;129/1542;;86/4090;160/9943;210/2229;38/6508;;;;;;;https://dblp.org/pers/hd/b/Bachem:Olivier;;p/DoinaPrecup;", "google_scholar": "https://scholar.google.com/citations?hl=en;jrazNCQAAAAJ;;;https://scholar.google.co.uk/citations?user=-0U84zMAAAAJ;fxr_9oQAAAAJ;;ectPLEUAAAAJ;;;ow2rsbYAAAAJ;;;;https://scholar.google.ch/citations?user=mW9BcgsAAAAJ;;https://scholar.google.com.tw/citations?user=j54VcVEAAAAJ;https://scholar.google.fr/citations?user=fqxNUREAAAAJ", "orcid": ";;;;;;;;;0000-0001-9935-4981;0009-0001-4797-3593;;;;;;;", "linkedin": ";michalvalko/;;;;;;;;;https://linkedin.com/in/andreamichi;marcoselvi;;;olivier-bachem-10257756/;;;", "or_profile": "~Remi_Munos1;~Michal_Valko1;~Daniele_Calandriello1;~Mohammad_Gheshlaghi_Azar1;~Mark_Rowland1;~Zhaohan_Daniel_Guo1;~Yunhao_Tang1;~Matthieu_Geist1;~Thomas_Mesnard2;~C\u00f4me_Fiegel1;~Andrea_Michi1;~Marco_Selvi1;~Sertan_Girgin1;~Nikola_Momchev1;~Olivier_Bachem1;~Daniel_J_Mankowitz2;~Doina_Precup1;~Bilal_Piot1", "aff": "Google DeepMind;Meta;Google DeepMind;;Google DeepMind;Google DeepMind;Google DeepMind;Google;Google DeepMind;;Google;Google DeepMind;;Google;Google Brain;;McGill University;University Lille", "aff_domain": "google.com;meta.com;deepmind.com;;google.com;deepmind.com;deepmind.com;google.com;google.com;;google.com;deepmind.com;;google.com;google.com;;mcgill.ca;univ-lille1.fr", "position": "Research scientist;Principal Researcher;Researcher;;Research Scientist;Research Scientist;Research Scientist;Researcher;PhD student;;Researcher;Researcher;;Software Engineer;Research scientist;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nmunos2024nash,\ntitle={Nash Learning from Human Feedback},\nauthor={Remi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^o}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J Mankowitz and Doina Precup and Bilal Piot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y5AmNYiyCQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 548928, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 18, "gs_citation": 129, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=13401696069765119735&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "google.com;meta.com;deepmind.com;;google.com;deepmind.com;deepmind.com;google.com;google.com;;google.com;deepmind.com;;google.com;google.com;;mcgill.ca;univ-lille1.fr", "author_num": 18, "aff_unique_index": "0;1;0;0;0;0;0;0;0;0;0;0;2;3", "aff_unique_norm": "Google;Meta;McGill University;University of Lille", "aff_unique_dep": "Google DeepMind;Meta Platforms, Inc.;;", "aff_unique_url": "https://deepmind.com;https://meta.com;https://www.mcgill.ca;https://www.univ-lille.fr", "aff_unique_abbr": "DeepMind;Meta;McGill;ULille", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;0;0;1;0;1;0;1;1;2;3", "aff_country_unique": "United Kingdom;United States;Canada;France" }, { "title": "GeoMFormer: A General Architecture for Geometric Molecular Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33785", "id": "Y5Zi59N265", "proceeding": "https://proceedings.mlr.press/v235/chen24ac.html", "pdf": "https://openreview.net/pdf?id=Y5Zi59N265", "openreview": "https://openreview.net/forum?id=Y5Zi59N265", "author_site": "Tianlang Chen, Shengjie Luo, Di He, Shuxin Zheng, Tie-Yan Liu, Liwei Wang", "tldr": "", "abstract": "Molecular modeling, a central topic in quantum mechanics, aims to accurately calculate the properties and simulate the behaviors of molecular systems. The molecular model is governed by physical laws, which impose geometric constraints such as invariance and equivariance to coordinate rotation and translation. While numerous deep learning approaches have been developed to learn molecular representations under these constraints, most of them are built upon heuristic and costly modules. We argue that there is a strong need for a general and flexible framework for learning both invariant and equivariant features. In this work, we introduce a novel Transformer-based molecular model called GeoMFormer to achieve this goal. Using the standard Transformer modules, two separate streams are developed to maintain and learn invariant and equivariant representations. Carefully designed _cross-attention_ modules bridge the two streams, allowing information fusion and enhancing geometric modeling in each stream. As a general and flexible architecture, we show that many previous architectures can be viewed as special instantiations of GeoMFormer. Extensive experiments are conducted to demonstrate the power of GeoMFormer. All empirical results show that GeoMFormer achieves strong performance on both invariant and equivariant tasks of different types and scales. 
Code and models will be made publicly available at https://github.com/c-tl/GeoMFormer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianlang Chen;Shengjie Luo;Di He;Shuxin Zheng;Tie-Yan Liu;Liwei Wang", "authorids": "~Tianlang_Chen2;~Shengjie_Luo1;~Di_He1;~Shuxin_Zheng1;~Tie-Yan_Liu1;~Liwei_Wang1", "gender": ";M;M;M;M;M", "homepage": ";https://lsj2408.github.io;https://dihe-pku.github.io/;https://www.microsoft.com/en-us/research/people/shuz/;http://member.acm.org/~tieyanliu;http://www.liweiwang-pku.com/", "dblp": ";274/2110;74/184;186/8255;l/TieYanLiu;", "google_scholar": ";ImWO7WYAAAAJ;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ;https://scholar.google.co.jp/citations?user=rPhGUw0AAAAJ;Nh832fgAAAAJ;VZHxoh8AAAAJ", "orcid": ";;;;0000-0002-0476-8020;", "linkedin": ";shengjie-luo-ba6137193/;;;;", "or_profile": "~Tianlang_Chen2;~Shengjie_Luo1;~Di_He1;~Shuxin_Zheng1;~Tie-Yan_Liu1;~Liwei_Wang1", "aff": ";Microsoft;Microsoft;Microsoft;Microsoft;Peking University", "aff_domain": ";microsoft.com;microsoft.com;microsoft.com;microsoft.com;pku.edu.cn", "position": ";Intern;Senior Researcher;Senior Researcher;Distinguished Scientist;Full Professor", "bibtex": "@inproceedings{\nchen2024geomformer,\ntitle={Geo{MF}ormer: A General Architecture for Geometric Molecular Representation Learning},\nauthor={Tianlang Chen and Shengjie Luo and Di He and Shuxin Zheng and Tie-Yan Liu and Liwei Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y5Zi59N265}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 844427, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10674107071133876192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";microsoft.com;microsoft.com;microsoft.com;microsoft.com;pku.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Microsoft;Peking University", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;http://www.pku.edu.cn", "aff_unique_abbr": "Microsoft;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Emergence of In-Context Reinforcement Learning from Noise Distillation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33784", "id": "Y8KsHT1kTV", "proceeding": "https://proceedings.mlr.press/v235/zisman24a.html", "pdf": "https://openreview.net/pdf?id=Y8KsHT1kTV", "openreview": "https://openreview.net/forum?id=Y8KsHT1kTV", "author_site": "Ilya Zisman, Vladislav Kurenkov, Alexander Nikulin, Viacheslav Sinii, Sergey Kolesnikov", "tldr": "", "abstract": "Recently, extensive studies in Reinforcement Learning have been carried out on the ability of transformers to adapt in-context to various environments and tasks. Current in-context RL methods are limited by their strict requirements for data, which needs to be generated by RL agents or labeled with actions from an optimal policy. In order to address this prevalent problem, we propose AD$^\\varepsilon$, a new data acquisition approach that enables in-context Reinforcement Learning from a noise-induced curriculum. We show that it is viable to construct a synthetic noise-injection curriculum that helps to obtain learning histories. 
Moreover, we experimentally demonstrate that it is possible to alleviate the need for data generated with optimal policies, with in-context RL still able to outperform the best suboptimal policy in a learning dataset by a 2x margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ilya Zisman;Vladislav Kurenkov;Alexander Nikulin;Viacheslav Sinii;Sergey Kolesnikov", "authorids": "~Ilya_Zisman1;~Vladislav_Kurenkov1;~Alexander_Nikulin1;~Viacheslav_Sinii1;~Sergey_Kolesnikov1", "gender": "Not Specified;M;M;M;M", "homepage": "https://zis.mn/;https://vkurenkov.me;https://howuhh.github.io/;https://t.me/identiki_t;https://scitator.com", "dblp": ";251/9126;314/6349;351/7957;191/1945", "google_scholar": "tmh78sQAAAAJ;w09vtVsAAAAJ;yACvnqUAAAAJ;IO-blf8AAAAJ;iukbpVEAAAAJ", "orcid": ";0000-0003-4078-1086;;;", "linkedin": "suessmann/;;;;scitator/", "or_profile": "~Ilya_Zisman1;~Vladislav_Kurenkov1;~Alexander_Nikulin1;~Viacheslav_Sinii1;~Sergey_Kolesnikov1", "aff": "Higher School of Economics;Tinkoff;Moscow Institute of Physics and Technology;Innopolis University;Tinkoff", "aff_domain": "hse.ru;tinkoff.ai;mipt.edu;innopolis.ru;tinkoff.ru", "position": "MS student;Researcher;PhD student;Undergrad student;Principal Researcher", "bibtex": "@inproceedings{\nzisman2024emergence,\ntitle={Emergence of In-Context Reinforcement Learning from Noise Distillation},\nauthor={Ilya Zisman and Vladislav Kurenkov and Alexander Nikulin and Viacheslav Sinii and Sergey Kolesnikov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y8KsHT1kTV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3677911, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1810912149999290965&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "hse.ru;tinkoff.ai;mipt.edu;innopolis.ru;tinkoff.ru", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Higher School of Economics;Tinkoff Bank;Moscow Institute of Physics and Technology;Innopolis University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hse.ru;https://www.tinkoff.ru;https://www.mipt.ru/en;https://innopolis.ru/en", "aff_unique_abbr": "HSE;Tinkoff;MIPT;Innopolis", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Russian Federation" }, { "title": "Random Latent Exploration for Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33783", "id": "Y9qzwNlKVU", "proceeding": "https://proceedings.mlr.press/v235/mahankali24a.html", "pdf": "https://openreview.net/pdf?id=Y9qzwNlKVU", "openreview": "https://openreview.net/forum?id=Y9qzwNlKVU", "author_site": "Srinath Mahankali, Zhang-Wei Hong, Ayush Sekhari, Alexander Rakhlin, Pulkit Agrawal", "tldr": "", "abstract": "The ability to efficiently explore high-dimensional state spaces is essential for the practical success of deep Reinforcement Learning (RL). This paper introduces a new exploration technique called Random Latent Exploration (RLE), which combines the strengths of exploration bonuses and randomized value functions (two popular approaches for effective exploration in deep RL). 
RLE leverages the idea of perturbing rewards by adding structured random rewards to the original task rewards in certain (random) states of the environment, to encourage the agent to explore the environment during training. RLE is straightforward to implement and performs well in practice. To demonstrate the practical effectiveness of RLE, we evaluate it on the challenging Atari and IsaacGym benchmarks and show that RLE exhibits higher overall scores across all the tasks than other approaches, including action-noise and randomized value function exploration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Srinath V. Mahankali;Zhang-Wei Hong;Ayush Sekhari;Alexander Rakhlin;Pulkit Agrawal", "authorids": "~Srinath_V._Mahankali1;~Zhang-Wei_Hong1;~Ayush_Sekhari1;~Alexander_Rakhlin1;~Pulkit_Agrawal1", "gender": ";M;M;M;M", "homepage": ";;https://ayush.sekhari.com/;http://www.mit.edu/~rakhlin/;https://people.eecs.berkeley.edu/~pulkitag/", "dblp": "321/0657;198/0600;203/8152;59/407;149/2672", "google_scholar": ";GZkyN4cAAAAJ;jH9i188AAAAJ;https://scholar.google.com.tw/citations?user=fds2VpgAAAAJ;UpZmJI0AAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Srinath_V._Mahankali1;~Zhang-Wei_Hong1;~Ayush_Sekhari1;~Alexander_Rakhlin1;~Pulkit_Agrawal1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "position": "Undergrad student;PhD student;Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nmahankali2024random,\ntitle={Random Latent Exploration for Deep Reinforcement Learning},\nauthor={Srinath V. Mahankali and Zhang-Wei Hong and Ayush Sekhari and Alexander Rakhlin and Pulkit Agrawal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Y9qzwNlKVU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5244194, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5460314343530000938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Gradient-like Explanation under a Black-box Setting: When Black-box Explanations Become as Good as White-box", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33782", "id": "YB1O99gK7b", "proceeding": "https://proceedings.mlr.press/v235/cai24h.html", "pdf": "https://openreview.net/pdf?id=YB1O99gK7b", "openreview": "https://openreview.net/forum?id=YB1O99gK7b", "author_site": "Yi Cai, Gerhard Wunder", "tldr": "", "abstract": "Attribution methods shed light on the explainability of data-driven approaches such as deep learning models by uncovering the most influential features in a to-be-explained decision. 
While determining feature attributions via gradients delivers promising results, the internal access required for acquiring gradients can be impractical under safety concerns, thus limiting the applicability of gradient-based approaches. In response to such limited flexibility, this paper presents GEEX (gradient-estimation-based explanation), a method that produces gradient-like explanations through only query-level access. The proposed approach holds a set of fundamental properties for attribution methods, which are proven with mathematical rigor, ensuring the quality of its explanations. In addition to the theoretical analysis, with a focus on image data, the experimental results empirically demonstrate the superiority of the proposed method over state-of-the-art black-box methods and its competitive performance compared to methods with full access.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Cai;Gerhard Wunder", "authorids": "~Yi_Cai6;g.wunder@fu-berlin.de", "gender": ";", "homepage": ";", "dblp": "58/3467-5;", "google_scholar": "Elyt1pkAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Yi_Cai6;g.wunder@fu-berlin.de", "aff": "Freie Universit\u00e4t Berlin;", "aff_domain": "fu-berlin.de;", "position": "PhD student;", "bibtex": "@inproceedings{\ncai2024on,\ntitle={On Gradient-like Explanation under a Black-box Setting: When Black-box Explanations Become as Good as White-box},\nauthor={Yi Cai and Gerhard Wunder},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YB1O99gK7b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5474845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7729622724065081022&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "fu-berlin.de;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Freie Universit\u00e4t Berlin", "aff_unique_dep": "", "aff_unique_url": "https://www.fu-berlin.de", "aff_unique_abbr": "FU Berlin", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Subhomogeneous Deep Equilibrium Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33781", "id": "YBXwr7wF7i", "proceeding": "https://proceedings.mlr.press/v235/sittoni24a.html", "pdf": "https://openreview.net/pdf?id=YBXwr7wF7i", "openreview": "https://openreview.net/forum?id=YBXwr7wF7i", "author_site": "Pietro Sittoni, Francesco Tudisco", "tldr": "", "abstract": "Implicit-depth neural networks have grown as powerful alternatives to traditional networks in various applications in recent years. However, these models often lack guarantees of existence and uniqueness, raising stability, performance, and reproducibility issues. In this paper, we present a new analysis of the existence and uniqueness of fixed points for implicit-depth neural networks based on the concept of subhomogeneous operators and the nonlinear Perron-Frobenius theory. Compared to previous similar analyses, our theory allows for weaker assumptions on the parameter matrices, thus yielding a more flexible framework for well-defined implicit networks.
We illustrate the performance of the resulting subhomogeneous networks on feedforward, convolutional, and graph neural network examples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pietro Sittoni;Francesco Tudisco", "authorids": "~Pietro_Sittoni1;~Francesco_Tudisco1", "gender": "M;M", "homepage": "https://www.linkedin.com/in/pietro-sittoni-9ab1a9208/;https://ftudisco.gitlab.io/", "dblp": ";136/5777", "google_scholar": ";uND_5REAAAAJ", "orcid": ";0000-0002-8150-4475", "linkedin": ";", "or_profile": "~Pietro_Sittoni1;~Francesco_Tudisco1", "aff": "Gran Sasso Science Institute;Gran Sasso Science Institute", "aff_domain": "gssi.it;gssi.it", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nsittoni2024subhomogeneous,\ntitle={Subhomogeneous Deep Equilibrium Models},\nauthor={Pietro Sittoni and Francesco Tudisco},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YBXwr7wF7i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 639206, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14564642340628739080&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "gssi.it;gssi.it", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Gran Sasso Science Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.gssi.it", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "title": "Neural Collapse for Cross-entropy Class-Imbalanced Learning with Unconstrained ReLU Features Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33780", "id": "YBetKvUlF7", "proceeding": "https://proceedings.mlr.press/v235/dang24a.html", "pdf": "https://openreview.net/pdf?id=YBetKvUlF7", "openreview": "https://openreview.net/forum?id=YBetKvUlF7", "author_site": "Hien Dang, Tho Tran Huu, Tan Nguyen, Nhat Ho", "tldr": "", "abstract": "The current paradigm of training deep neural networks for classification tasks includes minimizing the empirical risk, pushing the training loss value towards zero even after the training classification error has vanished. In this terminal phase of training, it has been observed that the last-layer features collapse to their class-means and these class-means converge to the vertices of a simplex Equiangular Tight Frame (ETF). This phenomenon is termed Neural Collapse ($\\mathcal{NC}$). However, this characterization only holds in class-balanced datasets where every class has the same number of training samples. When the training dataset is class-imbalanced, some $\\mathcal{NC}$ properties will no longer hold true, for example, the geometry of class-means will skew away from the simplex ETF. In this paper, we generalize $\\mathcal{NC}$ to the imbalanced regime for cross-entropy loss under the unconstrained ReLU features model. We demonstrate that while the within-class features collapse property still holds in this setting, the class-means will converge to a structure consisting of orthogonal vectors with lengths dependent on the number of training samples. Furthermore, we find that the classifier weights (i.e., the last-layer linear classifier) are aligned to the scaled and centered class-means, with scaling factors dependent on the number of training samples of each class.
This generalizes $\\mathcal{NC}$ in the class-balanced setting. We empirically validate our results through experiments on practical architectures and datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hien Dang;Tho Tran Huu;Tan Minh Nguyen;Nhat Ho", "authorids": "~Hien_Dang1;~Tho_Tran_Huu1;~Tan_Minh_Nguyen1;~Nhat_Ho1", "gender": ";M;M;M", "homepage": ";;https://tanmnguyen89.github.io/;https://nhatptnk8912.github.io/", "dblp": ";337/2038;255/4725;203/4479", "google_scholar": ";fG3mIYEAAAAJ;OizOh88AAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;;", "linkedin": ";;;nhat-pham-minh-ho-267b8164/", "or_profile": "~Hien_Dang1;~Tho_Tran_Huu1;~Tan_Minh_Nguyen1;~Nhat_Ho1", "aff": ";National University of Singapore;National University of Singapore;University of Texas, Austin", "aff_domain": ";u.nus.edu;nus.edu.sg;utexas.edu", "position": ";PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ndang2024neural,\ntitle={Neural Collapse for Cross-entropy Class-Imbalanced Learning with Unconstrained Re{LU} Features Model},\nauthor={Hien Dang and Tho Tran Huu and Tan Minh Nguyen and Nhat Ho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YBetKvUlF7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8652028, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13120658235052172931&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";u.nus.edu;nus.edu.sg;utexas.edu", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "National University of Singapore;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.utexas.edu", "aff_unique_abbr": "NUS;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Singapore;United States" }, { "title": "IBD-PSC: Input-level Backdoor Detection via Parameter-oriented Scaling Consistency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33779", "id": "YCzbfs2few", "proceeding": "https://proceedings.mlr.press/v235/hou24a.html", "pdf": "https://openreview.net/pdf?id=YCzbfs2few", "openreview": "https://openreview.net/forum?id=YCzbfs2few", "author_site": "Linshan Hou, Ruili Feng, Zhongyun Hua, Wei Luo, Leo Yu Zhang, Yiming Li", "tldr": "", "abstract": "Deep neural networks (DNNs) are vulnerable to backdoor attacks, where adversaries can maliciously trigger model misclassifications by implanting a hidden backdoor during model training. This paper proposes a simple yet effective input-level backdoor detection method (dubbed IBD-PSC) as a `firewall' to filter out malicious testing images. Our method is motivated by an intriguing phenomenon, i.e., parameter-oriented scaling consistency (PSC), where the prediction confidences of poisoned samples are significantly more consistent than those of benign ones when amplifying model parameters. In particular, we provide a theoretical analysis to safeguard the foundations of the PSC phenomenon. We also design an adaptive method to select BN layers to scale up for effective detection. Extensive experiments are conducted on benchmark datasets, verifying the effectiveness and efficiency of our IBD-PSC method and its resistance to adaptive attacks.
Code is available at https://github.com/THUYimingLi/BackdoorBox.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Linshan Hou;Ruili Feng;Zhongyun Hua;Wei Luo;Leo Yu Zhang;Yiming Li", "authorids": "~Linshan_Hou1;~Ruili_Feng1;~Zhongyun_Hua1;~Wei_Luo3;~Leo_Yu_Zhang1;~Yiming_Li1", "gender": "F;;M;Unspecified;M;M", "homepage": ";https://github.com/RuiLiFeng;http://www.huazhongyun.cn/;;https://leozhangcs.github.io/;http://liyiming.tech", "dblp": "279/9891.html;20/9594;155/4920;05/6715-1;117/3526;l/YimingLi-4", "google_scholar": ";;Sl0BI_IAAAAJ;https://scholar.google.com.au/citations?user=fIxBU34AAAAJ;https://scholar.google.com.hk/citations?user=JK21OM0AAAAJ;mSW7kU8AAAAJ", "orcid": ";;0000-0002-3529-0541;0000-0002-4711-7543;0000-0001-9330-2662;0000-0002-2258-265X", "linkedin": ";;;;;yiming-li-thu/", "or_profile": "~Linshan_Hou1;~Ruili_Feng1;~Zhongyun_Hua1;~Wei_Luo3;~Leo_Yu_Zhang1;~Yiming_Li1", "aff": "Harbin Institute of Technology;University of Science and Technology of China;Harbin Institute of Technology Shenzhen;;Griffith University;Zhejiang University", "aff_domain": "hit.edu.cn;mail.ustc.edu.cn;hit.edu.cn;;griffith.edu.au;zju.edu.cn", "position": "PhD student;PhD student;Full Professor;;Researcher;Research Professor", "bibtex": "@inproceedings{\nhou2024ibdpsc,\ntitle={{IBD}-{PSC}: Input-level Backdoor Detection via Parameter-oriented Scaling Consistency},\nauthor={Linshan Hou and Ruili Feng and Zhongyun Hua and Wei Luo and Leo Yu Zhang and Yiming Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YCzbfs2few}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4547304, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12453661075266324589&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 8, "email": "hit.edu.cn;mail.ustc.edu.cn;hit.edu.cn;;griffith.edu.au;zju.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Harbin Institute of Technology;University of Science and Technology of China;Griffith University;Zhejiang University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hit.edu.cn/;http://www.ustc.edu.cn;https://www.griffith.edu.au;https://www.zju.edu.cn", "aff_unique_abbr": "HIT;USTC;Griffith;ZJU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Harbin;;Shenzhen", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;Australia" }, { "title": "Contextualized Policy Recovery: Modeling and Interpreting Medical Decisions with Adaptive Imitation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33778", "id": "YEQM0asWCH", "proceeding": "https://proceedings.mlr.press/v235/deuschel24a.html", "pdf": "https://openreview.net/pdf?id=YEQM0asWCH", "openreview": "https://openreview.net/forum?id=YEQM0asWCH", "author_site": "Jannik Deuschel, Caleb Ellington, Yingtao Luo, Ben Lengerich, Pascal Friederich, Eric Xing", "tldr": "", "abstract": "Interpretable policy learning seeks to estimate intelligible decision policies from observed actions; however, existing models force a tradeoff between accuracy and interpretability, limiting data-driven interpretations of human decision-making processes.
Fundamentally, existing approaches are burdened by this tradeoff because they represent the underlying decision process as a universal policy, when in fact human decisions are dynamic and can change drastically under different contexts. Thus, we develop Contextualized Policy Recovery (CPR), which re-frames the problem of modeling complex decision processes as a multi-task learning problem, where each context poses a unique task and complex decision policies can be constructed piece-wise from many simple context-specific policies. CPR models each context-specific policy as a linear map, and generates new policy models _on-demand_ as contexts are updated with new observations. We provide two flavors of the CPR framework: one focusing on exact local interpretability, and one retaining full global interpretability. We assess CPR through studies on simulated and real data, achieving state-of-the-art performance on predicting antibiotic prescription in intensive care units ($+22$% AUROC vs. previous SOTA) and predicting MRI prescription for Alzheimer's patients ($+7.7$% AUROC vs. previous SOTA). With this improvement, CPR closes the accuracy gap between interpretable and black-box methods, allowing high-resolution exploration and analysis of context-specific decision models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jannik Deuschel;Caleb Ellington;Yingtao Luo;Ben Lengerich;Pascal Friederich;Eric P. Xing", "authorids": "~Jannik_Deuschel1;~Caleb_Ellington1;~Yingtao_Luo1;~Ben_Lengerich1;~Pascal_Friederich1;~Eric_Xing1", "gender": ";;;;M;M", "homepage": ";;https://yingtaoluo.github.io/;http://web.mit.edu/~blengeri/www/;https://aimat.science;http://www.cs.cmu.edu/~epxing/", "dblp": ";;278/2956;203/8210;182/0165;36/3855", "google_scholar": "eGUxrfQAAAAJ;;g_MmNEoAAAAJ;a1Ck1CMAAAAJ;3B5h6u0AAAAJ;https://scholar.google.com.tw/citations?user=5pKTRxEAAAAJ", "orcid": ";;0000-0003-1794-3657;0000-0001-8690-9554;0000-0003-4465-1465;", "linkedin": "jadeusc/;;;;pascal-friederich-6088b9117/;", "or_profile": "~Jannik_Deuschel1;~Caleb_Ellington1;~Yingtao_Luo1;~Ben_Lengerich1;~Pascal_Friederich1;~Eric_Xing1", "aff": "Karlsruher Institut f\u00fcr Technologie;;Carnegie Mellon University;Massachusetts Institute of Technology;Karlsruher Institut f\u00fcr Technologie;School of Computer Science, Carnegie Mellon University", "aff_domain": "kit.edu;;andrew.cmu.edu;mit.edu;kit.edu;cs.cmu.edu", "position": "MS student;;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ndeuschel2024contextualized,\ntitle={Contextualized Policy Recovery: Modeling and Interpreting Medical Decisions with Adaptive Imitation Learning},\nauthor={Jannik Deuschel and Caleb Ellington and Yingtao Luo and Ben Lengerich and Pascal Friederich and Eric P. 
Xing},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YEQM0asWCH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6664505, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5031244604857473142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "kit.edu;;andrew.cmu.edu;mit.edu;kit.edu;cs.cmu.edu", "author_num": 6, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie;Carnegie Mellon University;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kit.edu;https://www.cmu.edu;https://web.mit.edu", "aff_unique_abbr": "KIT;CMU;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "Germany;United States" }, { "title": "Interpretability Illusions in the Generalization of Simplified Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33777", "id": "YJWlUMW6YP", "proceeding": "https://proceedings.mlr.press/v235/friedman24a.html", "pdf": "https://openreview.net/pdf?id=YJWlUMW6YP", "openreview": "https://openreview.net/forum?id=YJWlUMW6YP", "author_site": "Dan Friedman, Andrew Lampinen, Lucas Dixon, Danqi Chen, Asma Ghandeharioun", "tldr": "", "abstract": "A common method to study deep learning systems is to use simplified model representations\u2014for example, using singular value decomposition to visualize the model\u2019s hidden states in a lower dimensional space. This approach assumes that the results of these simplifications are faithful to the original model. Here, we illustrate an important caveat to this assumption: even if the simplified representations can accurately approximate the full model on the training set, they may fail to accurately capture the model\u2019s behavior out of distribution. We illustrate this by training Transformer models on controlled datasets with systematic generalization splits, including the Dyck balanced-parenthesis languages and a code completion task. We simplify these models using tools like dimensionality reduction and clustering, and then explicitly test how these simplified proxies match the behavior of the original model. We find consistent generalization gaps: cases in which the simplified proxies are more faithful to the original model on the in-distribution evaluations and less faithful on various tests of systematic generalization. This includes cases where the original model generalizes systematically but the simplified proxies fail, and cases where the simplified proxies generalize better. 
Together, our results raise questions about the extent to which mechanistic interpretations derived using tools like SVD can reliably predict what a model will do in novel situations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Friedman;Andrew Kyle Lampinen;Lucas Dixon;Danqi Chen;Asma Ghandeharioun", "authorids": "~Dan_Friedman2;~Andrew_Kyle_Lampinen1;~Lucas_Dixon1;~Danqi_Chen1;~Asma_Ghandeharioun1", "gender": ";M;Not Specified;F;", "homepage": "http://danfriedman0.github.io/;https://github.com/google/BIG-bench;https://research.google/people/lucas-dixon/;https://www.cs.princeton.edu/~danqic/;https://alum.mit.edu/www/asma_gh", "dblp": "205/9386;https://dblp.uni-trier.de/pers/hd/l/Lampinen:Andrew_K=;39/6853;87/7949;124/3110", "google_scholar": "1UMQ_KwAAAAJ;_N44XxAAAAAJ;nDs3-TMAAAAJ;sVR8ktkAAAAJ;CkfQy2gAAAAJ", "orcid": ";;0000-0003-1094-1675;;", "linkedin": ";;lucas-dixon-94070354/;;", "or_profile": "~Dan_Friedman2;~Andrew_Kyle_Lampinen1;~Lucas_Dixon1;~Danqi_Chen1;~Asma_Ghandeharioun1", "aff": "Princeton University;Google DeepMind;Research, Google;Princeton University;Google", "aff_domain": "princeton.edu;google.com;research.google.com;cs.princeton.edu;google.com", "position": "PhD student;Research Scientist;Researcher;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nfriedman2024interpretability,\ntitle={Interpretability Illusions in the Generalization of Simplified Models},\nauthor={Dan Friedman and Andrew Kyle Lampinen and Lucas Dixon and Danqi Chen and Asma Ghandeharioun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YJWlUMW6YP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6039288, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13221559855051788867&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "princeton.edu;google.com;research.google.com;cs.princeton.edu;google.com", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Princeton University;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.princeton.edu;https://deepmind.com", "aff_unique_abbr": "Princeton;DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "What Can Transformer Learn with Varying Depth? Case Studies on Sequence Learning Tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33776", "id": "YNbCbcGyXE", "proceeding": "https://proceedings.mlr.press/v235/chen24bp.html", "pdf": "https://openreview.net/pdf?id=YNbCbcGyXE", "openreview": "https://openreview.net/forum?id=YNbCbcGyXE", "author_site": "Xingwu Chen, Difan Zou", "tldr": "", "abstract": "We study the capabilities of the transformer architecture with varying depth. Specifically, we designed a novel set of sequence learning tasks to systematically evaluate and comprehend how the depth of transformer affects its ability to perform memorization, reasoning, generalization, and contextual generalization. We show a transformer with only one attention layer can excel in memorization but falls short in other tasks. 
Then, we show that exhibiting reasoning and generalization ability requires the transformer to have at least two attention layers, while context generalization ability may necessitate three attention layers. Additionally, we identify a class of simple operations that a single attention layer can execute, and show that the complex tasks can be approached as the combinations of these simple operations and thus can be resolved by stacking multiple attention layers. This sheds light on studying more practical and complex tasks beyond our design. Numerical experiments corroborate our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingwu Chen;Difan Zou", "authorids": "~Xingwu_Chen1;~Difan_Zou1", "gender": "M;M", "homepage": "https://github.com;https://difanzou.github.io/", "dblp": ";161/8923", "google_scholar": ";Cp4fcTQAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Xingwu_Chen1;~Difan_Zou1", "aff": "University of Hong Kong;University of Hong Kong", "aff_domain": "hku.hk;hku.hk", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchen2024what,\ntitle={What Can Transformer Learn with Varying Depth? Case Studies on Sequence Learning Tasks},\nauthor={Xingwu Chen and Difan Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YNbCbcGyXE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2888040, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12214185371456517283&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "hku.hk;hku.hk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Non-Asymptotic Analysis for Single-Loop (Natural) Actor-Critic with Compatible Function Approximation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33775", "id": "YNvGFaOG1p", "proceeding": "https://proceedings.mlr.press/v235/wang24by.html", "pdf": "https://openreview.net/pdf?id=YNvGFaOG1p", "openreview": "https://openreview.net/forum?id=YNvGFaOG1p", "author_site": "Yudan Wang, Yue Wang, Yi Zhou, Shaofeng Zou", "tldr": "", "abstract": "Actor-critic (AC) is a powerful method for learning an optimal policy in reinforcement learning, where the critic uses algorithms, e.g., temporal difference (TD) learning with function approximation, to evaluate the current policy and the actor updates the policy along an approximate gradient direction using information from the critic. This paper provides the *tightest* non-asymptotic convergence bounds for both the AC and natural AC (NAC) algorithms. 
Specifically, existing studies show that AC converges to an $\\epsilon+\\varepsilon_{\\text{critic}}$ neighborhood of stationary points with the best known sample complexity of $\\mathcal{O}(\\epsilon^{-2})$ (up to a log factor), and NAC converges to an $\\epsilon+\\varepsilon_{\\text{critic}}+\\sqrt{\\varepsilon_{\\text{actor}}}$ neighborhood of the global optimum with the best known sample complexity of $\\mathcal{O}(\\epsilon^{-3})$, where $\\varepsilon_{\\text{critic}}$ is the approximation error of the critic and $\\varepsilon_{\\text{actor}}$ is the approximation error induced by the insufficient expressive power of the parameterized policy class. This paper analyzes the convergence of both AC and NAC algorithms with compatible function approximation. Our analysis eliminates the term $\\varepsilon_{\\text{critic}}$ from the error bounds while still achieving the best known sample complexities. Moreover, we focus on the challenging single-loop setting with a single Markovian sample trajectory. Our major technical novelty lies in analyzing the stochastic bias due to policy-dependent and time-varying compatible function approximation in the critic, and handling the non-ergodicity of the MDP due to the single Markovian sample trajectory. Numerical results are also provided in the appendix.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yudan Wang;Yue Wang;Yi Zhou;Shaofeng Zou", "authorids": "~Yudan_Wang1;~Yue_Wang16;~Yi_Zhou2;~Shaofeng_Zou1", "gender": "F;;M;", "homepage": ";https://sites.google.com/view/ywangub;https://sites.google.com/site/yizhouhomepage/home;", "dblp": "189/3319;33/4822-68;;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;ndMi_z8AAAAJ;4fK8bYIAAAAJ;", "orcid": ";0009-0001-9786-052X;;", "linkedin": ";;;", "or_profile": "~Yudan_Wang1;~Yue_Wang16;~Yi_Zhou2;~Shaofeng_Zou1", "aff": "State University of New York at Buffalo;University of Central Florida;University of Utah;", "aff_domain": "buffalo.edu;ucf.edu;utah.edu;", "position": "PhD student;Assistant Professor;Assistant Professor;", "bibtex": "@inproceedings{\nwang2024nonasymptotic,\ntitle={Non-Asymptotic Analysis for Single-Loop (Natural) Actor-Critic with Compatible Function Approximation},\nauthor={Yudan Wang and Yue Wang and Yi Zhou and Shaofeng Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YNvGFaOG1p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 771746, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9640289146638883615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "buffalo.edu;ucf.edu;utah.edu;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "State University of New York at Buffalo;University of Central Florida;University of Utah", "aff_unique_dep": ";;", "aff_unique_url": "https://www.buffalo.edu;https://www.ucf.edu;https://www.utah.edu", "aff_unique_abbr": "SUNY Buffalo;UCF;Utah", "aff_campus_unique_index": "0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Conformal Prediction with Learned Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33774", "id": "YPbcUBcTAk", "proceeding": "https://proceedings.mlr.press/v235/kiyani24a.html", "pdf": "https://openreview.net/pdf?id=YPbcUBcTAk", "openreview": 
"https://openreview.net/forum?id=YPbcUBcTAk", "author_site": "Shayan Kiyani, George J. Pappas, Hamed Hassani", "tldr": "", "abstract": "In this paper, we focus on the problem of conformal prediction with conditional guarantees. Prior work has shown that it is impossible to construct nontrivial prediction sets with full conditional coverage guarantees. A wealth of research has considered relaxations of full conditional guarantees, relying on some *predefined* uncertainty structures. Departing from this line of thinking, we propose Partition Learning Conformal Prediction (PLCP), a framework to improve conditional validity of prediction sets through *learning* uncertainty-guided features from the calibration data. We implement PLCP efficiently with alternating gradient descent, utilizing off-the-shelf machine learning models. We further analyze PLCP theoretically and provide conditional guarantees for infinite and finite sample sizes. Finally, our experimental results over four real-world and synthetic datasets show the superior performance of PLCP compared to state-of-the-art methods in terms of coverage and length in both classification and regression scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shayan Kiyani;George J. Pappas;Hamed Hassani", "authorids": "~Shayan_Kiyani2;~George_J._Pappas1;~Hamed_Hassani2", "gender": "M;M;M", "homepage": ";https://www.seas.upenn.edu/~hassani/;http://www.georgejpappas.org/", "dblp": "303/1168;73/4984;p/GeorgeJPappas", "google_scholar": "R1oEJ0YAAAAJ;;https://scholar.google.com.tw/citations?user=Kia-4B0AAAAJ", "orcid": ";;0000-0001-9081-0637", "linkedin": "shayan-k-b97b6388;;", "or_profile": "~Shayan_Kiyani2;~Hamed_Hassani2;~George_Pappas1", "aff": "University of Pennsylvania;University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania", "aff_domain": "upenn.edu;upenn.edu;seas.upenn.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nkiyani2024conformal,\ntitle={Conformal Prediction with Learned Features},\nauthor={Shayan Kiyani and George J. 
Pappas and Hamed Hassani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YPbcUBcTAk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 977649, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11364538133872685506&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "upenn.edu;upenn.edu;seas.upenn.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Two-Stage Shadow Inclusion Estimation: An IV Approach for Causal Inference under Latent Confounding and Collider Bias", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33773", "id": "YRWdiaupCr", "proceeding": "https://proceedings.mlr.press/v235/li24bu.html", "pdf": "https://openreview.net/pdf?id=YRWdiaupCr", "openreview": "https://openreview.net/forum?id=YRWdiaupCr", "author_site": "Baohong Li, Anpeng Wu, Ruoxuan Xiong, Kun Kuang", "tldr": "", "abstract": "Latent confounding bias and collider bias are two key challenges of causal inference in observational studies. Latent confounding bias occurs when failing to control the unmeasured covariates that are common causes of treatments and outcomes, which can be addressed by using the Instrumental Variable (IV) approach. Collider bias comes from non-random sample selection caused by both treatments and outcomes, which can be addressed by using a different type of instruments, i.e., shadow variables. However, in most scenarios, these two biases simultaneously exist in observational data, and the previous methods focusing on either one are inadequate. To the best of our knowledge, no approach has been developed for causal inference when both biases exist. In this paper, we propose a novel IV approach, Two-Stage Shadow Inclusion (2SSI), which can simultaneously address latent confounding bias and collider bias by utilizing the residual of the treatment as a shadow variable. 
Extensive experimental results on benchmark synthetic datasets and a real-world dataset show that 2SSI achieves a noticeable performance improvement over existing methods when both biases exist.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Baohong Li;Anpeng Wu;Ruoxuan Xiong;Kun Kuang", "authorids": "~Baohong_Li1;~Anpeng_Wu1;~Ruoxuan_Xiong1;~Kun_Kuang1", "gender": "M;M;;M", "homepage": ";https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ&hl=zh-CN&oi=sra;http://www.ruoxuanxiong.com/;http://kunkuang.github.io", "dblp": "83/3116;267/5637;222/2927;194/4245", "google_scholar": "M08DvYsAAAAJ;https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ;lg_0u-0AAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ", "orcid": "0000-0002-3222-002X;0000-0003-3898-7122;;0009-0000-7528-8131", "linkedin": ";;;", "or_profile": "~Baohong_Li1;~Anpeng_Wu1;~Ruoxuan_Xiong1;~Kun_Kuang1", "aff": "Zhejiang University;Mohamed bin Zayed University of Artificial Intelligence;Emory University;Zhejiang University", "aff_domain": "zju.edu.cn;mbzuai.ac.ae;emory.edu;zju.edu.cn", "position": "PhD student;Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024twostage,\ntitle={Two-Stage Shadow Inclusion Estimation: An {IV} Approach for Causal Inference under Latent Confounding and Collider Bias},\nauthor={Baohong Li and Anpeng Wu and Ruoxuan Xiong and Kun Kuang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YRWdiaupCr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11100337737421817727&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "zju.edu.cn;mbzuai.ac.ae;emory.edu;zju.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Zhejiang University;Mohamed bin Zayed University of Artificial Intelligence;Emory University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://mbzuai.ac.ae;https://www.emory.edu", "aff_unique_abbr": "ZJU;MBZUAI;Emory", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "China;United Arab Emirates;United States" }, { "title": "RL-VLM-F: Reinforcement Learning from Vision Language Foundation Model Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33772", "id": "YSoMmNWZZx", "proceeding": "https://proceedings.mlr.press/v235/wang24bn.html", "pdf": "https://openreview.net/pdf?id=YSoMmNWZZx", "openreview": "https://openreview.net/forum?id=YSoMmNWZZx", "author_site": "Yufei Wang, Zhanyi Sun, Jesse Zhang, Zhou Xian, Erdem Biyik, David Held, Zackory Erickson", "tldr": "", "abstract": "Reward engineering has long been a challenge in Reinforcement Learning (RL) research, as it often requires extensive human effort and iterative processes of trial-and-error to design effective reward functions. In this paper, we propose RL-VLM-F, a method that automatically generates reward functions for agents to learn new tasks, using only a text description of the task goal and the agent's visual observations, by leveraging feedback from vision language foundation models (VLMs).
The key to our approach is to query these models to give preferences over pairs of the agent's image observations based on the text description of the task goal, and then learn a reward function from the preference labels, rather than directly prompting these models to output a raw reward score, which can be noisy and inconsistent. We demonstrate that RL-VLM-F successfully produces effective rewards and policies across various domains \u2014 including classic control, as well as manipulation of rigid, articulated, and deformable objects \u2014 without the need for human supervision, outperforming prior methods that use large pretrained models for reward generation under the same assumptions. Videos can be found on our project website: https://rlvlmf2024.github.io/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yufei Wang;Zhanyi Sun;Jesse Zhang;Zhou Xian;Erdem Biyik;David Held;Zackory Erickson", "authorids": "~Yufei_Wang4;~Zhanyi_Sun1;~Jesse_Zhang3;~Zhou_Xian1;~Erdem_Biyik1;~David_Held1;~Zackory_Erickson1", "gender": ";;M;M;M;M;M", "homepage": "https://yufeiwang63.github.io/;;https://jessezhang.net;;http://people.eecs.berkeley.edu/~ebiyik/;http://davheld.github.io/;https://zackory.com", "dblp": ";324/2470;;258/5020;194/2736;22/11147;", "google_scholar": "HQl9718AAAAJ;9qap4XMAAAAJ;fSXCOfEAAAAJ;;https://scholar.google.com.tr/citations?user=P-G3sjYAAAAJ;0QtU-NsAAAAJ;wElkTtIAAAAJ", "orcid": ";;;;0000-0002-9516-3130;;", "linkedin": ";;;;https://linkedin.com/in/ebiyik;;", "or_profile": "~Yufei_Wang4;~Zhanyi_Sun1;~Jesse_Zhang3;~Zhou_Xian1;~Erdem_Biyik1;~David_Held1;~Zackory_Erickson1", "aff": "School of Computer Science, Carnegie Mellon University;Carnegie Mellon University;NVIDIA;Carnegie Mellon University;University of Southern California;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;andrew.cmu.edu;nvidia.com;cmu.edu;usc.edu;cmu.edu;cmu.edu", "position": "PhD student;MS student;Intern;PhD student;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024rlvlmf,\ntitle={{RL}-{VLM}-F: Reinforcement Learning from Vision Language Foundation Model Feedback},\nauthor={Yufei Wang and Zhanyi Sun and Jesse Zhang and Zhou Xian and Erdem Biyik and David Held and Zackory Erickson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YSoMmNWZZx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4789982, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13090870008751035680&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "cs.cmu.edu;andrew.cmu.edu;nvidia.com;cmu.edu;usc.edu;cmu.edu;cmu.edu", "author_num": 7, "aff_unique_index": "0;0;1;0;2;0;0", "aff_unique_norm": "Carnegie Mellon University;NVIDIA;University of Southern California", "aff_unique_dep": "School of Computer Science;NVIDIA Corporation;", "aff_unique_url": "https://www.cmu.edu;https://www.nvidia.com;https://www.usc.edu", "aff_unique_abbr": "CMU;NVIDIA;USC", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Pittsburgh;;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "OptiMUS: Scalable Optimization Modeling with (MI)LP Solvers and Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33771", "id": "YT1dtdLvSN", "proceeding": 
"https://proceedings.mlr.press/v235/ahmaditeshnizi24a.html", "pdf": "https://openreview.net/pdf?id=YT1dtdLvSN", "openreview": "https://openreview.net/forum?id=YT1dtdLvSN", "author_site": "Ali AhmadiTeshnizi, Wenzhi Gao, Madeleine Udell", "tldr": "", "abstract": "Optimization problems are pervasive in sectors from manufacturing and distribution to healthcare. However, most such problems are still solved heuristically by hand rather than optimally by state-of-the-art solvers because the expertise required to formulate and solve these problems limits the widespread adoption of optimization tools and techniques. This paper introduces OptiMUS, a Large Language Model (LLM)-based agent designed to formulate and solve (mixed integer) linear programming problems from their natural language descriptions. OptiMUS can develop mathematical models, write and debug solver code, evaluate the generated solutions, and improve its model and code based on these evaluations. OptiMUS utilizes a modular structure to process problems, allowing it to handle problems with long descriptions and complex data without long prompts. Experiments demonstrate that OptiMUS outperforms existing state-of-the-art methods on easy datasets by more than $20$% and on hard datasets (including a new dataset, NLP4LP, released with this paper that features long and complex problems) by more than $30$%. The implementation and the datasets are available at https://github.com/teshnizi/OptiMUS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ali AhmadiTeshnizi;Wenzhi Gao;Madeleine Udell", "authorids": "~Ali_AhmadiTeshnizi1;~Wenzhi_Gao1;~Madeleine_Udell1", "gender": ";M;F", "homepage": "https://teshnizi.github.io/;https://github.com/Gwzwpxz;https://people.orie.cornell.edu/mru8", "dblp": ";;153/2166", "google_scholar": "475ARYgAAAAJ;4lDkX_YAAAAJ;tZ9pEDMAAAAJ", "orcid": ";;0000-0002-3985-915X", "linkedin": "teshnizi/;;", "or_profile": "~Ali_AhmadiTeshnizi1;~Wenzhi_Gao1;~Madeleine_Udell1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nahmaditeshnizi2024optimus,\ntitle={Opti{MUS}: Scalable Optimization Modeling with ({MI}){LP} Solvers and Large Language Models},\nauthor={Ali AhmadiTeshnizi and Wenzhi Gao and Madeleine Udell},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YT1dtdLvSN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 851632, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8278706928773927499&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "stanford.edu;stanford.edu;stanford.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "To Cool or not to Cool? 
Temperature Network Meets Large Foundation Models via DRO", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33770", "id": "YWuSLBkfOw", "proceeding": "https://proceedings.mlr.press/v235/qiu24c.html", "pdf": "https://openreview.net/pdf?id=YWuSLBkfOw", "openreview": "https://openreview.net/forum?id=YWuSLBkfOw", "author_site": "Zi-Hao Qiu, Siqi Guo, Mao Xu, Tuo Zhao, Lijun Zhang, Tianbao Yang", "tldr": "", "abstract": "The temperature parameter plays a profound role during training and/or inference with large foundation models (LFMs) such as large language models (LLMs) and CLIP models. Particularly, it adjusts the logits in the softmax function in LLMs, which is crucial for next token generation, and it scales the similarities in the contrastive loss for training CLIP models. A significant question remains: `` Is it viable to learn a neural network to predict a personalized temperature of any input data for enhancing LFMs?\" In this paper, we present a principled framework for learning a small yet generalizable temperature prediction network (TempNet) to improve LFMs. Our solution is composed of a novel learning framework with robust losses underpinned by constrained distributionally robust optimization (DRO), and a properly designed TempNet with theoretical inspiration. TempNet can be trained together with a large foundation model from scratch or learned separately given a pretrained foundation model. It is not only useful for predicting personalized temperature to promote the training of LFMs but also generalizable and transferable to new tasks. Our experiments on LLMs and CLIP models demonstrate that TempNet greatly improves the performance of existing solutions or models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zi-Hao Qiu;Siqi Guo;Mao Xu;Tuo Zhao;Lijun Zhang;Tianbao Yang", "authorids": "~Zi-Hao_Qiu1;siqi@tamu.edu;xumao@lamda.nju.edu.cn;~Tuo_Zhao2;~Lijun_Zhang1;~Tianbao_Yang1", "gender": "M;;;;;M", "homepage": "http://www.lamda.nju.edu.cn/qiuzh/;;;;;https://people.tamu.edu/~tianbao-yang/publications.html", "dblp": ";;;;;56/7047", "google_scholar": ";;;;;https://scholar.google.com.tw/citations?user=BCxFU0EAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Zi-Hao_Qiu1;siqi@tamu.edu;xumao@lamda.nju.edu.cn;~Tuo_Zhao2;~Lijun_Zhang1;~Tianbao_Yang1", "aff": "Nanjing University;;;;;Texas A&M University - College Station", "aff_domain": "nju.edu.cn;;;;;tamu.edu", "position": "PhD student;;;;;Associate Professor", "bibtex": "@inproceedings{\nqiu2024to,\ntitle={To Cool or not to Cool? 
Temperature Network Meets Large Foundation Models via {DRO}},\nauthor={Zi-Hao Qiu and Siqi Guo and Mao Xu and Tuo Zhao and Lijun Zhang and Tianbao Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YWuSLBkfOw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 931021, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13923097716247754301&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;;;;;tamu.edu", "author_num": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Nanjing University;Texas A&M University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.tamu.edu", "aff_unique_abbr": "Nanjing U;TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Harmonizing Generalization and Personalization in Federated Prompt Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33769", "id": "YYwERRXsJW", "proceeding": "https://proceedings.mlr.press/v235/cui24c.html", "pdf": "https://openreview.net/pdf?id=YYwERRXsJW", "openreview": "https://openreview.net/forum?id=YYwERRXsJW", "author_site": "Tianyu Cui, Hongxia Li, Jingya Wang, Ye Shi", "tldr": "", "abstract": "Federated Prompt Learning (FPL) incorporates large pre-trained Vision-Language models (VLM) into federated learning through prompt tuning. The transferable representations and remarkable generalization capacity of VLM make them highly compatible with the integration of federated learning. Addressing data heterogeneity in federated learning requires personalization, but excessive focus on it across clients could compromise the model's ability to generalize effectively. To preserve the impressive generalization capability of VLM, it is crucial to strike a balance between personalization and generalization in FPL. To tackle this challenge, we propose Federated Prompt Learning with CLIP Generalization and low-rank Personalization (FedPGP), which employs pre-trained CLIP to provide knowledge guidance on the global prompt for improved generalization and incorporates a low-rank adaptation term to personalize the global prompt. Further, FedPGP integrates a prompt-wise contrastive loss to achieve knowledge guidance and personalized adaptation simultaneously, enabling a harmonious balance between personalization and generalization in FPL.
We conduct extensive experiments on various datasets to explore base-to-novel generalization in both category-level and domain-level scenarios with heterogeneous data, showing the superiority of FedPGP in balancing generalization and personalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianyu Cui;Hongxia Li;Jingya Wang;Ye Shi", "authorids": "~Tianyu_Cui2;~Hongxia_Li1;~Jingya_Wang3;~Ye_Shi1", "gender": ";F;F;M", "homepage": "https://github.com/TianyuCuiOvO;https://scholar.google.com/;https://faculty.sist.shanghaitech.edu.cn/faculty/wangjingya/;http://faculty.sist.shanghaitech.edu.cn/faculty/shiye", "dblp": ";https://dblp.org/rec/journals/corr/abs-2211-01572;;34/11191-1", "google_scholar": "TGB4zWUAAAAJ;;https://scholar.google.com.au/citations?user=vmvJV_IAAAAJ;gMqbZPUAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Tianyu_Cui2;~Hongxia_Li1;~Jingya_Wang3;~Ye_Shi1", "aff": "ShanghaiTech University;ShanghaiTech University;ShanghaiTech University;ShanghaiTech University", "aff_domain": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "position": "MS student;MS student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ncui2024harmonizing,\ntitle={Harmonizing Generalization and Personalization in Federated Prompt Learning},\nauthor={Tianyu Cui and Hongxia Li and Jingya Wang and Ye Shi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YYwERRXsJW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 761108, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7624684548714340768&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 7, "email": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "ShanghaiTech University", "aff_unique_dep": "", "aff_unique_url": "https://www.shanghaitech.edu.cn", "aff_unique_abbr": "ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33768", "id": "YbHCqn4qF4", "proceeding": "https://proceedings.mlr.press/v235/zhu24f.html", "pdf": "https://openreview.net/pdf?id=YbHCqn4qF4", "openreview": "https://openreview.net/forum?id=YbHCqn4qF4", "author_site": "Lianghui Zhu, Bencheng Liao, Qian Zhang, Xinlong Wang, Wenyu Liu, Xinggang Wang", "tldr": "", "abstract": "Recently the state space models (SSMs) with efficient hardware-aware designs, i.e., the Mamba deep learning model, have shown great potential for long sequence modeling. Meanwhile building efficient and generic vision backbones purely upon SSMs is an appealing direction. However, representing visual data is challenging for SSMs due to the position-sensitivity of visual data and the requirement of global context for visual understanding. In this paper, we show that the reliance on self-attention for visual representation learning is not necessary and propose a new generic vision backbone with bidirectional Mamba blocks (Vim), which marks the image sequences with position embeddings and compresses the visual representation with bidirectional state space models. 
On ImageNet classification, COCO object detection, and ADE20k semantic segmentation tasks, Vim achieves higher performance compared to well-established vision transformers like DeiT, while also demonstrating significantly improved computation & memory efficiency. For example, Vim is 2.8x faster than DeiT and saves 86.8% GPU memory when performing batch inference to extract features on images with a resolution of 1248x1248. The results demonstrate that Vim is capable of overcoming the computation & memory constraints on performing Transformer-style understanding for high-resolution images and it has great potential to be the next-generation backbone for vision foundation models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lianghui Zhu;Bencheng Liao;Qian Zhang;Xinlong Wang;Wenyu Liu;Xinggang Wang", "authorids": "~Lianghui_Zhu3;~Bencheng_Liao1;~Qian_Zhang7;~Xinlong_Wang2;~Wenyu_Liu3;~Xinggang_Wang1", "gender": "M;M;M;M;M;M", "homepage": "https://github.com/LegendBC;;;http://eic.hust.edu.cn/professor/liuwenyu/;https://xwcv.github.io/index.htm;https://github.com/Unrealluver", "dblp": "289/0295;04/2024-9;;42/4110-1.html;95/3056;", "google_scholar": "rUBdh_sAAAAJ;pCY-bikAAAAJ;DPz0DjYAAAAJ;D7jDk7gAAAAJ;qNCTLV0AAAAJ;NvMHcs0AAAAJ", "orcid": ";;;0000-0002-4582-7488;0000-0001-6732-7823;", "linkedin": "%E6%9C%AC%E6%88%90-%E5%BB%96-21352b133/;;;;;", "or_profile": "~Bencheng_Liao1;~Qian_Zhang7;~Xinlong_Wang2;~Wenyu_Liu3;~Xinggang_Wang1;~Lianghui_Zhu2", "aff": "Huazhong University of Science and Technology;Horizon Robotics;Beijing Academy of Artificial Intelligence;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;horizon.cc;baai.ac.cn;hust.edu.cn;hust.edu.cn;hust.edu.cn", "position": "PhD student;Researcher;Researcher;Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nzhu2024vision,\ntitle={Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model},\nauthor={Lianghui Zhu and Bencheng Liao and Qian Zhang and Xinlong Wang and Wenyu Liu and Xinggang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YbHCqn4qF4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1001815, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "hust.edu.cn;horizon.cc;baai.ac.cn;hust.edu.cn;hust.edu.cn;hust.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;0;0;0", "aff_unique_norm": "Huazhong University of Science and Technology;Horizon Robotics;Beijing Academy of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "http://www.hust.edu.cn;https://www.horizon-robotics.com/;https://www.baaic.cn", "aff_unique_abbr": "HUST;Horizon Robotics;BAAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Unified Training of Universal Time Series Forecasting Transformers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33767", "id": "Yd8eHMY1wz", "proceeding": "https://proceedings.mlr.press/v235/woo24a.html", "pdf": "https://openreview.net/pdf?id=Yd8eHMY1wz", "openreview": "https://openreview.net/forum?id=Yd8eHMY1wz", "author_site": "Gerald Woo, Chenghao Liu, Akshat Kumar, Caiming Xiong, Silvio Savarese, Doyen Sahoo", 
"tldr": "", "abstract": "Deep learning for time series forecasting has traditionally operated within a one-model-per-dataset framework, limiting its potential to leverage the game-changing impact of large pre-trained models. The concept of *universal forecasting*, emerging from pre-training on a vast collection of time series datasets, envisions a single Large Time Series Model capable of addressing diverse downstream forecasting tasks. However, constructing such a model poses unique challenges specific to time series data: (i) cross-frequency learning, (ii) accommodating an arbitrary number of variates for multivariate time series, and (iii) addressing the varying distributional properties inherent in large-scale data. To address these challenges, we present novel enhancements to the conventional time series Transformer architecture, resulting in our proposed **M**asked Enc**o**der-based Un**i**ve**r**s**a**l T**i**me Series Forecasting Transformer (**Moirai**). Trained on our newly introduced Large-scale Open Time Series Archive (LOTSA) featuring over 27B observations across nine domains, Moirai achieves competitive or superior performance as a zero-shot forecaster when compared to full-shot models. Code, data, and model weights can be found at https://github.com/SalesforceAIResearch/uni2ts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gerald Woo;Chenghao Liu;Akshat Kumar;Caiming Xiong;Silvio Savarese;Doyen Sahoo", "authorids": "~Gerald_Woo1;~Chenghao_Liu1;~Akshat_Kumar2;~Caiming_Xiong1;~Silvio_Savarese1;~Doyen_Sahoo1", "gender": "M;M;M;M;M;M", "homepage": ";;http://www.smu.edu.sg/faculty/profile/102291/Akshat-KUMAR;http://cmxiong.com/;;https://www.linkedin.com/in/doyensahoo/?originalSubdomain=sg", "dblp": "246/5297;;73/193;80/7282;50/3578;151/3155", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=zsYC3R0AAAAJ;vaSdahkAAAAJ;ImpbxLsAAAAJ;https://scholar.google.com.sg/citations?hl=en", "orcid": ";;;;;", "linkedin": "gerald-woo/;chenghao-liu-40a62a56/;;caiming-xiong-150a1417;;doyensahoo/?originalSubdomain=sg", "or_profile": "~Gerald_Woo1;~Chenghao_Liu1;~Akshat_Kumar2;~Caiming_Xiong1;~Silvio_Savarese1;~Doyen_Sahoo1", "aff": "Singapore Management University;Salesforce AI Research;Singapore Management University;Salesforce Research;Stanford University;SalesForce.com", "aff_domain": "smu.edu.sg;salesforce.com;smu.edu.sg;salesforce.com;stanford.edu;salesforce.com", "position": "PhD student;Researcher;Associate Professor;Research Scientist;Adjunct Professor;Researcher", "bibtex": "@inproceedings{\nwoo2024unified,\ntitle={Unified Training of Universal Time Series Forecasting Transformers},\nauthor={Gerald Woo and Chenghao Liu and Akshat Kumar and Caiming Xiong and Silvio Savarese and Doyen Sahoo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Yd8eHMY1wz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1387318, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 175, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8987509541718340256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "smu.edu.sg;salesforce.com;smu.edu.sg;salesforce.com;stanford.edu;salesforce.com", "author_num": 6, "aff_unique_index": "0;1;0;1;2;1", "aff_unique_norm": "Singapore Management University;Salesforce;Stanford University", "aff_unique_dep": ";Salesforce AI Research;", 
"aff_unique_url": "https://www.smu.edu.sg;https://www.salesforce.com;https://www.stanford.edu", "aff_unique_abbr": "SMU;Salesforce AI;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0;1;1;1", "aff_country_unique": "Singapore;United States" }, { "title": "Improving Interpretation Faithfulness for Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33766", "id": "YdwwWRX20q", "proceeding": "https://proceedings.mlr.press/v235/hu24k.html", "pdf": "https://openreview.net/pdf?id=YdwwWRX20q", "openreview": "https://openreview.net/forum?id=YdwwWRX20q", "author_site": "Lijie Hu, Yixin Liu, Ninghao Liu, Mengdi Huai, Lichao Sun, Di Wang", "tldr": "", "abstract": "Vision Transformers (ViTs) have achieved state-of-the-art performance for various vision tasks. One reason behind the success lies in their ability to provide plausible innate explanations for the behavior of neural architectures. However, ViTs suffer from issues with explanation faithfulness, as their focal points are fragile to adversarial attacks and can be easily changed with even slight perturbations on the input image. In this paper, we propose a rigorous approach to mitigate these issues by introducing Faithful ViTs (FViTs). Briefly speaking, an FViT should have the following two properties: (1) The top-$k$ indices of its self-attention vector should remain mostly unchanged under input perturbation, indicating stable explanations; (2) The prediction distribution should be robust to perturbations. To achieve this, we propose a new method called Denoised Diffusion Smoothing (DDS), which adopts randomized smoothing and diffusion-based denoising. We theoretically prove that processing ViTs directly with DDS can turn them into FViTs. We also show that Gaussian noise is nearly optimal for both $\\ell_2$ and $\\ell_\\infty$-norm cases. Finally, we demonstrate the effectiveness of our approach through comprehensive experiments and evaluations. 
Results show that FViTs are more robust against adversarial attacks while maintaining the explainability of attention, indicating higher faithfulness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lijie Hu;Yixin Liu;Ninghao Liu;Mengdi Huai;Lichao Sun;Di Wang", "authorids": "~Lijie_Hu1;~Yixin_Liu4;~Ninghao_Liu2;~Mengdi_Huai1;~Lichao_Sun1;~Di_Wang1", "gender": "F;;;F;M;", "homepage": "https://lijie-hu.github.io;;;https://mdhuai.github.io/;https://lichao-sun.github.io/;", "dblp": "90/8790;;;150/8482;121/0780-1.html;", "google_scholar": "C-3UuqsAAAAJ;;;40ZYTzEAAAAJ;WhGUE7AAAAAJ;", "orcid": ";;;0000-0001-6368-5973;;", "linkedin": "lijie-hu-98045a126;;;;lichao-sun-b273a290/;", "or_profile": "~Lijie_Hu1;~Yixin_Liu4;~Ninghao_Liu2;~Mengdi_Huai1;~Lichao_Sun1;~Di_Wang1", "aff": "KAUST;;;Iowa State University;Lehigh University;", "aff_domain": "kaust.edu.sa;;;iastate.edu;lehigh.edu;", "position": "PhD student;;;Assistant Professor;Assistant Professor;", "bibtex": "@inproceedings{\nhu2024improving,\ntitle={Improving Interpretation Faithfulness for Vision Transformers},\nauthor={Lijie Hu and Yixin Liu and Ninghao Liu and Mengdi Huai and Lichao Sun and Di Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YdwwWRX20q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1839328, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15056408236812178816&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "kaust.edu.sa;;;iastate.edu;lehigh.edu;", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "King Abdullah University of Science and Technology;Iowa State University;Lehigh University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaust.edu.sa;https://www.iastate.edu;https://www.lehigh.edu", "aff_unique_abbr": "KAUST;ISU;Lehigh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Saudi Arabia;United States" }, { "title": "Stability and Multigroup Fairness in Ranking with Uncertain Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33765", "id": "YiblhkVl2w", "proceeding": "https://proceedings.mlr.press/v235/devic24a.html", "pdf": "https://openreview.net/pdf?id=YiblhkVl2w", "openreview": "https://openreview.net/forum?id=YiblhkVl2w", "author_site": "Siddartha Devic, Aleksandra Korolova, David Kempe, Vatsal Sharan", "tldr": "", "abstract": "Rankings are ubiquitous across many applications, from search engines to hiring committees. In practice, many rankings are derived from the output of predictors. However, when predictors trained for classification tasks have intrinsic uncertainty, it is not obvious how this uncertainty should be represented in the derived rankings. Our work considers ranking functions: maps from individual predictions for a classification task to distributions over rankings. We focus on two aspects of ranking functions: stability to perturbations in predictions and fairness towards both individuals and subgroups. Not only is stability an important requirement for its own sake, but --- as we show --- it composes harmoniously with individual fairness in the sense of Dwork et al. (2012). 
While deterministic ranking functions cannot be stable aside from trivial scenarios, we show that the recently proposed uncertainty aware (UA) ranking functions of Singh et al. (2021) are stable. Our main result is that UA rankings also achieve group fairness through successful composition with multiaccurate or multicalibrated predictors. Our work demonstrates that UA rankings naturally interpolate between group and individual level fairness guarantees, while simultaneously satisfying stability guarantees important whenever machine-learned predictions are used.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siddartha Devic;Aleksandra Korolova;David Kempe;Vatsal Sharan", "authorids": "~Siddartha_Devic1;~Aleksandra_Korolova1;~David_Kempe1;~Vatsal_Sharan1", "gender": ";;;M", "homepage": "http://sid.devic.us/;;https://david-kempe.com;https://vatsalsharan.github.io/", "dblp": "239/8389;;;126/2543", "google_scholar": "LVL-kmUAAAAJ;;;Ize17HEAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Siddartha_Devic1;~Aleksandra_Korolova1;~David_Kempe1;~Vatsal_Sharan1", "aff": "Amazon;;University of Southern California;University of Southern California", "aff_domain": "amazon.com;;usc.edu;usc.edu", "position": "Intern;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\ndevic2024stability,\ntitle={Stability and Multigroup Fairness in Ranking with Uncertain Predictions},\nauthor={Siddartha Devic and Aleksandra Korolova and David Kempe and Vatsal Sharan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YiblhkVl2w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 847493, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17003633623005670755&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "amazon.com;;usc.edu;usc.edu", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;University of Southern California", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.usc.edu", "aff_unique_abbr": "Amazon;USC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Supervised Matrix Factorization: Local Landscape Analysis and Applications", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33764", "id": "YlJy1FcM9E", "proceeding": "https://proceedings.mlr.press/v235/lee24p.html", "pdf": "https://openreview.net/pdf?id=YlJy1FcM9E", "openreview": "https://openreview.net/forum?id=YlJy1FcM9E", "author_site": "Joowon Lee, Hanbaek Lyu, Weixin Yao", "tldr": "", "abstract": "Supervised matrix factorization (SMF) is a classical machine learning method that seeks low-dimensional feature extraction and classification tasks at the same time. Training an SMF model involves solving a non-convex and factor-wise constrained optimization problem with at least three blocks of parameters. Due to the high non-convexity and constraints, theoretical understanding of the optimization landscape of SMF has been limited. In this paper, we provide an extensive local landscape analysis for SMF and derive several theoretical and practical applications. Analyzing diagonal blocks of the Hessian naturally leads to a block coordinate descent (BCD) algorithm with adaptive step sizes. 
We provide global convergence and iteration complexity guarantees for this algorithm. Full Hessian analysis gives minimum $L_{2}$-regularization to guarantee local strong convexity and robustness of parameters. We establish a local estimation guarantee under a statistical SMF model. We also propose a novel GPU-friendly neural implementation of the BCD algorithm and validate our theoretical findings through numerical experiments. Our work contributes to a deeper understanding of SMF optimization, offering insights into the optimization landscape and providing practical solutions to enhance its performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joowon Lee;Hanbaek Lyu;Weixin Yao", "authorids": "~Joowon_Lee1;~Hanbaek_Lyu1;~Weixin_Yao1", "gender": "F;;M", "homepage": "https://ljw9510.github.io/joowonlee/;https://www.hanbaeklyu.com;https://faculty.ucr.edu/~weixiny/", "dblp": ";;", "google_scholar": "uBtJWX8AAAAJ;gDFWvgQAAAAJ;Kd1Ivt4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Joowon_Lee1;~Hanbaek_Lyu1;~Weixin_Yao1", "aff": "University of Wisconsin - Madison;University of Wisconsin, Madison;University of California, Riverside", "aff_domain": "wisc.edu;wisc.edu;ucr.edu", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nlee2024supervised,\ntitle={Supervised Matrix Factorization: Local Landscape Analysis and Applications},\nauthor={Joowon Lee and Hanbaek Lyu and Weixin Yao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YlJy1FcM9E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2527679, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6314679945101271634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "wisc.edu;wisc.edu;ucr.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin;University of California, Riverside", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu;https://www.ucr.edu", "aff_unique_abbr": "UW-Madison;UW;UCR", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Madison;Riverside", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Enhancing Cross-Modal Fine-Tuning with Gradually Intermediate Modality Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33763", "id": "YlcSyCz21c", "proceeding": "https://proceedings.mlr.press/v235/cai24c.html", "pdf": "https://openreview.net/pdf?id=YlcSyCz21c", "openreview": "https://openreview.net/forum?id=YlcSyCz21c", "author_site": "Lincan Cai, Shuang Li, Wenxuan Ma, Jingxuan Kang, Binhui Xie, Zixun Sun, Chengwei Zhu", "tldr": "", "abstract": "Large-scale pretrained models have proven immensely valuable in handling data-intensive modalities like text and image. However, fine-tuning these models for certain specialized modalities, such as protein sequence and cosmic ray, poses challenges due to the significant modality discrepancy and scarcity of labeled data. In this paper, we propose an end-to-end method, **PaRe**, to enhance cross-modal fine-tuning, aiming to transfer a large-scale pretrained model to various target modalities. **PaRe** employs a gating mechanism to select key patches from both source and target data. 
Through a modality-agnostic **Pa**tch **Re**placement scheme, these patches are preserved and combined to construct data-rich intermediate modalities ranging from easy to hard. By gradually generating intermediate modalities, we can not only effectively bridge the modality gap to enhance stability and transferability of cross-modal fine-tuning, but also address the challenge of limited data in the target modality by leveraging enriched intermediate modality data. Compared with hand-designed, general-purpose, task-specific, and state-of-the-art cross-modal fine-tuning approaches, **PaRe** demonstrates superior performance across three challenging benchmarks, encompassing more than ten modalities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lincan Cai;Shuang Li;Wenxuan Ma;Jingxuan Kang;Binhui Xie;Zixun Sun;Chengwei Zhu", "authorids": "~Lincan_Cai1;~Shuang_Li6;~Wenxuan_Ma2;~Jingxuan_Kang1;~Binhui_Xie1;~Zixun_Sun1;~Chengwei_Zhu1", "gender": ";M;M;;M;;M", "homepage": "https://github.com/cailincan0129;https://shuangli.xyz;;;https://binhuixie.github.io/;;", "dblp": ";43/6294-8;289/0784-1;;;;", "google_scholar": "wH-dNbAAAAAJ;VXCiAc4AAAAJ;u7aJOt8AAAAJ;;cbVMMCwAAAAJ;;FUGRBLcAAAAJ", "orcid": ";0000-0001-6807-9905;0000-0001-5402-6028;;;;", "linkedin": ";;;;;;", "or_profile": "~Lincan_Cai1;~Shuang_Li6;~Wenxuan_Ma2;~Jingxuan_Kang1;~Binhui_Xie1;~Zixun_Sun1;~Chengwei_Zhu1", "aff": "Beijing Institute of Technology;Beijing Institute of Technology;Beijing Institute of Technology;;Beijing Institute of Technology;;Tencent", "aff_domain": "bit.edu.cn;bit.edu.cn;bit.edu.cn;;bit.edu.cn;;tencent.com", "position": "MS student;Associate Professor;MS student;;PhD student;;Researcher", "bibtex": "@inproceedings{\ncai2024enhancing,\ntitle={Enhancing Cross-Modal Fine-Tuning with Gradually Intermediate Modality Generation},\nauthor={Lincan Cai and Shuang Li and Wenxuan Ma and Jingxuan Kang and Binhui Xie and Zixun Sun and Chengwei Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YlcSyCz21c}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7070472, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3056797778855963124&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "bit.edu.cn;bit.edu.cn;bit.edu.cn;;bit.edu.cn;;tencent.com", "author_num": 7, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Beijing Institute of Technology;Tencent", "aff_unique_dep": ";Tencent Holdings Limited", "aff_unique_url": "http://www.bit.edu.cn/;https://www.tencent.com", "aff_unique_abbr": "BIT;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Unveiling the Cycloid Trajectory of EM Iterations in Mixed Linear Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33762", "id": "Yn8xnK90mS", "proceeding": "https://proceedings.mlr.press/v235/luo24c.html", "pdf": "https://openreview.net/pdf?id=Yn8xnK90mS", "openreview": "https://openreview.net/forum?id=Yn8xnK90mS", "author_site": "Zhankun Luo, Abolfazl Hashemi", "tldr": "", "abstract": "We study the trajectory of iterations and the convergence rates of the Expectation-Maximization (EM) algorithm for two-component Mixed Linear Regression (2MLR).
The fundamental goal of MLR is to learn the regression models from unlabeled observations. The EM algorithm finds extensive applications in solving the mixture of linear regressions. Recent results have established the super-linear convergence of EM for 2MLR in the noiseless and high SNR settings under some assumptions and its global convergence rate with random initialization has been affirmed. However, the exponent of convergence has not been theoretically estimated and the geometric properties of the trajectory of EM iterations are not well-understood. In this paper, first, using Bessel functions we provide explicit closed-form expressions for the EM updates under all SNR regimes. Then, in the noiseless setting, we completely characterize the behavior of EM iterations by deriving a recurrence relation at the population level and notably show that all the iterations lie on a certain cycloid. Based on this new trajectory-based analysis, we exhibit the theoretical estimate for the exponent of super-linear convergence and further improve the statistical error bound at the finite-sample level. Our analysis provides a new framework for studying the behavior of EM for Mixed Linear Regression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhankun Luo;Abolfazl Hashemi", "authorids": "~Zhankun_Luo1;~Abolfazl_Hashemi1", "gender": "Not Specified;M", "homepage": "https://zhankunluo.com/;https://abolfazlh.github.io/", "dblp": "277/0238.html;176/5595", "google_scholar": "ciMjrQ0AAAAJ;Se7mocgAAAAJ", "orcid": "0000-0002-3626-1988;0000-0002-8421-4270", "linkedin": ";abolfazlh", "or_profile": "~Zhankun_Luo1;~Abolfazl_Hashemi1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nluo2024unveiling,\ntitle={Unveiling the Cycloid Trajectory of {EM} Iterations in Mixed Linear Regression},\nauthor={Zhankun Luo and Abolfazl Hashemi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Yn8xnK90mS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3424521, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2RnNOfT0btIJ:scholar.google.com/&scioq=Unveiling+the+Cycloid+Trajectory+of+EM+Iterations+in+Mixed+Linear+Regression&hl=en&as_sdt=0,33", "gs_version_total": 9, "email": "purdue.edu;purdue.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Unsupervised Evaluation of Code LLMs with Round-Trip Correctness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33761", "id": "YnFuUX08CE", "proceeding": "https://proceedings.mlr.press/v235/allamanis24a.html", "pdf": "https://openreview.net/pdf?id=YnFuUX08CE", "openreview": "https://openreview.net/forum?id=YnFuUX08CE", "author_site": "Miltiadis Allamanis, Sheena Panthaplackel, Pengcheng Yin", "tldr": "", "abstract": "To evaluate code large language models (LLMs), research has relied on a few small manually curated benchmarks, such as HumanEval and MBPP, which represent a narrow part of the real-world software domains. 
In this work, we introduce round-trip correctness (RTC) as an alternative evaluation method. RTC allows Code LLM evaluation on a broader spectrum of real-world software domains without the need for costly human curation. RTC rests on the idea that we can ask a model to make a prediction (e.g., describe some code using natural language), feed that prediction back (e.g., synthesize code from the predicted description), and check if this round-trip leads to code that is semantically equivalent to the original input. We show how to employ RTC to evaluate code synthesis and editing. We find that RTC strongly correlates with model performance on existing narrow-domain code synthesis benchmarks while allowing us to expand to a much broader set of domains and tasks, which was not previously possible without costly human annotations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Miltiadis Allamanis;Sheena Panthaplackel;Pengcheng Yin", "authorids": "~Miltiadis_Allamanis1;~Sheena_Panthaplackel1;~Pengcheng_Yin1", "gender": ";F;M", "homepage": ";;https://pengcheng.in", "dblp": ";255/5631;130/7385", "google_scholar": ";bxAIuu4AAAAJ;t5lVb6sAAAAJ", "orcid": ";;", "linkedin": ";;pchyin/", "or_profile": "~Miltiadis_Allamanis1;~Sheena_Panthaplackel1;~Pengcheng_Yin1", "aff": ";Google;Google", "aff_domain": ";google.com;google.com", "position": ";Researcher;Researcher", "bibtex": "@inproceedings{\nallamanis2024unsupervised,\ntitle={Unsupervised Evaluation of Code {LLM}s with Round-Trip Correctness},\nauthor={Miltiadis Allamanis and Sheena Panthaplackel and Pengcheng Yin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YnFuUX08CE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 431604, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17221843062501705399&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";google.com;google.com", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Measures of diversity and space-filling designs for categorical data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33760", "id": "YoUb2vW9WP", "proceeding": "https://proceedings.mlr.press/v235/malherbe24a.html", "pdf": "https://openreview.net/pdf?id=YoUb2vW9WP", "openreview": "https://openreview.net/forum?id=YoUb2vW9WP", "author_site": "Cedric Malherbe, Emilio Dom\u00ednguez-S\u00e1nchez, Merwan Barlier, Igor Colin, Haitham Bou Ammar, Tom Diethe", "tldr": "", "abstract": "Selecting a small subset of items that represent the diversity of a larger population lies at the heart of many data analysis and machine learning applications. However, when it comes to items described by discrete features, the lack of natural ordering and the combinatorial nature of the search space pose significant challenges to the current selection techniques and make existing methods ill-suited. In this paper, we take a step in that direction by proposing novel methods to select subsets of diverse categorical data based on the advances in combinatorial optimization.
First, we cast the subset selection problem as the optimization of three diversity metrics. We then provide novel bounds for this problem and present exact solvers that unfortunately come with a high computational cost. To overcome this bottleneck, we show how to employ tools from linear programming and submodular optimization, introducing two computationally tractable methods that still carry approximation guarantees on the diversity metrics. Finally, a numerical assessment is provided to illustrate the potential of the designs with respect to state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cedric Malherbe;Emilio Dom\u00ednguez-S\u00e1nchez;Merwan Barlier;Igor Colin;Haitham Bou Ammar;Tom Diethe", "authorids": "~Cedric_Malherbe1;~Emilio_Dom\u00ednguez-S\u00e1nchez1;~Merwan_Barlier1;~Igor_Colin1;~Haitham_Bou_Ammar1;~Tom_Diethe1", "gender": "M;;;M;M;M", "homepage": "https://es.linkedin.com/in/cedmalherbe;;;https://igorcolin.github.io/;;http://www.tomdiethe.com", "dblp": "182/8945;;185/0155;157/8205;;33/1098", "google_scholar": ";;;;https://scholar.google.co.uk/citations?user=AE5suDoAAAAJ;https://scholar.google.co.uk/citations?user=oWGk9c8AAAAJ", "orcid": ";0000-0002-4593-657X;;;;0000-0002-0776-5407", "linkedin": ";;;;;tomdiethe/", "or_profile": "~Cedric_Malherbe1;~Emilio_Dom\u00ednguez-S\u00e1nchez1;~Merwan_Barlier1;~Igor_Colin1;~Haitham_Bou_Ammar1;~Tom_Diethe1", "aff": "Huawei Technologies Ltd.;;Huawei Technologies Ltd.;T\u00e9l\u00e9com Paris;Huawei R&D UK;AstraZeneca", "aff_domain": "huawei.com;;huawei.com;telecom-paris.fr;huawei.com;astrazeneca.com", "position": "Researcher;;Researcher;Researcher;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\nmalherbe2024measures,\ntitle={Measures of diversity and space-filling designs for categorical data},\nauthor={Cedric Malherbe and Emilio Dom{\\'\\i}nguez-S{\\'a}nchez and Merwan Barlier and Igor Colin and Haitham Bou Ammar and Tom Diethe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YoUb2vW9WP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1001967, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hspAmVbyoAcJ:scholar.google.com/&scioq=Measures+of+diversity+and+space-filling+designs+for+categorical+data&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "huawei.com;;huawei.com;telecom-paris.fr;huawei.com;astrazeneca.com", "author_num": 6, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Huawei;T\u00e9l\u00e9com Paris;AstraZeneca", "aff_unique_dep": "Huawei Technologies;;", "aff_unique_url": "https://www.huawei.com;https://www.telecom-paris.fr;https://www.astrazeneca.com", "aff_unique_abbr": "Huawei;T\u00e9l\u00e9com Paris;AZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;2", "aff_country_unique": "China;France;United Kingdom" }, { "title": "Beyond the Norms: Detecting Prediction Errors in Regression Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33759", "id": "YqIIhl2ToH", "proceeding": "https://proceedings.mlr.press/v235/altieri24a.html", "pdf": "https://openreview.net/pdf?id=YqIIhl2ToH", "openreview": "https://openreview.net/forum?id=YqIIhl2ToH", "author_site": "Andres Altieri, Marco Romanelli, Georg Pichler, Florence Alberge, Pablo
Piantanida", "tldr": "", "abstract": "This paper tackles the challenge of detecting unreliable behavior in regression algorithms, which may arise from intrinsic variability (e.g., aleatoric uncertainty) or modeling errors (e.g., model uncertainty). First, we formally introduce the notion of unreliability in regression, i.e., when the output of the regressor exceeds a specified discrepancy (or error). Then, using powerful tools for probabilistic modeling, we estimate the discrepancy density, and we measure its statistical diversity using our proposed metric for statistical dissimilarity. In turn, this allows us to derive a data-driven score that expresses the uncertainty of the regression outcome. We show empirical improvements in error detection for multiple regression tasks, consistently outperforming popular baseline approaches, and contributing to the broader field of uncertainty quantification and safe machine learning systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andres Altieri;Marco Romanelli;Georg Pichler;Florence Alberge;Pablo Piantanida", "authorids": "~Andres_Altieri1;~Marco_Romanelli1;~Georg_Pichler1;~Florence_Alberge1;~Pablo_Piantanida2", "gender": "M;;M;F;M", "homepage": "https://l2s.centralesupelec.fr/en/u/altieri-andres/;;;https://l2s.centralesupelec.fr/u/alberge-florence/;https://www.pablo-piantanida.org", "dblp": ";;155/0692.html;;44/1416", "google_scholar": "TQWIH9oAAAAJ;;;8CcRfB8AAAAJ;https://scholar.google.fr/citations?user=QyBEFv0AAAAJ", "orcid": "0000-0001-9346-6704;;0000-0001-5696-4472;;", "linkedin": ";;;;pablo-piantanida-60a51bb5/?locale=en_US", "or_profile": "~Andres_Altieri1;~Marco_Romanelli1;~Georg_Pichler1;~Florence_Alberge1;~Pablo_Piantanida2", "aff": "Consejo Nacional de Investigaciones Cient\u00edficas y T\u00e9cnicas;;TU Wien Vienna University of Technology;Universite Paris-Saclay;Mila - Quebec AI Institute ", "aff_domain": "conicet.gov.ar;;tuwien.ac.at;univ-paris-saclay.fr;mila.quebec", "position": "Researcher;;Postdoc;Full Professor;Full Professor", "bibtex": "@inproceedings{\naltieri2024beyond,\ntitle={Beyond the Norms: Detecting Prediction Errors in Regression Models},\nauthor={Andres Altieri and Marco Romanelli and Georg Pichler and Florence Alberge and Pablo Piantanida},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YqIIhl2ToH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1098834, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z1ZQcuThfpQJ:scholar.google.com/&scioq=Beyond+the+Norms:+Detecting+Prediction+Errors+in+Regression+Models&hl=en&as_sdt=0,5", "gs_version_total": 20, "email": "conicet.gov.ar;;tuwien.ac.at;univ-paris-saclay.fr;mila.quebec", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Consejo Nacional de Investigaciones Cient\u00edficas y T\u00e9cnicas;Vienna University of Technology;Universit\u00e9 Paris-Saclay;Quebec AI Institute", "aff_unique_dep": ";;;AI Institute", "aff_unique_url": "https://www.conicet.gov.ar;https://www.tuwien.ac.at;https://www.universite-paris-saclay.fr;https://mila.quebec", "aff_unique_abbr": "CONICET;TU Wien;UPSaclay;Mila", "aff_campus_unique_index": "1", "aff_campus_unique": ";Vienna", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Argentina;Austria;France;Canada" }, { "title": "Principled Preferential Bayesian Optimization", "status": "Oral", 
"track": "main", "site": "https://icml.cc/virtual/2024/poster/33758", "id": "YqMOM5W9GF", "proceeding": "https://proceedings.mlr.press/v235/xu24y.html", "pdf": "https://openreview.net/pdf?id=YqMOM5W9GF", "openreview": "https://openreview.net/forum?id=YqMOM5W9GF", "author_site": "Wenjie Xu, Wenbin Wang, Yuning Jiang, Bratislav Svetozarevic, Colin Jones", "tldr": "", "abstract": "We study the problem of preferential Bayesian optimization (BO), where we aim to optimize a black-box function with only preference feedback over a pair of candidate solutions. Inspired by the likelihood ratio idea, we construct a confidence set of the black-box function using only the preference feedback. An optimistic algorithm with an efficient computational method is then developed to solve the problem, which enjoys an information-theoretic bound on the total cumulative regret, a first-of-its-kind for preferential BO. This bound further allows us to design a scheme to report an estimated best solution, with a guaranteed convergence rate. Experimental results on sampled instances from Gaussian processes, standard test functions, and a thermal comfort optimization problem all show that our method stably achieves better or competitive performance as compared to the existing state-of-the-art heuristics, which, however, do not have theoretical guarantees on regret bounds or convergence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenjie Xu;Wenbin Wang;Yuning Jiang;Bratislav Svetozarevic;Colin Jones", "authorids": "~Wenjie_Xu3;~Wenbin_Wang6;~Yuning_Jiang5;~Bratislav_Svetozarevic1;~Colin_Jones1", "gender": "M;;M;M;M", "homepage": "https://jackiexuw.github.io/;;;;http://la.epfl.ch", "dblp": "25/1820.html;;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;8rcqZtoAAAAJ;e9LqzHAAAAAJ;SulkJJQAAAAJ", "orcid": ";;;;0000-0001-7239-4799", "linkedin": ";;;;", "or_profile": "~Wenjie_Xu3;~Wenbin_Wang6;~Yuning_Jiang5;~Bratislav_Svetozarevic1;~Colin_Jones1", "aff": "EPFL - EPF Lausanne;;EPFL - EPF Lausanne;;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;;epfl.ch;;epfl.ch", "position": "PhD student;;Postdoc;;Associate Professor", "bibtex": "@inproceedings{\nxu2024principled,\ntitle={Principled Preferential Bayesian Optimization},\nauthor={Wenjie Xu and Wenbin Wang and Yuning Jiang and Bratislav Svetozarevic and Colin Jones},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YqMOM5W9GF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 861842, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17426315661873334620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "epfl.ch;;epfl.ch;;epfl.ch", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "On the Generalization of Equivariant Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33757", "id": "Yqj3DzIC79", "proceeding": "https://proceedings.mlr.press/v235/karczewski24a.html", "pdf": "https://openreview.net/pdf?id=Yqj3DzIC79", "openreview": "https://openreview.net/forum?id=Yqj3DzIC79", "author_site": "Rafa\u0142 Karczewski, 
Amauri Souza, Vikas Garg", "tldr": "", "abstract": "$E(n)$-Equivariant Graph Neural Networks (EGNNs) are among the most widely used and successful models for representation learning on geometric graphs (e.g., 3D molecules). However, while the expressivity of EGNNs has been explored in terms of geometric variants of the Weisfeiler-Leman isomorphism test, characterizing their generalization capability remains open. In this work, we establish the first generalization bound for EGNNs. Our bound depicts a dependence on the weighted sum of logarithms of the spectral norms of the weight matrices (EGNN parameters). In addition, our main result reveals interesting novel insights: $i$) the spectral norms of the initial layers may impact generalization more than the final ones; $ii$) $\\varepsilon$-normalization is beneficial to generalization --- confirming prior empirical evidence. We leverage these insights to introduce a spectral norm regularizer tailored to EGNNs. Experiments on real-world datasets substantiate our analysis, demonstrating a high correlation between theoretical and empirical generalization gaps and the effectiveness of the proposed regularization scheme.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rafal Karczewski;Amauri H Souza;Vikas Garg", "authorids": "~Rafal_Karczewski1;~Amauri_H_Souza1;~Vikas_Garg2", "gender": ";M;", "homepage": ";http://www.amauriholanda.org;", "dblp": "228/6790;131/3352;", "google_scholar": ";lP0LBI4AAAAJ;", "orcid": ";;", "linkedin": "rafal-karczewski-906ab010a;;", "or_profile": "~Rafal_Karczewski1;~Amauri_H_Souza1;~Vikas_Garg2", "aff": "Aalto University;Federal Institute of Cear\u00e1;", "aff_domain": "aalto.fi;ifce.edu.br;", "position": "PhD student;Associate Professor;", "bibtex": "@inproceedings{\nkarczewski2024on,\ntitle={On the Generalization of Equivariant Graph Neural Networks},\nauthor={Rafal Karczewski and Amauri H Souza and Vikas Garg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Yqj3DzIC79}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 946221, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8933030449143530840&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "aalto.fi;ifce.edu.br;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Aalto University;Federal Institute of Cear\u00e1", "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;http://www.ifce.edu.br", "aff_unique_abbr": "Aalto;IFCE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Finland;Brazil" }, { "title": "Sample Average Approximation for Conditional Stochastic Optimization with Dependent Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33756", "id": "YuGnRORkJm", "proceeding": "https://proceedings.mlr.press/v235/wang24bc.html", "pdf": "https://openreview.net/pdf?id=YuGnRORkJm", "openreview": "https://openreview.net/forum?id=YuGnRORkJm", "author_site": "Yafei Wang, Bo Pan, Mei Li, Jianya Lu, Lingchen Kong, Bei Jiang, Linglong Kong", "tldr": "", "abstract": "Conditional Stochastic Optimization (CSO) is a powerful modelling paradigm for optimization under uncertainty. 
The existing literature on CSO is mainly based on the assumption that the data are independent, under which the solution of CSO is asymptotically consistent and enjoys a finite sample guarantee. The independence assumption, however, does not typically hold in many important applications with dependence patterns, such as time series analysis, operational control, and reinforcement learning. In this paper, we aim to fill this gap and consider a Sample Average Approximation (SAA) for CSO with dependent data. Leveraging covariance inequalities and an independent block sampling technique, we provide theoretical guarantees of SAA for CSO with dependent data. In particular, we show that SAA for CSO retains asymptotic consistency and a finite sample guarantee under mild conditions. In addition, we establish the sample complexity $O(d / \\varepsilon^4)$ of SAA for CSO, which is shown to be of the same order as independent cases. Through experiments on several applications, we verify the theoretical results and demonstrate that dependence does not degrade the performance of the SAA approach in real data applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yafei Wang;Bo Pan;Mei Li;Jianya Lu;Lingchen Kong;Bei Jiang;Linglong Kong", "authorids": "~Yafei_Wang1;~Bo_Pan1;limei@amss.ac.cn;jianya.lu@essex.ac.uk;~Lingchen_Kong1;~Bei_Jiang1;~Linglong_Kong2", "gender": ";;;;M;F;M", "homepage": "https://apps.ualberta.ca/directory/person/yafei2;;;;https://faculty.bjtu.edu.cn/profiles/;https://www.ualberta.ca/~bei1;https://www.ualberta.ca/~lkong", "dblp": ";;;;;190/4697;35/8525", "google_scholar": "6zZR3_gAAAAJ;;;;;https://scholar.google.ca/citations?user=MfOZ8G0AAAAJ;https://scholar.google.ca/citations?hl=en", "orcid": ";;;;;0000-0002-0033-839X;0000-0003-3011-9216", "linkedin": ";;;;;;", "or_profile": "~Yafei_Wang1;~Bo_Pan1;limei@amss.ac.cn;jianya.lu@essex.ac.uk;~Lingchen_Kong1;~Bei_Jiang1;~Linglong_Kong2", "aff": "University of Alberta;;;;Beijing Jiaotong University;University of Alberta;University of Alberta", "aff_domain": "ualberta.ca;;;;bjtu.edu.cn;ualberta.ca;ualberta.ca", "position": "Assistant Professor;;;;Full Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024sample,\ntitle={Sample Average Approximation for Conditional Stochastic Optimization with Dependent Data},\nauthor={Yafei Wang and Bo Pan and Mei Li and Jianya Lu and Lingchen Kong and Bei Jiang and Linglong Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YuGnRORkJm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 511054, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CuCOHaCNbTgJ:scholar.google.com/&scioq=Sample+Average+Approximation+for+Conditional+Stochastic+Optimization+with+Dependent+Data&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "ualberta.ca;;;;bjtu.edu.cn;ualberta.ca;ualberta.ca", "author_num": 7, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Alberta;Beijing Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;http://www.njtu.edu.cn/en", "aff_unique_abbr": "UAlberta;BJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Canada;China" }, { "title": "CasCast: Skillful High-resolution Precipitation Nowcasting via Cascaded Modelling", "status":
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33755", "id": "YuNFJSEkTi", "proceeding": "https://proceedings.mlr.press/v235/gong24a.html", "pdf": "https://openreview.net/pdf?id=YuNFJSEkTi", "openreview": "https://openreview.net/forum?id=YuNFJSEkTi", "author_site": "JUNCHAO GONG, LEI BAI, Peng Ye, Wanghan Xu, Na Liu, Jianhua Dai, Xiaokang Yang, Wanli Ouyang", "tldr": "", "abstract": "Precipitation nowcasting based on radar data plays a crucial role in extreme weather prediction and has broad implications for disaster management. Despite progresses have been made based on deep learning, two key challenges of precipitation nowcasting are not well-solved: (i) the modeling of complex precipitation system evolutions with different scales, and (ii) accurate forecasts for extreme precipitation. In this work, we propose CasCast, a cascaded framework composed of a deterministic and a probabilistic part to decouple the predictions for mesoscale precipitation distributions and small-scale patterns. Then, we explore training the cascaded framework at the high resolution and conducting the probabilistic modeling in a low dimensional latent space with a frame-wise-guided diffusion transformer for enhancing the optimization of extreme events while reducing computational costs. Extensive experiments on three benchmark radar precipitation datasets show that CasCast achieves competitive performance. Especially, CasCast significantly surpasses the baseline (up to +91.8%) for regional extreme-precipitation nowcasting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junchao Gong;LEI BAI;Peng Ye;Wanghan Xu;Na Liu;Jianhua Dai;Xiaokang Yang;Wanli Ouyang", "authorids": "~Junchao_Gong1;~LEI_BAI1;~Peng_Ye4;~Wanghan_Xu1;~Na_Liu1;~Jianhua_Dai3;~Xiaokang_Yang1;~Wanli_Ouyang1", "gender": ";M;M;M;;M;M;", "homepage": ";http://leibai.site/;;https://scholar.google.com/citations?user=lmCL5xQAAAAJ&hl=zh-CN;https://orcid.org/0009-0005-4499-4734;;https://icne.sjtu.edu.cn/info/1064/1078.htm;", "dblp": ";119/1223-1;53/930-6;367/7191;;;06/3071-1.html;", "google_scholar": ";https://scholar.google.com.au/citations?user=sakOO04AAAAJ;UEZZP5QAAAAJ;https://scholar.google.com.hk/citations?user=lmCL5xQAAAAJ;;https://scholar.google.cz/scholar?hl=zh-CN;yDEavdMAAAAJ;", "orcid": ";0000-0003-3378-7201;0000-0002-8486-7562;;0009-0005-4499-4734;0009-0004-4570-8579;0000-0003-4029-3322;", "linkedin": ";lei-bai-641370153/;;;;;;", "or_profile": "~Junchao_Gong1;~LEI_BAI1;~Peng_Ye4;~Wanghan_Xu1;~Na_Liu1;~Jianhua_Dai3;~Xiaokang_Yang1;~Wanli_Ouyang1", "aff": ";Shanghai AI Laboratory;Fudan University;Xi'an Jiaotong University;National Meteorological Information Center;;Shanghai Jiaotong University;", "aff_domain": ";pjlab.org.cn;fudan.edu.cn;xjtu.edu.cn;cma.gov.cn;;sjtu.edu.cn;", "position": ";Researcher;PhD student;Undergrad student;Researcher;;Full Professor;", "bibtex": "@inproceedings{\ngong2024cascast,\ntitle={CasCast: Skillful High-resolution Precipitation Nowcasting via Cascaded Modelling},\nauthor={Junchao Gong and LEI BAI and Peng Ye and Wanghan Xu and Na Liu and Jianhua Dai and Xiaokang Yang and Wanli Ouyang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YuNFJSEkTi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9612001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 20, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=7545743046405868546&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";pjlab.org.cn;fudan.edu.cn;xjtu.edu.cn;cma.gov.cn;;sjtu.edu.cn;", "author_num": 8, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Shanghai AI Laboratory;Fudan University;Xi'an Jiao Tong University;National Meteorological Information Center;Shanghai Jiao Tong University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.fudan.edu.cn;https://www.xjtu.edu.cn;http://www.nmic.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "SAIL;Fudan;XJTU;NMIC;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Model-Free Robust $\\phi$-Divergence Reinforcement Learning Using Both Offline and Online Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33754", "id": "Yug1IEkvcb", "proceeding": "https://proceedings.mlr.press/v235/panaganti24a.html", "pdf": "https://openreview.net/pdf?id=Yug1IEkvcb", "openreview": "https://openreview.net/forum?id=Yug1IEkvcb", "author_site": "Kishan Panaganti, Adam Wierman, Eric Mazumdar", "tldr": "", "abstract": "The robust $\\phi$-regularized Markov Decision Process (RRMDP) framework focuses on designing control policies that are robust against parameter uncertainties due to mismatches between the simulator (nominal) model and real-world settings. This work makes *two* important contributions. First, we propose a *model-free* algorithm called *Robust $\\phi$-regularized fitted Q-iteration* for learning an $\\epsilon$-optimal robust policy that uses only the historical data collected by rolling out a behavior policy (with *robust exploratory* requirement) on the nominal model. To the best of our knowledge, we provide the *first* unified analysis for a class of $\\phi$-divergences achieving robust optimal policies in high-dimensional systems of arbitrary large state space with general function approximation. Second, we introduce the *hybrid robust $\\phi$-regularized reinforcement learning* framework to learn an optimal robust policy using both historical data and online sampling. Towards this framework, we propose a model-free algorithm called *Hybrid robust Total-variation-regularized Q-iteration*. 
To the best of our knowledge, we provide the *first* improved out-of-data-distribution assumption in large-scale problems of arbitrarily large state space with general function approximation under the hybrid robust $\\phi$-regularized reinforcement learning framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kishan Panaganti;Adam Wierman;Eric Mazumdar", "authorids": "~Kishan_Panaganti1;~Adam_Wierman1;~Eric_Mazumdar1", "gender": "M;M;M", "homepage": "https://sites.google.com/a/tamu.edu/kpb;https://adamwierman.com/;http://people.eecs.berkeley.edu/~emazumdar/", "dblp": "260/0365;56/4447;177/9322", "google_scholar": "yTCoJdsAAAAJ;4OvOdSgAAAAJ;FZOxxvcAAAAJ", "orcid": ";0000-0002-5923-0199;", "linkedin": ";adam-wierman-a529474/;", "or_profile": "~Kishan_Panaganti1;~Adam_Wierman1;~Eric_Mazumdar1", "aff": "California Institute of Technology;California Institute of Technology;Department of Computing + Mathematical Sciences, California Institute of Technology", "aff_domain": "caltech.edu;caltech.edu;cms.caltech.edu", "position": "Postdoc;Professor;Assistant Professor", "bibtex": "@inproceedings{\npanaganti2024modelfree,\ntitle={Model-Free Robust \\${\\textbackslash}phi\\$-Divergence Reinforcement Learning Using Both Offline and Online Data},\nauthor={Kishan Panaganti and Adam Wierman and Eric Mazumdar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Yug1IEkvcb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 631660, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8669997367813229781&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "caltech.edu;caltech.edu;cms.caltech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "California Institute of Technology;", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;", "aff_unique_abbr": "Caltech;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "AND: Audio Network Dissection for Interpreting Deep Acoustic Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33753", "id": "YvAyOYeGlo", "proceeding": "https://proceedings.mlr.press/v235/wu24q.html", "pdf": "https://openreview.net/pdf?id=YvAyOYeGlo", "openreview": "https://openreview.net/forum?id=YvAyOYeGlo", "author_site": "Tung-Yu Wu, Yu-Xiang Lin, Lily Weng", "tldr": "", "abstract": "Neuron-level interpretations aim to explain network behaviors and properties by investigating neurons responsive to specific perceptual or structural input patterns. Although there is emerging work in the vision and language domains, none is explored for acoustic models. To bridge the gap, we introduce *AND*, the first **A**udio **N**etwork **D**issection framework that automatically establishes natural language explanations of acoustic neurons based on highly responsive audio. *AND* features the use of LLMs to summarize mutual acoustic features and identities among audio. Extensive experiments are conducted to verify *AND*'s precise and informative descriptions. In addition, we highlight two acoustic model behaviors with analysis by *AND*. First, models discriminate audio with a combination of basic acoustic features rather than high-level abstract concepts. Second, training strategies affect neuron behaviors.
Supervised training guides neurons to gradually narrow their attention, while self-supervised learning encourages neurons to be polysemantic for exploring high-level features. Finally, we demonstrate a potential use of *AND* in audio model unlearning by conducting concept-specific pruning based on the descriptions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tung-Yu Wu;Yu-Xiang Lin;Tsui-Wei Weng", "authorids": "~Tung-Yu_Wu1;~Yu-Xiang_Lin1;~Tsui-Wei_Weng1", "gender": "M;M;F", "homepage": "https://github.com/tony10101105;https://www.linkedin.com/in/yu-xiang-lin-1b8075201/;https://lilywenglab.github.io", "dblp": "96/9290;;177/9197", "google_scholar": "Zx1YxFgAAAAJ;https://scholar.google.com.tw/citations?user=cZZ5vD8AAAAJ;v8GM4xoAAAAJ", "orcid": ";;", "linkedin": ";yu-xiang-lin-1b8075201/;", "or_profile": "~Tung-Yu_Wu1;~Yu-Xiang_Lin1;~Tsui-Wei_Weng1", "aff": "National Taiwan University;National Taiwan University;University of California, San Diego", "aff_domain": "ntu.edu.tw;ntu.edu.tw;ucsd.edu", "position": "Undergrad student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nwu2024and,\ntitle={{AND}: Audio Network Dissection for Interpreting Deep Acoustic Models},\nauthor={Tung-Yu Wu and Yu-Xiang Lin and Tsui-Wei Weng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YvAyOYeGlo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5134633, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10154255469006902711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ntu.edu.tw;ntu.edu.tw;ucsd.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "National Taiwan University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.tw;https://www.ucsd.edu", "aff_unique_abbr": "NTU;UCSD", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Taiwan;San Diego", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "Non-stationary Online Convex Optimization with Arbitrary Delays", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33752", "id": "YvPNwLedpQ", "proceeding": "https://proceedings.mlr.press/v235/wan24h.html", "pdf": "https://openreview.net/pdf?id=YvPNwLedpQ", "openreview": "https://openreview.net/forum?id=YvPNwLedpQ", "author_site": "Yuanyu Wan, Chang Yao, Mingli Song, Lijun Zhang", "tldr": "", "abstract": "Online convex optimization (OCO) with arbitrary delays, in which gradients or other information of functions could be arbitrarily delayed, has received increasing attention recently. Different from previous studies that focus on stationary environments, this paper investigates the delayed OCO in non-stationary environments, and aims to minimize the dynamic regret with respect to any sequence of comparators. To this end, we first propose a simple algorithm, namely DOGD, which performs a gradient descent step for each delayed gradient according to their arrival order. Despite its simplicity, our novel analysis shows that the dynamic regret of DOGD can be automatically bounded by $O(\\sqrt{\\bar{d}T}(P_T+1))$ under mild assumptions, and $O(\\sqrt{dT}(P_T+1))$ in the worst case, where $\\bar{d}$ and $d$ denote the average and maximum delay respectively, $T$ is the time horizon, and $P_T$ is the path-length of comparators. 
Furthermore, we develop an improved algorithm, which reduces those dynamic regret bounds achieved by DOGD to $O(\\sqrt{\\bar{d}T(P_T+1)})$ and $O(\\sqrt{dT(P_T+1)})$, respectively. The key idea is to run multiple DOGD with different learning rates, and utilize a meta-algorithm to track the best one based on their delayed performance. Finally, we demonstrate that our improved algorithm is optimal in the worst case by deriving a matching lower bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuanyu Wan;Chang Yao;Mingli Song;Lijun Zhang", "authorids": "~Yuanyu_Wan1;~Chang_Yao2;~Mingli_Song1;~Lijun_Zhang1", "gender": "M;;M;", "homepage": "https://yuanyuwan.github.io/;;https://person.zju.edu.cn/msong;", "dblp": "221/3499;;71/5333;", "google_scholar": "CEymMc8AAAAJ;;7oLbhAwAAAAJ;", "orcid": ";;0000-0003-2621-6048;", "linkedin": ";;;", "or_profile": "~Yuanyu_Wan1;~Chang_Yao2;~Mingli_Song1;~Lijun_Zhang1", "aff": "Zhejiang University;;Zhejiang University;", "aff_domain": "zju.edu.cn;;zju.edu.cn;", "position": "Researcher;;Full Professor;", "bibtex": "@inproceedings{\nwan2024nonstationary,\ntitle={Non-stationary Online Convex Optimization with Arbitrary Delays},\nauthor={Yuanyu Wan and Chang Yao and Mingli Song and Lijun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YvPNwLedpQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 365039, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10702077732759636816&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "zju.edu.cn;;zju.edu.cn;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Transolver: A Fast Transformer Solver for PDEs on General Geometries", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33751", "id": "Ywl6pODXjB", "proceeding": "https://proceedings.mlr.press/v235/wu24r.html", "pdf": "https://openreview.net/pdf?id=Ywl6pODXjB", "openreview": "https://openreview.net/forum?id=Ywl6pODXjB", "author_site": "Haixu Wu, Huakun Luo, Haowen Wang, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Transformers have empowered many milestones across various fields and have recently been applied to solve partial differential equations (PDEs). However, since PDEs are typically discretized into large-scale meshes with complex geometries, it is challenging for Transformers to capture intricate physical correlations directly from massive individual points. Going beyond superficial and unwieldy meshes, we present Transolver based on a more foundational idea, which is learning intrinsic physical states hidden behind discretized geometries. Specifically, we propose a new Physics-Attention to adaptively split the discretized domain into a series of learnable slices of flexible shapes, where mesh points under similar physical states will be ascribed to the same slice. 
By calculating attention over physics-aware tokens encoded from slices, Transolver can effectively capture intricate physical correlations under complex geometries, which also endows the solver with intrinsic geometry-general modeling capacity and can be computed efficiently in linear complexity. Transolver achieves consistent state-of-the-art performance, with a 22% relative gain across six standard benchmarks, and also excels in large-scale industrial simulations, including car and airfoil designs. Code is available at https://github.com/thuml/Transolver.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haixu Wu;Huakun Luo;Haowen Wang;Jianmin Wang;Mingsheng Long", "authorids": "~Haixu_Wu1;~Huakun_Luo1;~Haowen_Wang4;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;F;M;M;M", "homepage": ";https://whw-alex.github.io/;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;https://luohk19.github.io;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": "286/8115;;06/3456-1.html;338/9930;74/9023", "google_scholar": "oLL_x0wAAAAJ;;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;;_MjXpXkAAAAJ", "orcid": ";;0000-0001-6841-7943;;0000-0002-5412-9120", "linkedin": ";;;;", "or_profile": "~Haixu_Wu1;~Haowen_Wang4;~Jianmin_Wang1;~\u534e\u5764_\u7f571;~Mingsheng_Long2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Undergrad student;Full Professor;Undergrad student;Associate Professor", "bibtex": "@inproceedings{\nwu2024transolver,\ntitle={Transolver: A Fast Transformer Solver for {PDE}s on General Geometries},\nauthor={Haixu Wu and Huakun Luo and Haowen Wang and Jianmin Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ywl6pODXjB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9754803, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7399342504323818414&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Neural Tangent Kernels for Axis-Aligned Tree Ensembles", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33750", "id": "YxmcEfcgp3", "proceeding": "https://proceedings.mlr.press/v235/kanoh24a.html", "pdf": "https://openreview.net/pdf?id=YxmcEfcgp3", "openreview": "https://openreview.net/forum?id=YxmcEfcgp3", "author_site": "Ryuichi Kanoh, Mahito Sugiyama", "tldr": "", "abstract": "While axis-aligned rules are known to induce an important inductive bias in machine learning models such as typical hard decision tree ensembles, theoretical understanding of the learning behavior remains largely unexplored due to the discrete nature of rules.
To address this issue, we impose the axis-aligned constraint on soft trees, which relax the splitting process of decision trees and are trained using a gradient method, and present their Neural Tangent Kernel (NTK), which enables us to analytically describe the training behavior. We study two cases: imposing the axis-aligned constraint throughout the entire training process, and imposing it only at the initial state. Moreover, we extend the NTK framework to handle various tree architectures simultaneously, and prove that any axis-aligned non-oblivious tree ensemble can be transformed into axis-aligned oblivious tree ensembles with the same NTK. One can search for a suitable tree architecture via Multiple Kernel Learning (MKL), and our numerical experiments show a variety of suitable features depending on the type of constraint. Our NTK analysis highlights both the theoretical and practical impacts of the axis-aligned constraint in tree ensemble learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ryuichi Kanoh;Mahito Sugiyama", "authorids": "~Ryuichi_Kanoh1;~Mahito_Sugiyama1", "gender": "M;M", "homepage": ";https://mahito.nii.ac.jp/", "dblp": "287/4416;05/8421", "google_scholar": ";qLlRvTkAAAAJ", "orcid": ";0000-0001-5907-9831", "linkedin": "ryuichi-kanoh-43ab4316b/;", "or_profile": "~Ryuichi_Kanoh1;~Mahito_Sugiyama1", "aff": "NII, the Graduate University for Advanced Studies;National Institute of Informatics", "aff_domain": "nii.ac.jp;nii.ac.jp", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nkanoh2024neural,\ntitle={Neural Tangent Kernels for Axis-Aligned Tree Ensembles},\nauthor={Ryuichi Kanoh and Mahito Sugiyama},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YxmcEfcgp3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7419415, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a8X-VfUeUuEJ:scholar.google.com/&scioq=Neural+Tangent+Kernels+for+Axis-Aligned+Tree+Ensembles&hl=en&as_sdt=0,31", "gs_version_total": 4, "email": "nii.ac.jp;nii.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "National Institute of Informatics", "aff_unique_dep": "", "aff_unique_url": "https://www.nii.ac.jp", "aff_unique_abbr": "NII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Towards Robust Model-Based Reinforcement Learning Against Adversarial Corruption", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33749", "id": "Z0S6fUdW68", "proceeding": "https://proceedings.mlr.press/v235/ye24a.html", "pdf": "https://openreview.net/pdf?id=Z0S6fUdW68", "openreview": "https://openreview.net/forum?id=Z0S6fUdW68", "author_site": "Chenlu Ye, Jiafan He, Quanquan Gu, Tong Zhang", "tldr": "", "abstract": "This study tackles the challenges of adversarial corruption in model-based reinforcement learning (RL), where the transition dynamics can be corrupted by an adversary. Existing studies on corruption-robust RL mostly focus on the setting of model-free RL, where robust least-squares regression is often employed for value function estimation. However, these techniques cannot be directly applied to model-based RL. In this paper, we focus on model-based RL and take the maximum likelihood estimation (MLE) approach to learn the transition model.
Our work encompasses both online and offline settings. In the online setting, we introduce an algorithm called corruption-robust optimistic MLE (CR-OMLE), which leverages total-variation (TV)-based information ratios as uncertainty weights for MLE. We prove that CR-OMLE achieves a regret of $\\tilde{\\mathcal{O}}(\\sqrt{T} + C)$, where $C$ denotes the cumulative corruption level after $T$ episodes. We also prove a lower bound to show that the additive dependence on $C$ is optimal. We extend our weighting technique to the offline setting, and propose an algorithm named corruption-robust pessimistic MLE (CR-PMLE). Under a uniform coverage condition, CR-PMLE exhibits suboptimality worsened by $\\mathcal{O}(C/n)$, nearly matching the lower bound. To the best of our knowledge, this is the first work on corruption-robust model-based RL algorithms with provable guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenlu Ye;Jiafan He;Quanquan Gu;Tong Zhang", "authorids": "~Chenlu_Ye1;~Jiafan_He1;~Quanquan_Gu1;~Tong_Zhang2", "gender": "F;M;M;M", "homepage": "https://chenluye99.github.io/;https://sites.google.com/g.ucla.edu/jiafan-he-homepage;http://web.cs.ucla.edu/~qgu/;http://tongzhang-ml.org", "dblp": "336/2092;214/5785;50/4597;07/4227-1", "google_scholar": "c8yK5XsAAAAJ;F3AXNBwAAAAJ;GU9HgNAAAAAJ;LurWtuYAAAAJ", "orcid": ";;;0000-0002-5511-2558", "linkedin": "https://www.linkedin.cn/incareer/in/chenlu-ye-9b015b184;;;", "or_profile": "~Chenlu_Ye1;~Jiafan_He1;~Quanquan_Gu1;~Tong_Zhang2", "aff": "University of Illinois, Urbana Champaign;University of California, Los Angeles;University of California, Los Angeles;UIUC", "aff_domain": "illinois.edu;ucla.edu;cs.ucla.edu;illinois.edu", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nye2024towards,\ntitle={Towards Robust Model-Based Reinforcement Learning Against Adversarial Corruption},\nauthor={Chenlu Ye and Jiafan He and Quanquan Gu and Tong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Z0S6fUdW68}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 530618, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14179713279668930450&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "illinois.edu;ucla.edu;cs.ucla.edu;illinois.edu", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.ucla.edu", "aff_unique_abbr": "UIUC;UCLA", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Urbana-Champaign;Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Reward for Robot Skills Using Large Language Models via Self-Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33748", "id": "Z19JQ6WFtJ", "proceeding": "https://proceedings.mlr.press/v235/zeng24d.html", "pdf": "https://openreview.net/pdf?id=Z19JQ6WFtJ", "openreview": "https://openreview.net/forum?id=Z19JQ6WFtJ", "author_site": "Yuwei Zeng, Yao Mu, Lin Shao", "tldr": "", "abstract": "Learning reward functions remains the bottleneck to equip a robot with a broad repertoire of skills. 
Large Language Models (LLMs) contain valuable task-related knowledge that can potentially aid in the learning of reward functions. However, the proposed reward function can be imprecise and thus ineffective, requiring further grounding with environment information. We propose a method to learn rewards more efficiently in the absence of humans. Our approach consists of two components: we first use the LLM to propose features and a parameterization of the reward, then update the parameters through an iterative self-alignment process. In particular, the process minimizes the ranking inconsistency between the LLM and the learnt reward functions based on execution feedback. The method was validated on 9 tasks across 2 simulation environments. It demonstrates a consistent improvement in training efficacy and efficiency, while consuming significantly fewer GPT tokens than the alternative mutation-based method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuwei Zeng;Yao Mu;Lin Shao", "authorids": "~Yuwei_Zeng1;~Yao_Mu1;~Lin_Shao2", "gender": ";M;M", "homepage": "https://friolero.github.io/;https://yaomarkmu.github.io/;https://linsats.github.io/", "dblp": ";260/0674;26/8546-2", "google_scholar": "PqvAzW4AAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuwei_Zeng1;~Yao_Mu1;~Lin_Shao2", "aff": "National University of Singapore;The University of Hong Kong;National University of Singapore", "aff_domain": "comp.nus.edu.sg;hku.hk;nus.edu.sg", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzeng2024learning,\ntitle={Learning Reward for Robot Skills Using Large Language Models via Self-Alignment},\nauthor={Yuwei Zeng and Yao Mu and Lin Shao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Z19JQ6WFtJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4164006, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4913879226335885002&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "comp.nus.edu.sg;hku.hk;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.hku.hk", "aff_unique_abbr": "NUS;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;China" }, { "title": "How Universal Polynomial Bases Enhance Spectral Graph Neural Networks: Heterophily, Over-smoothing, and Over-squashing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33747", "id": "Z2LH6Va7L2", "proceeding": "https://proceedings.mlr.press/v235/huang24z.html", "pdf": "https://openreview.net/pdf?id=Z2LH6Va7L2", "openreview": "https://openreview.net/forum?id=Z2LH6Va7L2", "author_site": "Keke Huang, Yu Guang Wang, Ming Li, Pietro Li\u00f3", "tldr": "", "abstract": "Spectral Graph Neural Networks (GNNs), alternatively known as *graph filters*, have gained increasing prevalence for heterophily graphs. Optimal graph filters rely on Laplacian eigendecomposition for Fourier transform. In an attempt to avert prohibitive computations, numerous polynomial filters have been proposed.
However, polynomials in the majority of these filters are *predefined* and remain *fixed* across different graphs, failing to accommodate the varying degrees of heterophily. Addressing this gap, we demystify the intrinsic correlation between the spectral property of desired polynomial bases and the heterophily degrees via thorough theoretical analyses. Subsequently, we develop a novel adaptive heterophily basis wherein the basis vectors mutually form angles reflecting the heterophily degree of the graph. We integrate this heterophily basis with the homophily basis to construct a universal polynomial basis, *UniBasis*, upon which we devise a polynomial-filter-based graph neural network, *UniFilter*. It optimizes convolution and propagation in GNNs, effectively limiting over-smoothing and alleviating over-squashing. Our extensive experiments, conducted on datasets with a diverse range of heterophily, support not only the universality of UniBasis but also its proficiency in graph explanation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Keke Huang;Yu Guang Wang;Ming Li;Pietro Lio", "authorids": "~Keke_Huang1;~Yu_Guang_Wang1;~Ming_Li15;~Pietro_Lio1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/kekehuang/;https://yuguangwang.github.io/;https://mingli-ai.github.io;https://www.cst.cam.ac.uk/people/pl219", "dblp": ";03/10023-1;181/2821-65;l/PietroLio.html", "google_scholar": "https://scholar.google.com.sg/citations?user=OsceCbcAAAAJ;cMSEByAAAAAJ;Z7yEoOQAAAAJ;https://scholar.google.co.uk/citations?user=3YrWf7EAAAAJ", "orcid": "0000-0003-2190-7114;;0000-0002-1218-2804;0000-0002-0540-5053", "linkedin": "keke-huang-4594b9135/?originalSubdomain=sg;;;", "or_profile": "~Keke_Huang1;~Yu_Guang_Wang1;~Ming_Li15;~Pietro_Lio1", "aff": "National University of Singapore;Shanghai Jiaotong University;Zhejiang Normal University;University of Cambridge", "aff_domain": "nus.edu.sg;sjtu.edu.cn;zjnu.edu.cn;cam.ac.uk", "position": "Postdoc;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024how,\ntitle={How Universal Polynomial Bases Enhance Spectral Graph Neural Networks: Heterophily, Over-smoothing, and Over-squashing},\nauthor={Keke Huang and Yu Guang Wang and Ming Li and Pietro Lio},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Z2LH6Va7L2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 599853, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14925208110001908821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "nus.edu.sg;sjtu.edu.cn;zjnu.edu.cn;cam.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "National University of Singapore;Shanghai Jiao Tong University;Zhejiang Normal University;University of Cambridge", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.sjtu.edu.cn;http://www.zjnu.edu.cn;https://www.cam.ac.uk", "aff_unique_abbr": "NUS;SJTU;ZJNU;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Singapore;China;United Kingdom" }, { "title": "Identification and Estimation for Nonignorable Missing Data: A Data Fusion Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33746", "id": "Z7MzVDFWDV", "proceeding":
"https://proceedings.mlr.press/v235/wang24t.html", "pdf": "https://openreview.net/pdf?id=Z7MzVDFWDV", "openreview": "https://openreview.net/forum?id=Z7MzVDFWDV", "author_site": "Zixiao Wang, AmirEmad Ghassami, Ilya Shpitser", "tldr": "", "abstract": "We consider the task of identifying and estimating a parameter of interest in settings where data is missing not at random (MNAR). In general, such parameters are not identified without strong assumptions on the missing data model. In this paper, we take an alternative approach and introduce a method inspired by data fusion, where information in the MNAR dataset is augmented by information in an auxiliary dataset subject to missingness at random (MAR). We show that even if the parameter of interest cannot be identified given either dataset alone, it can be identified given pooled data, under two complementary sets of assumptions. We derive inverse probability weighted (IPW) estimators for identified parameters under both sets of assumptions, and evaluate the performance of our estimation strategies via simulation studies, and a data application.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixiao Wang;AmirEmad Ghassami;Ilya Shpitser", "authorids": "~Zixiao_Wang6;~AmirEmad_Ghassami1;~Ilya_Shpitser1", "gender": "F;M;M", "homepage": ";https://www.aeghassami.com/;https://www.cs.jhu.edu/faculty/ilya-shpitser-3/", "dblp": ";169/2051;82/1901", "google_scholar": ";6bTxniwAAAAJ;", "orcid": ";;", "linkedin": "zixiao-w-3a79811a5/overlay/contact-info/;;", "or_profile": "~Zixiao_Wang6;~AmirEmad_Ghassami1;~Ilya_Shpitser1", "aff": "Johns Hopkins University;Boston University;Johns Hopkins University", "aff_domain": "jhu.edu;bu.edu;jhu.edu", "position": "MS student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024identification,\ntitle={Identification and Estimation for Nonignorable Missing Data: A Data Fusion Approach},\nauthor={Zixiao Wang and AmirEmad Ghassami and Ilya Shpitser},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Z7MzVDFWDV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 382371, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9435564208275146361&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "jhu.edu;bu.edu;jhu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Johns Hopkins University;Boston University", "aff_unique_dep": ";", "aff_unique_url": "https://www.jhu.edu;https://www.bu.edu", "aff_unique_abbr": "JHU;BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "NExT-Chat: An LMM for Chat, Detection and Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33745", "id": "ZAW37OZ6ig", "proceeding": "https://proceedings.mlr.press/v235/zhang24bu.html", "pdf": "https://openreview.net/pdf?id=ZAW37OZ6ig", "openreview": "https://openreview.net/forum?id=ZAW37OZ6ig", "author_site": "Ao Zhang, Yuan Yao, Wei Ji, Zhiyuan Liu, Tat-Seng Chua", "tldr": "", "abstract": "The development of large language models (LLMs) has greatly advanced the field of multimodal understanding, leading to the emergence of large multimodal models (LMMs). 
In order to enhance visual comprehension, recent studies have equipped LMMs with region-level understanding capabilities by representing object bounding box coordinates as a series of text sequences (pix2seq). In this paper, we introduce a novel paradigm for object location modeling called the pix2emb method, where we ask the LMM to output the location embeddings and then decode them with different decoders. This paradigm allows us to use different location formats (such as bounding boxes and masks) in multimodal conversations. Leveraging the proposed pix2emb method, we train an LMM named NExT-Chat and demonstrate its capability of handling multiple tasks like visual grounding, region captioning, and grounded reasoning. Comprehensive experiments show the effectiveness of our NExT-Chat on various tasks, e.g., NExT-Chat (87.7) vs. Shikra (86.9) on POPE-Random, NExT-Chat (71.3) vs. LISA (67.9) on referring expression segmentation task, and NExT-Chat (79.6) vs. Kosmos-2 (62.3) on region caption task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ao Zhang;Yuan Yao;Wei Ji;Zhiyuan Liu;Tat-Seng Chua", "authorids": "~Ao_Zhang2;~Yuan_Yao12;~Wei_Ji1;~Zhiyuan_Liu1;~Tat-Seng_Chua2", "gender": "M;M;M;M;M", "homepage": "https://waxnkw.github.io/;https://yaoyuanthu.github.io/;https://jiwei0523.github.io/;http://nlp.csai.tsinghua.edu.cn/~lzy;http://www.comp.nus.edu.sg/~chuats/", "dblp": "187/6243;;52/3220-8;53/3245-1;", "google_scholar": "0akC8h8AAAAJ;https://scholar.google.com.hk/citations?user=3NWfi3YAAAAJ;69OFB-AAAAAJ;dT0v5u0AAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ", "orcid": ";;0000-0002-8106-9768;0000-0002-7709-2543;0000-0001-6097-7807", "linkedin": ";;;;", "or_profile": "~Ao_Zhang2;~Yuan_Yao12;~Wei_Ji1;~Zhiyuan_Liu1;~Tat-seng_Chua1", "aff": "National University of Singapore;National University of Singapore;Nanjing University;Tsinghua University;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu;nju.edu.cn;tsinghua.edu.cn;nus.edu.sg", "position": "PhD student;Postdoc;Associate Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024nextchat,\ntitle={{NE}xT-Chat: An {LMM} for Chat, Detection and Segmentation},\nauthor={Ao Zhang and Yuan Yao and Wei Ji and Zhiyuan Liu and Tat-Seng Chua},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZAW37OZ6ig}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4320725, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14096374840534906106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "u.nus.edu;nus.edu;nju.edu.cn;tsinghua.edu.cn;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "National University of Singapore;Nanjing University;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.nju.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "NUS;Nanjing U;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "A Unified Adaptive Testing System Enabled by Hierarchical Structure Search", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33744", "id": "ZFRrOiZruJ", "proceeding": "https://proceedings.mlr.press/v235/yu24r.html", "pdf": 
"https://openreview.net/pdf?id=ZFRrOiZruJ", "openreview": "https://openreview.net/forum?id=ZFRrOiZruJ", "author_site": "Junhao Yu, Yan Zhuang, Zhenya Huang, Qi Liu, Xin Li, Rui Li, Enhong Chen", "tldr": "", "abstract": "Adaptive Testing System (ATS) is a promising testing mode, extensively utilized in standardized tests like the GRE. It offers personalized ability assessment by dynamically adjusting questions based on individual ability levels. Compared to traditional exams, ATS can improve the accuracy of ability estimates while simultaneously reducing the number of questions required. Despite the diverse testing formats of ATS, tailored to different adaptability requirements in various testing scenarios, there is a notable absence of a unified framework for modeling them. In this paper, we introduce a unified data-driven ATS framework that conceptualizes the various testing formats as a hierarchical test structure search problem. It can learn directly from data to solve for the optimal questions for each student, eliminating the need for manual test design. The proposed solution algorithm comes with theoretical guarantees for estimation error and convergence. Empirical results show that our framework maintains assessment accuracy while reducing question count by 20% on average and improving training stability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junhao Yu;Yan Zhuang;Zhenya Huang;Qi Liu;Xin Li;Rui LI;Enhong Chen", "authorids": "~Junhao_Yu2;~Yan_Zhuang4;~Zhenya_Huang2;~Qi_Liu3;~Xin_Li56;rli@xidian.edu.cn;~Enhong_Chen1", "gender": "M;M;M;M;M;;M", "homepage": "https://github.com/Hhhhhhand?tab=repositories;http://home.ustc.edu.cn/~zykb/;http://staff.ustc.edu.cn/~huangzhy/;http://staff.ustc.edu.cn/~qiliuql/;https://www.scopus.com/authid/detail.uri?authorId=57196399539;;http://staff.ustc.edu.cn/~cheneh", "dblp": ";;178/8690;95/2446-3;09/1365-64;;07/258", "google_scholar": ";7MX_P5cAAAAJ;dVZuU90AAAAJ;5EoHAFwAAAAJ;;;Q9h02J0AAAAJ", "orcid": ";0000-0001-7351-377X;0000-0003-1661-0420;0000-0001-6956-5550;;;0000-0002-4835-4102", "linkedin": ";;;;;;", "or_profile": "~Junhao_Yu2;~Yan_Zhuang4;~Zhenya_Huang2;~Qi_Liu3;~Xin_Li56;rli@xidian.edu.cn;~Enhong_Chen1", "aff": "University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China;;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;;ustc.edu.cn", "position": "MS student;PhD student;Associate Professor;Full Professor;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nyu2024a,\ntitle={A Unified Adaptive Testing System Enabled by Hierarchical Structure Search},\nauthor={Junhao Yu and Yan Zhuang and Zhenya Huang and Qi Liu and Xin Li and Rui LI and Enhong Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZFRrOiZruJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 611244, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13599107232776039803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;;ustc.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Science and 
Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Getting the most out of your tokenizer for pre-training and domain adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33743", "id": "ZFYBnLljtT", "proceeding": "https://proceedings.mlr.press/v235/dagan24a.html", "pdf": "https://openreview.net/pdf?id=ZFYBnLljtT", "openreview": "https://openreview.net/forum?id=ZFYBnLljtT", "author_site": "Gautier Dagan, Gabriel Synnaeve, Baptiste Roziere", "tldr": "", "abstract": "Tokenization is an understudied and often neglected component of modern LLMs. Most published works use a single tokenizer for all experiments, often borrowed from another model, without performing ablations or analysis to optimize tokenization. Moreover, the tokenizer is generally kept unchanged when fine-tuning a base model. In this paper, we show that the size, pre-tokenization regular expression, and training data of a tokenizer can significantly impact the model's generation speed, effective context size, memory usage, and downstream performance. We train specialized Byte-Pair Encoding code tokenizers, and conduct extensive ablations on the impact of tokenizer design on the performance of LLMs for code generation tasks such as HumanEval and MBPP, and provide recommendations for tokenizer hyper-parameters selection and switching the tokenizer in a pre-trained LLM. We perform our experiments on models trained from scratch and from pre-trained models, verifying their applicability to a wide range of use-cases. We find that when fine-tuning on more than 50 billion tokens, we can specialize the tokenizer of a pre-trained LLM to obtain large gains in generation speed and effective context size.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gautier Dagan;Gabriel Synnaeve;Baptiste Roziere", "authorids": "~Gautier_Dagan1;~Gabriel_Synnaeve1;~Baptiste_Roziere1", "gender": "M;M;", "homepage": "https://www.gautier.tech/;;", "dblp": "234/2268;http://dblp.uni-trier.de/pers/hd/s/Synnaeve:Gabriel;", "google_scholar": "fyqu2nIAAAAJ;wN9rBkcAAAAJ;CrSf2CQAAAAJ", "orcid": "0000-0002-1867-4201;;", "linkedin": "gautier-dagan/;;", "or_profile": "~Gautier_Dagan1;~Gabriel_Synnaeve1;~Baptiste_Roziere1", "aff": "University of Edinburgh, University of Edinburgh;Meta Facebook;Meta AI", "aff_domain": "ed.ac.uk;fb.com;fb.com", "position": "PhD student;Research Scientist;Researcher", "bibtex": "@inproceedings{\ndagan2024getting,\ntitle={Getting the most out of your tokenizer for pre-training and domain adaptation},\nauthor={Gautier Dagan and Gabriel Synnaeve and Baptiste Roziere},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZFYBnLljtT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1507537, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5076155361580940794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ed.ac.uk;fb.com;fb.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Edinburgh;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.ed.ac.uk;https://meta.com", "aff_unique_abbr": "Edinburgh;Meta", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "The Fundamental Limits of Least-Privilege Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33742", "id": "ZGEICuuUJo", "proceeding": "https://proceedings.mlr.press/v235/stadler24a.html", "pdf": "https://openreview.net/pdf?id=ZGEICuuUJo", "openreview": "https://openreview.net/forum?id=ZGEICuuUJo", "author_site": "Theresa Stadler, Bogdan Kulynych, Michael Gastpar, Nicolas Papernot, Carmela Troncoso", "tldr": "", "abstract": "The promise of least-privilege learning \u2013 to find feature representations that are useful for a learning task but prevent inference of any sensitive information unrelated to this task \u2013 is highly appealing. However, so far this concept has only been stated informally. It thus remains an open question whether and how we can achieve this goal. In this work, we provide the *first formalisation of the least-privilege principle for machine learning* and characterise its feasibility. We prove that there is a *fundamental trade-off* between a representation's utility for a given task and its leakage beyond the intended task: it is not possible to learn representations that have high utility for the intended task but, at the same time, prevent inference of any attribute other than the task label itself. This trade-off holds *regardless* of the technique used to learn the feature mappings that produce these representations. We empirically validate this result for a wide range of learning techniques, model architectures, and datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Theresa Stadler;Bogdan Kulynych;Michael Gastpar;Nicolas Papernot;Carmela Troncoso", "authorids": "~Theresa_Stadler1;~Bogdan_Kulynych1;~Michael_Gastpar1;~Nicolas_Papernot1;~Carmela_Troncoso1", "gender": ";Not Specified;;M;F", "homepage": "https://reslbesl.github.io;https://kulyny.ch;https://people.epfl.ch/michael.gastpar;https://www.papernot.fr;http://carmelatroncoso.com/", "dblp": ";203/9056;;162/1405;01/4825", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;https://scholar.google.ch/citations?user=IQ3hcw4AAAAJ;cGxq0cMAAAAJ;sMkt3SgAAAAJ", "orcid": ";;0000-0002-5499-5336;;0000-0002-2374-2248", "linkedin": ";;;nicolaspapernot;carmela-troncoso-b497975/?originalSubdomain=ch", "or_profile": "~Theresa_Stadler1;~Bogdan_Kulynych1;~Michael_Gastpar1;~Nicolas_Papernot1;~Carmela_Troncoso1", "aff": "Swiss Federal Institute of Technology Lausanne;CHUV - University Hospital Lausanne;School of Computer and Communication Sciences, EPFL - EPF Lausanne;Google;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;chuv.ch;ic.epfl.ch;google.com;epfl.ch", "position": "PhD student;Postdoc;Full Professor;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nstadler2024the,\ntitle={The Fundamental Limits of Least-Privilege Learning},\nauthor={Theresa Stadler and Bogdan Kulynych and Michael Gastpar and Nicolas Papernot and Carmela Troncoso},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZGEICuuUJo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1761247, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=10222119602895288907&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "epfl.ch;chuv.ch;ic.epfl.ch;google.com;epfl.ch", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;University Hospital Lausanne;EPFL;Google", "aff_unique_dep": ";;School of Computer and Communication Sciences;Google", "aff_unique_url": "https://www.epfl.ch;https://www.chuv.ch;https://www.epfl.ch;https://www.google.com", "aff_unique_abbr": "EPFL;CHUV;EPFL;Google", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Lausanne;Mountain View", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "AdsorbDiff: Adsorbate Placement via Conditional Denoising Diffusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33741", "id": "ZMgpE58PMj", "proceeding": "https://proceedings.mlr.press/v235/kolluru24a.html", "pdf": "https://openreview.net/pdf?id=ZMgpE58PMj", "openreview": "https://openreview.net/forum?id=ZMgpE58PMj", "author_site": "Adeesh Kolluru, John Kitchin", "tldr": "", "abstract": "Determining the optimal configuration of adsorbates on a slab (adslab) is pivotal in the exploration of novel catalysts across diverse applications. Traditionally, the quest for the lowest energy adslab configuration involves placing the adsorbate onto the slab followed by an optimization process. Prior methodologies have relied on heuristics, problem-specific intuitions, or brute-force approaches to guide adsorbate placement. In this work, we propose a novel framework for adsorbate placement using denoising diffusion. The model is designed to predict the optimal adsorbate site and orientation corresponding to the lowest energy configuration. Further, we have an end-to-end evaluation framework where diffusion-predicted adslab configuration is optimized with a pretrained machine learning force field and finally evaluated with Density Functional Theory (DFT). Our findings demonstrate an acceleration of up to 5x or 3.5x improvement in accuracy compared to the previous best approach. Given the novelty of this framework and application, we provide insights into the impact of pretraining, model architectures, and conduct extensive experiments to underscore the significance of this approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adeesh Kolluru;John R. Kitchin", "authorids": "~Adeesh_Kolluru1;~John_R._Kitchin1", "gender": ";M", "homepage": "https://adeeshkolluru.github.io/;https://kitchingroup.cheme.cmu.edu", "dblp": "295/8440;199/6691", "google_scholar": ";jD_4h7sAAAAJ", "orcid": "0000-0001-8125-6881;0000-0003-2625-9232", "linkedin": ";john-kitchin-6b959038/", "or_profile": "~Adeesh_Kolluru1;~John_Kitchin1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nkolluru2024adsorbdiff,\ntitle={AdsorbDiff: Adsorbate Placement via Conditional Denoising Diffusion},\nauthor={Adeesh Kolluru and John R. 
Kitchin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZMgpE58PMj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1077787, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18021500706943608663&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "cmu.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Sampling is as easy as keeping the consistency: convergence guarantee for Consistency Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33740", "id": "ZPiEIhQpos", "proceeding": "https://proceedings.mlr.press/v235/lyu24b.html", "pdf": "https://openreview.net/pdf?id=ZPiEIhQpos", "openreview": "https://openreview.net/forum?id=ZPiEIhQpos", "author_site": "Junlong Lyu, Zhitang Chen, Shoubo Feng", "tldr": "", "abstract": "We provide the first convergence guarantee for Consistency Models (CMs), a newly emerging type of one-step generative models that is capable of generating samples comparable to those sampled from state-of-the-art Diffusion Models. Our main result is that, under the basic assumptions on score-matching errors, consistency errors, and smoothness of the data distribution, CMs can efficiently generate samples in one step with small $W_2$ error to any real data distribution. Our results (1) hold for $L^2$-accurate assumptions on both score and consistency functions (rather than $L^\\infty$-accurate assumptions); (2) do not require strong assumptions on the data distribution such as log-Sobolev conditions; (3) scale polynomially in all parameters; and (4) match the state-of-the-art convergence guarantee for score-based generative models. We also show that the Multi-step Consistency Sampling procedure can further reduce the error compared to one-step sampling, which supports the original statement in the work of Song et al. Our result can be generalized to arbitrary bounded data distributions that may be supported on some low-dimensional sub-manifolds.
Our results further imply TV error guarantees when making some Langevin-based modifications to the output distributions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junlong Lyu;Zhitang Chen;Shoubo Feng", "authorids": "~Junlong_Lyu1;~Zhitang_Chen1;~Shoubo_Feng1", "gender": "M;M;M", "homepage": ";;", "dblp": "243/2962.html;06/10875;", "google_scholar": "S8ogqFcAAAAJ;;06JUo1MAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Junlong_Lyu1;~Zhitang_Chen1;~Shoubo_Feng1", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;Huawei Technologies Ltd.", "aff_domain": "huawei.com;huawei.com;huawei.com", "position": "Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nlyu2024sampling,\ntitle={Sampling is as easy as keeping the consistency: convergence guarantee for Consistency Models},\nauthor={Junlong Lyu and Zhitang Chen and Shoubo Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZPiEIhQpos}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 433246, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14628412173401979341&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "huawei.com;huawei.com;huawei.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Cooperative Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33739", "id": "ZQcqXCuoxD", "proceeding": "https://proceedings.mlr.press/v235/finkelshtein24a.html", "pdf": "https://openreview.net/pdf?id=ZQcqXCuoxD", "openreview": "https://openreview.net/forum?id=ZQcqXCuoxD", "author_site": "Ben Finkelshtein, Xingyue Huang, Michael Bronstein, Ismail Ceylan", "tldr": "", "abstract": "Graph neural networks are popular architectures for graph machine learning, based on iterative computation of node representations of an input graph through a series of invariant transformations. A large class of graph neural networks follow a standard message-passing paradigm: at every layer, each node state is updated based on an aggregate of messages from its neighborhood. In this work, we propose a novel framework for training graph neural networks, where every node is viewed as a player that can choose to either `listen`, `broadcast`, `listen and broadcast`, or `isolate`. The standard message propagation scheme can then be viewed as a special case of this framework where every node `listens and broadcasts` to all neighbors. Our approach offers a more flexible and dynamic message-passing paradigm, where each node can determine its own strategy based on its state, effectively exploring the graph topology while learning. We provide a theoretical analysis of the new message-passing scheme which is further supported by an extensive empirical analysis on synthetic and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ben Finkelshtein;Xingyue Huang;Michael M.
Bronstein;Ismail Ilkan Ceylan", "authorids": "~Ben_Finkelshtein1;~Xingyue_Huang1;~Michael_M._Bronstein1;~Ismail_Ilkan_Ceylan2", "gender": "M;M;M;", "homepage": "https://benfinkelshtein.github.io/;https://github.com/HxyScotthuang;http://www.inf.usi.ch/bronstein/;https://www.cs.ox.ac.uk/people/ismaililkan.ceylan/", "dblp": "278/2449.html;208/4778;07/2668;147/6111", "google_scholar": "goWM7VwAAAAJ;qah4McsAAAAJ;UU3N6-UAAAAJ;avJ5kQcAAAAJ", "orcid": ";;;0000-0003-4118-4689", "linkedin": "ben-finkelshtein/;xingyue-huang-1251651a2/;mbronstein/;", "or_profile": "~Ben_Finkelshtein1;~Xingyue_Huang1;~Michael_M._Bronstein1;~Ismail_Ilkan_Ceylan2", "aff": "University of Oxford;University of Oxford;University of Oxford;University of Oxford", "aff_domain": "cs.ox.ac.uk;cs.ox.ac.uk;ox.ac.uk;oxford.ac.uk", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nfinkelshtein2024cooperative,\ntitle={Cooperative Graph Neural Networks},\nauthor={Ben Finkelshtein and Xingyue Huang and Michael M. Bronstein and Ismail Ilkan Ceylan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZQcqXCuoxD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7358794, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12907554645302309822&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "cs.ox.ac.uk;cs.ox.ac.uk;ox.ac.uk;oxford.ac.uk", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "On Convergence of Incremental Gradient for Non-convex Smooth Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33738", "id": "ZRMQX6aTUS", "proceeding": "https://proceedings.mlr.press/v235/koloskova24a.html", "pdf": "https://openreview.net/pdf?id=ZRMQX6aTUS", "openreview": "https://openreview.net/forum?id=ZRMQX6aTUS", "author_site": "Anastasiia Koloskova, Nikita Doikov, Sebastian Stich, Martin Jaggi", "tldr": "", "abstract": "In machine learning and neural network optimization, algorithms like incremental gradient, single shuffle SGD, and random reshuffle SGD are popular due to their cache-mismatch efficiency and good practical convergence behavior. However, their optimization properties in theory, especially for non-convex smooth functions, remain incompletely explored. This paper delves into the convergence properties of SGD algorithms with arbitrary data ordering, within a broad framework for non-convex smooth functions. Our findings show enhanced convergence guarantees for incremental gradient and single shuffle SGD. 
In particular, if $n$ is the training set size, we improve the optimization term of the convergence guarantee needed to reach accuracy $\\epsilon$ by a factor of $n$, from $O \\left( \\frac{n}{\\epsilon} \\right)$ to $O \\left( \\frac{1}{\\epsilon} \\right)$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anastasia Koloskova;Nikita Doikov;Sebastian U Stich;Martin Jaggi", "authorids": "~Anastasia_Koloskova2;~Nikita_Doikov1;~Sebastian_U_Stich1;~Martin_Jaggi1", "gender": ";M;M;F", "homepage": "https://doikov.com;https://www.sstich.ch;https://mlo.epfl.ch;", "dblp": "222/9897;04/10549;17/4402;228/9222", "google_scholar": "YNBhhjUAAAAJ;https://scholar.google.ch/citations?user=8l-mDfQAAAAJ;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ;ldJpvE8AAAAJ", "orcid": ";;0000-0003-1579-5558;", "linkedin": ";;;", "or_profile": "~Nikita_Doikov1;~Sebastian_U_Stich1;~Martin_Jaggi1;~Anastasiia_Koloskova1", "aff": "EPFL - EPF Lausanne;CISPA Helmholtz Center for Information Security;EPFL;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;cispa.de;epfl.ch;epfl.ch", "position": "Postdoc;Tenure Track Faculty;Associate Professor;PhD student", "bibtex": "@inproceedings{\nkoloskova2024on,\ntitle={On Convergence of Incremental Gradient for Non-convex Smooth Functions},\nauthor={Anastasia Koloskova and Nikita Doikov and Sebastian U Stich and Martin Jaggi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZRMQX6aTUS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1373570, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6401443451280309657&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "epfl.ch;cispa.de;epfl.ch;epfl.ch", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "EPFL;CISPA Helmholtz Center for Information Security;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.cispa.de/;https://www.epfl.ch", "aff_unique_abbr": "EPFL;CISPA;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "Online Learning and Information Exponents: The Importance of Batch size & Time/Complexity Tradeoffs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33737", "id": "ZSQAf5YlvN", "proceeding": "https://proceedings.mlr.press/v235/arnaboldi24a.html", "pdf": "https://openreview.net/pdf?id=ZSQAf5YlvN", "openreview": "https://openreview.net/forum?id=ZSQAf5YlvN", "author_site": "Luca Arnaboldi, Yatin Dandi, FLORENT KRZAKALA, Bruno Loureiro, Luca Pesce, Ludovic Stephan", "tldr": "", "abstract": "We study the impact of the batch size $n_b$ on the iteration time $T$ of training two-layer neural networks with one-pass stochastic gradient descent (SGD) on multi-index target functions of isotropic covariates. We characterize the optimal batch size minimizing the iteration time as a function of the hardness of the target, as measured by the information exponents. We show that performing gradient updates with large batches $n_b \\lesssim d^{\\frac{\\ell}{2}}$ minimizes the training time without changing the total sample complexity, where $\\ell$ is the information exponent of the target to be learned and $d$ is the input dimension.
However, batch sizes $n_b \\gg d^{\\frac{\\ell}{2}}$ are detrimental to the time complexity of SGD. We provably overcome this fundamental limitation via a different training protocol, *Correlation loss SGD*, which suppresses the auto-correlation terms in the loss function. We show that one can track the training progress by a system of low-dimensional ordinary differential equations (ODEs). Finally, we validate our theoretical results with numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Arnaboldi;Yatin Dandi;Florent Krzakala;Bruno Loureiro;Luca Pesce;Ludovic Stephan", "authorids": "~Luca_Arnaboldi2;~Yatin_Dandi1;~Florent_Krzakala1;~Bruno_Loureiro1;~Luca_Pesce1;~Ludovic_Stephan2", "gender": "M;M;;M;M;M", "homepage": "https://arnaboldi.lu/;https://yatindandi.github.io/;http://Krzakala.org;https://brloureiro.github.io/;https://lucpoisson.github.io;", "dblp": "205/8141-2;255/6032;25/1282;207/1834;321/1650;230/4096", "google_scholar": "A-4QdoQAAAAJ;UiEzYkMAAAAJ;https://scholar.google.fr/citations?user=3jDeUlMAAAAJ;DXl3ir8AAAAJ;praGYvoAAAAJ;mEd3WCsAAAAJ", "orcid": "0009-0001-9739-8849;;0000-0003-2313-2578;0000-0002-6327-4688;;0000-0001-5612-3577", "linkedin": ";;;bruno-loureiro-43183b14a/;;", "or_profile": "~Luca_Arnaboldi2;~Yatin_Dandi1;~Florent_Krzakala1;~Bruno_Loureiro1;~Luca_Pesce1;~Ludovic_STEPHAN1", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;Ecole Normale Sup\u00e9rieure, Ecole Normale Sup\u00e9rieure de Paris;EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;epfl.ch;epfl.ch;di.ens.fr;epfl.ch;epfl.ch", "position": "PhD student;PhD student;Full Professor;Researcher;PhD student;Postdoc", "bibtex": "@inproceedings{\narnaboldi2024online,\ntitle={Online Learning and Information Exponents: The Importance of Batch size \\& Time/Complexity Tradeoffs},\nauthor={Luca Arnaboldi and Yatin Dandi and Florent Krzakala and Bruno Loureiro and Luca Pesce and Ludovic Stephan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZSQAf5YlvN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2131570, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16266084246427067376&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "epfl.ch;epfl.ch;epfl.ch;di.ens.fr;epfl.ch;epfl.ch", "author_num": 6, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne;Ecole Normale Sup\u00e9rieure de Paris", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.ens.psl.eu", "aff_unique_abbr": "EPFL;EPFL;ENS Paris", "aff_campus_unique_index": "0;0;0;1;0;0", "aff_campus_unique": "Lausanne;Paris", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "Switzerland;France" }, { "title": "MorphGrower: A Synchronized Layer-by-layer Growing Approach for Plausible Neuronal Morphology Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33736", "id": "ZTN866OsGx", "proceeding": "https://proceedings.mlr.press/v235/yang24ak.html", "pdf": "https://openreview.net/pdf?id=ZTN866OsGx", "openreview": "https://openreview.net/forum?id=ZTN866OsGx", "author_site": "Nianzu Yang, Kaipeng Zeng, Haotian Lu, Yexin Wu, Zexin Yuan, Danni Chen, Shengdian Jiang, Jiaxiang Wu, Yimin Wang,
Junchi Yan", "tldr": "", "abstract": "Neuronal morphology is essential for studying brain functioning and understanding neurodegenerative disorders. As acquiring real-world morphology data is expensive, computational approaches for morphology generation have been studied. Traditional methods heavily rely on expert-set rules and parameter tuning, making it difficult to generalize across different types of morphologies. Recently, MorphVAE was introduced as the sole learning-based method, but its generated morphologies lack plausibility, i.e., they do not appear realistic enough and most of the generated samples are topologically invalid. To fill this gap, this paper proposes **MorphGrower**, which mimicks the neuron natural growth mechanism for generation. Specifically, MorphGrower generates morphologies layer by layer, with each subsequent layer conditioned on the previously generated structure. During each layer generation, MorphGrower utilizes a pair of sibling branches as the basic generation block and generates branch pairs synchronously. This approach ensures topological validity and allows for fine-grained generation, thereby enhancing the realism of the final generated morphologies. Results on four real-world datasets demonstrate that MorphGrower outperforms MorphVAE by a notable margin. Importantly, the electrophysiological response simulation demonstrates the plausibility of our generated samples from a neuroscience perspective. Our code is available at [https://github.com/Thinklab-SJTU/MorphGrower](https://github.com/Thinklab-SJTU/MorphGrower).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nianzu Yang;Kaipeng Zeng;Haotian Lu;Yexin Wu;Zexin Yuan;Danni Chen;Shengdian Jiang;Jiaxiang Wu;Yimin Wang;Junchi Yan", "authorids": "~Nianzu_Yang1;~Kaipeng_Zeng1;~Haotian_Lu1;~Yexin_Wu2;~Zexin_Yuan1;~Danni_Chen1;~Shengdian_Jiang1;~Jiaxiang_Wu1;~Yimin_Wang3;~Junchi_Yan2", "gender": "M;M;M;;M;F;M;M;M;M", "homepage": "https://yangnianzu0515.github.io/;https://github.com/zengkaipeng;https://github.com/flick-ai;https://librowu.github.io/;;;https://orcid.org/0000-0002-2277-263X;;https://yi-min.wang;http://thinklab.sjtu.edu.cn/", "dblp": "296/8412.html;345/6420;;;;;311/3156;119/6799-1.html;;60/7949.html", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;;;;5N5l92EAAAAJ;https://scholar.google.com.hk/citations?user=puazh38AAAAJ;MNuQBkMAAAAJ;ga230VoAAAAJ", "orcid": ";0000-0002-4798-7784;;;0009-0003-4929-6439;0009-0007-6870-9111;0000-0002-2277-263X;;;0000-0001-9639-7679", "linkedin": ";;;;;;;;;", "or_profile": "~Nianzu_Yang1;~Kaipeng_Zeng1;~Haotian_Lu1;~Yexin_Wu2;~Zexin_Yuan1;~Danni_Chen1;~Shengdian_Jiang1;~Jiaxiang_Wu1;~Yimin_Wang3;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;University of Illinois, Urbana Champaign;Shanghai University;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;;;Guangdong Institute of Intelligence Science and Technology;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;illinois.edu;shu.edu.cn;mila.umontreal.ca;;;gdiist.cn;sjtu.edu.cn", "position": "PhD student;MS student;Undergrad student;MS student;MS student;Intern;;;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nyang2024morphgrower,\ntitle={MorphGrower: A Synchronized Layer-by-layer Growing Approach for Plausible Neuronal Morphology Generation},\nauthor={Nianzu Yang and Kaipeng Zeng and Haotian Lu and Yexin Wu and 
Zexin Yuan and Danni Chen and Shengdian Jiang and Jiaxiang Wu and Yimin Wang and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZTN866OsGx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9311870, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VmjZ_70lrZEJ:scholar.google.com/&scioq=MorphGrower:+A+Synchronized+Layer-by-layer+Growing+Approach+for+Plausible+Neuronal+Morphology+Generation&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;illinois.edu;shu.edu.cn;mila.umontreal.ca;;;gdiist.cn;sjtu.edu.cn", "author_num": 10, "aff_unique_index": "0;0;0;1;2;3;4;0", "aff_unique_norm": "Shanghai Jiao Tong University;University of Illinois Urbana-Champaign;Shanghai University;University of Montreal;Guangdong Institute of Intelligence Science and Technology", "aff_unique_dep": ";;;Montreal Institute for Learning Algorithms;", "aff_unique_url": "https://www.sjtu.edu.cn;https://illinois.edu;https://www.shu.edu.cn;https://www.mila.quebec;", "aff_unique_abbr": "SJTU;UIUC;SHU;MILA;", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Urbana-Champaign;Montreal", "aff_country_unique_index": "0;0;0;1;0;2;0;0", "aff_country_unique": "China;United States;Canada" }, { "title": "Safe Exploration in Dose Finding Clinical Trials with Heterogeneous Participants", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33735", "id": "ZUXvpIrz5l", "proceeding": "https://proceedings.mlr.press/v235/chien24a.html", "pdf": "https://openreview.net/pdf?id=ZUXvpIrz5l", "openreview": "https://openreview.net/forum?id=ZUXvpIrz5l", "author_site": "Isabel Chien, Wessel Bruinsma, Javier Gonzalez, Richard E Turner", "tldr": "", "abstract": "In drug development, early phase dose-finding clinical trials are carried out to identify an optimal dose to administer to patients in larger confirmatory clinical trials. Standard trial procedures do not optimize for participant benefit and do not consider participant heterogeneity, despite consequences to participants' health and downstream impacts on under-represented population subgroups. Many novel drugs also do not obey parametric modelling assumptions made in common dose-finding procedures. We present Safe Allocation for Exploration of Treatments (SAFE-T), a procedure for adaptive dose-finding that adheres to safety constraints, improves utility for heterogeneous participants, and works well with small sample sizes. SAFE-T flexibly learns non-parametric multi-output Gaussian process models for dose toxicity and efficacy, using Bayesian optimization, and provides accurate final dose recommendations. We provide theoretical guarantees for the satisfaction of safety constraints. Using a comprehensive set of realistic synthetic scenarios, we demonstrate empirically that SAFE-T generally outperforms comparable methods and maintains performance across variations in sample size and subgroup distribution. Finally, we extend SAFE-T to a new adaptive setting, demonstrating its potential to improve traditional clinical trial procedures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Isabel Chien;Wessel P Bruinsma;Javier Gonzalez;Richard E.
Turner", "authorids": "~Isabel_Chien2;~Wessel_P_Bruinsma1;~Javier_Gonzalez2;~Richard_E_Turner1", "gender": "F;M;M;", "homepage": "https://chieni.github.io/;http://javiergonzalezh.github.io/;https://rich-turner-group.github.io/;https://wessel.ai", "dblp": "225/7539;;40/5352;242/3348.html", "google_scholar": "vYO3ncQAAAAJ;;https://scholar.google.co.uk/citations?user=DgLEyZgAAAAJ;QRQwz3cAAAAJ", "orcid": "0000-0001-7207-8526;;;", "linkedin": ";;;", "or_profile": "~Isabel_Chien2;~Javier_Gonzalez2;~Richard_E_Turner1;~Wessel_Bruinsma1", "aff": "University of Cambridge;Microsoft;Microsoft Research;", "aff_domain": "cam.ac.uk;microsoft.com;research.microsoft.com;", "position": "PhD student;Principal Researcher;Researcher;", "bibtex": "@inproceedings{\nchien2024safe,\ntitle={Safe Exploration in Dose Finding Clinical Trials with Heterogeneous Participants},\nauthor={Isabel Chien and Wessel P Bruinsma and Javier Gonzalez and Richard E. Turner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZUXvpIrz5l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6846612, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C5xWPAS-36YJ:scholar.google.com/&scioq=Safe+Exploration+in+Dose+Finding+Clinical+Trials+with+Heterogeneous+Participants&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "cam.ac.uk;microsoft.com;research.microsoft.com;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Cambridge;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.cam.ac.uk;https://www.microsoft.com", "aff_unique_abbr": "Cambridge;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Improving Sample Efficiency of Model-Free Algorithms for Zero-Sum Markov Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33734", "id": "ZVmMV3AHjC", "proceeding": "https://proceedings.mlr.press/v235/feng24j.html", "pdf": "https://openreview.net/pdf?id=ZVmMV3AHjC", "openreview": "https://openreview.net/forum?id=ZVmMV3AHjC", "author_site": "Songtao Feng, Ming Yin, Yu-Xiang Wang, Jing Yang, Yingbin LIANG", "tldr": "", "abstract": "The problem of two-player zero-sum Markov games has recently attracted increasing interests in theoretical studies of multi-agent reinforcement learning (RL). In particular, for finite-horizon episodic Markov decision processes (MDPs), it has been shown that model-based algorithms can find an $\\epsilon$-optimal Nash Equilibrium (NE) with the sample complexity of $O(H^3SAB/\\epsilon^2)$, which is optimal in the dependence of the horizon $H$ and the number of states $S$ (where $A$ and $B$ denote the number of actions of the two players, respectively). However, none of the existing model-free algorithms can achieve such an optimality. In this work, we propose a model-free stage-based algorithm and show that it achieves the same sample complexity as the best model-based algorithm, and hence for the first time demonstrate that model-free algorithms can enjoy the same optimality in the $H$ dependence as model-based algorithms. 
The main improvement in the dependence on $H$ arises from leveraging the popular variance reduction technique based on the reference-advantage decomposition, previously used only for single-agent RL. However, such a technique relies on a critical monotonicity property of the value function, which does not hold in Markov games due to the update of the policy via the coarse correlated equilibrium (CCE) oracle. Thus, to extend this technique to Markov games, our algorithm features a key novel design: the reference value functions are updated as the pair of optimistic and pessimistic value functions whose value difference is the smallest in the history, which achieves the desired improvement in sample efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songtao Feng;Ming Yin;Yu-Xiang Wang;Jing Yang;Yingbin Liang", "authorids": "~Songtao_Feng1;~Ming_Yin4;~Yu-Xiang_Wang1;~Jing_Yang3;~Yingbin_Liang1", "gender": "M;M;;;F", "homepage": ";https://mingyin0312.github.io;http://www.cs.ucsb.edu/~yuxiangw/publications.html;http://www.ee.psu.edu/yang;https://sites.google.com/view/yingbinliang/home", "dblp": "217/1741;89/453.html;62/1637-3.html;;51/332", "google_scholar": "q11fVdcAAAAJ;ncBRYIUAAAAJ;HGNZ1fkAAAAJ;https://scholar.google.com/citations?hl=en;lGgLAiIAAAAJ", "orcid": ";0000-0001-6458-0751;;;", "linkedin": ";;;;", "or_profile": "~Songtao_Feng1;~Ming_Yin4;~Yu-Xiang_Wang1;~Jing_Yang3;~Yingbin_Liang1", "aff": "University of Florida;Princeton University;UC Santa Barbara;Pennsylvania State University;The Ohio State University", "aff_domain": "ufl.edu;princeton.edu;ucsb.edu;psu.edu;osu.edu", "position": "Postdoc;Postdoc;Assistant Professor;Associate Professor;Professor", "bibtex": "@inproceedings{\nfeng2024improving,\ntitle={Improving Sample Efficiency of Model-Free Algorithms for Zero-Sum Markov Games},\nauthor={Songtao Feng and Ming Yin and Yu-Xiang Wang and Jing Yang and Yingbin Liang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZVmMV3AHjC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 544114, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11342070135241651246&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "ufl.edu;princeton.edu;ucsb.edu;psu.edu;osu.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Florida;Princeton University;University of California, Santa Barbara;Pennsylvania State University;Ohio State University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ufl.edu;https://www.princeton.edu;https://www.ucsb.edu;https://www.psu.edu;https://www.osu.edu", "aff_unique_abbr": "UF;Princeton;UCSB;PSU;OSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "CaPS: Collaborative and Private Synthetic Data Generation from Distributed Sources", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33733", "id": "ZXsNkm3bxu", "proceeding": "https://proceedings.mlr.press/v235/pentyala24a.html", "pdf": "https://openreview.net/pdf?id=ZXsNkm3bxu", "openreview": "https://openreview.net/forum?id=ZXsNkm3bxu", "author_site": "Sikha Pentyala, Mayana Pereira, Martine De Cock", "tldr": "", "abstract": "Data is the lifeblood of the modern world, forming a fundamental part
of AI, decision-making, and research advances. With increasing interest in data, governments have taken important steps towards a regulated data world, drastically impacting data sharing and data usability and resulting in massive amounts of data confined within the walls of organizations. While synthetic data generation (SDG) is an appealing solution to break down these walls and enable data sharing, the main drawback of existing solutions is the assumption of a trusted aggregator for generative model training. Given that many data holders may not want to, or be legally allowed to, entrust a central entity with their raw data, we propose a framework for collaborative and private generation of synthetic tabular data from distributed data holders. Our solution is general, applicable to any marginal-based SDG, and provides input privacy by replacing the trusted aggregator with secure multi-party computation (MPC) protocols and output privacy via differential privacy (DP). We demonstrate the applicability and scalability of our approach for the state-of-the-art select-measure-generate SDG algorithms MWEM+PGM and AIM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sikha Pentyala;Mayana Pereira;Martine De Cock", "authorids": "~Sikha_Pentyala1;~Mayana_Pereira1;~Martine_De_Cock1", "gender": "F;F;F", "homepage": "https://sikhapentyala.github.io/;;http://faculty.washington.edu/mdecock/", "dblp": "284/9593.html;;", "google_scholar": "BcbKxy8AAAAJ;;UfCI-NcAAAAJ", "orcid": "0000-0001-7486-6016;0000-0001-8636-8882;0000-0001-7917-0771", "linkedin": "sikhapentyala/;;decockmartine/", "or_profile": "~Sikha_Pentyala1;~Mayana_Pereira1;~Martine_De_Cock1", "aff": "University of Washington;Microsoft;University of Washington", "aff_domain": "uw.edu;microsoft.com;uw.edu", "position": "PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\npentyala2024caps,\ntitle={Ca{PS}: Collaborative and Private Synthetic Data Generation from Distributed Sources},\nauthor={Sikha Pentyala and Mayana Pereira and Martine De Cock},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZXsNkm3bxu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 507293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15494974260361988763&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "email": "uw.edu;microsoft.com;uw.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Washington;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.washington.edu;https://www.microsoft.com", "aff_unique_abbr": "UW;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Few-Shot Character Understanding in Movies as an Assessment to Meta-Learning of Theory-of-Mind", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33732", "id": "ZZ7UKgK4c1", "proceeding": "https://proceedings.mlr.press/v235/yu24n.html", "pdf": "https://openreview.net/pdf?id=ZZ7UKgK4c1", "openreview": "https://openreview.net/forum?id=ZZ7UKgK4c1", "author_site": "Mo Yu, Qiujing Wang, Shunchi Zhang, Yisi Sang, Kangsheng Pu, Zekai Wei, Han Wang, Liyan Xu, Jing Li, Yue Yu, Jie Zhou", "tldr": "", "abstract": "When reading a story, humans can quickly understand new fictional
characters with a few observations, mainly by drawing analogies to fictional and real people they already know. This reflects the few-shot and meta-learning essence of humans' inference of characters' mental states, *i.e.*, theory-of-mind (ToM), which is largely ignored in existing research. We fill this gap with a novel NLP dataset in a realistic narrative understanding scenario, ToM-in-AMC. Our dataset consists of $\\sim$1,000 parsed movie scripts, each corresponding to a few-shot character understanding task that requires models to mimic humans' ability to quickly digest characters from a few starting scenes in a new movie. We further propose a novel ToM prompting approach designed to explicitly assess the influence of multiple ToM dimensions. It surpasses existing baseline models, underscoring the significance of modeling multiple ToM dimensions for our task. Our extensive human study verifies that humans are capable of solving our problem by inferring characters' mental states based on their previously seen movies. In comparison, all the AI systems lag $>20\\%$ behind humans, highlighting a notable limitation in existing approaches' ToM capabilities. Code and data are available at https://github.com/ShunchiZhang/ToM-in-AMC", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mo Yu;Qiujing Wang;Shunchi Zhang;Yisi Sang;Kangsheng Pu;Zekai Wei;Han Wang;Liyan Xu;Jing Li;Yue Yu;Jie Zhou", "authorids": "~Mo_Yu1;~Qiujing_Wang1;~Shunchi_Zhang1;~Yisi_Sang1;~Kangsheng_Pu1;~Zekai_Wei1;~Han_Wang9;~Liyan_Xu1;~Jing_Li21;~Yue_Yu3;~Jie_Zhou8", "gender": "M;;;;M;M;M;M;;;M", "homepage": "http://researcher.ibm.com/researcher/view.php?person=us-yum;https://qiujing.wang;;;;;https://hannight.github.io/;https://lxucs.github.io/;;;", "dblp": "32/7445.html;;;;;;;56/5386;;;00/5012-16", "google_scholar": "vC8DssQAAAAJ;;;;;;xA8AYqkAAAAJ;4mx32ogAAAAJ;;;https://scholar.google.com.hk/citations?user=OijxQCMAAAAJ", "orcid": ";;;;;;;;;;0000-0002-5899-5165", "linkedin": ";;;;kspu/;zekai-wei-a053031a3/;;liyan-xu/;;;", "or_profile": "~Mo_Yu1;~Qiujing_Wang1;~Shunchi_Zhang1;~Yisi_Sang1;~Kangsheng_Pu1;~Zekai_Wei1;~Han_Wang9;~Liyan_Xu1;~Jing_Li21;~Yue_Yu3;~Jie_Zhou8", "aff": "WeChat AI, Tencent;Xi'an Jiaotong University;;;;;University of North Carolina at Chapel Hill;WeChat AI, Tencent;;;WeChat AI, Tencent Inc.", "aff_domain": "tencent.com;xjtu.edu.cn;;;;;cs.unc.edu;tencent.com;;;tencent.com", "position": "Principal Researcher;Undergrad student;;;;;PhD student;Researcher;;;Principal Researcher", "bibtex": "@inproceedings{\nyu2024fewshot,\ntitle={Few-Shot Character Understanding in Movies as an Assessment to Meta-Learning of Theory-of-Mind},\nauthor={Mo Yu and Qiujing Wang and Shunchi Zhang and Yisi Sang and Kangsheng Pu and Zekai Wei and Han Wang and Liyan Xu and Jing Li and Yue Yu and Jie Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZZ7UKgK4c1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2594239, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2201265039296253679&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "tencent.com;xjtu.edu.cn;;;;;cs.unc.edu;tencent.com;;;tencent.com", "author_num": 11, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Tencent;Xi'an Jiao Tong University;University of North Carolina", "aff_unique_dep": "WeChat AI;;", "aff_unique_url":
"https://www.tencent.com;https://www.xjtu.edu.cn;https://www.unc.edu", "aff_unique_abbr": "Tencent;XJTU;UNC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33731", "id": "Zc22RDtsvP", "proceeding": "https://proceedings.mlr.press/v235/zhang24an.html", "pdf": "https://openreview.net/pdf?id=Zc22RDtsvP", "openreview": "https://openreview.net/forum?id=Zc22RDtsvP", "author_site": "Kai Zhang, Yi Luan, Hexiang Hu, Kenton Lee, Siyuan Qiao, Wenhu Chen, Yu Su, Ming-Wei Chang", "tldr": "", "abstract": "Image retrieval, i.e., finding desired images given a reference image, inherently encompasses rich, multi-faceted search intents that are difficult to capture solely using image-based measures. Recent works leverage text instructions to allow users to more freely express their search intents. However, they primarily focus on image pairs that are visually similar and/or can be characterized by a small set of pre-defined relations. The core thesis of this paper is that text instructions can enable retrieving images with richer relations beyond visual similarity. To show this, we introduce MagicLens, a series of self-supervised image retrieval models that support open-ended instructions. MagicLens is built on a key novel insight: image pairs that naturally occur on the same web pages contain a wide range of implicit relations (e.g., inside view of), and we can bring those implicit relations explicit by synthesizing instructions via foundation models. Trained on 36.7M (query image, instruction, target image) triplets with rich semantic relations mined from the web, MagicLens achieves results comparable with or better than prior best on eight benchmarks of various image retrieval tasks, while maintaining high parameter efficiency with a significantly smaller model size. Additional human analyses on a 1.4M-image unseen corpus further demonstrate the diversity of search intents supported by MagicLens. 
Code and models are publicly available at https://open-vision-language.github.io/MagicLens/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Zhang;Yi Luan;Hexiang Hu;Kenton Lee;Siyuan Qiao;Wenhu Chen;Yu Su;Ming-Wei Chang", "authorids": "~Kai_Zhang10;~Yi_Luan1;~Hexiang_Hu1;~Kenton_Lee1;~Siyuan_Qiao1;~Wenhu_Chen3;~Yu_Su2;~Ming-Wei_Chang3", "gender": "M;F;;M;M;;M;", "homepage": "https://drogozhang.github.io;;;https://kentonl.com/;https://www.cs.jhu.edu/~syqiao/;;http://ysu1989.github.io;", "dblp": "55/957-33;125/7491;;121/7560;43/7562;;38/1070-1;", "google_scholar": "sDnAIsgAAAAJ;0i5Ys-4AAAAJ;;qXwJkr8AAAAJ;6Hfk-90AAAAJ;;rIh5OqoAAAAJ;", "orcid": ";;;;;;;", "linkedin": "kai-zhang-43774b196/;;;;;;;", "or_profile": "~Kai_Zhang10;~Yi_Luan1;~Hexiang_Hu1;~Kenton_Lee1;~Siyuan_Qiao1;~Wenhu_Chen3;~Yu_Su2;~Ming-Wei_Chang3", "aff": "Google DeepMind;Google;;Google Research;Google;;Microsoft;", "aff_domain": "google.com;google.com;;google.com;google.com;;microsoft.com;", "position": "Student Researcher;Research Scientist;;Research Scientist;Research Scientist;;Senior Researcher;", "bibtex": "@inproceedings{\nzhang2024magiclens,\ntitle={MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions},\nauthor={Kai Zhang and Yi Luan and Hexiang Hu and Kenton Lee and Siyuan Qiao and Wenhu Chen and Yu Su and Ming-Wei Chang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zc22RDtsvP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8103520, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16412315041040059252&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "google.com;google.com;;google.com;google.com;;microsoft.com;", "author_num": 8, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Google;Microsoft", "aff_unique_dep": "Google DeepMind;Microsoft Corporation", "aff_unique_url": "https://deepmind.com;https://www.microsoft.com", "aff_unique_abbr": "DeepMind;Microsoft", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "OSSCAR: One-Shot Structured Pruning in Vision and Language Models with Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33730", "id": "ZctlF8RlV4", "proceeding": "https://proceedings.mlr.press/v235/meng24a.html", "pdf": "https://openreview.net/pdf?id=ZctlF8RlV4", "openreview": "https://openreview.net/forum?id=ZctlF8RlV4", "author_site": "Xiang Meng, Shibal Ibrahim, Kayhan Behdin, Hussein Hazimeh, Natalia Ponomareva, Rahul Mazumder", "tldr": "", "abstract": "Structured pruning is a promising approach for reducing the inference costs of large vision and language models. By removing carefully chosen structures, e.g., neurons or attention heads, the improvements from this approach can be realized on standard deep learning hardware. In this work, we focus on structured pruning in the one-shot (post-training) setting, which does not require model retraining after pruning. We propose a novel combinatorial optimization framework for this problem, based on a layer-wise reconstruction objective and a careful reformulation that allows for scalable optimization.
Moreover, we design a new local combinatorial optimization algorithm, which exploits low-rank updates for efficient local search. Our framework is time and memory-efficient and considerably improves upon state-of-the-art one-shot methods on vision models (e.g., ResNet50, MobileNet) and language models (e.g., OPT-1.3B -- OPT-30B). For language models, e.g., OPT-2.7B, OSSCAR can lead to $125\\times$ lower test perplexity on WikiText with $2\\times$ inference time speedup in comparison to the state-of-the-art ZipLM approach. Our framework is also $6\\times$ -- $8\\times$ faster. Notably, our work considers models with tens of billions of parameters, which is up to $100\\times$ larger than what has been previously considered in the structured pruning literature. Our code is available at https://github.com/mazumder-lab/OSSCAR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiang Meng;Shibal Ibrahim;Kayhan Behdin;Hussein Hazimeh;Natalia Ponomareva;Rahul Mazumder", "authorids": "~Xiang_Meng1;~Shibal_Ibrahim1;~Kayhan_Behdin1;~Hussein_Hazimeh1;~Natalia_Ponomareva1;~Rahul_Mazumder1", "gender": "M;M;M;;F;M", "homepage": "https://www.linkedin.com/in/xiang-meng-0888b0201/;https://sites.google.com/view/shibal-ibrahim/;http://mit.edu/~behdink/www/;http://www.mit.edu/~hazimeh;;http://www.mit.edu/~rahulmaz/", "dblp": ";177/1113;199/2308.html;165/0820-1;71/6768-1;11/9365.html", "google_scholar": "AyWinq8AAAAJ;_ADL3k8AAAAJ;;;eIdQR5oAAAAJ;cyCp3pIAAAAJ", "orcid": ";0000-0002-3300-0213;0000-0003-3482-0421;0000-0003-4501-0678;0009-0005-6761-1468;0000-0003-1384-9743", "linkedin": ";shibal-ibrahim-70097b77;;;;", "or_profile": "~Xiang_Meng1;~Shibal_Ibrahim1;~Kayhan_Behdin1;~Hussein_Hazimeh1;~Natalia_Ponomareva1;~Rahul_Mazumder1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Google;Google;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;google.com;google.com;mit.edu", "position": "PhD student;PhD student;PhD student;Research Scientist;Software Engineer in Research;Associate Professor", "bibtex": "@inproceedings{\nmeng2024osscar,\ntitle={{OSSCAR}: One-Shot Structured Pruning in Vision and Language Models with Combinatorial Optimization},\nauthor={Xiang Meng and Shibal Ibrahim and Kayhan Behdin and Hussein Hazimeh and Natalia Ponomareva and Rahul Mazumder},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZctlF8RlV4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 493924, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11642872467965616596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mit.edu;mit.edu;mit.edu;google.com;google.com;mit.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://web.mit.edu;https://www.google.com", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Score-Based Causal Discovery of Latent Variable Causal Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33729", "id": "ZdSe1qnuia", "proceeding": "https://proceedings.mlr.press/v235/ng24a.html", 
"pdf": "https://openreview.net/pdf?id=ZdSe1qnuia", "openreview": "https://openreview.net/forum?id=ZdSe1qnuia", "author_site": "Ignavier Ng, Xinshuai Dong, Haoyue Dai, Biwei Huang, Peter Spirtes, Kun Zhang", "tldr": "", "abstract": "Identifying latent variables and the causal structure involving them is essential across various scientific fields. While many existing works fall under the category of constraint-based methods (with e.g. conditional independence or rank deficiency tests), they may face empirical challenges such as testing-order dependency, error propagation, and choosing an appropriate significance level. These issues can potentially be mitigated by properly designed score-based methods, such as Greedy Equivalence Search (GES) (Chickering, 2002) in the specific setting without latent variables. Yet, formulating score-based methods with latent variables is highly challenging. In this work, we develop score-based methods that are capable of identifying causal structures containing causally-related latent variables with identifiability guarantees. Specifically, we show that a properly formulated scoring function can achieve score equivalence and consistency for structure learning of latent variable causal models. We further provide a characterization of the degrees of freedom for the marginal over the observed variables under multiple structural assumptions considered in the literature, and accordingly develop both exact and continuous score-based methods. This offers a unified view of several existing constraint-based methods with different structural assumptions. Experimental results validate the effectiveness of the proposed methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ignavier Ng;Xinshuai Dong;Haoyue Dai;Biwei Huang;Peter Spirtes;Kun Zhang", "authorids": "~Ignavier_Ng1;~Xinshuai_Dong1;~Haoyue_Dai1;~Biwei_Huang1;~Peter_Spirtes1;~Kun_Zhang1", "gender": "M;M;;F;M;M", "homepage": "https://ignavierng.github.io/;https://dongxinshuai.github.io/;https://hyda.cc;;https://www.cmu.edu/dietrich/philosophy/people/faculty/spirtes.html;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "251/3037;279/6151.html;277/1316;165/3288;87/3550;96/3115-1", "google_scholar": ";A7JyL1sAAAAJ;f4tCtoMAAAAJ;;mar1eCwAAAAJ;RGoypN4AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Ignavier_Ng1;~Xinshuai_Dong1;~Haoyue_Dai1;~Biwei_Huang1;~Peter_Spirtes1;~Kun_Zhang1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;University of California, San Diego;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;ucsd.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nng2024scorebased,\ntitle={Score-Based Causal Discovery of Latent Variable Causal Models},\nauthor={Ignavier Ng and Xinshuai Dong and Haoyue Dai and Biwei Huang and Peter Spirtes and Kun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZdSe1qnuia}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1018320, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9610755648101812703&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "cmu.edu;cmu.edu;cmu.edu;ucsd.edu;cmu.edu;cmu.edu", "author_num": 6, "aff_unique_index": 
"0;0;0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.ucsd.edu", "aff_unique_abbr": "CMU;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generalization Bound and New Algorithm for Clean-Label Backdoor Attack", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33728", "id": "ZdqiT0McON", "proceeding": "https://proceedings.mlr.press/v235/yu24i.html", "pdf": "https://openreview.net/pdf?id=ZdqiT0McON", "openreview": "https://openreview.net/forum?id=ZdqiT0McON", "author_site": "Lijia Yu, Shuang Liu, Yibo Miao, Xiao-Shan Gao, Lijun Zhang", "tldr": "", "abstract": "The generalization bound is a crucial theoretical tool for assessing the generalizability of learning methods and there exist vast literatures on generalizability of normal learning, adversarial learning, and data poisoning. Unlike other data poison attacks, the backdoor attack has the special property that the poisoned triggers are contained in both the training set and the test set and the purpose of the attack is two-fold. To our knowledge, the generalization bound for the backdoor attack has not been established. In this paper, we fill this gap by deriving algorithm-independent generalization bounds in the clean-label backdoor attack scenario. Precisely, based on the goals of backdoor attack, we give upper bounds for the clean sample population errors and the poison population errors in terms of the empirical error on the poisoned training dataset. Furthermore, based on the theoretical result, a new clean-label backdoor attack is proposed that computes the poisoning trigger by combining adversarial noise and indiscriminate poison. 
We show its effectiveness in a variety of settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lijia Yu;Shuang Liu;Yibo Miao;Xiao-Shan Gao;Lijun Zhang", "authorids": "~Lijia_Yu2;~Shuang_Liu5;~Yibo_Miao1;~Xiao-Shan_Gao2;~Lijun_Zhang2", "gender": "M;M;M;M;M", "homepage": ";;http://www.amss.ac.cn/;http://www.mmrc.iss.ac.cn/~xgao/;", "dblp": "175/8873.html;;332/0699;13/3109;76/4015-1", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;;_se7GmUAAAAJ;", "orcid": ";;;0000-0003-2021-9395;", "linkedin": ";;;;", "or_profile": "~Lijia_Yu2;~Shuang_Liu5;~Yibo_Miao1;~Xiao-Shan_Gao2;~Lijun_Zhang2", "aff": "Institute of Software, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Intel;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Chinese Academy of Sciences, Chinese Academy of Sciences", "aff_domain": "ios.ac.cn;amss.ac.cn;intel.com;amss.ac.cn;ios.ac.cn", "position": "Postdoc;PhD student;Intern;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyu2024generalization,\ntitle={Generalization Bound and New Algorithm for Clean-Label Backdoor Attack},\nauthor={Lijia Yu and Shuang Liu and Yibo Miao and Xiao-Shan Gao and Lijun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZdqiT0McON}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1242405, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13632034779474108831&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ios.ac.cn;amss.ac.cn;intel.com;amss.ac.cn;ios.ac.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Chinese Academy of Sciences;Intel", "aff_unique_dep": "Institute of Software;Intel Corporation", "aff_unique_url": "http://www.ios.ac.cn;https://www.intel.com", "aff_unique_abbr": "CAS;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Optimal Acceleration for Minimax and Fixed-Point Problems is Not Unique", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33727", "id": "ZeF75iQcAc", "proceeding": "https://proceedings.mlr.press/v235/yoon24b.html", "pdf": "https://openreview.net/pdf?id=ZeF75iQcAc", "openreview": "https://openreview.net/forum?id=ZeF75iQcAc", "author_site": "TaeHo Yoon, Jaeyeon Kim, Jaewook Suh, Ernest Ryu", "tldr": "", "abstract": "Recently, accelerated algorithms using the anchoring mechanism for minimax optimization and fixed-point problems have been proposed, and matching complexity lower bounds establish their optimality. In this work, we present the surprising observation that the optimal acceleration mechanism in minimax optimization and fixed-point problems is not unique. Our new algorithms achieve exactly the same worst-case convergence rates as existing anchor-based methods while using materially different acceleration mechanisms. Specifically, these new algorithms are dual to the prior anchor-based accelerated methods in the sense of H-duality. 
This finding opens a new avenue of research on accelerated algorithms since we now have a family of methods that empirically exhibit varied characteristics while having the same optimal worst-case guarantee.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "TaeHo Yoon;Jaeyeon Kim;Jaewook J. Suh;Ernest K. Ryu", "authorids": "~TaeHo_Yoon1;~Jaeyeon_Kim2;~Jaewook_J._Suh1;~Ernest_K._Ryu1", "gender": "M;M;M;M", "homepage": "https://tetrzim.github.io/;https://jaeyeonkim01.github.io/;https://jaewookjsuh.github.io/;http://www.math.snu.ac.kr/~ernestryu/", "dblp": "285/5543;;323/9242;165/5192", "google_scholar": "YHkh8eYAAAAJ;1bXthLsAAAAJ;https://scholar.google.com/citations?hl=ko;CNOqUZoAAAAJ", "orcid": ";;;0000-0001-6820-9095", "linkedin": ";;;", "or_profile": "~TaeHo_Yoon1;~Jaeyeon_Kim2;~Jaewook_J._Suh1;~Ernest_K._Ryu1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;Undergrad student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyoon2024optimal,\ntitle={Optimal Acceleration for Minimax and Fixed-Point Problems is Not Unique},\nauthor={TaeHo Yoon and Jaeyeon Kim and Jaewook J. Suh and Ernest K. Ryu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZeF75iQcAc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 810342, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7781114785760547640&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Convex Relaxations of ReLU Neural Networks Approximate Global Optima in Polynomial Time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33726", "id": "Zn44XGFGam", "proceeding": "https://proceedings.mlr.press/v235/kim24ac.html", "pdf": "https://openreview.net/pdf?id=Zn44XGFGam", "openreview": "https://openreview.net/forum?id=Zn44XGFGam", "author_site": "Sungyoon Kim, Mert Pilanci", "tldr": "", "abstract": "In this paper, we study the optimality gap between two-layer ReLU networks regularized with weight decay and their convex relaxations. We show that when the training data is random, the relative optimality gap between the original problem and its relaxation can be bounded by a factor of O(\u221alog n), where n is the number of training samples. A simple application leads to a tractable polynomial-time algorithm that is guaranteed to solve the original non-convex problem up to a logarithmic factor. Moreover, under mild assumptions, we show that local gradient methods converge to a point with low training loss with high probability. 
Our result is an exponential improvement compared to existing results and sheds new light on understanding why local gradient methods work well.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sungyoon Kim;Mert Pilanci", "authorids": "~Sungyoon_Kim3;~Mert_Pilanci3", "gender": "M;M", "homepage": "https://laundaryman.github.io/;https://stanford.edu/~pilanci/", "dblp": "329/0052;45/8056", "google_scholar": ";aSAS-aAAAAAJ", "orcid": ";", "linkedin": ";mert-pilanci-ba615743/", "or_profile": "~Sungyoon_Kim3;~Mert_Pilanci3", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkim2024convex,\ntitle={Convex Relaxations of Re{LU} Neural Networks Approximate Global Optima in Polynomial Time},\nauthor={Sungyoon Kim and Mert Pilanci},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zn44XGFGam}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 639532, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5586483381491351866&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Probabilistic Constrained Reinforcement Learning with Formal Interpretability", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33725", "id": "Zo9zXdVhW2", "proceeding": "https://proceedings.mlr.press/v235/wang24bf.html", "pdf": "https://openreview.net/pdf?id=Zo9zXdVhW2", "openreview": "https://openreview.net/forum?id=Zo9zXdVhW2", "author_site": "YANRAN WANG, QIUCHEN QIAN, David Boyle", "tldr": "", "abstract": "Reinforcement learning can provide effective reasoning for sequential decision-making problems with variable dynamics. Such reasoning in practical implementation, however, poses a persistent challenge in interpreting the reward function and the corresponding optimal policy. Consequently, representing sequential decision-making problems as probabilistic inference can have considerable value, as, in principle, the inference offers diverse and powerful mathematical tools to infer the stochastic dynamics whilst suggesting a probabilistic interpretation of policy optimization. In this study, we propose a novel Adaptive Wasserstein Variational Optimization, namely AWaVO, to tackle these interpretability challenges. Our approach uses formal methods to achieve the interpretability: convergence guarantee, training transparency, and intrinsic decision-interpretation. To demonstrate its practicality, we showcase guaranteed interpretability including a global convergence rate $\\Theta(1/\\sqrt{T})$ not only in simulation but also in real-world quadrotor tasks. 
In comparison with state-of-the-art benchmarks, including TRPO-IPO, PCPO, and CRPO, we empirically verify that AWaVO offers a reasonable trade-off between high performance and sufficient interpretability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "YANRAN WANG;QIUCHEN QIAN;David Boyle", "authorids": "~YANRAN_WANG3;~QIUCHEN_QIAN1;~David_Boyle1", "gender": "M;M;M", "homepage": "https://alex-yanranwang.github.io/;;https://www.imperial.ac.uk/people/david.boyle", "dblp": ";;08/1254", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=XRzAhnYAAAAJ", "orcid": "0000-0003-1107-4235;;0000-0002-1993-4482", "linkedin": "yanran-wang-3882a223b/;;", "or_profile": "~YANRAN_WANG3;~QIUCHEN_QIAN1;~David_Boyle1", "aff": "Imperial College London;Imperial College London, Imperial College London;Imperial College London, Imperial College London", "aff_domain": "imperial.ac.uk;imperial.ac.uk;imperial.ac.uk", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nwang2024probabilistic,\ntitle={Probabilistic Constrained Reinforcement Learning with Formal Interpretability},\nauthor={YANRAN WANG and QIUCHEN QIAN and David Boyle},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zo9zXdVhW2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9304707, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6675534011838279673&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "imperial.ac.uk;imperial.ac.uk;imperial.ac.uk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Discounted Adaptive Online Learning: Towards Better Regularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33724", "id": "ZoTIdyExx6", "proceeding": "https://proceedings.mlr.press/v235/zhang24e.html", "pdf": "https://openreview.net/pdf?id=ZoTIdyExx6", "openreview": "https://openreview.net/forum?id=ZoTIdyExx6", "author_site": "Zhiyu Zhang, David Bombara, Heng Yang", "tldr": "", "abstract": "We study online learning in adversarial nonstationary environments. Since the future can be very different from the past, a critical challenge is to gracefully forget the history while new data comes in. To formalize this intuition, we revisit the discounted regret in online convex optimization, and propose an adaptive (i.e., instance optimal), FTRL-based algorithm that improves the widespread non-adaptive baseline -- gradient descent with a constant learning rate. From a practical perspective, this refines the classical idea of regularization in lifelong learning: we show that designing better regularizers can be guided by the principled theory of adaptive online optimization. Complementing this result, we also consider the (Gibbs & Candes, 2021)-style online conformal prediction problem, where the goal is to sequentially predict the uncertainty sets of a black-box machine learning model. 
We show that the FTRL nature of our algorithm can simplify the conventional gradient-descent-based analysis, leading to instance-dependent performance guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyu Zhang;David Bombara;Heng Yang", "authorids": "~Zhiyu_Zhang1;~David_Bombara1;~Heng_Yang4", "gender": ";M;M", "homepage": "https://zhiyuzz.github.io/;;https://hankyang.seas.harvard.edu/", "dblp": "45/6271-3;;83/415-2", "google_scholar": "5KHfVTQAAAAJ;81Yx5YMAAAAJ;GuKEDfixZqsC", "orcid": ";;", "linkedin": ";david-bombara-jr/;", "or_profile": "~Zhiyu_Zhang1;~David_Bombara1;~Heng_Yang4", "aff": "Harvard University;Harvard University, Harvard University;NVIDIA", "aff_domain": "harvard.edu;g.harvard.edu;nvidia.com", "position": "Postdoc;PhD student;Researcher", "bibtex": "@inproceedings{\nzhang2024discounted,\ntitle={Discounted Adaptive Online Learning: Towards Better Regularization},\nauthor={Zhiyu Zhang and David Bombara and Heng Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZoTIdyExx6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 814744, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6649996803674740946&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "harvard.edu;g.harvard.edu;nvidia.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Harvard University;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.harvard.edu;https://www.nvidia.com", "aff_unique_abbr": "Harvard;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fast-Slow Test-Time Adaptation for Online Vision-and-Language Navigation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33723", "id": "Zos5wsaB5r", "proceeding": "https://proceedings.mlr.press/v235/gao24p.html", "pdf": "https://openreview.net/pdf?id=Zos5wsaB5r", "openreview": "https://openreview.net/forum?id=Zos5wsaB5r", "author_site": "JUNYU GAO, Xuan Yao, Changsheng Xu", "tldr": "", "abstract": "The ability to accurately comprehend natural language instructions and navigate to the target location is essential for an embodied agent. Such agents are typically required to execute user instructions in an online manner, leading us to explore the use of unlabeled test samples for effective online model adaptation. However, for online Vision-and-Language Navigation (VLN), due to the intrinsic nature of inter-sample online instruction execution and intra-sample multi-step action decision, frequent updates can result in drastic changes in model parameters, while occasional updates can make the model ill-equipped to handle dynamically changing environments. Therefore, we propose a Fast-Slow Test-Time Adaptation (FSTTA) approach for online VLN by performing joint decomposition-accumulation analysis for both gradients and parameters in a unified framework. Extensive experiments show that our method obtains impressive performance gains on four popular benchmarks. 
Code is available at https://github.com/Feliciaxyao/ICML2024-FSTTA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junyu Gao;Xuan Yao;Changsheng Xu", "authorids": "~Junyu_Gao1;~Xuan_Yao1;~Changsheng_Xu1", "gender": "M;F;M", "homepage": ";https://github.com/Feliciaxyao;", "dblp": "153/4522;;85/1301", "google_scholar": "y1nOY24AAAAJ;https://scholar.google.com.hk/citations?user=dTx7sN8AAAAJ;https://scholar.google.com.sg/citations?user=hI9NRDkAAAAJ", "orcid": ";0009-0000-8115-3954;", "linkedin": ";;", "or_profile": "~Junyu_Gao1;~Xuan_Yao1;~Changsheng_Xu1", "aff": "Institute of Automation, Chinese Academy of Sciences;MAIS;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;ia.ac.cn", "position": "Associate Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\ngao2024fastslow,\ntitle={Fast-Slow Test-Time Adaptation for Online Vision-and-Language Navigation},\nauthor={Junyu Gao and Xuan Yao and Changsheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zos5wsaB5r}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1514767, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=623451420966493835&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 7, "email": "ia.ac.cn;ia.ac.cn;ia.ac.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese Academy of Sciences;MAIS", "aff_unique_dep": "Institute of Automation;", "aff_unique_url": "http://www.ia.cas.cn;", "aff_unique_abbr": "CAS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China;" }, { "title": "Bridging Environments and Language with Rendering Functions and Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33722", "id": "ZrM67ZZ5vj", "proceeding": "https://proceedings.mlr.press/v235/cachet24a.html", "pdf": "https://openreview.net/pdf?id=ZrM67ZZ5vj", "openreview": "https://openreview.net/forum?id=ZrM67ZZ5vj", "author_site": "Th\u00e9o Cachet, Christopher Dance, Olivier Sigaud", "tldr": "", "abstract": "Vision-language models (VLMs) have tremendous potential for *grounding* language, and thus enabling *language-conditioned agents (LCAs)* to perform diverse tasks specified with text. This has motivated the study of LCAs based on reinforcement learning (RL) with rewards given by rendering images of an environment and evaluating those images with VLMs. If single-task RL is employed, such approaches are limited by the cost and time required to train a policy for each new task. Multi-task RL (MTRL) is a natural alternative, but requires a carefully designed corpus of training tasks and does not always generalize reliably to new tasks. Therefore, this paper introduces a novel decomposition of the problem of building an LCA: first find an *environment configuration* that has a high VLM score for text describing a task; then use a (pretrained) goal-conditioned policy to reach that configuration. We also explore several enhancements to the speed and quality of VLM-based LCAs, notably, the use of distilled models, and the evaluation of configurations from multiple viewpoints to resolve the ambiguities inherent in a single 2D view. 
We demonstrate our approach on the Humanoid environment, showing that it results in LCAs that outperform MTRL baselines in zero-shot generalization, without requiring any textual task descriptions or other forms of environment-specific annotation during training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Theo Cachet;Christopher R Dance;Olivier Sigaud", "authorids": "~Theo_Cachet1;~Christopher_R_Dance1;~Olivier_Sigaud1", "gender": "M;M;M", "homepage": ";https://europe.naverlabs.com/;http://people.isir.upmc.fr/sigaud", "dblp": ";46/404;50/5522", "google_scholar": "-5oOttMAAAAJ;_MVzC_EAAAAJ;https://scholar.google.fr/citations?user=elLfDv0AAAAJ", "orcid": ";;0000-0002-8544-0229", "linkedin": "theocachet/;;", "or_profile": "~Theo_Cachet1;~Christopher_R_Dance1;~Olivier_Sigaud1", "aff": "Sorbonne Universit\u00e9 - Facult\u00e9 des Sciences (Paris VI);Naver Labs Europe;Sorbonne Universit\u00e9", "aff_domain": "sorbonne-universite.fr;naverlabs.com;upmc.fr", "position": "PhD student;Research Fellow;Full Professor", "bibtex": "@inproceedings{\ncachet2024bridging,\ntitle={Bridging Environments and Language with Rendering Functions and Vision-Language Models},\nauthor={Theo Cachet and Christopher R Dance and Olivier Sigaud},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZrM67ZZ5vj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7479300, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14861219187875980309&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "sorbonne-universite.fr;naverlabs.com;upmc.fr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Sorbonne Universit\u00e9;NAVER LABS", "aff_unique_dep": "Facult\u00e9 des Sciences;", "aff_unique_url": "https://www.sorbonne-universite.fr;https://labs.naver.com", "aff_unique_abbr": "Sorbonne U;NLE", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris VI;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "France;Unknown" }, { "title": "Smoothing Proximal Gradient Methods for Nonsmooth Sparsity Constrained Optimization: Optimality Conditions and Global Convergence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33721", "id": "Zs3qW8Njov", "proceeding": "https://proceedings.mlr.press/v235/yuan24a.html", "pdf": "https://openreview.net/pdf?id=Zs3qW8Njov", "openreview": "https://openreview.net/forum?id=Zs3qW8Njov", "tldr": "", "abstract": "Nonsmooth sparsity constrained optimization encompasses a broad spectrum of applications in machine learning. This problem is generally non-convex and NP-hard. Existing solutions to this problem exhibit several notable limitations, including their inability to address general nonsmooth problems, tendency to yield weaker optimality conditions, and lack of comprehensive convergence analysis. This paper considers Smoothing Proximal Gradient Methods (SPGM) as solutions to nonsmooth sparsity constrained optimization problems. Two specific variants of SPGM are explored: one based on Iterative Hard Thresholding (SPGM-IHT) and the other on Block Coordinate Decomposition (SPGM-BCD). It is shown that the SPGM-BCD algorithm finds stronger stationary points compared to previous methods. 
Additionally, novel theories for analyzing the convergence rates to approximate global optimal solutions of both the SPGM-IHT and SPGM-BCD algorithms are developed. Our theoretical bounds, capitalizing on the intrinsic sparsity of the optimization problem, are on par with the best-known error bounds available to date. Finally, numerical experiments reveal that SPGM-IHT performs comparably to current IHT-style methods, while SPGM-BCD consistently surpasses them.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ganzhao Yuan", "authorids": "~Ganzhao_Yuan1", "gender": "M", "homepage": "https://yuangzh.github.io/", "dblp": "117/4877", "google_scholar": "https://scholar.google.ca/citations?hl=en", "orcid": "", "linkedin": "", "or_profile": "~Ganzhao_Yuan1", "aff": "Peng Cheng Laboratory", "aff_domain": "pcl.ac.cn", "position": "Associate Professor", "bibtex": "@inproceedings{\nyuan2024smoothing,\ntitle={Smoothing Proximal Gradient Methods for Nonsmooth Sparsity Constrained Optimization: Optimality Conditions and Global Convergence},\nauthor={Ganzhao Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zs3qW8Njov}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 839133, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5733970485864575962&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "pcl.ac.cn", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Pengcheng Laboratory", "aff_unique_dep": "Peng Cheng Laboratory", "aff_unique_url": "http://www.pcl.ac.cn", "aff_unique_abbr": "PCL", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "GeminiFusion: Efficient Pixel-wise Multimodal Fusion for Vision Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33720", "id": "Zsz9Pdfvtg", "proceeding": "https://proceedings.mlr.press/v235/jia24b.html", "pdf": "https://openreview.net/pdf?id=Zsz9Pdfvtg", "openreview": "https://openreview.net/forum?id=Zsz9Pdfvtg", "author_site": "Ding Jia, Jianyuan Guo, Kai Han, Han Wu, Chao Zhang, Chang Xu, Xinghao Chen", "tldr": "", "abstract": "Cross-modal transformers have demonstrated superiority in various vision tasks by effectively integrating different modalities. This paper first critiques prior token-exchange methods, which replace less informative tokens with inter-modal features, and demonstrates that exchange-based methods underperform cross-attention mechanisms, while the computational demand of the latter inevitably restricts its use with longer sequences. To surmount the computational challenges, we propose *GeminiFusion*, a pixel-wise fusion approach that capitalizes on aligned cross-modal representations. *GeminiFusion* elegantly combines intra-modal and inter-modal attentions, dynamically integrating complementary information across modalities. We employ a layer-adaptive noise to adaptively control their interplay on a per-layer basis, thereby achieving a harmonized fusion process. Notably, *GeminiFusion* maintains linear complexity with respect to the number of input tokens, ensuring this multimodal framework operates with efficiency comparable to unimodal networks. Comprehensive evaluations across multimodal image-to-image translation, $3$D object detection and arbitrary-modal semantic segmentation tasks, including RGB, depth, LiDAR, event data, etc.,
demonstrate the superior performance of our *GeminiFusion* against leading-edge techniques. The PyTorch code is available [here](https://github.com/JiaDingCN/GeminiFusion).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ding Jia;Jianyuan Guo;Kai Han;Han Wu;Chao Zhang;Chang Xu;Xinghao Chen", "authorids": "~Ding_Jia1;~Jianyuan_Guo1;~Kai_Han2;~Han_Wu4;~Chao_Zhang10;~Chang_Xu4;~Xinghao_Chen1", "gender": "M;M;M;F;M;M;", "homepage": "https://github.com/JiaDingCN;https://ggjy.github.io/;https://iamhankai.github.io;https://www.sydney.edu.au/business/about/our-people/research-students/han-wu-293.html;http://www.cis.pku.edu.cn/faculty/vision/zhangchao/zhangchao.htm;;https://sydney.edu.au/engineering/about/our-people/academic-staff/c-xu.html", "dblp": "296/4147;190/0258;51/4757-2;13/1864;94/3019-1;30/4937-1;97/2966-2", "google_scholar": ";https://scholar.google.com/citations?hl=en;vThoBVcAAAAJ;;NeCCx-kAAAAJ;tuGWUVIAAAAJ;N4F_3eoAAAAJ", "orcid": ";;0000-0002-9761-2702;0000-0002-3750-0696;;0000-0002-2102-8235;0000-0002-4756-0609", "linkedin": ";;;;;;", "or_profile": "~Ding_Jia1;~Jianyuan_Guo1;~Kai_Han2;~Han_Wu4;~Chao_Zhang10;~Xinghao_Chen1;~Charles_Xu1", "aff": "Peking University;University of Sydney;Huawei Noah's Ark Lab;University of Sydney;Peking University;Huawei Noah's Ark Lab;University of Sydney", "aff_domain": "pku.edu.cn;usyd.edu.au;huawei.com;sydney.edu.au;pku.edu.cn;huawei.com;sydney.edu.au", "position": "MS student;PhD student;Principal Researcher;PhD student;Full Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\njia2024geminifusion,\ntitle={GeminiFusion: Efficient Pixel-wise Multimodal Fusion for Vision Transformer},\nauthor={Ding Jia and Jianyuan Guo and Kai Han and Han Wu and Chao Zhang and Chang Xu and Xinghao Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zsz9Pdfvtg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2414694, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6267963220413521724&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;usyd.edu.au;huawei.com;sydney.edu.au;pku.edu.cn;huawei.com;sydney.edu.au", "author_num": 7, "aff_unique_index": "0;1;2;1;0;2;1", "aff_unique_norm": "Peking University;University of Sydney;Huawei", "aff_unique_dep": ";;Noah's Ark Lab", "aff_unique_url": "http://www.pku.edu.cn;https://www.sydney.edu.au;https://www.huawei.com", "aff_unique_abbr": "Peking U;USYD;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;0;1", "aff_country_unique": "China;Australia" }, { "title": "Minimally Modifying a Markov Game to Achieve Any Nash Equilibrium and Value", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33719", "id": "ZtMqsSkIHX", "proceeding": "https://proceedings.mlr.press/v235/wu24t.html", "pdf": "https://openreview.net/pdf?id=ZtMqsSkIHX", "openreview": "https://openreview.net/forum?id=ZtMqsSkIHX", "author_site": "Young Wu, Jeremy McMahan, Yiding Chen, Yudong Chen, Jerry Zhu, Qiaomin Xie", "tldr": "", "abstract": "We study the game modification problem, where a benevolent game designer or a malevolent adversary modifies the reward function of a zero-sum Markov game so that a target deterministic or stochastic policy profile becomes the unique Markov perfect Nash equilibrium and has a value within a
target range, in a way that minimizes the modification cost. We characterize the set of policy profiles that can be installed as the unique equilibrium of a game and establish necessary and sufficient conditions for successful installation. We propose an efficient algorithm that solves a convex optimization problem with linear constraints and then performs random perturbation to obtain a modification plan with a near-optimal cost.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Young Wu;Jeremy McMahan;Yiding Chen;Yudong Chen;Jerry Zhu;Qiaomin Xie", "authorids": "~Young_Wu1;~Jeremy_McMahan1;~Yiding_Chen1;~Yudong_Chen1;~Jerry_Zhu1;~Qiaomin_Xie1", "gender": "M;M;M;M;F;M", "homepage": "http://pages.cs.wisc.edu/~yw;http://jeremymmcmahan.com;https://chenyd.github.io;https://pages.cs.wisc.edu/~yudongchen/;https://qiaominxie.github.io/;http://pages.cs.wisc.edu/~jerryzhu/", "dblp": "304/7498;299/1330;;15/1975-1;37/10269;z/XiaojinZhu", "google_scholar": "M5F2EGUAAAAJ;Ujya6FIAAAAJ;AtMBDPUAAAAJ;ze5rCdwAAAAJ;RVNcy4EAAAAJ;https://scholar.google.com.tw/citations?user=hqTu-QcAAAAJ", "orcid": ";;;0000-0002-6416-5635;;", "linkedin": "young-wu-b5363936/;;;;;", "or_profile": "~Young_Wu1;~Jeremy_McMahan1;~Yiding_Chen1;~Yudong_Chen1;~Qiaomin_Xie1;~Xiaojin_Zhu1", "aff": "University of Wisconsin - Madison;University of Wisconsin - Madison;Cornell University;Department of Computer Sciences, University of Wisconsin - Madison;University of Wisconsin - Madison;University of Wisconsin, Madison", "aff_domain": "wisc.edu;wisc.edu;cornell.edu;cs.wisc.edu;wisc.edu;wisc.edu", "position": "Lecturer;PhD student;Postdoc;Associate Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwu2024minimally,\ntitle={Minimally Modifying a Markov Game to Achieve Any Nash Equilibrium and Value},\nauthor={Young Wu and Jeremy McMahan and Yiding Chen and Yudong Chen and Jerry Zhu and Qiaomin Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZtMqsSkIHX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 894123, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12613123908401025041&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "wisc.edu;wisc.edu;cornell.edu;cs.wisc.edu;wisc.edu;wisc.edu", "author_num": 6, "aff_unique_index": "0;0;1;0;0;2", "aff_unique_norm": "University of Wisconsin-Madison;Cornell University;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.cornell.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;Cornell;UW", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "SeMOPO: Learning High-quality Model and Policy from Low-quality Offline Visual Datasets", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33718", "id": "ZtOXZCTgBa", "proceeding": "https://proceedings.mlr.press/v235/wan24b.html", "pdf": "https://openreview.net/pdf?id=ZtOXZCTgBa", "openreview": "https://openreview.net/forum?id=ZtOXZCTgBa", "author_site": "Shenghua Wan, Ziyuan Chen, Le Gan, Shuai Feng, De-Chuan Zhan", "tldr": "", "abstract": "Model-based offline reinforcement learning (RL) is a promising approach that leverages existing data effectively in many real-world applications, especially those
involving high-dimensional inputs like images and videos. To alleviate the distribution shift issue in offline RL, existing model-based methods heavily rely on the uncertainty of learned dynamics. However, the model uncertainty estimation becomes significantly biased when observations contain complex distractors with non-trivial dynamics. To address this challenge, we propose a new approach, *Separated Model-based Offline Policy Optimization* (SeMOPO), which decomposes latent states into endogenous and exogenous parts via conservative sampling and estimates model uncertainty on the endogenous states only. We provide a theoretical guarantee on the model uncertainty and a performance bound for SeMOPO. To assess the efficacy, we construct the Low-Quality Vision Deep Data-Driven Datasets for RL (LQV-D4RL), where the data are collected by a non-expert policy and the observations include moving distractors. Experimental results show that our method substantially outperforms all baseline methods, and further analytical experiments validate the critical designs in our method. The project website is https://sites.google.com/view/semopo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shenghua Wan;Ziyuan Chen;Le Gan;Shuai Feng;De-Chuan Zhan", "authorids": "~Shenghua_Wan1;~Ziyuan_Chen2;~Le_Gan1;~Shuai_Feng3;~De-Chuan_Zhan1", "gender": ";M;M;;M", "homepage": ";;;;http://www.lamda.nju.edu.cn/zhandc/", "dblp": ";;199/0588.html;;74/498", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;cCD5SDoAAAAJ;;mYJf4TcAAAAJ", "orcid": ";;0000-0002-8260-6932;;0000-0002-3533-2078", "linkedin": ";;;;", "or_profile": "~Shenghua_Wan1;~Ziyuan_Chen2;~Le_Gan1;~Shuai_Feng3;~De-Chuan_Zhan1", "aff": ";Peking University;Nanjing University;;Nanjing University", "aff_domain": ";pku.edu.cn;nju.edu.cn;;nju.edu.cn", "position": ";PhD student;Researcher;;Full Professor", "bibtex": "@inproceedings{\nwan2024semopo,\ntitle={Se{MOPO}: Learning High-quality Model and Policy from Low-quality Offline Visual Datasets},\nauthor={Shenghua Wan and Ziyuan Chen and Le Gan and Shuai Feng and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZtOXZCTgBa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jVTvkrpsiG8J:scholar.google.com/&scioq=SeMOPO:+Learning+High-quality+Model+and+Policy+from+Low-quality+Offline+Visual+Datasets&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": ";pku.edu.cn;nju.edu.cn;;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Peking University;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.nju.edu.cn", "aff_unique_abbr": "Peking U;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "The Stronger the Diffusion Model, the Easier the Backdoor: Data Poisoning to Induce Copyright Breaches Without Adjusting Finetuning Pipeline", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33717", "id": "ZvFLbEPv6x", "proceeding": "https://proceedings.mlr.press/v235/wang24bm.html", "pdf": "https://openreview.net/pdf?id=ZvFLbEPv6x", "openreview": "https://openreview.net/forum?id=ZvFLbEPv6x", "author_site": "Haonan Wang, Qianli Shen, Yao Tong, Yang
Zhang, Kenji Kawaguchi", "tldr": "", "abstract": "The commercialization of text-to-image diffusion models (DMs) brings forth potential copyright concerns. Despite numerous attempts to protect DMs from copyright issues, the vulnerabilities of these solutions are underexplored. In this study, we formalized the Copyright Infringement Attack on generative AI models and proposed a backdoor attack method, SilentBadDiffusion, to induce copyright infringement without requiring access to or control over training processes. Our method strategically embeds connections between pieces of copyrighted information and text references in poisoning data while carefully dispersing that information, making the poisoning data inconspicuous when integrated into a clean dataset. Our experiments show the stealth and efficacy of the poisoning data. When given specific text prompts, DMs trained with a poisoning ratio of 0.20% can produce copyrighted images. Additionally, the results reveal that the more sophisticated the DMs are, the easier the success of the attack becomes. These findings underline potential pitfalls in the prevailing copyright protection strategies and underscore the necessity for increased scrutiny to prevent the misuse of DMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haonan Wang;Qianli Shen;Yao Tong;Yang Zhang;Kenji Kawaguchi", "authorids": "~Haonan_Wang1;~Qianli_Shen1;~Yao_Tong2;~Yang_Zhang22;~Kenji_Kawaguchi1", "gender": "M;M;;M;", "homepage": "http://charles-haonan-wang.me/;https://shenqianli.github.io/;;https://github.com/YaNgZhAnG-V5;https://ml.comp.nus.edu.sg/#members", "dblp": ";22/10357.html;;;", "google_scholar": "cLziVZMAAAAJ;p3ekN2kAAAAJ;;Ri0ENAUAAAAJ;aLl3rYoAAAAJ", "orcid": "0009-0006-6963-8987;;0009-0001-8256-8059;;", "linkedin": ";;;;", "or_profile": "~Haonan_Wang1;~Qianli_Shen1;~Yao_Tong2;~Yang_Zhang22;~Kenji_Kawaguchi1", "aff": "National University of Singapore;National University of Singapore;School of Computing, National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "u.nus.edu;u.nus.edu;u.nus.edu;nus.edu;nus.edu", "position": "PhD student;PhD student;PhD student;PhD student;Presidential Young Professor", "bibtex": "@inproceedings{\nwang2024the,\ntitle={The Stronger the Diffusion Model, the Easier the Backdoor: Data Poisoning to Induce Copyright BreachesWithout Adjusting Finetuning Pipeline},\nauthor={Haonan Wang and Qianli Shen and Yao Tong and Yang Zhang and Kenji Kawaguchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZvFLbEPv6x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8981263, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10176910307287530539&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "u.nus.edu;u.nus.edu;u.nus.edu;nus.edu;nus.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Singapore", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Zero-Shot ECG Classification with Multimodal Learning and Test-time Clinical Knowledge Enhancement", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33716", "id": "ZvJ2lQQKjz", "proceeding": "https://proceedings.mlr.press/v235/liu24bg.html", "pdf": "https://openreview.net/pdf?id=ZvJ2lQQKjz", "openreview": "https://openreview.net/forum?id=ZvJ2lQQKjz", "author_site": "che liu, Zhongwei Wan, Cheng Ouyang, Anand Shah, Wenjia Bai, Rossella Arcucci", "tldr": "", "abstract": "Electrocardiograms (ECGs) are non-invasive diagnostic tools crucial for detecting cardiac arrhythmic diseases in clinical practice. While ECG Self-supervised Learning (eSSL) methods show promise in representation learning from unannotated ECG data, they often overlook the clinical knowledge that can be found in reports. This oversight and the requirement for annotated samples for downstream tasks limit eSSL's versatility. In this work, we address these issues with the **M**ultimodal **E**CG **R**epresentation **L**earning (**MERL**) framework. Through multimodal learning on ECG records and associated reports, MERL is capable of performing zero-shot ECG classification with text prompts, eliminating the need for training data in downstream tasks. At test time, we propose the **C**linical **K**nowledge **E**nhanced **P**rompt **E**ngineering (**CKEPE**) approach, which uses Large Language Models (LLMs) to exploit external expert-verified clinical knowledge databases, generating more descriptive prompts and reducing hallucinations in LLM-generated content to boost zero-shot classification. Based on MERL, we perform the first benchmark across six public ECG datasets, showing the superior performance of MERL compared against eSSL methods. Notably, MERL achieves an average AUC score of 75.2% in zero-shot classification (**without training data**), 3.2% higher than linear probed eSSL methods with 10% annotated training data, averaged across all six datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Che Liu;Zhongwei Wan;Cheng Ouyang;Anand Shah;Wenjia Bai;Rossella Arcucci", "authorids": "~Che_Liu3;~Zhongwei_Wan1;~Cheng_Ouyang2;~Anand_Shah1;~Wenjia_Bai1;~Rossella_Arcucci1", "gender": ";M;;M;M;F", "homepage": ";https://people.engineering.osu.edu/people/wan.512;;https://www.bing.com/ck/a?!&&p=ebd74ca33ca49e50JmltdHM9MTY2NzUyMDAwMCZpZ3VpZD0wNTY1MTYwZS0wOTFiLTZkOTMtMWIxMi0xOTM1MDg1YjZjNDAmaW5zaWQ9NTE3NQ&ptn=3&hsh=3&fclid=0565160e-091b-6d93-1b12-1935085b6c40&psq=imperial+URL+anand+shah&u=a1aHR0cHM6Ly93d3cuaW1wZXJpYWwuYWMudWsvcGVvcGxlL3MuYW5hbmQ&ntb=1;http://www.doc.ic.ac.uk/~wbai;https://www.imperial.ac.uk/people/r.arcucci", "dblp": ";260/6958.html;;;20/5519;130/5772", "google_scholar": ";https://scholar.google.com/citations?hl=en;;;https://scholar.google.co.uk/citations?user=IA1QFM4AAAAJ;oxy2ZQoAAAAJ", "orcid": ";;;;;0000-0002-9471-0585", "linkedin": ";;;;;https://www.linkedin.com/public-profile/settings?trk=d_flagship3_profile_self_view_public_profile", "or_profile": "~Che_Liu3;~Zhongwei_Wan1;~Cheng_Ouyang2;~Anand_Shah1;~Wenjia_Bai1;~Rossella_Arcucci1", "aff": ";Ohio State University, Columbus;;Imperial College London;Imperial College London;Imperial College London ", "aff_domain": ";osu.edu;;ic.ac.uk;imperial.ac.uk;imperial.ac.uk", "position": ";PhD student;;Lecturer;Associate Professor;Senior Lecturer", "bibtex": "@inproceedings{\nliu2024zeroshot,\ntitle={Zero-Shot {ECG} Classification with Multimodal Learning and Test-time Clinical Knowledge Enhancement},\nauthor={Che Liu and Zhongwei Wan and Cheng Ouyang and Anand Shah and Wenjia Bai and Rossella Arcucci},\nbooktitle={Forty-first International Conference on 
Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZvJ2lQQKjz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7718078, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9331322248925166857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";osu.edu;;ic.ac.uk;imperial.ac.uk;imperial.ac.uk", "author_num": 6, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Ohio State University;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.osu.edu;https://www.imperial.ac.uk", "aff_unique_abbr": "OSU;ICL", "aff_campus_unique_index": "0", "aff_campus_unique": "Columbus;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Sarah Frank-Wolfe: Methods for Constrained Optimization with Best Rates and Practical Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33715", "id": "Zw52bJCZXc", "proceeding": "https://proceedings.mlr.press/v235/beznosikov24a.html", "pdf": "https://openreview.net/pdf?id=Zw52bJCZXc", "openreview": "https://openreview.net/forum?id=Zw52bJCZXc", "author_site": "Aleksandr Beznosikov, David Dobre, Gauthier Gidel", "tldr": "", "abstract": "The Frank-Wolfe (FW) method is a popular approach for solving optimization problems with structured constraints that arise in machine learning applications. In recent years, stochastic versions of FW have gained popularity, motivated by large datasets for which the computation of the full gradient is prohibitively expensive. In this paper, we present two new variants of the FW algorithms for stochastic finite-sum minimization. Our algorithms have the best convergence guarantees of existing stochastic FW approaches for both convex and non-convex objective functions. Our methods do not have the issue of permanently collecting large batches, which is common to many stochastic projection-free approaches. Moreover, our second approach does not require either large batches or full deterministic gradients, which is a typical weakness of many techniques for finite-sum problems. 
The faster theoretical rates of our approaches are confirmed experimentally.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aleksandr Beznosikov;David Dobre;Gauthier Gidel", "authorids": "~Aleksandr_Beznosikov1;~David_Dobre1;~Gauthier_Gidel1", "gender": ";M;M", "homepage": ";;https://gauthiergidel.github.io/", "dblp": ";;188/6326", "google_scholar": ";;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ", "orcid": ";;", "linkedin": ";daviddobre/;", "or_profile": "~Aleksandr_Beznosikov1;~David_Dobre1;~Gauthier_Gidel1", "aff": ";Mila - Quebec Artificial Intelligence Institute;Mila - Quebec Artificial Intelligence Institute", "aff_domain": ";mila.quebec;mila.quebec", "position": ";PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbeznosikov2024sarah,\ntitle={Sarah Frank-Wolfe: Methods for Constrained Optimization with Best Rates and Practical Features},\nauthor={Aleksandr Beznosikov and David Dobre and Gauthier Gidel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zw52bJCZXc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2299514, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11651038026540023216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";mila.quebec;mila.quebec", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Quebec Artificial Intelligence Institute", "aff_unique_dep": "Artificial Intelligence", "aff_unique_url": "https://mila.quebec", "aff_unique_abbr": "Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Minimum-Norm Interpolation Under Covariate Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33714", "id": "Zw7TcnTmHj", "proceeding": "https://proceedings.mlr.press/v235/mallinar24a.html", "pdf": "https://openreview.net/pdf?id=Zw7TcnTmHj", "openreview": "https://openreview.net/forum?id=Zw7TcnTmHj", "author_site": "Neil Mallinar, Austin Zane, Spencer Frei, Bin Yu", "tldr": "", "abstract": "Transfer learning is a critical part of real-world machine learning deployments and has been extensively studied in experimental works with overparameterized neural networks. However, even in the simplest setting of linear regression a notable gap still exists in the theoretical understanding of transfer learning. In-distribution research on high-dimensional linear regression has led to the identification of a phenomenon known as *benign overfitting*, in which linear interpolators overfit to noisy training labels and yet still generalize well. This behavior occurs under specific conditions on the source covariance matrix and input data dimension. Therefore, it is natural to wonder how such high-dimensional linear models behave under transfer learning. We prove the first non-asymptotic excess risk bounds for benignly-overfit linear interpolators in the transfer learning setting. From our analysis, we propose a taxonomy of *beneficial* and *malignant* covariate shifts based on the degree of overparameterization. 
We follow our analysis with empirical studies that show these beneficial and malignant covariate shifts for linear interpolators on real image data, and for fully-connected neural networks in settings where the input data dimension is larger than the training sample size.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Neil Rohit Mallinar;Austin Zane;Spencer Frei;Bin Yu", "authorids": "~Neil_Rohit_Mallinar1;~Austin_Zane1;~Spencer_Frei1;~Bin_Yu5", "gender": "M;;M;M", "homepage": "https://mallinar.xyz;;http://spencerfrei.github.io/;https://binyu.stat.berkeley.edu", "dblp": "213/8278;;250/2714;27/116", "google_scholar": "6ogHsLsAAAAJ;;c7N8SoEAAAAJ;https://scholar.google.com.hk/citations?user=z1iJa3UAAAAJ", "orcid": ";;;0000-0003-3097-1433", "linkedin": ";;;bin-yu-b665063/", "or_profile": "~Neil_Rohit_Mallinar1;~Austin_Zane1;~Spencer_Frei1;~Bin_Yu5", "aff": "University of California, San Diego;;University of California, Davis;University of California, Berkeley", "aff_domain": "ucsd.edu;;ucdavis.edu;berkeley.edu", "position": "PhD student;;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nmallinar2024minimumnorm,\ntitle={Minimum-Norm Interpolation Under Covariate Shift},\nauthor={Neil Rohit Mallinar and Austin Zane and Spencer Frei and Bin Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Zw7TcnTmHj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1080123, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14927355944817645854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ucsd.edu;;ucdavis.edu;berkeley.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, San Diego;University of California, Davis;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsd.edu;https://www.ucdavis.edu;https://www.berkeley.edu", "aff_unique_abbr": "UCSD;UC Davis;UC Berkeley", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "San Diego;Davis;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: AI-Powered Autonomous Weapons Risk Geopolitical Instability and Threaten AI Research", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33713", "id": "ZwUThOE7Zc", "proceeding": "https://proceedings.mlr.press/v235/simmons-edler24a.html", "pdf": "https://openreview.net/pdf?id=ZwUThOE7Zc", "openreview": "https://openreview.net/forum?id=ZwUThOE7Zc", "author_site": "Riley Simmons-Edler, Ryan Badman, Shayne Longpre, Kanaka Rajan", "tldr": "", "abstract": "The recent embrace of machine learning (ML) in the development of autonomous weapons systems (AWS) creates serious risks to geopolitical stability and the free exchange of ideas in AI research. This topic has received comparatively little attention of late compared to risks stemming from superintelligent artificial general intelligence (AGI), but requires fewer assumptions about the course of technological development and is thus a nearer-future issue. ML is already enabling the substitution of AWS for human soldiers in many battlefield roles, reducing the upfront human cost, and thus political cost, of waging offensive war. 
In the case of peer adversaries, this increases the likelihood of \"low intensity\" conflicts which risk escalation to broader warfare. In the case of non-peer adversaries, it reduces the domestic blowback to wars of aggression. This effect can occur regardless of other ethical issues around the use of military AI such as the risk of civilian casualties, and does not require any superhuman AI capabilities. Further, the military value of AWS raises the specter of an AI-powered arms race and the misguided imposition of national security restrictions on AI research. Our goal in this paper is to raise awareness among the public and ML researchers on the near-future risks posed by full or near-full autonomy in military technology, and we provide regulatory suggestions to mitigate these risks. We call upon AI policy experts and the defense AI community in particular to embrace transparency and caution in their development and deployment of AWS to avoid the negative effects on global stability and AI research that we highlight here.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Riley Simmons-Edler;Ryan Paul Badman;Shayne Longpre;Kanaka Rajan", "authorids": "~Riley_Simmons-Edler1;~Ryan_Paul_Badman1;~Shayne_Longpre1;~Kanaka_Rajan1", "gender": "M;M;M;F", "homepage": "https://rileyse.org/;;https://www.shaynelongpre.com;https://www.rajanlab.com/", "dblp": "222/1920;;190/7024;94/10452", "google_scholar": "https://scholar.google.com/citations?hl=en;Q2esw9MAAAAJ;ADd_YfkAAAAJ;IC6n33kAAAAJ", "orcid": ";0000-0001-8819-1144;;0000-0003-2749-2917", "linkedin": ";ryan-badman-60483939/;shayne-redford-longpre/;rajankdr", "or_profile": "~Riley_Simmons-Edler1;~Ryan_Paul_Badman1;~Shayne_Longpre1;~Kanaka_Rajan1", "aff": "Harvard University;Harvard Medical School;Massachusetts Institute of Technology;Icahn School of Medicine at Mount Sinai", "aff_domain": "harvard.edu;hms.harvard.edu;mit.edu;mssm.edu", "position": "Postdoc;Postdoc;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsimmons-edler2024position,\ntitle={Position: {AI}-Powered Autonomous Weapons Risk Geopolitical Instability and Threaten {AI} Research},\nauthor={Riley Simmons-Edler and Ryan Paul Badman and Shayne Longpre and Kanaka Rajan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZwUThOE7Zc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 482327, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8790334811619814832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "harvard.edu;hms.harvard.edu;mit.edu;mssm.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Harvard University;Massachusetts Institute of Technology;Icahn School of Medicine at Mount Sinai", "aff_unique_dep": ";;School of Medicine", "aff_unique_url": "https://www.harvard.edu;https://web.mit.edu;https://icahn.mssm.edu", "aff_unique_abbr": "Harvard;MIT;ISMMS", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Boston;New York", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "AD3: Implicit Action is the Key for World Models to Distinguish the Diverse Visual Distractors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33712", "id": "ZwrfsrCduj", "proceeding": "https://proceedings.mlr.press/v235/wang24bq.html", "pdf": 
"https://openreview.net/pdf?id=ZwrfsrCduj", "openreview": "https://openreview.net/forum?id=ZwrfsrCduj", "author_site": "Yucen Wang, Shenghua Wan, Le Gan, Shuai Feng, De-Chuan Zhan", "tldr": "", "abstract": "Model-based methods have significantly contributed to distinguishing task-irrelevant distractors for visual control. However, prior research has primarily focused on heterogeneous distractors like noisy background videos, leaving homogeneous distractors that closely resemble controllable agents largely unexplored, which poses significant challenges to existing methods. To tackle this problem, we propose Implicit Action Generator (IAG) to learn the implicit actions of visual distractors, and present a new algorithm named implicit Action-informed Diverse visual Distractors Distinguisher (AD3), that leverages the action inferred by IAG to train separated world models. Implicit actions effectively capture the behavior of background distractors, aiding in distinguishing the task-irrelevant components, and the agent can optimize the policy within the task-relevant state space. Our method achieves superior performance on various visual control tasks featuring both heterogeneous and homogeneous distractors. The indispensable role of implicit actions learned by IAG is also empirically validated.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yucen Wang;Shenghua Wan;Le Gan;Shuai Feng;De-Chuan Zhan", "authorids": "~Yucen_Wang1;~Shenghua_Wan1;~Le_Gan1;~Shuai_Feng3;~De-Chuan_Zhan1", "gender": "M;;M;;M", "homepage": "http://www.lamda.nju.edu.cn/wangyc/;;;;http://www.lamda.nju.edu.cn/zhandc/", "dblp": "349/7802.html;;199/0588.html;;74/498", "google_scholar": "https://scholar.google.com.hk/citations?user=W3DITm4AAAAJ;;cCD5SDoAAAAJ;;mYJf4TcAAAAJ", "orcid": ";;0000-0002-8260-6932;;0000-0002-3533-2078", "linkedin": ";;;;", "or_profile": "~Yucen_Wang1;~Shenghua_Wan1;~Le_Gan1;~Shuai_Feng3;~De-Chuan_Zhan1", "aff": "Nanjing University;;Nanjing University;;Nanjing University", "aff_domain": "nju.edu.cn;;nju.edu.cn;;nju.edu.cn", "position": "PhD student;;Researcher;;Full Professor", "bibtex": "@inproceedings{\nwang2024ad,\ntitle={{AD}3: Implicit Action is the Key for World Models to Distinguish the Diverse Visual Distractors},\nauthor={Yucen Wang and Shenghua Wan and Le Gan and Shuai Feng and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZwrfsrCduj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7553465, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2858800275767371827&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "nju.edu.cn;;nju.edu.cn;;nju.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Federated Self-Explaining GNNs with Anti-shortcut Augmentations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33711", "id": "ZxDqSBgFSM", "proceeding": "https://proceedings.mlr.press/v235/yue24b.html", "pdf": "https://openreview.net/pdf?id=ZxDqSBgFSM", "openreview": "https://openreview.net/forum?id=ZxDqSBgFSM", "author_site": "Linan Yue, Qi Liu, Weibo 
Gao, Ye Liu, Kai Zhang, Yichao Du, Li Wang, Fangzhou Yao", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have demonstrated remarkable performance in graph classification tasks. However, ensuring the explainability of their predictions remains a challenge. To address this, graph rationalization methods have been introduced to generate concise subsets of the original graph, known as rationales, which serve to explain the predictions made by GNNs. Existing rationalization methods often rely on shortcuts in the data for both prediction and rationale composition. In response, de-shortcut rationalization methods have been proposed, which commonly leverage counterfactual augmentation to enhance data diversity for mitigating the shortcut problem. Nevertheless, these methods have predominantly focused on centralized datasets and have not been extensively explored in Federated Learning (FL) scenarios. To this end, in this paper, we propose Federated Graph Rationalization (FedGR) with anti-shortcut augmentations to achieve self-explaining GNNs, which involves two data augmenters. These augmenters are employed to produce client-specific shortcut-conflicted samples at each client, which helps mitigate the shortcut problem under FL scenarios. Experiments on real-world benchmarks and synthetic datasets validate the effectiveness of FedGR under FL scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Linan Yue;Qi Liu;Weibo Gao;Ye Liu;Kai Zhang;Yichao Du;Li Wang;Fangzhou Yao", "authorids": "~Linan_Yue1;~Qi_Liu3;~Weibo_Gao1;~Ye_Liu10;~Kai_Zhang12;~Yichao_Du1;~Li_Wang18;~Fangzhou_Yao1", "gender": "M;M;;M;M;;F;", "homepage": "https://yuelinan.github.io/;http://staff.ustc.edu.cn/~qiliuql/;;https://liuyeah.github.io;http://home.ustc.edu.cn/~sa517494/;;;", "dblp": "297/1080;95/2446-3;;96/2615-11.html;55/957-38;;;", "google_scholar": "https://scholar.google.com.hk/citations?user=XDaNgG4AAAAJ;5EoHAFwAAAAJ;;HYYR4f8AAAAJ;t6IIpAUAAAAJ;;poE7k1wAAAAJ;", "orcid": "0000-0002-5980-6098;0000-0001-6956-5550;;;0000-0001-5335-2470;;;", "linkedin": ";;;;;;;", "or_profile": "~Linan_Yue1;~Qi_Liu3;~Weibo_Gao1;~Ye_Liu10;~Kai_Zhang12;~Yichao_Du1;~Li_Wang18;~Fangzhou_Yao1", "aff": "University of Science and Technology of China;University of Science and Technology of China;;University of Science and Technology of China;University of Science and Technology of China;;;", "aff_domain": "ustc.edu.cn;ustc.edu.cn;;ustc.edu;ustc.edu.cn;;;", "position": "PhD student;Full Professor;;PhD student;Researcher;;;", "bibtex": "@inproceedings{\nyue2024federated,\ntitle={Federated Self-Explaining {GNN}s with Anti-shortcut Augmentations},\nauthor={Linan Yue and Qi Liu and Weibo Gao and Ye Liu and Kai Zhang and Yichao Du and Li Wang and Fangzhou Yao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZxDqSBgFSM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1280162, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ycqIhuNCX-cJ:scholar.google.com/&scioq=Federated+Self-Explaining+GNNs+with+Anti-shortcut+Augmentations&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "ustc.edu.cn;ustc.edu.cn;;ustc.edu;ustc.edu.cn;;;", "author_num": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ustc.edu.cn",
"aff_unique_abbr": "USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Estimating Canopy Height at Scale", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33710", "id": "ZzCY0fRver", "proceeding": "https://proceedings.mlr.press/v235/pauls24a.html", "pdf": "https://openreview.net/pdf?id=ZzCY0fRver", "openreview": "https://openreview.net/forum?id=ZzCY0fRver", "author_site": "Jan Pauls, Max Zimmer, Una Kelly, Martin Schwartz, Sassan Saatchi, Philippe CIAIS, Sebastian Pokutta, Martin Brandt, Fabian Gieseke", "tldr": "", "abstract": "We propose a framework for global-scale canopy height estimation based on satellite data. Our model leverages advanced data preprocessing techniques, resorts to a novel loss function designed to counter geolocation inaccuracies inherent in the ground-truth height measurements, and employs data from the Shuttle Radar Topography Mission to effectively filter out erroneous labels in mountainous regions, enhancing the reliability of our predictions in those areas. A comparison between predictions and ground-truth labels yields an MAE/RMSE of 2.43 / 4.73 (meters) overall and 4.45 / 6.72 (meters) for trees taller than five meters, which depicts a substantial improvement compared to existing global-scale products. The resulting height map as well as the underlying framework will facilitate and enhance ecological analyses at a global scale, including, but not limited to, large-scale forest and biomass monitoring.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan Pauls;Max Zimmer;Una M. Kelly;Martin Schwartz;Sassan Saatchi;Philippe CIAIS;Sebastian Pokutta;Martin Brandt;Fabian Gieseke", "authorids": "~Jan_Pauls1;~Max_Zimmer1;~Una_M._Kelly1;~Martin_Schwartz1;~Sassan_Saatchi1;~Philippe_CIAIS1;~Sebastian_Pokutta1;~Martin_Brandt1;~Fabian_Gieseke1", "gender": "M;;;M;;M;M;M;M", "homepage": "https://www.wi.uni-muenster.de/de/institut/dasc/personen/jan-pauls;;https://www.wi.uni-muenster.de/de/institut/dasc/personen/una-kelly;;http://jpl.nasa.gov;https://www.lsce.ipsl.fr/Phocea/Pisp/index.php?nom=philippe.ciais;http://www.pokutta.com;;https://www.wi.uni-muenster.de/department/dasc/people/fabian-gieseke/publications", "dblp": ";;;;;;75/7718;;68/7056", "google_scholar": ";;;https://scholar.google.fr/citations?user=myVMGXUAAAAJ;xpQib9MAAAAJ;;;;https://scholar.google.de/citations?user=g3ZiieoAAAAJ", "orcid": ";;0000-0001-8813-8028;0000-0003-4038-9068;;0000-0001-8560-4943;;0000-0001-9531-1239;0000-0001-7093-5803", "linkedin": ";;una-kelly-ab159911b/?originalSubdomain=de;martin-s-910466136/;;philippe-ciais-82510032;;;", "or_profile": "~Jan_Pauls1;~Max_Zimmer1;~Una_M._Kelly1;~Martin_Schwartz1;~Sassan_Saatchi1;~Philippe_CIAIS1;~Sebastian_Pokutta1;~Martin_Brandt1;~Fabian_Gieseke1", "aff": "University of M\u00fcnster;;University of Twente;LSCE;Jet Propulsion Laboratory;CEA;TU Berlin;Copenhagen University;University of Copenhagen, Department of Computer Science", "aff_domain": "uni-muenster.de;;utwente.nl;lsce.ipsl.fr;jpl.nasa.gov;cea.fr;tu-berlin.de;ku.dk;diku.dk", "position": "PhD student;;PhD student;Postdoc;Researcher;Principal Researcher;Full Professor;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\npauls2024estimating,\ntitle={Estimating Canopy Height at Scale},\nauthor={Jan Pauls and Max Zimmer and Una M. 
Kelly and Martin Schwartz and Sassan Saatchi and Philippe CIAIS and Sebastian Pokutta and Martin Brandt and Fabian Gieseke},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZzCY0fRver}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8090378, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7034734718946523203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "uni-muenster.de;;utwente.nl;lsce.ipsl.fr;jpl.nasa.gov;cea.fr;tu-berlin.de;ku.dk;diku.dk", "author_num": 9, "aff_unique_index": "0;1;2;3;4;5;6;6", "aff_unique_norm": "University of M\u00fcnster;University of Twente;Laboratoire des Sciences du Climat et de l'Environnement;Jet Propulsion Laboratory;Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives;Technische Universit\u00e4t Berlin;University of Copenhagen", "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://www.uni-muenster.de;https://www.utwente.nl;https://www.lsce.ipsl.fr;https://www.jpl.nasa.gov;https://www.cea.fr;https://www.tu-berlin.de;https://www.ku.dk", "aff_unique_abbr": "UM;UT;LSCE;JPL;CEA;TU Berlin;UCPH", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Pasadena;Berlin", "aff_country_unique_index": "0;1;2;3;2;0;4;4", "aff_country_unique": "Germany;Netherlands;France;United States;Denmark" }, { "title": "MC-GTA: Metric-Constrained Model-Based Clustering using Goodness-of-fit Tests with Autocorrelations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33709", "id": "ZzFTrzo0Cp", "proceeding": "https://proceedings.mlr.press/v235/wang24av.html", "pdf": "https://openreview.net/pdf?id=ZzFTrzo0Cp", "openreview": "https://openreview.net/forum?id=ZzFTrzo0Cp", "author_site": "Zhangyu Wang, Gengchen Mai, Krzysztof Janowicz, Ni Lao", "tldr": "", "abstract": "A wide range of (multivariate) temporal (1D) and spatial (2D) data analysis tasks, such as grouping vehicle sensor trajectories, can be formulated as clustering with given metric constraints. Existing metric-constrained clustering algorithms overlook the rich correlation between feature similarity and metric distance, i.e., metric autocorrelation. The model-based variations of these clustering algorithms (e.g. TICC and STICC) achieve SOTA performance, yet suffer from computational instability and complexity by using a metric-constrained Expectation-Maximization procedure. In order to address these two problems, we propose a novel clustering algorithm, MC-GTA (**M**odel-based **C**lustering via **G**oodness-of-fit **T**ests with **A**utocorrelations). Its objective is composed only of pairwise weighted sums of feature similarity terms (squared Wasserstein-2 distance) and metric autocorrelation terms (a novel multivariate generalization of the classic semivariogram). We show that MC-GTA is effectively minimizing the total hinge loss for intra-cluster observation pairs not passing goodness-of-fit tests, i.e., statistically not originating from the same distribution. Experiments on 1D/2D synthetic and real-world datasets demonstrate that MC-GTA successfully incorporates metric autocorrelation.
It outperforms strong baselines by large margins (up to 14.3% in ARI and 32.1% in NMI) with faster and more stable optimization (>10x speedup).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhangyu Wang;Gengchen Mai;Krzysztof Janowicz;Ni Lao", "authorids": "~Zhangyu_Wang1;~Gengchen_Mai1;~Krzysztof_Janowicz1;~Ni_Lao1", "gender": ";M;;M", "homepage": ";https://gengchenmai.github.io/;http://geog.ucsb.edu/~jano/;http://www.cs.cmu.edu/~nlao", "dblp": ";151/5583;;82/283", "google_scholar": "8vNk5Z8AAAAJ;X2Wfl1UAAAAJ;6B2Z9vAAAAAJ;iUgWR3MAAAAJ", "orcid": ";0000-0002-7818-7309;;0000-0002-4034-7784", "linkedin": "zhangyu-wang-26aab0170/;gengchen-mai-144439121/;;ni-lao", "or_profile": "~Zhangyu_Wang1;~Gengchen_Mai1;~Krzysztof_Janowicz1;~Ni_Lao1", "aff": "University of California, Santa Barbara;University of Georgia;UC Santa Barbara;Google", "aff_domain": "ucsb.edu;uga.edu;ucsb.edu;google.com", "position": "PhD student;Assistant Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nwang2024mcgta,\ntitle={{MC}-{GTA}: Metric-Constrained Model-Based Clustering using Goodness-of-fit Tests with Autocorrelations},\nauthor={Zhangyu Wang and Gengchen Mai and Krzysztof Janowicz and Ni Lao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZzFTrzo0Cp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2672013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6735402451891708151&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ucsb.edu;uga.edu;ucsb.edu;google.com", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of California, Santa Barbara;University of Georgia;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.ucsb.edu;https://www.uga.edu;https://www.google.com", "aff_unique_abbr": "UCSB;UGA;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Santa Barbara;;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Ditto: Quantization-aware Secure Inference of Transformers upon MPC", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33708", "id": "ZzXNCQGzqT", "proceeding": "https://proceedings.mlr.press/v235/wu24d.html", "pdf": "https://openreview.net/pdf?id=ZzXNCQGzqT", "openreview": "https://openreview.net/forum?id=ZzXNCQGzqT", "author_site": "Haoqi Wu, Wenjing Fang, Yancheng Zheng, Junming Ma, Jin Tan, Lei Wang", "tldr": "", "abstract": "Due to the rising privacy concerns on sensitive client data and trained models like Transformers, secure multi-party computation (MPC) techniques are employed to enable secure inference despite attendant overhead. Existing works attempt to reduce the overhead using more MPC-friendly non-linear function approximations. However, how to integrate quantization, which is widely used in plaintext inference, into the MPC domain remains unclear. To bridge this gap, we propose the framework named Ditto to enable more efficient quantization-aware secure Transformer inference. Concretely, we first incorporate an MPC-friendly quantization into Transformer inference and employ a quantization-aware distillation procedure to maintain the model utility.
Then, we propose novel MPC primitives to support the type conversions that are essential in quantization and implement the quantization-aware MPC execution of secure quantized inference. This approach significantly decreases both computation and communication overhead, leading to improvements in overall efficiency. We conduct extensive experiments on Bert and GPT2 models to evaluate the performance of Ditto. The results demonstrate that Ditto is about $3.14\\sim 4.40\\times$ faster than MPCFormer (ICLR 2023) and $1.44\\sim 2.35\\times$ faster than the state-of-the-art work PUMA with negligible utility degradation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoqi Wu;Wenjing Fang;Yancheng Zheng;Junming Ma;Jin Tan;Lei Wang", "authorids": "~Haoqi_Wu1;~Wenjing_Fang1;~Yancheng_Zheng1;~Junming_Ma1;~Jin_Tan2;~Lei_Wang30", "gender": "M;;M;M;M;M", "homepage": ";;https://www.linkedin.com/in/yancheng-zheng-bab7746a/;;https://github.com/rivertalk;", "dblp": ";;;203/0941;;", "google_scholar": "gCfQJOEAAAAJ;;;;;", "orcid": "0000-0003-0650-5459;;;;;", "linkedin": ";;;;;%E7%A3%8A-%E7%8E%8B-b4994abb/", "or_profile": "~Haoqi_Wu1;~Wenjing_Fang1;~Yancheng_Zheng1;~Junming_Ma1;~Jin_Tan2;~Lei_Wang30", "aff": "Ant Group;;Ant Group;Ant Group;Alibaba Group;Ant Group", "aff_domain": "antgroup.com;;antgroup.com;antgroup.com;antgroup.com;antgroup.com", "position": "Researcher;;Senior Software Engineer;Software engineer;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nwu2024ditto,\ntitle={Ditto: Quantization-aware Secure Inference of Transformers upon {MPC}},\nauthor={Haoqi Wu and Wenjing Fang and Yancheng Zheng and Junming Ma and Jin Tan and Lei Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZzXNCQGzqT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1103924, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7398384900077253212&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "antgroup.com;;antgroup.com;antgroup.com;antgroup.com;antgroup.com", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Ant Group;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.antgroup.com;https://www.alibaba.com", "aff_unique_abbr": "Ant Group;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Diffusion Models Encode the Intrinsic Dimension of Data Manifolds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33707", "id": "a0XiA6v256", "proceeding": "https://proceedings.mlr.press/v235/stanczuk24a.html", "pdf": "https://openreview.net/pdf?id=a0XiA6v256", "openreview": "https://openreview.net/forum?id=a0XiA6v256", "author_site": "Jan Stanczuk, Georgios Batzolis, Teo Deveney, Carola-Bibiane Sch\u00f6nlieb", "tldr": "", "abstract": "In this work, we provide a mathematical proof that diffusion models encode data manifolds by approximating their normal bundles. Based on this observation we propose a novel method for extracting the intrinsic dimension of the data manifold from a trained diffusion model. Our insights are based on the fact that a diffusion model approximates the score function i.e. the gradient of the log density of a noise-corrupted version of the target distribution for varying levels of corruption. 
We prove that as the level of corruption decreases, the score function points towards the manifold, as this direction becomes the direction of maximal likelihood increase. Therefore, at low noise levels, the diffusion model provides us with an approximation of the manifold's normal bundle, allowing for an estimation of the manifold's intrinsic dimension. To the best of our knowledge, our method is the first estimator of intrinsic dimension based on diffusion models, and it outperforms well-established estimators in controlled experiments on both Euclidean and image data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan Pawel Stanczuk;Georgios Batzolis;Teo Deveney;Carola-Bibiane Sch\u00f6nlieb", "authorids": "~Jan_Pawel_Stanczuk1;~Georgios_Batzolis1;~Teo_Deveney1;~Carola-Bibiane_Sch\u00f6nlieb1", "gender": "Not Specified;M;M;F", "homepage": ";;https://researchportal.bath.ac.uk/en/persons/teo-deveney;http://www.damtp.cam.ac.uk/research/cia/", "dblp": "286/8660;287/8984;249/9231;07/8184", "google_scholar": "Auwhh8sAAAAJ;;;nPeOXjwAAAAJ", "orcid": ";;;", "linkedin": ";georgios-batzolis-92577b128/;;", "or_profile": "~Jan_Pawel_Stanczuk1;~Georgios_Batzolis1;~Teo_Deveney1;~Carola-Bibiane_Sch\u00f6nlieb1", "aff": ";University of Cambridge;University of Bath;University of Cambridge", "aff_domain": ";cam.ac.uk;bath.ac.uk;cam.ac.uk", "position": ";PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nstanczuk2024diffusion,\ntitle={Diffusion Models Encode the Intrinsic Dimension of Data Manifolds},\nauthor={Jan Pawel Stanczuk and Georgios Batzolis and Teo Deveney and Carola-Bibiane Sch{\\\"o}nlieb},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a0XiA6v256}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9985159, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1371588246834017438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "email": ";cam.ac.uk;bath.ac.uk;cam.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Cambridge;University of Bath", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.bath.ac.uk", "aff_unique_abbr": "Cambridge;Bath", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "$\\mathtt{VITS}$ : Variational Inference Thompson Sampling for contextual bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33706", "id": "a1GvTbadqA", "proceeding": "https://proceedings.mlr.press/v235/clavier24a.html", "pdf": "https://openreview.net/pdf?id=a1GvTbadqA", "openreview": "https://openreview.net/forum?id=a1GvTbadqA", "author_site": "Pierre Clavier, Tom Huix, Alain Oliviero Durmus", "tldr": "", "abstract": "In this paper, we introduce and analyze a variant of the Thompson sampling (TS) algorithm for contextual bandits. At each round, traditional TS requires samples from the current posterior distribution, which is usually intractable. To circumvent this issue, approximate inference techniques can be used to provide samples whose distribution is close to the posterior. However, current approximate techniques either yield poor estimates (Laplace approximation) or can be computationally expensive (MCMC methods, ensemble sampling, ...).
In this paper, we propose a new algorithm, Variational Inference TS $\\mathtt{VITS}$, based on Gaussian Variational Inference. This scheme provides powerful posterior approximations that are easy to sample from, and it is computationally efficient, making it an ideal choice for TS. In addition, we show that $\\mathtt{VITS}$ achieves a sub-linear regret bound of the same order in the dimension and number of rounds as traditional TS for linear contextual bandits. Finally, we demonstrate experimentally the effectiveness of $\\mathtt{VITS}$ on both synthetic and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pierre Clavier;Tom Huix;Alain Oliviero Durmus", "authorids": "~Pierre_Clavier1;~Tom_Huix1;~Alain_Oliviero_Durmus1", "gender": "M;M;M", "homepage": "https://pierreclavier.github.io/aboutme/;;", "dblp": ";;01/11275", "google_scholar": "-KnIaGsAAAAJ;;", "orcid": ";;", "linkedin": "pierre-clavier-823171135/;tom-huix/;", "or_profile": "~Pierre_Clavier1;~Tom_Huix1;~Alain_Durmus1", "aff": "\u00c9cole Polytechnique;\u00c9cole Polytechnique;\u00c9cole Polytechnique", "aff_domain": "polytechnique.edu;polytechnique.fr;polytechnique.fr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nclavier2024mathttvits,\ntitle={\\${\\textbackslash}mathtt\\{{VITS}\\}\\$ : Variational Inference Thompson Sampling for contextual bandits},\nauthor={Pierre Clavier and Tom Huix and Alain Oliviero Durmus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a1GvTbadqA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2568778, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7382097064965697378&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "polytechnique.edu;polytechnique.fr;polytechnique.fr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Ecole Polytechnique", "aff_unique_dep": "", "aff_unique_url": "https://www.polytechnique.edu", "aff_unique_abbr": "X", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "PAC-Bayesian Error Bound, via R\u00e9nyi Divergence, for a Class of Linear Time-Invariant State-Space Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33705", "id": "a1Olc2QhPv", "proceeding": "https://proceedings.mlr.press/v235/eringis24a.html", "pdf": "https://openreview.net/pdf?id=a1Olc2QhPv", "openreview": "https://openreview.net/forum?id=a1Olc2QhPv", "author_site": "Deividas Eringis, john leth, Zheng-Hua Tan, Rafal Wisniewski, Mihaly Petreczky", "tldr": "", "abstract": "In this paper we derive a PAC-Bayesian error bound for a class of stochastic dynamical systems with inputs, namely, for linear time-invariant stochastic state-space models (stochastic LTI systems for short). This class of systems is widely used in control engineering and econometrics; in particular, they represent a special case of recurrent neural networks.
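For reference, a stochastic LTI state-space model with inputs is conventionally written as follows (standard notation, stated here for orientation):

```latex
x_{t+1} = A x_t + B u_t + w_t, \qquad y_t = C x_t + D u_t + v_t
% hidden state x_t, input u_t, output y_t, and i.i.d. zero-mean process and
% measurement noises w_t, v_t; the paper derives PAC-Bayesian bounds for
% learning such systems from data.
```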
In this paper we 1) formalize the learning problem for stochastic LTI systems with inputs, 2) derive a PAC-Bayesian error bound for such systems, and 3) discuss various consequences of this error bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deividas Eringis;john leth;Zheng-Hua Tan;Rafal Wisniewski;Mihaly Petreczky", "authorids": "~Deividas_Eringis1;~john_leth1;~Zheng-Hua_Tan1;~Rafal_Wisniewski1;~Mihaly_Petreczky2", "gender": "M;M;M;M;M", "homepage": ";https://vbn.aau.dk/da/persons/102987;https://zhenghuatan.es.aau.dk/;;", "dblp": "289/1568;;39/4898;w/RafaelWisniewski;", "google_scholar": "BOl8lrQAAAAJ;;fugL2E8AAAAJ;;", "orcid": "0000-0002-7419-9388;;0000-0001-6856-8928;;0000-0003-2264-5689", "linkedin": "deividas-eringis/;;zhenghuatan/;;", "or_profile": "~Deividas_Eringis1;~john_leth1;~Zheng-Hua_Tan1;~Rafal_Wisniewski1;~Mihaly_Petreczky2", "aff": ";Automation & Control, Department of Electronic Systems, Aalborg University;Aalborg University;Aalborg University;CNRS, University Lille, Ecole Centrale Lille,", "aff_domain": ";es.aau.dk;aau.dk;es.aau.dk;univ-lille.fr", "position": ";Associate Professor;Full Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\neringis2024pacbayesian,\ntitle={{PAC}-Bayesian Error Bound, via R\\'enyi Divergence, for a Class of Linear Time-Invariant State-Space Models},\nauthor={Deividas Eringis and john leth and Zheng-Hua Tan and Rafal Wisniewski and Mihaly Petreczky},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a1Olc2QhPv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 828840, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12827826465579722027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";es.aau.dk;aau.dk;es.aau.dk;univ-lille.fr", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Aalborg University;CNRS", "aff_unique_dep": "Department of Electronic Systems;", "aff_unique_url": "https://www.aau.dk;https://www.cnrs.fr", "aff_unique_abbr": "AAU;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Denmark;France" }, { "title": "Efficient Pareto Manifold Learning with Low-Rank Structure", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33704", "id": "a2uFstsHPb", "proceeding": "https://proceedings.mlr.press/v235/chen24x.html", "pdf": "https://openreview.net/pdf?id=a2uFstsHPb", "openreview": "https://openreview.net/forum?id=a2uFstsHPb", "author_site": "Weiyu CHEN, James Kwok", "tldr": "", "abstract": "Multi-task learning, which optimizes performance across multiple tasks, is inherently a multi-objective optimization problem. Various algorithms are developed to provide discrete trade-off solutions on the Pareto front. Recently, continuous Pareto front approximations using a linear combination of base networks have emerged as a compelling strategy. However, it suffers from scalability issues when the number of tasks is large. To address this issue, we propose a novel approach that integrates a main network with several low-rank matrices to efficiently learn the Pareto manifold. It significantly reduces the number of parameters and facilitates the extraction of shared features. We also introduce orthogonal regularization to further bolster performance. 
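A minimal sketch of the parameterization described above, assuming a LoRA-style additive decomposition (class and variable names are hypothetical, not the authors' implementation):

```python
import torch
import torch.nn.functional as F

class LowRankParetoLinear(torch.nn.Module):
    """Sketch: a shared weight W0 plus per-task low-rank offsets B_i A_i,
    mixed by a preference vector alpha on the simplex so that varying alpha
    traces out points on the learned Pareto manifold."""

    def __init__(self, d_in, d_out, n_tasks, rank=4):
        super().__init__()
        self.W0 = torch.nn.Linear(d_in, d_out, bias=False)  # shared main network
        self.A = torch.nn.Parameter(0.01 * torch.randn(n_tasks, rank, d_in))
        self.B = torch.nn.Parameter(torch.zeros(n_tasks, d_out, rank))

    def forward(self, x, alpha):
        # Preference-weighted sum of low-rank updates: sum_i alpha_i * B_i A_i.
        delta = torch.einsum("t,tor,tri->oi", alpha, self.B, self.A)
        return self.W0(x) + F.linear(x, delta)

# Example: a 2-task layer evaluated at the preference alpha = (0.3, 0.7).
layer = LowRankParetoLinear(8, 4, n_tasks=2)
y = layer(torch.randn(5, 8), torch.tensor([0.3, 0.7]))
```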
Extensive experimental results demonstrate that the proposed approach outperforms state-of-the-art baselines, especially on datasets with a large number of tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiyu Chen;James Kwok", "authorids": "~Weiyu_Chen1;~James_Kwok1", "gender": "M;", "homepage": ";", "dblp": "73/6153;", "google_scholar": ";", "orcid": "0000-0002-1620-6500;", "linkedin": ";", "or_profile": "~Weiyu_Chen1;~James_Kwok1", "aff": "Hong Kong University of Science and Technology;", "aff_domain": "ust.hk;", "position": "PhD student;", "bibtex": "@inproceedings{\nchen2024efficient,\ntitle={Efficient Pareto Manifold Learning with Low-Rank Structure},\nauthor={Weiyu Chen and James Kwok},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a2uFstsHPb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1271382, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6203615672061462298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ust.hk;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Reward Shaping for Reinforcement Learning with An Assistant Reward Agent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33703", "id": "a3XFF0PGLU", "proceeding": "https://proceedings.mlr.press/v235/ma24l.html", "pdf": "https://openreview.net/pdf?id=a3XFF0PGLU", "openreview": "https://openreview.net/forum?id=a3XFF0PGLU", "author_site": "Haozhe Ma, Kuankuan Sima, Thanh Vinh Vo, Di Fu, Tze-Yun Leong", "tldr": "", "abstract": "Reward shaping is a promising approach to tackle the sparse-reward challenge of reinforcement learning by reconstructing more informative and dense rewards. This paper introduces a novel dual-agent reward shaping framework, composed of two synergistic agents: a policy agent to learn the optimal behavior and a reward agent to generate auxiliary reward signals. The proposed method operates as a self-learning approach, without reliance on expert knowledge or hand-crafted functions. By restructuring the rewards to capture future-oriented information, our framework effectively enhances the sample efficiency and convergence stability. Furthermore, the auxiliary reward signals facilitate the exploration of the environment in the early stage and the exploitation of the policy agent in the late stage, achieving a self-adaptive balance. 
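A minimal sketch of the dual-agent interaction described above, assuming a classic Gym-style environment where `reset()` returns an observation and `step()` returns `(obs, reward, done, info)`; `policy` and `reward_agent` are hypothetical stand-ins for the two learned agents, and the shaping is simply additive here:

```python
def shaped_rollout(env, policy, reward_agent):
    """Collect one episode in which the reward agent augments the sparse
    environment reward with a dense auxiliary signal."""
    obs, done, transitions = env.reset(), False, []
    while not done:
        action = policy.act(obs)
        next_obs, env_reward, done, info = env.step(action)
        aux = reward_agent.auxiliary_reward(obs, action, next_obs)  # dense signal
        transitions.append((obs, action, env_reward + aux, next_obs, done))
        obs = next_obs
    return transitions  # both agents are then updated from these transitions
```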
We evaluate our framework on continuous control tasks with sparse and delayed rewards, demonstrating its robustness and superiority over existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haozhe Ma;Kuankuan Sima;Thanh Vinh Vo;Di Fu;Tze-Yun Leong", "authorids": "~Haozhe_Ma1;~Kuankuan_Sima1;~Thanh_Vinh_Vo2;~Di_Fu2;~Tze-Yun_Leong2", "gender": "M;M;M;M;", "homepage": "https://mahaozhe.github.io/;;https://vothanhvinh.github.io/;https://sg.linkedin.com/in/di-fu-49625a139;https://www.comp.nus.edu.sg/~leongty", "dblp": ";;222/7878;;", "google_scholar": ";;;;", "orcid": "0009-0002-7645-0115;;0000-0001-9722-4884;;0000-0002-1139-803X", "linkedin": ";kuankuan-sima-108253202/;;;tze-yun-leong-9aa60238", "or_profile": "~Haozhe_Ma1;~Kuankuan_Sima1;~Thanh_Vinh_Vo2;~Di_Fu2;~Tze-Yun_Leong2", "aff": "National University of Singapore;National University of Singapore;National University of Singapore;Tsinghua University;National University of Singapore", "aff_domain": "u.nus.edu;u.nus.edu;nus.edu.sg;tsinghua.edu.cn;nus.edu.sg", "position": "PhD student;MS student;Postdoc;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nma2024reward,\ntitle={Reward Shaping for Reinforcement Learning with An Assistant Reward Agent},\nauthor={Haozhe Ma and Kuankuan Sima and Thanh Vinh Vo and Di Fu and Tze-Yun Leong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a3XFF0PGLU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5994800, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5552645679597219973&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 6, "email": "u.nus.edu;u.nus.edu;nus.edu.sg;tsinghua.edu.cn;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "National University of Singapore;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.tsinghua.edu.cn", "aff_unique_abbr": "NUS;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Position: What makes an image realistic?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33702", "id": "a6366lEzbX", "proceeding": "https://proceedings.mlr.press/v235/theis24a.html", "pdf": "https://openreview.net/pdf?id=a6366lEzbX", "openreview": "https://openreview.net/forum?id=a6366lEzbX", "tldr": "", "abstract": "The last decade has seen tremendous progress in our ability to *generate* realistic-looking data, be it images, text, audio, or video. Here, we discuss the closely related problem of *quantifying* realism, that is, designing functions that can reliably tell realistic data from unrealistic data. This problem turns out to be significantly harder to solve and remains poorly understood, despite its prevalence in machine learning and recent breakthroughs in generative AI. Drawing on insights from algorithmic information theory, we discuss why this problem is challenging, why a good generative model alone is insufficient to solve it, and what a good solution would look like. In particular, we introduce the notion of a *universal critic*, which unlike adversarial critics does not require adversarial training. 
While universal critics are not immediately practical, they can serve both as a North Star for guiding practical implementations and as a tool for analyzing existing attempts to capture realism.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lucas Theis", "authorids": "~Lucas_Theis1", "gender": "M", "homepage": "http://theis.io", "dblp": "28/8772", "google_scholar": "https://scholar.google.co.uk/citations?hl=en", "orcid": "", "linkedin": "", "or_profile": "~Lucas_Theis1", "aff": "Google", "aff_domain": "google.com", "position": "Researcher", "bibtex": "@inproceedings{\ntheis2024position,\ntitle={Position: What makes an image realistic?},\nauthor={Lucas Theis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a6366lEzbX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 398675, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13774494954193995144&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "google.com", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Unsupervised Zero-Shot Reinforcement Learning via Functional Reward Encodings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33701", "id": "a6wCNfIj8E", "proceeding": "https://proceedings.mlr.press/v235/frans24a.html", "pdf": "https://openreview.net/pdf?id=a6wCNfIj8E", "openreview": "https://openreview.net/forum?id=a6wCNfIj8E", "author_site": "Kevin Frans, Seohong Park, Pieter Abbeel, Sergey Levine", "tldr": "", "abstract": "Can we pre-train a generalist agent from a large amount of unlabeled offline trajectories such that it can be immediately adapted to any new downstream tasks in a zero-shot manner? In this work, we present a *functional* reward encoding (FRE) as a general, scalable solution to this *zero-shot RL* problem. Our main idea is to learn functional representations of any arbitrary tasks by encoding their state-reward samples using a transformer-based variational auto-encoder. This functional encoding not only enables the pre-training of an agent from a wide diversity of general unsupervised reward functions, but also provides a way to solve any new downstream tasks in a zero-shot manner, given a small number of reward-annotated samples. 
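A minimal sketch of the encoding step described above, under the assumption of a permutation-invariant set encoder with VAE-style output heads (all interfaces here are hypothetical, not the authors' code):

```python
import torch

def encode_reward_function(set_encoder, states, rewards, latent_dim):
    """Map a batch of (state, reward) samples to a latent task code z.

    states: (N, d); rewards: (N,);
    set_encoder: any permutation-invariant module mapping
                 (N, d + 1) -> (2 * latent_dim,), e.g. a transformer with
                 mean pooling (assumed, not prescribed by the paper).
    """
    pairs = torch.cat([states, rewards.unsqueeze(-1)], dim=-1)  # (N, d + 1)
    mu, log_std = set_encoder(pairs).split(latent_dim, dim=-1)  # VAE-style heads
    return mu + log_std.exp() * torch.randn_like(mu)            # reparameterize
    # downstream: condition the zero-shot policy/critic on the returned z
```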
We empirically show that FRE agents trained on diverse random unsupervised reward functions can generalize to solve novel tasks in a range of simulated robotic benchmarks, often outperforming previous zero-shot RL and offline RL methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kevin Frans;Seohong Park;Pieter Abbeel;Sergey Levine", "authorids": "~Kevin_Frans1;~Seohong_Park1;~Pieter_Abbeel2;~Sergey_Levine1", "gender": "M;;M;M", "homepage": "http://kvfrans.com;https://seohong.me/;https://people.eecs.berkeley.edu/~pabbeel/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "199/2314;227/6308;;80/7594", "google_scholar": "NQ2ZWBoAAAAJ;;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;8R35rCwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kevin_Frans1;~Seohong_Park1;~Pieter_Abbeel2;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Covariant;Google", "aff_domain": "berkeley.edu;berkeley.edu;covariant.ai;google.com", "position": "PhD student;PhD student;Founder;Research Scientist", "bibtex": "@inproceedings{\nfrans2024unsupervised,\ntitle={Unsupervised Zero-Shot Reinforcement Learning via Functional Reward Encodings},\nauthor={Kevin Frans and Seohong Park and Pieter Abbeel and Sergey Levine},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a6wCNfIj8E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6099160, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14404294212706649411&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "berkeley.edu;berkeley.edu;covariant.ai;google.com", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of California, Berkeley;Covariant;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;;https://www.google.com", "aff_unique_abbr": "UC Berkeley;;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "A New Computationally Efficient Algorithm to solve Feature Selection for Functional Data Classification in High-dimensional Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33700", "id": "a7MW5kFFOf", "proceeding": "https://proceedings.mlr.press/v235/boschi24a.html", "pdf": "https://openreview.net/pdf?id=a7MW5kFFOf", "openreview": "https://openreview.net/forum?id=a7MW5kFFOf", "author_site": "Tobia Boschi, FRANCESCA BONIN, Rodrigo Ordonez-Hurtado, Alessandra Pascale, Jonathan Epperlein", "tldr": "", "abstract": "This paper introduces a novel methodology for Feature Selection for Functional Classification, FSFC, that addresses the challenge of jointly performing feature selection and classification of functional data in scenarios with categorical responses and multivariate longitudinal features. FSFC tackles a newly defined optimization problem that integrates logistic loss and functional features to identify the most crucial variables for classification. To address the minimization procedure, we employ functional principal components and develop a new adaptive version of the Dual Augmented Lagrangian algorithm. 
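A schematic version of such an objective, in notation introduced here (the paper's exact formulation may differ): logistic loss on functional principal-component scores plus a group penalty that zeroes out entire features.

```latex
% s_{ij}: FPC scores of functional feature j for statistical unit i;
% a feature j with \beta_j = 0 is screened out of the classifier.
\min_{\beta_1, \dots, \beta_p}\;
\sum_{i=1}^{n} \log\!\Big(1 + \exp\!\big(-y_i \textstyle\sum_{j=1}^{p}
\langle s_{ij}, \beta_j \rangle\big)\Big)
\;+\; \lambda \sum_{j=1}^{p} \lVert \beta_j \rVert_2
```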
The computational efficiency of FSFC enables handling high-dimensional scenarios where the number of features may considerably exceed the number of statistical units. Simulation experiments demonstrate that FSFC outperforms other machine learning and deep learning methods in computational time and classification accuracy. Furthermore, the FSFC feature selection capability can be leveraged to significantly reduce the problem's dimensionality and enhance the performance of other classification algorithms. The efficacy of FSFC is also demonstrated through a real data application, analyzing relationships between four chronic diseases and other health and demographic factors. FSFC source code is publicly available at https://github.com/IBM/funGCN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tobia Boschi;Francesca Bonin;Rodrigo Ordonez-Hurtado;Alessandra Pascale;Jonathan P Epperlein", "authorids": "~Tobia_Boschi1;fbonin@ie.ibm.com;rodrigo.ordonez.hurtado@ibm.com;apascale@ie.ibm.com;~Jonathan_P_Epperlein1", "gender": "M;;;;M", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": "PB7-twMAAAAJ;;;;https://scholar.google.co.uk/citations?user=MR9vRDwAAAAJ", "orcid": "0000-0003-1990-1653;;;;", "linkedin": "tobia-boschi-08277a156/;;;;", "or_profile": "~Tobia_Boschi1;fbonin@ie.ibm.com;rodrigo.ordonez.hurtado@ibm.com;apascale@ie.ibm.com;~Jonathan_P_Epperlein1", "aff": "International Business Machines;;;;International Business Machines", "aff_domain": "ibm.com;;;;ibm.com", "position": "Researcher;;;;Researcher", "bibtex": "@inproceedings{\nboschi2024a,\ntitle={A New Computationally Efficient Algorithm to solve Feature Selection for Functional Data Classification in High-dimensional Spaces},\nauthor={Tobia Boschi and Francesca Bonin and Rodrigo Ordonez-Hurtado and Alessandra Pascale and Jonathan P Epperlein},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a7MW5kFFOf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1620070, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2631102456723349391&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ibm.com;;;;ibm.com", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "International Business Machines Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "SurfPro: Functional Protein Design Based on Continuous Surface", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33699", "id": "a8QpoEJCRI", "proceeding": "https://proceedings.mlr.press/v235/song24c.html", "pdf": "https://openreview.net/pdf?id=a8QpoEJCRI", "openreview": "https://openreview.net/forum?id=a8QpoEJCRI", "author_site": "Zhenqiao Song, Tinglin Huang, Lei Li, Wengong Jin", "tldr": "", "abstract": "How can we design proteins with desired functions? We are motivated by a chemical intuition that both geometric structure and biochemical properties are critical to a protein's function. In this paper, we propose SurfPro, a new method to generate functional proteins given a desired surface and its associated biochemical properties.
SurfPro comprises a hierarchical encoder that progressively models the geometric shape and biochemical features of a protein surface, and an autoregressive decoder to produce an amino acid sequence. We evaluate SurfPro on the standard inverse folding benchmark CATH 4.2 and two functional protein design tasks: protein binder design and enzyme design. Our SurfPro consistently surpasses previous state-of-the-art inverse folding methods, achieving a recovery rate of 57.78% on CATH 4.2 and higher success rates in terms of protein-protein binding and enzyme-substrate interaction scores.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenqiao Song;Tinglin Huang;Lei Li;Wengong Jin", "authorids": "~Zhenqiao_Song1;~Tinglin_Huang1;~Lei_Li11;~Wengong_Jin1", "gender": "F;M;M;", "homepage": "https://jocelynsong.github.io/;https://huangtinglin.github.io/;https://www.cs.cmu.edu/~leili;http://people.csail.mit.edu/wengong", "dblp": "227/7889;;13/7007-5.html;173/6620", "google_scholar": "https://scholar.google.com/citations?hl=en;izW2ygYAAAAJ;BYXqAlwAAAAJ;IE5D8_QAAAAJ", "orcid": ";0009-0005-5644-4879;0000-0003-3095-9776;", "linkedin": ";;;", "or_profile": "~Zhenqiao_Song1;~Tinglin_Huang1;~Lei_Li11;~Wengong_Jin1", "aff": "Carnegie Mellon University;Yale University;School of Computer Science, Carnegie Mellon University;Broad Institute", "aff_domain": "andrew.cmu.edu;yale.edu;cs.cmu.edu;broadinstitute.org", "position": "PhD student;PhD student;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nsong2024surfpro,\ntitle={SurfPro: Functional Protein Design Based on Continuous Surface},\nauthor={Zhenqiao Song and Tinglin Huang and Lei Li and Wengong Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a8QpoEJCRI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1884477, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3374286506740803442&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "email": "andrew.cmu.edu;yale.edu;cs.cmu.edu;broadinstitute.org", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Carnegie Mellon University;Yale University;Broad Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.yale.edu;https://www.broadinstitute.org", "aff_unique_abbr": "CMU;Yale;Broad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Critical windows: non-asymptotic theory for feature emergence in diffusion models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33698", "id": "a8ZpjLJuKk", "proceeding": "https://proceedings.mlr.press/v235/li24g.html", "pdf": "https://openreview.net/pdf?id=a8ZpjLJuKk", "openreview": "https://openreview.net/forum?id=a8ZpjLJuKk", "author_site": "Marvin Li, Sitan Chen", "tldr": "", "abstract": "We develop theory to understand an intriguing property of diffusion models for image generation that we term *critical windows*. Empirically, it has been observed that there are narrow time intervals in sampling during which particular features of the final image emerge, e.g. the image class or background color (Ho et al., 2020b; Meng et al., 2022; Choi et al., 2022; Raya & Ambrogioni, 2023; Georgiev et al., 2023; Sclocchi et al., 2024; Biroli et al., 2024).
While this is advantageous for interpretability as it implies one can localize properties of the generation to a small segment of the trajectory, it seems at odds with the continuous nature of the diffusion. We propose a formal framework for studying these windows and show that for data coming from a mixture of strongly log-concave densities, these windows can be provably bounded in terms of certain measures of inter- and intra-group separation. We also instantiate these bounds for concrete examples like well-conditioned Gaussian mixtures. Finally, we use our bounds to give a rigorous interpretation of diffusion models as hierarchical samplers that progressively \u201cdecide\u201d output features over a discrete sequence of times. We validate our bounds with experiments on synthetic data and show that critical windows may serve as a useful tool for diagnosing fairness and privacy violations in real-world diffusion models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marvin Li;Sitan Chen", "authorids": "~Marvin_Li1;~Sitan_Chen1", "gender": "M;M", "homepage": "https://marvinfli.com;https://sitanchen.com", "dblp": ";141/7670", "google_scholar": "NhMTzpsAAAAJ;YnJVsp4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Marvin_Li1;~Sitan_Chen1", "aff": "Harvard University;Harvard University", "aff_domain": "harvard.edu;seas.harvard.edu", "position": "Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nli2024critical,\ntitle={Critical windows: non-asymptotic theory for feature emergence in diffusion models},\nauthor={Marvin Li and Sitan Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a8ZpjLJuKk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3391890, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18292269064143680130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "harvard.edu;seas.harvard.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Rolling Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33697", "id": "a9bzTv9SzO", "proceeding": "https://proceedings.mlr.press/v235/ruhe24a.html", "pdf": "https://openreview.net/pdf?id=a9bzTv9SzO", "openreview": "https://openreview.net/forum?id=a9bzTv9SzO", "author_site": "David Ruhe, Jonathan Heek, Tim Salimans, Emiel Hoogeboom", "tldr": "", "abstract": "Diffusion models have recently been increasingly applied to temporal data such as video, fluid mechanics simulations, or climate data. These methods generally treat subsequent frames equally regarding the amount of noise in the diffusion process. This paper explores Rolling Diffusion: a new approach that uses a sliding window denoising process. It ensures that the diffusion process progressively corrupts through time by assigning more noise to frames that appear later in a sequence, reflecting greater uncertainty about the future as the generation process unfolds. Empirically, we show that when the temporal dynamics are complex, Rolling Diffusion is superior to standard diffusion. 
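One plausible way to write such a sliding-window schedule, in symbols introduced here (the paper's exact parameterization may differ):

```latex
% Frame k of a window of W frames, at global denoising time t \in [0, 1]:
t_k \;=\; \mathrm{clip}\!\Big(t + \tfrac{k}{W},\; 0,\; 1\Big),
\qquad k = 0, \dots, W - 1
% later frames (larger k) always carry more noise, and the window rolls
% forward as the first frame becomes fully denoised and is emitted.
```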
In particular, this result is demonstrated in a video prediction task using the Kinetics-600 video dataset and in a chaotic fluid dynamics forecasting experiment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Ruhe;Jonathan Heek;Tim Salimans;Emiel Hoogeboom", "authorids": "~David_Ruhe1;~Jonathan_Heek1;~Tim_Salimans1;~Emiel_Hoogeboom1", "gender": ";;M;", "homepage": ";;;", "dblp": "243/3507;247/1004;116/2791;217/1488", "google_scholar": ";;;https://scholar.google.nl/citations?user=nkTd_BIAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~David_Ruhe1;~Jonathan_Heek1;~Tim_Salimans1;~Emiel_Hoogeboom1", "aff": "University of Amsterdam;Google;Google;Google", "aff_domain": "uva.nl;google.com;google.com;google.com", "position": "PhD student;Software Engineer;Research Scientist;Researcher", "bibtex": "@inproceedings{\nruhe2024rolling,\ntitle={Rolling Diffusion Models},\nauthor={David Ruhe and Jonathan Heek and Tim Salimans and Emiel Hoogeboom},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=a9bzTv9SzO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4119967, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=688059591673465176&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "uva.nl;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Amsterdam;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.uva.nl;https://www.google.com", "aff_unique_abbr": "UvA;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Netherlands;United States" }, { "title": "Simulation of Graph Algorithms with Looped Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33696", "id": "aA2326y3hf", "proceeding": "https://proceedings.mlr.press/v235/back-de-luca24a.html", "pdf": "https://openreview.net/pdf?id=aA2326y3hf", "openreview": "https://openreview.net/forum?id=aA2326y3hf", "author_site": "Artur Back de Luca, Kimon Fountoulakis", "tldr": "", "abstract": "The execution of graph algorithms using neural networks has recently attracted significant interest due to promising empirical progress. This motivates further understanding of how neural networks can replicate reasoning steps with relational data. In this work, we study the ability of transformer networks to simulate algorithms on graphs from a theoretical perspective. The architecture we use is a looped transformer with extra attention heads that interact with the graph. We prove by construction that this architecture can simulate individual algorithms such as Dijkstra's shortest path, Breadth- and Depth-First Search, and Kosaraju's strongly connected components, as well as multiple algorithms simultaneously. The number of parameters in the networks does not increase with the input graph size, which implies that the networks can simulate the above algorithms for any graph. Despite this property, we show a limit to simulation in our solution due to finite precision. 
Finally, we show a Turing Completeness result with constant width when the extra attention heads are utilized.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Artur Back de Luca;Kimon Fountoulakis", "authorids": "~Artur_Back_de_Luca1;~Kimon_Fountoulakis1", "gender": "M;M", "homepage": "https://artur-deluca.github.io/;https://opallab.ca", "dblp": ";149/5799", "google_scholar": "tL9d0UoAAAAJ;https://scholar.google.ca/citations?user=K-SafJUAAAAJ", "orcid": ";", "linkedin": "https://linkedin.com/in/arturbackdeluca;", "or_profile": "~Artur_Back_de_Luca1;~Kimon_Fountoulakis1", "aff": "University of Waterloo;University of Waterloo", "aff_domain": "uwaterloo.ca;uwaterloo.ca", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nluca2024simulation,\ntitle={Simulation of Graph Algorithms with Looped Transformers},\nauthor={Artur Back de Luca and Kimon Fountoulakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aA2326y3hf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 828752, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5091481593153180481&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "uwaterloo.ca;uwaterloo.ca", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Waterloo", "aff_unique_dep": "", "aff_unique_url": "https://uwaterloo.ca", "aff_unique_abbr": "UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Protein Conformation Generation via Force-Guided SE(3) Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33695", "id": "aC1LSa4nXs", "proceeding": "https://proceedings.mlr.press/v235/wang24cv.html", "pdf": "https://openreview.net/pdf?id=aC1LSa4nXs", "openreview": "https://openreview.net/forum?id=aC1LSa4nXs", "author_site": "YAN WANG, Lihao Wang, Yuning Shen, Yiqun Wang, Huizhuo Yuan, Yue Wu, Quanquan Gu", "tldr": "", "abstract": "The conformational landscape of proteins is crucial to understanding their functionality in complex biological processes. Traditional physics-based computational methods, such as molecular dynamics (MD) simulations, suffer from rare event sampling and long equilibration time problems, hindering their applications in general protein systems. Recently, deep generative modeling techniques, especially diffusion models, have been employed to generate novel protein conformations. However, existing score-based diffusion methods cannot properly incorporate important physical prior knowledge to guide the generation process, causing large deviations in the sampled protein conformations from the equilibrium distribution. In this paper, to overcome these limitations, we propose a force-guided $\\mathrm{SE}(3)$ diffusion model, ConfDiff, for protein conformation generation. By incorporating a force-guided network with a mixture of data-based score models, ConfDiff can generate protein conformations with rich diversity while preserving high fidelity. 
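A schematic form of the force guidance, in notation introduced here rather than taken from the paper: the learned score is tilted toward the Boltzmann density $\pi(x) \propto e^{-E(x)/k_B T}$ defined by the physical energy $E$.

```latex
\tilde{s}_\theta(x, t) \;=\; s_\theta(x, t) + \lambda(t)\, \nabla_x \log \pi(x)
\;=\; s_\theta(x, t) - \frac{\lambda(t)}{k_B T}\, \nabla_x E(x)
% the correction is proportional to the physical force -\nabla_x E(x),
% pulling samples toward the equilibrium distribution.
```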
Experiments on a variety of protein conformation prediction tasks, including 12 fast-folding proteins and the Bovine Pancreatic Trypsin Inhibitor (BPTI), demonstrate that our method surpasses the state-of-the-art method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "YanWang;Lihao Wang;Yuning Shen;Yiqun Wang;Huizhuo Yuan;Yue Wu;Quanquan Gu", "authorids": "~YanWang1;~Lihao_Wang1;~Yuning_Shen1;~Yiqun_Wang3;~Huizhuo_Yuan1;~Yue_Wu12;~Quanquan_Gu1", "gender": "F;M;;M;;M;M", "homepage": ";;;https://raymond-yiqunwang.github.io/;;https://yuewu.us/;http://web.cs.ucla.edu/~qgu/", "dblp": ";;;;;41/5979-11;50/4597", "google_scholar": "OquFV3wAAAAJ;;;Z_9piXQAAAAJ;;kSQ1mLYAAAAJ;GU9HgNAAAAAJ", "orcid": ";0000-0002-4960-015X;;0000-0002-1457-0085;;;", "linkedin": ";;;yiqun-raymond-wang-dot-science;;;", "or_profile": "~YanWang1;~Lihao_Wang1;~Yuning_Shen1;~Yiqun_Wang3;~Huizhuo_Yuan1;~Yue_Wu12;~Quanquan_Gu1", "aff": "Tongji University;ByteDance Inc.;;ByteDance Inc.;;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "tongji.edu.cn;bytedance.com;;bytedance.com;;ucla.edu;cs.ucla.edu", "position": "PhD student;Researcher;;Researcher;;PhD student;Associate Professor", "bibtex": "@inproceedings{\nyanwang2024protein,\ntitle={Protein Conformation Generation via Force-Guided {SE}(3) Diffusion Models},\nauthor={YanWang and Lihao Wang and Yuning Shen and Yiqun Wang and Huizhuo Yuan and Yue Wu and Quanquan Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aC1LSa4nXs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6661880, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12488762723438285627&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tongji.edu.cn;bytedance.com;;bytedance.com;;ucla.edu;cs.ucla.edu", "author_num": 7, "aff_unique_index": "0;1;1;2;2", "aff_unique_norm": "Tongji University;ByteDance;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tongji.edu.cn;https://www.bytedance.com;https://www.ucla.edu", "aff_unique_abbr": "Tongji;ByteDance;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Learning to Explore for Stochastic Gradient MCMC", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33694", "id": "aECamk9izk", "proceeding": "https://proceedings.mlr.press/v235/kim24k.html", "pdf": "https://openreview.net/pdf?id=aECamk9izk", "openreview": "https://openreview.net/forum?id=aECamk9izk", "author_site": "SeungHyun Kim, Seohyeon Jung, SeongHyeon Kim, Juho Lee", "tldr": "", "abstract": "Bayesian Neural Networks(BNNs) with high-dimensional parameters pose a challenge for posterior inference due to the multi-modality of the posterior distributions. Stochastic Gradient Markov Chain Monte Carlo(SGMCMC) with cyclical learning rate scheduling is a promising solution, but it requires a large number of sampling steps to explore high-dimensional multi-modal posteriors, making it computationally expensive. In this paper, we propose a meta-learning strategy to build SGMCMC which can efficiently explore the multi-modal target distributions. Our algorithm allows the learned SGMCMC to quickly explore the high-density region of the posterior landscape. 
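For reference, the SGMCMC backbone in question, stochastic gradient Langevin dynamics with a cyclical step size $\epsilon_t$, takes the standard form below; the meta-learned sampler would replace parts of this update (e.g. the drift), which is not shown here.

```latex
\theta_{t+1} \;=\; \theta_t
+ \frac{\epsilon_t}{2}\, \widehat{\nabla}_\theta \log p(\theta_t \mid \mathcal{D})
+ \sqrt{\epsilon_t}\, \xi_t, \qquad \xi_t \sim \mathcal{N}(0, I)
% \widehat{\nabla} is a minibatch gradient estimate; large steps early in
% each cycle encourage jumps between posterior modes.
```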
Also, we show that this exploration property is transferable to various tasks, even ones unseen during the meta-training stage. Using popular image classification benchmarks and a variety of downstream tasks, we demonstrate that our method significantly improves the sampling efficiency, achieving better performance than vanilla SGMCMC without incurring significant computational overhead.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "SeungHyun Kim;Seohyeon Jung;SeongHyeon Kim;Juho Lee", "authorids": "~SeungHyun_Kim3;~Seohyeon_Jung1;~SeongHyeon_Kim4;~Juho_Lee2", "gender": "M;F;M;M", "homepage": ";;;https://juho.lee.github.io", "dblp": ";350/4069;;55/3410-1", "google_scholar": "QUsSqoYAAAAJ;https://scholar.google.com/citations?view_op=list_works;;Py4URJUAAAAJ", "orcid": ";;;", "linkedin": ";;seong-hyeon-kim-260b8b261;", "or_profile": "~SeungHyun_Kim3;~Seohyeon_Jung1;~SeongHyeon_Kim4;~Juho_Lee2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.edu;kaist.edu;kaist.ac.kr;kaist.ac.kr", "position": "MS student;MS student;Undergrad student;Associate Professor", "bibtex": "@inproceedings{\nkim2024learning,\ntitle={Learning to Explore for Stochastic Gradient {MCMC}},\nauthor={SeungHyun Kim and Seohyeon Jung and SeongHyeon Kim and Juho Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aECamk9izk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3988635, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vjt8EelnV8sJ:scholar.google.com/&scioq=Learning+to+Explore+for+Stochastic+Gradient+MCMC&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "kaist.edu;kaist.edu;kaist.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "BayOTIDE: Bayesian Online Multivariate Time Series Imputation with Functional Decomposition", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33693", "id": "aGBpiEcB8z", "proceeding": "https://proceedings.mlr.press/v235/fang24d.html", "pdf": "https://openreview.net/pdf?id=aGBpiEcB8z", "openreview": "https://openreview.net/forum?id=aGBpiEcB8z", "author_site": "Shikai Fang, Qingsong Wen, Yingtao Luo, Shandian Zhe, Liang Sun", "tldr": "", "abstract": "In real-world scenarios such as traffic and energy management, we frequently encounter large volumes of time-series data characterized by missing values, noise, and irregular sampling patterns. While numerous imputation methods have been proposed, the majority tend to operate within a local horizon, which involves dividing long sequences into batches of fixed-length segments for model training. This local horizon often leads to the overlooking of global trends and periodic patterns. More importantly, most methods assume the observations are sampled at regular timestamps, and fail to handle complex irregularly sampled time series in various applications.
Additionally, most existing methods are learned in an offline manner. Thus, they are not suitable for applications with rapidly arriving streaming data. To address these challenges, we propose BayOTIDE: Bayesian Online Multivariate Time series Imputation with functional decomposition. Our method conceptualizes multivariate time series as the weighted combination of groups of low-rank temporal factors with different patterns. We employ a suite of Gaussian Processes (GPs), each with a unique kernel, as functional priors to model these factors. For computational efficiency, we further convert the GPs into a state-space prior by constructing an equivalent stochastic differential equation (SDE), and developing a scalable algorithm for online inference. The proposed method can not only handle imputation over arbitrary timestamps, but also offer uncertainty quantification and interpretability for the downstream application. We evaluate our method on both synthetic and real-world datasets. We release the code at https://github.com/xuangu-fang/BayOTIDE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikai Fang;Qingsong Wen;Yingtao Luo;Shandian Zhe;Liang Sun", "authorids": "~Shikai_Fang2;~Qingsong_Wen2;~Yingtao_Luo1;~Shandian_Zhe1;~Liang_Sun2", "gender": ";;;;M", "homepage": ";;https://yingtaoluo.github.io/;;https://www.linkedin.com/in/liang-sun-a0a87621/", "dblp": ";;278/2956;;18/5837-1", "google_scholar": ";;g_MmNEoAAAAJ;;D_cOMBgAAAAJ", "orcid": ";;0000-0003-1794-3657;;0009-0002-5835-7259", "linkedin": ";;;;", "or_profile": "~Shikai_Fang2;~Qingsong_Wen2;~Yingtao_Luo1;~Shandian_Zhe1;~Liang_Sun2", "aff": ";;Carnegie Mellon University;;Alibaba Group", "aff_domain": ";;andrew.cmu.edu;;alibaba-inc.com", "position": ";;PhD student;;Staff Software Engineer", "bibtex": "@inproceedings{\nfang2024bayotide,\ntitle={Bay{OTIDE}: Bayesian Online Multivariate Time Series Imputation with Functional Decomposition},\nauthor={Shikai Fang and Qingsong Wen and Yingtao Luo and Shandian Zhe and Liang Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aGBpiEcB8z}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 848590, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17471294240726183854&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": ";;andrew.cmu.edu;;alibaba-inc.com", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.alibaba.com", "aff_unique_abbr": "CMU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "Copula-Nested Spectral Kernel Network", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33692", "id": "aK1FyEP2Sn", "proceeding": "https://proceedings.mlr.press/v235/tian24f.html", "pdf": "https://openreview.net/pdf?id=aK1FyEP2Sn", "openreview": "https://openreview.net/forum?id=aK1FyEP2Sn", "author_site": "Jinyue Tian, Hui Xue, Yanfang Xue, Pengfei Fang", "tldr": "", "abstract": "Spectral Kernel Networks (SKNs) emerge as a promising approach in machine learning, melding solid theoretical foundations of spectral kernels with the representation power of hierarchical architectures.
At its core, the spectral density function plays a pivotal role by revealing essential patterns in data distributions, thereby offering deep insights into the underlying framework in real-world tasks. Nevertheless, prevailing designs of spectral density often overlook the intricate interactions within data structures. This phenomenon consequently neglects expanses of the hypothesis space, thus curtailing the performance of SKNs. This paper addresses the issues through a novel approach, the **Co**pula-Nested Spectral **Ke**rnel **Net**work (**CokeNet**). Concretely, we first redefine the spectral density with the form of copulas to enhance the diversity of spectral densities. Next, the specific expression of the copula module is designed to allow the excavation of complex dependence structures. Finally, the unified kernel network is proposed by integrating the corresponding spectral kernel and the copula module. Through rigorous theoretical analysis and experimental verification, CokeNet demonstrates superior performance and significant advancements over SOTA algorithms in the field.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinyue Tian;Hui Xue;Yanfang Xue;Pengfei Fang", "authorids": "~Jinyue_Tian1;~Hui_Xue6;~Yanfang_Xue1;~Pengfei_Fang1", "gender": "F;F;M;F", "homepage": ";;https://fpfcjdsg.github.io/;http://palm.seu.edu.cn/hxue/", "dblp": ";285/5137;204/7650.html;27/3541-2.html", "google_scholar": ";;Fk4A13IAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0003-4061-5475;0000-0001-9664-7754;0000-0001-8939-0460;", "linkedin": ";;fang-pengfei-62956a96/?locale=zh_CN;", "or_profile": "~Jinyue_Tian1;~Yanfang_Xue1;~Pengfei_Fang1;~hui_xue3", "aff": "Southeast University;Southeast University;Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "MS student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ntian2024copulanested,\ntitle={Copula-Nested Spectral Kernel Network},\nauthor={Jinyue Tian and Hui Xue and Yanfang Xue and Pengfei Fang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aK1FyEP2Sn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2352616, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FfSyqpXEpTEJ:scholar.google.com/&scioq=Copula-Nested+Spectral+Kernel+Network&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Boosting Offline Optimizers with Surrogate Sensitivity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33691", "id": "aLSA3JH08h", "proceeding": "https://proceedings.mlr.press/v235/dao24b.html", "pdf": "https://openreview.net/pdf?id=aLSA3JH08h", "openreview": "https://openreview.net/forum?id=aLSA3JH08h", "author_site": "Cuong Dao, Phi Le Nguyen, Thao Nguyen Truong, Nghia Hoang", "tldr": "", "abstract": "Offline optimization is an important task in numerous material engineering domains where online experimentation to collect data is too 
expensive and needs to be replaced by an in silico maximization of a surrogate of the black-box function. Although such a surrogate can be learned from offline data, its prediction might not be reliable outside the offline data regime, which happens when the surrogate has narrow prediction margin and is (therefore) sensitive to small perturbations of its parameterization. This raises the following questions: (1) how to regulate the sensitivity of a surrogate model; and (2) whether conditioning an offline optimizer with such less sensitive surrogate will lead to better optimization performance. To address these questions, we develop an optimizable sensitivity measurement for the surrogate model, which then inspires a sensitivity-informed regularizer that is applicable to a wide range of offline optimizers. This development is both orthogonal and synergistic to prior research on offline optimization, which is demonstrated in our extensive experiment benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manh Cuong Dao;Phi Le Nguyen;Thao Nguyen Truong;Trong Nghia Hoang", "authorids": "~Manh_Cuong_Dao1;~Phi_Le_Nguyen2;~Thao_Nguyen_Truong1;~Trong_Nghia_Hoang1", "gender": "M;F;M;M", "homepage": ";https://users.soict.hust.edu.vn/lenp/;https://researchmap.jp/nguyentt;https://htnghia87.github.io/", "dblp": ";147/2816;233/1462.html;62/540", "google_scholar": "ksYAJugAAAAJ;L_NKoQwAAAAJ;;E-kZZeQAAAAJ", "orcid": ";;0000-0003-3641-374X;", "linkedin": "cuong-dao-9501a2223/;;;", "or_profile": "~Manh_Cuong_Dao1;~Phi_Le_Nguyen2;~Thao_Nguyen_Truong1;~Nghia_Hoang2", "aff": "Hanoi University of Science and Technology;Hanoi University of Science and Technology;AIST, National Institute of Advanced Industrial Science and Technology;Washington State University", "aff_domain": "hust.edu.vn;hust.edu.vn;aist.go.jp;eecs.wsu.edu", "position": "MS student;Associate Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\ndao2024boosting,\ntitle={Boosting Offline Optimizers with Surrogate Sensitivity},\nauthor={Manh Cuong Dao and Phi Le Nguyen and Thao Nguyen Truong and Trong Nghia Hoang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aLSA3JH08h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1262233, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4106409843889159295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "hust.edu.vn;hust.edu.vn;aist.go.jp;eecs.wsu.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Hanoi University of Science and Technology;National Institute of Advanced Industrial Science and Technology;Washington State University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hust.edu.vn;https://www.aist.go.jp;https://wsu.edu", "aff_unique_abbr": "HUST;AIST;WSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hanoi;", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "Vietnam;Japan;United States" }, { "title": "How Smooth Is Attention?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33690", "id": "aP0H8A1ywk", "proceeding": "https://proceedings.mlr.press/v235/castin24a.html", "pdf": "https://openreview.net/pdf?id=aP0H8A1ywk", "openreview": "https://openreview.net/forum?id=aP0H8A1ywk", "author_site": "Val\u00e9rie Castin, Pierre Ablin, Gabriel Peyr\u00e9", "tldr": "", 
"abstract": "Self-attention and masked self-attention are at the heart of Transformers' outstanding success. Still, our mathematical understanding of attention, in particular of its Lipschitz properties \u2014 which are key when it comes to analyzing robustness and expressive power \u2014 is incomplete. We provide a detailed study of the Lipschitz constant of self-attention in several practical scenarios, discussing the impact of the sequence length $n$ and layer normalization on the local Lipschitz constant of both unmasked and masked self-attention. In particular, we show that for inputs of length $n$ in any compact set, the Lipschitz constant of self-attention is bounded by $\\sqrt{n}$ up to a constant factor and that this bound is tight for reasonable sequence lengths. When the sequence length $n$ is too large for the previous bound to be tight, which we refer to as the mean-field regime, we provide an upper bound and a matching lower bound which are independent of $n$. Our mean-field framework for masked self-attention is novel and of independent interest. Our experiments on pretrained and randomly initialized BERT and GPT-2 support our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Val\u00e9rie Castin;Pierre Ablin;Gabriel Peyr\u00e9", "authorids": "~Val\u00e9rie_Castin1;~Pierre_Ablin2;~Gabriel_Peyr\u00e92", "gender": ";M;M", "homepage": ";https://pierreablin.com/;http://gpeyre.com/", "dblp": ";174/0980.html;65/1759", "google_scholar": ";1ZsunaYAAAAJ;https://scholar.google.fr/citations?user=KqA1dYcAAAAJ", "orcid": ";;", "linkedin": "val\u00e9rie-castin-7160b420a/;;", "or_profile": "~Val\u00e9rie_Castin1;~Pierre_Ablin2;~Gabriel_Peyr\u00e92", "aff": ";Apple;CNRS", "aff_domain": ";apple.com;cnrs.fr", "position": ";Researcher;Researcher", "bibtex": "@inproceedings{\ncastin2024how,\ntitle={How Smooth Is Attention?},\nauthor={Val{\\'e}rie Castin and Pierre Ablin and Gabriel Peyr{\\'e}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aP0H8A1ywk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 834745, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17224160298166789163&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";apple.com;cnrs.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Apple;Centre National de la Recherche Scientifique", "aff_unique_dep": "Apple Inc.;", "aff_unique_url": "https://www.apple.com;https://www.cnrs.fr", "aff_unique_abbr": "Apple;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;France" }, { "title": "On the Embedding Collapse when Scaling up Recommendation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33689", "id": "aPVwOAr1aW", "proceeding": "https://proceedings.mlr.press/v235/guo24e.html", "pdf": "https://openreview.net/pdf?id=aPVwOAr1aW", "openreview": "https://openreview.net/forum?id=aPVwOAr1aW", "author_site": "Xingzhuo Guo, Junwei Pan, Ximei Wang, Baixu Chen, Jie Jiang, Mingsheng Long", "tldr": "", "abstract": "Recent advances in foundation models have led to a promising trend of developing large recommendation models to leverage vast amounts of available data. 
Still, mainstream models remain embarrassingly small in size and naively enlarging them does not lead to sufficient performance gains, suggesting a deficiency in model scalability. In this paper, we identify the embedding collapse phenomenon as the inhibition of scalability, wherein the embedding matrix tends to occupy a low-dimensional subspace. Through empirical and theoretical analysis, we demonstrate a two-sided effect of feature interaction specific to recommendation models. On the one hand, interacting with collapsed embeddings restricts embedding learning and exacerbates the collapse issue. On the other hand, interaction is crucial in mitigating the fitting of spurious features as a scalability guarantee. Based on our analysis, we propose a simple yet effective multi-embedding design incorporating embedding-set-specific interaction modules to learn embedding sets with large diversity and thus reduce collapse. Extensive experiments demonstrate that this proposed design provides consistent scalability and effective collapse mitigation for various recommendation models. Code is available at this repository: https://github.com/thuml/Multi-Embedding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingzhuo Guo;Junwei Pan;Ximei Wang;Baixu Chen;Jie Jiang;Mingsheng Long", "authorids": "~Xingzhuo_Guo1;~Junwei_Pan1;~Ximei_Wang1;~Baixu_Chen2;~Jie_Jiang3;~Mingsheng_Long5", "gender": "M;M;M;M;M;M", "homepage": ";https://junwei-pan.github.io/;https://wxm17.github.io/;https://github.com/tsingcbx99;https://baike.baidu.com/item/%E8%92%8B%E6%9D%B0/58674740;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": ";210/6440;89/8876;279/4076;32/7018-15.html;74/9023", "google_scholar": "Cbinj9QAAAAJ;sUaBkFkAAAAJ;WmOCCVgAAAAJ;;;_MjXpXkAAAAJ", "orcid": ";0009-0003-2697-7012;;;0000-0001-9658-5127;0000-0002-5412-9120", "linkedin": ";;;;;", "or_profile": "~Xingzhuo_Guo1;~Junwei_Pan1;~Ximei_Wang1;~Baixu_Chen2;~jie_jiang3;~Mingsheng_Long2", "aff": "Tsinghua University;Tencent;Tencent;Tsinghua University;Tencent AI Lab;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tencent.com;tencent.com;tsinghua.edu.cn;tencent.com;tsinghua.edu.cn", "position": "PhD student;Researcher;Researcher;MS student;VP;Associate Professor", "bibtex": "@inproceedings{\nguo2024on,\ntitle={On the Embedding Collapse when Scaling up Recommendation Models},\nauthor={Xingzhuo Guo and Junwei Pan and Ximei Wang and Baixu Chen and Jie Jiang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aPVwOAr1aW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2165458, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14239355219505619999&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "tsinghua.edu.cn;tencent.com;tencent.com;tsinghua.edu.cn;tencent.com;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;0;1;0", "aff_unique_norm": "Tsinghua University;Tencent", "aff_unique_dep": ";Tencent Holdings Limited", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.tencent.com", "aff_unique_abbr": "THU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Universal Transfer Theorem for Convex Optimization Algorithms Using Inexact First-order Oracles", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33688", "id": "aPhwhueqjR", "proceeding": "https://proceedings.mlr.press/v235/kerger24a.html", "pdf": "https://openreview.net/pdf?id=aPhwhueqjR", "openreview": "https://openreview.net/forum?id=aPhwhueqjR", "author_site": "Phillip Kerger, Marco Molinaro, Hongyi Jiang, Amitabh Basu", "tldr": "", "abstract": "Given *any* algorithm for convex optimization that uses exact first-order information (i.e., function values and subgradients), we show how to use such an algorithm to solve the problem with access to *inexact* first-order information. This is done in a ``black-box'' manner without knowledge of the internal workings of the algorithm. This complements previous work that considers the performance of specific algorithms like (accelerated) gradient descent with inexact information. In particular, our results apply to a wider range of algorithms beyond variants of gradient descent, e.g., projection-free methods, cutting-plane methods, or any other first-order methods formulated in the future. Further, they also apply to algorithms that handle structured nonconvexities like mixed-integer decision variables.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Phillip Kerger;Marco Molinaro;Hongyi Jiang;Amitabh Basu", "authorids": "~Phillip_Kerger1;~Marco_Molinaro1;~Hongyi_Jiang1;~Amitabh_Basu1", "gender": "M;M;M;M", "homepage": "https://www.inf.puc-rio.br/~mmolinaro;https://scholar.google.com/citations?hl=en&user=Ms8IYg0AAAAJ;;http://phillipkerger.github.io/", "dblp": "88/4732;;;", "google_scholar": ";;;2HbfNloAAAAJ", "orcid": ";;;0000-0002-9938-6067", "linkedin": ";;;", "or_profile": "~Marco_Molinaro1;~Hongyi_Jiang1;~Amitabh_Basu1;~Phillip_Alexander_Kerger1", "aff": "Pontificia Universidade Catolica, Rio de Janeiro, Brazil;Cornell University;Johns Hopkins University;Whiting School of Engineering", "aff_domain": "puc-rio.br;cornell.edu;jhu.edu;engineering.jhu.edu", "position": "Assistant Professor;Postdoc;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nkerger2024a,\ntitle={A Universal Transfer Theorem for Convex Optimization Algorithms Using Inexact First-order Oracles},\nauthor={Phillip Kerger and Marco Molinaro and Hongyi Jiang and Amitabh Basu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aPhwhueqjR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 512065, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jYFPnrtKAjEJ:scholar.google.com/&scioq=A+Universal+Transfer+Theorem+for+Convex+Optimization+Algorithms+Using+Inexact+First-order+Oracles&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "puc-rio.br;cornell.edu;jhu.edu;engineering.jhu.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Pontifical Catholic University of Rio de Janeiro;Cornell University;Johns Hopkins University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.puc-rio.br/;https://www.cornell.edu;https://www.jhu.edu", "aff_unique_abbr": "PUC-Rio;Cornell;JHU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Rio de Janeiro;;Baltimore", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Brazil;United States" }, { "title": "SLAB: Efficient Transformers with Simplified Linear Attention and Progressive Re-parameterized Batch Normalization", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33687", "id": "aQl4xiwVBc", "proceeding": "https://proceedings.mlr.press/v235/guo24a.html", "pdf": "https://openreview.net/pdf?id=aQl4xiwVBc", "openreview": "https://openreview.net/forum?id=aQl4xiwVBc", "author_site": "Jialong Guo, Xinghao Chen, Yehui Tang, Yunhe Wang", "tldr": "", "abstract": "Transformers have become foundational architectures for both natural language and computer vision tasks. However, the high computational cost makes them quite challenging to deploy on resource-constrained devices. This paper investigates the computational bottleneck modules of efficient transformers, *i.e.*, normalization layers and attention modules. LayerNorm is commonly used in transformer architectures but is not computationally friendly due to the statistics calculation during inference. However, replacing LayerNorm with the more efficient BatchNorm in transformers often leads to inferior performance and collapse during training. To address this problem, we propose a novel method named PRepBN to progressively replace LayerNorm with re-parameterized BatchNorm during training. Moreover, we propose a simplified linear attention (SLA) module that is simple yet effective, achieving strong performance. Extensive experiments on image classification as well as object detection demonstrate the effectiveness of our proposed method. For example, our SLAB-Swin obtains $83.6\\%$ top-1 accuracy on ImageNet-1K with $16.2$ms latency, which is $2.4$ms less than that of Flatten-Swin with $0.1\\%$ higher accuracy. We also evaluated our method on the language modeling task and obtained comparable performance and lower latency. Codes are publicly available at https://github.com/xinghaochen/SLAB and https://github.com/mindspore-lab/models/tree/master/research/huawei-noah/SLAB.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jialong Guo;Xinghao Chen;Yehui Tang;Yunhe Wang", "authorids": "~Jialong_Guo1;~Xinghao_Chen1;~Yehui_Tang1;~Yunhe_Wang1", "gender": ";M;M;M", "homepage": ";;;https://www.wangyunhe.site/", "dblp": ";30/4937-1;244/9659;63/8217-1", "google_scholar": "https://scholar.google.com.hk/citations?view_op=list_works;tuGWUVIAAAAJ;TkSZQ6gAAAAJ;https://scholar.google.com.sg/citations?user=isizOkYAAAAJ", "orcid": ";0000-0002-2102-8235;;0000-0002-0142-509X", "linkedin": ";;;", "or_profile": "~Jialong_Guo1;~Xinghao_Chen1;~Yehui_Tang1;~Yunhe_Wang1", "aff": "Huawei Technologies Ltd.;Huawei Noah's Ark Lab;Huawei Technologies Ltd.;Huawei Noah's Ark Lab", "aff_domain": "huawei.com;huawei.com;huawei.com;huawei.com", "position": "Researcher;Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nguo2024slab,\ntitle={{SLAB}: Efficient Transformers with Simplified Linear Attention and Progressive Re-parameterized Batch Normalization},\nauthor={Jialong Guo and Xinghao Chen and Yehui Tang and Yunhe Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aQl4xiwVBc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 993505, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17612775596876683095&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "huawei.com;huawei.com;huawei.com;huawei.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei",
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "UP2ME: Univariate Pre-training to Multivariate Fine-tuning as a General-purpose Framework for Multivariate Time Series Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33686", "id": "aR3uxWlZhX", "proceeding": "https://proceedings.mlr.press/v235/zhang24al.html", "pdf": "https://openreview.net/pdf?id=aR3uxWlZhX", "openreview": "https://openreview.net/forum?id=aR3uxWlZhX", "author_site": "Yunhao Zhang, Liu Minghao, Shengyang Zhou, Junchi Yan", "tldr": "", "abstract": "Despite the success of self-supervised pre-training in texts and images, applying it to multivariate time series (MTS) falls behind tailored methods for tasks like forecasting, imputation and anomaly detection. We propose a general-purpose framework, named UP2ME (**U**nivariate **P**re-training to **M**ultivariate Fin**e**-tuning). It conducts task-agnostic pre-training when downstream tasks are unspecified. Once the task and setting (e.g. forecasting length) are determined, it gives sensible solutions with frozen pre-trained parameters, which has not been achieved before. UP2ME is further refined by fine-tuning. A univariate-to-multivariate paradigm is devised to address the heterogeneity of temporal and cross-channel dependencies. In univariate pre-training, univariate instances with diverse lengths are generated for Masked AutoEncoder (MAE) pre-training, discarding cross-channel dependency. The pre-trained model handles downstream tasks by formulating them into specific mask-reconstruction problems. In multivariate fine-tuning, it constructs a dependency graph among channels using the pre-trained encoder to enhance cross-channel dependency capture. Experiments on eight real-world datasets show its SOTA performance in forecasting and imputation, approaching task-specific performance in anomaly detection. 
Our code is available at https://github.com/Thinklab-SJTU/UP2ME.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunhao Zhang;Minghao Liu;Shengyang Zhou;Junchi Yan", "authorids": "~Yunhao_Zhang1;~Minghao_Liu6;~Shengyang_Zhou1;~Junchi_Yan2", "gender": "M;M;M;M", "homepage": ";https://github.com/Learner209/Learner209.github.io;https://github.com/Arrebol-logos;http://thinklab.sjtu.edu.cn/", "dblp": "10/2569;;271/1855.html;60/7949.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;Ma22V1gAAAAJ;ga230VoAAAAJ", "orcid": ";;;0000-0001-9639-7679", "linkedin": ";;;", "or_profile": "~Yunhao_Zhang1;~Minghao_Liu6;~Shengyang_Zhou1;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Undergrad student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nzhang2024upme,\ntitle={{UP}2{ME}: Univariate Pre-training to Multivariate Fine-tuning as a General-purpose Framework for Multivariate Time Series Analysis},\nauthor={Yunhao Zhang and Minghao Liu and Shengyang Zhou and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aR3uxWlZhX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1734117, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9629621987358229500&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Self-attention Networks Localize When QK-eigenspectrum Concentrates", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33685", "id": "aRZjRj41WQ", "proceeding": "https://proceedings.mlr.press/v235/bao24b.html", "pdf": "https://openreview.net/pdf?id=aRZjRj41WQ", "openreview": "https://openreview.net/forum?id=aRZjRj41WQ", "author_site": "Han Bao, Ryuichiro Hataya, Ryo Karakida", "tldr": "", "abstract": "The self-attention mechanism prevails in modern machine learning. It has an interesting functionality of adaptively selecting tokens from an input sequence by modulating the degree of attention localization, which many researchers speculate is the basis of the powerful model performance but complicates the underlying mechanism of the learning dynamics. In recent years, mainly two arguments have connected attention localization to model performance. One is the rank collapse, where the token embeddings produced by a self-attention block become very similar across different tokens, leading to a less expressive network. The other is the entropy collapse, where the attention probabilities become highly non-uniform and entail low entropy, making the learning dynamics more likely to be trapped in plateaus. These two failure modes may appear to contradict each other because the rank and entropy collapses are associated with uniform and non-uniform attention, respectively. 
To this end, we characterize the notion of attention localization by the eigenspectrum of query-key parameter matrices and reveal that a small eigenspectrum variance leads attention to be localized. Interestingly, the small eigenspectrum variance prevents both rank and entropy collapse, leading to better model expressivity and trainability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Bao;Ryuichiro Hataya;Ryo Karakida", "authorids": "~Han_Bao2;~Ryuichiro_Hataya1;~Ryo_Karakida2", "gender": "M;Unspecified;M", "homepage": "https://hermite.jp/;https://mosko.tokyo;https://sites.google.com/view/ryokarakida/english", "dblp": "120/1444-2;238/1068;", "google_scholar": "MqMzjeMAAAAJ;https://scholar.google.com/citations?view_op=list_works;", "orcid": "0000-0002-4473-2604;;", "linkedin": ";;", "or_profile": "~Han_Bao2;~Ryuichiro_Hataya1;~Ryo_Karakida2", "aff": "Kyoto University, Kyoto University;RIKEN;AIST, National Institute of Advanced Industrial Science and Technology", "aff_domain": "i.kyoto-u.ac.jp;riken.jp;aist.go.jp", "position": "Assistant Professor;Postdoc;Researcher", "bibtex": "@inproceedings{\nbao2024selfattention,\ntitle={Self-attention Networks Localize When {QK}-eigenspectrum Concentrates},\nauthor={Han Bao and Ryuichiro Hataya and Ryo Karakida},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aRZjRj41WQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1553012, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3889944091668113689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "i.kyoto-u.ac.jp;riken.jp;aist.go.jp", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Kyoto University;RIKEN;National Institute of Advanced Industrial Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kyoto-u.ac.jp;https://www.riken.jp;https://www.aist.go.jp", "aff_unique_abbr": "Kyoto U;RIKEN;AIST", "aff_campus_unique_index": "0", "aff_campus_unique": "Kyoto;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "Beyond Regular Grids: Fourier-Based Neural Operators on Arbitrary Domains", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33684", "id": "aVqqoFAavs", "proceeding": "https://proceedings.mlr.press/v235/lingsch24a.html", "pdf": "https://openreview.net/pdf?id=aVqqoFAavs", "openreview": "https://openreview.net/forum?id=aVqqoFAavs", "author_site": "Levi Lingsch, Mike Yan Michelis, Emmanuel de B\u00e9zenac, Sirani M. Perera, Robert Katzschmann, Siddhartha Mishra", "tldr": "", "abstract": "The computational efficiency of many neural operators, widely used for learning solutions of PDEs, relies on the fast Fourier transform (FFT) for performing spectral computations. As the FFT is limited to equispaced (rectangular) grids, this limits the efficiency of such neural operators when applied to problems where the input and output functions need to be processed on general non-equispaced point distributions. Leveraging the observation that a limited set of Fourier (Spectral) modes suffice to provide the required expressivity of a neural operator, we propose a simple method, based on the efficient direct evaluation of the underlying spectral transformation, to extend neural operators to arbitrary domains. 
An efficient implementation of such *direct spectral evaluations* is coupled with existing neural operator models to allow the processing of data on arbitrary non-equispaced distributions of points. With extensive empirical evaluation, we demonstrate that the proposed method allows us to extend neural operators to arbitrary point distributions with significant gains in training speed over baselines, while retaining or improving the accuracy of Fourier neural operators (FNOs) and related neural operators.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Levi E. Lingsch;Mike Yan Michelis;Emmanuel de Bezenac;Sirani M. Perera;Robert K. Katzschmann;Siddhartha Mishra", "authorids": "~Levi_E._Lingsch1;~Mike_Yan_Michelis1;~Emmanuel_de_Bezenac2;~Sirani_M._Perera1;~Robert_K._Katzschmann1;~Siddhartha_Mishra1", "gender": "M;M;M;M;Not Specified;F", "homepage": ";;http://www.sam.math.ethz.ch/;;http://srl.ethz.ch;https://faculty.erau.edu/Sirani.Perera", "dblp": "292/3712;;07/2856.html;304/9412;139/3491;140/7610", "google_scholar": "xxCOii8AAAAJ;https://scholar.google.fr/citations?user=KvZw5gYAAAAJ;FmEqyNcAAAAJ;9pJIGJUAAAAJ;https://scholar.google.ch/citations?hl=en;QgrLwXAAAAAJ", "orcid": ";;;;0000-0001-7143-7259;", "linkedin": "mike-yan-michelis-669774174/;;;levi-l-1996a3151;robertkatzschmann/;sirani-m-perera-89a754b/", "or_profile": "~Mike_Yan_Michelis1;~Emmanuel_de_Bezenac2;~Siddhartha_Mishra1;~Levi_Evan_Lingsch1;~Robert_Kevin_Katzschmann1;~Sirani_Mututhanthrige-Perera1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;Swiss Federal Institute of Technology;ETHZ - ETH Zurich;Swiss Federal Institute of Technology;Embry-Riddle Aeronautical University", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch;erau.edu", "position": "PhD student;Postdoc;Full Professor;MS student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nlingsch2024beyond,\ntitle={Beyond Regular Grids: Fourier-Based Neural Operators on Arbitrary Domains},\nauthor={Levi E. Lingsch and Mike Yan Michelis and Emmanuel de Bezenac and Sirani M. Perera and Robert K. 
Katzschmann and Siddhartha Mishra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aVqqoFAavs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8527126, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5886381280487950715&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 10, "email": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch;erau.edu", "author_num": 6, "aff_unique_index": "0;0;1;0;1;2", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology;Embry-Riddle Aeronautical University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch;https://www.erau.edu", "aff_unique_abbr": "ETHZ;ETH Zurich;ERAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Improving Open-Ended Text Generation via Adaptive Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33683", "id": "aXD94eATtT", "proceeding": "https://proceedings.mlr.press/v235/zhu24d.html", "pdf": "https://openreview.net/pdf?id=aXD94eATtT", "openreview": "https://openreview.net/forum?id=aXD94eATtT", "author_site": "Wenhong Zhu, Hongkun Hao, Zhiwei He, Yiming Ai, Rui Wang", "tldr": "", "abstract": "Current language models decode text token by token according to a probability distribution, and determining the appropriate candidates for the next token is crucial to ensure generation quality. This study introduces adaptive decoding, a mechanism that dynamically empowers language models to ascertain a sensible candidate set during generation. Specifically, we introduce an entropy-based metric called confidence and conceptualize determining the optimal candidate set as a confidence-increasing process. The rationality of including a token in the candidate set is assessed by the increment in confidence. Experimental results reveal that our method balances diversity and coherence well. Human evaluation shows that our method can generate human-preferred text. 
Additionally, our method can potentially improve the reasoning ability of language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenhong Zhu;Hongkun Hao;Zhiwei He;Yiming Ai;Rui Wang", "authorids": "~Wenhong_Zhu1;~Hongkun_Hao1;~Zhiwei_He1;~Yiming_Ai1;~Rui_Wang10", "gender": "M;M;M;;M", "homepage": "https://github.com/zwhong714;https://hongkunhao.github.io/;https://zwhe99.github.io/;https://github.com/rutilel;https://wangruinlp.github.io/", "dblp": ";349/2933;52/6077-2;;w/RuiWang15", "google_scholar": "psCdg8EAAAAJ;a3UulbMAAAAJ;https://scholar.google.com/citations?hl=en;;oTU0v5IAAAAJ", "orcid": ";;0000-0002-4807-0062;;0000-0001-8007-2503", "linkedin": ";hongkun-hao-372090172/;;;", "or_profile": "~Wenhong_Zhu1;~Hongkun_Hao1;~Zhiwei_He1;~Yiming_Ai1;~Rui_Wang7", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;MS student;PhD student;MS student;Associate Professor", "bibtex": "@inproceedings{\nzhu2024improving,\ntitle={Improving Open-Ended Text Generation via Adaptive Decoding},\nauthor={Wenhong Zhu and Hongkun Hao and Zhiwei He and Yiming Ai and Rui Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aXD94eATtT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2264809, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7332707036991134323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Predicting Lagrangian Multipliers for Mixed Integer Linear Programs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33682", "id": "aZnZOqUOHq", "proceeding": "https://proceedings.mlr.press/v235/demelas24a.html", "pdf": "https://openreview.net/pdf?id=aZnZOqUOHq", "openreview": "https://openreview.net/forum?id=aZnZOqUOHq", "author_site": "Francesco Demelas, Joseph Roux, Mathieu Lacroix, Axel Parmentier", "tldr": "", "abstract": "Lagrangian Relaxation stands among the most efficient approaches for solving Mixed Integer Linear Programs (MILPs) with difficult constraints. Given any duals for these constraints, called Lagrangian Multipliers (LMs), it returns a bound on the optimal value of the MILP, and Lagrangian methods seek the LMs giving the best such bound. But these methods generally rely on iterative algorithms resembling gradient descent to maximize the concave piecewise linear dual function: the computational burden grows quickly with the number of relaxed constraints. We introduce a deep learning approach that bypasses the descent, effectively amortizing per instance optimization. A probabilistic encoder based on a graph neural network computes, given a MILP instance and its Continuous Relaxation (CR) solution, high-dimensional representations of relaxed constraints, which are turned into LMs by a decoder. 
We train the encoder and the decoder jointly by directly optimizing the bound obtained from the predicted multipliers. Our method is applicable to any problem with a compact MILP formulation, and to any Lagrangian Relaxation providing a tighter bound than CR. Experiments on two widely known problems, Multi-Commodity Network Design and Generalized Assignment, show that our approach closes up to 85% of the gap between the continuous relaxation and the best Lagrangian bound, and provides a high-quality warm-start for descent-based Lagrangian methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesco Demelas;Joseph Le Roux;Mathieu Lacroix;Axel Parmentier", "authorids": "~Francesco_Demelas1;~Joseph_Le_Roux1;~Mathieu_Lacroix1;~Axel_Parmentier1", "gender": ";M;M;M", "homepage": ";https://www.lipn.fr/~leroux;https://lipn.univ-paris13.fr/~lacroix/;https://cermics.enpc.fr/~parmenta/", "dblp": ";25/5993;84/8724;150/6250", "google_scholar": ";WVUs6rEAAAAJ;;https://scholar.google.fr/citations?hl=fr", "orcid": "0000-0003-1888-3182;;0000-0001-8385-3890;0000-0003-1762-4947", "linkedin": ";;;axel-parmentier-466548148/", "or_profile": "~Francesco_Demelas1;~Joseph_Le_Roux1;~Mathieu_Lacroix1;~Axel_Parmentier1", "aff": "University Paris 13, Universit\u00e9 Paris Nord (Paris XIII);Universit\u00e9 Paris 13;Universit\u00e9 Paris Nord (Paris XIII);Ecole Nationale des Ponts et Chausees", "aff_domain": "lipn.univ-paris13.fr;univ-paris13.fr;univ-paris13.fr;enpc.fr", "position": "PhD student;Associate Professor;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\ndemelas2024predicting,\ntitle={Predicting Lagrangian Multipliers for Mixed Integer Linear Programs},\nauthor={Francesco Demelas and Joseph Le Roux and Mathieu Lacroix and Axel Parmentier},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aZnZOqUOHq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 455502, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=698020942188700280&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "lipn.univ-paris13.fr;univ-paris13.fr;univ-paris13.fr;enpc.fr", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University Paris 13;Universit\u00e9 Paris 13;Universit\u00e9 Paris Nord;Ecole Nationale des Ponts et Chaussees", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.univ-paris13.fr;https://www.univ-paris13.fr;https://www.univ-paris13.fr;https://www.enpc.fr", "aff_unique_abbr": "UP13;UP13;UP13;ENPC", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Paris;;Paris XIII", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Socialized Learning: Making Each Other Better Through Multi-Agent Collaboration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33681", "id": "aaeJpJw5Ur", "proceeding": "https://proceedings.mlr.press/v235/yao24d.html", "pdf": "https://openreview.net/pdf?id=aaeJpJw5Ur", "openreview": "https://openreview.net/forum?id=aaeJpJw5Ur", "author_site": "Xinjie Yao, Yu Wang, Pengfei Zhu, Wanyu LIN, Li Jialu, Weihao Li, Qinghua Hu", "tldr": "", "abstract": "Learning new knowledge frequently occurs in our dynamically changing world, e.g., humans culturally evolve by continuously acquiring new abilities to sustain their survival, leveraging collective intelligence rather than a 
large number of individual attempts. The effective learning paradigm during cultural evolution is termed socialized learning (SL). Consequently, a straightforward question arises: Can multi-agent systems acquire more new abilities like humans? In contrast to most existing methods that address continual learning and multi-agent collaboration, our emphasis lies in a more challenging problem: we prioritize the knowledge in the original expert classes, and as we adeptly learn new ones, the accuracy in the original expert classes stays superior among all in a directional manner. Inspired by population genetics and cognitive science, leading to unique and complete development, we propose Multi-Agent Socialized Collaboration (MASC), which achieves SL through interactions among multiple agents. Specifically, we introduce collective collaboration and reciprocal altruism modules, organizing collaborative behaviors, promoting information sharing, and facilitating learning and knowledge interaction among individuals. We demonstrate the effectiveness of multi-agent collaboration in an extensive empirical study. Our code will be publicly available at https://github.com/yxjdarren/SL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinjie Yao;Yu Wang;Pengfei Zhu;Wanyu Lin;Jialu Li;Weihao Li;Qinghua Hu", "authorids": "~Xinjie_Yao1;~Yu_Wang33;~Pengfei_Zhu1;~Wanyu_Lin1;~Jialu_Li4;~Weihao_Li3;~Qinghua_Hu1", "gender": "M;M;M;F;F;M;M", "homepage": "https://yxjdarren.github.io/;https://wangyutju.github.io/;http://aiskyeye.com/;https://wanyu-lin.github.io;;;http://cic.tju.edu.cn/faculty/huqinghua/index.html", "dblp": "254/2736;02/5889-106;40/6172-1.html;152/1714;32/11008;;", "google_scholar": "tScuhLkAAAAJ;;https://scholar.google.com/citations?hl=zh-TW;vgLANV0AAAAJ;;;TVSNq_wAAAAJ", "orcid": "0000-0001-5495-5345;;;;0000-0002-6504-8625;0009-0002-2173-3847;0000-0001-7765-8095", "linkedin": ";;;;;%E7%BB%B4%E6%B5%A9-%E6%9D%8E-a93a10298/;", "or_profile": "~Xinjie_Yao1;~Yu_Wang33;~Pengfei_Zhu1;~Wanyu_Lin1;~Jialu_Li4;~Weihao_Li3;~Qinghua_Hu1", "aff": "Tianjin University;Tianjin University;Tianjin University;The Hong Kong Polytechnic University;Tianjin University;Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn;tju.edu.cn;polyu.edu.hk;tju.edu.cn;tju.edu.cn;tju.edu.cn", "position": "PhD student;Associate Professor;Full Professor;Assistant Professor;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nyao2024socialized,\ntitle={Socialized Learning: Making Each Other Better Through Multi-Agent Collaboration},\nauthor={Xinjie Yao and Yu Wang and Pengfei Zhu and Wanyu Lin and Jialu Li and Weihao Li and Qinghua Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aaeJpJw5Ur}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4737484, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17609960126684556221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tju.edu.cn;tju.edu.cn;tju.edu.cn;polyu.edu.hk;tju.edu.cn;tju.edu.cn;tju.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "Tianjin University;Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "http://www.tju.edu.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "TJU;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", 
"aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Graph Geometry-Preserving Autoencoders", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33680", "id": "acTLXagzqd", "proceeding": "https://proceedings.mlr.press/v235/lim24a.html", "pdf": "https://openreview.net/pdf?id=acTLXagzqd", "openreview": "https://openreview.net/forum?id=acTLXagzqd", "author_site": "Jungbin Lim, Jihwan Kim, Yonghyeon Lee, Cheongjae Jang, Frank Chongwoo Park", "tldr": "", "abstract": "When using an autoencoder to learn the low-dimensional manifold of high-dimensional data, it is crucial to find the latent representations that preserve the geometry of the data manifold. However, most existing studies assume a Euclidean nature for the high-dimensional data space, which is arbitrary and often does not precisely reflect the underlying semantic or domain-specific attributes of the data. In this paper, we propose a novel autoencoder regularization framework based on the premise that the geometry of the data manifold can often be better captured with a well-designed similarity graph associated with data points. Given such a graph, we utilize a Riemannian geometric distortion measure as a regularizer to preserve the geometry derived from the graph Laplacian and make it suitable for larger-scale autoencoder training. Through extensive experiments, we show that our method outperforms existing state-of-the-art geometry-preserving and graph-based autoencoders with respect to learning accurate latent structures that preserve the graph geometry, and is particularly effective in learning dynamics in the latent space. Code is available at https://github.com/JungbinLim/GGAE-public.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jungbin Lim;Jihwan Kim;Yonghyeon Lee;Cheongjae Jang;Frank C. Park", "authorids": "~Jungbin_Lim1;~Jihwan_Kim2;~Yonghyeon_Lee2;~Cheongjae_Jang1;~Frank_C._Park1", "gender": ";M;M;;M", "homepage": "https://sites.google.com/robotics.snu.ac.kr/fcp/;http://robot.snu.ac.kr/;https://www.gabe-yhlee.com;;http://robotics.snu.ac.kr", "dblp": ";;182/6796;148/4946;p/FrankChongwooPark", "google_scholar": ";;;https://scholar.google.co.kr/citations?user=VXAyVXYAAAAJ;u-h3PJIAAAAJ", "orcid": ";;;0000-0001-6029-4125;0000-0002-0293-6975", "linkedin": ";;;;", "or_profile": "~Jungbin_Lim1;~Jihwan_Kim2;~Yonghyeon_Lee2;~Cheongjae_Jang1;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University;Korea Institute for Advanced Study;Hanyang University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;kias.re.kr;hanyang.ac.kr;snu.ac.kr", "position": "MS student;PhD student;Postdoc;Researcher;Full Professor", "bibtex": "@inproceedings{\nlim2024graph,\ntitle={Graph Geometry-Preserving Autoencoders},\nauthor={Jungbin Lim and Jihwan Kim and Yonghyeon Lee and Cheongjae Jang and Frank C. 
Park},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=acTLXagzqd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5802072, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1812240439829825411&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "snu.ac.kr;snu.ac.kr;kias.re.kr;hanyang.ac.kr;snu.ac.kr", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Seoul National University;Korea Institute for Advanced Study;Hanyang University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.snu.ac.kr;http://www.kaist.edu;https://www.hanyang.ac.kr", "aff_unique_abbr": "SNU;KIAS;HYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Why Do You Grok? A Theoretical Analysis on Grokking Modular Addition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33679", "id": "ad5I6No9G1", "proceeding": "https://proceedings.mlr.press/v235/mohamadi24a.html", "pdf": "https://openreview.net/pdf?id=ad5I6No9G1", "openreview": "https://openreview.net/forum?id=ad5I6No9G1", "author_site": "Mohamad Amin Mohamadi, Zhiyuan Li, Lei Wu, Danica J Sutherland", "tldr": "", "abstract": "We present a theoretical explanation of the \u201cgrokking\u201d phenomenon (Power et al., 2022), where a model generalizes long after overfitting, for the originally-studied problem of modular addition. First, we show that early in gradient descent, so that the \u201ckernel regime\u201d approximately holds, no permutation-equivariant model can achieve small population error on modular addition unless it sees at least a constant fraction of all possible data points. Eventually, however, models escape the kernel regime. We show that one-hidden-layer quadratic networks that achieve zero training loss with bounded $\\ell_\\infty$ norm generalize well with substantially fewer training points, and further show such networks exist and can be found by gradient descent with small $\\ell_\\infty$ regularization. We further provide empirical evidence that these networks leave the kernel regime only after initially overfitting. Taken together, our results strongly support the case for grokking as a consequence of the transition from kernel-like behavior to limiting behavior of gradient descent on deep networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohamad Amin Mohamadi;Zhiyuan Li;Lei Wu;Danica J. 
Sutherland", "authorids": "~Mohamad_Amin_Mohamadi1;~Zhiyuan_Li2;~Lei_Wu1;~Danica_J._Sutherland1", "gender": "M;M;M;F", "homepage": ";https://zhiyuanli.ttic.edu;https://leiwu0.github.io/;http://www.djsutherland.ml", "dblp": "323/6299;l/ZhiyuanLi;;92/10966", "google_scholar": ";https://scholar.google.com/citations?hl=en;CMweeYcAAAAJ;https://scholar.google.co.uk/citations?user=uO_NqicAAAAJ", "orcid": ";;;0000-0002-1525-3532", "linkedin": "mohamad-amin-mohamadi-b4196b89/;;;", "or_profile": "~Mohamad_Amin_Mohamadi1;~Zhiyuan_Li2;~Lei_Wu1;~Danica_J._Sutherland2", "aff": "Toyota Technological Institute at Chicago;Toyota Technological Institute at Chicago;Peking University;University of British Columbia", "aff_domain": "ttic.edu;ttic.edu;math.pku.edu.cn;cs.ubc.ca", "position": "PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nmohamadi2024why,\ntitle={Why Do You Grok? A Theoretical Analysis on Grokking Modular Addition},\nauthor={Mohamad Amin Mohamadi and Zhiyuan Li and Lei Wu and Danica J. Sutherland},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ad5I6No9G1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 809589, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=77021859622142169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ttic.edu;ttic.edu;math.pku.edu.cn;cs.ubc.ca", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Toyota Technological Institute at Chicago;Peking University;University of British Columbia", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tti-chicago.org;http://www.pku.edu.cn;https://www.ubc.ca", "aff_unique_abbr": "TTI Chicago;Peking U;UBC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;1;2", "aff_country_unique": "United States;China;Canada" }, { "title": "Accelerated Policy Gradient: On the Convergence Rates of the Nesterov Momentum for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33678", "id": "aeXRBnLoPP", "proceeding": "https://proceedings.mlr.press/v235/chen24t.html", "pdf": "https://openreview.net/pdf?id=aeXRBnLoPP", "openreview": "https://openreview.net/forum?id=aeXRBnLoPP", "author_site": "Yen-Ju Chen, Nai-Chieh Huang, Ching-pei Lee, Ping-Chun Hsieh", "tldr": "", "abstract": "Various acceleration approaches for Policy Gradient (PG) have been analyzed within the realm of Reinforcement Learning (RL). However, the theoretical understanding of the widely used momentum-based acceleration method on PG remains largely open. In response to this gap, we adapt the celebrated Nesterov's accelerated gradient (NAG) method to policy optimization in RL, termed *Accelerated Policy Gradient* (APG). To demonstrate the potential of APG in achieving fast convergence, we formally prove that with the true gradient and under the softmax policy parametrization, APG converges to an optimal policy at rates: (i) $\\tilde{O}(1/t^2)$ with nearly constant step sizes; (ii) $O(e^{-ct})$ with time-varying step sizes. To the best of our knowledge, this is the first characterization of the convergence rates of NAG in the context of RL. 
Notably, our analysis relies on one interesting finding: Regardless of the parameter initialization, APG ends up entering a locally nearly-concave regime within finitely many iterations, where it can significantly benefit from the momentum. Through numerical validation and experiments on the Atari 2600 benchmarks, we confirm that APG exhibits an $\\tilde{O}(1/t^2)$ rate with nearly constant step sizes and a linear convergence rate with time-varying step sizes, significantly improving convergence over the standard PG.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yen-Ju Chen;Nai-Chieh Huang;Ching-pei Lee;Ping-Chun Hsieh", "authorids": "~Yen-Ju_Chen1;~Nai-Chieh_Huang1;~Ching-pei_Lee2;~Ping-Chun_Hsieh1", "gender": "M;M;Unspecified;M", "homepage": ";https://pinghsieh.github.io/;http://leepei.github.io;", "dblp": ";163/7352;;59/1760", "google_scholar": "https://scholar.google.com.tw/citations?hl=zh-TW;ix38JgoAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;;", "linkedin": "https://www.linkedin.com/mwlite/in/nai-chieh-huang-244351207;;;Mru0222/", "or_profile": "~Nai-Chieh_Huang1;~Ping-Chun_Hsieh1;~Ching-Pei_Lee1;~Yen_Ju_Chen1", "aff": "National Yang-Ming Chiao-Tung University;National Yang Ming Chiao Tung University;Institute of Statistical Mathematics, Japan;National Chiao Tung University", "aff_domain": "nycu.edu.tw;nycu.edu.tw;ism.ac.jp;nctu.edu.tw", "position": "Undergrad student;Associate Professor;Associate Professor;MS student", "bibtex": "@inproceedings{\nchen2024accelerated,\ntitle={Accelerated Policy Gradient: On the Convergence Rates of the Nesterov Momentum for Reinforcement Learning},\nauthor={Yen-Ju Chen and Nai-Chieh Huang and Ching-pei Lee and Ping-Chun Hsieh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aeXRBnLoPP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1531839, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18346162769593969617&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "nycu.edu.tw;nycu.edu.tw;ism.ac.jp;nctu.edu.tw", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "National Yang-Ming Chiao-Tung University;National Yang Ming Chiao Tung University;Institute of Statistical Mathematics;National Chiao Tung University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nycu.edu.tw;https://www.nycu.edu.tw;https://www.ism.ac.jp;https://www.nctu.edu.tw", "aff_unique_abbr": "NYCU;NYCU;ISM;NCTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;Japan" }, { "title": "Gaussian Processes on Cellular Complexes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33677", "id": "afnyJfQddk", "proceeding": "https://proceedings.mlr.press/v235/alain24a.html", "pdf": "https://openreview.net/pdf?id=afnyJfQddk", "openreview": "https://openreview.net/forum?id=afnyJfQddk", "author_site": "Mathieu Alain, So Takao, Brooks Paige, Marc Deisenroth", "tldr": "", "abstract": "In recent years, there has been considerable interest in developing machine learning models on graphs to account for topological inductive biases. In particular, recent attention has been given to Gaussian processes on such structures since they can additionally account for uncertainty. 
However, graphs are limited to modelling relations between two vertices. In this paper, we go beyond this dyadic setting and consider polyadic relations that include interactions between vertices, edges and one of their generalisations, known as cells. Specifically, we propose Gaussian processes on cellular complexes, a generalisation of graphs that captures interactions between these higher-order cells. One of our key contributions is the derivation of two novel kernels, one that generalises the graph Mat\u00e9rn kernel and one that additionally mixes information of different cell types.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mathieu Alain;So Takao;Brooks Paige;Marc Peter Deisenroth", "authorids": "~Mathieu_Alain1;~So_Takao1;~Brooks_Paige1;~Marc_Peter_Deisenroth1", "gender": "M;M;M;M", "homepage": ";;https://tbrx.github.io;https://deisenroth.cc", "dblp": ";247/1437;https://dblp.uni-trier.de/pers/p/Paige:Brooks;76/5043", "google_scholar": "FJu2i-gAAAAJ;https://scholar.google.co.uk/citations?\u2026;JrFJmx0AAAAJ;https://scholar.google.co.uk/citations?user=GDabimYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Mathieu_Alain1;~So_Takao1;~Brooks_Paige1;~Marc_Deisenroth1", "aff": "University College London;California Institute of Technology;University College London;Alan Turing Institute", "aff_domain": "ucl.ac.uk;caltech.edu;ucl.ac.uk;turing.ac.uk", "position": "PhD student;Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nalain2024gaussian,\ntitle={Gaussian Processes on Cellular Complexes},\nauthor={Mathieu Alain and So Takao and Brooks Paige and Marc Peter Deisenroth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=afnyJfQddk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1905993, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=976814021279976145&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ucl.ac.uk;caltech.edu;ucl.ac.uk;turing.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University College London;California Institute of Technology;Alan Turing Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.caltech.edu;https://www.turing.ac.uk", "aff_unique_abbr": "UCL;Caltech;ATI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Transformers Implement Functional Gradient Descent to Learn Non-Linear Functions In Context", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33676", "id": "ah1BlQcLv4", "proceeding": "https://proceedings.mlr.press/v235/cheng24a.html", "pdf": "https://openreview.net/pdf?id=ah1BlQcLv4", "openreview": "https://openreview.net/forum?id=ah1BlQcLv4", "author_site": "Xiang Cheng, Yuxin Chen, Suvrit Sra", "tldr": "", "abstract": "Many neural network architectures are known to be Turing Complete, and can thus, in principle implement arbitrary algorithms. However, Transformers are unique in that they can implement gradient-based learning algorithms *under simple parameter configurations*. 
This paper provides theoretical and empirical evidence that (non-linear) Transformers naturally learn to implement gradient descent *in function space*, which in turn enables them to learn non-linear functions in context. Our results apply to a broad class of combinations of non-linear architectures and non-linear in-context learning tasks. Additionally, we show that the optimal choice of non-linear activation depends in a natural way on the class of functions that need to be learned.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiang Cheng;Yuxin Chen;Suvrit Sra", "authorids": "~Xiang_Cheng1;~Yuxin_Chen8;~Suvrit_Sra1", "gender": "M;F;", "homepage": "https://sites.google.com/berkeley.edu/xiangcheng/home;https://yuxinxinchen.github.io/;https://optml.mit.edu", "dblp": "29/1059-6;;90/930", "google_scholar": "-WJinlEAAAAJ;YmXg5xMAAAAJ;eyCw9goAAAAJ", "orcid": ";0000-0003-0661-6132;", "linkedin": ";;", "or_profile": "~Xiang_Cheng1;~Yuxin_Chen8;~Suvrit_Sra1", "aff": ";;Massachusetts Institute of Technology", "aff_domain": ";;mit.edu", "position": ";;Associate Professor", "bibtex": "@inproceedings{\ncheng2024transformers,\ntitle={Transformers Implement Functional Gradient Descent to Learn Non-Linear Functions In Context},\nauthor={Xiang Cheng and Yuxin Chen and Suvrit Sra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ah1BlQcLv4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 990679, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12694642765193101372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;mit.edu", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Outlier Weighed Layerwise Sparsity (OWL): A Missing Secret Sauce for Pruning LLMs to High Sparsity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33675", "id": "ahEm3l2P6w", "proceeding": "https://proceedings.mlr.press/v235/yin24e.html", "pdf": "https://openreview.net/pdf?id=ahEm3l2P6w", "openreview": "https://openreview.net/forum?id=ahEm3l2P6w", "author_site": "Lu Yin, You Wu, Zhenyu Zhang, Cheng-Yu Hsieh, Yaqing Wang, Yiling Jia, Gen Li, Ajay Jaiswal, Mykola Pechenizkiy, Yi Liang, Michael Bendersky, Zhangyang \u201cAtlas\u201d Wang, Shiwei Liu", "tldr": "", "abstract": "Large Language Models (LLMs), renowned for their remarkable performance across diverse domains, present a challenge due to their colossal model size when it comes to practical deployment. In response to this challenge, efforts have been directed toward the application of traditional network pruning techniques to LLMs, uncovering that a massive number of parameters can be pruned in one shot without hurting performance. Building upon insights gained from pre-LLM models, particularly BERT-level language models, prevailing LLM pruning strategies have consistently adhered to the practice of uniformly pruning all layers at equivalent sparsity levels, resulting in robust performance. However, this observation stands in contrast to the prevailing trends observed in the field of vision models, where non-uniform layerwise sparsity typically yields substantially improved results. 
To elucidate the underlying reasons for this disparity, we conduct a comprehensive analysis of the distribution of token features within LLMs. In doing so, we discover a strong correlation with the emergence of outliers, defined as features exhibiting significantly greater magnitudes compared to their counterparts in feature dimensions. Inspired by this finding, we introduce a novel LLM pruning methodology that incorporates a tailored set of **non-uniform layerwise sparsity ratios** specifically designed for LLM pruning, termed as **O**utlier **W**eighed **L**ayerwise sparsity (**OWL**). The sparsity ratio of OWL is directly proportional to the outlier ratio observed within each layer, facilitating a more effective alignment between layerwise weight sparsity and outlier ratios. Our empirical evaluation, conducted across the LLaMA-V1/V2, Vicuna, OPT, and Mistral, spanning various benchmarks, demonstrates the distinct advantages offered by OWL over previous methods. For instance, OWL exhibits a remarkable performance gain, surpassing the state-of-the-art Wanda and SparseGPT by **61.22** and **6.80** perplexity at a high sparsity level of 70%, respectively, while delivering **2.6$\\times$** end-to-end inference speed-up in the DeepSparse inference engine. Code is available at https://github.com/luuyin/OWL.git.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lu Yin;You Wu;Zhenyu Zhang;Cheng-Yu Hsieh;Yaqing Wang;Yiling Jia;Gen Li;AJAY KUMAR JAISWAL;Mykola Pechenizkiy;Yi Liang;Michael Bendersky;Zhangyang Wang;Shiwei Liu", "authorids": "~Lu_Yin1;~You_Wu1;~Zhenyu_Zhang4;~Cheng-Yu_Hsieh1;~Yaqing_Wang1;~Yiling_Jia1;~Gen_Li4;~AJAY_KUMAR_JAISWAL1;~Mykola_Pechenizkiy1;~Yi_Liang1;~Michael_Bendersky1;~Zhangyang_Wang1;~Shiwei_Liu2", "gender": ";M;M;M;M;F;M;M;M;M;;M;M", "homepage": "https://luuyin.com/;;https://zhenyu.gallery;https://chengyuhsieh.github.io/;https://yaqingwang.github.io/;https://yilingjia.github.io;https://coulsonlee.github.io;https://ajay1994.github.io/;http://www.win.tue.nl/~mpechen/;https://research.google/people/108265/;http://bendersky.github.io/;https://vita-group.github.io;https://shiweiliuiiiiiii.github.io/", "dblp": "87/2528-6;16/8675-1;01/1844-15;40/4421;147/1393;218/7475;28/538-12;30/9707;37/4649;;80/4305;119/4026;234/8697-3.html", "google_scholar": "G4Xe1NkAAAAJ;https://scholar.google.com/citations?hl=en;ZLyJRxoAAAAJ;WXX6ZwwAAAAJ;_Rfg2CAAAAAJ;6-pZivMAAAAJ;;I783HxYAAAAJ;https://scholar.google.com.tw/citations?user=F0uFT_kAAAAJ;9vQ7gbgAAAAJ;C9mxM5IAAAAJ;pxFyKAIAAAAJ;73IbXtsAAAAJ", "orcid": ";;;;;;;;0000-0003-4955-0743;0000-0002-6622-8919;0000-0002-2941-6240;;", "linkedin": ";;zhenyu-allen-zhang-a9b1391a3/;;;yiling-jia-793b2228/;;;mpechen/;;;;", "or_profile": "~Lu_Yin1;~You_Wu1;~Zhenyu_Zhang4;~Cheng-Yu_Hsieh1;~Yaqing_Wang1;~Yiling_Jia1;~Gen_Li4;~AJAY_KUMAR_JAISWAL1;~Mykola_Pechenizkiy1;~Yi_Liang1;~Michael_Bendersky1;~Zhangyang_Wang1;~Shiwei_Liu2", "aff": "University of Aberdeen;Google;University of Texas at Austin;Google;Google DeepMind;Google;Clemson University;University of Texas, Austin;Eindhoven University of Technology;Research, Google;Google;University of Texas at Austin;University of Oxford", "aff_domain": "abdn.ac.uk;google.com;utexas.edu;google.com;google.com;google.com;clemson.edu;utexas.edu;tue.nl;research.google.com;google.com;utexas.edu;ox.ac.uk", "position": "Assistant Professor;Researcher;PhD student;Intern;Research Scientist;Research Scientist;PhD student;PhD student;Full Professor;Researcher;Researcher;Associate Professor;Postdoc", 
"bibtex": "@inproceedings{\nyin2024outlier,\ntitle={Outlier Weighed Layerwise Sparsity ({OWL}): A Missing Secret Sauce for Pruning {LLM}s to High Sparsity},\nauthor={Lu Yin and You Wu and Zhenyu Zhang and Cheng-Yu Hsieh and Yaqing Wang and Yiling Jia and Gen Li and AJAY KUMAR JAISWAL and Mykola Pechenizkiy and Yi Liang and Michael Bendersky and Zhangyang Wang and Shiwei Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ahEm3l2P6w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 409940, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6082026189179939341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "abdn.ac.uk;google.com;utexas.edu;google.com;google.com;google.com;clemson.edu;utexas.edu;tue.nl;research.google.com;google.com;utexas.edu;ox.ac.uk", "author_num": 13, "aff_unique_index": "0;1;2;1;1;1;3;2;4;1;1;2;5", "aff_unique_norm": "University of Aberdeen;Google;University of Texas at Austin;Clemson University;Eindhoven University of Technology;University of Oxford", "aff_unique_dep": ";Google;;;;", "aff_unique_url": "https://www.abdn.ac.uk;https://www.google.com;https://www.utexas.edu;https://www.clemson.edu;https://www.tue.nl;https://www.ox.ac.uk", "aff_unique_abbr": "Aberdeen;Google;UT Austin;Clemson;TU/e;Oxford", "aff_campus_unique_index": "1;2;1;1;2;1;1;2", "aff_campus_unique": ";Mountain View;Austin", "aff_country_unique_index": "0;1;1;1;0;1;1;1;2;1;1;1;0", "aff_country_unique": "United Kingdom;United States;Netherlands" }, { "title": "Exploiting Human-AI Dependence for Learning to Defer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33674", "id": "aiz79FxjaI", "proceeding": "https://proceedings.mlr.press/v235/wei24a.html", "pdf": "https://openreview.net/pdf?id=aiz79FxjaI", "openreview": "https://openreview.net/forum?id=aiz79FxjaI", "author_site": "Zixi Wei, Yuzhou Cao, Lei Feng", "tldr": "", "abstract": "The learning to defer (L2D) framework allows models to defer their decisions to human experts. For L2D, the Bayes optimality is the basic requirement of theoretical guarantees for the design of consistent surrogate loss functions, which requires the minimizer (i.e., learned classifier) by the surrogate loss to be the Bayes optimality. However, we find that the original form of Bayes optimality fails to consider the dependence between the model and the expert, and such a dependence could be further exploited to design a better consistent loss for L2D. In this paper, we provide a new formulation for the Bayes optimality called dependent Bayes optimality, which reveals the dependence pattern in determining whether to defer. Based on the dependent Bayes optimality, we further present a deferral principle for L2D. Following the guidance of the deferral principle, we propose a novel consistent surrogate loss. 
Comprehensive experimental results on both synthetic and real-world datasets demonstrate the superiority of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixi Wei;Yuzhou Cao;Lei Feng", "authorids": "~Zixi_Wei1;~Yuzhou_Cao1;~Lei_Feng1", "gender": "M;M;M", "homepage": ";https://yzcao-nkg.github.io/;https://lfeng1995.github.io/", "dblp": "247/3651.html;256/5052;76/847-6", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ", "orcid": ";;0000-0003-2839-5799", "linkedin": ";;", "or_profile": "~Zixi_Wei1;~Yuzhou_Cao1;~Lei_Feng1", "aff": "Chongqing University;Nanyang Technological University;Singapore University of Technology and Design", "aff_domain": "cqu.edu.cn;ntu.edu;sutd.edu.sg", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwei2024exploiting,\ntitle={Exploiting Human-{AI} Dependence for Learning to Defer},\nauthor={Zixi Wei and Yuzhou Cao and Lei Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aiz79FxjaI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 423918, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16252219556035363458&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cqu.edu.cn;ntu.edu;sutd.edu.sg", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Chongqing University;Nanyang Technological University;Singapore University of Technology and Design", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cqu.edu.cn;https://www.ntu.edu.sg;https://www.sutd.edu.sg", "aff_unique_abbr": "CQU;NTU;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;Singapore" }, { "title": "Multi-layer Rehearsal Feature Augmentation for Class-Incremental Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33673", "id": "aksdU1KOpT", "proceeding": "https://proceedings.mlr.press/v235/zheng24p.html", "pdf": "https://openreview.net/pdf?id=aksdU1KOpT", "openreview": "https://openreview.net/forum?id=aksdU1KOpT", "author_site": "Bowen Zheng, Da-Wei Zhou, Han-Jia Ye, De-Chuan Zhan", "tldr": "", "abstract": "Class-Incremental Learning (CIL) seeks to learn new concepts without forgetting previously learned knowledge. To achieve this, rehearsal-based methods keep a replay memory consisting of a small number of trained samples from previous tasks. However, recent studies show that rehearsal-based methods are prone to overfitting on rehearsal samples, resulting in poor generalization on previous tasks. Since the generalization error is bounded by the margin on the training dataset, in this paper, we study the generalization by the all-layer margin on deep neural networks to alleviate catastrophic forgetting. Specifically, we show that the average margin of the rehearsal samples is smaller during incremental learning. To acquire a larger margin and thus better generalization on rehearsal samples, we propose Multi-layer Rehearsal Feature Augmentation (MRFA) in rehearsal training to optimize the all-layer margin on rehearsal samples. The proposed method augments the features of rehearsal samples at each layer by a gradient ascent step of the current model with respect to the feature.
With such augmentations on layer features, the margin on rehearsal samples is larger, and rehearsal samples are able to provide more information for refining the decision boundary during incremental learning, thus alleviating catastrophic forgetting. Extensive experiments show the effectiveness of MRFA on various CIL scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Zheng;Da-Wei Zhou;Han-Jia Ye;De-Chuan Zhan", "authorids": "~Bowen_Zheng4;~Da-Wei_Zhou1;~Han-Jia_Ye1;~De-Chuan_Zhan1", "gender": ";;M;M", "homepage": ";http://www.lamda.nju.edu.cn/zhoudw/;http://www.lamda.nju.edu.cn/yehj;http://www.lamda.nju.edu.cn/zhandc/", "dblp": ";120/6109;165/3014;74/498", "google_scholar": ";kMNaR-YAAAAJ;mgOYhtoAAAAJ;mYJf4TcAAAAJ", "orcid": ";;;0000-0002-3533-2078", "linkedin": ";;;", "or_profile": "~Bowen_Zheng4;~Da-Wei_Zhou1;~Han-Jia_Ye1;~De-Chuan_Zhan1", "aff": ";Nanjing University;Nanjing University;Nanjing University", "aff_domain": ";nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": ";PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzheng2024multilayer,\ntitle={Multi-layer Rehearsal Feature Augmentation for Class-Incremental Learning},\nauthor={Bowen Zheng and Da-Wei Zhou and Han-Jia Ye and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aksdU1KOpT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2021053, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7930564027883152382&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "FedLMT: Tackling System Heterogeneity of Federated Learning via Low-Rank Model Training with Theoretical Guarantees", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33672", "id": "akyElNlUVA", "proceeding": "https://proceedings.mlr.press/v235/liu24ch.html", "pdf": "https://openreview.net/pdf?id=akyElNlUVA", "openreview": "https://openreview.net/forum?id=akyElNlUVA", "author_site": "Jiahao Liu, Yipeng Zhou, Di Wu, Miao Hu, Mohsen Guizani, Quan Sheng", "tldr": "", "abstract": "Federated learning (FL) is an emerging machine learning paradigm for preserving data privacy. However, diverse client hardware often has varying computation resources. Such system heterogeneity limits the participation of resource-constrained clients in FL, and hence degrades the global model accuracy. To enable heterogeneous clients to participate in and contribute to FL training, previous works tackle this problem by assigning customized sub-models to individual clients with model pruning, distillation, or low-rank based techniques. Unfortunately, the global model trained by these methods still encounters performance degradation due to heterogeneous sub-model aggregation. Besides, most methods are heuristic-based and lack convergence analysis.
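For the MRFA record above, a minimal sketch of the gradient-ascent feature augmentation (the split of the network into the layers above the chosen feature, and the step size, are our assumptions):

```python
import torch

def mrfa_augment(feature, label, upper_layers, criterion, step=0.05):
    # Perturb a rehearsal sample's intermediate-layer feature in the
    # direction that increases the current loss, so that training on the
    # augmented feature enlarges the margin at that layer.
    feature = feature.detach().requires_grad_(True)
    loss = criterion(upper_layers(feature), label)   # loss of layers above
    grad, = torch.autograd.grad(loss, feature)
    return (feature + step * grad).detach()          # one ascent step
```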
In this work, we propose the FedLMT framework to bridge the performance gap, by assigning clients with a homogeneous pre-factorized low-rank model to substantially reduce resource consumption without conducting heterogeneous aggregation. We theoretically prove that the convergence of the low-rank model can guarantee the convergence of the original full model. To further meet clients' personalized resource needs, we extend FedLMT to pFedLMT, by separating model parameters into common and custom ones. Finally, extensive experiments are conducted to verify our theoretical analysis and show that FedLMT and pFedLMT outperform other baselines with much less communication and computation costs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiahao Liu;Yipeng Zhou;Di Wu;Miao Hu;Mohsen Guizani;Quan Z. Sheng", "authorids": "~Jiahao_Liu8;~Yipeng_Zhou1;~Di_Wu21;~Miao_Hu2;mguizani@ieee.org;~Quan_Z._Sheng1", "gender": "M;M;M;M;;M", "homepage": ";https://sites.google.com/site/yipenghomepage/;http://netlabsysu.org/dwu/;;;http://web.science.mq.edu.au/~qsheng/", "dblp": ";78/6594.html;52/328-1;74/8189-1;;s/QuanZSheng", "google_scholar": ";https://scholar.google.ca/citations?user=uv95RgUAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;5qp88KUAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-8608-381X;;0000-0002-9433-7725;0000-0002-1518-002X;;0000-0002-3326-4147", "linkedin": ";;;;;", "or_profile": "~Jiahao_Liu8;~Yipeng_Zhou1;~Di_Wu21;~Miao_Hu2;mguizani@ieee.org;~Quan_Z._Sheng1", "aff": "Sun Yat-Sen University;Macquarie University;SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY;;Macquarie University", "aff_domain": "mail2.sysu.edu.cn;mq.edu.au;sysu.edu.cn;sysu.edu.cn;;mq.edu.au", "position": "MS student;Associate Professor;Full Professor;Associate Professor;;Full Professor", "bibtex": "@inproceedings{\nliu2024fedlmt,\ntitle={Fed{LMT}: Tackling System Heterogeneity of Federated Learning via Low-Rank Model Training with Theoretical Guarantees},\nauthor={Jiahao Liu and Yipeng Zhou and Di Wu and Miao Hu and Mohsen Guizani and Quan Z. 
Sheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=akyElNlUVA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 908962, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=33103640499510064&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "mail2.sysu.edu.cn;mq.edu.au;sysu.edu.cn;sysu.edu.cn;;mq.edu.au", "author_num": 6, "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Sun Yat-sen University;Macquarie University", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.mq.edu.au", "aff_unique_abbr": "SYSU;MQ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;Australia" }, { "title": "Position: Why Tabular Foundation Models Should Be a Research Priority", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33671", "id": "amRSBdZlw9", "proceeding": "https://proceedings.mlr.press/v235/van-breugel24a.html", "pdf": "https://openreview.net/pdf?id=amRSBdZlw9", "openreview": "https://openreview.net/forum?id=amRSBdZlw9", "author_site": "Boris van Breugel, M van der Schaar", "tldr": "", "abstract": "Recent text and image foundation models are incredibly impressive, and these models are attracting an ever-increasing portion of research resources. In this position piece we aim to shift the ML research community's priorities ever so slightly to a different modality: tabular data. Tabular data is the dominant modality in many fields, yet it is given hardly any research attention and significantly lags behind in terms of scale and power. **We believe the time is now to start developing tabular foundation models**, or what we coin a _Large Tabular Model_ (LTM). LTMs could revolutionise the way science and ML use tabular data: not as single datasets that are analyzed in a vacuum, but contextualized with respect to related datasets. The potential impact is far-reaching: from few-shot tabular models to automating data science; from out-of-distribution synthetic data to empowering multidisciplinary scientific discovery. 
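As an aside on the FedLMT record above: a minimal sketch of a pre-factorized low-rank layer of the kind it assigns homogeneously to every client (the module layout and rank handling are our assumptions, not the paper's exact parameterization).

```python
import torch.nn as nn

class LowRankLinear(nn.Module):
    # A dense weight W (d_out x d_in) is replaced up front by the product
    # V @ U with a small shared rank r, cutting both client-side compute
    # and communication; every client trains the same factorized model.
    def __init__(self, d_in, d_out, rank):
        super().__init__()
        self.U = nn.Linear(d_in, rank, bias=False)
        self.V = nn.Linear(rank, d_out)

    def forward(self, x):
        return self.V(self.U(x))
```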
We intend to excite reflections on the modalities we study, and convince some researchers to study Large Tabular Models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boris van Breugel;Mihaela van der Schaar", "authorids": "~Boris_van_Breugel2;~Mihaela_van_der_Schaar2", "gender": ";F", "homepage": ";https://www.vanderschaar-lab.com", "dblp": "284/0835;", "google_scholar": "https://scholar.google.com/citations?hl=en;DZ3S--MAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Boris_van_Breugel2;~Mihaela_van_der_Schaar2", "aff": "University of Cambridge;University of California, Los Angeles", "aff_domain": "cam.ac.uk;ucla.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nbreugel2024position,\ntitle={Position: Why Tabular Foundation Models Should Be a Research Priority},\nauthor={Boris van Breugel and Mihaela van der Schaar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=amRSBdZlw9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 572295, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10202186614829787571&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cam.ac.uk;ucla.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "EvoluNet: Advancing Dynamic Non-IID Transfer Learning on Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33670", "id": "anM1M5aoM8", "proceeding": "https://proceedings.mlr.press/v235/wang24aw.html", "pdf": "https://openreview.net/pdf?id=anM1M5aoM8", "openreview": "https://openreview.net/forum?id=anM1M5aoM8", "author_site": "Haohui Wang, Yuzhen Mao, Yujun Yan, Yaoqing Yang, Jianhui Sun, Kevin Choi, Balaji Veeramani, Alison Hu, Edward Bowen, Tyler Cody, Dawei Zhou", "tldr": "", "abstract": "Non-IID transfer learning on graphs is crucial in many high-stakes domains. The majority of existing works assume stationary distribution for both source and target domains. However, real-world graphs are intrinsically dynamic, presenting challenges in terms of domain evolution and dynamic discrepancy between source and target domains. To bridge the gap, we shift the problem to the dynamic setting and pose the question: given the *label-rich* source graphs and the *label-scarce* target graphs both observed in previous $T$ timestamps, how can we effectively characterize the evolving domain discrepancy and optimize the generalization performance of the target domain at the incoming $T+1$ timestamp? To answer it, we propose a generalization bound for *dynamic non-IID transfer learning on graphs*, which implies the generalization performance is dominated by domain evolution and domain discrepancy between source and target graphs. Inspired by the theoretical results, we introduce a novel generic framework named EvoluNet. 
It leverages a transformer-based temporal encoding module to model temporal information of the evolving domains and then uses a dynamic domain unification module to efficiently learn domain-invariant representations across the source and target domains. Finally, EvoluNet outperforms the state-of-the-art models by up to 12.1%, demonstrating its effectiveness in transferring knowledge from dynamic source graphs to dynamic target graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haohui Wang;Yuzhen Mao;Yujun Yan;Yaoqing Yang;Jianhui Sun;Kevin Choi;Balaji Veeramani;Alison Hu;Edward Bowen;Tyler Cody;Dawei Zhou", "authorids": "~Haohui_Wang1;~Yuzhen_Mao2;~Yujun_Yan1;~Yaoqing_Yang1;~Jianhui_Sun1;~Kevin_Choi1;~Balaji_Veeramani1;~Alison_Hu1;~Edward_Bowen1;~Tyler_Cody1;~Dawei_Zhou1", "gender": "F;M;F;M;;M;M;F;;M;M", "homepage": "https://github.com/wanghh7;https://github.com/yuzhenmao;https://sites.google.com/umich.edu/yujunyan/home;https://sites.google.com/site/yangyaoqingcmu/;https://jsycsjh.github.io/;;https://scholar.google.com/citations?user=XHLs8eoAAAAJ&hl=en;;;https://tcody.net/;https://sites.google.com/view/dawei-zhou/home?authuser=0", "dblp": "294/8598;336/2249;219/1736;04/4176;207/9364;23/3132.html;;;;248/2813.html;39/3130-3.html", "google_scholar": "ijh64HMAAAAJ;9wKn1A0AAAAJ;5TQUP58AAAAJ;LYvugWgAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;XHLs8eoAAAAJ;;;zUWyADQAAAAJ;8dakqOgAAAAJ", "orcid": "0009-0000-7391-096X;;0000-0003-3776-4293;0000-0001-9908-5531;;;0009-0002-5263-1210;;0000-0003-3669-6444;;0000-0002-7065-2990", "linkedin": ";;;;jianhui-sun-76a722a6/;;balaji-veeramani-9a161b9/;alison-hu-7134ba33/;;;dawei-zhou-31035668/", "or_profile": "~Haohui_Wang1;~Yuzhen_Mao2;~Yujun_Yan1;~Yaoqing_Yang1;~Jianhui_Sun1;~Kevin_Choi1;~Balaji_Veeramani1;~Alison_Hu1;~Edward_Bowen1;~Tyler_Cody1;~Dawei_Zhou1", "aff": "Virginia Polytechnic Institute and State University;Simon Fraser University;Dartmouth College;Dartmouth College;University of Virginia;;Deloitte & Touche LLP;;Deloitte Consulting;Virginia Polytechnic Institute and State University;Virginia Polytechnic Institute and State University", "aff_domain": "vt.edu;sfu.ca;dartmouth.edu;dartmouth.edu;virginia.edu;;deloitte.com;;deloitte.com;vt.edu;vt.edu", "position": "PhD student;Researcher;Assistant Professor;Assistant Professor;PhD student;;Associate VP;;Researcher;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024evolunet,\ntitle={EvoluNet: Advancing Dynamic Non-{IID} Transfer Learning on Graphs},\nauthor={Haohui Wang and Yuzhen Mao and Yujun Yan and Yaoqing Yang and Jianhui Sun and Kevin Choi and Balaji Veeramani and Alison Hu and Edward Bowen and Tyler Cody and Dawei Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=anM1M5aoM8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1608785, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18235591079850509583&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "vt.edu;sfu.ca;dartmouth.edu;dartmouth.edu;virginia.edu;;deloitte.com;;deloitte.com;vt.edu;vt.edu", "author_num": 11, "aff_unique_index": "0;1;2;2;3;4;5;0;0", "aff_unique_norm": "Virginia Tech;Simon Fraser University;Dartmouth College;University of Virginia;Deloitte & Touche;Deloitte Consulting", "aff_unique_dep": ";;;;;", "aff_unique_url": 
"https://www.vt.edu;https://www.sfu.ca;https://www.dartmouth.edu;https://www.virginia.edu;https://www.deloitte.com;https://www.deloitte.com", "aff_unique_abbr": "VT;SFU;Dartmouth;UVA;Deloitte;Deloitte", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0", "aff_country_unique": "United States;Canada" }, { "title": "Toward Adaptive Reasoning in Large Language Models with Thought Rollback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33669", "id": "aoAPOOtN9E", "proceeding": "https://proceedings.mlr.press/v235/chen24y.html", "pdf": "https://openreview.net/pdf?id=aoAPOOtN9E", "openreview": "https://openreview.net/forum?id=aoAPOOtN9E", "author_site": "Sijia Chen, Baochun Li", "tldr": "", "abstract": "Large language models (LLMs) have been routinely used to solve various tasks using step-by-step reasoning. However, the structure of intermediate reasoning steps, or *thoughts*, is rigid and unidirectional, such as chains, trees, or acyclic-directed graphs. Consequently, the resulting inflexible and forward-only reasoning may not address challenging tasks and fail when the LLM frequently gives false responses, i.e., hallucinations. This paper proposes a new reasoning framework, called *Thought Rollback* (TR), allowing LLMs to adaptively build thought structure while maintaining effective reasoning toward problem-solving under hallucinations. The core mechanism of TR is *rolling back thoughts*, which allows LLMs to perform error analysis on thoughts, and thus roll back to any previously mistaken thought for revision. Subsequently, by including such trial-and-error in the prompt to guide the LLM, each rollback leads to one more reliable reasoning path. Therefore, starting with a simple prompt without human annotations, LLM with TR adaptively and gradually explores thoughts for a correct solution. Comprehensive experiments on mathematical problems and multi-task reasoning demonstrate the state-of-the-art performance of TR in terms of problem-solving rate and interaction cost. For instance, the solving rate of GPT-4 with TR outperforms the current best by $9\\%$ on the MATH dataset. 
The source code is available under the folder *examples/ThoughtRollback* of https://github.com/iQua/llmpebase.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sijia Chen;Baochun Li", "authorids": "~Sijia_Chen2;~Baochun_Li1", "gender": "M;M", "homepage": "https://csjdeveloper.github.io/sjiachen.github.io/;http://iqua.ece.toronto.edu/bli/", "dblp": "241/8721;l/BaochunLi", "google_scholar": "https://scholar.google.ca/citations?user=QWGJWDMAAAAJ;https://scholar.google.com.tw/citations?user=rkb3_FgAAAAJ", "orcid": ";0000-0003-2404-0974", "linkedin": ";https://linkedin.com/in/baochun", "or_profile": "~Sijia_Chen2;~Baochun_Li1", "aff": "Toronto University;University of Toronto", "aff_domain": "utoronto.ca;toronto.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nchen2024toward,\ntitle={Toward Adaptive Reasoning in Large Language Models with Thought Rollback},\nauthor={Sijia Chen and Baochun Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aoAPOOtN9E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2177594, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8348903273534791275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "utoronto.ca;toronto.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Privacy-Preserving Embedding via Look-up Table Evaluation with Fully Homomorphic Encryption", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33668", "id": "apxON2uH4N", "proceeding": "https://proceedings.mlr.press/v235/kim24ab.html", "pdf": "https://openreview.net/pdf?id=apxON2uH4N", "openreview": "https://openreview.net/forum?id=apxON2uH4N", "author_site": "Jae-yun Kim, Saerom Park, Joohee Lee, Jung Hee Cheon", "tldr": "", "abstract": "In privacy-preserving machine learning (PPML), homomorphic encryption (HE) has emerged as a significant primitive, allowing the use of machine learning (ML) models while protecting the confidentiality of input data. Although extensive research has been conducted on implementing PPML with HE by developing the efficient construction of private counterparts to ML models, the efficient HE implementation of embedding layers for token inputs such as words remains inadequately addressed. Thus, our study proposes an efficient algorithm for privacy-preserving embedding via look-up table evaluation with HE (HELUT) by developing an encrypted indicator function (EIF) that assures high precision with the use of the approximate HE scheme (CKKS). Based on the proposed EIF, we propose the CodedHELUT algorithm to facilitate an encrypted embedding layer for the first time. CodedHELUT leverages coded inputs to improve overall efficiency and optimize memory usage. Our comprehensive empirical analysis encompasses both synthetic tables and real-world large-scale word embedding models.
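A plaintext analogue of the look-up-table evaluation the HELUT abstract above describes; under CKKS, the exact indicator below would be replaced by the paper's encrypted indicator function (EIF), a high-precision polynomial approximation.

```python
import numpy as np

def lut_lookup_via_indicator(index, table):
    # Branch-free table look-up: output = sum_i 1[index == i] * table[i].
    # This is the form that carries over to HE, where comparisons cannot
    # be evaluated directly and the indicator must be approximated.
    table = np.asarray(table, dtype=float)
    onehot = np.array([float(index == i) for i in range(len(table))])
    return onehot @ table
```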
The CodedHELUT algorithm achieves an amortized evaluation time of 0.018-0.242s for GloVe6B50d, 0.104-1.298s for GloVe42300d, and 0.262-3.283s for GPT-2 and BERT embedding layers, while maintaining high precision (16 bits).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jae-yun Kim;Saerom Park;Joohee Lee;Jung Hee Cheon", "authorids": "~Jae-yun_Kim1;~Saerom_Park1;~Joohee_Lee2;~Jung_Hee_Cheon2", "gender": "Non-Binary;F;F;M", "homepage": ";;https://sungshincrypto.github.io/;https://www.math.snu.ac.kr/~jhcheon", "dblp": ";209/8156;146/3549.html;64/5207.html", "google_scholar": "https://scholar.google.com/citations?hl=ko;AqiXxNkAAAAJ;https://scholar.google.co.kr/citations?user=CsZaTc8AAAAJ;KlTc3U4AAAAJ", "orcid": ";;0000-0002-1901-2410;", "linkedin": ";;joohee-lee-26baa5184/;", "or_profile": "~Jae-yun_Kim1;~Saerom_Park1;~Joohee_Lee2;~Jung_Hee_Cheon2", "aff": "Seoul National University;Ulsan National Institute of Science and Technology;Sungshin Women's University;Seoul National University", "aff_domain": "snu.ac.kr;unist.ac.kr;sungshin.ac.kr;snu.ac.kr", "position": "PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nkim2024privacypreserving,\ntitle={Privacy-Preserving Embedding via Look-up Table Evaluation with Fully Homomorphic Encryption},\nauthor={Jae-yun Kim and Saerom Park and Joohee Lee and Jung Hee Cheon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=apxON2uH4N}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 556002, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16246946938426965664&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "snu.ac.kr;unist.ac.kr;sungshin.ac.kr;snu.ac.kr", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Seoul National University;Ulsan National Institute of Science and Technology;Sungshin Women's University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.snu.ac.kr;https://www.unist.ac.kr;http://www.sungshin.ac.kr", "aff_unique_abbr": "SNU;UNIST;SWU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "DPN: Decoupling Partition and Navigation for Neural Solvers of Min-max Vehicle Routing Problems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33667", "id": "ar174skI9u", "proceeding": "https://proceedings.mlr.press/v235/zheng24m.html", "pdf": "https://openreview.net/pdf?id=ar174skI9u", "openreview": "https://openreview.net/forum?id=ar174skI9u", "author_site": "zhi Zheng, Shunyu Yao, Zhenkun Wang, Tong Xialiang, Mingxuan Yuan, Ke Tang", "tldr": "", "abstract": "The min-max vehicle routing problem (min-max VRP) traverses all given customers by assigning several routes and aims to minimize the length of the longest route. Recently, reinforcement learning (RL)-based sequential planning methods have exhibited advantages in solving efficiency and optimality. However, these methods fail to exploit the problem-specific properties in learning representations, resulting in less effective features for decoding optimal routes. This paper considers the sequential planning process of min-max VRPs as two coupled optimization tasks: customer partition for different routes and customer navigation in each route (i.e., partition and navigation).
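For reference, the min-max VRP objective the DPN record above targets, as a tiny plain implementation (single-depot convention assumed):

```python
def minmax_route_cost(routes, dist, depot=0):
    # Min-max VRP objective: every vehicle's route starts and ends at the
    # depot, and the cost is the length of the longest route.
    def length(route):
        stops = [depot] + list(route) + [depot]
        return sum(dist[a][b] for a, b in zip(stops, stops[1:]))
    return max(length(r) for r in routes)
```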
To effectively process min-max VRP instances, we present a novel attention-based Partition-and-Navigation encoder (P&N Encoder) that learns distinct embeddings for partition and navigation. Furthermore, we utilize an inherent symmetry in decoding routes and develop an effective agent-permutation-symmetric (APS) loss function. Experimental results demonstrate that the proposed Decoupling-Partition-Navigation (DPN) method significantly surpasses existing learning-based methods in both single-depot and multi-depot min-max VRPs. Our code is available at", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhi Zheng;Shunyu Yao;Zhenkun Wang;Tong Xialiang;Mingxuan Yuan;Ke Tang", "authorids": "~Zhi_Zheng2;~Shunyu_Yao3;~Zhenkun_Wang1;~Tong_Xialiang2;~Mingxuan_Yuan1;~Ke_Tang2", "gender": "M;;M;M;M;M", "homepage": "https://zz1358m.github.io/zhizheng.github.io//;;https://faculty.sustech.edu.cn/wangzk3/en/;;;https://faculty.sustech.edu.cn/tangk3/", "dblp": ";;96/9114;https://dblp.uni-trier.de/pid/245/5977.html;74/2356;https://dblp.uni-trier.de/pers/hd/t/Tang:Ke.html", "google_scholar": "nxJ4qM4AAAAJ;;https://scholar.google.com.sg/citations?user=r9ezy2gAAAAJ;;https://scholar.google.com/citations?hl=en;mzLHFbAAAAAJ", "orcid": "0009-0005-8785-8177;;0000-0003-1152-6780;;0000-0002-2236-8784;0000-0002-6236-2002", "linkedin": ";;;;;", "or_profile": "~Zhi_Zheng2;~Shunyu_Yao3;~Zhenkun_Wang1;~Tong_Xialiang2;~Mingxuan_Yuan1;~Ke_Tang2", "aff": "Southern University of Science and Technology;;Southern University of Science and Technology;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Southern University of Science and Technology", "aff_domain": "sustc.edu.cn;;sustech.edu.cn;huawei.com;huawei.com;sustech.edu.cn", "position": "Undergrad student;;Assistant Professor;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nzheng2024dpn,\ntitle={{DPN}: Decoupling Partition and Navigation for Neural Solvers of Min-max Vehicle Routing Problems},\nauthor={Zhi Zheng and Shunyu Yao and Zhenkun Wang and Tong Xialiang and Mingxuan Yuan and Ke Tang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ar174skI9u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=247594182377515100&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "sustc.edu.cn;;sustech.edu.cn;huawei.com;huawei.com;sustech.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Southern University of Science and Technology;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.sustech.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SUSTech;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Position: Opportunities Exist for Machine Learning in Magnetic Fusion Energy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33666", "id": "arwP5FA2dO", "proceeding": "https://proceedings.mlr.press/v235/spangher24a.html", "pdf": "https://openreview.net/pdf?id=arwP5FA2dO", "openreview": "https://openreview.net/forum?id=arwP5FA2dO", "author_site": "Lucas Spangher, Allen Wang, Andrew Maris, Myles Stapelberg, Viraj Mehta, Alex Saperstein, Stephen Lane-Walsh, Akshata Moharir, Alessandro Pau, Cristina Rea", "tldr": 
"", "abstract": "Magnetic confinement fusion may one day provide reliable, carbon-free energy, but the field currently faces technical hurdles. In this position paper, we highlight six key research challenges in the field of fusion energy that we believe should be research priorities for the Machine Learning (ML) community because they are especially ripe for ML applications: (1) disruption prediction, (2) simulation and dynamics modeling (3) resolving partially observed data, (4) improving controls, (5) guiding experiments with optimal design, and (6) enhancing materials discovery. For each problem, we give background, review past ML work, suggest features of future models, and list challenges and idiosyncrasies facing ML development. We also discuss ongoing efforts to update the fusion data ecosystem and identify opportunities further down the line that will be enabled as fusion and its data infrastructure advance. It is our position that fusion energy offers especially exciting opportunities for ML practitioners to impact decarbonization and the future of energy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lucas Spangher;Allen M. Wang;Andrew Maris;Myles Stapelberg;Viraj Mehta;Alex Saperstein;Stephen Lane-Walsh;Akshata Kishore Moharir;Alessandro Pau;Cristina Rea", "authorids": "~Lucas_Spangher1;~Allen_M._Wang1;maris@psfc.mit.edu;myless@psfc.mit.edu;~Viraj_Mehta1;saperstein@psfc.mit.edu;slwalsh@psfc.mit.edu;~Akshata_Kishore_Moharir2;alessandro.pau@epfl.ch;crea@psfc.mit.edu", "gender": "M;M;;;M;;;F;;", "homepage": ";;;;http://virajm.com;;;;;", "dblp": "267/8772;;;;https://dblp.org/pers/m/Mehta:Viraj.html;;;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;hq9BUBYAAAAJ;;;4pHjHBkAAAAJ;;;jHAEGDEAAAAJ;;", "orcid": ";;;;0000-0002-2021-9718;;;;;", "linkedin": "lucasspangher;;;;virajrmehta/;;;akshatakm/;;", "or_profile": "~Lucas_Spangher1;~Allen_M._Wang1;maris@psfc.mit.edu;myless@psfc.mit.edu;~Viraj_Mehta1;saperstein@psfc.mit.edu;slwalsh@psfc.mit.edu;~Akshata_Kishore_Moharir2;alessandro.pau@epfl.ch;crea@psfc.mit.edu", "aff": "University of California, Berkeley;Massachusetts Institute of Technology;;;Carnegie Mellon University;;;Microsoft;;", "aff_domain": "berkeley.edu;mit.edu;;;cmu.edu;;;microsoft.com;;", "position": "PhD student;PhD student;;;PhD student;;;Researcher;;", "bibtex": "@inproceedings{\nspangher2024position,\ntitle={Position: Opportunities Exist for Machine Learning in Magnetic Fusion Energy},\nauthor={Lucas Spangher and Allen M. 
Wang and Andrew Maris and Myles Stapelberg and Viraj Mehta and Alex Saperstein and Stephen Lane-Walsh and Akshata Kishore Moharir and Alessandro Pau and Cristina Rea},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=arwP5FA2dO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1493361, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12627643436266459914&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 5, "email": "berkeley.edu;mit.edu;;;cmu.edu;;;microsoft.com;;", "author_num": 10, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology;Carnegie Mellon University;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu;https://www.cmu.edu;https://www.microsoft.com", "aff_unique_abbr": "UC Berkeley;MIT;CMU;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Language Models Represent Beliefs of Self and Others", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33665", "id": "asJTE8EBjg", "proceeding": "https://proceedings.mlr.press/v235/zhu24o.html", "pdf": "https://openreview.net/pdf?id=asJTE8EBjg", "openreview": "https://openreview.net/forum?id=asJTE8EBjg", "author_site": "Wentao Zhu, Zhining Zhang, Yizhou Wang", "tldr": "", "abstract": "Understanding and attributing mental states, known as Theory of Mind (ToM), emerges as a fundamental capability for human social reasoning. While Large Language Models (LLMs) appear to possess certain ToM abilities, the mechanisms underlying these capabilities remain elusive. In this study, we discover that it is possible to linearly decode the belief status from the perspectives of various agents through neural activations of language models, indicating the existence of internal representations of self and others' beliefs. By manipulating these representations, we observe dramatic changes in the models' ToM performance, underscoring their pivotal role in the social reasoning process. 
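A minimal sketch of the linear-decoding experiment in the belief-representation record above; the activation source, label scheme, and probe family are our assumptions.

```python
from sklearn.linear_model import LogisticRegression

def fit_belief_probe(activations, belief_labels):
    # Linear probe on hidden activations (e.g., residual-stream states at
    # a chosen layer) predicting an agent's belief status. The learned
    # weight vector gives a candidate "belief direction" that could then
    # be used for the manipulation experiments the abstract describes.
    probe = LogisticRegression(max_iter=1000).fit(activations, belief_labels)
    return probe  # probe.coef_ is the decoding direction
```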
Additionally, our findings extend to diverse social reasoning tasks that involve different causal inference patterns, suggesting the potential generalizability of these representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wentao Zhu;Zhining Zhang;Yizhou Wang", "authorids": "~Wentao_Zhu3;~Zhining_Zhang1;~Yizhou_Wang1", "gender": "M;M;M", "homepage": "https://wentao.live;https://github.com/zzn-nzz;https://cfcs.pku.edu.cn/wangyizhou/", "dblp": "117/0354-4;371/5001.html;71/3387-1", "google_scholar": "https://scholar.google.com/citations?hl=en;;831z_VcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Wentao_Zhu3;~Zhining_Zhang1;~Yizhou_Wang1", "aff": "Peking University;Johns Hopkins University;Peking University", "aff_domain": "pku.edu.cn;jhu.edu;pku.edu.cn", "position": "PhD student;Intern;Full Professor", "bibtex": "@inproceedings{\nzhu2024language,\ntitle={Language Models Represent Beliefs of Self and Others},\nauthor={Wentao Zhu and Zhining Zhang and Yizhou Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=asJTE8EBjg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4186447, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10403364992964413204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;jhu.edu;pku.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;Johns Hopkins University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.jhu.edu", "aff_unique_abbr": "Peking U;JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "Towards Theoretical Understandings of Self-Consuming Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33664", "id": "aw6L8sB2Ts", "proceeding": "https://proceedings.mlr.press/v235/fu24i.html", "pdf": "https://openreview.net/pdf?id=aw6L8sB2Ts", "openreview": "https://openreview.net/forum?id=aw6L8sB2Ts", "author_site": "Shi Fu, Sen Zhang, Yingjie Wang, Xinmei Tian, Dacheng Tao", "tldr": "", "abstract": "This paper tackles the emerging challenge of training generative models within a self-consuming loop, wherein successive generations of models are recursively trained on mixtures of real and synthetic data from previous generations. We construct a theoretical framework to rigorously evaluate how this training procedure impacts the data distributions learned by future models, including parametric and non-parametric models. Specifically, we derive bounds on the total variation (TV) distance between the synthetic data distributions produced by future models and the original real data distribution under various mixed training scenarios for diffusion models with a one-hidden-layer neural network score function. Our analysis demonstrates that this distance can be effectively controlled under the condition that mixed training dataset sizes or proportions of real data are large enough. Interestingly, we further unveil a phase transition induced by expanding synthetic data amounts, proving theoretically that while the TV distance exhibits an initial ascent, it declines beyond a threshold point. 
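A toy version of the self-consuming training loop analysed in the record above; `train` and `sample` are placeholder callables, and the fixed-fraction mixing scheme is our simplification of the paper's mixed-training scenarios.

```python
import random

def self_consuming_loop(train, sample, real_data, generations=5, real_frac=0.5):
    # Each generation is trained on a mixture of real data and synthetic
    # data sampled from the previous generation's model; the real-data
    # proportion is the quantity whose size controls the TV-distance bounds.
    model = train(real_data)
    n = len(real_data)
    for _ in range(generations):
        synthetic = sample(model, n)
        k = int(real_frac * n)
        mixture = random.sample(real_data, k) + synthetic[: n - k]
        model = train(mixture)
    return model
```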
Finally, we present results for kernel density estimation, delivering nuanced insights such as the impact of mixed data training on error propagation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shi Fu;Sen Zhang;Yingjie Wang;Xinmei Tian;Dacheng Tao", "authorids": "~Shi_Fu1;~Sen_Zhang3;~Yingjie_Wang1;~Xinmei_Tian1;~Dacheng_Tao1", "gender": "M;M;M;F;", "homepage": "http:// home.ustc.edu.cn/~fs311;https://github.com/SenZHANG-GitHub;https://www.researchgate.net/profile/Yingjie-Wang-37;https://faculty.ustc.edu.cn/tianxinmei1/zh_CN/index.htm;", "dblp": ";57/6221-6;33/6297-7;03/5204-1;", "google_scholar": ";-bJJNV0AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?hl=zh-CN;", "orcid": ";;;0000-0002-5952-8753;", "linkedin": ";;;;", "or_profile": "~Shi_Fu1;~Sen_Zhang3;~Yingjie_Wang1;~Xinmei_Tian1;~Dacheng_Tao1", "aff": "University of Science and Technology of China;University of Sydney, University of Sydney;Nanyang Technological University;University of Science and Technology of China;", "aff_domain": "ustc.edu.cn;sydney.edu.au;ntu.edu.sg;ustc.edu.cn;", "position": "MS student;Postdoc;Postdoc;Full Professor;", "bibtex": "@inproceedings{\nfu2024towards,\ntitle={Towards Theoretical Understandings of Self-Consuming Generative Models},\nauthor={Shi Fu and Sen Zhang and Yingjie Wang and Xinmei Tian and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aw6L8sB2Ts}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 601857, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7054360106989507226&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "ustc.edu.cn;sydney.edu.au;ntu.edu.sg;ustc.edu.cn;", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Science and Technology of China;University of Sydney;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.sydney.edu.au;https://www.ntu.edu.sg", "aff_unique_abbr": "USTC;USYD;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "China;Australia;Singapore" }, { "title": "Language-guided Skill Learning with Temporal Variational Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33663", "id": "awo5H10K6v", "proceeding": "https://proceedings.mlr.press/v235/fu24e.html", "pdf": "https://openreview.net/pdf?id=awo5H10K6v", "openreview": "https://openreview.net/forum?id=awo5H10K6v", "author_site": "Haotian Fu, Pratyusha Sharma, Elias Stengel-Eskin, George Konidaris, Nicolas Le Roux, Marc-Alexandre C\u00f4t\u00e9, Xingdi Yuan", "tldr": "", "abstract": "We present an algorithm for skill discovery from expert demonstrations. The algorithm first utilizes Large Language Models (LLMs) to propose an initial segmentation of the trajectories. Following that, a hierarchical variational inference framework incorporates the LLM-generated segmentation information to discover reusable skills by merging trajectory segments. To further control the trade-off between compression and reusability, we introduce a novel auxiliary objective based on the Minimum Description Length principle that helps guide this skill discovery process. 
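A toy MDL-style objective in the spirit of the auxiliary loss described in the skill-discovery record above (our simplification; the paper's codelength is defined over its variational model):

```python
def description_length(skill_library, trajectories, encode):
    # Total description cost = cost of the skill library itself plus the
    # cost of encoding every trajectory as a sequence of skills. Reusable
    # skills shrink the second term; a leaner library shrinks the first,
    # which is the compression/reusability trade-off the objective controls.
    library_cost = sum(len(skill) for skill in skill_library)
    data_cost = sum(len(encode(t, skill_library)) for t in trajectories)
    return library_cost + data_cost
```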
Our results demonstrate that agents equipped with our method are able to discover skills that help accelerate learning and outperform baseline skill learning approaches on new long-horizon tasks in BabyAI, a grid world navigation environment, as well as ALFRED, a household simulation environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haotian Fu;Pratyusha Sharma;Elias Stengel-Eskin;George Konidaris;Nicolas Le Roux;Marc-Alexandre C\u00f4t\u00e9;Xingdi Yuan", "authorids": "~Haotian_Fu3;~Pratyusha_Sharma1;~Elias_Stengel-Eskin1;~George_Konidaris1;~Nicolas_Le_Roux2;~Marc-Alexandre_C\u00f4t\u00e92;~Xingdi_Yuan2", "gender": "M;F;M;M;M;M;M", "homepage": "https://haotianfu.me/;https://pratyushasharma.github.io/;https://esteng.github.io;http://cs.brown.edu/people/gdk/;https://www.microsoft.com/en-us/research/people/macote;http://nicolas.le-roux.name;https://xingdi-eric-yuan.github.io/", "dblp": "237/9681;228/7904;212/6138;56/6762;118/9636;http://dblp.uni-trier.de/pers/hd/r/Roux:Nicolas_Le;40/10147", "google_scholar": "btaP96wAAAAJ;RGiCLUgAAAAJ;gr_ZVSQAAAAJ;9UERvVEAAAAJ;https://scholar.google.ca/citations?user=L83CE5gAAAAJ;https://scholar.google.fr/citations?user=LmKtwk8AAAAJ;hYfE-B8AAAAJ", "orcid": ";;0000-0002-6689-505X;;;;", "linkedin": ";;;;;;", "or_profile": "~Haotian_Fu3;~Pratyusha_Sharma1;~Elias_Stengel-Eskin1;~George_Konidaris1;~Marc-Alexandre_Cote1;~Nicolas_Le_Roux1;~Eric_Yuan1", "aff": "Brown University;Massachusetts Institute of Technology;University of North Carolina at Chapel Hill;Brown University;Microsoft;Microsoft;Microsoft Research", "aff_domain": "brown.edu;mit.edu;cs.unc.edu;brown.edu;microsoft.com;microsoft.com;microsoft.com", "position": "PhD student;PhD student;Postdoc;Assistant Professor;Principal Researcher;Researcher;Senior Researcher", "bibtex": "@inproceedings{\nfu2024languageguided,\ntitle={Language-guided Skill Learning with Temporal Variational Inference},\nauthor={Haotian Fu and Pratyusha Sharma and Elias Stengel-Eskin and George Konidaris and Nicolas Le Roux and Marc-Alexandre C{\\^o}t{\\'e} and Xingdi Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=awo5H10K6v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8508957, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10329603078920113943&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "brown.edu;mit.edu;cs.unc.edu;brown.edu;microsoft.com;microsoft.com;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;0;3;3;3", "aff_unique_norm": "Brown University;Massachusetts Institute of Technology;University of North Carolina;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://www.brown.edu;https://web.mit.edu;https://www.unc.edu;https://www.microsoft.com", "aff_unique_abbr": "Brown;MIT;UNC;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated Text", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33662", "id": "axl3FAkpik", "proceeding": "https://proceedings.mlr.press/v235/hans24a.html", "pdf": "https://openreview.net/pdf?id=axl3FAkpik", "openreview": "https://openreview.net/forum?id=axl3FAkpik", "author_site": 
"Abhimanyu Hans, Avi Schwarzschild, Valeriia Cherepanova, Hamid Kazemi, Aniruddha Saha, Micah Goldblum, Jonas Geiping, Tom Goldstein", "tldr": "", "abstract": "Detecting text generated by modern large language models is thought to be hard, as both LLMs and humans can exhibit a wide range of complex behaviors. However, we find that a score based on contrasting two closely related language models is highly accurate at separating human-generated and machine-generated text. Based on this mechanism, we propose a novel LLM detector that only requires simple calculations using a pair of pre-trained LLMs. The method, called *Binoculars*, achieves state-of-the-art accuracy without any training data. It is capable of spotting machine text from a range of modern LLMs without any model-specific modifications. We comprehensively evaluate *Binoculars* on a number of text sources and in varied situations. Over a wide range of document types, *Binoculars* detects over 90% of generated samples from ChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being trained on any ChatGPT data. Code available at https://github.com/ahans30/Binoculars.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhimanyu Hans;Avi Schwarzschild;Valeriia Cherepanova;Hamid Kazemi;Aniruddha Saha;Micah Goldblum;Jonas Geiping;Tom Goldstein", "authorids": "~Abhimanyu_Hans1;~Avi_Schwarzschild1;~Valeriia_Cherepanova1;~Hamid_Kazemi1;~Aniruddha_Saha1;~Micah_Goldblum1;~Jonas_Geiping1;~Tom_Goldstein1", "gender": "M;M;F;M;M;;M;M", "homepage": "https://ahans30.github.io/;https://cs.umd.edu/~avi1;https://www.vcherepanova.com/;;https://ani0075saha.github.io/;;https://jonasgeiping.github.io/;https://www.cs.umd.edu/~tomg/", "dblp": ";249/9334.html;;;221/8102;241/7231;190/7229;25/8184", "google_scholar": "b77HAM8AAAAJ;WNvQ7AcAAAAJ;PySUqqUAAAAJ;7hNdaGQAAAAJ;xfjALj0AAAAJ;pGDKzuUAAAAJ;https://scholar.google.de/citations?user=206vNCEAAAAJ;KmSuVtgAAAAJ", "orcid": ";;;;;;;", "linkedin": "abhimanyu-hans-891a15122/;;;hamid-kazemi-608a8085/;;;;", "or_profile": "~Abhimanyu_Hans1;~Avi_Schwarzschild1;~Valeriia_Cherepanova1;~Hamid_Kazemi1;~Aniruddha_Saha1;~Micah_Goldblum1;~Jonas_Geiping1;~Tom_Goldstein1", "aff": "Department of Computer Science, University of Maryland, College Park;Carnegie Mellon University;Amazon;University of Maryland, College Park;University of Maryland, College Park;New York University;Max Planck Institute for Intelligent Systems, Max-Planck Institute;University of Maryland, College Park", "aff_domain": "cs.umd.edu;cmu.edu;amazon.com;umd.edu;umd.edu;nyu.edu;tuebingen.mpg.de;umd.edu", "position": "PhD student;Postdoc;Postdoc;PhD student;Postdoc;Postdoc;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nhans2024spotting,\ntitle={Spotting {LLM}s With Binoculars: Zero-Shot Detection of Machine-Generated Text},\nauthor={Abhimanyu Hans and Avi Schwarzschild and Valeriia Cherepanova and Hamid Kazemi and Aniruddha Saha and Micah Goldblum and Jonas Geiping and Tom Goldstein},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=axl3FAkpik}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2717284, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2163974418024982315&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": 
"cs.umd.edu;cmu.edu;amazon.com;umd.edu;umd.edu;nyu.edu;tuebingen.mpg.de;umd.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;3;4;5;3", "aff_unique_norm": "University of Maryland, College Park;Carnegie Mellon University;Amazon;University of Maryland;New York University;Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Department of Computer Science;;Amazon.com, Inc.;;;Intelligent Systems", "aff_unique_url": "https://www/umd.edu;https://www.cmu.edu;https://www.amazon.com;https://www/umd.edu;https://www.nyu.edu;https://www.mpi-is.mpg.de", "aff_unique_abbr": "UMD;CMU;Amazon;UMD;NYU;MPI-IS", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;1;0", "aff_country_unique": "United States;Germany" }, { "title": "Category-Aware Active Domain Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33661", "id": "axwrD8F1yq", "proceeding": "https://proceedings.mlr.press/v235/xiao24b.html", "pdf": "https://openreview.net/pdf?id=axwrD8F1yq", "openreview": "https://openreview.net/forum?id=axwrD8F1yq", "author_site": "Wenxiao Xiao, Jiuxiang Gu, Hongfu Liu", "tldr": "", "abstract": "Active domain adaptation has shown promising results in enhancing unsupervised domain adaptation (DA), by actively selecting and annotating a small amount of unlabeled samples from the target domain. Despite its effectiveness in boosting overall performance, the gain usually concentrates on the categories that are readily improvable, while challenging categories that demand the utmost attention are often overlooked by existing models. To alleviate this discrepancy, we propose a novel category-aware active DA method that aims to boost the adaptation for the individual category without adversely affecting others. Specifically, our approach identifies the unlabeled data that are most important for the recognition of the targeted category. Our method assesses the impact of each unlabeled sample on the recognition loss of the target data via the influence function, which allows us to directly evaluate the sample importance, without relying on indirect measurements used by existing methods. 
Comprehensive experiments and in-depth explorations demonstrate the efficacy of our method on category-aware active DA over three datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenxiao Xiao;Jiuxiang Gu;Hongfu Liu", "authorids": "~Wenxiao_Xiao1;~Jiuxiang_Gu2;~Hongfu_Liu2", "gender": "M;M;M", "homepage": ";http://gujiuxiang.com;http://hongfuliu.com/", "dblp": "319/4901;173/4935.html;32/9075-1", "google_scholar": ";https://scholar.google.com.sg/citations?user=zPxKV9EAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "wenxiao-xiao-a16263212/;;", "or_profile": "~Wenxiao_Xiao1;~Jiuxiang_Gu2;~Hongfu_Liu2", "aff": "Brandeis University;Adobe Systems;Brandeis University", "aff_domain": "brandeis.edu;adobe.com;brandeis.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nxiao2024categoryaware,\ntitle={Category-Aware Active Domain Adaptation},\nauthor={Wenxiao Xiao and Jiuxiang Gu and Hongfu Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=axwrD8F1yq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1098008, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:z00rXvRFMakJ:scholar.google.com/&scioq=Category-Aware+Active+Domain+Adaptation&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "brandeis.edu;adobe.com;brandeis.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Brandeis University;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": "https://www.brandeis.edu;https://www.adobe.com", "aff_unique_abbr": "Brandeis;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ILILT: Implicit Learning of Inverse Lithography Technologies", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33660", "id": "b0lxGL2n3d", "proceeding": "https://proceedings.mlr.press/v235/yang24s.html", "pdf": "https://openreview.net/pdf?id=b0lxGL2n3d", "openreview": "https://openreview.net/forum?id=b0lxGL2n3d", "author_site": "Haoyu Yang, Mark Ren", "tldr": "", "abstract": "Lithography, transferring chip design masks to the silicon wafer, is the most important phase in modern semiconductor manufacturing flow. Due to the limitations of lithography systems, extensive design optimizations are required to tackle the design and silicon mismatch. Inverse lithography technology (ILT) is one of the promising solutions to perform pre-fabrication optimization, termed mask optimization. Because of mask optimization problems\u2019 constrained non-convexity, numerical ILT solvers rely heavily on good initialization to avoid getting stuck on sub-optimal solutions. Machine learning (ML) techniques are hence proposed to generate mask initialization for ILT solvers with one-shot inference, targeting faster and better convergence during ILT. This paper addresses the question of whether ML models can directly generate high-quality optimized masks without engaging ILT solvers in the loop. We propose an implicit learning ILT framework: ILILT, which leverages the implicit layer learning method and lithography-conditioned inputs to ground the model.
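For the category-aware active DA record above, a sketch of an influence-function style score with a damped-identity Hessian approximation (our simplification; the paper's estimator may differ): the unlabeled candidate whose training gradient aligns best with the targeted category's loss gradient is deemed most useful to annotate.

```python
import torch

def influence_on_category(target_loss, candidate_loss, params, damping=1e-2):
    # Influence-function score with H approximated by damping * I:
    # influence ~ -grad(L_target)^T H^{-1} grad(L_candidate).
    g_t = torch.autograd.grad(target_loss, params, retain_graph=True)
    g_c = torch.autograd.grad(candidate_loss, params, retain_graph=True)
    return -sum((a * b).sum() for a, b in zip(g_t, g_c)) / damping
```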
Trained to understand the ILT optimization procedure, ILILT can outperform the state-of-the-art machine learning solutions, significantly improving efficiency and quality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyu Yang;Haoxing Ren", "authorids": "~Haoyu_Yang4;~Haoxing_Ren1", "gender": "M;", "homepage": "https://phdyang007.github.io/;", "dblp": ";", "google_scholar": "https://scholar.google.com.hk/citations?user=aTJ0RJUAAAAJ;https://scholar.google.ca/citations?user=y-TNUJ4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Haoyu_Yang4;~Haoxing_Ren1", "aff": "NVIDIA;NVIDIA", "aff_domain": "nvidia.com;nvidia.com", "position": "Researcher;Director", "bibtex": "@inproceedings{\nyang2024ililt,\ntitle={{ILILT}: Implicit Learning of Inverse Lithography Technologies},\nauthor={Haoyu Yang and Haoxing Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b0lxGL2n3d}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2614556, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1603678062721360187&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "nvidia.com;nvidia.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Is In-Context Learning in Large Language Models Bayesian? A Martingale Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33659", "id": "b1YQ5WKY3w", "proceeding": "https://proceedings.mlr.press/v235/falck24a.html", "pdf": "https://openreview.net/pdf?id=b1YQ5WKY3w", "openreview": "https://openreview.net/forum?id=b1YQ5WKY3w", "author_site": "Fabian Falck, Ziyu Wang, Christopher Holmes", "tldr": "", "abstract": "In-context learning (ICL) has emerged as a particularly remarkable characteristic of Large Language Models (LLM): given a pretrained LLM and an observed dataset, LLMs can make predictions for new data points from the same distribution without fine-tuning. Numerous works have postulated ICL as approximately Bayesian inference, rendering this a natural hypothesis. In this work, we analyse this hypothesis from a new angle through the *martingale property*, a fundamental requirement of a Bayesian learning system for exchangeable data. We show that the martingale property is a necessary condition for unambiguous predictions in such scenarios, and enables a principled, decomposed notion of uncertainty vital in trustworthy, safety-critical systems. We derive actionable checks with corresponding theory and test statistics which must hold if the martingale property is satisfied. We also examine if uncertainty in LLMs decreases as expected in Bayesian learning when more data is observed. In three experiments, we provide evidence for violations of the martingale property, and deviations from a Bayesian scaling behaviour of uncertainty, falsifying the hypothesis that ICL is Bayesian.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fabian Falck;Ziyu Wang;Christopher C. 
Holmes", "authorids": "~Fabian_Falck1;~Ziyu_Wang2;~Christopher_C._Holmes1", "gender": ";Unspecified;M", "homepage": ";http://ziyu-wang.info;", "dblp": ";73/4689-6;08/6129", "google_scholar": ";zMAlv2kAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Fabian_Falck1;~Ziyu_Wang2;~Christopher_C._Holmes1", "aff": ";University of Oxford;University of Oxford", "aff_domain": ";ox.ac.uk;ox.ac.uk", "position": ";Postdoc;Full Professor", "bibtex": "@inproceedings{\nfalck2024is,\ntitle={Is In-Context Learning in Large Language Models Bayesian? A Martingale Perspective},\nauthor={Fabian Falck and Ziyu Wang and Christopher C. Holmes},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b1YQ5WKY3w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2580162, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14921045529041881581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";ox.ac.uk;ox.ac.uk", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Integrating Multimodal Data for Joint Generative Modeling of Complex Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33658", "id": "b1iurBHDck", "proceeding": "https://proceedings.mlr.press/v235/brenner24a.html", "pdf": "https://openreview.net/pdf?id=b1iurBHDck", "openreview": "https://openreview.net/forum?id=b1iurBHDck", "author_site": "Manuel Brenner, Florian Hess, Georgia Koppe, Daniel Durstewitz", "tldr": "", "abstract": "Many, if not most, systems of interest in science are naturally described as nonlinear dynamical systems. Empirically, we commonly access these systems through time series measurements. Often such time series may consist of discrete random variables rather than continuous measurements, or may be composed of measurements from multiple data modalities observed simultaneously. For instance, in neuroscience we may have behavioral labels in addition to spike counts and continuous physiological recordings. While by now there is a burgeoning literature on deep learning for dynamical systems reconstruction (DSR), multimodal data integration has hardly been considered in this context. Here we provide such an efficient and flexible algorithmic framework that rests on a multimodal variational autoencoder for generating a sparse teacher signal that guides training of a reconstruction model, exploiting recent advances in DSR training techniques. It enables to combine various sources of information for optimal reconstruction, even allows for reconstruction from symbolic data (class labels) alone, and connects different types of observations within a common latent dynamics space. 
In contrast to previous multimodal data integration techniques for scientific applications, our framework is fully generative, producing, after training, trajectories with the same geometrical and temporal structure as those of the ground truth system.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manuel Brenner;Florian Hess;Georgia Koppe;Daniel Durstewitz", "authorids": "~Manuel_Brenner1;~Florian_Hess1;~Georgia_Koppe1;~Daniel_Durstewitz1", "gender": "M;M;F;", "homepage": ";https://www.zi-mannheim.de/forschung/abteilungen-ags-institute/theoret-neurowissenschaften/infos-theor-neurowiss.html;https://www.zi-mannheim.de/en/research/people/person/7111.html;https://durstewitzlab.github.io", "dblp": "323/8935;;152/2270;98/2120", "google_scholar": "HCUeyg8AAAAJ;nOZM-1AAAAAJ;https://scholar.google.de/citations?user=5EVBcowAAAAJ;https://scholar.google.de/citations?user=2bcbKU0AAAAJ", "orcid": ";;;0000-0002-9340-3786", "linkedin": "manuel-brenner-772261191/;;;", "or_profile": "~Manuel_Brenner1;~Florian_Hess1;~Georgia_Koppe1;~Daniel_Durstewitz1", "aff": "Heidelberg University;Ruprecht-Karls-Universit\u00e4t Heidelberg;Central Institute of Mental Health;Heidelberg University", "aff_domain": "uni-heidelberg.de;uni-heidelberg.de;zi-mannheim.de;uni-heidelberg.de", "position": "PhD student;PhD student;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nbrenner2024integrating,\ntitle={Integrating Multimodal Data for Joint Generative Modeling of Complex Dynamics},\nauthor={Manuel Brenner and Florian Hess and Georgia Koppe and Daniel Durstewitz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b1iurBHDck}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10131994, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1694610844273365457&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "uni-heidelberg.de;uni-heidelberg.de;zi-mannheim.de;uni-heidelberg.de", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Heidelberg University;Ruprecht-Karls-Universit\u00e4t Heidelberg;Central Institute of Mental Health", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-heidelberg.de;https://www.uni-heidelberg.de/;https://www.zi-mannheim.de", "aff_unique_abbr": "Uni Heidelberg;Uni Heidelberg;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "IM-Unpack: Training and Inference with Arbitrarily Low Precision Integers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33657", "id": "b2D9PBNNQ2", "proceeding": "https://proceedings.mlr.press/v235/zeng24g.html", "pdf": "https://openreview.net/pdf?id=b2D9PBNNQ2", "openreview": "https://openreview.net/forum?id=b2D9PBNNQ2", "author_site": "Zhanpeng Zeng, Karthikeyan Sankaralingam, Vikas Singh", "tldr": "", "abstract": "GEneral Matrix Multiply (GEMM) is a central operation in deep learning and corresponds to a large chunk of the compute footprint. Therefore, improving its efficiency is an active topic of research. A popular strategy is the use of low bit-width integers to approximate the original matrix entries. This allows efficiency gains, but often requires sophisticated techniques to control the rounding error. 
In this work, we first verify that when the low bit-width restriction is removed, for a variety of Transformer-based models, integers are, in fact, sufficient for all GEMM needs -- for both training and inference stages -- and achieve parity (with floating point). No sophisticated techniques are needed. We find that while a large majority of entries in matrices (encountered in such models) can be easily represented by low bit-width integers, the existence of a few heavy hitter entries makes it difficult to achieve efficiency gains via the exclusive use of low bit-width GEMMs alone. To address this issue, we develop a simple algorithm, Integer Matrix Unpacking (IM-Unpack), to unpack a matrix with large integer entries into a larger matrix whose entries all lie within the representable range of arbitrarily low bit-width integers. This allows equivalence with the original GEMM, i.e., the exact result can be obtained using purely low bit-width integer GEMMs. This comes at the cost of additional operations -- we show that for many popular models, this overhead is quite small. Code is available at https://github.com/vsingh-group/im-unpack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhanpeng Zeng;Karthikeyan Sankaralingam;Vikas Singh", "authorids": "~Zhanpeng_Zeng1;~Karthikeyan_Sankaralingam1;~Vikas_Singh1", "gender": "M;M;M", "homepage": ";http://pages.cs.wisc.edu/~karu/;http://vsingh-www.cs.wisc.edu/", "dblp": "284/9150;22/858.html;", "google_scholar": "P9ctuRUAAAAJ;https://scholar.google.com.tw/citations?user=O0W9jEQAAAAJ;d32BmwcAAAAJ", "orcid": ";;", "linkedin": ";karusankaralingam/;", "or_profile": "~Zhanpeng_Zeng1;~Karthikeyan_Sankaralingam1;~Vikas_Singh1", "aff": "University of Wisconsin, Madison;Department of Computer Science, University of Wisconsin - Madison;University of Wisconsin, Madison", "aff_domain": "wisc.edu;cs.wisc.edu;wisc.edu", "position": "PhD student;Full Professor;Professor", "bibtex": "@inproceedings{\nzeng2024imunpack,\ntitle={{IM}-Unpack: Training and Inference with Arbitrarily Low Precision Integers},\nauthor={Zhanpeng Zeng and Karthikeyan Sankaralingam and Vikas Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b2D9PBNNQ2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1987565, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2466866139013366107&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "wisc.edu;cs.wisc.edu;wisc.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Wisconsin;University of Wisconsin-Madison", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW;UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Conformal Prediction for Deep Classifier via Label Ranking", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33656", "id": "b3pYoZfcoo", "proceeding": "https://proceedings.mlr.press/v235/huang24aa.html", "pdf": "https://openreview.net/pdf?id=b3pYoZfcoo", "openreview": "https://openreview.net/forum?id=b3pYoZfcoo", "author_site": "Jianguo Huang, HuaJun Xi, Linjun Zhang, Huaxiu Yao, Yue Qiu, Hongxin Wei", "tldr": "", "abstract": "Conformal prediction 
is a statistical framework that generates prediction sets containing ground-truth labels with a desired coverage guarantee. The predicted probabilities produced by machine learning models are generally miscalibrated, leading to large prediction sets in conformal prediction. To address this issue, we propose a novel algorithm named $\\textit{Sorted Adaptive Prediction Sets}$ (SAPS), which discards all the probability values except for the maximum softmax probability. The key idea behind SAPS is to minimize the dependence of the non-conformity score on the probability values while retaining the uncertainty information. In this manner, SAPS can produce compact prediction sets and communicate instance-wise uncertainty. Extensive experiments validate that SAPS not only reduces the size of prediction sets but also broadly enhances the conditional coverage rate of prediction sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianguo Huang;HuaJun Xi;Linjun Zhang;Huaxiu Yao;Yue Qiu;Hongxin Wei", "authorids": "~Jianguo_Huang2;~HuaJun_Xi1;~Linjun_Zhang1;~Huaxiu_Yao1;~Yue_Qiu5;~Hongxin_Wei1", "gender": "M;;M;M;M;M", "homepage": "https://jianguo99.github.io/;https://github.com/Bei-jifeng?tab=repositories;;http://huaxiuyao.mystrikingly.com;https://faculty.cqu.edu.cn/qiuyue;https://hongxin001.github.io/", "dblp": ";;;197/1635;16/1041;150/6350", "google_scholar": "r-BhZGwAAAAJ;;TUAzs3sAAAAJ;A20BZnQAAAAJ;https://scholar.google.de/citations?user=QCjHDYQAAAAJ;cABH034AAAAJ", "orcid": ";;;;;", "linkedin": "jianguo-huang-1471b0232/;;;huaxiuyao/;;", "or_profile": "~Jianguo_Huang2;~HuaJun_Xi1;~Linjun_Zhang1;~Huaxiu_Yao1;~Yue_Qiu5;~Hongxin_Wei1", "aff": "ShanghaiTech University;Southern University of Science and Technology;Rutgers University;Department of Computer Science, University of North Carolina at Chapel Hill;Chongqing University;Southern University of Science and Technology", "aff_domain": "shanghaitech.edu.cn;sustech.edu;rutgers.edu;cs.unc.edu;cqu.edu.cn;sustech.edu.cn", "position": "MS student;Undergrad student;Assistant Professor;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024conformal,\ntitle={Conformal Prediction for Deep Classifier via Label Ranking},\nauthor={Jianguo Huang and HuaJun Xi and Linjun Zhang and Huaxiu Yao and Yue Qiu and Hongxin Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b3pYoZfcoo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 521396, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13223554017288004150&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "shanghaitech.edu.cn;sustech.edu;rutgers.edu;cs.unc.edu;cqu.edu.cn;sustech.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;4;1", "aff_unique_norm": "ShanghaiTech University;Southern University of Science and Technology;Rutgers University;University of North Carolina at Chapel Hill;Chongqing University", "aff_unique_dep": ";;;Department of Computer Science;", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.sustech.edu.cn;https://www.rutgers.edu;https://www.unc.edu;https://www.cqu.edu.cn", "aff_unique_abbr": "ShanghaiTech;SUSTech;Rutgers;UNC Chapel Hill;CQU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;1;1;0;0", "aff_country_unique": "China;United States" }, { "title": 
"Probabilistic Subgoal Representations for Hierarchical Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33655", "id": "b6AwZauZPV", "proceeding": "https://proceedings.mlr.press/v235/wang24bx.html", "pdf": "https://openreview.net/pdf?id=b6AwZauZPV", "openreview": "https://openreview.net/forum?id=b6AwZauZPV", "author_site": "Vivienne Wang, Tinghuai Wang, wenyan yang, Joni-kristian Kamarainen, Joni Pajarinen", "tldr": "", "abstract": "In goal-conditioned hierarchical reinforcement learning (HRL), a high-level policy specifies a subgoal for the low-level policy to reach. Effective HRL hinges on a suitable subgoal representation function, abstracting state space into latent subgoal space and inducing varied low-level behaviors. Existing methods adopt a subgoal representation that provides a deterministic mapping from state space to latent subgoal space. Instead, this paper utilizes Gaussian Processes (GPs) for the first probabilistic subgoal representation. Our method employs a GP prior on the latent subgoal space to learn a posterior distribution over the subgoal representation functions while exploiting the long-range correlation in the state space through learnable kernels. This enables an adaptive memory that integrates long-range subgoal information from prior planning steps allowing to cope with stochastic uncertainties. Furthermore, we propose a novel learning objective to facilitate the simultaneous learning of probabilistic subgoal representations and policies within a unified framework. In experiments, our approach outperforms state-of-the-art baselines in standard benchmarks but also in environments with stochastic elements and under diverse reward conditions. Additionally, our model shows promising capabilities in transferring low-level policies across different tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vivienne Huiling Wang;Tinghuai Wang;Wenyan Yang;Joni-kristian Kamarainen;Joni Pajarinen", "authorids": "~Vivienne_Huiling_Wang1;~Tinghuai_Wang1;~Wenyan_Yang1;~Joni-kristian_Kamarainen1;~Joni_Pajarinen2", "gender": ";;M;M;", "homepage": ";;;https://webpages.tuni.fi/vision/public_pages/JoniKamarainen/;", "dblp": ";;;k/JoniKristianKamarainen;23/8355", "google_scholar": ";;https://scholar.google.com/citations?hl=en;https://scholar.google.fi/citations?user=r6Y4nacAAAAJ;https://scholar.google.fi/citations?user=-2fJStwAAAAJ", "orcid": ";;;0000-0002-5801-4371;0000-0003-4469-8191", "linkedin": ";;;;", "or_profile": "~Vivienne_Huiling_Wang1;~Tinghuai_Wang1;~Wenyan_Yang1;~Joni-kristian_Kamarainen1;~Joni_Pajarinen2", "aff": ";;Tampere University;Tampere University;Aalto University", "aff_domain": ";;tuni.fi;tuni.fi;aalto.fi", "position": ";;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024probabilistic,\ntitle={Probabilistic Subgoal Representations for Hierarchical Reinforcement Learning},\nauthor={Vivienne Huiling Wang and Tinghuai Wang and Wenyan Yang and Joni-kristian Kamarainen and Joni Pajarinen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b6AwZauZPV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5005999, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5189204605229334674&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": 
";;tuni.fi;tuni.fi;aalto.fi", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Tampere University;Aalto University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tuni.fi;https://www.aalto.fi", "aff_unique_abbr": "Tuni;Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Finland" }, { "title": "ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33654", "id": "b6rA0kAHT1", "proceeding": "https://proceedings.mlr.press/v235/zhou24t.html", "pdf": "https://openreview.net/pdf?id=b6rA0kAHT1", "openreview": "https://openreview.net/forum?id=b6rA0kAHT1", "author_site": "Yifei Zhou, Andrea Zanette, Jiayi Pan, Sergey Levine, Aviral Kumar", "tldr": "", "abstract": "Large language models (LLMs) have the potential to tackle sequential decision-making problems due to their generalist capabilities. Instead of optimizing ``myopic'' surrogate objectives such as human preferences within a single turn, in such problems, we wish to directly optimize long-term objectives, such as user satisfaction over an entire dialogue with an LLM or delayed success metrics in web navigation. Multi-turn reinforcement learning (RL) provides an appealing approach to directly optimize long-term objectives, but how can we design effective and efficient multi-turn RL algorithms for LLMs? In this work, we propose an algorithmic framework to multi-turn RL for LLMs that preserves the flexibility of token-by-token RL used in single-turn RL problems, while still accommodating long horizons and delayed rewards more effectively. Our framework, the **A**cto**r**-**C**ritic Framework with a **H**i**e**rarchical Structu**r**e (**ArCHer**), combines a high-level off-policy RL algorithm that trains a value function with a low-level RL algorithm that trains a token-by-token policy. While ArCHer can be instantiated with multiple RL algorithms, a particularly convenient instantiation is to use temporal difference (TD) learning at the high level and on-policy token-level policy gradient at the low level. 
Empirically, we show that ArCHer significantly improves efficiency and performance of multi-turn LLM tasks, attaining sample efficiency boosts of about **100x** over prior on-policy methods and converging to a much better performance than other off-policy methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifei Zhou;Andrea Zanette;Jiayi Pan;Sergey Levine;Aviral Kumar", "authorids": "~Yifei_Zhou1;~Andrea_Zanette1;~Jiayi_Pan1;~Sergey_Levine1;~Aviral_Kumar2", "gender": "M;;M;M;M", "homepage": "https://yifeizhou02.github.io/;;https://www.jiayipan.me/;https://people.eecs.berkeley.edu/~svlevine/;https://aviralkumar2907.github.io/", "dblp": "50/7699;;39/6476-2;80/7594;202/7961", "google_scholar": ";;n9Y_sQEAAAAJ;8R35rCwAAAAJ;", "orcid": ";;0000-0003-0817-4083;;", "linkedin": "yifei-zhou-57aa9b222/;;;;", "or_profile": "~Yifei_Zhou1;~Andrea_Zanette1;~Jiayi_Pan1;~Sergey_Levine1;~Aviral_Kumar2", "aff": "University of California, Berkeley;;University of California, Berkeley;Google;Google DeepMind", "aff_domain": "berkeley.edu;;berkeley.edu;google.com;google.com", "position": "PhD student;;PhD student;Research Scientist;Researcher", "bibtex": "@inproceedings{\nzhou2024archer,\ntitle={Ar{CH}er: Training Language Model Agents via Hierarchical Multi-Turn {RL}},\nauthor={Yifei Zhou and Andrea Zanette and Jiayi Pan and Sergey Levine and Aviral Kumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b6rA0kAHT1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3860395, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12573070193432622657&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "berkeley.edu;;berkeley.edu;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Graph As Point Set", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33653", "id": "b6yHkQpSwZ", "proceeding": "https://proceedings.mlr.press/v235/wang24am.html", "pdf": "https://openreview.net/pdf?id=b6yHkQpSwZ", "openreview": "https://openreview.net/forum?id=b6yHkQpSwZ", "author_site": "Xiyuan Wang, Pan Li, Muhan Zhang", "tldr": "", "abstract": "Graph is a fundamental data structure to model interconnections between entities. Set, on the contrary, stores independent elements. To learn graph representations, current Graph Neural Networks (GNNs) primarily use message passing to encode the interconnections. In contrast, this paper introduces a novel graph-to-set conversion method that bijectively transforms interconnected nodes into a set of independent points and then uses a set encoder to learn the graph representation. This conversion method holds dual significance. Firstly, it enables using set encoders to learn from graphs, thereby significantly expanding the design space of GNNs. 
Secondly, for Transformer, a specific set encoder, we provide a novel and principled approach to inject graph information losslessly, different from all the heuristic structural/positional encoding methods adopted in previous graph transformers. To demonstrate the effectiveness of our approach, we introduce Point Set Transformer (PST), a transformer architecture that accepts a point set converted from a graph as input. Theoretically, PST exhibits superior expressivity for both short-range substructure counting and long-range shortest path distance tasks compared to existing GNNs. Extensive experiments further validate PST's outstanding real-world performance. Besides Transformer, we also devise a Deepset-based set encoder, which achieves performance comparable to representative GNNs, affirming the versatility of our graph-to-set method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiyuan Wang;Pan Li;Muhan Zhang", "authorids": "~Xiyuan_Wang1;~Pan_Li2;~Muhan_Zhang1", "gender": ";;M", "homepage": ";;https://muhanzhang.github.io/", "dblp": "95/8542;https://dblp.org/pers/hd/l/Li_0005:Pan;157/5518", "google_scholar": ";IroP0EwAAAAJ;https://scholar.google.com.hk/citations?user=OBBqkosAAAAJ", "orcid": ";;0000-0002-7680-6401", "linkedin": "%E5%B8%8C%E5%85%83-%E7%8E%8B-969660221/;pan-li-b951105a/;jerry-muhan-zhang-a33a1777/", "or_profile": "~Xiyuan_Wang1;~Pan_Li2;~Muhan_Zhang1", "aff": "Peking University;Purdue University;Peking University", "aff_domain": "pku.edu.cn;purdue.edu;pku.edu.cn", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024graph,\ntitle={Graph As Point Set},\nauthor={Xiyuan Wang and Pan Li and Muhan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b6yHkQpSwZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1211440, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12876007589269967137&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;purdue.edu;pku.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Peking University;Purdue University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.purdue.edu", "aff_unique_abbr": "Peking U;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "Not Just Pretty Pictures: Toward Interventional Data Augmentation Using Text-to-Image Generators", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33652", "id": "b89JtZj9gm", "proceeding": "https://proceedings.mlr.press/v235/yuan24e.html", "pdf": "https://openreview.net/pdf?id=b89JtZj9gm", "openreview": "https://openreview.net/forum?id=b89JtZj9gm", "author_site": "Jianhao Yuan, Francesco Pinto, Adam Davies, Phil Torr", "tldr": "", "abstract": "Neural image classifiers are known to undergo severe performance degradation when exposed to inputs that are sampled from environmental conditions that differ from their training data. Given the recent progress in Text-to-Image (T2I) generation, a natural question is how modern T2I generators can be used to simulate arbitrary interventions over such environmental factors in order to augment training data and improve the robustness of downstream classifiers. 
We experiment across a diverse collection of benchmarks in single domain generalization (SDG) and reducing reliance on spurious features (RRSF), ablating across key dimensions of T2I generation, including interventional prompting strategies, conditioning mechanisms, and post-hoc filtering, showing that modern T2I generators like Stable Diffusion can indeed be used to implement a powerful interventional data augmentation (IDA) mechanism, outperforming previously state-of-the-art data augmentation techniques regardless of how each dimension is configured.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianhao Yuan;Francesco Pinto;Adam Davies;Philip Torr", "authorids": "~Jianhao_Yuan2;~Francesco_Pinto1;~Adam_Davies2;~Philip_Torr1", "gender": "M;Non-Binary;;M", "homepage": ";https://ahdavies6.github.io/;http://www.robots.ox.ac.uk/~tvg/;https://yuanjianhao508.github.io/", "dblp": "281/7477;;;", "google_scholar": "rqAdo2MAAAAJ;vqkOH7gAAAAJ;;BUJPCegAAAAJ", "orcid": ";0000-0002-0610-2732;;", "linkedin": "francesco-pinto-42a389b1?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BishkY8oUQ8OTPPeV0SSCdw%3D%3D;adamhdavies/;;", "or_profile": "~Francesco_Pinto1;~Adam_Davies2;~Philip_Torr1;~JIANHAO_YUAN1", "aff": "University of Oxford;University of Illinois, Urbana Champaign;University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;illinois.edu;ox.ac.uk;robots.ox.ac.uk", "position": "PhD student;PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\nyuan2024not,\ntitle={Not Just Pretty Pictures: Toward Interventional Data Augmentation Using Text-to-Image Generators},\nauthor={Jianhao Yuan and Francesco Pinto and Adam Davies and Philip Torr},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b89JtZj9gm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 950020, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4464747621104770790&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ox.ac.uk;illinois.edu;ox.ac.uk;robots.ox.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Oxford;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://illinois.edu", "aff_unique_abbr": "Oxford;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Fine-grained Classes and How to Find Them", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33651", "id": "b9VfvegTEO", "proceeding": "https://proceedings.mlr.press/v235/grcic24a.html", "pdf": "https://openreview.net/pdf?id=b9VfvegTEO", "openreview": "https://openreview.net/forum?id=b9VfvegTEO", "author_site": "Matej Grcic, Artyom Gadetsky, Maria Brbic", "tldr": "", "abstract": "In many practical applications, coarse-grained labels are readily available compared to fine-grained labels that reflect subtle differences between classes. However, existing methods cannot leverage coarse labels to infer fine-grained labels in an unsupervised manner. To bridge this gap, we propose FALCON, a method that discovers fine-grained classes from coarsely labeled data without any supervision at the fine-grained level. 
FALCON simultaneously infers unknown fine-grained classes and underlying relationships between coarse and fine-grained classes. Moreover, FALCON is a modular method that can effectively learn from multiple datasets labeled with different strategies. We evaluate FALCON on eight image classification tasks and a single-cell classification task. FALCON outperforms baselines by a large margin, achieving 22% improvement over the best baseline on the tieredImageNet dataset with over 600 fine-grained classes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matej Grcic;Artyom Gadetsky;Maria Brbic", "authorids": "~Matej_Grcic1;~Artyom_Gadetsky1;~Maria_Brbic1", "gender": ";M;F", "homepage": ";https://agadetsky.github.io;https://brbiclab.epfl.ch/", "dblp": ";222/2900;130/3233", "google_scholar": ";J48uBYgAAAAJ;ltxmeroAAAAJ", "orcid": ";;0000-0002-1120-1778", "linkedin": ";;", "or_profile": "~Matej_Grcic1;~Artyom_Gadetsky1;~Maria_Brbic1", "aff": ";EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": ";epfl.ch;epfl.ch", "position": ";PhD student;Assistant Professor", "bibtex": "@inproceedings{\ngrcic2024finegrained,\ntitle={Fine-grained Classes and How to Find Them},\nauthor={Matej Grcic and Artyom Gadetsky and Maria Brbic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b9VfvegTEO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7625994, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15757916408171047512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Analyzing $D^\\alpha$ seeding for $k$-means", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33650", "id": "b9uHveqszc", "proceeding": "https://proceedings.mlr.press/v235/bamas24a.html", "pdf": "https://openreview.net/pdf?id=b9uHveqszc", "openreview": "https://openreview.net/forum?id=b9uHveqszc", "author_site": "Etienne Bamas, Sai Ganesh Nagarajan, Ola Svensson", "tldr": "", "abstract": "One of the most popular clustering algorithms is the celebrated $D^\\alpha$ seeding algorithm (also known as $k$-means++ when $\\alpha=2$) by Arthur and Vassilvitskii (2007), who showed that it guarantees in expectation an $O(2^{2\\alpha}\\cdot \\log k)$-approximate solution to the ($k$,$\\alpha$)-clustering cost (where distances are raised to the power $\\alpha$) for any $\\alpha\\ge 1$. More recently, Balcan, Dick, and White (2018) observed experimentally that using $D^\\alpha$ seeding with $\\alpha>2$ can lead to a better solution with respect to the standard $k$-means objective (i.e. the $(k,2)$-clustering cost). In this paper, we provide a rigorous understanding of this phenomenon. 
For any $\\alpha>2$, we show that $D^\\alpha$ seeding guarantees in expectation an approximation factor of \r\n\\begin{equation*} O_\\alpha \\left(\\left(\\frac{\\sigma_{\\textrm{max}}}{\\sigma_{\\textrm{min}}}\\right)^{2-4/\\alpha}\\cdot (g_\\alpha \\cdot \\min \\lbrace\\ell,\\log k\\rbrace)^{2/\\alpha}\\right) \\end{equation*}\r\n with respect to the standard $k$-means cost of any underlying clustering; where $g_\\alpha$ is a parameter capturing the concentration of the points in each cluster, $\\sigma_{\\textrm{max}}$ and $\\sigma_{\\textrm{min}}$ are the maximum and minimum standard deviation of the clusters around their center, and $\\ell$ is the number of distinct mixing weights in the underlying clustering (after rounding them to the nearest power of $2$). For instance, if the underlying clustering is defined by a mixture of $k$ Gaussian distributions with equal cluster variance (up to a constant-factor), then our result implies that: (1) if there are a constant number of mixing weights, any constant $\\alpha>2$ yields a constant-factor approximation; (2) if the mixing weights are arbitrary, any constant $\\alpha>2$ yields an $O\\left(\\log^{2/\\alpha}k\\right)$-approximation, and $\\alpha=\\Theta(\\log\\log k)$ yields an $O(\\log\\log k)^3$-approximation. We complement these results by some lower bounds showing that the dependency on $g_\\alpha$ and $\\sigma_{\\textrm{max}}/\\sigma_{\\textrm{min}}$ is tight. Finally, we provide an experimental validation of the effects of the aforementioned parameters when using $D^\\alpha$ seeding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Etienne Bamas;Sai Ganesh Nagarajan;Ola Svensson", "authorids": "~Etienne_Bamas1;~Sai_Ganesh_Nagarajan1;~Ola_Svensson2", "gender": "M;;", "homepage": ";https://sites.google.com/view/sgnagarajan/home;https://theory.epfl.ch/osven/", "dblp": "227/2455;171/6916;11/6945.html", "google_scholar": "Cu8EcIAAAAAJ;https://scholar.google.com.sg/citations?user=VoaosL4AAAAJ;https://scholar.google.ch/citations?user=Nc2RiF4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Etienne_Bamas1;~Sai_Ganesh_Nagarajan1;~Ola_Svensson2", "aff": "ETHZ - ETH Zurich;Zuse Institute Berlin;Swiss Federal Institute of Technology Lausanne", "aff_domain": "ethz.ch;zib.de;epfl.ch", "position": "Postdoc;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nbamas2024analyzing,\ntitle={Analyzing \\$D{\\textasciicircum}{\\textbackslash}alpha\\$ seeding for \\$k\\$-means},\nauthor={Etienne Bamas and Sai Ganesh Nagarajan and Ola Svensson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=b9uHveqszc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 694662, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "ethz.ch;zib.de;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "ETH Zurich;Zuse Institute Berlin;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.zib.de;https://www.epfl.ch", "aff_unique_abbr": "ETHZ;ZIB;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "Learning Solution-Aware Transformers for Efficiently Solving Quadratic Assignment Problem", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33649", "id": "bBkQ51PmjC", "proceeding": "https://proceedings.mlr.press/v235/tan24d.html", "pdf": "https://openreview.net/pdf?id=bBkQ51PmjC", "openreview": "https://openreview.net/forum?id=bBkQ51PmjC", "author_site": "Zhentao Tan, Yadong Mu", "tldr": "", "abstract": "Recently various optimization problems, such as Mixed Integer Linear Programming Problems (MILPs), have undergone comprehensive investigation, leveraging the capabilities of machine learning. This work focuses on learning-based solutions for efficiently solving the Quadratic Assignment Problem (QAPs), which stands as a formidable challenge in combinatorial optimization. While many instances of simpler problems admit fully polynomial-time approximate solution (FPTAS), QAP is shown to be strongly NPhard. Even finding a FPTAS for QAP is difficult, in the sense that the existence of a FPTAS implies P = NP. Current research on QAPs suffer from limited scale and computational inefficiency. To attack the aforementioned issues, we here propose the first solution of its kind for QAP in the learn-to-improve category. This work encodes facility and location nodes separately, instead of forming computationally intensive association graphs prevalent in current approaches. This design choice enables scalability to larger problem sizes. Furthermore, a Solution AWare Transformer (SAWT) architecture integrates the incumbent solution matrix with the attention score to effectively capture higher-order information of the QAPs. Our model\u2019s effectiveness is validated through extensive experiments on self-generated QAP instances of varying sizes and the QAPLIB benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhentao Tan;Yadong MU", "authorids": "~Zhentao_Tan2;~Yadong_MU1", "gender": "M;M", "homepage": "https://github.com/PKUTAN;http://www.muyadong.com/", "dblp": ";55/1817", "google_scholar": "jDsfBUwAAAAJ;https://scholar.google.com.tw/citations?user=Fqqx4HsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Zhentao_Tan2;~Yadong_MU1", "aff": "Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "position": "MS student;Associate Professor", "bibtex": "@inproceedings{\ntan2024learning,\ntitle={Learning Solution-Aware Transformers for Efficiently Solving Quadratic Assignment Problem},\nauthor={Zhentao Tan and Yadong MU},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bBkQ51PmjC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1461222, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=206785491223125530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;pku.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "High-Dimensional Kernel Methods under Covariate Shift: Data-Dependent Implicit Regularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33648", "id": "bBzlapzeR1", "proceeding": "https://proceedings.mlr.press/v235/chen24aa.html", "pdf": "https://openreview.net/pdf?id=bBzlapzeR1", "openreview": 
"https://openreview.net/forum?id=bBzlapzeR1", "author_site": "Yihang Chen, Fanghui Liu, Taiji Suzuki, Volkan Cevher", "tldr": "", "abstract": "This paper studies kernel ridge regression in high dimensions under covariate shifts and analyzes the role of importance re-weighting. We first derive the asymptotic expansion of high dimensional kernels under covariate shifts. By a bias-variance decomposition, we theoretically demonstrate that the re-weighting strategy allows for decreasing the variance. For bias, we analyze the regularization of the arbitrary or well-chosen scale, showing that the bias can behave very differently under different regularization scales. In our analysis, the bias and variance can be characterized by the spectral decay of a data-dependent regularized kernel: the original kernel matrix associated with an additional re-weighting matrix, and thus the re-weighting strategy can be regarded as a data-dependent regularization for better understanding. Besides, our analysis provides asymptotic expansion of kernel functions/vectors under covariate shift, which has its own interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihang Chen;Fanghui Liu;Taiji Suzuki;Volkan Cevher", "authorids": "~Yihang_Chen1;~Fanghui_Liu1;~Taiji_Suzuki1;~Volkan_Cevher1", "gender": "M;M;M;M", "homepage": "https://yhangchen.github.io/;http://www.lfhsgre.org;http://ibis.t.u-tokyo.ac.jp/suzuki/;http://lions.epfl.ch", "dblp": ";119/1038;08/312;70/5301", "google_scholar": "HzlOQRkAAAAJ;AKxBgssAAAAJ;x8osrBsAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";0000-0003-4133-7921;;", "linkedin": ";;;", "or_profile": "~Yihang_Chen1;~Fanghui_Liu1;~Taiji_Suzuki1;~Volkan_Cevher1", "aff": "EPFL - EPF Lausanne;University of Warwick;The University of Tokyo;Amazon Development Center Germany", "aff_domain": "epfl.ch;warwick.ac.uk;tokyo.ac.jp;amazon.de", "position": "MS student;Assistant Professor;Associate Professor;Amazon Scholar", "bibtex": "@inproceedings{\nchen2024highdimensional,\ntitle={High-Dimensional Kernel Methods under Covariate Shift: Data-Dependent Implicit Regularization},\nauthor={Yihang Chen and Fanghui Liu and Taiji Suzuki and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bBzlapzeR1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 634814, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14011934554939553000&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "epfl.ch;warwick.ac.uk;tokyo.ac.jp;amazon.de", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "EPFL;University of Warwick;University of Tokyo;Amazon", "aff_unique_dep": ";;;Development Center", "aff_unique_url": "https://www.epfl.ch;https://www.warwick.ac.uk;https://www.u-tokyo.ac.jp;https://www.amazon.de", "aff_unique_abbr": "EPFL;Warwick;UTokyo;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Switzerland;United Kingdom;Japan;Germany" }, { "title": "Policy Evaluation for Variance in Average Reward Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33647", "id": "bID9PiBFpT", "proceeding": "https://proceedings.mlr.press/v235/agrawal24a.html", "pdf": "https://openreview.net/pdf?id=bID9PiBFpT", "openreview": 
"https://openreview.net/forum?id=bID9PiBFpT", "author_site": "Shubhada Agrawal, Prashanth L.A., Siva Maguluri", "tldr": "", "abstract": "We consider an average reward reinforcement learning (RL) problem and work with asymptotic variance as a risk measure to model safety-critical applications. We design a temporal-difference (TD) type algorithm tailored for policy evaluation in this context. Our algorithm is based on linear stochastic approximation of an equivalent formulation of the asymptotic variance in terms of the solution of the Poisson equation. We consider both the tabular and linear function approximation settings, and establish $\\tilde {O}(1/k)$ finite time convergence rate, where $k$ is the number of steps of the algorithm. Our work paves the way for developing actor-critic style algorithms for variance-constrained RL. To the best of our knowledge, our result provides the first sequential estimator for asymptotic variance of a Markov chain with provable finite sample guarantees, which is of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shubhada Agrawal;Prashanth L A;Siva Theja Maguluri", "authorids": "~Shubhada_Agrawal1;~Prashanth_L_A1;~Siva_Theja_Maguluri1", "gender": "F;M;", "homepage": "https://sites.google.com/view/shubhada-agrawal/home;http://www.cse.iitm.ac.in/~prashla/;https://sites.google.com/site/sivatheja/", "dblp": "247/9653;90/3161;", "google_scholar": "RQGMXiYAAAAJ;https://scholar.google.co.in/citations?user=Q1YXWpoAAAAJ;", "orcid": ";;", "linkedin": "shubhada-agrawal-55561867/;;", "or_profile": "~Shubhada_Agrawal1;~Prashanth_L_A1;~Siva_Theja_Maguluri1", "aff": "Georgia Institute of Technology;Indian Institute of Technology Madras;Georgia Institute of Technology", "aff_domain": "gatech.edu;iitm.ac.in;gatech.edu", "position": "Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nagrawal2024policy,\ntitle={Policy Evaluation for Variance in Average Reward Reinforcement Learning},\nauthor={Shubhada Agrawal and Prashanth L A and Siva Theja Maguluri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bID9PiBFpT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 459978, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8706091877203255541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "gatech.edu;iitm.ac.in;gatech.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Georgia Institute of Technology;Indian Institute of Technology Madras", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.iitm.ac.in", "aff_unique_abbr": "Georgia Tech;IIT Madras", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madras", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;India" }, { "title": "Genie: Generative Interactive Environments", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33646", "id": "bJbSbJskOS", "proceeding": "https://proceedings.mlr.press/v235/bruce24a.html", "pdf": "https://openreview.net/pdf?id=bJbSbJskOS", "openreview": "https://openreview.net/forum?id=bJbSbJskOS", "author_site": "Jake Bruce, Michael Dennis, Ashley Edwards, Jack Parker-Holder, Yuge Shi, Edward Hughes, Matthew Lai, Aditi Mavalankar, Richie Steigerwald, Chris Apps, Yusuf Aytar, Sarah Bechtle, Feryal Behbahani, Stephanie Chan, 
Nicolas Heess, Lucy Gonzalez, Simon Osindero, Sherjil Ozair, Scott Reed, Jingwei Zhang, Konrad Zolna, Jeff Clune, Nando de Freitas, Satinder Singh, Tim Rockt\u00e4schel", "tldr": "", "abstract": "We introduce Genie, the first *generative interactive environment* trained in an unsupervised manner from unlabelled Internet videos. The model can be prompted to generate an endless variety of action-controllable virtual worlds described through text, synthetic images, photographs, and even sketches. At 11B parameters, Genie can be considered a *foundation world model*. It is comprised of a spatiotemporal video tokenizer, an autoregressive dynamics model, and a simple and scalable latent action model. Genie enables users to act in the generated environments on a frame-by-frame basis *despite training without any ground-truth action labels* or other domain specific requirements typically found in the world model literature. Further the resulting learned latent action space facilitates training agents to imitate behaviors from unseen videos, opening the path for training generalist agents of the future.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jake Bruce;Michael D Dennis;Ashley Edwards;Jack Parker-Holder;Yuge Shi;Edward Hughes;Matthew Lai;Aditi Mavalankar;Richie Steigerwald;Chris Apps;Yusuf Aytar;Sarah Maria Elisabeth Bechtle;Feryal Behbahani;Stephanie C.Y. Chan;Nicolas Heess;Lucy Gonzalez;Simon Osindero;Sherjil Ozair;Scott Reed;Jingwei Zhang;Konrad Zolna;Jeff Clune;Nando de Freitas;Satinder Singh;Tim Rockt\u00e4schel", "authorids": "~Jake_Bruce1;~Michael_D_Dennis1;~Ashley_Edwards1;~Jack_Parker-Holder1;~Yuge_Shi1;~Edward_Hughes1;~Matthew_Lai1;~Aditi_Mavalankar1;~Richie_Steigerwald1;capps@google.com;~Yusuf_Aytar1;~Sarah_Maria_Elisabeth_Bechtle1;~Feryal_Behbahani1;~Stephanie_C.Y._Chan1;~Nicolas_Heess1;lucygps@google.com;~Simon_Osindero1;~Sherjil_Ozair1;~Scott_Reed1;~Jingwei_Zhang2;~Konrad_Zolna1;~Jeff_Clune3;~Nando_de_Freitas1;~Satinder_Singh2;~Tim_Rockt\u00e4schel1", "gender": "M;M;F;M;F;M;M;F;;;M;F;F;F;;;Non-Binary;M;;;Unspecified;;M;;", "homepage": "http://jakebruce.ca;;https://ashedwards.github.io/;https://jparkerholder.github.io/;https://yugeten.github.io/;http://www.edwardhughes.io;;https://aditimavalankar.github.io/;;;;;https://feryal.github.io;https://scychan.github.io/;;;;http://sherjil.ozair.io;https://scottreed.info;;;;;;", "dblp": "173/6014;;;237/9793.html;227/4684;217/2003;163/2126;168/1704;;;41/5577;180/9966;;255/7866;76/9181;;05/5467;139/0736;;;;;http://dblp.uni-trier.de/pers/hd/f/Freitas:Nando_de;;", "google_scholar": "https://scholar.google.co.uk/citations?user=RGNVBKMAAAAJ;WXXu26AAAAAJ;wmlAA70AAAAJ;;https://scholar.google.co.uk/citations?user=t6B_Z7MAAAAJ;3tj5358AAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?view_op=list_works;;0ncQNL8AAAAJ;https://scholar.google.com/citations?hl=de;;https://scholar.google.com/citations?hl=en;79k7bGEAAAAJ;;Jq8ZS5kAAAAJ;O7MZStwAAAAJ;jEANvfgAAAAJ;;https://scholar.google.ca/citations?user=Kg_f9PwAAAAJ;;nzEluBwAAAAJ;;", "orcid": ";;;;;;;;;;;;;;;;;;;;;;;;", "linkedin": ";;aedwards8;;;;;;;;;;;scychan;;;;;;;http://linkedin.com/in/konradzolna;;;;", "or_profile": 
"~Jake_Bruce1;~Michael_D_Dennis1;~Ashley_Edwards1;~Jack_Parker-Holder1;~Yuge_Shi1;~Edward_Hughes1;~Matthew_Lai1;~Aditi_Mavalankar1;~Richie_Steigerwald1;capps@google.com;~Yusuf_Aytar1;~Sarah_Maria_Elisabeth_Bechtle1;~Feryal_Behbahani1;~Stephanie_C.Y._Chan1;~Nicolas_Heess1;lucygps@google.com;~Simon_Osindero1;~Sherjil_Ozair1;~Scott_Reed1;~Jingwei_Zhang2;~Konrad_Zolna1;~Jeff_Clune3;~Nando_de_Freitas1;~Satinder_Singh2;~Tim_Rockt\u00e4schel1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;University of Oxford;Google DeepMind;;Google DeepMind;Google DeepMind;;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;;Google;Google;Google;;Google DeepMind;;Google DeepMind;;", "aff_domain": "deepmind.com;google.com;deepmind.com;google.com;ox.ac.uk;deepmind.com;;google.com;deepmind.com;;google.com;deepmind.com;google.com;deepmind.com;google.com;;google.com;google.com;google.com;;deepmind.com;;google.com;;", "position": "Research Scientist;Researcher;Researcher;Researcher;PhD student;Researcher;;Research Scientist;Researcher;;Research Scientist;Researcher;Research Scientist;Research Scientist;Research Scientist;;Scientist;Intern;Research Scientist;;Research Scientist;;Principal Scientist;;", "bibtex": "@inproceedings{\nbruce2024genie,\ntitle={Genie: Generative Interactive Environments},\nauthor={Jake Bruce and Michael D Dennis and Ashley Edwards and Jack Parker-Holder and Yuge Shi and Edward Hughes and Matthew Lai and Aditi Mavalankar and Richie Steigerwald and Chris Apps and Yusuf Aytar and Sarah Maria Elisabeth Bechtle and Feryal Behbahani and Stephanie C.Y. Chan and Nicolas Heess and Lucy Gonzalez and Simon Osindero and Sherjil Ozair and Scott Reed and Jingwei Zhang and Konrad Zolna and Jeff Clune and Nando de Freitas and Satinder Singh and Tim Rockt{\\\"a}schel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bJbSbJskOS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3267865, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 25, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=849166493791555392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "deepmind.com;google.com;deepmind.com;google.com;ox.ac.uk;deepmind.com;;google.com;deepmind.com;;google.com;deepmind.com;google.com;deepmind.com;google.com;;google.com;google.com;google.com;;deepmind.com;;google.com;;", "author_num": 25, "aff_unique_index": "0;0;0;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Google;University of Oxford", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.ox.ac.uk", "aff_unique_abbr": "DeepMind;Oxford", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;1;1;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Watermarks in the Sand: Impossibility of Strong Watermarking for Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33645", "id": "bM2s12t4hR", "proceeding": "https://proceedings.mlr.press/v235/zhang24o.html", "pdf": "https://openreview.net/pdf?id=bM2s12t4hR", "openreview": "https://openreview.net/forum?id=bM2s12t4hR", "author_site": "Hanlin Zhang, Benjamin Edelman, Danilo Francati, Daniele Venturi, Giuseppe Ateniese, Boaz Barak", "tldr": "", "abstract": "Watermarking generative models consists of planting a statistical signal 
(watermark) in a model's output so that it can be later verified that the output was generated by the given model. A strong watermarking scheme satisfies the property that a computationally bounded attacker cannot erase the watermark without causing significant quality degradation. In this paper, we study the (im)possibility of strong watermarking schemes. We prove that, under well-specified and natural assumptions, strong watermarking is impossible to achieve. This holds even in the private detection algorithm setting, where the watermark insertion and detection algorithms share a secret key, unknown to the attacker. To prove this result, we introduce a generic efficient watermark attack; the attacker is not required to know the private key of the scheme or even which scheme is used. Our attack is based on two assumptions: (1) The attacker has access to a \"quality oracle\" that can evaluate whether a candidate output is a high-quality response to a prompt, and (2) The attacker has access to a \"perturbation oracle\" which can modify an output with a nontrivial probability of maintaining quality, and which induces an efficiently mixing random walk on high-quality outputs. We argue that both assumptions can be satisfied in practice by an attacker with weaker computational capabilities than the watermarked model itself, to which the attacker has only black-box access. Furthermore, our assumptions will likely only be easier to satisfy over time as models grow in capabilities and modalities. We demonstrate the feasibility of our attack by instantiating it to attack three existing watermarking schemes for large language models: Kirchenbauer et al. (2023), Kuditipudi et al. (2023), and Zhao et al. (2023), and include preliminary results on vision-language models. The same attack successfully removes the watermarks planted by all schemes, with only minor quality degradation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanlin Zhang;Benjamin L. Edelman;Danilo Francati;Daniele Venturi;Giuseppe Ateniese;Boaz Barak", "authorids": "~Hanlin_Zhang1;~Benjamin_L._Edelman1;danilofrancati@gmail.com;venturi@di.uniroma1.it;~Giuseppe_Ateniese1;~Boaz_Barak2", "gender": "M;;;;;M", "homepage": "https://hanlin-zhang.com/;;;;;https://boazbarak.org", "dblp": ";;;;;b/BBarak", "google_scholar": "h5IXxToAAAAJ;;;;;I0fbJ6cAAAAJ", "orcid": "0000-0002-9292-1645;;;;;0000-0002-4053-8927", "linkedin": "hanlin-zhang-931b46143/;;;;;", "or_profile": "~Hanlin_Zhang1;~Benjamin_L._Edelman1;danilofrancati@gmail.com;venturi@di.uniroma1.it;~Giuseppe_Ateniese1;~Boaz_Barak2", "aff": "Harvard University;;;;;Harvard University", "aff_domain": "harvard.edu;;;;;fas.harvard.edu", "position": "PhD student;;;;;Full Professor", "bibtex": "@inproceedings{\nzhang2024watermarks,\ntitle={Watermarks in the Sand: Impossibility of Strong Watermarking for Language Models},\nauthor={Hanlin Zhang and Benjamin L. 
Edelman and Danilo Francati and Daniele Venturi and Giuseppe Ateniese and Boaz Barak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bM2s12t4hR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7407177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6198177462665602911&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "harvard.edu;;;;;fas.harvard.edu", "author_num": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Robust Data-driven Prescriptiveness Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33644", "id": "bNgAdyv7ZP", "proceeding": "https://proceedings.mlr.press/v235/poursoltani24a.html", "pdf": "https://openreview.net/pdf?id=bNgAdyv7ZP", "openreview": "https://openreview.net/forum?id=bNgAdyv7ZP", "author_site": "Mehran Poursoltani, Erick Delage, Angelos Georghiou", "tldr": "", "abstract": "The abundance of data has led to the emergence of a variety of optimization techniques that attempt to leverage available side information to provide more anticipative decisions. The wide range of methods and contexts of application has motivated the design of a universal unitless measure of performance known as the coefficient of prescriptiveness. This coefficient was designed to quantify both the quality of contextual decisions compared to a reference one and the prescriptive power of side information. To identify policies that maximize the former in a data-driven context, this paper introduces a distributionally robust contextual optimization model where the coefficient of prescriptiveness substitutes for the classical empirical risk minimization objective. We present a bisection algorithm to solve this model, which relies on solving a series of linear programs when the distributional ambiguity set has an appropriate nested form and polyhedral structure.
Studying a contextual shortest path problem, we evaluate the robustness of the resulting policies against alternative methods when the out-of-sample dataset is subject to varying amounts of distribution shift.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mehran Poursoltani;Erick Delage;Angelos Georghiou", "authorids": "~Mehran_Poursoltani1;~Erick_Delage2;~Angelos_Georghiou1", "gender": ";M;M", "homepage": ";http://web.hec.ca/pages/erick.delage/;https://www.ucy.ac.cy/directory/en/profile/ageorg07", "dblp": ";26/1546;45/10289", "google_scholar": ";https://scholar.google.ca/citations?user=ciH2ROgAAAAJ;https://scholar.google.ca/citations?hl=en", "orcid": ";0000-0002-6740-3600;0000-0003-4490-4020", "linkedin": ";erick-delage-2105361/?originalSubdomain=ca;", "or_profile": "~Mehran_Poursoltani1;~Erick_Delage2;~Angelos_Georghiou1", "aff": ";Computer Science Department;University of Cyprus", "aff_domain": ";cs.stanford.edu;ucy.ac.cy", "position": ";Researcher;Assistant Professor", "bibtex": "@inproceedings{\npoursoltani2024robust,\ntitle={Robust Data-driven Prescriptiveness Optimization},\nauthor={Mehran Poursoltani and Erick Delage and Angelos Georghiou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bNgAdyv7ZP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 933916, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17227617100252540409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": ";cs.stanford.edu;ucy.ac.cy", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Computer Science Department;University of Cyprus", "aff_unique_dep": "Computer Science;", "aff_unique_url": ";https://www.ucy.ac.cy", "aff_unique_abbr": ";UCY", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";Cyprus" }, { "title": "Modular Learning of Deep Causal Generative Models for High-dimensional Causal Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33643", "id": "bOhzU7NpTB", "proceeding": "https://proceedings.mlr.press/v235/rahman24a.html", "pdf": "https://openreview.net/pdf?id=bOhzU7NpTB", "openreview": "https://openreview.net/forum?id=bOhzU7NpTB", "author_site": "Md Musfiqur Rahman, Murat Kocaoglu", "tldr": "", "abstract": "Sound and complete algorithms have been proposed to compute identifiable causal queries using the causal structure and data. However, most of these algorithms assume accurate estimation of the data distribution, which is impractical for high-dimensional variables such as images. On the other hand, modern deep generative architectures can be trained to sample from high-dimensional distributions. However, training these networks is typically very costly. Thus, it is desirable to leverage pre-trained models to answer causal queries using such high-dimensional data. To address this, we propose modular training of deep causal generative models that not only makes learning more efficient, but also allows us to utilize large, pre-trained conditional generative models.
To the best of our knowledge, our algorithm, Modular-DCM, is the first algorithm that, given the causal structure, uses adversarial training to learn the network weights, and can make use of pre-trained models to provably sample from any identifiable causal query in the presence of latent confounders. With extensive experiments on the Colored-MNIST dataset, we demonstrate that our algorithm outperforms the baselines. We also show our algorithm's convergence on the COVIDx dataset and its utility with a causal invariant prediction problem on CelebA-HQ.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Md Musfiqur Rahman;Murat Kocaoglu", "authorids": "~Md_Musfiqur_Rahman1;~Murat_Kocaoglu1", "gender": "M;M", "homepage": "https://sites.google.com/view/musfiqshohan/home;https://www.muratkocaoglu.com", "dblp": "249/2369;74/11343", "google_scholar": "vMGENI8AAAAJ;7N7bzdwAAAAJ", "orcid": ";", "linkedin": "md-musfiqur-rahman-861b58150/;mkocaoglu/", "or_profile": "~Md_Musfiqur_Rahman1;~Murat_Kocaoglu1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nrahman2024modular,\ntitle={Modular Learning of Deep Causal Generative Models for High-dimensional Causal Inference},\nauthor={Md Musfiqur Rahman and Murat Kocaoglu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bOhzU7NpTB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8869578, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17563728587187365091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "purdue.edu;purdue.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Graph-Triggered Rising Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33642", "id": "bPsohGR6gD", "proceeding": "https://proceedings.mlr.press/v235/genalti24a.html", "pdf": "https://openreview.net/pdf?id=bPsohGR6gD", "openreview": "https://openreview.net/forum?id=bPsohGR6gD", "author_site": "Gianmarco Genalti, Marco Mussi, Nicola Gatti, Marcello Restelli, Matteo Castiglioni, Alberto Maria Metelli", "tldr": "", "abstract": "In this paper, we propose a novel generalization of rested and restless bandits where the evolution of the arms' expected rewards is governed by a graph defined over the arms. An edge connecting a pair of arms $(i,j)$ represents the fact that a pull of arm $i$ *triggers* the evolution of arm $j$, and vice versa. Interestingly, rested and restless bandits are both special cases of our model for some suitable (degenerate) graphs. Still, the model can represent far more general and interesting scenarios. We first tackle the problem of computing the optimal policy when no specific structure is assumed on the graph, showing that it is NP-hard. Then, we focus on a specific structure forcing the graph to be composed of a set of fully connected subgraphs (i.e., cliques), and we prove that the optimal policy can be easily computed in closed form.
We then move to the learning problem, presenting regret minimization algorithms for the deterministic and stochastic cases. Our regret bounds highlight the complexity of the learning problem by incorporating instance-dependent terms that encode specific properties of the underlying graph structure. Moreover, we illustrate how the knowledge of the underlying graph is not necessary for achieving the no-regret property.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gianmarco Genalti;Marco Mussi;Nicola Gatti;Marcello Restelli;Matteo Castiglioni;Alberto Maria Metelli", "authorids": "~Gianmarco_Genalti1;~Marco_Mussi1;~Nicola_Gatti1;~Marcello_Restelli1;~Matteo_Castiglioni1;~Alberto_Maria_Metelli2", "gender": "Not Specified;M;M;M;;M", "homepage": ";https://marcomussi.github.io/;https://www4.ceda.polimi.it/manifesti/manifesti/controller/ricerche/RicercaPerDocentiPublic.do?k_doc=75785&lang=EN&EVN_PRODOTTI=evento&__pj0=0&__pj1=d918ee8916afbd0005f5c0bc3c0ff350;http://home.deib.polimi.it/restelli/;https://castiglionimatteo.github.io;https://albertometelli.github.io/", "dblp": ";321/0756;g/NicolaGatti;64/1011;225/7720;209/4941", "google_scholar": "b4UMI8kAAAAJ;3gca-JUAAAAJ;https://scholar.google.com.tw/citations?user=j-HrYREAAAAJ;https://scholar.google.com.tw/citations?user=xdgxRiEAAAAJ;https://scholar.google.it/citations?user=NPE3HAYAAAAJ;R31IsPwAAAAJ", "orcid": ";0000-0001-8356-6744;0000-0001-7349-3932;0000-0002-6322-1076;0000-0002-1070-6766;0000-0002-3424-5212", "linkedin": "gianmarco-genalti-26328a1a4/;marcomussi95/;nicola-gatti-1284b21;;;", "or_profile": "~Gianmarco_Genalti1;~Marco_Mussi1;~Nicola_Gatti1;~Marcello_Restelli1;~Matteo_Castiglioni1;~Alberto_Maria_Metelli2", "aff": "Polytechnic Institute of Milan;Politecnico di Milano;Polytechnic Institute of Milan;Politecnico di Milano;Politecnico di Milano;Politecnico di Milano", "aff_domain": "polimi.it;polimi.it;polimi.it;polimi.it;polimi.it;polimi.it", "position": "PhD student;PhD student;Full Professor;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ngenalti2024graphtriggered,\ntitle={Graph-Triggered Rising Bandits},\nauthor={Gianmarco Genalti and Marco Mussi and Nicola Gatti and Marcello Restelli and Matteo Castiglioni and Alberto Maria Metelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bPsohGR6gD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 794926, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15523011310207254508&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "polimi.it;polimi.it;polimi.it;polimi.it;polimi.it;polimi.it", "author_num": 6, "aff_unique_index": "0;1;0;1;1;1", "aff_unique_norm": "Polytechnic Institute of Milan;Politecnico di Milano", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it/;https://www.polimi.it", "aff_unique_abbr": "Politecnico di Milano;Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Differentiable Model Scaling using Differentiable Topk", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33641", "id": "bULHOW1RXM", "proceeding": "https://proceedings.mlr.press/v235/liu24bi.html", "pdf": "https://openreview.net/pdf?id=bULHOW1RXM", "openreview": "https://openreview.net/forum?id=bULHOW1RXM",
"author_site": "Kai Liu, Ruohui Wang, Jianfei Gao, Kai Chen", "tldr": "", "abstract": "Over the past few years, as large language models have ushered in an era of intelligence emergence, there has been an intensified focus on scaling networks. Although Neural Architecture Search (NAS) methods have been proposed to automate this process, they suffer from low search efficiency. This study introduces Differentiable Model Scaling (DMS), increasing the efficiency for searching optimal width and depth in networks. DMS can model both width and depth in a direct and fully differentiable way, making it easy to optimize. We have evaluated our DMS across diverse tasks, ranging from vision tasks to NLP tasks and various network architectures, including CNNs and Transformers. Results consistently indicate that our DMS can find improved structures and outperforms state-of-the-art NAS methods. Specifically, for image classification on ImageNet, our DMS improves the top-1 accuracy of EfficientNet-B0 and Deit-Tiny by 1.4% and 0.6%, respectively, and outperforms the state-of-the-art zero-shot NAS method, ZiCo, by 1.3% while requiring only 0.4 GPU days for searching. For object detection on COCO, DMS improves the mAP of Yolo-v8-n by 2.0%. For language modeling, our pruned Llama-7B outperforms the prior method with lower perplexity and higher zero-shot classification accuracy. Our code is available at https://github.com/LKJacky/Differentiable-Model-Scaling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Liu;Ruohui Wang;Jianfei Gao;Kai Chen", "authorids": "~Kai_Liu16;~Ruohui_Wang3;~Jianfei_Gao1;~Kai_Chen4", "gender": ";;M;M", "homepage": "https://github.com/LKJacky;;https://chenkai.site/;https://github.com/pppppM", "dblp": ";;181/2839-26;", "google_scholar": "https://scholar.google.com.hk/citations?user=b6NWTbAAAAAJ;G-9-nIYAAAAJ;https://scholar.google.com.hk/citations?user=eGD0b7IAAAAJ;", "orcid": ";;0000-0002-6820-2325;", "linkedin": ";;;", "or_profile": "~Kai_Liu16;~Ruohui_Wang3;~Kai_Chen4;~Gao_Jianfei1", "aff": "Shanghai Artificial Intelligence Laboratory;shanghai ai laboratory;Shanghai AI Laboratory;Shanghai Artificial Intelligence Laboratory", "aff_domain": "pjlab.org.cn;pjlab.org.cn;pjlab.org.cn;shlab.org.cn", "position": "Researcher;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nliu2024differentiable,\ntitle={Differentiable Model Scaling using Differentiable Topk},\nauthor={Kai Liu and Ruohui Wang and Jianfei Gao and Kai Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bULHOW1RXM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 782065, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6120393179649329179&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "pjlab.org.cn;pjlab.org.cn;pjlab.org.cn;shlab.org.cn", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Shanghai AI Laboratory", "aff_unique_dep": ";", "aff_unique_url": "http://www.shailab.org/;", "aff_unique_abbr": "Shanghai AI Lab;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Self-Infilling Code Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33640", "id": "bV9yT24t9B", "proceeding": 
"https://proceedings.mlr.press/v235/zheng24o.html", "pdf": "https://openreview.net/pdf?id=bV9yT24t9B", "openreview": "https://openreview.net/forum?id=bV9yT24t9B", "author_site": "Lin Zheng, Jianbo Yuan, Zhi Zhang, Hongxia Yang, Lingpeng Kong", "tldr": "", "abstract": "In this work, we introduce self-infilling code generation, a general framework that incorporates infilling operations into auto-regressive decoding. Our approach capitalizes on the observation that recent infilling-capable code language models can perform self-infilling: whereas conventional infilling is designed to fill in the middle based on a predefined prefix and suffix, self-infilling sequentially generates both such surrounding context and the infilled content. We utilize self-infilling to introduce novel interruption and looping mechanisms in conventional decoding, evolving it into a non-monotonic process. Interruptions allow for postponing the generation of specific code until a definitive suffix is established, enhancing control during decoding. Meanwhile, the looping mechanism, which leverages the complementary nature of self-infilling and left-to-right decoding, can iteratively update and synchronize each piece of generation cyclically. Extensive experiments across a variety of code generation benchmarks demonstrate that decoding with self-infilling not only improves the output quality but also regularizes the overall generation, which effectively mitigates potential degeneration and scaffolds code to be more consistent with intended functionality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lin Zheng;Jianbo Yuan;Zhi Zhang;Hongxia Yang;Lingpeng Kong", "authorids": "~Lin_Zheng1;~Jianbo_Yuan1;~Zhi_Zhang4;~Hongxia_Yang2;~Lingpeng_Kong1", "gender": "M;M;M;F;M", "homepage": "https://lzhengisme.github.io/;;https://zhreshold.github.io;https://www4.comp.polyu.edu.hk/~hongxyang/;https://ikekonglp.github.io/", "dblp": ";134/6790;;;144/7656", "google_scholar": "3NXH0t8AAAAJ;https://scholar.google.com/citations?hl=en;nZr0oXQAAAAJ;iJlC5mMAAAAJ;f1hBi5wAAAAJ", "orcid": ";;0000-0003-0249-1678;;", "linkedin": ";;;;", "or_profile": "~Lin_Zheng1;~Jianbo_Yuan1;~Zhi_Zhang4;~Hongxia_Yang2;~Lingpeng_Kong1", "aff": "The University of Hong Kong;Bytedance;ByteDance Inc.;ByteDance Inc.;Department of Computer Science, The University of Hong Kong", "aff_domain": "hku.hk;bytedance.com;bytedance.com;bytedance.com;cs.hku.hk", "position": "PhD student;Researcher;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzheng2024selfinfilling,\ntitle={Self-Infilling Code Generation},\nauthor={Lin Zheng and Jianbo Yuan and Zhi Zhang and Hongxia Yang and Lingpeng Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bV9yT24t9B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 513594, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12998630015459338001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "hku.hk;bytedance.com;bytedance.com;bytedance.com;cs.hku.hk", "author_num": 5, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "University of Hong Kong;ByteDance", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.bytedance.com", "aff_unique_abbr": "HKU;Bytedance", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": 
"0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Controlled Decoding from Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33639", "id": "bVIcZb7Qa0", "proceeding": "https://proceedings.mlr.press/v235/mudgal24a.html", "pdf": "https://openreview.net/pdf?id=bVIcZb7Qa0", "openreview": "https://openreview.net/forum?id=bVIcZb7Qa0", "author_site": "Sidharth Mudgal, Jong Lee, Harish Ganapathy, YaGuang Li, Tao Wang, Yanping Huang, Zhifeng Chen, Heng-Tze Cheng, Michael Collins, Trevor Strohman, Jilin Chen, Alex Beutel, Ahmad Beirami", "tldr": "", "abstract": "KL-regularized reinforcement learning (RL) is a popular alignment framework to control the language model responses towards high reward outcomes. We pose a tokenwise RL objective and propose a modular solver for it, called *controlled decoding (CD)*. CD exerts control through a separate *prefix scorer* module, which is trained to learn a value function for the reward. The prefix scorer is used at inference time to control the generation from a frozen base model, provably sampling from a solution to the RL objective. We empirically demonstrate that CD is effective as a control mechanism on popular benchmarks. We also show that prefix scorers for multiple rewards may be combined at inference time, effectively solving a multi-objective RL problem with no additional training. We show that the benefits of applying CD transfer to an unseen base model with no further tuning as well. Finally, we show that CD can be applied in a blockwise decoding fashion at inference-time, essentially bridging the gap between the popular best-of-$K$ strategy and tokenwise control through reinforcement learning. This makes CD a promising approach for alignment of language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sidharth Mudgal;Jong Lee;Harish Ganapathy;YaGuang Li;Tao Wang;Yanping Huang;Zhifeng Chen;Heng-Tze Cheng;Michael Collins;Trevor Strohman;Jilin Chen;Alex Beutel;Ahmad Beirami", "authorids": "~Sidharth_Mudgal1;~Jong_Lee2;~Harish_Ganapathy1;~YaGuang_Li2;~Tao_Wang30;~Yanping_Huang1;~Zhifeng_Chen1;~Heng-Tze_Cheng1;~Michael_Collins1;~Trevor_Strohman1;~Jilin_Chen1;~Alex_Beutel1;~Ahmad_Beirami1", "gender": "M;;;;;M;M;;M;;;;M", "homepage": ";;;;;;;https://www.linkedin.com/in/hengtze;http://www.cs.columbia.edu/~mcollins;https://research.google/people/TrevorStrohman;;;https://beirami.github.io/", "dblp": "150/5903;;;;;00/10104;61/5154;30/8739;29/1340;22/5377;50/6953;;41/9367", "google_scholar": "pVcZzXkAAAAJ;;jdFVqmgAAAAJ;;;uEtBQScAAAAJ;;;DxoenfgAAAAJ;0wJT4H8AAAAJ;;;VuKWbMMAAAAJ", "orcid": ";;;;;;;;;;;;", "linkedin": ";jongleeee;;;;;;;;;;;ahmad-beirami-97001962", "or_profile": "~Sidharth_Mudgal1;~Jong_Lee2;~Harish_Ganapathy1;~YaGuang_Li2;~Tao_Wang30;~Yanping_Huang1;~Zhifeng_Chen1;~Heng-Tze_Cheng1;~Michael_Collins1;~Trevor_Strohman1;~Jilin_Chen1;~Alex_Beutel1;~Ahmad_Beirami1", "aff": "Google;Research, Google;Google;;;Google;Google;;Columbia University;Google;Google;;Massachusetts Institute of Technology", "aff_domain": "google.com;research.google.com;google.com;;;google.com;google.com;;columbia.edu;google.com;google.com;;mit.edu", "position": "Software Engineer;Researcher;Software Engineer;;;Engineer;Engineer;;Full Professor;Researcher;Researcher;;Research Affiliate", "bibtex": "@inproceedings{\nmudgal2024controlled,\ntitle={Controlled Decoding from Language Models},\nauthor={Sidharth Mudgal and Jong Lee and Harish Ganapathy and YaGuang Li and Tao Wang and Yanping Huang 
and Zhifeng Chen and Heng-Tze Cheng and Michael Collins and Trevor Strohman and Jilin Chen and Alex Beutel and Ahmad Beirami},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bVIcZb7Qa0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 988062, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1469069424984404642&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "google.com;research.google.com;google.com;;;google.com;google.com;;columbia.edu;google.com;google.com;;mit.edu", "author_num": 13, "aff_unique_index": "0;0;0;0;0;1;0;0;2", "aff_unique_norm": "Google;Columbia University;Massachusetts Institute of Technology", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.columbia.edu;https://web.mit.edu", "aff_unique_abbr": "Google;Columbia;MIT", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Preference Fine-Tuning of LLMs Should Leverage Suboptimal, On-Policy Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33638", "id": "bWNPx6t0sF", "proceeding": "https://proceedings.mlr.press/v235/tajwar24a.html", "pdf": "https://openreview.net/pdf?id=bWNPx6t0sF", "openreview": "https://openreview.net/forum?id=bWNPx6t0sF", "author_site": "Fahim Tajwar, Anikait Singh, Archit Sharma, Rafael Rafailov, Jeff Schneider, Tengyang Xie, Stefano Ermon, Chelsea Finn, Aviral Kumar", "tldr": "", "abstract": "Learning from preference labels plays a crucial role in fine-tuning large language models --- this is done via supervised learning, on-policy reinforcement learning (RL), or contrastive learning. Different methods come with different implementation tradeoffs, and existing empirical findings present different conclusions, for instance, some results show that online RL is quite important to attain good fine-tuning results, while others find offline methods sufficient. This raises a question: **what kind of approaches are important for fine-tuning with preference data and why?** In this paper, we answer this question by performing a rigorous analysis of a number of fine-tuning techniques on didactic and full-scale LLM problems. Our main finding is that approaches that use on-policy sampling and attempt to push down the likelihood on certain responses (i.e., employ a ''negative gradient'') outperform offline and maximum likelihood objectives. We conceptualize our insights and unify methods that use on-policy sampling or negative gradient under a notion of mode-seeking objectives for categorical distributions. Mode-seeking objectives are able to alter probability mass on specific bins of a categorical distribution at a fast rate compared to maximum likelihood, allowing them to relocate masses across bins more effectively. 
Our analysis prescribes actionable insights for preference fine-tuning of LLMs and informs how data should be collected for maximal improvement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fahim Tajwar;Anikait Singh;Archit Sharma;Rafael Rafailov;Jeff Schneider;Tengyang Xie;Stefano Ermon;Chelsea Finn;Aviral Kumar", "authorids": "~Fahim_Tajwar1;~Anikait_Singh1;~Archit_Sharma1;~Rafael_Rafailov1;~Jeff_Schneider1;~Tengyang_Xie1;~Stefano_Ermon1;~Chelsea_Finn1;~Aviral_Kumar2", "gender": "M;M;M;M;;;M;F;M", "homepage": "https://tajwarfahim.github.io/;https://asap7772.github.io/;;https://rmrafailov.github.io/;https://www.cs.cmu.edu/~schneide;https://tengyangxie.github.io/;http://cs.stanford.edu/~ermon/;https://ai.stanford.edu/~cbfinn/;https://aviralkumar2907.github.io/", "dblp": "292/1504;302/3876;220/3163.html;272/5358;38/247;227/3335;47/8135;131/1783;202/7961", "google_scholar": "iMlmLO4AAAAJ;lPaISmIAAAAJ;_0IIzxgAAAAJ;TwABcRgAAAAJ;3bSbb20AAAAJ;rlmROVsAAAAJ;;vfPE6hgAAAAJ;", "orcid": "0000-0001-9257-6282;;;;0000-0002-5080-9073;;;;", "linkedin": "fahim-tajwar-8a5377162/;asap7772/;;;jeff-schneider-1593b322/;;;;", "or_profile": "~Fahim_Tajwar1;~Anikait_Singh1;~Archit_Sharma1;~Rafael_Rafailov1;~Jeff_Schneider1;~Tengyang_Xie1;~Stefano_Ermon1;~Chelsea_Finn1;~Aviral_Kumar2", "aff": "Carnegie Mellon University;Stanford University;Stanford University;Stanford University;Carnegie Mellon University;Microsoft Research, New England & NYC;Stanford University;Google;Google DeepMind", "aff_domain": "andrew.cmu.edu;stanford.edu;stanford.edu;stanford.edu;cs.cmu.edu;microsoft.com;stanford.edu;google.com;google.com", "position": "PhD student;PhD student;Graduate Student;PhD student;Researcher;Postdoc;Associate Professor;Research Scientist;Researcher", "bibtex": "@inproceedings{\ntajwar2024preference,\ntitle={Preference Fine-Tuning of {LLM}s Should Leverage Suboptimal, On-Policy Data},\nauthor={Fahim Tajwar and Anikait Singh and Archit Sharma and Rafael Rafailov and Jeff Schneider and Tengyang Xie and Stefano Ermon and Chelsea Finn and Aviral Kumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bWNPx6t0sF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9615489, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15674372511535195634&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "andrew.cmu.edu;stanford.edu;stanford.edu;stanford.edu;cs.cmu.edu;microsoft.com;stanford.edu;google.com;google.com", "author_num": 9, "aff_unique_index": "0;1;1;1;0;2;1;3;3", "aff_unique_norm": "Carnegie Mellon University;Stanford University;Microsoft;Google", "aff_unique_dep": ";;Microsoft Research;Google", "aff_unique_url": "https://www.cmu.edu;https://www.stanford.edu;https://www.microsoft.com/en-us/research;https://www.google.com", "aff_unique_abbr": "CMU;Stanford;MSR;Google", "aff_campus_unique_index": "1;1;1;2;1;3", "aff_campus_unique": ";Stanford;New England;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Position: TrustLLM: Trustworthiness in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33637", "id": "bWUU0LwwMp", "proceeding": "https://proceedings.mlr.press/v235/huang24x.html", "pdf": "https://openreview.net/pdf?id=bWUU0LwwMp", "openreview": 
"https://openreview.net/forum?id=bWUU0LwwMp", "author_site": "Yue Huang, Lichao Sun, Haoran Wang, Siyuan Wu, Qihui Zhang, Yuan Li, Chujie Gao, Yixin Huang, Wenhan Lyu, Yixuan Zhang, Xiner Li, Hanchi Sun, Zhengliang Liu, Yixin Liu, Yijue Wang, Zhikun Zhang, Bertie Vidgen, Bhavya Kailkhura, Caiming Xiong, Chaowei Xiao, Chunyuan Li, Eric Xing, Furong Huang, Hao Liu, Heng Ji, Hongyi Wang, Huan Zhang, Huaxiu Yao, Manolis Kellis, Marinka Zitnik, Meng Jiang, Mohit Bansal, James Zou, Jian Pei, Jian Liu, Jianfeng Gao, Jiawei Han, Jieyu Zhao, Jiliang Tang, Jindong Wang, Joaquin Vanschoren, John Mitchell, Kai Shu, Kaidi Xu, Kai-Wei Chang, Lifang He, Lifu Huang, Michael Backes, Neil Gong, Philip Yu, Pin-Yu Chen, Quanquan Gu, Ran Xu, ZHITAO YING, Shuiwang Ji, Suman Jana, Tianlong Chen, Tianming Liu, Tianyi Zhou, William Wang, Xiang Li, Xiangliang Zhang, Xiao Wang, Xing Xie, Xun Chen, Xuyu Wang, Yan Liu, Yanfang Ye, Yinzhi Cao, Yong Chen, Yue Zhao", "tldr": "", "abstract": "Large language models (LLMs) have gained considerable attention for their excellent natural language processing capabilities. Nonetheless, these LLMs present many challenges, particularly in the realm of trustworthiness. This paper introduces TrustLLM, a comprehensive study of trustworthiness in LLMs, including principles for different dimensions of trustworthiness, established benchmark, evaluation, and analysis of trustworthiness for mainstream LLMs, and discussion of open challenges and future directions. Specifically, we first propose a set of principles for trustworthy LLMs that span eight different dimensions. Based on these principles, we further establish a benchmark across six dimensions including truthfulness, safety, fairness, robustness, privacy, and machine ethics. We then present a study evaluating 16 mainstream LLMs in TrustLLM, consisting of over 30 datasets. Our findings firstly show that in general trustworthiness and capability (i.e., functional effectiveness) are positively related. Secondly, our observations reveal that proprietary LLMs generally outperform most open-source counterparts in terms of trustworthiness, raising concerns about the potential risks of widely accessible open-source LLMs. However, a few open-source LLMs come very close to proprietary ones, suggesting that open-source models can achieve high levels of trustworthiness without additional mechanisms like *moderator*, offering valuable insights for developers in this field. Thirdly, it is important to note that some LLMs may be overly calibrated towards exhibiting trustworthiness, to the extent that they compromise their utility by mistakenly treating benign prompts as harmful and consequently not responding. Besides these observations, we've uncovered key insights into the multifaceted trustworthiness in LLMs. We emphasize the importance of ensuring transparency not only in the models themselves but also in the technologies that underpin trustworthiness. We advocate that the establishment of an AI alliance between industry, academia, the open-source community to foster collaboration is imperative to advance the trustworthiness of LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Huang;Lichao Sun;Haoran Wang;Siyuan Wu;Qihui Zhang;Yuan Li;Chujie Gao;Yixin Huang;Wenhan Lyu;Yixuan Zhang;Xiner Li;Hanchi Sun;Zhengliang Liu;Yixin Liu;Yijue Wang;Zhikun Zhang;Bertie Vidgen;Bhavya Kailkhura;Caiming Xiong;Chaowei Xiao;Chunyuan Li;Eric P. 
Xing;Furong Huang;Hao Liu;Heng Ji;Hongyi Wang;Huan Zhang;Huaxiu Yao;Manolis Kellis;Marinka Zitnik;Meng Jiang;Mohit Bansal;James Zou;Jian Pei;Jian Liu;Jianfeng Gao;Jiawei Han;Jieyu Zhao;Jiliang Tang;Jindong Wang;Joaquin Vanschoren;John Mitchell;Kai Shu;Kaidi Xu;Kai-Wei Chang;Lifang He;Lifu Huang;Michael Backes;Neil Zhenqiang Gong;Philip S. Yu;Pin-Yu Chen;Quanquan Gu;Ran Xu;Rex Ying;Shuiwang Ji;Suman Jana;Tianlong Chen;Tianming Liu;Tianyi Zhou;William Yang Wang;Xiang Li;Xiangliang Zhang;Xiao Wang;Xing Xie;Xun Chen;Xuyu Wang;Yan Liu;Yanfang Ye;Yinzhi Cao;Yong Chen;Yue Zhao", "authorids": "~Yue_Huang9;~Lichao_Sun1;~Haoran_Wang12;~Siyuan_Wu6;~Qihui_Zhang1;~Yuan_Li18;~Chujie_Gao1;~Yixin_Huang2;~Wenhan_Lyu1;~Yixuan_Zhang7;~Xiner_Li1;~Hanchi_Sun2;~Zhengliang_Liu1;~Yixin_Liu4;~Yijue_Wang1;~Zhikun_Zhang2;~Bertie_Vidgen1;~Bhavya_Kailkhura1;~Caiming_Xiong1;~Chaowei_Xiao2;~Chunyuan_Li1;~Eric_Xing1;~Furong_Huang1;~Hao_Liu1;~Heng_Ji3;~Hongyi_Wang1;~Huan_Zhang1;~Huaxiu_Yao1;~Manolis_Kellis1;~Marinka_Zitnik1;~Meng_Jiang3;~Mohit_Bansal2;~James_Zou1;~Jian_Pei1;~Jian_Liu12;~Jianfeng_Gao1;~Jiawei_Han1;~Jieyu_Zhao1;~Jiliang_Tang1;~Jindong_Wang1;~Joaquin_Vanschoren1;~John_Mitchell1;~Kai_Shu1;~Kaidi_Xu1;~Kai-Wei_Chang1;~Lifang_He1;~Lifu_Huang1;~Michael_Backes3;~Neil_Zhenqiang_Gong1;~Philip_S._Yu1;~Pin-Yu_Chen1;~Quanquan_Gu1;~Ran_Xu1;~Zhitao_Ying1;~Shuiwang_Ji1;~Suman_Jana1;~Tianlong_Chen1;~Tianming_Liu3;~Tianyi_Zhou1;~William_Yang_Wang2;~Xiang_Li14;~Xiangliang_Zhang1;~Xiao_Wang11;~Xing_Xie3;~Xun_Chen1;~Xuyu_Wang1;~Yan_Liu1;~Yanfang_Ye1;~Yinzhi_Cao1;~Yong_Chen9;~Yue_Zhao13", "gender": ";M;M;;M;M;F;;;F;F;M;M;;M;M;M;M;M;;;M;F;;F;M;M;M;M;;M;M;;;M;M;M;F;M;;M;M;;M;M;F;M;;;M;M;M;M;M;M;M;M;M;M;;M;F;M;M;;M;F;;M;M;M", "homepage": ";https://lichao-sun.github.io/;https://haoranwang18.github.io/;https://github.com/nauyisu022;https://github.com/Mask-Hui;;;;https://www.wenhanlyu.com/;https://zjanice.github.io/;;;;;;http://zhangzhk.com/;https://www.turing.ac.uk/people/researchers/bertie-vidgen;https://people.llnl.gov/kailkhura1;http://cmxiong.com/;;http://chunyuan.li/;http://www.cs.cmu.edu/~epxing/;https://furong-huang.com;;http://blender.cs.illinois.edu/hengji.html;https://hwang595.github.io/;http://huan-zhang.com;http://huaxiuyao.mystrikingly.com;http://compbio.mit.edu;https://zitniklab.hms.harvard.edu;http://www.meng-jiang.com/;https://www.cs.unc.edu/~mbansal/;;;https://web.eecs.utk.edu/~jliu/;https://www.microsoft.com/en-us/research/people/jfgao/;http://hanj.cs.illinois.edu/;http://jyzhao.net/;https://www.cse.msu.edu/~tangjili/;;http://www.win.tue.nl/~jvanscho/;https://profiles.stanford.edu/john-mitchell;https://www.cs.emory.edu/~kshu5/;https://kaidixu.com/;http://kwchang.net;https://engineering.lehigh.edu/faculty/lifang-he;https://wilburone.github.io/;;;https://cs.uic.edu/profiles/philip-yu/;http://www.pinyuchen.com;http://web.cs.ucla.edu/~qgu/;;https://www.cs.yale.edu/homes/ying-rex;http://people.tamu.edu/~sji;http://sumanj.info;https://tianlong-chen.github.io;https://cobweb.cs.uga.edu/~tliu/;https://tianyizhou.github.io/;;https://xiangli-shaun.github.io/;https://sites.nd.edu/xiangliang-zhang/;https://wangxiao1254.github.io/;http://research.microsoft.com/en-us/people/xingx/;;https://users.cs.fiu.edu/~xuywang/;http://www-bcf.usc.edu/~liu32/;http://yes-lab.org/;http://yinzhicao.org/;https://penncil.med.upenn.edu/about-pi/;https://viterbi-web.usc.edu/~yzhao010/", "dblp": 
";121/0780-1.html;;44/3983-1;160/4750;;366/6075;;;;267/6459;;;;173/2751;90/545-1.html;;132/8938;80/7282;;64/9590;36/3855;72/8513;;;15/832-1.html;23/1797-1.html;197/1635;75/2690.html;53/11277.html;69/339-1;32/5243.html;;;35/295-1;92/5339;h/JiaweiHan.html;59/2379-1;64/10812;;85/5045;m/JohnCMitchell;153/5265;195/8175;18/2428;129/8146;127/0072;;;y/PhilipSYu;39/8969;50/4597;;209/4936;84/6405;74/28;;96/5013-1.html;88/8205-1;;;74/1890-1;150/9413;08/6809-1;;27/10801;150/4295;;28/8733.html;67/6351-16.html;48/76-16", "google_scholar": ";WhGUE7AAAAAJ;aEuLcokAAAAJ;v8qD1HsAAAAJ;;;1AqAngQAAAAJ;;CceUXEYAAAAJ;NJJJ45AAAAAJ;bBQx_5MAAAAJ;;p8tAM0AAAAAJ;;RqhSccwAAAAJ;-GFAhOEAAAAJ;https://scholar.google.co.uk/citations?user=yRhnVoIAAAAJ;SQpJmOgAAAAJ;vaSdahkAAAAJ;;Zd7WmXUAAAAJ;https://scholar.google.com.tw/citations?user=5pKTRxEAAAAJ;13yyuCcAAAAJ;;z7GCqT4AAAAJ;zYdZORsAAAAJ;LTa3GzEAAAAJ;A20BZnQAAAAJ;lsYXBx8AAAAJ;YtUDgPIAAAAJ;LZIPfCkAAAAJ;DN8QtscAAAAJ;23ZXZvEAAAAJ;;Ckfl4_AAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;9VaGBCQAAAAJ;WtzKMWAAAAAJ;;HhDsD9UAAAAJ;https://scholar.google.com.tw/citations?user=1_kJPIEAAAAJ;-6bAV2cAAAAJ;lYK0wlsAAAAJ;fqDBtzYAAAAJ;obgTcyoAAAAJ;76IEGtYAAAAJ;;;D0lL1r0AAAAJ;jxwlCUUAAAAJ;GU9HgNAAAAAJ;sgBB2sUAAAAJ;6fqNXooAAAAJ;BZGj6sAAAAAJ;https://scholar.google.com.tw/citations?user=SDY9FwUAAAAJ;LE3ctn0AAAAJ;92RPXm0AAAAJ;OKvgizMAAAAJ;;MjkwwiQAAAAJ;BhRJe4wAAAAJ;QbWLR8QAAAAJ;5EQfAFIAAAAJ;;f2liGLoAAAAJ;UUKLPMYAAAAJ;egjr888AAAAJ;0jBP_aEAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=zoGDYsoAAAAJ", "orcid": ";;0000-0002-5787-3131;;;;;;;;;;0000-0001-7061-6714;;;;;;;;;;;;;;;;0000-0001-7113-9630;;0000-0002-3009-519X;;;;0000-0002-3239-1080;;0000-0002-3629-2696;;0000-0001-7125-3898;;0000-0001-7044-9805;0000-0002-0024-860X;;;0000-0001-5365-0072;0000-0001-7810-9071;;;;0000-0002-3491-5968;0000-0003-1039-8369;;;;0000-0002-4205-4563;;0000-0001-7774-8197;;0000-0001-5348-0632;;0000-0002-9851-6376;0000-0002-3574-5665;;0000-0002-8608-8482;;;0000-0002-7055-9518;;;;0000-0003-3401-4921", "linkedin": ";lichao-sun-b273a290/;haoran-wang-165236209/;;;yuan-li-087605195/;;;lyu-wenhan/;;;hanchi-ryan-sun-33561722b/;;;yijuewang/;;bertie-vidgen-001/;;caiming-xiong-150a1417;;;;;;;hongyi-wang-b89651102/;;huaxiuyao/;manolis-kellis;;meng-jiang-94b10916/;;;;;;;;;;;john-mitchell-36338a8/;;;kai-wei-chang-41239040;;;;;;pin-yu-chen-940062a2;;;rex-ying-92770148/;shuiwang-ji-9a040715/;;tianlong-chen-783862167/;;tianyizhou;;xiang-shaun-li-11b2b99/;;;xingx/;;;;;;;yzhao062/", "or_profile": 
"~Yue_Huang9;~Lichao_Sun1;~Haoran_Wang12;~Siyuan_Wu6;~Qihui_Zhang1;~Yuan_Li18;~Chujie_Gao1;~Yixin_Huang2;~Wenhan_Lyu1;~Yixuan_Zhang7;~Xiner_Li1;~Hanchi_Sun2;~Zhengliang_Liu1;~Yixin_Liu4;~Yijue_Wang1;~Zhikun_Zhang2;~Bertie_Vidgen1;~Bhavya_Kailkhura1;~Caiming_Xiong1;~Chaowei_Xiao2;~Chunyuan_Li1;~Eric_Xing1;~Furong_Huang1;~Hao_Liu1;~Heng_Ji3;~Hongyi_Wang1;~Huan_Zhang1;~Huaxiu_Yao1;~Manolis_Kellis1;~Marinka_Zitnik1;~Meng_Jiang3;~Mohit_Bansal2;~James_Zou1;~Jian_Pei1;~Jian_Liu12;~Jianfeng_Gao1;~Jiawei_Han1;~Jieyu_Zhao1;~Jiliang_Tang1;~Jindong_Wang1;~Joaquin_Vanschoren1;~John_Mitchell1;~Kai_Shu1;~Kaidi_Xu1;~Kai-Wei_Chang1;~Lifang_He1;~Lifu_Huang1;~Michael_Backes3;~Neil_Zhenqiang_Gong1;~Philip_S._Yu1;~Pin-Yu_Chen1;~Quanquan_Gu1;~Ran_Xu1;~Zhitao_Ying1;~Shuiwang_Ji1;~Suman_Jana1;~Tianlong_Chen1;~Tianming_Liu3;~Tianyi_Zhou1;~William_Yang_Wang2;~Xiang_Li14;~Xiangliang_Zhang1;~Xiao_Wang11;~Xing_Xie3;~Xun_Chen1;~Xuyu_Wang1;~Yan_Liu1;~Yanfang_Ye1;~Yinzhi_Cao1;~Yong_Chen9;~Yue_Zhao13", "aff": ";Lehigh University;Illinois Institute of Technology;University of Waterloo;Huazhong University of Science and Technology;University of Cambridge;;;College of William and Mary;College of William and Mary;Texas A&M University - College Station;Lehigh University;University of Georgia;;Meta Facebook;CISPA Helmholtz Center for Information Security;MLCommons;Lawrence Livermore National Laboratory;Salesforce Research;;Microsoft Research;School of Computer Science, Carnegie Mellon University;University of Maryland;;University of Illinois, Urbana-Champaign;Carnegie Mellon University;University of Illinois, Urbana Champaign;Department of Computer Science, University of North Carolina at Chapel Hill;Massachusetts Institute of Technology;Harvard University;University of Notre Dame;University of North Carolina at Chapel Hill;Stanford University;;University of Tennessee, Knoxville;Microsoft Research;University of Illinois at Urbana-Champaign (UIUC);University of Southern California;Michigan State University;;Eindhoven University of Technology;Stanford University;Emory University;Drexel University;Amazon;Lehigh University;Virginia Tech;;;University of Illinois Chicago;International Business Machines;University of California, Los Angeles;SalesForce.com;Yale University;Texas A&M University;, Columbia University;Harvard University;University of Georgia;University of Maryland, College Park;;Massachusetts General Hospital, Harvard University;University of Notre Dame;Northwestern University;Microsoft Research Asia;;Florida International University;University of Southern California;University of Notre Dame;Johns Hopkins University;;University of Southern California", "aff_domain": ";lehigh.edu;iit.edu;uwaterloo.ca;hust.edu.cn;cam.ac.uk;;;wm.edu;wm.edu;tamu.edu;lehigh.edu;uga.edu;;meta.com;cispa.de;mlcommons.org;llnl.gov;salesforce.com;;microsoft.com;cs.cmu.edu;cs.umd.edu;;uiuc.edu;andrew.cmu.edu;uiuc.edu;cs.unc.edu;mit.edu;harvard.edu;nd.edu;unc.edu;stanford.edu;;utk.edu;microsoft.com;illinois.edu;usc.edu;msu.edu;;tue.nl;stanford.edu;emory.edu;drexel.edu;amazon.com;lehigh.edu;vt.edu;;;uic.edu;ibm.com;cs.ucla.edu;salesforce.com;yale.edu;tamu.edu;cs.columbia.edu;harvard.edu;uga.edu;umd.edu;;mgh.harvard.edu;nd.edu;northwestern.edu;microsoft.com;;fiu.edu;usc.edu;nd.edu;jhu.edu;;usc.edu", "position": ";Assistant Professor;PhD student;Intern;Intern;MS student;;;PhD student;Assistant Professor;PhD student;PhD student;PhD student;;Researcher;Researcher;Evaluation lead;Research Staff;Research Scientist;;Principal Researcher;Full 
Professor;Assistant Professor;;Full Professor;Researcher;Assistant Professor;Assistant Professor;Full Professor;Associate Professor;Associate Professor;Full Professor;Assistant Professor;;Assistant Professor;Principal Researcher;Full Professor;Assistant Professor;Full Professor;;Associate Professor;Full Professor;Assistant Professor;Assistant Professor;Researcher;Assistant Professor;Assistant Professor;;;Full Professor;Principal Researcher;Associate Professor;senior manager;Assistant Professor;Professor;Associate Professor;Postdoc;Professor;Assistant Professor;;Assistant Professor;Associate Professor;Assistant Professor;Senior Principal Researcher;;Assistant Professor;Professor;Associate Professor;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024position,\ntitle={Position: Trust{LLM}: Trustworthiness in Large Language Models},\nauthor={Yue Huang and Lichao Sun and Haoran Wang and Siyuan Wu and Qihui Zhang and Yuan Li and Chujie Gao and Yixin Huang and Wenhan Lyu and Yixuan Zhang and Xiner Li and Hanchi Sun and Zhengliang Liu and Yixin Liu and Yijue Wang and Zhikun Zhang and Bertie Vidgen and Bhavya Kailkhura and Caiming Xiong and Chaowei Xiao and Chunyuan Li and Eric P. Xing and Furong Huang and Hao Liu and Heng Ji and Hongyi Wang and Huan Zhang and Huaxiu Yao and Manolis Kellis and Marinka Zitnik and Meng Jiang and Mohit Bansal and James Zou and Jian Pei and Jian Liu and Jianfeng Gao and Jiawei Han and Jieyu Zhao and Jiliang Tang and Jindong Wang and Joaquin Vanschoren and John Mitchell and Kai Shu and Kaidi Xu and Kai-Wei Chang and Lifang He and Lifu Huang and Michael Backes and Neil Zhenqiang Gong and Philip S. Yu and Pin-Yu Chen and Quanquan Gu and Ran Xu and Rex Ying and Shuiwang Ji and Suman Jana and Tianlong Chen and Tianming Liu and Tianyi Zhou and William Yang Wang and Xiang Li and Xiangliang Zhang and Xiao Wang and Xing Xie and Xun Chen and Xuyu Wang and Yan Liu and Yanfang Ye and Yinzhi Cao and Yong Chen and Yue Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bWUU0LwwMp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2560872, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 71, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16851673837614021666&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": ";lehigh.edu;iit.edu;uwaterloo.ca;hust.edu.cn;cam.ac.uk;;;wm.edu;wm.edu;tamu.edu;lehigh.edu;uga.edu;;meta.com;cispa.de;mlcommons.org;llnl.gov;salesforce.com;;microsoft.com;cs.cmu.edu;cs.umd.edu;;uiuc.edu;andrew.cmu.edu;uiuc.edu;cs.unc.edu;mit.edu;harvard.edu;nd.edu;unc.edu;stanford.edu;;utk.edu;microsoft.com;illinois.edu;usc.edu;msu.edu;;tue.nl;stanford.edu;emory.edu;drexel.edu;amazon.com;lehigh.edu;vt.edu;;;uic.edu;ibm.com;cs.ucla.edu;salesforce.com;yale.edu;tamu.edu;cs.columbia.edu;harvard.edu;uga.edu;umd.edu;;mgh.harvard.edu;nd.edu;northwestern.edu;microsoft.com;;fiu.edu;usc.edu;nd.edu;jhu.edu;;usc.edu", "author_num": 71, "aff_unique_index": "0;1;2;3;4;5;5;6;0;7;8;9;10;11;12;13;14;15;16;14;17;18;19;20;21;22;23;24;13;17;25;26;27;23;28;29;30;0;31;32;33;34;12;35;6;36;20;7;15;20;21;37;13;38;25;21;39;25", "aff_unique_norm": "Lehigh University;Illinois Institute of Technology;University of Waterloo;Huazhong University of Science and Technology;University of Cambridge;College of William and Mary;Texas A&M University;University of Georgia;Meta;CISPA Helmholtz Center for Information 
Security;MLCommons;Lawrence Livermore National Laboratory;Salesforce;Microsoft;Carnegie Mellon University;University of Maryland;University of Illinois;University of Illinois Urbana-Champaign;University of North Carolina at Chapel Hill;Massachusetts Institute of Technology;Harvard University;University of Notre Dame;University of North Carolina;Stanford University;University of Tennessee;University of Southern California;Michigan State University;Eindhoven University of Technology;Emory University;Drexel University;Amazon;Virginia Tech;University of Illinois at Chicago;International Business Machines Corporation;University of California, Los Angeles;Yale University;Columbia University;Northwestern University;Florida International University;Johns Hopkins University", "aff_unique_dep": ";;;;;;;;Meta Platforms, Inc.;;;;Salesforce Research;Microsoft Research;School of Computer Science;;;;Department of Computer Science;;;;;;;;;;;;Amazon.com, Inc.;;;;;;;;;", "aff_unique_url": "https://www.lehigh.edu;https://www.iit.edu;https://uwaterloo.ca;http://www.hust.edu.cn;https://www.cam.ac.uk;https://www.wm.edu;https://www.tamu.edu;https://www.uga.edu;https://meta.com;https://www.cispa.de/;https://mlcommons.org;https://www.llnl.gov;https://research.salesforce.com;https://www.microsoft.com/en-us/research;https://www.cmu.edu;https://www/umd.edu;https://illinois.edu;https://illinois.edu;https://www.unc.edu;https://web.mit.edu;https://www.harvard.edu;https://www.nd.edu;https://www.unc.edu;https://www.stanford.edu;https://www.utk.edu;https://www.usc.edu;https://www.msu.edu;https://www.tue.nl;https://www.emory.edu;https://www.drexel.edu;https://www.amazon.com;https://www.vt.edu;https://www.uic.edu;https://www.ibm.com;https://www.ucla.edu;https://www.yale.edu;https://www.columbia.edu;https://www.northwestern.edu;https://www.fiu.edu;https://www.jhu.edu", "aff_unique_abbr": "Lehigh;IIT;UW;HUST;Cambridge;WM;TAMU;UGA;Meta;CISPA;MLCommons;LLNL;Salesforce;MSR;CMU;UMD;UIUC;UIUC;UNC Chapel Hill;MIT;Harvard;Notre Dame;UNC;Stanford;UT;USC;MSU;TU/e;Emory;Drexel;Amazon;VT;UIC;IBM;UCLA;Yale;Columbia;NU;FIU;JHU", "aff_campus_unique_index": "1;2;3;4;4;5;5;6;7;4;8;6;9;8;10;11;8;8", "aff_campus_unique": ";Cambridge;College Station;Pittsburgh;Urbana-Champaign;Chapel Hill;Stanford;Knoxville;Los Angeles;Chicago;College Park;Asia", "aff_country_unique_index": "0;0;1;2;3;0;0;0;0;0;0;4;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;5;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;2;0;0;0;0;0", "aff_country_unique": "United States;Canada;China;United Kingdom;Germany;Netherlands" }, { "title": "Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33636", "id": "bWZKvF0g7G", "proceeding": "https://proceedings.mlr.press/v235/zong24a.html", "pdf": "https://openreview.net/pdf?id=bWZKvF0g7G", "openreview": "https://openreview.net/forum?id=bWZKvF0g7G", "author_site": "Yongshuo Zong, Ondrej Bohdal, Tingyang Yu, Yongxin Yang, Timothy Hospedales", "tldr": "", "abstract": "Current vision large language models (VLLMs) exhibit remarkable capabilities yet are prone to generate harmful content and are vulnerable to even the simplest jailbreaking attacks. Our initial analysis finds that this is due to the presence of harmful data during vision-language instruction fine-tuning, and that VLLM fine-tuning can cause forgetting of safety alignment previously learned by the underpinning LLM. 
To address this issue, we first curate a vision-language safe instruction-following dataset VLGuard covering various harmful categories. Our experiments demonstrate that integrating this dataset into standard vision-language fine-tuning or utilizing it for post-hoc fine-tuning effectively safety aligns VLLMs. This alignment is achieved with minimal impact on, or even enhancement of, the models' helpfulness. The versatility of our safety fine-tuning dataset makes it a valuable resource for safety-testing existing VLLMs, training new models or safeguarding pre-trained VLLMs. Empirical results demonstrate that fine-tuned VLLMs effectively reject unsafe instructions and substantially reduce the success rates of several black-box adversarial attacks, which approach zero in many cases. The code and dataset will be open-sourced.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongshuo Zong;Ondrej Bohdal;Tingyang Yu;Yongxin Yang;Timothy Hospedales", "authorids": "~Yongshuo_Zong1;~Ondrej_Bohdal1;~Tingyang_Yu1;~Yongxin_Yang1;~Timothy_Hospedales1", "gender": ";M;M;F;M", "homepage": "https://ys-zong.github.io/;https://ondrejbohdal.github.io/;http://homepages.inf.ed.ac.uk/thospeda/;https://yistyu.github.io/;", "dblp": ";267/5714.html;32/3545;;150/4258", "google_scholar": "38-dM-MAAAAJ;aKppg0QAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;1Cw8oZ4AAAAJ;https://scholar.google.co.uk/citations?user=F7PtrL8AAAAJ", "orcid": ";;0000-0003-4867-7486;;", "linkedin": ";;timothyhospedales/;;", "or_profile": "~Yongshuo_Zong1;~Ondrej_Bohdal1;~Timothy_Hospedales1;~Yist_Tingyang_YU1;~Yongxin_Yang3", "aff": "University of Edinburgh;University of Edinburgh;Samsung AI Research Centre;EPFL - EPF Lausanne;Queen Mary University of London", "aff_domain": "ed.ac.uk;ed.ac.uk;samsung.com;epfl.ch;qmul.ac.uk", "position": "PhD student;PhD student;Principal Researcher;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzong2024safety,\ntitle={Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models},\nauthor={Yongshuo Zong and Ondrej Bohdal and Tingyang Yu and Yongxin Yang and Timothy Hospedales},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bWZKvF0g7G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7537417, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14180532884136940455&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ed.ac.uk;ed.ac.uk;samsung.com;epfl.ch;qmul.ac.uk", "author_num": 5, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "University of Edinburgh;Samsung;EPFL;Queen Mary University of London", "aff_unique_dep": ";AI Research;;", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/researchers/samsung-ai-research-centre/;https://www.epfl.ch;https://www.qmul.ac.uk", "aff_unique_abbr": "Edinburgh;SARC;EPFL;QMUL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Lausanne;London", "aff_country_unique_index": "0;0;1;2;0", "aff_country_unique": "United Kingdom;South Korea;Switzerland" }, { "title": "Monitoring AI-Modified Content at Scale: A Case Study on the Impact of ChatGPT on AI Conference Peer Reviews", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33635", "id": "bX3J7ho18S", "proceeding": "https://proceedings.mlr.press/v235/liang24b.html", "pdf": 
"https://openreview.net/pdf?id=bX3J7ho18S", "openreview": "https://openreview.net/forum?id=bX3J7ho18S", "author_site": "Weixin Liang, Zachary Izzo, Yaohui Zhang, Haley Lepp, Hancheng Cao, Xuandong Zhao, Lingjiao Chen, Haotian Ye, Sheng Liu, Zhi Huang, Daniel McFarland, James Zou", "tldr": "", "abstract": "We present an approach for estimating the fraction of text in a large corpus which is likely to be substantially modified or produced by a large language model (LLM). Our maximum likelihood model leverages expert-written and AI-generated reference texts to accurately and efficiently examine real-world LLM-use at the corpus level. We apply this approach to a case study of scientific peer review in AI conferences that took place after the release of ChatGPT: *ICLR* 2024, *NeurIPS* 2023, *CoRL* 2023 and *EMNLP* 2023. Our results suggest that between 6.5% and 16.9% of text submitted as peer reviews to these conferences could have been substantially modified by LLMs, i.e. beyond spell-checking or minor writing updates. The circumstances in which generated text occurs offer insight into user behavior: the estimated fraction of LLM-generated text is higher in reviews which report lower confidence, were submitted close to the deadline, and from reviewers who are less likely to respond to author rebuttals. We also observe corpus-level trends in generated text which may be too subtle to detect at the individual level, and discuss the implications of such trends on peer review. We call for future interdisciplinary work to examine how LLM use is changing our information and knowledge practices.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weixin Liang;Zachary Izzo;Yaohui Zhang;Haley Lepp;Hancheng Cao;Xuandong Zhao;Lingjiao Chen;Haotian Ye;Sheng Liu;Zhi Huang;Daniel McFarland;James Y. 
Zou", "authorids": "~Weixin_Liang1;~Zachary_Izzo1;~Yaohui_Zhang2;~Haley_Lepp1;~Hancheng_Cao1;~Xuandong_Zhao1;~Lingjiao_Chen1;~Haotian_Ye1;~Sheng_Liu2;zhihuang@stanford.edu;~Daniel_McFarland1;~James_Y._Zou1", "gender": ";M;M;;;M;;M;;;;M", "homepage": "https://ai.stanford.edu/~wxliang/;https://zleizzo.github.io;https://github.com/Yaohui-Zhang;;http://hanchengcao.me/;https://xuandongzhao.github.io/;;https://haotianye.com;https://shengliu66.github.io/;;;", "dblp": "231/1803;259/2117;;;217/5558;244/8033;131/6638.html;284/0539;;;;72/8399", "google_scholar": "7z9P1jYAAAAJ;K9XheYUAAAAJ;;;w0UNVG0AAAAJ;CxeH4uoAAAAJ;;VU4chlsAAAAJ;rzhzR-cAAAAJ;;;", "orcid": ";;;;;;;;;;;", "linkedin": "weixin-liang-2562aa154/;;;;;xuandong-zhao-a3270610b/;;;;;;", "or_profile": "~Weixin_Liang1;~Zachary_Izzo1;~Yaohui_Zhang2;~Haley_Lepp1;~Hancheng_Cao1;~Xuandong_Zhao1;~Lingjiao_Chen1;~Haotian_Ye1;~Sheng_Liu2;zhihuang@stanford.edu;~Daniel_McFarland1;~James_Y._Zou1", "aff": "Stanford University;NEC Labs America;Stanford University;;Stanford University;UC Santa Barbara;Stanford University;Stanford University;Stanford University;;;", "aff_domain": "stanford.edu;nec-labs.com;stanford.edu;;stanford.edu;ucsb.edu;stanford.edu;stanford.edu;stanford.edu;;;", "position": "PhD student;Researcher;MS student;;PhD student;PhD student;PhD student;PhD student;Postdoc;;;", "bibtex": "@inproceedings{\nliang2024monitoring,\ntitle={Monitoring {AI}-Modified Content at Scale: A Case Study on the Impact of Chat{GPT} on {AI} Conference Peer Reviews},\nauthor={Weixin Liang and Zachary Izzo and Yaohui Zhang and Haley Lepp and Hancheng Cao and Xuandong Zhao and Lingjiao Chen and Haotian Ye and Sheng Liu and Zhi Huang and Daniel McFarland and James Y. Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bX3J7ho18S}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1176710, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11990396490620390287&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": "stanford.edu;nec-labs.com;stanford.edu;;stanford.edu;ucsb.edu;stanford.edu;stanford.edu;stanford.edu;;;", "author_num": 12, "aff_unique_index": "0;1;0;0;2;0;0;0", "aff_unique_norm": "Stanford University;NEC Labs America;University of California, Santa Barbara", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.nec-labs.com;https://www.ucsb.edu", "aff_unique_abbr": "Stanford;NEC LA;UCSB", "aff_campus_unique_index": "0;0;0;2;0;0;0", "aff_campus_unique": "Stanford;;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Timer: Generative Pre-trained Transformers Are Large Time Series Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33634", "id": "bYRYb7DMNo", "proceeding": "https://proceedings.mlr.press/v235/liu24cb.html", "pdf": "https://openreview.net/pdf?id=bYRYb7DMNo", "openreview": "https://openreview.net/forum?id=bYRYb7DMNo", "author_site": "Yong Liu, Haoran Zhang, Chenyu Li, Xiangdong Huang, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Deep learning has contributed remarkably to the advancement of time series analysis. Still, deep models can encounter performance bottlenecks in real-world data-scarce scenarios, which can be concealed due to the performance saturation with small models on current benchmarks. 
Meanwhile, large models have demonstrated great power in these scenarios through large-scale pre-training. Continuous progress has been achieved with the emergence of large language models, exhibiting unprecedented abilities such as few-shot generalization, scalability, and task generality, which are, however, absent in small deep models. To change the status quo of training scenario-specific small models from scratch, this paper aims at the early development of large time series models (LTSM). During pre-training, we curate large-scale datasets with up to 1 billion time points, unify heterogeneous time series into a single-series sequence (S3) format, and develop a GPT-style architecture toward LTSMs. To meet diverse application needs, we convert forecasting, imputation, and anomaly detection of time series into a unified generative task. The outcome of this study is a Time Series Transformer (Timer), which is generatively pre-trained via next-token prediction and adapted to various downstream tasks with promising capabilities as an LTSM. Code and datasets are available at: https://github.com/thuml/Large-Time-Series-Model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yong Liu;Haoran Zhang;Chenyu Li;Xiangdong Huang;Jianmin Wang;Mingsheng Long", "authorids": "~Yong_Liu15;~Haoran_Zhang9;~Chenyu_Li2;~Xiangdong_Huang1;~Jianmin_Wang1;~Mingsheng_Long5", "gender": ";M;M;M;M;", "homepage": ";https://www.thss.tsinghua.edu.cn/;https://lichenyu20.github.io/;;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;", "dblp": ";;51/2854-3.html;;06/3456-1.html;", "google_scholar": ";;Ul5hyswAAAAJ;2u7MRD8AAAAJ;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;", "orcid": ";0009-0004-3245-459X;0009-0002-1179-788X;;0000-0001-6841-7943;", "linkedin": ";;;;;", "or_profile": "~Yong_Liu15;~Haoran_Zhang9;~Chenyu_Li2;~Xiangdong_Huang1;~Jianmin_Wang1;~Mingsheng_Long5", "aff": ";Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;", "aff_domain": ";mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;", "position": ";Undergrad student;Undergrad student;Associate Professor;Full Professor;", "bibtex": "@inproceedings{\nliu2024timer,\ntitle={Timer: Generative Pre-trained Transformers Are Large Time Series Models},\nauthor={Yong Liu and Haoran Zhang and Chenyu Li and Xiangdong Huang and Jianmin Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bYRYb7DMNo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5830408, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9730826464197192066&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Split-and-Denoise: Protect large language model inference with local differential privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33633", "id": "bZ4fzw1iz7", "proceeding": "https://proceedings.mlr.press/v235/mai24a.html", "pdf": 
"https://openreview.net/pdf?id=bZ4fzw1iz7", "openreview": "https://openreview.net/forum?id=bZ4fzw1iz7", "author_site": "Peihua Mai, Ran Yan, Zhe Huang, Youjia Yang, Yan (James) Pang", "tldr": "", "abstract": "Large Language Models (LLMs) excel in natural language understanding by capturing hidden semantics in vector space. This process enriches the value of text embeddings for various downstream tasks, thereby fostering the Embedding-as-a-Service (EaaS) business model. However, the risk of privacy leakage due to direct text transmission to servers remains a critical concern. To address this, we introduce Split-N-Denoise (SnD), an private inference framework that splits the model to execute the token embedding layer on the client side at minimal computational cost. This allows the client to introduce noise prior to transmitting the embeddings to the server, and subsequently receive and denoise the perturbed output embeddings for downstream tasks. Our approach is designed for the inference stage of LLMs and requires no modifications to the model parameters. Extensive experiments demonstrate SnD\u2019s effectiveness in optimizing the privacy-utility tradeoff across various LLM architectures and diverse downstream tasks. The results reveal an improvement in performance under the same privacy budget compared to the baselines by over 10% on average, offering clients a privacy-preserving solution for local privacy protection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peihua Mai;Ran Yan;Zhe Huang;Youjia Yang;Yan Pang", "authorids": "~Peihua_Mai1;~Ran_Yan3;~Zhe_Huang6;~Youjia_Yang1;~Yan_Pang1", "gender": "F;F;M;;M", "homepage": ";;;;https://discovery.nus.edu.sg/10509-yan-james-pang", "dblp": "331/2673;;;;", "google_scholar": "ZtDss4cAAAAJ;;;;", "orcid": "0000-0002-5851-2290;0009-0009-3405-5361;0009-0002-4420-6601;;", "linkedin": ";;;youjiayang35;", "or_profile": "~Peihua_Mai1;~Ran_Yan3;~Zhe_Huang6;~Youjia_Yang1;~Yan_Pang1", "aff": "National University of Singapore;;North China Electric Power University;University of Southern California;National University of Singapore", "aff_domain": "nus.edu;;ncepu.edu;usc.edu;nus.edu.sg", "position": "PhD student;;MS student;MS student;Full Professor", "bibtex": "@inproceedings{\nmai2024splitanddenoise,\ntitle={Split-and-Denoise: Protect large language model inference with local differential privacy},\nauthor={Peihua Mai and Ran Yan and Zhe Huang and Youjia Yang and Yan Pang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bZ4fzw1iz7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1436101, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12645019224439374827&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 7, "email": "nus.edu;;ncepu.edu;usc.edu;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "National University of Singapore;North China Electric Power University;University of Southern California", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;http://www.ncepu.edu.cn;https://www.usc.edu", "aff_unique_abbr": "NUS;NCEPU;USC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Singapore;China;United States" }, { "title": "On the Recoverability of Causal Relations from Temporally Aggregated I.I.D. 
Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33632", "id": "bZNH0SU37Y", "proceeding": "https://proceedings.mlr.press/v235/fan24a.html", "pdf": "https://openreview.net/pdf?id=bZNH0SU37Y", "openreview": "https://openreview.net/forum?id=bZNH0SU37Y", "author_site": "Shunxing Fan, Mingming Gong, Kun Zhang", "tldr": "", "abstract": "We consider the effect of temporal aggregation on instantaneous (non-temporal) causal discovery in general setting. This is motivated by the observation that the true causal time lag is often considerably shorter than the observational interval. This discrepancy leads to high aggregation, causing time-delay causality to vanish and instantaneous dependence to manifest. Although we expect such instantaneous dependence has consistency with the true causal relation in certain sense to make the discovery results meaningful, it remains unclear what type of consistency we need and when will such consistency be satisfied. We proposed functional consistency and conditional independence consistency in formal way correspond functional causal model-based methods and conditional independence-based methods respectively and provide the conditions under which these consistencies will hold. We show theoretically and experimentally that causal discovery results may be seriously distorted by aggregation especially in complete nonlinear case and we also find causal relationship still recoverable from aggregated data if we have partial linearity or appropriate prior. Our findings suggest community should take a cautious and meticulous approach when interpreting causal discovery results from such data and show why and when aggregation will distort the performance of causal discovery methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shunxing Fan;Mingming Gong;Kun Zhang", "authorids": "~Shunxing_Fan1;~Mingming_Gong1;~Kun_Zhang1", "gender": "M;M;M", "homepage": "https://shunxing-fan.github.io/;https://mingming-gong.github.io/;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "344/5064;98/8479;96/3115-1", "google_scholar": "SbGcOdYAAAAJ;https://scholar.google.com.au/citations?user=6BmiCJIAAAAJ;RGoypN4AAAAJ", "orcid": "0000-0003-2106-8074;0000-0001-7147-5589;", "linkedin": ";;", "or_profile": "~Shunxing_Fan1;~Mingming_Gong1;~Kun_Zhang1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;University of Melbourne;Carnegie Mellon University", "aff_domain": "mbzuai.ac.ae;unimelb.edu.au;cmu.edu", "position": "Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nfan2024on,\ntitle={On the Recoverability of Causal Relations from Temporally Aggregated I.I.D. 
Data},\nauthor={Shunxing Fan and Mingming Gong and Kun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bZNH0SU37Y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 909326, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2617085200053013339&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "mbzuai.ac.ae;unimelb.edu.au;cmu.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;University of Melbourne;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.unimelb.edu.au;https://www.cmu.edu", "aff_unique_abbr": "MBZUAI;UniMelb;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United Arab Emirates;Australia;United States" }, { "title": "State-Constrained Zero-Sum Differential Games with One-Sided Information", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33631", "id": "bcN7KSB2YS", "proceeding": "https://proceedings.mlr.press/v235/ghimire24a.html", "pdf": "https://openreview.net/pdf?id=bcN7KSB2YS", "openreview": "https://openreview.net/forum?id=bcN7KSB2YS", "author_site": "Mukesh Ghimire, Lei Zhang, Zhe Xu, Yi Ren", "tldr": "", "abstract": "We study zero-sum differential games with state constraints and one-sided information, where the informed player (Player 1) has a categorical payoff type unknown to the uninformed player (Player 2). The goal of Player 1 is to minimize his payoff without violating the constraints, while that of Player 2 is to violate the state constraints if possible, or to maximize the payoff otherwise. One example of the game is a man-to-man matchup in football. Without state constraints, Cardaliaguet (2007) showed that the value of such a game exists and is convex to the common belief of players. Our theoretical contribution is an extension of this result to games with state constraints and the derivation of the primal and dual subdynamic principles necessary for computing behavioral strategies. Different from existing works that are concerned about the scalability of no-regret learning in games with discrete dynamics, our study reveals the underlying structure of strategies for belief manipulation resulting from information asymmetry and state constraints. This structure will be necessary for scalable learning on games with continuous actions and long time windows. 
We use a simplified football game to demonstrate the utility of this work, where we reveal player positions and belief states in which the attacker should (or should not) play specific random deceptive moves to take advantage of information asymmetry, and compute how the defender should respond.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mukesh Ghimire;Lei Zhang;Zhe Xu;Yi Ren", "authorids": "~Mukesh_Ghimire1;~Lei_Zhang40;~Zhe_Xu7;~Yi_Ren3", "gender": "M;M;;M", "homepage": "https://mukeshghimire.com.np;;https://sites.google.com/site/zhexudavid00710;http://designinformaticslab.github.io/", "dblp": "309/8402;;;", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;j8ilzcsAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0005-5660-5054;;;", "linkedin": ";;;", "or_profile": "~Mukesh_Ghimire1;~Lei_Zhang40;~Zhe_Xu7;~Yi_Ren3", "aff": "Arizona State University;Arizona State University;Arizona State University;Arizona State University", "aff_domain": "asu.edu;asu.edu;asu.edu;asu.edu", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nghimire2024stateconstrained,\ntitle={State-Constrained Zero-Sum Differential Games with One-Sided Information},\nauthor={Mukesh Ghimire and Lei Zhang and Zhe Xu and Yi Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bcN7KSB2YS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 809841, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9524053575913615015&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "asu.edu;asu.edu;asu.edu;asu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "", "aff_unique_url": "https://www.asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Riemannian coordinate descent algorithms on matrix manifolds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33630", "id": "bdKaQmrM81", "proceeding": "https://proceedings.mlr.press/v235/han24c.html", "pdf": "https://openreview.net/pdf?id=bdKaQmrM81", "openreview": "https://openreview.net/forum?id=bdKaQmrM81", "author_site": "Andi Han, Pratik Kumar Jawanpuria, Bamdev Mishra", "tldr": "", "abstract": "Many machine learning applications are naturally formulated as optimization problems on Riemannian manifolds. The main idea behind Riemannian optimization is to maintain the feasibility of the variables while moving along a descent direction on the manifold. This results in updating all the variables at every iteration. In this work, we provide a general framework for developing computationally efficient coordinate descent (CD) algorithms on matrix manifolds that allows updating only a few variables at every iteration while adhering to the manifold constraint. In particular, we propose CD algorithms for various manifolds such as Stiefel, Grassmann, (generalized) hyperbolic, symplectic, and symmetric positive (semi)definite. While the cost per iteration of the proposed CD algorithms is low, we further develop a more efficient variant via a first-order approximation of the objective function. 
We analyze their convergence and complexity, and empirically illustrate their efficacy in several applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andi Han;Pratik Jawanpuria;Bamdev Mishra", "authorids": "~Andi_Han1;~Pratik_Jawanpuria1;~Bamdev_Mishra1", "gender": "M;M;", "homepage": "https://github.com/andyjm3;https://pratikjawanpuria.com;https://bamdevmishra.in", "dblp": "268/7976.html;32/9841;133/8291", "google_scholar": "AKHQHs0AAAAJ;_GUZDtMAAAAJ;https://scholar.google.co.in/citations?user=25IuNrMAAAAJ", "orcid": "0000-0003-4655-655X;;", "linkedin": ";;", "or_profile": "~Andi_Han1;~Pratik_Jawanpuria1;~Bamdev_Mishra1", "aff": "RIKEN AIP;Microsoft;Microsoft", "aff_domain": "riken.jp;microsoft.com;microsoft.com", "position": "Postdoc;Principal Researcher;Applied Scientist", "bibtex": "@inproceedings{\nhan2024riemannian,\ntitle={Riemannian coordinate descent algorithms on matrix manifolds},\nauthor={Andi Han and Pratik Jawanpuria and Bamdev Mishra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bdKaQmrM81}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1244326, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6071201850069994000&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "riken.jp;microsoft.com;microsoft.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "RIKEN;Microsoft", "aff_unique_dep": "Advanced Institute for Computational Science;Microsoft Corporation", "aff_unique_url": "https://www.aip.riken.jp;https://www.microsoft.com", "aff_unique_abbr": "RIKEN AIP;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Japan;United States" }, { "title": "High-Dimensional Bayesian Optimization via Semi-Supervised Learning with Optimized Unlabeled Data Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33629", "id": "beXQVQorse", "proceeding": "https://proceedings.mlr.press/v235/yin24d.html", "pdf": "https://openreview.net/pdf?id=beXQVQorse", "openreview": "https://openreview.net/forum?id=beXQVQorse", "author_site": "Yuxuan Yin, Yu Wang, Peng Li", "tldr": "", "abstract": "We introduce a novel semi-supervised learning approach, named Teacher-Student Bayesian Optimization ($\\texttt{TSBO}$), integrating the teacher-student paradigm into BO to minimize expensive labeled data queries for the first time. $\\texttt{TSBO}$ incorporates a teacher model, an unlabeled data sampler, and a student model. The student is trained on unlabeled data locations generated by the sampler, with pseudo labels predicted by the teacher. The interplay between these three components implements a unique *selective regularization* to the teacher in the form of student feedback. This scheme enables the teacher to predict high-quality pseudo labels, enhancing the generalization of the GP surrogate model in the search space. To fully exploit $\\texttt{TSBO}$, we propose two optimized unlabeled data samplers to construct effective student feedback that well aligns with the objective of Bayesian optimization. Furthermore, we quantify and leverage the uncertainty of the teacher-student model for the provision of reliable feedback to the teacher in the presence of risky pseudo-label predictions. 
$\\texttt{TSBO}$ demonstrates significantly improved sample-efficiency in several global optimization tasks under tight labeled data budgets. The implementation is available at https://github.com/reminiscenty/TSBO-Official.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxuan Yin;Yu Wang;Peng Li", "authorids": "~Yuxuan_Yin1;~Yu_Wang29;~Peng_Li8", "gender": "M;M;M", "homepage": ";;https://www.ece.ucsb.edu/~lip/", "dblp": "287/5093;;83/6353-1.html", "google_scholar": "g6SyvToAAAAJ;https://scholar.google.com/citations?authuser=1;QYQUS7gAAAAJ", "orcid": ";;0000-0003-3548-4589", "linkedin": ";yu-wang-b526a4220/;peng-li-ucsb/", "or_profile": "~Yuxuan_Yin1;~Yu_Wang29;~Peng_Li8", "aff": "University of California, Santa Barbara;UC Santa Barbara;UC Santa Barbara", "aff_domain": "ucsb.edu;ucsb.edu;ucsb.edu", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nyin2024highdimensional,\ntitle={High-Dimensional Bayesian Optimization via Semi-Supervised Learning with Optimized Unlabeled Data Sampling},\nauthor={Yuxuan Yin and Yu Wang and Peng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=beXQVQorse}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1838319, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17661275767978311463&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "ucsb.edu;ucsb.edu;ucsb.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Submodular framework for structured-sparse optimal transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33628", "id": "bfQCO9Vqhk", "proceeding": "https://proceedings.mlr.press/v235/manupriya24a.html", "pdf": "https://openreview.net/pdf?id=bfQCO9Vqhk", "openreview": "https://openreview.net/forum?id=bfQCO9Vqhk", "author_site": "Piyushi Manupriya, Pratik Kumar Jawanpuria, Karthik Gurumoorthy, Sakethanath Jagarlapudi, Bamdev Mishra", "tldr": "", "abstract": "Unbalanced optimal transport (UOT) has recently gained much attention due to its flexible framework for handling un-normalized measures and its robustness properties. In this work, we explore learning (structured) sparse transport plans in the UOT setting, i.e., transport plans have an upper bound on the number of non-sparse entries in each column (structured sparse pattern) or in the whole plan (general sparse pattern). We propose novel sparsity-constrained UOT formulations building on the recently explored maximum mean discrepancy based UOT. We show that the proposed optimization problem is equivalent to the maximization of a weakly submodular function over a uniform matroid or a partition matroid. We develop efficient gradient-based discrete greedy algorithms and provide the corresponding theoretical guarantees. 
Empirically, we observe that our proposed greedy algorithms select a diverse support set and we illustrate the efficacy of the proposed approach in various applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Piyushi Manupriya;Pratik Jawanpuria;Karthik S. Gurumoorthy;SakethaNath Jagarlapudi;Bamdev Mishra", "authorids": "~Piyushi_Manupriya1;~Pratik_Jawanpuria1;~Karthik_S._Gurumoorthy2;~SakethaNath_Jagarlapudi1;~Bamdev_Mishra1", "gender": "F;M;M;;M", "homepage": ";https://pratikjawanpuria.com;;https://bamdevmishra.in;https://people.iith.ac.in/saketha/research.html", "dblp": ";32/9841;48/1893.html;133/8291;45/3130", "google_scholar": "-9baEZEAAAAJ;_GUZDtMAAAAJ;6BysJLEAAAAJ;https://scholar.google.co.in/citations?user=25IuNrMAAAAJ;https://scholar.google.com.tw/citations?user=k70LrvsAAAAJ", "orcid": "0009-0000-7563-193X;;;;", "linkedin": ";;;;", "or_profile": "~Piyushi_Manupriya1;~Pratik_Jawanpuria1;~Karthik_S._Gurumoorthy2;~Bamdev_Mishra1;~J._Nath1", "aff": "Indian Institute of Technology Hyderabad;Microsoft;Walmart Global Tech;Microsoft;Indian Institute of Technology Hyderabad", "aff_domain": "iith.ac.in;microsoft.com;walmart.com;microsoft.com;iith.ac.in", "position": "PhD student;Principal Researcher;Principal Researcher;Applied Scientist;Associate Professor", "bibtex": "@inproceedings{\nmanupriya2024submodular,\ntitle={Submodular framework for structured-sparse optimal transport},\nauthor={Piyushi Manupriya and Pratik Jawanpuria and Karthik S. Gurumoorthy and SakethaNath Jagarlapudi and Bamdev Mishra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bfQCO9Vqhk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1371570, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15002694887002653809&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "iith.ac.in;microsoft.com;walmart.com;microsoft.com;iith.ac.in", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Indian Institute of Technology Hyderabad;Microsoft;Walmart Global Tech", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.iith.ac.in;https://www.microsoft.com;https://www.walmart.com/careers/globaltech", "aff_unique_abbr": "IIT Hyderabad;Microsoft;Walmart GT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hyderabad;", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "India;United States" }, { "title": "Unbiased Multi-Label Learning from Crowdsourced Annotations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33627", "id": "bgP8Rxv2eB", "proceeding": "https://proceedings.mlr.press/v235/xia24a.html", "pdf": "https://openreview.net/pdf?id=bgP8Rxv2eB", "openreview": "https://openreview.net/forum?id=bgP8Rxv2eB", "author_site": "Mingxuan Xia, Zenan Huang, Runze Wu, Gengyu Lyu, Junbo Zhao, Gang Chen, Haobo Wang", "tldr": "", "abstract": "This work studies the novel Crowdsourced Multi-Label Learning (CMLL) problem, where each instance is related to multiple true labels but the model only receives unreliable labels from different annotators. Although a few Crowdsourced Multi-Label Inference (CMLI) methods have been developed, they require both the training and testing sets to be assigned crowdsourced labels and focus on true label inferring rather than prediction, making them less practical. 
In this paper, by excavating the generation process of crowdsourced labels, we establish the first **unbiased risk estimator** for CMLL based on the crowdsourced transition matrices. To facilitate transition matrix estimation, we upgrade our unbiased risk estimator by aggregating crowdsourced labels and transition matrices from all annotators while guaranteeing its theoretical characteristics. Integrating with the unbiased risk estimator, we further propose a decoupled autoencoder framework to exploit label correlations and boost performance. We also provide a generalization error bound to ensure the convergence of the empirical risk estimator. Experiments on various CMLL scenarios demonstrate the effectiveness of our proposed method. The source code is available at https://github.com/MingxuanXia/CLEAR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingxuan Xia;Zenan Huang;Runze Wu;Gengyu Lyu;Junbo Zhao;Gang Chen;Haobo Wang", "authorids": "~Mingxuan_Xia1;~Zenan_Huang1;~Runze_Wu1;~Gengyu_Lyu3;~Junbo_Zhao1;~Gang_Chen6;~Haobo_Wang1", "gender": "M;M;M;M;M;M;M", "homepage": "https://mingxuanxia.github.io/;https://lccurious.github.io;https://wu-runze.github.io/;http://jakezhao.net/;;https://hbzju.github.io/;https://gengyulyu.github.io/homepage/", "dblp": "325/0852;;;191/6665;67/6383-1;;218/6818", "google_scholar": "q5XeJeAAAAAJ;EVfmFW8AAAAJ;8Uxbo9AAAAAJ;8ipao8MAAAAJ;;DnN-rggAAAAJ;wZYLfyIAAAAJ", "orcid": ";0000-0003-3950-2692;0000-0002-6986-5825;;0000-0002-7483-0045;0000-0001-8586-3048;", "linkedin": ";;;;;;", "or_profile": "~Mingxuan_Xia1;~Zenan_Huang1;~Runze_Wu1;~Junbo_Zhao1;~Gang_Chen6;~Haobo_Wang1;~Gengyu_Lyu1", "aff": "Zhejiang University;Zhejiang University;NetEase Corp;Zhejiang University;College of Computer Science and Technology, Zhejiang University;Zhejiang University;Beijing University of Technology", "aff_domain": "zju.edu.cn;zju.edu.cn;netease.com;zju.edu.cn;cs.zju.edu.cn;zju.edu.cn;bjut.edu.cn", "position": "PhD student;Researcher;Principal Researcher;Assistant Professor;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nxia2024unbiased,\ntitle={Unbiased Multi-Label Learning from Crowdsourced Annotations},\nauthor={Mingxuan Xia and Zenan Huang and Runze Wu and Gengyu Lyu and Junbo Zhao and Gang Chen and Haobo Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bgP8Rxv2eB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 995688, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=193062481983013092&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 4, "email": "zju.edu.cn;zju.edu.cn;netease.com;zju.edu.cn;cs.zju.edu.cn;zju.edu.cn;bjut.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;0;0;0;2", "aff_unique_norm": "Zhejiang University;NetEase Corporation;Beijing University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.163.com;http://www.bjut.edu.cn", "aff_unique_abbr": "ZJU;NetEase;BJUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Fundamental Limits of Distributed Covariance Matrix Estimation Under Communication Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33626", "id": "biE1uHyG0l", "proceeding": 
"https://proceedings.mlr.press/v235/rahmani24a.html", "pdf": "https://openreview.net/pdf?id=biE1uHyG0l", "openreview": "https://openreview.net/forum?id=biE1uHyG0l", "author_site": "Mohammad Reza Rahmani, Mohammad Hossein Yassaee, Mohammad Ali Maddah Ali, Mohammad Reza Aref", "tldr": "", "abstract": "Estimating high-dimensional covariance matrices is crucial in various domains. This work considers a scenario where two collaborating agents access disjoint dimensions of $m$ samples from a high--dimensional random vector, and they can only communicate a limited number of bits to a central server, which wants to accurately approximate the covariance matrix. We analyze the fundamental trade--off between communication cost, number of samples, and estimation accuracy. We prove a lower bound on the error achievable by any estimator, highlighting the impact of dimensions, number of samples, and communication budget. Furthermore, we present an algorithm that achieves this lower bound up to a logarithmic factor, demonstrating its near-optimality in practical settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammad Reza Rahmani;Mohammad Hossein Yassaee;Mohammad Ali Maddah-Ali;Mohammad Reza Aref", "authorids": "~Mohammad_Reza_Rahmani1;~Mohammad_Hossein_Yassaee1;~Mohammad_Ali_Maddah-Ali2;aref@sharif.edu", "gender": ";;M;", "homepage": ";;https://maddah.umn.edu/;", "dblp": ";;;", "google_scholar": ";;CFIJZwoAAAAJ;", "orcid": ";0000-0001-9353-3073;;", "linkedin": ";;;", "or_profile": "~Mohammad_Reza_Rahmani1;~Mohammad_Hossein_Yassaee1;~Mohammad_Ali_Maddah-Ali2;aref@sharif.edu", "aff": ";Sharif University of Technology, Sharif University of Technology;University of Minnesota - Twin Cities;", "aff_domain": ";ee.sharif.edu;umn.edu;", "position": ";Assistant Professor;Associate Professor;", "bibtex": "@inproceedings{\nrahmani2024fundamental,\ntitle={Fundamental Limits of Distributed Covariance Matrix Estimation Under Communication Constraints},\nauthor={Mohammad Reza Rahmani and Mohammad Hossein Yassaee and Mohammad Ali Maddah-Ali and Mohammad Reza Aref},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=biE1uHyG0l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 598231, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:i5xdOy_MJ_IJ:scholar.google.com/&scioq=Fundamental+Limits+of+Distributed+Covariance+Matrix+Estimation+Under+Communication+Constraints&hl=en&as_sdt=0,24", "gs_version_total": 5, "email": ";ee.sharif.edu;umn.edu;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Sharif University of Technology;University of Minnesota", "aff_unique_dep": ";", "aff_unique_url": "https://www.sharif.edu;https://www.minnesota.edu", "aff_unique_abbr": "SUT;UMN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Twin Cities", "aff_country_unique_index": "0;1", "aff_country_unique": "Iran;United States" }, { "title": "Learning Decision Trees and Forests with Algorithmic Recourse", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33625", "id": "blGpu9aGs6", "proceeding": "https://proceedings.mlr.press/v235/kanamori24a.html", "pdf": "https://openreview.net/pdf?id=blGpu9aGs6", "openreview": "https://openreview.net/forum?id=blGpu9aGs6", "author_site": "Kentaro Kanamori, Takuya Takagi, Ken Kobayashi, Yuichi Ike", "tldr": "", "abstract": "This 
paper proposes a new algorithm for learning accurate tree-based models while ensuring the existence of recourse actions. Algorithmic Recourse (AR) aims to provide a recourse action for altering the undesired prediction result given by a model. Typical AR methods provide a reasonable action by solving an optimization task of minimizing the required effort among executable actions. In practice, however, such actions do not always exist for models optimized only for predictive performance. To alleviate this issue, we formulate the task of learning an accurate classification tree under the constraint of ensuring the existence of reasonable actions for as many instances as possible. Then, we propose an efficient top-down greedy algorithm that leverages adversarial training techniques. We also show that our proposed algorithm can be applied to random forests, a popular framework for learning tree ensembles. Experimental results demonstrate that our method successfully provides reasonable actions for more instances than the baselines, without significantly degrading accuracy or computational efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kentaro Kanamori;Takuya Takagi;Ken Kobayashi;Yuichi Ike", "authorids": "~Kentaro_Kanamori1;~Takuya_Takagi1;~Ken_Kobayashi1;~Yuichi_Ike1", "gender": "M;M;;M", "homepage": "https://sites.google.com/view/kentarokanamori;https://sites.google.com/site/takuyatakagipersonalpage/home;https://kenkoba2119.github.io/;https://sites.google.com/view/yuichi-ike", "dblp": "242/8425.html;161/9026;73/3956.html;https://dblp.uni-trier.de/pid/230/3805", "google_scholar": "GRSSt5AAAAAJ;9fY1WVIAAAAJ;https://scholar.google.co.jp/citations?user=fyMWmOMAAAAJ;https://scholar.google.com/citations?hl=ja", "orcid": ";0000-0002-5788-130X;;0000-0002-8907-8319", "linkedin": ";;;yuichi-ike-a74305169/", "or_profile": "~Kentaro_Kanamori1;~Takuya_Takagi1;~Ken_Kobayashi1;~Yuichi_Ike1", "aff": "Fujitsu Limited;Fujitsu Ltd.;Tokyo Institute of Technology;Institute of Mathematics for Industry, Kyushu University", "aff_domain": "fujitsu.com;fujitsu.com;m.titech.ac.jp;kyushu-u.ac.jp", "position": "Researcher;Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nkanamori2024learning,\ntitle={Learning Decision Trees and Forests with Algorithmic Recourse},\nauthor={Kentaro Kanamori and Takuya Takagi and Ken Kobayashi and Yuichi Ike},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=blGpu9aGs6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1597384, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7549226448044070247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "fujitsu.com;fujitsu.com;m.titech.ac.jp;kyushu-u.ac.jp", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Fujitsu Limited;Tokyo Institute of Technology;Kyushu University", "aff_unique_dep": ";;Institute of Mathematics for Industry", "aff_unique_url": "https://www.fujitsu.com;https://www.titech.ac.jp;https://www.kyushu-u.ac.jp", "aff_unique_abbr": "Fujitsu;Titech;Kyushu U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Nonlinear Filtering with Brenier Optimal Transport Maps", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33624", "id": "blzDxD6bKt", "proceeding": "https://proceedings.mlr.press/v235/al-jarrah24a.html", "pdf": "https://openreview.net/pdf?id=blzDxD6bKt", "openreview": "https://openreview.net/forum?id=blzDxD6bKt", "author_site": "Mohammad Al-Jarrah, Niyizhen Jin, Bamdad Hosseini, Amirhossein Taghvaei", "tldr": "", "abstract": "This paper is concerned with the problem of nonlinear filtering, i.e., computing the conditional distribution of the state of a stochastic dynamical system given a history of noisy partial observations. Conventional sequential importance resampling (SIR) particle filters suffer from fundamental limitations, in scenarios involving degenerate likelihoods or high-dimensional states, due to the weight degeneracy issue. In this paper, we explore an alternative method, which is based on estimating the Brenier optimal transport (OT) map from the current prior distribution of the state to the posterior distribution at the next time step. Unlike SIR particle filters, the OT formulation does not require the analytical form of the likelihood. Moreover, it allows us to harness the approximation power of neural networks to model complex and multi-modal distributions and employ stochastic optimization algorithms to enhance scalability. Extensive numerical experiments are presented that compare the OT method to the SIR particle filter and the ensemble Kalman filter, evaluating the performance in terms of sample efficiency, high-dimensional scalability, and the ability to capture complex and multi-modal distributions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammad Al-Jarrah;Niyizhen Jin;Bamdad Hosseini;Amirhossein Taghvaei", "authorids": "~Mohammad_Al-Jarrah1;njin2@uw.edu;~Bamdad_Hosseini1;~Amirhossein_Taghvaei1", "gender": "M;;M;M", "homepage": ";;https://www.bamdadhosseini.org;https://amirtag.github.io/", "dblp": ";;;158/4926", "google_scholar": "dflm_C4AAAAJ;;;l96zhjwAAAAJ", "orcid": "0009-0006-0433-9230;;;", "linkedin": ";;;", "or_profile": "~Mohammad_Al-Jarrah1;njin2@uw.edu;~Bamdad_Hosseini1;~Amirhossein_Taghvaei1", "aff": "University of Washington;;University of Washington;University of Washington, Seattle", "aff_domain": "uw.edu;;u.washington.edu;uw.edu", "position": "PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nal-jarrah2024nonlinear,\ntitle={Nonlinear Filtering with Brenier Optimal Transport Maps},\nauthor={Mohammad Al-Jarrah and Niyizhen Jin and Bamdad Hosseini and Amirhossein Taghvaei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=blzDxD6bKt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7817983, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7075317302047632487&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uw.edu;;u.washington.edu;uw.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Sequential Kernel Goodness-of-fit Testing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33623", "id": "bmeUeCUMHA", "proceeding": 
"https://proceedings.mlr.press/v235/zhou24m.html", "pdf": "https://openreview.net/pdf?id=bmeUeCUMHA", "openreview": "https://openreview.net/forum?id=bmeUeCUMHA", "author_site": "Zhengyu Zhou, Weiwei Liu", "tldr": "", "abstract": "Goodness-of-fit testing, a classical statistical tool, has been extensively explored in the batch setting, where the sample size is predetermined. However, practitioners often prefer methods that adapt to the complexity of a problem rather than fixing the sample size beforehand. Classical batch tests are generally unsuitable for streaming data, as valid inference after data peeking requires multiple testing corrections, resulting in reduced statistical power. To address this issue, we delve into the design of consistent sequential goodness-of-fit tests. Following the principle of *testing by betting*, we reframe this task as selecting a sequence of payoff functions that maximize the wealth of a fictitious bettor, betting against the null in a repeated game. We conduct experiments to demonstrate the adaptability of our sequential test across varying difficulty levels of problems while maintaining control over type-I errors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengyu Zhou;Weiwei Liu", "authorids": "~Zhengyu_Zhou1;~Weiwei_Liu1", "gender": ";M", "homepage": ";https://sites.google.com/site/weiweiliuhomepage/", "dblp": ";54/6677-3.html", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN", "orcid": ";", "linkedin": "zhengyu-zhou-650534212/;weiwei-liu-4a7849134/", "or_profile": "~Zhengyu_Zhou1;~Weiwei_Liu1", "aff": ", Wuhan University;Wuhan University", "aff_domain": "cs.whu.edu.cn;whu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nzhou2024sequential,\ntitle={Sequential Kernel Goodness-of-fit Testing},\nauthor={Zhengyu Zhou and Weiwei Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bmeUeCUMHA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 508717, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TOHWPTv59TwJ:scholar.google.com/&scioq=Sequential+Kernel+Goodness-of-fit+Testing&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "cs.whu.edu.cn;whu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Batch and match: black-box variational inference with a score-based divergence", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33622", "id": "bplNmU2ROC", "proceeding": "https://proceedings.mlr.press/v235/cai24d.html", "pdf": "https://openreview.net/pdf?id=bplNmU2ROC", "openreview": "https://openreview.net/forum?id=bplNmU2ROC", "author_site": "Diana Cai, Chirag Modi, Loucas Pillaud-Vivien, Charles Margossian, Robert Gower, David Blei, Lawrence Saul", "tldr": "", "abstract": "Most leading implementations of black-box variational inference (BBVI) are based on optimizing a stochastic evidence lower bound (ELBO). But such approaches to BBVI often converge slowly due to the high variance of their gradient estimates and their sensitivity to hyperparameters. 
In this work, we propose _batch and match_ (BaM), an alternative approach to BBVI based on a score-based divergence. Notably, this score-based divergence can be optimized by a closed-form proximal update for Gaussian variational families with full covariance matrices. We analyze the convergence of BaM when the target distribution is Gaussian, and we prove that in the limit of infinite batch size the variational parameter updates converge exponentially quickly to the target mean and covariance. We also evaluate the performance of BaM on Gaussian and non-Gaussian target distributions that arise from posterior inference in hierarchical and deep generative models. In these experiments, we find that BaM typically converges in fewer (and sometimes significantly fewer) gradient evaluations than leading implementations of BBVI based on ELBO maximization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Diana Cai;Chirag Modi;Loucas Pillaud-Vivien;Charles Margossian;Robert M. Gower;David Blei;Lawrence K. Saul", "authorids": "~Diana_Cai1;~Chirag_Modi1;~Loucas_Pillaud-Vivien1;~Charles_Margossian1;~Robert_M._Gower1;~David_Blei2;~Lawrence_K._Saul3", "gender": "F;M;M;M;M;M;M", "homepage": "https://www.dianacai.com;;https://thebiglouloup.github.io/loucaspillaudvivien/;https://charlesm93.github.io./;https://gowerrobert.github.io/;http://www.cs.columbia.edu/~blei/;https://users.flatironinstitute.org/~lsaul/", "dblp": "191/6693;57/6166;211/7988;;143/0056;86/1910;66/6611", "google_scholar": "WrLjBYgAAAAJ;yEh-Tj8AAAAJ;https://scholar.google.com/citations?hl=en;nPtLsvIAAAAJ;okKw87MAAAAJ;https://scholar.google.com.tw/citations?user=8OYE6iEAAAAJ;Xy7pzxoAAAAJ", "orcid": ";;;0000-0002-3274-5619;;;", "linkedin": ";;;charles-margossian-3428935b/;;;", "or_profile": "~Diana_Cai1;~Chirag_Modi1;~Loucas_Pillaud-Vivien1;~Charles_Margossian1;~Robert_M._Gower1;~David_Blei2;~Lawrence_Saul1", "aff": "Flatiron Institute;Simons Foundation;Flatiron Institute;Flatiron Institute;Flatiron Institute;Columbia University;University of California, San Diego", "aff_domain": "flatiron.org;simonsfoundation.org;flatironinstitute.org;flatironinstitute.org;simonsfoundation.org;columbia.edu;ucsd.edu", "position": "Postdoc;Postdoc;Postdoc;Postdoc;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\ncai2024batch,\ntitle={Batch and match: black-box variational inference with a score-based divergence},\nauthor={Diana Cai and Chirag Modi and Loucas Pillaud-Vivien and Charles Margossian and Robert M. Gower and David Blei and Lawrence K. 
Saul},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bplNmU2ROC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3983777, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=602991306338466334&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": "flatiron.org;simonsfoundation.org;flatironinstitute.org;flatironinstitute.org;simonsfoundation.org;columbia.edu;ucsd.edu", "author_num": 7, "aff_unique_index": "0;1;0;0;0;2;3", "aff_unique_norm": "Flatiron Institute;Simons Foundation;Columbia University;University of California, San Diego", "aff_unique_dep": ";;;", "aff_unique_url": "https://flatironinstitute.org;https://www.simonsfoundation.org;https://www.columbia.edu;https://www.ucsd.edu", "aff_unique_abbr": "Flatiron;Simons Foundation;Columbia;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "SciBench: Evaluating College-Level Scientific Problem-Solving Abilities of Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33621", "id": "bq1JEgioLr", "proceeding": "https://proceedings.mlr.press/v235/wang24z.html", "pdf": "https://openreview.net/pdf?id=bq1JEgioLr", "openreview": "https://openreview.net/forum?id=bq1JEgioLr", "author_site": "Xiaoxuan Wang, ziniu hu, Pan Lu, Yanqiao Zhu, Jieyu Zhang, Satyen Subramaniam, Arjun Loomba, Shichang Zhang, Yizhou Sun, Wei Wang", "tldr": "", "abstract": "Most existing Large Language Model (LLM) benchmarks on scientific problem reasoning focus on problems grounded in high-school subjects and are confined to elementary algebraic operations. To systematically examine the reasoning capabilities required for solving complex scientific problems, we introduce an expansive benchmark suite SciBench for LLMs. SciBench contains a carefully curated dataset featuring a range of collegiate-level scientific problems from mathematics, chemistry, and physics domains. Based on the dataset, we conduct an in-depth benchmarking study of representative open-source and proprietary LLMs with various prompting strategies. The results reveal that current LLMs fall short of delivering satisfactory performance, with the best overall score of merely 43.22%. Furthermore, through a detailed user study, we categorize the errors made by LLMs into ten problem-solving abilities. Our analysis indicates that no single prompting strategy significantly outperforms the others and some strategies that demonstrate improvements in certain problem-solving skills could result in declines in other skills. 
We envision that SciBench will catalyze further developments in the reasoning abilities of LLMs, thereby ultimately contributing to scientific research and discovery.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoxuan Wang;Ziniu Hu;Pan Lu;Yanqiao Zhu;Jieyu Zhang;Satyen Subramaniam;Arjun R Loomba;Shichang Zhang;Yizhou Sun;Wei Wang", "authorids": "~Xiaoxuan_Wang2;~Ziniu_Hu1;~Pan_Lu2;~Yanqiao_Zhu1;~Jieyu_Zhang1;~Satyen_Subramaniam1;~Arjun_R_Loomba1;~Shichang_Zhang2;~Yizhou_Sun1;~Wei_Wang13", "gender": "F;M;M;M;M;M;M;F;F;M", "homepage": ";http://acbull.github.io;https://sxkdz.github.io;https://jieyuz2.github.io/;;;https://shichangzh.github.io/;http://web.cs.ucla.edu/~yzsun/;http://www.cs.ucla.edu/~weiwang;https://lupantech.github.io/", "dblp": ";180/5436;67/8383-1;;;;234/4118;37/3868;w/WeiWang.html;", "google_scholar": "5LDKaEYAAAAJ;x6ct1CsAAAAJ;NBbJT3AAAAAJ;T_INUHUAAAAJ;;;TYqG0x4AAAAJ;https://scholar.google.com.tw/citations?user=TQgOjK0AAAAJ;UedS9LQAAAAJ;IyucsdQAAAAJ", "orcid": ";;0000-0003-2205-5304;0000-0002-1846-2436;;;0000-0003-0954-5018;;0000-0002-8180-2886;", "linkedin": "mandy-wang-a72046192/;;;jieyu-zhang-3baaa8154/;satyen-sub;arjun-raj-loomba-63473719b/;shichang-zhang-4430a4106/;;wei-wang-8800845/;pan-lu-9308909a/", "or_profile": "~Xiaoxuan_Wang2;~Ziniu_Hu1;~Yanqiao_Zhu1;~Jieyu_Zhang1;~Satyen_Subramaniam1;~Arjun_R_Loomba1;~Shichang_Zhang2;~Yizhou_Sun1;~Wei_Wang13;~Pan_Lu1", "aff": ", University of California, Los Angeles;Deepmind;University of California, Los Angeles;University of Washington;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "cs.ucla.edu;deepmind.com;ucla.edu;cs.washington.edu;ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu;ucla.edu;ucla.edu", "position": "PhD student;Visiting Researcher;PhD student;PhD student;Undergrad student;Undergrad student;PhD student;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nwang2024scibench,\ntitle={SciBench: Evaluating College-Level Scientific Problem-Solving Abilities of Large Language Models},\nauthor={Xiaoxuan Wang and Ziniu Hu and Pan Lu and Yanqiao Zhu and Jieyu Zhang and Satyen Subramaniam and Arjun R Loomba and Shichang Zhang and Yizhou Sun and Wei Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bq1JEgioLr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3081806, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18110388007971651414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cs.ucla.edu;deepmind.com;ucla.edu;cs.washington.edu;ucla.edu;ucla.edu;cs.ucla.edu;ucla.edu;ucla.edu;ucla.edu", "author_num": 10, "aff_unique_index": "0;1;0;2;0;0;0;0;0;0", "aff_unique_norm": "University of California, Los Angeles;DeepMind;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucla.edu;https://deepmind.com;https://www.washington.edu", "aff_unique_abbr": "UCLA;DeepMind;UW", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Challenges and Considerations in the Evaluation of Bayesian Causal Discovery", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33620", "id": "bqgtkBDkNs", "proceeding": "https://proceedings.mlr.press/v235/karimi-mamaghan24a.html", "pdf": "https://openreview.net/pdf?id=bqgtkBDkNs", "openreview": "https://openreview.net/forum?id=bqgtkBDkNs", "author_site": "Amir Mohammad Karimi Mamaghan, Panagiotis Tigas, Karl Johansson, Yarin Gal, Yashas Annadani, Stefan Bauer", "tldr": "", "abstract": "Representing uncertainty in causal discovery is a crucial component for experimental design, and more broadly, for safe and reliable causal decision making. Bayesian Causal Discovery (BCD) offers a principled approach to encapsulating this uncertainty. Unlike non-Bayesian causal discovery, which relies on a single estimated causal graph and model parameters for assessment, evaluating BCD presents challenges due to the nature of its inferred quantity \u2013 the posterior distribution. As a result, the research community has proposed various metrics to assess the quality of the approximate posterior. However, there is, to date, no consensus on the most suitable metric(s) for evaluation. In this work, we reexamine this question by dissecting various metrics and understanding their limitations. Through extensive empirical evaluation, we find that many existing metrics fail to exhibit a strong correlation with the quality of approximation to the true posterior, especially in scenarios with low sample sizes where BCD is most desirable. We highlight the suitability (or lack thereof) of these metrics under two distinct factors: the identifiability of the underlying causal model and the quantity of available data. Both factors affect the entropy of the true posterior, indicating that the current metrics are less fitting in settings of higher entropy. 
Our findings underline the importance of a more nuanced evaluation of new methods by taking into account the nature of the true posterior, as well as guide and motivate the development of new evaluation procedures for this challenge.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amir Mohammad Karimi Mamaghan;Panagiotis Tigas;Karl Henrik Johansson;Yarin Gal;Yashas Annadani;Stefan Bauer", "authorids": "~Amir_Mohammad_Karimi_Mamaghan1;~Panagiotis_Tigas1;~Karl_Henrik_Johansson1;~Yarin_Gal1;~Yashas_Annadani1;~Stefan_Bauer1", "gender": "M;;;;;", "homepage": ";https://people.kth.se/~kallej/;http://www.cs.ox.ac.uk/people/yarin.gal/website//;https://yashasannadani.com;https://cifar.ca/bios/stefan-bauer/;https://ptigas.com", "dblp": "302/9103;;67/9076;190/7411;;159/7244", "google_scholar": "UAmANdAAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ;ExgzcVMAAAAJ;O-oICE8AAAAJ;https://scholar.google.co.uk/citations?user=E9ITYW0AAAAJ", "orcid": "0000-0002-6820-948X;;;;;0000-0001-9944-1129", "linkedin": "amk6610/;;;;;", "or_profile": "~Amir_Mohammad_Karimi_Mamaghan1;~Karl_Henrik_Johansson1;~Yarin_Gal1;~Yashas_Annadani1;~Stefan_Bauer1;~Panagiotis_Tigkas1", "aff": "KTH Royal Institute of Technology;KTH Royal Institute of Technology;University of Oxford;Max Planck Institute for Intelligent Systems, Max-Planck Institute;Technische Universit\u00e4t M\u00fcnchen;Isomorphic Labs (Alphabet entity)", "aff_domain": "kth.se;kth.se;ox.ac.uk;tuebingen.mpg.de;tum.de;google.com", "position": "PhD student;Full Professor;Associate Professor;PhD student;Associate Professor;Researcher", "bibtex": "@inproceedings{\nmamaghan2024challenges,\ntitle={Challenges and Considerations in the Evaluation of Bayesian Causal Discovery},\nauthor={Amir Mohammad Karimi Mamaghan and Panagiotis Tigas and Karl Henrik Johansson and Yarin Gal and Yashas Annadani and Stefan Bauer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bqgtkBDkNs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1174369, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1148740938478544174&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "kth.se;kth.se;ox.ac.uk;tuebingen.mpg.de;tum.de;google.com", "author_num": 6, "aff_unique_index": "0;0;1;2;3;4", "aff_unique_norm": "KTH Royal Institute of Technology;University of Oxford;Max Planck Institute for Intelligent Systems;Technische Universit\u00e4t M\u00fcnchen;Isomorphic Labs", "aff_unique_dep": ";;Intelligent Systems;;", "aff_unique_url": "https://www.kth.se;https://www.ox.ac.uk;https://www.mpi-is.mpg.de;https://www.tum.de;https://isomorphiclabs.com", "aff_unique_abbr": "KTH;Oxford;MPI-IS;TUM;Isomorphic Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;2;3", "aff_country_unique": "Sweden;United Kingdom;Germany;United States" }, { "title": "Precise Accuracy / Robustness Tradeoffs in Regression: Case of General Norms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33619", "id": "btYeH65fI3", "proceeding": "https://proceedings.mlr.press/v235/dohmatob24c.html", "pdf": "https://openreview.net/pdf?id=btYeH65fI3", "openreview": "https://openreview.net/forum?id=btYeH65fI3", "author_site": "Elvis Dohmatob, Meyer Scetbon", "tldr": "", "abstract": "In this paper, 
we investigate the impact of test-time adversarial attacks on linear regression models and determine the optimal level of robustness that any model can reach while maintaining a given level of standard predictive performance (accuracy). Through quantitative estimates, we uncover fundamental tradeoffs between adversarial robustness and accuracy in different regimes. We obtain a precise characterization which distinguishes between regimes where robustness is achievable without hurting standard accuracy and regimes where a tradeoff might be unavoidable. Our findings are empirically confirmed with simple experiments that represent a variety of settings. This work covers feature covariance matrices and attack norms of any nature, extending previous works in this area.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elvis Dohmatob;Meyer Scetbon", "authorids": "~Elvis_Dohmatob1;~Meyer_Scetbon1", "gender": "M;M", "homepage": "http://dohmatob.github.io/;https://meyerscetbon.github.io", "dblp": "134/9794;249/8054", "google_scholar": "https://scholar.google.fr/citations?user=FDWgJY8AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Elvis_Dohmatob1;~Meyer_Scetbon1", "aff": "Meta Facebook;Microsoft", "aff_domain": "facebook.com;microsoft.com", "position": "Researcher;Researcher", "bibtex": "@inproceedings{\ndohmatob2024precise,\ntitle={Precise Accuracy / Robustness Tradeoffs in Regression: Case of General Norms},\nauthor={Elvis Dohmatob and Meyer Scetbon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=btYeH65fI3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 888923, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9927259306196101993&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "email": "facebook.com;microsoft.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Meta;Microsoft", "aff_unique_dep": "Meta Platforms, Inc.;Microsoft Corporation", "aff_unique_url": "https://meta.com;https://www.microsoft.com", "aff_unique_abbr": "Meta;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Feature Distribution on Graph Topology Mediates the Effect of Graph Convolution: Homophily Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33618", "id": "buW1Bi6XFw", "proceeding": "https://proceedings.mlr.press/v235/lee24m.html", "pdf": "https://openreview.net/pdf?id=buW1Bi6XFw", "openreview": "https://openreview.net/forum?id=buW1Bi6XFw", "author_site": "Soo Yong Lee, Sunwoo Kim, Fanchen Bu, Jaemin Yoo, Jiliang Tang, Kijung Shin", "tldr": "", "abstract": "How would randomly shuffling feature vectors among nodes from the same class affect graph neural networks (GNNs)? The feature shuffle, intuitively, perturbs the dependence between graph topology and features (A-X dependence) for GNNs to learn from. Surprisingly, we observe a consistent and significant improvement in GNN performance following the feature shuffle. Having overlooked the impact of A-X dependence on GNNs, the prior literature does not provide a satisfactory understanding of the phenomenon. Thus, we raise two research questions. First, how should A-X dependence be measured, while controlling for potential confounds? Second, how does A-X dependence affect GNNs? 
In response, we (i) propose a principled measure for A-X dependence, (ii) design a random graph model that controls A-X dependence, (iii) establish a theory on how A-X dependence relates to graph convolution, and (iv) present empirical analysis on real-world graphs that align with the theory. We conclude that A-X dependence mediates the effect of graph convolution, such that smaller dependence improves GNN-based node classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Soo Yong Lee;Sunwoo Kim;Fanchen Bu;Jaemin Yoo;Jiliang Tang;Kijung Shin", "authorids": "~Soo_Yong_Lee1;~Sunwoo_Kim4;~Fanchen_Bu1;~Jaemin_Yoo1;~Jiliang_Tang1;~Kijung_Shin2", "gender": "M;M;M;M;M;M", "homepage": "https://syleetolow.notion.site/Soo-Yong-s-Homepage-2e5cfa74f1784bf4957e7ba0ab0fbc7a;https://sites.google.com/view/sunwoo97;https://github.com/bokveizen;https://jaeminyoo.github.io/;https://www.cse.msu.edu/~tangjili/;https://kijungs.github.io/", "dblp": "348/9631;16/3210.html;270/0123;211/2843;64/10812;153/2052", "google_scholar": "U3vZd0kAAAAJ;fYxrC_EAAAAJ;XjNu7-AAAAAJ;https://scholar.google.co.kr/citations?user=LcxcTRUAAAAJ;WtzKMWAAAAAJ;https://scholar.google.co.kr/citations?user=Yp3Cz5AAAAAJ", "orcid": "0000-0001-7957-7600;0009-0006-6002-169X;0000-0003-0497-3902;0000-0001-7237-5117;0000-0001-7125-3898;0000-0002-2872-1526", "linkedin": "syleeheal/;;fanchen-bu-1268a1255/;jaemin-yoo-8b3678142/;;kijungshin/", "or_profile": "~Soo_Yong_Lee1;~Sunwoo_Kim4;~Fanchen_Bu1;~Jaemin_Yoo1;~Jiliang_Tang1;~Kijung_Shin2", "aff": "KAIST;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST;Michigan State University;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.edu;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;msu.edu;kaist.ac.kr", "position": "PhD student;MS student;PhD student;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nlee2024feature,\ntitle={Feature Distribution on Graph Topology Mediates the Effect of Graph Convolution: Homophily Perspective},\nauthor={Soo Yong Lee and Sunwoo Kim and Fanchen Bu and Jaemin Yoo and Jiliang Tang and Kijung Shin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=buW1Bi6XFw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3550189, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5742967654710729236&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.edu;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;msu.edu;kaist.ac.kr", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Michigan State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.msu.edu", "aff_unique_abbr": "KAIST;MSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "Optimal Ridge Regularization for Out-of-Distribution Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33617", "id": "bvPYroQgc3", "proceeding": "https://proceedings.mlr.press/v235/patil24a.html", "pdf": "https://openreview.net/pdf?id=bvPYroQgc3", "openreview": "https://openreview.net/forum?id=bvPYroQgc3", "author_site": "Pratik Patil, Jin-Hong Du, Ryan Tibshirani", "tldr": 
"", "abstract": "We study the behavior of optimal ridge regularization and optimal ridge risk for out-of-distribution prediction, where the test distribution deviates arbitrarily from the train distribution. We establish general conditions that determine the sign of the optimal regularization level under covariate and regression shifts. These conditions capture the alignment between the covariance and signal structures in the train and test data and reveal stark differences compared to the in-distribution setting. For example, a negative regularization level can be optimal under covariate shift or regression shift, even when the training features are isotropic or the design is underparameterized. Furthermore, we prove that the optimally tuned risk is monotonic in the data aspect ratio, even in the out-of-distribution setting and when optimizing over negative regularization levels. In general, our results do not make any modeling assumptions for the train or the test distributions, except for moment bounds, and allow for arbitrary shifts and the widest possible range of (negative) regularization levels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pratik Patil;Jin-Hong Du;Ryan Tibshirani", "authorids": "~Pratik_Patil1;~Jin-Hong_Du1;~Ryan_Tibshirani2", "gender": ";;", "homepage": "https://www.stat.berkeley.edu/~pratikpatil/;;", "dblp": "48/2268;;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Pratik_Patil1;~Jin-Hong_Du1;~Ryan_Tibshirani2", "aff": "University of California, Berkeley;;", "aff_domain": "berkeley.edu;;", "position": "Postdoc;;", "bibtex": "@inproceedings{\npatil2024optimal,\ntitle={Optimal Ridge Regularization for Out-of-Distribution Prediction},\nauthor={Pratik Patil and Jin-Hong Du and Ryan Tibshirani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bvPYroQgc3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1882786, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18324678336886177924&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9, "email": "berkeley.edu;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "An Unsupervised Approach for Periodic Source Detection in Time Series", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33616", "id": "bwZlD7mYoa", "proceeding": "https://proceedings.mlr.press/v235/demirel24b.html", "pdf": "https://openreview.net/pdf?id=bwZlD7mYoa", "openreview": "https://openreview.net/forum?id=bwZlD7mYoa", "author_site": "Berken Utku Demirel, Christian Holz", "tldr": "", "abstract": "Detection of periodic patterns of interest within noisy time series data plays a critical role in various tasks, spanning from health monitoring to behavior analysis. 
Existing learning techniques often rely on labels or clean versions of signals for detecting the periodicity, and those employing self-supervised methods are required to apply proper augmentations, which is already challenging for time series and can result in collapse\u2014all representations collapse to a single point due to strong augmentation. In this work, we propose a novel method to detect the periodicity in time series without the need for any labels or requiring tailored positive or negative data generation mechanisms. We mitigate the collapse issue by ensuring the learned representations retain information from the original samples without imposing any variance constraints on the batch. Our experiments in three time-series tasks against state-of-the-art learning methods show that the proposed approach consistently outperforms prior works, achieving performance improvements of more than 45--50%, showing its effectiveness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Berken Utku Demirel;Christian Holz", "authorids": "~Berken_Utku_Demirel2;~Christian_Holz1", "gender": "M;M", "homepage": "https://berken-demirel.github.io/BerkenUtku-Demirel/;https://siplab.org", "dblp": "283/8117;79/7439-1", "google_scholar": "https://scholar.google.ch/citations?user=zbgxpdIAAAAJ;OfXP9jMAAAAJ", "orcid": ";0000-0001-9655-9519", "linkedin": ";", "or_profile": "~Berken_Utku_Demirel2;~Christian_Holz1", "aff": "Department of Computer Science, ETHZ - ETH Zurich;ETH Zurich", "aff_domain": "inf.ethz.ch;inf.ethz.ch", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ndemirel2024an,\ntitle={An Unsupervised Approach for Periodic Source Detection in Time Series},\nauthor={Berken Utku Demirel and Christian Holz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bwZlD7mYoa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1114320, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17393462468730337156&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "inf.ethz.ch;inf.ethz.ch", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Plug-and-Play image restoration with Stochastic deNOising REgularization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33615", "id": "byAXJTk0LH", "proceeding": "https://proceedings.mlr.press/v235/renaud24a.html", "pdf": "https://openreview.net/pdf?id=byAXJTk0LH", "openreview": "https://openreview.net/forum?id=byAXJTk0LH", "author_site": "Marien Renaud, Jean Prost, Arthur Leclaire, Nicolas Papadakis", "tldr": "", "abstract": "Plug-and-Play (PnP) algorithms are a class of iterative algorithms that address image inverse problems by combining a physical model and a deep neural network for regularization. Even if they produce impressive image restoration results, these algorithms rely on a non-standard use of a denoiser on images that are less and less noisy along the iterations, which contrasts with recent algorithms based on Diffusion Models (DM), where the denoiser is applied only on re-noised images. 
We propose a new PnP framework, called Stochastic deNOising REgularization (SNORE), which applies the denoiser only on images with noise of an adequate level. It is based on an explicit stochastic regularization, which leads to a stochastic gradient descent algorithm to solve ill-posed inverse problems. A convergence analysis of this algorithm and its annealing extension is provided. Experimentally, we show that SNORE is competitive with respect to state-of-the-art methods on deblurring and inpainting tasks, both quantitatively and qualitatively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marien Renaud;Jean Prost;Arthur Leclaire;Nicolas Papadakis", "authorids": "~Marien_Renaud1;~Jean_Prost1;~Arthur_Leclaire1;~Nicolas_Papadakis3", "gender": ";M;M;M", "homepage": ";;https://perso.telecom-paristech.fr/aleclaire/;https://www.math.u-bordeaux.fr/~npapadak/", "dblp": ";285/6483;130/1813;70/1520", "google_scholar": ";l8V9zvAAAAAJ;;https://scholar.google.fr/citations?user=hfyLiLYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Marien_Renaud1;~Jean_Prost1;~Arthur_Leclaire1;~Nicolas_Papadakis3", "aff": ";Universit\u00e9 Paris Cit\u00e9;T\u00e9l\u00e9com Paris;CNRS/IMB", "aff_domain": ";u-paris.fr;telecom-paris.fr;u-bordeaux.fr", "position": ";Postdoc;Associate Professor;Researcher", "bibtex": "@inproceedings{\nrenaud2024plugandplay,\ntitle={Plug-and-Play image restoration with Stochastic de{NO}ising {RE}gularization},\nauthor={Marien Renaud and Jean Prost and Arthur Leclaire and Nicolas Papadakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=byAXJTk0LH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10062059, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13177718514686683634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";u-paris.fr;telecom-paris.fr;u-bordeaux.fr", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e9 Paris Cit\u00e9;T\u00e9l\u00e9com Paris;CNRS", "aff_unique_dep": ";;Institut de Math\u00e9matiques de Bordeaux", "aff_unique_url": "https://www.universite-paris.fr;https://www.telecom-paris.fr;https://www.cnrs.fr", "aff_unique_abbr": "UPC;T\u00e9l\u00e9com Paris;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Autoformalizing Euclidean Geometry", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33614", "id": "bylZbZOsGA", "proceeding": "https://proceedings.mlr.press/v235/murphy24a.html", "pdf": "https://openreview.net/pdf?id=bylZbZOsGA", "openreview": "https://openreview.net/forum?id=bylZbZOsGA", "author_site": "Logan Murphy, Kaiyu Yang, Jialiang Sun, Zhaoyu Li, Anima Anandkumar, Xujie Si", "tldr": "", "abstract": "Autoformalization involves automatically translating informal math into formal theorems and proofs that are machine-verifiable. Euclidean geometry provides an interesting and controllable domain for studying autoformalization. In this paper, we introduce a neuro-symbolic framework for autoformalizing Euclidean geometry, which combines domain knowledge, SMT solvers, and large language models (LLMs). One challenge in Euclidean geometry is that informal proofs rely on diagrams, leaving gaps in texts that are hard to formalize.
To address this issue, we use theorem provers to fill in such diagrammatic information automatically, so that the LLM only needs to autoformalize the explicit textual steps, making it easier for the model. We also provide automatic semantic evaluation for autoformalized theorem statements. We construct LeanEuclid, an autoformalization benchmark consisting of problems from Euclid's Elements and the UniGeo dataset formalized in the Lean proof assistant. Experiments with GPT-4 and GPT-4V show the capability and limitations of state-of-the-art LLMs on autoformalizing geometry problems. The data and code are available at https://github.com/loganrjmurphy/LeanEuclid.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Logan Murphy;Kaiyu Yang;Jialiang Sun;Zhaoyu Li;Anima Anandkumar;Xujie Si", "authorids": "lmurphy@cs.toronto.edu;~Kaiyu_Yang1;~Jialiang_Sun2;~Zhaoyu_Li3;~Anima_Anandkumar1;~Xujie_Si1", "gender": ";M;M;M;;M", "homepage": ";https://yangky11.github.io;https://jacksun200312.github.io/JackWebPage;https://www.zhaoyu-li.com/;;https://xujie.si", "dblp": ";177/9276;;;;142/8449", "google_scholar": ";FciCu4EAAAAJ;Hjd_pFAAAAAJ;;;Ru-jrx4AAAAJ", "orcid": ";0000-0002-2777-612X;;;;", "linkedin": ";kaiyuy;jack-sun-2741711b5/;zhaoyu-li-9171892a5/;;", "or_profile": "lmurphy@cs.toronto.edu;~Kaiyu_Yang1;~Jialiang_Sun2;~Zhaoyu_Li3;~Anima_Anandkumar1;~Xujie_Si1", "aff": ";California Institute of Technology;Department of Computer Science, University of Toronto;University of Toronto;;University of Toronto", "aff_domain": ";caltech.edu;cs.toronto.edu;cs.toronto.edu;;toronto.edu", "position": ";Postdoc;PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nmurphy2024autoformalizing,\ntitle={Autoformalizing Euclidean Geometry},\nauthor={Logan Murphy and Kaiyu Yang and Jialiang Sun and Zhaoyu Li and Anima Anandkumar and Xujie Si},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bylZbZOsGA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2034256, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9610198611307025129&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": ";caltech.edu;cs.toronto.edu;cs.toronto.edu;;toronto.edu", "author_num": 6, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "California Institute of Technology;University of Toronto", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.caltech.edu;https://www.utoronto.ca", "aff_unique_abbr": "Caltech;U of T", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Pasadena;Toronto;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Decomposing Uncertainty for Large Language Models through Input Clarification Ensembling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33613", "id": "byxXa99PtF", "proceeding": "https://proceedings.mlr.press/v235/hou24b.html", "pdf": "https://openreview.net/pdf?id=byxXa99PtF", "openreview": "https://openreview.net/forum?id=byxXa99PtF", "author_site": "Bairu Hou, Yujian Liu, Kaizhi Qian, Jacob Andreas, Shiyu Chang, Yang Zhang", "tldr": "", "abstract": "Uncertainty decomposition refers to the task of decomposing the total uncertainty of a predictive model into aleatoric (data) uncertainty, resulting from inherent randomness in the data-generating process, and 
epistemic (model) uncertainty, resulting from missing information in the model's training data. In large language models (LLMs) specifically, identifying sources of uncertainty is an important step toward improving reliability, trustworthiness, and interpretability, but remains an important open research question. In this paper, we introduce an uncertainty decomposition framework for LLMs, called input clarification ensembling, which can be applied to any pre-trained LLM. Our approach generates a set of clarifications for the input, feeds them into an LLM, and ensembles the corresponding predictions. We show that, when aleatoric uncertainty arises from ambiguity or under-specification in LLM inputs, this approach makes it possible to factor an (un-clarified) LLM's predictions into separate aleatoric and epistemic terms, using a decomposition similar to the one employed by Bayesian neural networks. Empirical evaluations demonstrate that input clarification ensembling provides accurate and reliable uncertainty quantification on several language processing tasks. Code and data are available at https://github.com/UCSB-NLP-Chang/llm_uncertainty.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bairu Hou;Yujian Liu;Kaizhi Qian;Jacob Andreas;Shiyu Chang;Yang Zhang", "authorids": "~Bairu_Hou2;~Yujian_Liu1;~Kaizhi_Qian1;~Jacob_Andreas1;~Shiyu_Chang2;~Yang_Zhang3", "gender": ";M;;M;Unspecified;M", "homepage": "https://hbr690188270.github.io/;https://yujianll.github.io;;http://web.mit.edu/jda/www;http://people.csail.mit.edu/chang87/;", "dblp": "274/7151;206/8853;212/6254;97/8154;28/9988;06/6785-1", "google_scholar": "FO7taJgAAAAJ;rLetNLIAAAAJ;;dnZ8udEAAAAJ;r21asW4AAAAJ;_-5PSgQAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Bairu_Hou2;~Yujian_Liu1;~Kaizhi_Qian1;~Jacob_Andreas1;~Shiyu_Chang2;~Yang_Zhang3", "aff": "University of California, Santa Barbara;University of California, Santa Barbara;International Business Machines;Microsoft;University of California, Santa Barbara;International Business Machines", "aff_domain": "ucsb.edu;ucsb.edu;ibm.com;microsoft.com;ucsb.edu;ibm.com", "position": "PhD student;PhD student;Researcher;Researcher;Assistant Professor;Research Staff Employee", "bibtex": "@inproceedings{\nhou2024decomposing,\ntitle={Decomposing Uncertainty for Large Language Models through Input Clarification Ensembling},\nauthor={Bairu Hou and Yujian Liu and Kaizhi Qian and Jacob Andreas and Shiyu Chang and Yang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=byxXa99PtF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 642349, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2562707741888856024&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "email": "ucsb.edu;ucsb.edu;ibm.com;microsoft.com;ucsb.edu;ibm.com", "author_num": 6, "aff_unique_index": "0;0;1;2;0;1", "aff_unique_norm": "University of California, Santa Barbara;International Business Machines Corporation;Microsoft", "aff_unique_dep": ";;Microsoft Corporation", "aff_unique_url": "https://www.ucsb.edu;https://www.ibm.com;https://www.microsoft.com", "aff_unique_abbr": "UCSB;IBM;Microsoft", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "What Will My Model Forget? 
Forecasting Forgotten Examples in Language Model Refinement", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33612", "id": "bzNwexOPWm", "proceeding": "https://proceedings.mlr.press/v235/jin24d.html", "pdf": "https://openreview.net/pdf?id=bzNwexOPWm", "openreview": "https://openreview.net/forum?id=bzNwexOPWm", "author_site": "Xisen Jin, Xiang Ren", "tldr": "", "abstract": "Language models deployed in the wild make errors. However, simply updating the model with the corrected error instances causes catastrophic forgetting---the updated model makes errors on instances learned during the instruction tuning or upstream training phase. Randomly replaying upstream data yields unsatisfactory performance and often comes with high variance and poor controllability. To this end, we try to forecast upstream examples that will be forgotten due to a model update for improved controllability of the replay process and interpretability. We train forecasting models given a collection of online learned examples and corresponding forgotten upstream pre-training examples. We propose a partially interpretable forecasting model based on the observation that changes in pre-softmax logit scores of pretraining examples resemble those of online learned examples, which performs decently on BART but fails on T5 models. We further show that a black-box classifier based on inner products of example representations achieves better forecasting performance over a series of setups. Finally, we show that we reduce forgetting of upstream pretraining examples by replaying examples that are forecasted to be forgotten, demonstrating the practical utility of forecasting example forgetting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xisen Jin;Xiang Ren", "authorids": "~Xisen_Jin3;~Xiang_Ren1", "gender": "M;M", "homepage": "https://xsjin.xyz;https://shanzhenren.github.io/", "dblp": "222/9324;36/360-1", "google_scholar": "https://scholar.google.com/citations?hl=en;_moJlrIAAAAJ", "orcid": ";", "linkedin": ";xren7", "or_profile": "~Xisen_Jin3;~Xiang_Ren1", "aff": "University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\njin2024what,\ntitle={What Will My Model Forget?
Forecasting Forgotten Examples in Language Model Refinement},\nauthor={Xisen Jin and Xiang Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bzNwexOPWm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 472881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17902762272606913529&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 7, "email": "usc.edu;usc.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Language Model\u2019s Guide Through Latent Space", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33611", "id": "c0LoolDFw4", "proceeding": "https://proceedings.mlr.press/v235/von-rutte24a.html", "pdf": "https://openreview.net/pdf?id=c0LoolDFw4", "openreview": "https://openreview.net/forum?id=c0LoolDFw4", "author_site": "Dimitri von R\u00fctte, Sotiris Anagnostidis, Gregor Bachmann, Thomas Hofmann", "tldr": "", "abstract": "Concept guidance has emerged as a cheap and simple way to control the behavior of language models by probing their hidden representations for concept vectors and using them to perturb activations at inference time. While the focus of previous work has largely been on *truthfulness*, in this paper we extend this framework to a richer set of concepts such as *appropriateness*, *humor*, *creativity* and *quality*, and explore to what degree current detection and guidance strategies work in these challenging settings. To facilitate evaluation, we develop a novel metric for concept guidance that takes into account both the success of concept elicitation as well as the potential degradation in fluency of the guided model. Our extensive experiments reveal that while some concepts such as *truthfulness* more easily allow for guidance with current techniques, novel concepts such as *appropriateness* or *humor* either remain difficult to elicit, need extensive tuning to work, or even experience confusion. Moreover, we find that probes with optimal detection accuracies do not necessarily make for the optimal guides, contradicting previous observations for *truthfulness*. 
Our work warrants a deeper investigation into the interplay between detectability, guidability, and the nature of the concept, and we hope that our rich experimental test-bed for guidance research inspires stronger follow-up approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dimitri von R\u00fctte;Sotiris Anagnostidis;Gregor Bachmann;Thomas Hofmann", "authorids": "~Dimitri_von_R\u00fctte1;~Sotiris_Anagnostidis1;~Gregor_Bachmann1;~Thomas_Hofmann1", "gender": ";M;M;M", "homepage": ";;http://www.da.inf.ethz.ch/people/GregorBachmann;http://www.da.inf.ethz.ch/", "dblp": ";286/1763;;h/ThHofmann", "google_scholar": "wVQcUf8AAAAJ;qjzTKWUAAAAJ;bbGqqloAAAAJ;T3hAyLkAAAAJ", "orcid": ";;;", "linkedin": "dimitri-von-r%C3%BCtte-890633215/;sotiris-anagnostidis-b064a5129/;;thomas-hofmann-1ab2402/", "or_profile": "~Dimitri_von_R\u00fctte1;~Sotiris_Anagnostidis1;~Gregor_Bachmann1;~Thomas_Hofmann1", "aff": "ETH Zurich;ETH Zurich;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;inf.ethz.ch;ethz.ch;ethz.ch", "position": "MS student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nr{\\\"u}tte2024a,\ntitle={A Language Model{\\textquoteright}s Guide Through Latent Space},\nauthor={Dimitri von R{\\\"u}tte and Sotiris Anagnostidis and Gregor Bachmann and Thomas Hofmann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c0LoolDFw4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3356547, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2855738414083458801&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "ethz.ch;inf.ethz.ch;ethz.ch;ethz.ch", "author_num": 4, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "A3S: A General Active Clustering Method with Pairwise Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33610", "id": "c18noxRh3X", "proceeding": "https://proceedings.mlr.press/v235/deng24b.html", "pdf": "https://openreview.net/pdf?id=c18noxRh3X", "openreview": "https://openreview.net/forum?id=c18noxRh3X", "author_site": "Xun Deng, Junlong Liu, Han Zhong, Fuli Feng, Chen Shen, Xiangnan He, Jieping Ye, Zheng Wang", "tldr": "", "abstract": "Active clustering aims to boost the clustering performance by integrating human-annotated pairwise constraints through strategic querying. Conventional approaches with semi-supervised clustering schemes encounter high query costs when applied to large datasets with numerous classes. To address these limitations, we propose a novel Adaptive Active Aggregation and Splitting (A3S) framework, falling within the cluster-adjustment scheme in active clustering. A3S features strategic active clustering adjustment on the initial cluster result, which is obtained by an adaptive clustering algorithm. In particular, our cluster adjustment is inspired by the quantitative analysis of Normalized mutual information gain under the information theory framework and can provably improve the clustering quality. 
The proposed A3S framework significantly elevates the performance and scalability of active clustering. In extensive experiments across diverse real-world datasets, A3S achieves desired results with significantly fewer human queries compared with existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xun Deng;Junlong Liu;Han Zhong;Fuli Feng;Chen Shen;Xiangnan He;Jieping Ye;Zheng Wang", "authorids": "~Xun_Deng1;~Junlong_Liu2;~Han_Zhong1;~Fuli_Feng1;~Chen_Shen7;~Xiangnan_He1;~Jieping_Ye4;~Zheng_Wang32", "gender": "M;M;;M;M;M;M;", "homepage": ";;https://hanzhong-ml.github.io/;https://fulifeng.github.io/;;http://staff.ustc.edu.cn/~hexn;http://yelabs.net/;", "dblp": "154/0106/;;137/8096.html;183/9198;55/5393-3;59/1007;03/5454;", "google_scholar": "LILR85MAAAAJ;;Bk5q_pAAAAAJ;https://scholar.google.com.sg/citations?user=QePM4u8AAAAJ;b6vn1uMAAAAJ;https://scholar.google.com.sg/citations?user=X45Go24AAAAJ;T9AzhwcAAAAJ;", "orcid": ";0000-0001-5125-3022;;0000-0002-5828-9842;;0000-0001-8472-7992;0000-0001-8662-5818;", "linkedin": ";;;;;;;", "or_profile": "~Xun_Deng1;~Junlong_Liu2;~Han_Zhong1;~Fuli_Feng1;~Chen_Shen7;~Xiangnan_He1;~Jieping_Ye4;~Zheng_Wang32", "aff": "University of Science and Technology of China;;Peking University;University of Science and Technology of China;Alibaba Group;University of Science and Technology of China;Alibaba Group;", "aff_domain": "mail.ustc.edu.cn;;stu.pku.edu.cn;ustc.edu.cn;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com;", "position": "PhD student;;PhD student;Full Professor;Researcher;Professor;Principal Researcher;", "bibtex": "@inproceedings{\ndeng2024as,\ntitle={A3S: A General Active Clustering Method with Pairwise Constraints},\nauthor={Xun Deng and Junlong Liu and Han Zhong and Fuli Feng and Chen Shen and Xiangnan He and Jieping Ye and Zheng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c18noxRh3X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1460083, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2012834412990307573&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "mail.ustc.edu.cn;;stu.pku.edu.cn;ustc.edu.cn;alibaba-inc.com;ustc.edu.cn;alibaba-inc.com;", "author_num": 8, "aff_unique_index": "0;1;0;2;0;2", "aff_unique_norm": "University of Science and Technology of China;Peking University;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;http://www.pku.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "USTC;Peking U;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Iterative Preference Learning from Human Feedback: Bridging Theory and Practice for RLHF under KL-constraint", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33609", "id": "c1AKcA6ry1", "proceeding": "https://proceedings.mlr.press/v235/xiong24a.html", "pdf": "https://openreview.net/pdf?id=c1AKcA6ry1", "openreview": "https://openreview.net/forum?id=c1AKcA6ry1", "author_site": "Wei Xiong, Hanze Dong, Chenlu Ye, Ziqi Wang, Han Zhong, Heng Ji, Nan Jiang, Tong Zhang", "tldr": "", "abstract": "This paper studies the theoretical framework of the alignment process of generative models with Reinforcement Learning from Human Feedback (RLHF). 
We consider a standard mathematical formulation, the reverse-KL regularized contextual bandit for RLHF. Despite its widespread practical application, a rigorous theoretical analysis of this formulation remains open. We investigate its behavior in three distinct settings---offline, online, and hybrid---and propose efficient algorithms with finite-sample theoretical guarantees. Moving towards practical applications, our framework, with a robust approximation of the information-theoretical policy improvement oracle, naturally gives rise to several novel RLHF algorithms. This includes an iterative version of the Direct Preference Optimization (DPO) algorithm for online settings, and a multi-step rejection sampling strategy for offline scenarios. Our empirical evaluations on real-world alignment experiments with large language models demonstrate that these proposed methods significantly surpass existing strong baselines, such as DPO and Rejection Sampling Optimization (RSO), showcasing the connections between solid theoretical foundations and their potent practical implementations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Xiong;Hanze Dong;Chenlu Ye;Ziqi Wang;Han Zhong;Heng Ji;Nan Jiang;Tong Zhang", "authorids": "~Wei_Xiong9;~Hanze_Dong1;~Chenlu_Ye1;~Ziqi_Wang2;~Han_Zhong1;~Heng_Ji3;~Nan_Jiang2;~Tong_Zhang2", "gender": "M;M;F;;;F;M;M", "homepage": "https://weixiongust.github.io/WeiXiongUST/index.html;https://hendrydong.github.io/;https://chenluye99.github.io/;https://www.wzq016.github.io;https://hanzhong-ml.github.io/;http://blender.cs.illinois.edu/hengji.html;http://nanjiang.cs.illinois.edu;http://tongzhang-ml.org", "dblp": "33/4054-15;228/7798;336/2092;38/8097-3;137/8096.html;;06/4489-8;07/4227-1", "google_scholar": "m2-OwQEAAAAJ;g9WLzWoAAAAJ;c8yK5XsAAAAJ;xYRZiZkAAAAJ;Bk5q_pAAAAAJ;z7GCqT4AAAAJ;nUlanA8AAAAJ;LurWtuYAAAAJ", "orcid": ";;;;;;;0000-0002-5511-2558", "linkedin": ";hanze-dong/;https://www.linkedin.cn/incareer/in/chenlu-ye-9b015b184;;;;nan-jiang-28139937/;", "or_profile": "~Wei_Xiong9;~Hanze_Dong1;~Chenlu_Ye1;~Ziqi_Wang2;~Han_Zhong1;~Heng_Ji3;~Nan_Jiang2;~Tong_Zhang2", "aff": "Google;SalesForce;University of Illinois, Urbana Champaign;Meta Facebook;Peking University;University of Illinois, Urbana-Champaign;University of Illinois, Urbana Champaign;UIUC", "aff_domain": "deepmind.com;salesforce.com;illinois.edu;meta.com;stu.pku.edu.cn;uiuc.edu;illinois.edu;illinois.edu", "position": "Intern;Researcher;PhD student;Intern;PhD student;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nxiong2024iterative,\ntitle={Iterative Preference Learning from Human Feedback: Bridging Theory and Practice for {RLHF} under {KL}-constraint},\nauthor={Wei Xiong and Hanze Dong and Chenlu Ye and Ziqi Wang and Han Zhong and Heng Ji and Nan Jiang and Tong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c1AKcA6ry1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 841522, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5185507906793268794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "deepmind.com;salesforce.com;illinois.edu;meta.com;stu.pku.edu.cn;uiuc.edu;illinois.edu;illinois.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;4;5;2;2", "aff_unique_norm": "Google;Salesforce;University of Illinois Urbana-Champaign;Meta;Peking
University;University of Illinois", "aff_unique_dep": "Google;;;Meta Platforms, Inc.;;", "aff_unique_url": "https://www.google.com;https://www.salesforce.com;https://illinois.edu;https://meta.com;http://www.pku.edu.cn;https://illinois.edu", "aff_unique_abbr": "Google;Salesforce;UIUC;Meta;Peking U;UIUC", "aff_campus_unique_index": "0;2;2;2;2", "aff_campus_unique": "Mountain View;;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Deep Neural Room Acoustics Primitive", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33608", "id": "c2CKmP9l5X", "proceeding": "https://proceedings.mlr.press/v235/he24b.html", "pdf": "https://openreview.net/pdf?id=c2CKmP9l5X", "openreview": "https://openreview.net/forum?id=c2CKmP9l5X", "author_site": "Yuhang He, Anoop Cherian, Gordon Wichern, Andrew Markham", "tldr": "", "abstract": "The primary objective of room acoustics is to model the intricate sound propagation dynamics from any source to receiver position within enclosed 3D spaces. These dynamics are encapsulated in the form of a 1D room impulse response (RIR). Precisely measuring RIR is difficult due to the complexity of sound propagation encompassing reflection, diffraction, and absorption. In this work, we propose to learn a continuous neural room acoustics field that implicitly encodes all essential sound propagation primitives for each enclosed 3D space, so that we can infer the RIR corresponding to arbitrary source-receiver positions unseen in the training dataset. Our framework, dubbed DeepNeRAP, is trained in a self-supervised manner without requiring direct access to RIR ground truth that is often needed in prior methods. The key idea is to design two cooperative acoustic agents to actively probe a 3D space, one emitting and the other receiving sound at various locations. Analyzing this sound helps to inversely characterize the acoustic primitives. Our framework is well-grounded in the fundamental physical principles of sound propagation, including reciprocity and globality, and thus is acoustically interpretable and meaningful. 
We present experiments on both synthetic and real-world datasets, demonstrating superior quality in RIR estimation against closely related methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhang He;Anoop Cherian;Gordon Wichern;Andrew Markham", "authorids": "~Yuhang_He3;~Anoop_Cherian1;~Gordon_Wichern1;~Andrew_Markham2", "gender": "M;;M;M", "homepage": "https://yuhanghe01.github.io/;;;http://users.cecs.anu.edu.au/~cherian/", "dblp": ";72/6049;83/7169;44/7734", "google_scholar": "H1p3ve8AAAAJ;;https://scholar.google.co.uk/citations?user=g3JTO9EAAAAJ;https://scholar.google.com.au/citations?hl=en", "orcid": ";;;0000-0002-5566-0351", "linkedin": ";;;anoop-cherian-4678a04/", "or_profile": "~Yuhang_He3;~Gordon_Wichern1;~Andrew_Markham2;~Anoop_Cherian2", "aff": "University of Oxford;Mitsubishi Electric Research Labs;University of Oxford;Mitsubishi Electric Research Labs", "aff_domain": "ox.ac.uk;merl.com;ox.ac.uk;merl.com", "position": "PhD student;Principal Research Scientist;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\nhe2024deep,\ntitle={Deep Neural Room Acoustics Primitive},\nauthor={Yuhang He and Anoop Cherian and Gordon Wichern and Andrew Markham},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c2CKmP9l5X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5062412, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10633725098285414080&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "ox.ac.uk;merl.com;ox.ac.uk;merl.com", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Oxford;Mitsubishi Electric Research Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.merl.com", "aff_unique_abbr": "Oxford;MERL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Position: Insights from Survey Methodology can Improve Training Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33607", "id": "c3ls5AVOw7", "proceeding": "https://proceedings.mlr.press/v235/eckman24a.html", "pdf": "https://openreview.net/pdf?id=c3ls5AVOw7", "openreview": "https://openreview.net/forum?id=c3ls5AVOw7", "author_site": "Stephanie Eckman, Barbara Plank, Frauke Kreuter", "tldr": "", "abstract": "Whether future AI models are fair, trustworthy, and aligned with the public's interests rests in part on our ability to collect accurate data about what we want the models to do. However, collecting high-quality data is difficult, and few AI/ML researchers are trained in data collection methods. Recent research in data-centric AI has shown that higher quality training data leads to better performing models, making this the right moment to introduce AI/ML researchers to the field of survey methodology, the science of data collection. We summarize insights from the survey methodology literature and discuss how they can improve the quality of training and feedback data.
We also suggest collaborative research ideas into how biases in data collection can be mitigated, making models more accurate and human-centric.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stephanie Eckman;Barbara Plank;Frauke Kreuter", "authorids": "~Stephanie_Eckman1;~Barbara_Plank2;~Frauke_Kreuter1", "gender": ";;F", "homepage": ";https://bplank.github.io/;https://umd.edu/", "dblp": ";46/521;240/9345.html", "google_scholar": ";;iD8Vb4MAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Stephanie_Eckman1;~Barbara_Plank2;~Frauke_Kreuter1", "aff": ";IT University of Copenhagen;University of Maryland", "aff_domain": ";itu.dk;umd.edu", "position": ";Full Professor;Full Professor", "bibtex": "@inproceedings{\neckman2024position,\ntitle={Position: Insights from Survey Methodology can Improve Training Data},\nauthor={Stephanie Eckman and Barbara Plank and Frauke Kreuter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c3ls5AVOw7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 496891, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6467978180199146511&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": ";itu.dk;umd.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "IT University of Copenhagen;University of Maryland", "aff_unique_dep": ";", "aff_unique_url": "https://itu.dk;https://www/umd.edu", "aff_unique_abbr": "ITU;UMD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Denmark;United States" }, { "title": "Hybrid Reinforcement Learning from Offline Observation Alone", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33606", "id": "c6rVlTKpb5", "proceeding": "https://proceedings.mlr.press/v235/song24a.html", "pdf": "https://openreview.net/pdf?id=c6rVlTKpb5", "openreview": "https://openreview.net/forum?id=c6rVlTKpb5", "author_site": "Yuda Song, J. Bagnell, Aarti Singh", "tldr": "", "abstract": "We consider the hybrid reinforcement learning setting where the agent has access to both offline data and online interactive access. While RL research typically assumes offline data contains complete action, reward and transition information, datasets with only state information (also known as *observation-only* datasets) are more general, abundant and practical. This motivates our study of the *hybrid RL with observation-only offline dataset* framework. While the task of competing with the best policy ``covered'' by the offline data can be solved if a *reset* model of the environment is provided (i.e., one that can be reset to any state), we show evidence of hardness of competing when only given the weaker *trace* model (i.e., one can only reset to the initial states and must produce full traces through the environment), without further assumption of *admissibility* of the offline data. Under the admissibility assumptions-- that the offline data could actually be produced by the policy class we consider-- we propose the first algorithm in the trace model setting that provably matches the performance of algorithms that leverage a reset model. 
We also perform proof-of-concept experiments that suggest the effectiveness of our algorithm in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuda Song;Drew Bagnell;Aarti Singh", "authorids": "~Yuda_Song2;~Drew_Bagnell2;~Aarti_Singh1", "gender": "M;;F", "homepage": "https://yudasong.github.io/;https://robotwhisperer.org/;https://www.cs.cmu.edu/~aarti", "dblp": "250/4880-1;;64/5328", "google_scholar": "0QDCG8IAAAAJ;7t4jbPQAAAAJ;vGBcNVAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuda_Song2;~Drew_Bagnell2;~Aarti_Singh1", "aff": "Carnegie Mellon University;Carnegie Mellon University;University of Wisconsin - Madison", "aff_domain": "andrew.cmu.edu;cmu.edu;wisc.edu", "position": "PhD student;Associate Professor;PhD student", "bibtex": "@inproceedings{\nsong2024hybrid,\ntitle={Hybrid Reinforcement Learning from Offline Observation Alone},\nauthor={Yuda Song and Drew Bagnell and Aarti Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c6rVlTKpb5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1271293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7-AABkT22FkJ:scholar.google.com/&scioq=Hybrid+Reinforcement+Learning+from+Offline+Observation+Alone&hl=en&as_sdt=0,23", "gs_version_total": 6, "email": "andrew.cmu.edu;cmu.edu;wisc.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Carnegie Mellon University;University of Wisconsin-Madison", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.wisc.edu", "aff_unique_abbr": "CMU;UW-Madison", "aff_campus_unique_index": "1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Resilient and Accessible Distribution-Preserving Watermark for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33605", "id": "c8qWiNiqRY", "proceeding": "https://proceedings.mlr.press/v235/wu24h.html", "pdf": "https://openreview.net/pdf?id=c8qWiNiqRY", "openreview": "https://openreview.net/forum?id=c8qWiNiqRY", "author_site": "Yihan Wu, Zhengmian Hu, Junfeng Guo, Hongyang Zhang, Heng Huang", "tldr": "", "abstract": "Watermarking techniques offer a promising way to identify machine-generated content via embedding covert information into the contents generated from language models. A challenge in the domain lies in preserving the distribution of original generated content after watermarking. Our research extends and improves upon existing watermarking framework, placing emphasis on the importance of a Distribution-Preserving (DiP) watermark. Contrary to the current strategies, our proposed DiPmark simultaneously preserves the original token distribution during watermarking (distribution-preserving), is detectable without access to the language model API and prompts (accessible), and is provably robust to moderate changes of tokens (resilient). DiPmark operates by selecting a random set of tokens prior to the generation of a word, then modifying the token distribution through a distribution-preserving reweight function to enhance the probability of these selected tokens during the sampling process. 
Extensive empirical evaluation on various language models and tasks demonstrates our approach's distribution-preserving property, accessibility, and resilience, making it an effective solution for watermarking tasks that demand impeccable quality preservation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihan Wu;Zhengmian Hu;Junfeng Guo;Hongyang Zhang;Heng Huang", "authorids": "~Yihan_Wu1;~Zhengmian_Hu1;~Junfeng_Guo2;~Hongyang_Zhang1;~Heng_Huang1", "gender": "M;M;M;M;M", "homepage": "https://yihwu.github.io/;https://www.umd.edu/;https://junfenggo.github.io/;https://hongyanz.github.io/;https://www.cs.umd.edu/~heng/", "dblp": ";285/4945;;23/10537-1;03/281", "google_scholar": "cajTg_wAAAAJ;4eXiWWgAAAAJ;TqblqYcAAAAJ;https://scholar.google.com/citations?hl=en;4OqLaDwAAAAJ", "orcid": ";0000-0003-0316-146X;;;", "linkedin": ";;;;", "or_profile": "~Yihan_Wu1;~Zhengmian_Hu1;~Junfeng_Guo2;~Hongyang_Zhang1;~Heng_Huang1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland Institute for Advanced Computer Studies, University of Maryland, College Park;School of Computer Science, University of Waterloo;Department of Computer Science, University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umiacs.umd.edu;uwaterloo.ca;cs.umd.edu", "position": "PhD student;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwu2024a,\ntitle={A Resilient and Accessible Distribution-Preserving Watermark for Large Language Models},\nauthor={Yihan Wu and Zhengmian Hu and Junfeng Guo and Hongyang Zhang and Heng Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c8qWiNiqRY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2022986, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4256396966204569198&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "umd.edu;umd.edu;umiacs.umd.edu;uwaterloo.ca;cs.umd.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Maryland;University of Waterloo;University of Maryland, College Park", "aff_unique_dep": ";School of Computer Science;Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://uwaterloo.ca;https://www/umd.edu", "aff_unique_abbr": "UMD;UWaterloo;UMD", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "College Park;Waterloo", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Scalable Pre-training of Large Autoregressive Image Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33604", "id": "c92KDfEZTg", "proceeding": "https://proceedings.mlr.press/v235/el-nouby24a.html", "pdf": "https://openreview.net/pdf?id=c92KDfEZTg", "openreview": "https://openreview.net/forum?id=c92KDfEZTg", "author_site": "Alaaeldin Ali, Michal Klein, Shuangfei Zhai, Miguel Angel Bautista Martin, Vaishaal Shankar, Alexander Toshev, Joshua M Susskind, Armand Joulin", "tldr": "", "abstract": "This paper introduces AIM, a collection of vision models pre-trained with an autoregressive objective. These models are inspired by their textual counterparts, i.e., Large Language Models (LLMs), and exhibit similar scaling properties.
Specifically, we highlight two key findings: (1) the performance of the visual features scales with both the model capacity and the quantity of data, and (2) the value of the objective function correlates with the performance of the model on downstream tasks. We illustrate the practical implication of these findings by pre-training a 7 billion parameter AIM on 2 billion images, which achieves 84.0% on ImageNet-1k with a frozen trunk. Interestingly, even at this scale, we observe no sign of saturation in performance, suggesting that AIM potentially represents a new frontier for training large-scale vision models. The pre-training of AIM is similar to the pre-training of LLMs, and does not require any image-specific strategy to stabilize the training at scale.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alaaeldin El-Nouby;Michal Klein;Shuangfei Zhai;Miguel \u00c1ngel Bautista;Vaishaal Shankar;Alexander T Toshev;Joshua M. Susskind;Armand Joulin", "authorids": "~Alaaeldin_El-Nouby1;~Michal_Klein1;~Shuangfei_Zhai3;~Miguel_\u00c1ngel_Bautista1;~Vaishaal_Shankar1;~Alexander_T_Toshev1;~Joshua_M._Susskind1;~Armand_Joulin2", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://github.com/michalk8;http://cs.binghamton.edu/~szhai2;;http://www.apple.com;;http://aelnouby.github.io/;http://alex.toshev.tech;http://vaishaal.com", "dblp": "332/4607;;38/10085;132/7797;;215/4993;96/2687;159/3628", "google_scholar": "zByzdzcAAAAJ;G6vdBYsAAAAJ;ZrRs-qoAAAAJ;Sv2TGqsAAAAJ;;jxpBMwwAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0002-2433-6380;;;;;;;", "linkedin": "michal-klein-148697165/;;;joshua-susskind-8ab2ab5/;;;alexander-toshev-9270726/;", "or_profile": "~Michal_Klein1;~Shuangfei_Zhai3;~Miguel_\u00c1ngel_Bautista1;~Joshua_M._Susskind1;~Armand_Joulin2;~Alaaeldin_M_El-Nouby1;~Alexander_Toshev1;~vaishaal_naanny_shankar1", "aff": "Apple;Apple;Apple;Apple;Meta Facebook;Apple;Apple;Apple", "aff_domain": "apple.com;apple.com;apple.com;apple.com;fb.com;apple.com;apple.com;apple.com", "position": "Researcher;Research Scientist;Research Scientist;Researcher;Associate Professor;Researcher;research scientist;Researcher", "bibtex": "@inproceedings{\nel-nouby2024scalable,\ntitle={Scalable Pre-training of Large Autoregressive Image Models},\nauthor={Alaaeldin El-Nouby and Michal Klein and Shuangfei Zhai and Miguel {\\'A}ngel Bautista and Vaishaal Shankar and Alexander T Toshev and Joshua M.
Susskind and Armand Joulin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c92KDfEZTg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 616793, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15609135621145061491&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "apple.com;apple.com;apple.com;apple.com;fb.com;apple.com;apple.com;apple.com", "author_num": 8, "aff_unique_index": "0;0;0;0;1;0;0;0", "aff_unique_norm": "Apple;Meta", "aff_unique_dep": "Apple Inc.;Meta Platforms, Inc.", "aff_unique_url": "https://www.apple.com;https://meta.com", "aff_unique_abbr": "Apple;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Adaptation of Network Depth and Width for Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33603", "id": "c9HddKGiYk", "proceeding": "https://proceedings.mlr.press/v235/thapa24b.html", "pdf": "https://openreview.net/pdf?id=c9HddKGiYk", "openreview": "https://openreview.net/forum?id=c9HddKGiYk", "author_site": "Jeevan Thapa, Rui Li", "tldr": "", "abstract": "While existing dynamic architecture-based continual learning methods adapt network width by growing new branches, they overlook the critical aspect of network depth. We propose a novel non-parametric Bayesian approach to infer network depth and adapt network width while maintaining model performance across tasks. Specifically, we model the growth of network depth with a beta process and apply drop-connect regularization to network width using a conjugate Bernoulli process. Our results show that our proposed method achieves superior or comparable performance with state-of-the-art methods across various continual learning benchmarks. 
Moreover, our approach can be readily extended to unsupervised continual learning, showcasing competitive performance compared to existing techniques.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeevan Thapa;Rui Li", "authorids": "~Jeevan_Thapa1;~Rui_Li3", "gender": "M;M", "homepage": "https://jeevan11vision.github.io/;https://ruililuci.com", "dblp": ";96/4282-2", "google_scholar": ";AHx53ngAAAAJ", "orcid": ";0000-0001-5096-1553", "linkedin": ";", "or_profile": "~Jeevan_Thapa1;~Rui_Li3", "aff": "Rochester Institute of Technology;Rochester Institute of Technology", "aff_domain": "rit.edu;rit.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nthapa2024bayesian,\ntitle={Bayesian Adaptation of Network Depth and Width for Continual Learning},\nauthor={Jeevan Thapa and Rui Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=c9HddKGiYk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1097863, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6974973369526991554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "rit.edu;rit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Rochester Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.rit.edu", "aff_unique_abbr": "RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Transforming and Combining Rewards for Aligning Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33602", "id": "cAWbm9KRZO", "proceeding": "https://proceedings.mlr.press/v235/wang24ay.html", "pdf": "https://openreview.net/pdf?id=cAWbm9KRZO", "openreview": "https://openreview.net/forum?id=cAWbm9KRZO", "author_site": "Zihao Wang, Chirag Nagpal, Jonathan Berant, Jacob Eisenstein, Alexander D'Amour, Sanmi Koyejo, Victor Veitch", "tldr": "", "abstract": "A common approach for aligning language models to human preferences is to first learn a reward model from preference data, and then use this reward model to update the language model. We study two closely related problems that arise in this approach. First, any monotone transformation of the reward model preserves preference ranking; is there a choice that is \"better\" than others? Second, we often wish to align language models to multiple properties: how should we combine multiple reward models? Using a probabilistic interpretation of the alignment procedure, we identify a natural choice for transformation for (the common case of) rewards learned from Bradley-Terry preference models. The derived transformation is straightforward: we apply a log-sigmoid function to the centered rewards, a method we term \"LSC-transformation\" (log-sigmoid-centered transformation). This transformation has two important properties. First, it emphasizes improving poorly-performing outputs, rather than outputs that already score well. This mitigates both underfitting (where some prompts are not improved) and reward hacking (where the model learns to exploit misspecification of the reward model). 
Second, it enables principled aggregation of rewards by linking summation to logical conjunction: the sum of transformed rewards corresponds to the probability that the output is \"good\" in all measured properties, in a sense we make precise. Experiments aligning language models to be both helpful and harmless using RLHF show substantial improvements over the baseline (non-transformed) approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zihao Wang;Chirag Nagpal;Jonathan Berant;Jacob Eisenstein;Alexander Nicholas D'Amour;Sanmi Koyejo;Victor Veitch", "authorids": "~Zihao_Wang8;~Chirag_Nagpal1;~Jonathan_Berant1;~Jacob_Eisenstein1;~Alexander_D'Amour1;~Sanmi_Koyejo1;~Victor_Veitch1", "gender": ";;M;M;M;;M", "homepage": ";http://cs.cmu.edu/~chiragn;http://www.cs.tau.ac.il/~joberant/;https://jacobeisenstein.github.io;http://www.alexdamour.com;http://victorveitch.com;https://cs.stanford.edu/~sanmi/", "dblp": "https://dblp.org/rec/journals/corr/abs-2105-13440;149/2771;31/8178;82/2305;209/4892;167/5650;14/8885", "google_scholar": "jyBHUM8AAAAJ;rAbWdAkAAAAJ;https://scholar.google.co.il/citations?user=xCYHonIAAAAJ;Wb_lnjAAAAAJ;okP0uukAAAAJ;https://scholar.google.com/citations?hl=en;EaaOeJwAAAAJ", "orcid": ";;;;;;0000-0002-4023-419X", "linkedin": "https://www.linkedin.com/mwlite/in/zihao-wang-2b1645123;;;;;;sanmi-koyejo-984754/", "or_profile": "~Zihao_Wang8;~Chirag_Nagpal1;~Jonathan_Berant1;~Jacob_Eisenstein1;~Alexander_D'Amour1;~Victor_Veitch1;~Oluwasanmi_O_Koyejo1", "aff": "University of Chicago;Google;Tel Aviv University;Google;Google;Google;Google", "aff_domain": "uchicago.edu;google.com;tau.ac.il;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;Associate Professor;Research Scientist;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nwang2024transforming,\ntitle={Transforming and Combining Rewards for Aligning Large Language Models},\nauthor={Zihao Wang and Chirag Nagpal and Jonathan Berant and Jacob Eisenstein and Alexander Nicholas D'Amour and Sanmi Koyejo and Victor Veitch},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cAWbm9KRZO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6239621, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14434576745253093758&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uchicago.edu;google.com;tau.ac.il;google.com;google.com;google.com;google.com", "author_num": 7, "aff_unique_index": "0;1;2;1;1;1;1", "aff_unique_norm": "University of Chicago;Google;Tel Aviv University", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.uchicago.edu;https://www.google.com;https://www.tau.ac.il", "aff_unique_abbr": "UChicago;Google;TAU", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "United States;Israel" }, { "title": "AST-T5: Structure-Aware Pretraining for Code Generation and Understanding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33601", "id": "cBWVJh5Fvf", "proceeding": "https://proceedings.mlr.press/v235/gong24c.html", "pdf": "https://openreview.net/pdf?id=cBWVJh5Fvf", "openreview": "https://openreview.net/forum?id=cBWVJh5Fvf", "author_site": "Linyuan Gong, Mostafa Elhoushi, Alvin Cheung", "tldr": "", "abstract": 
"Large language models (LLMs) have made significant advancements in code-related tasks, yet many LLMs treat code as simple sequences, neglecting its structured nature. We introduce AST-T5, a novel pretraining paradigm that leverages the Abstract Syntax Tree (AST) for enhanced code generation, transpilation, and understanding. Using dynamic programming, our AST-Aware Segmentation retains code structure, while our AST-Aware Span Corruption objective equips the model to reconstruct various code structures. Unlike other models, AST-T5 avoids complex program analyses or architectural changes, so it integrates seamlessly with any encoder-decoder Transformer. Evaluations show that AST-T5 consistently outperforms similar-sized LMs across various code-related tasks including HumanEval and MBPP. Structure-awareness makes AST-T5 particularly powerful in code-to-code tasks, surpassing CodeT5 by 2 points in exact match score for the Bugs2Fix task and by 3 points in exact match score for Java-C# Transpilation in CodeXGLUE. Our code and model are publicly available at https://github.com/gonglinyuan/ast_t5.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Linyuan Gong;Mostafa Elhoushi;Alvin Cheung", "authorids": "~Linyuan_Gong1;~Mostafa_Elhoushi1;~Alvin_Cheung2", "gender": "M;M;", "homepage": "https://gonglinyuan.com;;", "dblp": "213/8172;157/6350;", "google_scholar": "w5A4QPQAAAAJ;https://scholar.google.ca/citations?user=y_cwSKAAAAAJ;", "orcid": ";0000-0001-6172-4510;", "linkedin": ";mostafaelhoushi/;", "or_profile": "~Linyuan_Gong1;~Mostafa_Elhoushi1;~Alvin_Cheung2", "aff": "University of California, Berkeley;Meta;", "aff_domain": "berkeley.edu;meta.com;", "position": "PhD student;Researcher;", "bibtex": "@inproceedings{\ngong2024astt,\ntitle={{AST}-T5: Structure-Aware Pretraining for Code Generation and Understanding},\nauthor={Linyuan Gong and Mostafa Elhoushi and Alvin Cheung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cBWVJh5Fvf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 388794, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9535961078207435437&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "berkeley.edu;meta.com;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Berkeley;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Position: Rethinking Post-Hoc Search-Based Neural Approaches for Solving Large-Scale Traveling Salesman Problems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33600", "id": "cEJ9jNJuJP", "proceeding": "https://proceedings.mlr.press/v235/xia24f.html", "pdf": "https://openreview.net/pdf?id=cEJ9jNJuJP", "openreview": "https://openreview.net/forum?id=cEJ9jNJuJP", "author_site": "Yifan Xia, Xianliang Yang, Zichuan Liu, Zhihao Liu, Lei Song, Jiang Bian", "tldr": "", "abstract": "Recent advancements in solving large-scale traveling salesman problems (TSP) utilize the heatmap-guided Monte Carlo tree search (MCTS) paradigm, where machine learning (ML) models generate heatmaps, indicating the probability distribution 
of each edge being part of the optimal solution, to guide MCTS in solution finding. However, our theoretical and experimental analysis raises doubts about the effectiveness of ML-based heatmap generation. In support of this, we demonstrate that a simple baseline method can outperform complex ML approaches in heatmap generation. Furthermore, we question the practical value of the heatmap-guided MCTS paradigm. To substantiate this, our findings show its inferiority to the LKH-3 heuristic despite the paradigm's reliance on problem-specific, hand-crafted strategies. For the future, we suggest research directions focused on developing more theoretically sound heatmap generation methods and exploring autonomous, generalizable ML approaches for combinatorial problems. The code is available for review: https://github.com/xyfffff/rethink_mcts_for_tsp.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Xia;Xianliang Yang;Zichuan Liu;Zhihao Liu;Lei Song;Jiang Bian", "authorids": "~Yifan_Xia2;~Xianliang_Yang1;~Zichuan_Liu3;~Zhihao_Liu3;~Lei_Song3;~Jiang_Bian1", "gender": "M;M;;M;M;M", "homepage": ";https://github.com/VictorYXL;https://zichuan-liu.github.io/;;;https://sites.google.com/view/jiangbian", "dblp": ";;;;76/893-1.html;09/851-2.html", "google_scholar": "QH2Yjk0AAAAJ;;SUvzKxwAAAAJ;https://scholar.google.com/citations?hl=zh-CN;pXDSOocAAAAJ;pZBEnY8AAAAJ", "orcid": ";;;;;0000-0002-9472-600X", "linkedin": "yifan-xia-476bb5287/;;;;;jbian/", "or_profile": "~Yifan_Xia2;~Xianliang_Yang1;~Zichuan_Liu3;~Zhihao_Liu3;~Lei_Song3;~Jiang_Bian1", "aff": "Nanjing University;Microsoft;Nanjing University;Institute of Automation, Chinese Academy of Sciences;Microsoft;Microsoft", "aff_domain": "nju.edu.cn;microsoft.com;nju.edu.cn;ia.ac.cn;microsoft.com;microsoft.com", "position": "MS student;Researcher;MS student;PhD student;Principal Researcher;Partner Research Manager", "bibtex": "@inproceedings{\nxia2024position,\ntitle={Position: Rethinking Post-Hoc Search-Based Neural Approaches for Solving Large-Scale Traveling Salesman Problems},\nauthor={Yifan Xia and Xianliang Yang and Zichuan Liu and Zhihao Liu and Lei Song and Jiang Bian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cEJ9jNJuJP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 527288, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7518191977523936687&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "nju.edu.cn;microsoft.com;nju.edu.cn;ia.ac.cn;microsoft.com;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;0;2;1;1", "aff_unique_norm": "Nanjing University;Microsoft;Chinese Academy of Sciences", "aff_unique_dep": ";Microsoft Corporation;Institute of Automation", "aff_unique_url": "https://www.nju.edu.cn;https://www.microsoft.com;http://www.ia.cas.cn", "aff_unique_abbr": "Nanjing U;Microsoft;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Towards Causal Foundation Model: on Duality between Optimal Balancing and Attention", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33599", "id": "cFDaYtZR4u", "proceeding": "https://proceedings.mlr.press/v235/zhang24x.html", "pdf": "https://openreview.net/pdf?id=cFDaYtZR4u", "openreview": 
"https://openreview.net/forum?id=cFDaYtZR4u", "author_site": "Jiaqi Zhang, Joel Jennings, Agrin Hilmkil, Nick Pawlowski, Cheng Zhang, Chao Ma", "tldr": "", "abstract": "Foundation models have brought changes to the landscape of machine learning, demonstrating sparks of human-level intelligence across a diverse array of tasks. However, a gap persists in complex tasks such as causal inference, primarily due to challenges associated with intricate reasoning steps and high numerical precision requirements. In this work, we take a first step towards building causally-aware foundation models for treatment effect estimations. We propose a novel, theoretically justified method called Causal Inference with Attention (CInA), which utilizes multiple unlabeled datasets to perform self-supervised causal learning, and subsequently enables zero-shot causal inference on unseen tasks with new data. This is based on our theoretical results that demonstrate the primal-dual connection between optimal covariate balancing and self-attention, facilitating zero-shot causal inference through the final layer of a trained transformer-type architecture. We demonstrate empirically that CInA effectively generalizes to out-of-distribution datasets and various real-world datasets, matching or even surpassing traditional per-dataset methodologies. These results provide compelling evidence that our method has the potential to serve as a stepping stone for the development of causal foundation models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaqi Zhang;Joel Jennings;Agrin Hilmkil;Nick Pawlowski;Cheng Zhang;Chao Ma", "authorids": "~Jiaqi_Zhang2;~Joel_Jennings1;~Agrin_Hilmkil1;~Nick_Pawlowski2;~Cheng_Zhang1;~Chao_Ma2", "gender": "F;;;M;F;M", "homepage": ";;;http://nickpawlowski.de;http://cheng-zhang.org;", "dblp": ";;;198/1040;82/6384-5;", "google_scholar": ";;;https://scholar.google.de/citations?user=a5u9fVYAAAAJ;r40iAwIAAAAJ;https://scholar.google.co.uk/citations?user=UWP3kWEAAAAJ", "orcid": "0000-0001-9039-6843;;;0000-0002-2748-7977;;", "linkedin": "vicky-jiaqi-zhang-34b490180/;;;nickpawlowski;;", "or_profile": "~Jiaqi_Zhang2;~Joel_Jennings1;~Agrin_Hilmkil1;~Nick_Pawlowski2;~Cheng_Zhang1;~Chao_Ma2", "aff": "Apple;;;Microsoft;Microsoft;Microsoft", "aff_domain": "apple.com;;;microsoft.com;microsoft.com;microsoft.com", "position": "Intern;;;Senior Researcher;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nzhang2024towards,\ntitle={Towards Causal Foundation Model: on Duality between Optimal Balancing and Attention},\nauthor={Jiaqi Zhang and Joel Jennings and Agrin Hilmkil and Nick Pawlowski and Cheng Zhang and Chao Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cFDaYtZR4u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1519710, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11865923090565645526&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "apple.com;;;microsoft.com;microsoft.com;microsoft.com", "author_num": 6, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Apple;Microsoft", "aff_unique_dep": "Apple Inc.;Microsoft Corporation", "aff_unique_url": "https://www.apple.com;https://www.microsoft.com", "aff_unique_abbr": "Apple;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United 
States" }, { "title": "Discovering Mixtures of Structural Causal Models from Time Series Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33598", "id": "cHJAUdam3i", "proceeding": "https://proceedings.mlr.press/v235/varambally24a.html", "pdf": "https://openreview.net/pdf?id=cHJAUdam3i", "openreview": "https://openreview.net/forum?id=cHJAUdam3i", "author_site": "Sumanth Varambally, Yian Ma, Rose Yu", "tldr": "", "abstract": "Discovering causal relationships from time series data is significant in fields such as finance, climate science, and neuroscience. However, contemporary techniques rely on the simplifying assumption that data originates from the same causal model, while in practice, data is heterogeneous and can stem from different causal models. In this work, we relax this assumption and perform causal discovery from time series data originating from *a mixture of causal models*. We propose a general variational inference-based framework called MCD to infer the underlying causal models as well as the mixing probability of each sample. Our approach employs an end-to-end training process that maximizes an evidence-lower bound for the data likelihood. We present two variants: MCD-Linear for linear relationships and independent noise, and MCD-Nonlinear for nonlinear causal relationships and history-dependent noise. We demonstrate that our method surpasses state-of-the-art benchmarks in causal discovery tasks through extensive experimentation on synthetic and real-world datasets, particularly when the data emanates from diverse underlying causal graphs. Theoretically, we prove the identifiability of such a model under some mild assumptions. Implementation is available at [https://github.com/Rose-STL-Lab/MCD](https://github.com/Rose-STL-Lab/MCD).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sumanth Varambally;Yian Ma;Rose Yu", "authorids": "~Sumanth_Varambally1;~Yian_Ma1;~Rose_Yu1", "gender": ";M;F", "homepage": ";https://sites.google.com/view/yianma;http://roseyu.com", "dblp": ";;164/7314", "google_scholar": ";A0TFlacAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sumanth_Varambally1;~Yian_Ma1;~Rose_Yu1", "aff": ";University of California, San Diego;University of California, San Diego", "aff_domain": ";ucsd.edu;ucsd.edu", "position": ";Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nvarambally2024discovering,\ntitle={Discovering Mixtures of Structural Causal Models from Time Series Data},\nauthor={Sumanth Varambally and Yian Ma and Rose Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cHJAUdam3i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2447757, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3602400541678954792&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": ";ucsd.edu;ucsd.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Accelerating Heterogeneous Federated Learning with Closed-form Classifiers", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33597", "id": "cMige5MK1N", "proceeding": "https://proceedings.mlr.press/v235/fani-24a.html", "pdf": "https://openreview.net/pdf?id=cMige5MK1N", "openreview": "https://openreview.net/forum?id=cMige5MK1N", "author_site": "Eros Fan\u00ec, Raffaello Camoriano, Barbara Caputo, Marco Ciccone", "tldr": "", "abstract": "Federated Learning (FL) methods often struggle in highly statistically heterogeneous settings. Indeed, non-IID data distributions cause client drift and biased local solutions, particularly pronounced in the final classification layer, negatively impacting convergence speed and accuracy. To address this issue, we introduce *Federated Recursive Ridge Regression* (Fed3R). Our method fits a Ridge Regression classifier computed in closed form leveraging pre-trained features. Fed3R is immune to statistical heterogeneity and is invariant to the sampling order of the clients. Therefore, it proves particularly effective in cross-device scenarios. Furthermore, it is fast and efficient in terms of communication and computation costs, requiring up to two orders of magnitude fewer resources than the competitors. Finally, we propose to leverage the Fed3R parameters as an initialization for a softmax classifier and subsequently fine-tune the model using any FL algorithm (Fed3R with Fine-Tuning, Fed3R+FT). Our findings also indicate that maintaining a fixed classifier aids in stabilizing the training and learning more discriminative features in cross-device settings. Official website: https://fed-3r.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eros Fan\u00ec;Raffaello Camoriano;Barbara Caputo;Marco Ciccone", "authorids": "~Eros_Fan\u00ec1;~Raffaello_Camoriano1;~Barbara_Caputo1;~Marco_Ciccone1", "gender": "M;M;F;M", "homepage": "https://erosfani.github.io;https://www.iit.it/it/people-details/-/people/raffaello-camoriano;http://www.dauin.polito.it/personale/scheda/(nominativo)/barbara.caputo;https://marcociccone.github.io/", "dblp": "314/7956;166/1687;04/7038;191/9375", "google_scholar": "rwto7AgAAAAJ;vBBJ2wkAAAAJ;https://scholar.google.it/citations?user=mHbdIAwAAAAJ;https://scholar.google.it/citations?user=hOQjblcAAAAJ", "orcid": ";0000-0002-8890-2732;;", "linkedin": ";raffaellocamoriano;;", "or_profile": "~Eros_Fan\u00ec1;~Raffaello_Camoriano1;~Barbara_Caputo1;~Marco_Ciccone1", "aff": "Polytechnic University of Turin;Istituto Italiano di Tecnologia;Sapienza University of Rome;Vector Institute", "aff_domain": "polito.it;iit.it;uniroma1.it;vectorinstitute.ai", "position": "PhD student;Researcher;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nfan{\\`\\i}2024accelerating,\ntitle={Accelerating Heterogeneous Federated Learning with Closed-form Classifiers},\nauthor={Eros Fan{\\`\\i} and Raffaello Camoriano and Barbara Caputo and Marco Ciccone},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cMige5MK1N}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8118352, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1849226189464841872&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "polito.it;iit.it;uniroma1.it;vectorinstitute.ai", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Polytechnic University of Turin;Istituto Italiano di Tecnologia;Sapienza University of Rome;Vector Institute", 
"aff_unique_dep": ";;;", "aff_unique_url": "https://www.polito.it;https://www.iit.it;https://www.uniroma1.it;https://vectorinstitute.ai/", "aff_unique_abbr": "Polito;IIT;Sapienza;Vector Institute", "aff_campus_unique_index": "1", "aff_campus_unique": ";Rome", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Italy;Canada" }, { "title": "Understanding Finetuning for Factual Knowledge Extraction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33596", "id": "cPsn9AcOYh", "proceeding": "https://proceedings.mlr.press/v235/ghosal24a.html", "pdf": "https://openreview.net/pdf?id=cPsn9AcOYh", "openreview": "https://openreview.net/forum?id=cPsn9AcOYh", "author_site": "Gaurav Ghosal, Tatsunori Hashimoto, Aditi Raghunathan", "tldr": "", "abstract": "In this work, we study the impact of QA fine-tuning data on downstream factuality. We show that fine-tuning on lesser-known facts that are poorly stored during pretraining yields significantly worse factuality than fine-tuning on well-known facts, even when all facts are seen during pretraining. We prove this phenomenon theoretically, showing that training on lesser-known facts can lead the model to ignore subject entity names and instead output a generic plausible response even when the relevant factual knowledge is encoded in the model. On three question answering benchmarks (PopQA, Entity Questions, and MMLU) and two language models (Llama-2-7B and Mistral-7B), we find that (i) finetuning on a completely factual but lesser-known subset of the data deteriorates downstream factuality (5-10%) and (ii) finetuning on a subset of better-known examples matches or outperforms finetuning on the entire dataset. Ultimately, our results shed light on the interaction between pretrained knowledge and finetuning data and demonstrate the importance of taking into account how facts are stored in the pretrained model when fine-tuning for knowledge-intensive tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gaurav Rohit Ghosal;Tatsunori Hashimoto;Aditi Raghunathan", "authorids": "~Gaurav_Rohit_Ghosal1;~Tatsunori_Hashimoto1;~Aditi_Raghunathan1", "gender": "M;M;F", "homepage": ";https://thashim.github.io;https://www.cs.cmu.edu/~aditirag/", "dblp": "270/4049;;166/1409", "google_scholar": "SkyPSDUAAAAJ;5ygiTwsAAAAJ;Ch9iRwQAAAAJ", "orcid": ";;", "linkedin": "gaurav-ghosal-a2b3a318b/;;", "or_profile": "~Gaurav_Rohit_Ghosal1;~Tatsunori_Hashimoto1;~Aditi_Raghunathan1", "aff": "Carnegie Mellon University;Stanford University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;stanford.edu;cmu.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nghosal2024understanding,\ntitle={Understanding Finetuning for Factual Knowledge Extraction},\nauthor={Gaurav Rohit Ghosal and Tatsunori Hashimoto and Aditi Raghunathan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cPsn9AcOYh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1534550, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=539721535657926674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "andrew.cmu.edu;stanford.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.cmu.edu;https://www.stanford.edu", "aff_unique_abbr": "CMU;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Forget Sharpness: Perturbed Forgetting of Model Biases Within SAM Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33595", "id": "cU20finY8V", "proceeding": "https://proceedings.mlr.press/v235/vani24a.html", "pdf": "https://openreview.net/pdf?id=cU20finY8V", "openreview": "https://openreview.net/forum?id=cU20finY8V", "author_site": "Ankit Vani, Frederick Tung, Gabriel Oliveira, Hossein Sharifi-Noghabi", "tldr": "", "abstract": "Despite attaining high empirical generalization, the sharpness of models trained with sharpness-aware minimization (SAM) do not always correlate with generalization error. Instead of viewing SAM as minimizing sharpness to improve generalization, our paper considers a new perspective based on SAM's training dynamics. We propose that perturbations in SAM perform *perturbed forgetting*, where they discard undesirable model biases to exhibit learning signals that generalize better. We relate our notion of forgetting to the information bottleneck principle, use it to explain observations like the better generalization of smaller perturbation batches, and show that perturbed forgetting can exhibit a stronger correlation with generalization than flatness. While standard SAM targets model biases exposed by the steepest ascent directions, we propose a new perturbation that targets biases exposed through the model's outputs. Our output bias forgetting perturbations outperform standard SAM, GSAM, and ASAM on ImageNet, robustness benchmarks, and transfer to CIFAR-10,100, while sometimes converging to sharper regions. Our results suggest that the benefits of SAM can be explained by alternative mechanistic principles that do not require flatness of the loss surface.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ankit Vani;Frederick Tung;Gabriel L. Oliveira;Hossein Sharifi-Noghabi", "authorids": "~Ankit_Vani1;~Frederick_Tung1;~Gabriel_L._Oliveira1;~Hossein_Sharifi-Noghabi1", "gender": "M;M;M;M", "homepage": "https://ankitvani.com/;;https://sites.google.com/view/gabriel-leivas-oliveira/home;https://hosseinshn.github.io/", "dblp": "178/2855;10/7697;117/2073;", "google_scholar": "KtnTuq8AAAAJ;https://scholar.google.ca/citations?user=T4EeZ9gAAAAJ;5anRZEcAAAAJ;9aw6MfUAAAAJ", "orcid": ";;0000-0003-0099-9873;", "linkedin": "ankitvani/;;;hossein-sharifi-noghabi-b6953b5b/", "or_profile": "~Ankit_Vani1;~Frederick_Tung1;~Gabriel_L._Oliveira1;~Hossein_Sharifi-Noghabi1", "aff": "Mila;Borealis AI;Borealis AI;RBC Borealis", "aff_domain": "mila.quebec;borealisai.com;borealisai.com;borealisai.com", "position": "PhD student;Researcher;Senior Machine Learning Researcher;Researcher", "bibtex": "@inproceedings{\nvani2024forget,\ntitle={Forget Sharpness: Perturbed Forgetting of Model Biases Within {SAM} Dynamics},\nauthor={Ankit Vani and Frederick Tung and Gabriel L. 
Oliveira and Hossein Sharifi-Noghabi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cU20finY8V}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 548234, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7689396271211374464&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mila.quebec;borealisai.com;borealisai.com;borealisai.com", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Mila;Borealis AI;Royal Bank of Canada", "aff_unique_dep": "Quebec Artificial Intelligence Institute;;", "aff_unique_url": "https://mila.quebec;https://www.borealisai.com;https://www.rbc.com", "aff_unique_abbr": "Mila;Borealis AI;RBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "SIN: Selective and Interpretable Normalization for Long-Term Time Series Forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33594", "id": "cUMOVfOIve", "proceeding": "https://proceedings.mlr.press/v235/han24e.html", "pdf": "https://openreview.net/pdf?id=cUMOVfOIve", "openreview": "https://openreview.net/forum?id=cUMOVfOIve", "author_site": "Lu Han, Han-Jia Ye, De-Chuan Zhan", "tldr": "", "abstract": "In real-world applications, time series data frequently exhibit non-stationarity, with statistics changing over time. This variability undermines the forecasting accuracy of deep learning models that are trained on historical data but deployed for future prediction. A common approach to mitigate this issue involves normalizing the data to counteract statistical drift, followed by denormalization on the prediction. However, existing methods often employ heuristic normalization techniques that do not fully account for the unique characteristics of the series. Our paper addresses the critical question in this context: which statistics should be removed and restored? We argue that the statistics selected for normalization should exhibit both local invariance and global variability to ensure their correctness and helpfulness. To this end, we propose the Selective and Interpretable Normalization methodology, dubbed SIN. This approach maximizes the covariance between a given look-back window and its subsequent future values, thereby identifying key statistics for normalization and simultaneously learning the corresponding normalization transformations. The interpretable framework can be used to explain the success and limitations of some popular normalization methods. 
By integrating SIN, we demonstrate improvements in the performance of several prevalent forecasting models, thereby validating the utility of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lu Han;Han-Jia Ye;De-Chuan Zhan", "authorids": "~Lu_Han2;~Han-Jia_Ye1;~De-Chuan_Zhan1", "gender": "M;M;M", "homepage": "http://www.lamda.nju.edu.cn/hanlu/;http://www.lamda.nju.edu.cn/yehj;http://www.lamda.nju.edu.cn/zhandc/", "dblp": ";165/3014;74/498", "google_scholar": "https://scholar.google.com.hk/citations?user=m-WYn7gAAAAJ;mgOYhtoAAAAJ;mYJf4TcAAAAJ", "orcid": ";;0000-0002-3533-2078", "linkedin": ";;", "or_profile": "~Lu_Han2;~Han-Jia_Ye1;~De-Chuan_Zhan1", "aff": "Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nhan2024sin,\ntitle={{SIN}: Selective and Interpretable Normalization for Long-Term Time Series Forecasting},\nauthor={Lu Han and Han-Jia Ye and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cUMOVfOIve}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 905236, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8301455087177158318&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "A sampling theory perspective on activations for implicit neural representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33593", "id": "cVkqItmYLQ", "proceeding": "https://proceedings.mlr.press/v235/saratchandran24a.html", "pdf": "https://openreview.net/pdf?id=cVkqItmYLQ", "openreview": "https://openreview.net/forum?id=cVkqItmYLQ", "author_site": "Hemanth Saratchandran, Sameera Ramasinghe, Violetta Shevchenko, Alexander Long, Simon Lucey", "tldr": "", "abstract": "Implicit Neural Representations (INRs) have gained popularity for encoding signals as compact, differentiable entities. While commonly using techniques like Fourier positional encodings or non-traditional activation functions (e.g., Gaussian, sinusoid, or wavelets) to capture high-frequency content, their properties lack exploration within a unified theoretical framework. Addressing this gap, we conduct a comprehensive analysis of these activations from a sampling theory perspective. Our investigation reveals that, especially in shallow INRs, $\\mathrm{sinc}$ activations\u2014previously unused in conjunction with INRs\u2014are theoretically optimal for signal encoding. 
Additionally, we establish a connection between dynamical systems and INRs, leveraging sampling theory to bridge these two paradigms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hemanth Saratchandran;Sameera Ramasinghe;Violetta Shevchenko;Alexander Long;Simon Lucey", "authorids": "~Hemanth_Saratchandran1;~Sameera_Ramasinghe1;~Violetta_Shevchenko1;~Alexander_Long1;~Simon_Lucey2", "gender": ";M;F;M;M", "homepage": ";;;https://github.com/AlexanderJLong;https://www.adelaide.edu.au/directory/simon.lucey", "dblp": ";181/4514;231/1762;156/9630;01/3542", "google_scholar": ";https://scholar.google.com.au/citations?user=-j0m9aMAAAAJ;aWqA0BIAAAAJ;;vmAe35UAAAAJ", "orcid": ";;;;", "linkedin": ";;violetta-shevchenko-12b62714a/;;", "or_profile": "~Hemanth_Saratchandran1;~Sameera_Ramasinghe1;~Violetta_Shevchenko1;~Alexander_Long1;~Simon_Lucey2", "aff": ";Amazon;Amazon;Amazon;University of Adelaide", "aff_domain": ";amazon.com;amazon.com;amazon.com;adelaide.edu.au", "position": ";Researcher;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nsaratchandran2024a,\ntitle={A sampling theory perspective on activations for implicit neural representations},\nauthor={Hemanth Saratchandran and Sameera Ramasinghe and Violetta Shevchenko and Alexander Long and Simon Lucey},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cVkqItmYLQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5730901, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17214371833795195926&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";amazon.com;amazon.com;amazon.com;adelaide.edu.au", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Amazon;University of Adelaide", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.adelaide.edu.au", "aff_unique_abbr": "Amazon;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Australia" }, { "title": "FESSNC: Fast Exponentially Stable and Safe Neural Controller", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33592", "id": "cVp8blEw2i", "proceeding": "https://proceedings.mlr.press/v235/zhang24bs.html", "pdf": "https://openreview.net/pdf?id=cVp8blEw2i", "openreview": "https://openreview.net/forum?id=cVp8blEw2i", "author_site": "Jingdong Zhang, Luan Yang, Qunxi Zhu, Wei Lin", "tldr": "", "abstract": "In order to stabilize nonlinear systems modeled by stochastic differential equations, we design a Fast Exponentially Stable and Safe Neural Controller (FESSNC) for fast learning controllers. Our framework is parameterized by neural networks and realizes both rigorous exponential stability and safety guarantees. Concretely, we design heuristic methods to learn the exponentially stable and the safe controllers, respectively, in light of the classical theory of stochastic exponential stability and our established theorem on guaranteeing the almost-sure safety for stochastic dynamics. More significantly, to rigorously ensure the stability and the safety guarantees for the learned controllers, we develop a projection operator, projecting to the space of exponentially-stable and safe controllers.
To reduce the high computational cost of solving the projection operation, we propose approximate projection operators with closed forms that map the learned controllers to the target controller space. Furthermore, we employ Hutchinson's trace estimator for a scalable unbiased estimate of the Hessian matrix that is used in the projection operator, which reduces computational cost and thus accelerates the training and testing processes. More importantly, our approximate projection operations are applicable to nonparametric control methods, improving their stability and safety performance. We empirically demonstrate the superiority of FESSNC over existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingdong Zhang;Luan Yang;Qunxi Zhu;Wei Lin", "authorids": "~Jingdong_Zhang1;~Luan_Yang1;~Qunxi_Zhu1;~Wei_Lin1", "gender": "M;F;M;M", "homepage": "https://scholar.google.com/citations?user=Bjo3nfwAAAAJ&hl=zh-CN;;https://www.researchgate.net/profile/Qunxi_Zhu;https://faculty.fudan.edu.cn/wlin/zh_CN/", "dblp": "163/0015-1;;219/7742;99/2649", "google_scholar": "Bjo3nfwAAAAJ;;https://scholar.google.co.jp/citations?user=45oFQD4AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-4120-6561;0000-0001-8820-1760;0000-0001-7281-5274;0000-0002-1863-4306", "linkedin": ";;;", "or_profile": "~Jingdong_Zhang1;~Luan_Yang1;~Qunxi_Zhu1;~Wei_Lin1", "aff": "Fudan University;Fudan University;Fudan University;Fudan University", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nzhang2024fessnc,\ntitle={{FESSNC}: Fast Exponentially Stable and Safe Neural Controller},\nauthor={Jingdong Zhang and Luan Yang and Qunxi Zhu and Wei Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cVp8blEw2i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1597024, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14912672631936175118&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Just Cluster It: An Approach for Exploration in High-Dimensions using Clustering and Pre-Trained Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33591", "id": "cXBPPfNUZJ", "proceeding": "https://proceedings.mlr.press/v235/wagner24a.html", "pdf": "https://openreview.net/pdf?id=cXBPPfNUZJ", "openreview": "https://openreview.net/forum?id=cXBPPfNUZJ", "author_site": "Stefan Sylvius Wagner Martinez, Stefan Harmeling", "tldr": "", "abstract": "In this paper we adopt a representation-centric perspective on exploration in reinforcement learning, viewing exploration fundamentally as a density estimation problem.
We investigate the effectiveness of clustering representations for exploration in 3-D environments, based on the observation that the importance of pixel changes between transitions is less pronounced in 3-D environments compared to 2-D environments, where pixel changes between transitions are typically distinct and significant. We propose a method that performs episodic and global clustering on random representations and on pre-trained DINO representations to count states, i.e., estimate pseudo-counts. Surprisingly, even random features can be clustered effectively to count states in 3-D environments; however, when these become visually more complex, pre-trained DINO representations are more effective thanks to the pre-trained inductive biases in the representations. Overall, this presents a pathway for integrating pre-trained biases into exploration. We evaluate our approach on the VizDoom and Habitat environments, demonstrating that our method surpasses other well-known exploration methods in these settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stefan Sylvius Wagner;Stefan Harmeling", "authorids": "~Stefan_Sylvius_Wagner1;~Stefan_Harmeling1", "gender": "M;Unspecified", "homepage": ";", "dblp": "295/8868;67/3271", "google_scholar": "https://scholar.google.de/citations?user=nk46qycAAAAJ;https://scholar.google.de/citations?user=TA2fG64AAAAJ", "orcid": ";0000-0001-9709-8160", "linkedin": "stefan-wagner-a30423108/;", "or_profile": "~Stefan_Sylvius_Wagner1;~Stefan_Harmeling1", "aff": "University of D\u00fcsseldorf;Technische Universit\u00e4t Dortmund", "aff_domain": "hhu.de;tu-dortmund.de", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwagner2024just,\ntitle={Just Cluster It: An Approach for Exploration in High-Dimensions using Clustering and Pre-Trained Representations},\nauthor={Stefan Sylvius Wagner and Stefan Harmeling},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cXBPPfNUZJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1735079, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16328747436837712101&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "hhu.de;tu-dortmund.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Heinrich-Heine-Universit\u00e4t D\u00fcsseldorf;Technische Universit\u00e4t Dortmund", "aff_unique_dep": ";", "aff_unique_url": "https://www.hhu.de;https://www.tu-dortmund.de", "aff_unique_abbr": "HHU;TU Dortmund", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Variational Learning is Effective for Large Deep Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33590", "id": "cXBv07GKvk", "proceeding": "https://proceedings.mlr.press/v235/shen24b.html", "pdf": "https://openreview.net/pdf?id=cXBv07GKvk", "openreview": "https://openreview.net/forum?id=cXBv07GKvk", "author_site": "Yuesong Shen, Nico Daheim, Bai Cong, Peter Nickl, Gian Maria Marconi, Bazan Raoul, Rio Yokota, Iryna Gurevych, Daniel Cremers, Khan Emtiyaz, Thomas Moellenhoff", "tldr": "", "abstract": "We give extensive empirical evidence against the common belief that variational learning is ineffective for large neural networks.
We show that an optimizer called Improved Variational Online Newton (IVON) consistently matches or outperforms Adam for training large networks such as GPT-2 and ResNets from scratch. IVON's computational costs are nearly identical to Adam but its predictive uncertainty is better. We show several new use cases of IVON where we improve finetuning and model merging in Large Language Models, accurately predict generalization error, and faithfully estimate sensitivity to data. We find overwhelming evidence that variational learning is effective. Code is available at https://github.com/team-approx-bayes/ivon.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuesong Shen;Nico Daheim;Bai Cong;Peter Nickl;Gian Maria Marconi;Bazan Clement Emile Marcel Raoul;Rio Yokota;Iryna Gurevych;Daniel Cremers;Mohammad Emtiyaz Khan;Thomas M\u00f6llenhoff", "authorids": "~Yuesong_Shen1;~Nico_Daheim1;~Bai_Cong1;~Peter_Nickl1;~Gian_Maria_Marconi1;~Bazan_Clement_Emile_Marcel_Raoul1;~Rio_Yokota1;~Iryna_Gurevych1;~Daniel_Cremers1;~Mohammad_Emtiyaz_Khan1;~Thomas_M\u00f6llenhoff1", "gender": "M;M;;M;M;M;M;;M;M;M", "homepage": "https://vision.in.tum.de/members/sheny;https://ndaheim.github.io;;https://pnickl.github.io;;;https://www.rio.scrc.iir.isct.ac.jp/en/index.html;;https://vision.in.tum.de/members/cremers;https://emtiyaz.github.io/;http://www.thomasmoellenhoff.net", "dblp": "190/1791;285/5587;;278/2984;222/3259;;61/7413;;c/DanielCremers;58/10432;", "google_scholar": ";n6wJfqUAAAAJ;;FTVom6gAAAAJ;;;klw9KE0AAAAJ;;cXQciMEAAAAJ;https://scholar.google.com/citations?hl=en;KAqmeqAAAAAJ", "orcid": ";;;0009-0006-9534-3955;;;0000-0001-7573-7873;;;;", "linkedin": ";;;peter-nickl-a7a403160/?originalSubdomain=jp;;cl%C3%A9ment-bazan-7a53b5268/;rio-yokota-62857235/?originalSubdomain=jp;;;;", "or_profile": "~Yuesong_Shen1;~Nico_Daheim1;~Bai_Cong1;~Peter_Nickl1;~Gian_Maria_Marconi1;~Bazan_Clement_Emile_Marcel_Raoul1;~Rio_Yokota1;~Iryna_Gurevych1;~Daniel_Cremers1;~Mohammad_Emtiyaz_Khan1;~Thomas_M\u00f6llenhoff1", "aff": "Technical University Munich;Technische Universit\u00e4t Darmstadt;;RIKEN Center for Advanced Intelligence Project;;Tokyo Institute of Technology, Tokyo Institute of Technology;Institute of Science Tokyo;;Technical University Munich;RIKEN Center for AI Project;RIKEN", "aff_domain": "tum.de;tu-darmstadt.de;;riken.jp;;titech.ac.jp;isct.ac.jp;;tum.de;riken.jp;riken.jp", "position": "PhD student;PhD student;;Researcher;;MS student;Full Professor;;Full Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nshen2024variational,\ntitle={Variational Learning is Effective for Large Deep Networks},\nauthor={Yuesong Shen and Nico Daheim and Bai Cong and Peter Nickl and Gian Maria Marconi and Bazan Clement Emile Marcel Raoul and Rio Yokota and Iryna Gurevych and Daniel Cremers and Mohammad Emtiyaz Khan and Thomas M{\\\"o}llenhoff},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cXBv07GKvk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2112837, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10926052208530229708&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tum.de;tu-darmstadt.de;;riken.jp;;titech.ac.jp;isct.ac.jp;;tum.de;riken.jp;riken.jp", "author_num": 11, "aff_unique_index": "0;1;2;3;4;0;2;2", "aff_unique_norm": "Technical University of Munich;Technische Universit\u00e4t 
Darmstadt;RIKEN;Tokyo Institute of Technology;Institute of Science, Tokyo", "aff_unique_dep": ";;Center for Advanced Intelligence Project;;", "aff_unique_url": "https://www.tum.de;https://www.tu-darmstadt.de;https://www.riken.jp/en/;https://www.titech.ac.jp;https://www.iost.jp", "aff_unique_abbr": "TUM;TUD;RIKEN;Titech;IoST", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;0;1;1;1;0;1;1", "aff_country_unique": "Germany;Japan" }, { "title": "The Max-Min Formulation of Multi-Objective Reinforcement Learning: From Theory to a Model-Free Algorithm", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33589", "id": "cY9g0bwiZx", "proceeding": "https://proceedings.mlr.press/v235/park24b.html", "pdf": "https://openreview.net/pdf?id=cY9g0bwiZx", "openreview": "https://openreview.net/forum?id=cY9g0bwiZx", "author_site": "Giseung Park, woohyeon Byeon, Seongmin Kim, Elad Havakuk, Amir Leshem, Youngchul Sung", "tldr": "", "abstract": "In this paper, we consider multi-objective reinforcement learning, which arises in many real-world problems with multiple optimization goals. We approach the problem with a max-min framework focusing on fairness among the multiple goals and develop a relevant theory and a practical model-free algorithm under the max-min framework. The developed theory provides a theoretical advance in multi-objective reinforcement learning, and the proposed algorithm demonstrates a notable performance improvement over existing baseline methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Giseung Park;Woohyeon Byeon;Seongmin Kim;Elad Havakuk;Amir Leshem;Youngchul Sung", "authorids": "~Giseung_Park1;~Woohyeon_Byeon1;~Seongmin_Kim2;~Elad_Havakuk1;~Amir_Leshem1;~Youngchul_Sung1", "gender": "M;;M;M;;M", "homepage": "https://sites.google.com/view/giseung-park;https://sites.google.com/view/sisrelkaist/members/whbyeon?authuser=0;https://sites.google.com/view/sisrelkaist/members/smkim;http://google.com;https://www.biu.eng.ac.il;https://sites.google.com/view/youngchulsung", "dblp": "233/3816;;;;84/3398;17/6798", "google_scholar": ";;;;;-9D2k3UAAAAJ", "orcid": "0000-0002-9737-4142;0009-0004-2993-9297;0009-0008-3032-8943;;0000-0002-2265-7463;0000-0003-4536-6690", "linkedin": ";;;;;", "or_profile": "~Giseung_Park1;~Woohyeon_Byeon1;~Seongmin_Kim2;~Elad_Havakuk1;~Amir_Leshem1;~Youngchul_Sung1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST;Bar-Ilan University;;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;ee.kaist.ac.kr;biu.ac.il;;kaist.ac.kr", "position": "PhD student;MS student;PhD student;MS student;;Full Professor", "bibtex": "@inproceedings{\npark2024the,\ntitle={The Max-Min Formulation of Multi-Objective Reinforcement Learning: From Theory to a Model-Free Algorithm},\nauthor={Giseung Park and Woohyeon Byeon and Seongmin Kim and Elad Havakuk and Amir Leshem and Youngchul Sung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cY9g0bwiZx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 740668, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17654022660316234944&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;kaist.ac.kr;ee.kaist.ac.kr;biu.ac.il;;kaist.ac.kr", 
"author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Bar-Ilan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.biu.ac.il", "aff_unique_abbr": "KAIST;BIU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "South Korea;Israel" }, { "title": "Continuous Treatment Effects with Surrogate Outcomes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33588", "id": "cZNuYKtoOZ", "proceeding": "https://proceedings.mlr.press/v235/zeng24a.html", "pdf": "https://openreview.net/pdf?id=cZNuYKtoOZ", "openreview": "https://openreview.net/forum?id=cZNuYKtoOZ", "author_site": "Zhenghao Zeng, David Arbour, Avi Feller, Raghavendra Addanki, Ryan A Rossi, Ritwik Sinha, Edward Kennedy", "tldr": "", "abstract": "In many real-world causal inference applications, the primary outcomes (labels) are often partially missing, especially if they are expensive or difficult to collect. If the missingness depends on covariates (i.e., missingness is not completely at random), analyses based on fully observed samples alone may be biased. Incorporating surrogates, which are fully observed post-treatment variables related to the primary outcome, can improve estimation in this case. In this paper, we study the role of surrogates in estimating continuous treatment effects and propose a doubly robust method to efficiently incorporate surrogates in the analysis, which uses both labeled and unlabeled data and does not suffer from the above selection bias problem. Importantly, we establish the asymptotic normality of the proposed estimator and show possible improvements on the variance compared with methods that solely use labeled data. Extensive simulations show our methods enjoy appealing empirical performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenghao Zeng;David Arbour;Avi Feller;Raghavendra Addanki;Ryan A. Rossi;Ritwik Sinha;Edward Kennedy", "authorids": "~Zhenghao_Zeng1;~David_Arbour1;~Avi_Feller1;~Raghavendra_Addanki1;~Ryan_A._Rossi2;~Ritwik_Sinha1;~Edward_Kennedy1", "gender": "M;;M;M;M;M;M", "homepage": "https://tigerzhzeng.com/;http://darbour.github.io;http://www.avifeller.com;https://raddanki.github.io/;https://research.adobe.com/person/ritwik-sinha/;http://www.ehkennedy.com/;http://ryanrossi.com", "dblp": ";87/7578;194/2418;218/5579;127/3163;222/3133;17/5085", "google_scholar": "Lx5Kh6AAAAAJ;prj0heYAAAAJ;Mz7heb4AAAAJ;SUPaOhgAAAAJ;https://scholar.google.co.in/citations?user=4SDTMIQAAAAJ;dXztgDYAAAAJ;_Dc6lbQAAAAJ", "orcid": ";;;;;;0000-0001-9758-0635", "linkedin": ";david-arbour/;;;;;", "or_profile": "~Zhenghao_Zeng1;~David_Arbour1;~Avi_Feller1;~Raghavendra_Addanki1;~Ritwik_Sinha1;~Edward_Kennedy1;~Ryan_Rossi1", "aff": "Carnegie Mellon University;Adobe Systems;University of California, Berkeley;Adobe Systems;Adobe Systems;Carnegie Mellon University;Adobe Research", "aff_domain": "cmu.edu;adobe.com;berkeley.edu;adobe.com;adobe.com;cmu.edu;adobe.com", "position": "PhD student;Research Scientist;Associate Professor;Research Scientist;Researcher;Assistant Professor;Senior Research Scientist", "bibtex": "@inproceedings{\nzeng2024continuous,\ntitle={Continuous Treatment Effects with Surrogate Outcomes},\nauthor={Zhenghao Zeng and David Arbour and Avi Feller and Raghavendra Addanki and Ryan A. 
Rossi and Ritwik Sinha and Edward Kennedy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cZNuYKtoOZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 564690, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11575221366061125540&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "cmu.edu;adobe.com;berkeley.edu;adobe.com;adobe.com;cmu.edu;adobe.com", "author_num": 7, "aff_unique_index": "0;1;2;1;1;0;1", "aff_unique_norm": "Carnegie Mellon University;Adobe;University of California, Berkeley", "aff_unique_dep": ";Adobe Systems Incorporated;", "aff_unique_url": "https://www.cmu.edu;https://www.adobe.com;https://www.berkeley.edu", "aff_unique_abbr": "CMU;Adobe;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Generation with Diffusion Mixture", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33587", "id": "cZTFxktg23", "proceeding": "https://proceedings.mlr.press/v235/jo24b.html", "pdf": "https://openreview.net/pdf?id=cZTFxktg23", "openreview": "https://openreview.net/forum?id=cZTFxktg23", "author_site": "Jaehyeong Jo, Dongki Kim, Sung Ju Hwang", "tldr": "", "abstract": "Generation of graphs is a major challenge for real-world tasks that require understanding the complex nature of their non-Euclidean structures. Although diffusion models have achieved notable success in graph generation recently, they are ill-suited for modeling the topological properties of graphs since learning to denoise the noisy samples does not explicitly learn the graph structures to be generated. To tackle this limitation, we propose a generative framework that models the topology of graphs by explicitly learning the final graph structures of the diffusion process. Specifically, we design the generative process as a mixture of endpoint-conditioned diffusion processes that is driven toward the predicted graph, resulting in rapid convergence. We further introduce a simple parameterization of the mixture process and develop an objective for learning the final graph structure, which enables maximum likelihood training. Through extensive experimental validation on general graph and 2D/3D molecule generation tasks, we show that our method outperforms previous generative models, generating graphs with correct topology with both continuous (e.g. 3D coordinates) and discrete (e.g. atom types) features. 
Our code is available at https://github.com/harryjo97/GruM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaehyeong Jo;Dongki Kim;Sung Ju Hwang", "authorids": "~Jaehyeong_Jo1;~Dongki_Kim1;~Sung_Ju_Hwang1", "gender": "M;M;", "homepage": "https://github.com/harryjo97;https://github.com/dongkikim95;", "dblp": "296/2037;02/1692;", "google_scholar": "https://scholar.google.com/citations?hl=ko;Cz_OIhEAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jaehyeong_Jo1;~Dongki_Kim1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;", "aff_domain": "kaist.ac.kr;kaist.ac.kr;", "position": "MS student;PhD student;", "bibtex": "@inproceedings{\njo2024graph,\ntitle={Graph Generation with Diffusion Mixture},\nauthor={Jaehyeong Jo and Dongki Kim and Sung Ju Hwang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cZTFxktg23}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8707550, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13756789564709156574&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "kaist.ac.kr;kaist.ac.kr;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Understanding the Impact of Introducing Constraints at Inference Time on Generalization Error", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33586", "id": "cbZTnjqIib", "proceeding": "https://proceedings.mlr.press/v235/nishino24a.html", "pdf": "https://openreview.net/pdf?id=cbZTnjqIib", "openreview": "https://openreview.net/forum?id=cbZTnjqIib", "author_site": "Masaaki Nishino, Kengo Nakamura, Norihito Yasuda", "tldr": "", "abstract": "Since machine learning technologies are being used in various practical situations, models with merely low prediction errors might not be satisfactory; prediction errors occurring with a low probability might yield dangerous results in some applications. Therefore, there are attempts to achieve an ML model whose input-output pairs are guaranteed to satisfy given constraints. Among such attempts, many previous works chose the approach of modifying the outputs of an ML model at inference time to satisfy the constraints. Such a strategy is handy because we can control its output without expensive training or fine-tuning. However, it is unclear whether using constraints only at inference time degrades a model's predictive performance. This paper analyses how the generalization error bounds change when we impose constraints only at inference time. Our main finding is that a class of loss functions preserves the relative generalization error, i.e., the difference in generalization error compared with the best model will not increase by imposing constraints at inference time in multi-class classification. Some popular loss functions preserve the relative error, including the softmax cross-entropy loss. On the other hand, we also show that some loss functions do not preserve relative error when we use constraints. 
Our results suggest the importance of choosing a suitable loss function when constraints are used only at inference time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Masaaki Nishino;Kengo Nakamura;Norihito Yasuda", "authorids": "~Masaaki_Nishino1;~Kengo_Nakamura1;~Norihito_Yasuda1", "gender": "M;M;M", "homepage": ";http://www.kecl.ntt.co.jp/icl/lirg/members/nakamura/index.html;", "dblp": "90/1078;158/3521;41/2921", "google_scholar": ";;", "orcid": ";0000-0002-9615-3479;", "linkedin": ";;", "or_profile": "~Masaaki_Nishino1;~Kengo_Nakamura1;~Norihito_Yasuda1", "aff": "NTT;Kyoto University;NTT", "aff_domain": "ntt.co.jp;kyoto-u.ac.jp;ntt.co.jp", "position": "Distinguished Researcher;PhD student;Principal Researcher", "bibtex": "@inproceedings{\nnishino2024understanding,\ntitle={Understanding the Impact of Introducing Constraints at Inference Time on Generalization Error},\nauthor={Masaaki Nishino and Kengo Nakamura and Norihito Yasuda},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cbZTnjqIib}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 333491, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9489894673226205716&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5, "email": "ntt.co.jp;kyoto-u.ac.jp;ntt.co.jp", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "NTT Corporation;Kyoto University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntt.co.jp;https://www.kyoto-u.ac.jp", "aff_unique_abbr": "NTT;Kyoto U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "Trained Random Forests Completely Reveal your Dataset", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33585", "id": "cc72Vnfvoc", "proceeding": "https://proceedings.mlr.press/v235/ferry24a.html", "pdf": "https://openreview.net/pdf?id=cc72Vnfvoc", "openreview": "https://openreview.net/forum?id=cc72Vnfvoc", "author_site": "Julien Ferry, Ricardo Fukasawa, Timoth\u00e9e Pascal, Thibaut Vidal", "tldr": "", "abstract": "We introduce an optimization-based reconstruction attack capable of completely or near-completely reconstructing a dataset utilized for training a random forest. Notably, our approach relies solely on information readily available in commonly used libraries such as scikit-learn. To achieve this, we formulate the reconstruction problem as a combinatorial problem under a maximum likelihood objective. We demonstrate that this problem is NP-hard, though solvable at scale using constraint programming - an approach rooted in constraint propagation and solution-domain reduction. Through an extensive computational investigation, we demonstrate that random forests trained without bootstrap aggregation but with feature randomization are susceptible to a complete reconstruction. This holds true even with a small number of trees. Even with bootstrap aggregation, the majority of the data can also be reconstructed. These findings underscore a critical vulnerability inherent in widely adopted ensemble methods, warranting attention and mitigation. 
Although the potential for such reconstruction attacks has been discussed in privacy research, our study provides clear empirical evidence of their practicability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Julien Ferry;Ricardo Fukasawa;Timoth\u00e9e Pascal;Thibaut Vidal", "authorids": "~Julien_Ferry1;~Ricardo_Fukasawa1;~Timoth\u00e9e_Pascal1;~Thibaut_Vidal1", "gender": "M;M;M;M", "homepage": "https://homepages.laas.fr/jferry/;http://www.math.uwaterloo.ca/~rfukasaw;;https://w1.cirrelt.ca/~vidalt/en/home-thibaut-vidal.html", "dblp": "248/8140.html;;;40/11481", "google_scholar": "https://scholar.google.fr/citations?hl=en;Z0NqKVYAAAAJ;;https://scholar.google.com.tw/citations?user=qbO0xwUAAAAJ", "orcid": "0000-0002-8764-0080;;;0000-0001-5183-8485", "linkedin": "julien-ferry-9435341a7/;;timothee-pascal;thibaut-vidal-7a877055/", "or_profile": "~Julien_Ferry1;~Ricardo_Fukasawa1;~Timoth\u00e9e_Pascal1;~Thibaut_Vidal1", "aff": "\u00c9cole Polytechnique de Montr\u00e9al, Universit\u00e9 de Montr\u00e9al;University of Waterloo;Ecole Nationale des Ponts et Chausees;Polytechnique Montreal", "aff_domain": "polymtl.ca;uwaterloo.ca;enpc.fr;polymtl.ca", "position": "Postdoc;Full Professor;MS student;Associate Professor", "bibtex": "@inproceedings{\nferry2024trained,\ntitle={Trained Random Forests Completely Reveal your Dataset},\nauthor={Julien Ferry and Ricardo Fukasawa and Timoth{\\'e}e Pascal and Thibaut Vidal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cc72Vnfvoc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 514902, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5389795534886954979&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "polymtl.ca;uwaterloo.ca;enpc.fr;polymtl.ca", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "\u00c9cole Polytechnique de Montr\u00e9al;University of Waterloo;Ecole Nationale des Ponts et Chaussees;Polytechnique Montreal", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.polymtl.ca;https://uwaterloo.ca;https://www.enpc.fr;https://www.polymtl.ca", "aff_unique_abbr": "Polytechnique Montr\u00e9al;UW;ENPC;PolyMTL", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Montr\u00e9al;;Montreal", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Canada;France" }, { "title": "Long-Tail Learning with Foundation Model: Heavy Fine-Tuning Hurts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33584", "id": "ccSSKTz9LX", "proceeding": "https://proceedings.mlr.press/v235/shi24g.html", "pdf": "https://openreview.net/pdf?id=ccSSKTz9LX", "openreview": "https://openreview.net/forum?id=ccSSKTz9LX", "author_site": "Jiang-Xin Shi, Tong Wei, Zhi Zhou, Jie-Jing Shao, Xin-Yan Han, Yu-Feng Li", "tldr": "", "abstract": "The fine-tuning paradigm in addressing long-tail learning tasks has sparked significant interest since the emergence of foundation models. Nonetheless, how fine-tuning impacts performance in long-tail learning was not explicitly quantified. In this paper, we disclose that heavy fine-tuning may even lead to non-negligible performance deterioration on tail classes, and lightweight fine-tuning is more effective. The reason is attributed to inconsistent class conditions caused by heavy fine-tuning. 
With the observation above, we develop a low-complexity and accurate long-tail learning algorithm, LIFT, with the goal of facilitating fast prediction and compact models by adaptive lightweight fine-tuning. Experiments clearly verify that both the training time and the number of learned parameters are significantly reduced, with more accurate predictive performance than state-of-the-art approaches. The implementation code is available at https://github.com/shijxcs/LIFT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiang-Xin Shi;Tong Wei;Zhi Zhou;Jie-Jing Shao;Xin-Yan Han;Yu-Feng Li", "authorids": "~Jiang-Xin_Shi1;~Tong_Wei1;~Zhi_Zhou2;~Jie-Jing_Shao1;~Xin-Yan_Han1;~Yu-Feng_Li1", "gender": ";M;M;M;F;M", "homepage": "http://www.lamda.nju.edu.cn/shijx;https://palm.seu.edu.cn/weit/;http://www.lamda.nju.edu.cn/zhouz/;http://www.lamda.nju.edu.cn/shaojj/;http://www.lamda.nju.edu.cn/hanxy/;https://cs.nju.edu.cn/liyf/index.htm", "dblp": "299/5485.html;49/933-1;04/2090-7;299/4982;;57/413", "google_scholar": "KEgtGncAAAAJ;EFCZuW4AAAAJ;VzvP5a8AAAAJ;k1tEDpQAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-0318-0911;0000-0002-2766-8209;;0000-0001-8107-114X;;0000-0002-2220-5248", "linkedin": ";;;;;", "or_profile": "~Jiang-Xin_Shi1;~Tong_Wei1;~Zhi_Zhou2;~Jie-Jing_Shao1;~Xin-Yan_Han1;~Yu-feng_Li2", "aff": "Nanjing University;Southeast University;Nanjing University;Nanjing University;;Nanjing University", "aff_domain": "nju.edu.cn;seu.edu.cn;nju.edu.cn;nju.edu.cn;;nju.edu.cn", "position": "PhD student;Associate Professor;PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nshi2024longtail,\ntitle={Long-Tail Learning with Foundation Model: Heavy Fine-Tuning Hurts},\nauthor={Jiang-Xin Shi and Tong Wei and Zhi Zhou and Jie-Jing Shao and Xin-Yan Han and Yu-Feng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ccSSKTz9LX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4864090, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17022739996072692389&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "nju.edu.cn;seu.edu.cn;nju.edu.cn;nju.edu.cn;;nju.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Nanjing University;Southeast University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.seu.edu.cn/", "aff_unique_abbr": "Nanjing U;SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Beyond Individual Input for Deep Anomaly Detection on Tabular Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33583", "id": "chDpBp2P6b", "proceeding": "https://proceedings.mlr.press/v235/thimonier24a.html", "pdf": "https://openreview.net/pdf?id=chDpBp2P6b", "openreview": "https://openreview.net/forum?id=chDpBp2P6b", "author_site": "Hugo Thimonier, Fabrice Popineau, Arpad Rimmel, Bich-Li\u00ean DOAN", "tldr": "", "abstract": "Anomaly detection is vital in many domains, such as finance, healthcare, and cybersecurity. In this paper, we propose a novel deep anomaly detection method for tabular data that leverages Non-Parametric Transformers (NPTs), a model initially proposed for supervised tasks, to capture both feature-feature and sample-sample dependencies. 
In a reconstruction-based framework, we train an NPT to reconstruct masked features of normal samples. In a non-parametric fashion, we leverage the whole training set during inference and use the model's ability to reconstruct the masked features to generate an anomaly score. To the best of our knowledge, this is the first work to successfully combine feature-feature and sample-sample dependencies for anomaly detection on tabular datasets. Through extensive experiments on 31 benchmark tabular datasets, we demonstrate that our method achieves state-of-the-art performance, outperforming existing methods by 2.4% and 1.2% in terms of F1-score and AUROC, respectively. Our ablation study further proves that modeling both types of dependencies is crucial for anomaly detection on tabular data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hugo Thimonier;Fabrice Popineau;Arpad Rimmel;Bich-Li\u00ean DOAN", "authorids": "~Hugo_Thimonier1;~Fabrice_Popineau1;~Arpad_Rimmel1;~Bich-Li\u00ean_DOAN2", "gender": "M;M;;F", "homepage": ";https://fabrice.popineau.net/;;https://www.lisn.upsaclay.fr/members/doan-bich-lien/", "dblp": "287/8967;43/5266;00/4867;48/2445", "google_scholar": "p1mWlucAAAAJ;https://scholar.google.fr/citations?hl=fr;;Oo6uX2oAAAAJ", "orcid": "0000-0003-4762-477X;;;", "linkedin": ";fabricepopineau/;;bich-li%C3%AAn-doan-3897536/?originalSubdomain=fr", "or_profile": "~Hugo_Thimonier1;~Fabrice_Popineau1;~Arpad_Rimmel1;~Bich-Li\u00ean_DOAN2", "aff": "CentraleSupelec;CentraleSupelec;Laboratoire de recherche en informatique;CentraleSupelec", "aff_domain": "centralesupelec.fr;centralesupelec.fr;lri.fr;centralesupelec.fr", "position": "PhD student;Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nthimonier2024beyond,\ntitle={Beyond Individual Input for Deep Anomaly Detection on Tabular Data},\nauthor={Hugo Thimonier and Fabrice Popineau and Arpad Rimmel and Bich-Li{\\^e}n DOAN},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=chDpBp2P6b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3048966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10270480832026916718&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "centralesupelec.fr;centralesupelec.fr;lri.fr;centralesupelec.fr", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "CentraleSup\u00e9lec;Laboratoire de recherche en informatique", "aff_unique_dep": ";Laboratoire de recherche en informatique", "aff_unique_url": "https://www.centralesupelec.fr;", "aff_unique_abbr": "CS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Taylor Videos for Action Recognition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33582", "id": "chhIZGqlUG", "proceeding": "https://proceedings.mlr.press/v235/wang24ck.html", "pdf": "https://openreview.net/pdf?id=chhIZGqlUG", "openreview": "https://openreview.net/forum?id=chhIZGqlUG", "author_site": "Lei Wang, Xiuyuan Yuan, Tom Gedeon, Liang Zheng", "tldr": "", "abstract": "Effectively extracting motions from video is a critical and long-standing problem for action recognition. 
This problem is very challenging because motions (i) do not have an explicit form, (ii) encompass various concepts such as displacement, velocity, and acceleration, and (iii) often contain noise caused by unstable pixels. Addressing these challenges, we propose the Taylor video, a new video format that highlights the dominant motions (e.g., a waving hand) in each of its frames, named Taylor frames. The Taylor video is named after the Taylor series, which approximates a function at a given point using important terms. In the scenario of videos, we define an implicit motion-extraction function which aims to extract motions from a temporal block of video. In this block, using the frames, the difference frames, and higher-order difference frames, we perform Taylor expansion to approximate this function at the starting frame. We show that the summation of the higher-order terms in the Taylor series gives us dominant motion patterns, in which static objects and small, unstable motions are removed. Experimentally, we show that Taylor videos are effective inputs to popular architectures including 2D CNNs, 3D CNNs, and transformers. When used individually, Taylor videos yield competitive action recognition accuracy compared to RGB videos and optical flow. When fused with RGB or optical flow videos, further accuracy improvement is achieved. Additionally, we apply Taylor video computation to human skeleton sequences, resulting in Taylor skeleton sequences that outperform the use of original skeletons for skeleton-based action recognition.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lei Wang;Xiuyuan Yuan;Tom Gedeon;Liang Zheng", "authorids": "~Lei_Wang20;~Xiuyuan_Yuan1;~Tom_Gedeon1;~Liang_Zheng4", "gender": "M;M;M;M", "homepage": "https://leiwangr.github.io/;https://jackyuanx.github.io/;https://cs.anu.edu.au/people/Tom.Gedeon/;http://zheng-lab.cecs.anu.edu.au/", "dblp": "181/2817-108;;g/TamasDGedeon.html;61/7360-1", "google_scholar": "VWCZLXgAAAAJ;;https://scholar.google.com.tw/citations?user=lPTjWIkAAAAJ;https://scholar.google.com.au/citations?user=vNHqr3oAAAAJ", "orcid": "0000-0002-8600-7099;;0000-0001-8356-4909;", "linkedin": "lei-l-wang/;xiuyuan-yuan-676b87292/;tom-gedeon;liang-zheng-76341311a/", "or_profile": "~Lei_Wang20;~Xiuyuan_Yuan1;~Tom_Gedeon1;~Liang_Zheng4", "aff": "Australian National University;Australian National University;Curtin University of Technology;Australian National University", "aff_domain": "anu.edu.au;anu.edu.au;curtin.edu.au;anu.edu.au", "position": "Postdoc;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024taylor,\ntitle={Taylor Videos for Action Recognition},\nauthor={Lei Wang and Xiuyuan Yuan and Tom Gedeon and Liang Zheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=chhIZGqlUG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3065169, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16032287380357962079&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "anu.edu.au;anu.edu.au;curtin.edu.au;anu.edu.au", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Australian National University;Curtin University", "aff_unique_dep": ";", "aff_unique_url": "https://www.anu.edu.au;https://www.curtin.edu.au", "aff_unique_abbr": "ANU;Curtin", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Federated Full-Parameter Tuning of Billion-Sized Language Models with Communication Cost under 18 Kilobytes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33581", "id": "cit0hg4sEz", "proceeding": "https://proceedings.mlr.press/v235/qin24a.html", "pdf": "https://openreview.net/pdf?id=cit0hg4sEz", "openreview": "https://openreview.net/forum?id=cit0hg4sEz", "author_site": "Zhen Qin, Daoyuan Chen, Bingchen Qian, Bolin Ding, Yaliang Li, Shuiguang Deng", "tldr": "", "abstract": "Pre-trained large language models (LLMs) need fine-tuning to improve their responsiveness to natural language instructions. Federated learning offers a way to fine-tune LLMs using the abundant data on end devices without compromising data privacy. Most existing federated fine-tuning methods for LLMs rely on parameter-efficient fine-tuning techniques, which may not reach the performance height possible with full-parameter tuning. However, federated full-parameter tuning of LLMs is a non-trivial problem due to the immense communication cost. This work introduces FedKSeed that employs zeroth-order optimization with a finite set of random seeds. It significantly reduces transmission requirements between the server and clients to just a few random seeds and scalar gradients, amounting to only a few thousand bytes, making federated full-parameter tuning of billion-sized LLMs possible on devices. Building on it, we develop a strategy enabling probability-differentiated seed sampling, prioritizing perturbations with greater impact on model accuracy. Experiments across six scenarios with various LLMs, datasets and data partitions demonstrate that our approach outperforms existing federated LLM fine-tuning methods in both communication efficiency and zero-shot generalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhen Qin;Daoyuan Chen;Bingchen Qian;Bolin Ding;Yaliang Li;Shuiguang Deng", "authorids": "~Zhen_Qin10;~Daoyuan_Chen1;~Bingchen_Qian1;~Bolin_Ding3;~Yaliang_Li1;~Shuiguang_Deng1", "gender": "M;M;M;M;M;M", "homepage": "https://zhenqinzq.cn/;https://yxdyc.github.io/;;https://bolinding.github.io/;https://sites.google.com/site/yaliangli/;https://person.zju.edu.cn/shuiguang", "dblp": "06/864-4.html;217/4891;294/3682.html;46/3522.html;https://dblp.org/pers/hd/l/Li:Yaliang;d/ShuiguangDeng", "google_scholar": "yRy86mQAAAAJ;https://scholar.google.com.hk/citations?user=1GdfinUAAAAJ;;AjYkTi8AAAAJ;CCPBcdYAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0002-1756-6102;0000-0002-8015-2121;;;0000-0002-4204-6096;0000-0001-5015-6095", "linkedin": ";;;bolin-ding-50a0119/;;", "or_profile": "~Zhen_Qin10;~Daoyuan_Chen1;~Bingchen_Qian1;~Bolin_Ding3;~Yaliang_Li1;~Shuiguang_Deng1", "aff": "Zhejiang University;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Zhejiang University", "aff_domain": "zju.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;zju.edu.cn", "position": "PhD student;Staff;Researcher;Senior Director;Staff Engineer;Full Professor", "bibtex": "@inproceedings{\nqin2024federated,\ntitle={Federated Full-Parameter Tuning of Billion-Sized Language Models with Communication Cost under 18 Kilobytes},\nauthor={Zhen Qin and Daoyuan Chen and Bingchen Qian and Bolin Ding and Yaliang Li and Shuiguang Deng},\nbooktitle={Forty-first International Conference on Machine 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cit0hg4sEz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1684880, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=496567781906318692&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "zju.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;zju.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Zhejiang University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "ZJU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "BOtied: Multi-objective Bayesian optimization with tied multivariate ranks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33580", "id": "cj5HbaX14p", "proceeding": "https://proceedings.mlr.press/v235/park24k.html", "pdf": "https://openreview.net/pdf?id=cj5HbaX14p", "openreview": "https://openreview.net/forum?id=cj5HbaX14p", "author_site": "Ji Won Park, Natasa Tagasovska, Michael Maser, Stephen Ra, Kyunghyun Cho", "tldr": "", "abstract": "Many scientific and industrial applications require the joint optimization of multiple, potentially competing objectives. Multi-objective Bayesian optimization (MOBO) is a sample-efficient framework for identifying Pareto-optimal solutions. At the heart of MOBO is the acquisition function, which determines the next candidate to evaluate by navigating the best compromises among the objectives. Acquisition functions that rely on integrating over the objective space scale poorly to a large number of objectives. In this paper, we show a natural connection between the non-dominated solutions and the highest multivariate rank, which coincides with the extreme level line of the joint cumulative distribution function (CDF). Motivated by this link, we propose the CDF indicator, a Pareto-compliant metric for evaluating the quality of approximate Pareto sets, that can complement the popular hypervolume indicator. We then introduce an acquisition function based on the CDF indicator, called BOtied. BOtied can be implemented efficiently with copulas, a statistical tool for modeling complex, high-dimensional distributions. 
Our experiments on a variety of synthetic and real-world problems demonstrate that BOtied outperforms state-of-the-art MOBO algorithms while being computationally efficient for many objectives.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ji Won Park;Natasa Tagasovska;Michael Maser;Stephen Ra;Kyunghyun Cho", "authorids": "~Ji_Won_Park1;~Natasa_Tagasovska2;~Michael_Maser1;~Stephen_Ra1;~Kyunghyun_Cho1", "gender": "F;F;;M;M", "homepage": ";https://datascience.ch/team_member/natasa-tagasovska-computer-scientist/;;https://www.stephenra.com;http://kyunghyuncho.me", "dblp": "83/10554;;;255/5897;41/9736", "google_scholar": "URG3MMYAAAAJ;S2ZUSL0AAAAJ;;bxl__-MAAAAJ;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ", "orcid": "0000-0002-0692-1092;;;;", "linkedin": ";natasha-tagasovska/;michael-maser-7a9844b9;;", "or_profile": "~Ji_Won_Park1;~Natasa_Tagasovska2;~Michael_Maser1;~Stephen_Ra1;~Kyunghyun_Cho1", "aff": "Genentech;Prescient Design - Genentech, Roche;Genentech;Prescient Design, Genentech;Genentech", "aff_domain": "gene.com;roche.com;gene.com;gene.com;gene.com", "position": "Researcher;Senior Machine Learning Scientist;Researcher;Director of Frontier Research;Senior Director of Frontier Research", "bibtex": "@inproceedings{\npark2024botied,\ntitle={{BO}tied: Multi-objective Bayesian optimization with tied multivariate ranks},\nauthor={Ji Won Park and Natasa Tagasovska and Michael Maser and Stephen Ra and Kyunghyun Cho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cj5HbaX14p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2967799, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8452782534607282985&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 8, "email": "gene.com;roche.com;gene.com;gene.com;gene.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Genentech", "aff_unique_dep": "", "aff_unique_url": "https://www.genentech.com", "aff_unique_abbr": "Genentech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Symmetric Replay Training: Enhancing Sample Efficiency in Deep Reinforcement Learning for Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33579", "id": "cmD5E6ami4", "proceeding": "https://proceedings.mlr.press/v235/kim24o.html", "pdf": "https://openreview.net/pdf?id=cmD5E6ami4", "openreview": "https://openreview.net/forum?id=cmD5E6ami4", "author_site": "Hyeonah Kim, Minsu Kim, Sungsoo Ahn, Jinkyoo Park", "tldr": "", "abstract": "Deep reinforcement learning (DRL) has significantly advanced the field of combinatorial optimization (CO). However, its practicality is hindered by the necessity for a large number of reward evaluations, especially in scenarios involving computationally intensive function assessments. To enhance the sample efficiency, we propose a simple but effective method, called *symmetric replay training (SRT)*, which can be easily integrated into various DRL methods. Our method leverages high-reward samples to encourage exploration of the under-explored symmetric regions without additional online interactions - *free*. 
Through replay training, the policy is trained to maximize the likelihood of the symmetric trajectories of discovered high-rewarded samples. Experimental results demonstrate the consistent improvement of our method in sample efficiency across diverse DRL methods applied to real-world tasks, such as molecular optimization and hardware design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyeonah Kim;Minsu Kim;Sungsoo Ahn;Jinkyoo Park", "authorids": "~Hyeonah_Kim1;~Minsu_Kim2;~Sungsoo_Ahn1;~Jinkyoo_Park1", "gender": "F;M;M;M", "homepage": ";https://minsuukim.github.io/;https://sungsooahn.super.site/;http://silab.kaist.ac.kr/", "dblp": ";;90/5164;156/7535", "google_scholar": ";https://scholar.google.ca/citations?user=VvyLuhAAAAAJ;XTenHs0AAAAJ;sH2a0nkAAAAJ", "orcid": "0000-0002-0629-1879;;;0000-0003-2620-1479", "linkedin": "hyeonahkimm/;;;", "or_profile": "~Hyeonah_Kim1;~Minsu_Kim2;~Sungsoo_Ahn1;~Jinkyoo_Park1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Pohang University of Science and Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.edu;kaist.ac.kr;postech.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nkim2024symmetric,\ntitle={Symmetric Replay Training: Enhancing Sample Efficiency in Deep Reinforcement Learning for Combinatorial Optimization},\nauthor={Hyeonah Kim and Minsu Kim and Sungsoo Ahn and Jinkyoo Park},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cmD5E6ami4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2094254, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15202814107695017432&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "kaist.edu;kaist.ac.kr;postech.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.postech.ac.kr", "aff_unique_abbr": "KAIST;POSTECH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Hierarchical Integral Probability Metrics: A distance on random probability measures with low sample complexity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33578", "id": "cmy38XZlJu", "proceeding": "https://proceedings.mlr.press/v235/catalano24a.html", "pdf": "https://openreview.net/pdf?id=cmy38XZlJu", "openreview": "https://openreview.net/forum?id=cmy38XZlJu", "author_site": "Marta Catalano, Hugo Lavenant", "tldr": "", "abstract": "Random probabilities are a key component to many nonparametric methods in Statistics and Machine Learning. To quantify comparisons between different laws of random probabilities several works are starting to use the elegant Wasserstein over Wasserstein distance. In this paper we prove that the infinite dimensionality of the space of probabilities drastically deteriorates its sample complexity, which is slower than any polynomial rate in the sample size. We propose a new distance that preserves many desirable properties of the former while achieving a parametric rate of convergence. 
In particular, our distance 1) metrizes weak convergence; 2) can be estimated numerically through samples with low complexity; 3) can be bounded analytically from above and below. The main ingredients are integral probability metrics, which lead to the name *hierarchical IPM*.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marta Catalano;Hugo Lavenant", "authorids": "~Marta_Catalano1;~Hugo_Lavenant1", "gender": "F;", "homepage": "https://martacatalano.github.io/;https://hugolav.github.io/", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Marta_Catalano1;~Hugo_Lavenant1", "aff": "Luiss Guido Carli University;Bocconi University", "aff_domain": "luiss.it;unibocconi.it", "position": "Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ncatalano2024hierarchical,\ntitle={Hierarchical Integral Probability Metrics: A distance on random probability measures with low sample complexity},\nauthor={Marta Catalano and Hugo Lavenant},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cmy38XZlJu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 490232, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10975963980516924093&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "luiss.it;unibocconi.it", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Luiss Guido Carli University;Bocconi University", "aff_unique_dep": ";", "aff_unique_url": "https://www.luiss.edu/;https://www.bocconi.edu", "aff_unique_abbr": "Luiss;Bocconi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "title": "Dynamic Spectral Clustering with Provable Approximation Guarantee", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33577", "id": "coP4kPdhKr", "proceeding": "https://proceedings.mlr.press/v235/laenen24a.html", "pdf": "https://openreview.net/pdf?id=coP4kPdhKr", "openreview": "https://openreview.net/forum?id=coP4kPdhKr", "author_site": "Steinar Laenen, He Sun", "tldr": "", "abstract": "This paper studies clustering algorithms for dynamically evolving graphs $\\{G_t\\}$, in which new edges (and potential new vertices) are added into a graph, and the underlying cluster structure of the graph can gradually change. The paper proves that, under some mild condition on the cluster-structure, the clusters of the final graph $G_T$ of $n_T$ vertices at time $T$ can be well approximated by a dynamic variant of the spectral clustering algorithm. The algorithm runs in amortised update time $O(1)$ and query time $o(n_T)$. 
Experimental studies on both synthetic and real-world datasets further confirm the practicality of our designed algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Steinar Laenen;He Sun", "authorids": "~Steinar_Laenen1;~He_Sun5", "gender": "M;M", "homepage": "https://www.steinar.dev;http://homepages.inf.ed.ac.uk/hsun4/", "dblp": ";", "google_scholar": ";https://scholar.google.co.uk/citations?user=K6-JprYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Steinar_Laenen1;~He_Sun5", "aff": "University of Edinburgh;University of Edinburgh", "aff_domain": "ed.ac.uk;ed.ac.uk", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nlaenen2024dynamic,\ntitle={Dynamic Spectral Clustering with Provable Approximation Guarantee},\nauthor={Steinar Laenen and He Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=coP4kPdhKr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 698843, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6450286144774312545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ed.ac.uk;ed.ac.uk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Mean Estimation in the Add-Remove Model of Differential Privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33576", "id": "cwIhvoTzuK", "proceeding": "https://proceedings.mlr.press/v235/kulesza24a.html", "pdf": "https://openreview.net/pdf?id=cwIhvoTzuK", "openreview": "https://openreview.net/forum?id=cwIhvoTzuK", "author_site": "Alex Kulesza, Ananda Suresh, Yuyan Wang", "tldr": "", "abstract": "Differential privacy is often studied under two different models of neighboring datasets: the add-remove model and the swap model. While the swap model is frequently used in the academic literature to simplify analysis, many practical applications rely on the more conservative add-remove model, where obtaining tight results can be difficult. Here, we study the problem of one-dimensional mean estimation under the add-remove model. We propose a new algorithm and show that it is min-max optimal, achieving the best possible constant in the leading term of the mean squared error for all $\\epsilon$, and that this constant is the same as the optimal algorithm under the swap model. These results show that the add-remove and swap models give nearly identical errors for mean estimation, even though the add-remove model cannot treat the size of the dataset as public information. We also demonstrate empirically that our proposed algorithm yields at least a factor of two improvement in mean squared error over algorithms frequently used in practice. 
One of our main technical contributions is a new hourglass mechanism, which might be of independent interest in other scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alex Kulesza;Ananda Theertha Suresh;Yuyan Wang", "authorids": "~Alex_Kulesza2;~Ananda_Theertha_Suresh1;~Yuyan_Wang1", "gender": ";M;F", "homepage": "http://www.alexkulesza.com/;https://theertha.info;", "dblp": "61/4512;119/3884;", "google_scholar": "2OUGYFAAAAAJ;K6ef57QAAAAJ;JvSO3Q0AAAAJ", "orcid": ";;", "linkedin": ";;yuyan-wang-670a55199/", "or_profile": "~Alex_Kulesza2;~Ananda_Theertha_Suresh1;~Yuyan_Wang1", "aff": "Google;Google;Google", "aff_domain": "google.com;google.com;google.com", "position": "Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\nkulesza2024mean,\ntitle={Mean Estimation in the Add-Remove Model of Differential Privacy},\nauthor={Alex Kulesza and Ananda Theertha Suresh and Yuyan Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cwIhvoTzuK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1144536, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2057834678112592008&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "google.com;google.com;google.com", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Weakly-Supervised Residual Evidential Learning for Multi-Instance Uncertainty Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33575", "id": "cxiqxDnrCx", "proceeding": "https://proceedings.mlr.press/v235/liu24ac.html", "pdf": "https://openreview.net/pdf?id=cxiqxDnrCx", "openreview": "https://openreview.net/forum?id=cxiqxDnrCx", "author_site": "Pei Liu, Luping Ji", "tldr": "", "abstract": "Uncertainty estimation (UE), as an effective means of quantifying predictive uncertainty, is crucial for safe and reliable decision-making, especially in high-risk scenarios. Existing UE schemes usually assume that there are completely-labeled samples to support fully-supervised learning. In practice, however, many UE tasks often have no sufficiently-labeled data to use, such as Multiple Instance Learning (MIL) with only weak instance annotations. To bridge this gap, this paper, for the first time, addresses the weakly-supervised issue of *Multi-Instance UE* (MIUE) and proposes a new baseline scheme, *Multi-Instance Residual Evidential Learning* (MIREL). In particular, for fine-grained instance UE with only weak supervision, we derive a multi-instance residual operator through the Fundamental Theorem of Symmetric Functions. Building on this operator derivation, we further propose MIREL to jointly model the high-order predictive distribution at bag and instance levels for MIUE. Extensive experiments empirically demonstrate that our MIREL not only often makes existing MIL networks perform better in MIUE, but also surpasses representative UE methods by large margins, especially in instance-level UE tasks. 
Our source code is available at https://github.com/liupei101/MIREL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pei Liu;Luping Ji", "authorids": "~Pei_Liu5;~Luping_Ji1", "gender": "M;M", "homepage": "https://liupei101.github.io/;https://faculty.uestc.edu.cn/jiluping/zh_CN/index.htm", "dblp": "84/3210-8.html;92/281", "google_scholar": "FNghdtEAAAAJ;", "orcid": "0000-0002-3795-6140;0000-0002-1200-5218", "linkedin": ";", "or_profile": "~Pei_Liu5;~Luping_Ji1", "aff": "University of Electronic Science and Technology of China;University of Electronic Science and Technology of China", "aff_domain": "uestc.edu.cn;uestc.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024weaklysupervised,\ntitle={Weakly-Supervised Residual Evidential Learning for Multi-Instance Uncertainty Estimation},\nauthor={Pei Liu and Luping Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cxiqxDnrCx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8587254, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16110860892605788809&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uestc.edu.cn;uestc.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Electronic Science and Technology of China", "aff_unique_dep": "", "aff_unique_url": "https://www.uestc.edu.cn", "aff_unique_abbr": "UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Collapse-Aware Triplet Decoupling for Adversarially Robust Image Retrieval", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33574", "id": "cy3JBZKCw1", "proceeding": "https://proceedings.mlr.press/v235/tian24a.html", "pdf": "https://openreview.net/pdf?id=cy3JBZKCw1", "openreview": "https://openreview.net/forum?id=cy3JBZKCw1", "author_site": "Qiwei Tian, Chenhao Lin, Zhengyu Zhao, Qian Li, Chao Shen", "tldr": "", "abstract": "Adversarial training has achieved substantial performance in defending image retrieval against adversarial examples. However, existing studies in deep metric learning (DML) still suffer from two major limitations: *weak adversary* and *model collapse*. In this paper, we address these two limitations by proposing **C**ollapse-**A**ware **TRI**plet **DE**coupling (**CA-TRIDE**). Specifically, TRIDE yields a stronger adversary by spatially decoupling the perturbation targets into the anchor and the other candidates. Furthermore, CA prevents the consequential model collapse, based on a novel metric, collapseness, which is incorporated into the optimization of perturbation. We also identify two drawbacks of the existing robustness metric in image retrieval and propose a new metric for a more reasonable robustness evaluation. Extensive experiments on three datasets demonstrate that CA-TRIDE outperforms existing defense methods in both conventional and new metrics. 
*Codes are available at https://github.com/michaeltian108/CA-TRIDE.*", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiwei Tian;Chenhao Lin;Zhengyu Zhao;Qian Li;Chao Shen", "authorids": "~Qiwei_Tian1;~Chenhao_Lin1;~Zhengyu_Zhao1;~Qian_Li11;~Chao_Shen2", "gender": "M;M;M;M;M", "homepage": ";;https://zhengyuzhao.github.io/;;http://gr.xjtu.edu.cn/web/cshen", "dblp": "354/1101;198/9470;58/10770-1;69/5902-24.html;48/4825-1", "google_scholar": "aremIvUAAAAJ;YK0G990AAAAJ;pC8KpPMAAAAJ;;m6QY7-wAAAAJ", "orcid": ";;;;0000-0002-6959-0569", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAABLKX9IB5DxryaFFeXnPtlwiFvlRqfgjDGs;;;;", "or_profile": "~Qiwei_Tian1;~Chenhao_Lin1;~Zhengyu_Zhao1;~Qian_Li11;~Chao_Shen2", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi\u2019an Jiaotong University", "aff_domain": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn", "position": "PhD student;Full Professor;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ntian2024collapseaware,\ntitle={Collapse-Aware Triplet Decoupling for Adversarially Robust Image Retrieval},\nauthor={Qiwei Tian and Chenhao Lin and Zhengyu Zhao and Qian Li and Chao Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cy3JBZKCw1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1627602, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=963280703783870596&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Neural Jump-Diffusion Temporal Point Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33573", "id": "d1P6GtRzuV", "proceeding": "https://proceedings.mlr.press/v235/zhang24cm.html", "pdf": "https://openreview.net/pdf?id=d1P6GtRzuV", "openreview": "https://openreview.net/forum?id=d1P6GtRzuV", "author_site": "Shuai Zhang, Chuan Zhou, Yang Liu, PENG ZHANG, Xixun Lin, Zhiming Ma", "tldr": "", "abstract": "We present a novel perspective on temporal point processes (TPPs) by reformulating their intensity processes as solutions to stochastic differential equations (SDEs). In particular, we first prove the equivalent SDE formulations of several classical TPPs, including Poisson processes, Hawkes processes, and self-correcting processes. Based on these proofs, we introduce a unified TPP framework called Neural Jump-Diffusion Temporal Point Process (NJDTPP), whose intensity process is governed by a neural jump-diffusion SDE (NJDSDE) where the drift, diffusion, and jump coefficient functions are parameterized by neural networks. Compared to previous works, NJDTPP exhibits model flexibility in capturing intensity dynamics without relying on any specific functional form, and provides theoretical guarantees regarding the existence and uniqueness of the solution to the proposed NJDSDE. 
Experiments on both synthetic and real-world datasets demonstrate that NJDTPP is capable of capturing the dynamics of intensity processes in different scenarios and significantly outperforms the state-of-the-art TPP models in prediction tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuai Zhang;Chuan Zhou;Yang Aron Liu;Peng Zhang;Xixun Lin;Zhi-Ming Ma", "authorids": "~Shuai_Zhang23;~Chuan_Zhou3;~Yang_Aron_Liu1;~Peng_Zhang55;~Xixun_Lin3;~Zhi-Ming_Ma1", "gender": "M;M;M;M;M;", "homepage": ";http://www.chuanzhou.online/;https://github.com/liu-yang-maker;;https://linxixun.github.io/;http://homepage.amss.ac.cn/research/homePage/8eb59241e2e74d828fb84eec0efadba5/myHomePage.html", "dblp": ";https://dblp.uni-trier.de/pid/52/564-1;;21/1048-1;190/7231;", "google_scholar": "https://scholar.google.com.hk/citations?user=PUDxORcAAAAJ;4oBUWVEAAAAJ;;https://scholar.google.com.au/citations?user=89C_mxcAAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";0000-0001-9958-8673;;0000-0001-7973-2746;0009-0004-6645-0597;", "linkedin": ";;;;;", "or_profile": "~Shuai_Zhang23;~Chuan_Zhou3;~Yang_Aron_Liu1;~Peng_Zhang55;~Xixun_Lin3;~Zhi-Ming_Ma1", "aff": "Academy of Mathematics and Systems Science, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Guangzhou University;Institute of Information Engineering, Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences", "aff_domain": "amss.ac.cn;amss.ac.cn;amss.ac.cn;gzhu.edu.cn;iie.ac.cn;amss.ac.cn", "position": "PhD student;Associate Professor;PhD student;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024neural,\ntitle={Neural Jump-Diffusion Temporal Point Processes},\nauthor={Shuai Zhang and Chuan Zhou and Yang Aron Liu and Peng Zhang and Xixun Lin and Zhi-Ming Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d1P6GtRzuV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 716794, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6930674245375702690&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "amss.ac.cn;amss.ac.cn;amss.ac.cn;gzhu.edu.cn;iie.ac.cn;amss.ac.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Chinese Academy of Sciences;Guangzhou University", "aff_unique_dep": "Academy of Mathematics and Systems Science;", "aff_unique_url": "http://www.amss.cas.cn;http://www.gzhu.edu.cn", "aff_unique_abbr": "AMSS;GU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Multiplicative Weights Update, Area Convexity and Random Coordinate Descent for Densest Subgraph Problems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33572", "id": "d2E2i5rJ4x", "proceeding": "https://proceedings.mlr.press/v235/nguyen24e.html", "pdf": "https://openreview.net/pdf?id=d2E2i5rJ4x", "openreview": "https://openreview.net/forum?id=d2E2i5rJ4x", "author_site": "Ta Duy Nguyen, Alina Ene", "tldr": "", "abstract": "We study the densest subgraph problem and give algorithms via multiplicative weights update and area 
convexity that converge in $O\\left(\\frac{\\log m}{\\epsilon^{2}}\\right)$ and $O\\left(\\frac{\\log m}{\\epsilon}\\right)$ iterations, respectively, both with nearly-linear time per iteration. Compared with the work by Bahmani et al. (2014), our MWU algorithm uses a very different and much simpler procedure for recovering the dense subgraph from the fractional solution and does not employ a binary search. Compared with the work by Boob et al. (2019), our algorithm via area convexity improves the iteration complexity by a factor $\\Delta$---the maximum degree in the graph, and matches the fastest theoretical runtime currently known via flows (Chekuri et al., 2022) in total time. Next, we study the dense subgraph decomposition problem and give the first practical iterative algorithm with linear convergence rate $O\\left(mn\\log\\frac{1}{\\epsilon}\\right)$ via accelerated random coordinate descent. This significantly improves over $O\\left(\\frac{m\\sqrt{mn\\Delta}}{\\epsilon}\\right)$ time of the FISTA-based algorithm by Harb et al. (2022). In the high precision regime $\\epsilon\\ll\\frac{1}{n}$ where we can even recover the exact solution, our algorithm has a total runtime of $O\\left(mn\\log n\\right)$, matching the state of the art exact algorithm via parametric flows (Gallo et al., 1989). Empirically, we show that this algorithm is very practical and scales to very large graphs, and its performance is competitive with widely used methods that have significantly weaker theoretical guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ta Duy Nguyen;Alina Ene", "authorids": "~Ta_Duy_Nguyen1;~Alina_Ene1", "gender": ";", "homepage": "https://nguyentaduy.github.io/;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Ta_Duy_Nguyen1;~Alina_Ene1", "aff": "Boston University;", "aff_domain": "bu.edu;", "position": "PhD student;", "bibtex": "@inproceedings{\nnguyen2024multiplicative,\ntitle={Multiplicative Weights Update, Area Convexity and Random Coordinate Descent for Densest Subgraph Problems},\nauthor={Ta Duy Nguyen and Alina Ene},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d2E2i5rJ4x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1356225, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15857972172057570733&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "bu.edu;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "GRATH: Gradual Self-Truthifying for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33571", "id": "d2f2sCXQuI", "proceeding": "https://proceedings.mlr.press/v235/chen24aj.html", "pdf": "https://openreview.net/pdf?id=d2f2sCXQuI", "openreview": "https://openreview.net/forum?id=d2f2sCXQuI", "author_site": "Weixin Chen, Dawn Song, Bo Li", "tldr": "", "abstract": "Truthfulness is paramount for large language models (LLMs) as they are increasingly deployed in real-world applications. However, existing LLMs still struggle with generating truthful content, as evidenced by their modest performance on benchmarks like TruthfulQA. 
To address this issue, we propose GRAdual self-truTHifying (GRATH), a novel post-processing method to enhance the truthfulness of LLMs. GRATH utilizes out-of-domain question prompts to generate pairwise truthfulness training data with each pair containing a question and its correct and incorrect answers, and then optimizes the model via direct preference optimization (DPO) to learn from the truthfulness difference between answer pairs. GRATH iteratively refines truthfulness data and updates the model, leading to a gradual improvement in model truthfulness in a self-supervised manner. Empirically, we evaluate GRATH using different 7B-LLMs and compare it with LLMs of similar or even larger sizes on benchmark datasets. Our results show that GRATH effectively improves LLMs' truthfulness without compromising other core capabilities. Notably, GRATH achieves state-of-the-art performance on TruthfulQA, with MC1 accuracy of 54.71% and MC2 accuracy of 69.10%, which even surpass those of 70B-LLMs. The code is available at https://github.com/chenweixin107/GRATH.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weixin Chen;Dawn Song;Bo Li", "authorids": "~Weixin_Chen1;~Dawn_Song1;~Bo_Li19", "gender": "F;F;F", "homepage": "https://chenweixin107.github.io/;;http://boli.cs.illinois.edu/", "dblp": "72/8212;s/DXSong;50/3402-26", "google_scholar": "ZlBEHxwAAAAJ;;K8vJkTcAAAAJ", "orcid": ";;", "linkedin": "weixin-chen-0250872aa/;;", "or_profile": "~Weixin_Chen1;~Dawn_Song1;~Bo_Li19", "aff": "University of Illinois, Urbana Champaign;University of California, Berkeley;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;berkeley.edu;illinois.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024grath,\ntitle={{GRATH}: Gradual Self-Truthifying for Large Language Models},\nauthor={Weixin Chen and Dawn Song and Bo Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d2f2sCXQuI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7184485, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9698076442505535106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "illinois.edu;berkeley.edu;illinois.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.berkeley.edu", "aff_unique_abbr": "UIUC;UC Berkeley", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Urbana-Champaign;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language Models with Pinpoint Tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33570", "id": "d2vONO90Rw", "proceeding": "https://proceedings.mlr.press/v235/chen24u.html", "pdf": "https://openreview.net/pdf?id=d2vONO90Rw", "openreview": "https://openreview.net/forum?id=d2vONO90Rw", "author_site": "Wei Chen, Zhen Huang, Liang Xie, Binbin Lin, Houqiang Li, Le Lu, Xinmei Tian, CAI DENG, Yonggang Zhang, Wenxiao Wang, Xu Shen, Jieping Ye", "tldr": "", "abstract": "Large Language Models (LLMs) tend to prioritize adherence to user prompts over providing veracious responses, leading to the sycophancy
issue. When challenged by users, LLMs tend to admit mistakes and provide inaccurate responses even if they initially provided the correct answer. Recent works propose to employ supervised fine-tuning (SFT) to mitigate the sycophancy issue, but it typically leads to the degeneration of LLMs' general capability. To address the challenge, we propose a novel supervised pinpoint tuning (SPT), where the region-of-interest modules are tuned for a given objective. Specifically, SPT first reveals and verifies a small percentage (<5%) of the basic modules, which significantly affect a particular behavior of LLMs, i.e., sycophancy. Subsequently, SPT merely fine-tunes these identified modules while freezing the rest. To verify the effectiveness of the proposed SPT, we conduct comprehensive experiments, demonstrating that SPT significantly mitigates the sycophancy issue of LLMs (even better than SFT). Moreover, SPT introduces limited or even no side effects on the general capability of LLMs. Our results shed light on how to precisely, effectively, and efficiently explain and improve the targeted ability of LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Chen;Zhen Huang;Liang Xie;Binbin Lin;Houqiang Li;Le Lu;Xinmei Tian;Deng Cai;Yonggang Zhang;Wenxiao Wang;Xu Shen;Jieping Ye", "authorids": "~Wei_Chen59;~Zhen_Huang4;~Liang_Xie3;~Binbin_Lin3;~Houqiang_Li1;~Le_Lu3;~Xinmei_Tian1;~Deng_Cai4;~Yonggang_Zhang1;~Wenxiao_Wang2;~Xu_Shen1;~Jieping_Ye4", "gender": "M;M;M;M;M;M;F;M;M;M;M;M", "homepage": ";;https://www.linkedin.com/in/%E4%BA%AE-%E8%B0%A2-254928160/;https://www.linkedin.com/in/binbin-lin-03598b31/;https://staff.ustc.edu.cn/~lihq/;http://www.cs.jhu.edu/~lelu/;https://faculty.ustc.edu.cn/tianxinmei1/zh_CN/index.htm;http://www.cad.zju.edu.cn/home/dengcai/;https://yonggangzhangben.github.io/index.html;https://wenxiaowang.com;;http://yelabs.net/", "dblp": ";22/3870;81/2806-3;51/8073;59/7017.html;78/6574-1.html;03/5204-1;c/DCai;27/6859-3;243/5853-1;09/10130-1.html;03/5454", "google_scholar": ";WDYhR1cAAAAJ;https://scholar.google.com/citations?hl=zh-CN;Zmvq4KYAAAAJ;7sFMIKoAAAAJ;kZn0f6gAAAAJ;https://scholar.google.com.au/citations?hl=zh-CN;vzxDyJoAAAAJ;XSbEr98AAAAJ;https://scholar.google.com.hk/citations?user=rcxOjikAAAAJ;38jwGs8AAAAJ;T9AzhwcAAAAJ", "orcid": "0009-0009-3265-3966;;0000-0002-7604-1410;0000-0002-0330-6406;0000-0003-2188-3028;0000-0002-6799-9416;0000-0002-5952-8753;;0000-0002-4080-7592;;;0000-0001-8662-5818", "linkedin": ";;%E4%BA%AE-%E8%B0%A2-254928160/;;;tigerlelu/;;;;;;", "or_profile": "~Wei_Chen59;~Zhen_Huang4;~Liang_Xie3;~Binbin_Lin3;~Houqiang_Li1;~Le_Lu3;~Xinmei_Tian1;~Deng_Cai4;~Yonggang_Zhang1;~Wenxiao_Wang2;~Xu_Shen1;~Jieping_Ye4", "aff": "Zhejiang University;;Zhejiang University of Technology;Zhejiang University;University of Science and Technology of China;Alibaba Group;University of Science and Technology of China;Zhejiang University;Hong Kong Baptist University;Zhejiang University;Alibaba Group;Alibaba Group", "aff_domain": "zju.edu.cn;;zjut.edu.cn;zju.edu.cn;ustc.edu.cn;alibaba-inc.com;ustc.edu.cn;zju.edu.cn;hkbu.edu.hk;zju.edu.cn;alibaba-inc.com;alibaba-inc.com", "position": "MS student;;Postdoc;Researcher;Professor;Full Professor;Full Professor;Professor;Postdoc;Assistant Professor;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nchen2024from,\ntitle={From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language Models with Pinpoint Tuning},\nauthor={Wei Chen and Zhen Huang and Liang Xie and Binbin Lin and Houqiang Li
and Le Lu and Xinmei Tian and Deng Cai and Yonggang Zhang and Wenxiao Wang and Xu Shen and Jieping Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d2vONO90Rw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2063411, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11610849791500137001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;;zjut.edu.cn;zju.edu.cn;ustc.edu.cn;alibaba-inc.com;ustc.edu.cn;zju.edu.cn;hkbu.edu.hk;zju.edu.cn;alibaba-inc.com;alibaba-inc.com", "author_num": 12, "aff_unique_index": "0;1;0;2;3;2;0;4;0;3;3", "aff_unique_norm": "Zhejiang University;Zhejiang University of Technology;University of Science and Technology of China;Alibaba Group;Hong Kong Baptist University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.zjut.edu.cn;http://www.ustc.edu.cn;https://www.alibaba.com;https://www.hkbu.edu.hk", "aff_unique_abbr": "ZJU;ZJUT;USTC;Alibaba;HKBU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "InfiAgent-DABench: Evaluating Agents on Data Analysis Tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33569", "id": "d5LURMSfTx", "proceeding": "https://proceedings.mlr.press/v235/hu24s.html", "pdf": "https://openreview.net/pdf?id=d5LURMSfTx", "openreview": "https://openreview.net/forum?id=d5LURMSfTx", "author_site": "Xueyu Hu, Ziyu Zhao, Shuang Wei, Ziwei Chai, Qianli Ma, Guoyin Wang, Xuwu Wang, Jing Su, Jingjing Xu, Ming Zhu, Yao Cheng, Jianbo Yuan, Jiwei Li, Kun Kuang, Yang Yang, Hongxia Yang, Fei Wu", "tldr": "", "abstract": "In this paper, we introduce InfiAgent-DABench, the first benchmark specifically designed to evaluate LLM-based agents on data analysis tasks. Agents need to solve these tasks end-to-end by interacting with an execution environment. This benchmark contains DAEval, a dataset consisting of 603 data analysis questions derived from 124 CSV files, and an agent framework that incorporates LLMs to serve as data analysis agents for both serving and evaluation. Since data analysis questions are often open-ended and hard to evaluate without human supervision, we adopt a format-prompting technique to convert each question into a closed-form format so that they can be automatically evaluated. Our extensive benchmarking of 34 LLMs uncovers the current challenges encountered in data analysis tasks. In addition, building upon our agent framework, we develop a specialized agent, DAAgent, which surpasses GPT-3.5 by 3.9% on DABench. 
Evaluation datasets and toolkits for InfiAgent-DABench are released at https://github.com/InfiAgent/InfiAgent.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xueyu Hu;Ziyu Zhao;Shuang Wei;Ziwei Chai;Qianli Ma;Guoyin Wang;Xuwu Wang;Jing Su;Jingjing Xu;Ming Zhu;Yao Cheng;Jianbo Yuan;Jiwei Li;Kun Kuang;Yang Yang;Hongxia Yang;Fei Wu", "authorids": "~Xueyu_Hu1;~Ziyu_Zhao3;~Shuang_Wei1;~Ziwei_Chai1;~Qianli_Ma4;~Guoyin_Wang1;~Xuwu_Wang2;~Jing_Su2;~Jingjing_Xu1;~Ming_Zhu1;~Yao_Cheng7;~Jianbo_Yuan1;~Jiwei_Li1;~Kun_Kuang1;~Yang_Yang35;~Hongxia_Yang2;~Fei_Wu1", "gender": ";M;M;Not Specified;M;M;F;F;F;;M;M;M;M;M;F;M", "homepage": ";https://scholar.google.com/citations?user=GzZxXIcAAAAJ;;https://zwchai.github.io;https://fazzie-key.cool/about/index.html;;;;;;https://www.linkedin.com/in/yao-cheng-14553a177;;https://nlp.stanford.edu/~bdlijiwei/;http://kunkuang.github.io;http://yangy.org;https://www4.comp.polyu.edu.hk/~hongxyang/;https://person.zju.edu.cn/wufei", "dblp": ";157/9260-1;;325/1758;;05/3838-2;247/9313.html;;25/624;;;134/6790;73/5746-1;194/4245;;;84/3254-1", "google_scholar": ";GzZxXIcAAAAJ;;;;https://scholar.google.com/citations?hl=en;Fww9LGsAAAAJ;;;;;https://scholar.google.com/citations?hl=en;PwU16JEAAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ;;iJlC5mMAAAAJ;XJLn4MYAAAAJ", "orcid": ";0000-0003-1460-2777;;0000-0003-1376-5101;;;0000-0003-3363-570X;;;;;;;0009-0000-7528-8131;0000-0002-5058-4417;;", "linkedin": ";;shuang-wei-b34b2114a/;;;;;%E5%A9%A7-%E8%8B%8F-254877189;;;;;;;;;", "or_profile": "~Xueyu_Hu1;~Ziyu_Zhao3;~Shuang_Wei1;~Ziwei_Chai1;~Qianli_Ma4;~Guoyin_Wang1;~Xuwu_Wang2;~Jing_Su2;~Jingjing_Xu1;~Ming_Zhu1;~Yao_Cheng7;~Jianbo_Yuan1;~Jiwei_Li1;~Kun_Kuang1;~Yang_Yang35;~Hongxia_Yang2;~Fei_Wu1", "aff": ";Zhejiang University;Rochester Institute of Technology;Zhejiang University;;Bytedance;ByteDance Inc.;;;;;Bytedance;Zhejiang University;Zhejiang University;Zhejiang University;ByteDance Inc.;Zhejiang University", "aff_domain": ";zju.edu.cn;rit.edu;zju.edu.cn;;bytedance.com;bytedance.com;;;;;bytedance.com;zju.edu.cn;zju.edu.cn;zju.edu.cn;bytedance.com;zju.edu.cn", "position": ";PhD student;PhD student;PhD student;;Principal Researcher;Researcher;;;;;Researcher;Assistant Professor;Associate Professor;Associate Professor;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nhu2024infiagentdabench,\ntitle={InfiAgent-{DAB}ench: Evaluating Agents on Data Analysis Tasks},\nauthor={Xueyu Hu and Ziyu Zhao and Shuang Wei and Ziwei Chai and Qianli Ma and Guoyin Wang and Xuwu Wang and Jing Su and Jingjing Xu and Ming Zhu and Yao Cheng and Jianbo Yuan and Jiwei Li and Kun Kuang and Yang Yang and Hongxia Yang and Fei Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d5LURMSfTx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1296106, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 17, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7601893057860851459&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 8, "email": ";zju.edu.cn;rit.edu;zju.edu.cn;;bytedance.com;bytedance.com;;;;;bytedance.com;zju.edu.cn;zju.edu.cn;zju.edu.cn;bytedance.com;zju.edu.cn", "author_num": 17, "aff_unique_index": "0;1;0;2;2;2;0;0;0;2;0", "aff_unique_norm": "Zhejiang University;Rochester Institute of Technology;ByteDance", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.zju.edu.cn;https://www.rit.edu;https://www.bytedance.com", "aff_unique_abbr": "ZJU;RIT;Bytedance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "KernelSHAP-IQ: Weighted Least Square Optimization for Shapley Interactions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33568", "id": "d5jXW2H4gg", "proceeding": "https://proceedings.mlr.press/v235/fumagalli24a.html", "pdf": "https://openreview.net/pdf?id=d5jXW2H4gg", "openreview": "https://openreview.net/forum?id=d5jXW2H4gg", "author_site": "Fabian Fumagalli, Maximilian Muschalik, Patrick Kolpaczki, Eyke H\u00fcllermeier, CITEC Barbara Hammer", "tldr": "", "abstract": "The Shapley value (SV) is a prevalent approach of allocating credit to machine learning (ML) entities to understand black box ML models. Enriching such interpretations with higher-order interactions is inevitable for complex systems, where the Shapley Interaction Index (SII) is a direct axiomatic extension of the SV. While it is well-known that the SV yields an optimal approximation of any game via a weighted least square (WLS) objective, an extension of this result to SII has been a long-standing open problem, which even led to the proposal of an alternative index. In this work, we characterize higher-order SII as a solution to a WLS problem, which constructs an optimal approximation via SII and k-Shapley values (k-SII). We prove this representation for the SV and pairwise SII and give empirically validated conjectures for higher orders. As a result, we propose KernelSHAP-IQ, a direct extension of KernelSHAP for SII, and demonstrate state-of-the-art performance for feature interactions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fabian Fumagalli;Maximilian Muschalik;Patrick Kolpaczki;Eyke H\u00fcllermeier;Barbara Hammer", "authorids": "~Fabian_Fumagalli1;~Maximilian_Muschalik1;~Patrick_Kolpaczki1;~Eyke_H\u00fcllermeier1;~Barbara_Hammer4", "gender": "M;M;M;M;F", "homepage": "https://hammer-lab.techfak.uni-bielefeld.de/people/316634936/;https://maxmuschalik.com/;https://www.kiml.ifi.lmu.de/people/employees/kolpaczki/index.html;https://cs.uni-paderborn.de/index.php?id=60202;https://www.techfak.uni-bielefeld.de/~bhammer/", "dblp": "329/4508;329/4090;304/9952;h/EykeHullermeier;h/BarbaraHammer", "google_scholar": "anUMB08AAAAJ;https://scholar.google.de/citations?user=jJBCW74AAAAJ;PVwqZS8AAAAJ;https://scholar.google.de/citations?user=usVJeNN3xFAC;1d3OxaUAAAAJ", "orcid": "0000-0003-3955-3510;0000-0002-6921-0204;;0000-0002-9944-4108;0000-0002-2615-8151", "linkedin": "fabian-fumagalli/;maximilian-muschalik/;;;", "or_profile": "~Fabian_Fumagalli1;~Maximilian_Muschalik1;~Patrick_Kolpaczki1;~Eyke_H\u00fcllermeier1;~Barbara_Hammer4", "aff": "Universit\u00e4t Bielefeld;Institute of Computer Science, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Universit\u00e4t Paderborn;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Universit\u00e4t Bielefeld", "aff_domain": "uni-bielefeld.de;ifi.lmu.de;uni-paderborn.de;lmu.de;uni-bielefeld.de", "position": "PhD student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfumagalli2024kernelshapiq,\ntitle={Kernel{SHAP}-{IQ}: Weighted Least Square Optimization for Shapley Interactions},\nauthor={Fabian Fumagalli and Maximilian Muschalik and Patrick Kolpaczki and Eyke H{\\\"u}llermeier and Barbara 
Hammer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d5jXW2H4gg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1303093, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15621423163398553979&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "uni-bielefeld.de;ifi.lmu.de;uni-paderborn.de;lmu.de;uni-bielefeld.de", "author_num": 5, "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Universit\u00e4t Bielefeld;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;University of Paderborn", "aff_unique_dep": ";Institute of Computer Science;", "aff_unique_url": "https://www.uni-bielefeld.de/;https://www.uni-muenchen.de;https://www.uni-paderborn.de", "aff_unique_abbr": "Uni Bielefeld;LMU M\u00fcnchen;UPB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Fully-Dynamic Approximate Decision Trees With Worst-Case Update Time Guarantees", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33567", "id": "d5tJWH5yCi", "proceeding": "https://proceedings.mlr.press/v235/bressan24a.html", "pdf": "https://openreview.net/pdf?id=d5tJWH5yCi", "openreview": "https://openreview.net/forum?id=d5tJWH5yCi", "author_site": "Marco Bressan, Mauro Sozio", "tldr": "", "abstract": "We study the problem of maintaining a decision tree in the fully-dynamic setting, where the dataset is updated by an adversarial sequence of insertions and deletions. We present the first algorithm with strong guarantees on both the quality of the tree and the worst-case update time (the maximum time spent between two consecutive dataset updates). For instance, we can maintain a tree where each node has Gini gain within $\beta$ of the optimum, while guaranteeing an update time $O(d \beta^{-3} \log^4 n )$, where $d$ is the number of features and $n$ the maximum size of the dataset. This is optimal up to polylogarithmic factors, as any dynamic algorithm must have update time in $\Omega(d)$. Similar guarantees hold for the variance and information gain, for classification and regression, and even for *boosted* trees. This shows that many popular decision trees such as ID3 or C4.5 can efficiently be made dynamic, answering an open question of Bressan, Damay and Sozio (AAAI 2023). 
We also show that, under the 3SUM conjecture or the Orthogonal Vectors Hypothesis, the update time must be polynomial in $1/\\beta$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marco Bressan;Mauro Sozio", "authorids": "~Marco_Bressan4;~Mauro_Sozio2", "gender": "M;M", "homepage": "https://sites.google.com/view/marco-bressan/home;https://sites.google.com/site/maurosozio/home", "dblp": "b/MarcoBressan2;72/3698.html", "google_scholar": "https://scholar.google.it/citations?user=8Rh17n8AAAAJ;xvH0eIsAAAAJ", "orcid": "0000-0001-5211-2264;", "linkedin": ";", "or_profile": "~Marco_Bressan4;~Mauro_Sozio2", "aff": "University of Milan;", "aff_domain": "unimi.it;", "position": "Assistant Professor;", "bibtex": "@inproceedings{\nbressan2024fullydynamic,\ntitle={Fully-Dynamic Approximate Decision Trees With Worst-Case Update Time Guarantees},\nauthor={Marco Bressan and Mauro Sozio},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=d5tJWH5yCi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 489415, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16697853894678570227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "unimi.it;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Milan", "aff_unique_dep": "", "aff_unique_url": "https://www.unimi.it", "aff_unique_abbr": "UniMi", "aff_country_unique_index": "0", "aff_country_unique": "Italy" }, { "title": "Position: The Causal Revolution Needs Scientific Pragmatism", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33566", "id": "dBMLtuKH01", "proceeding": "https://proceedings.mlr.press/v235/loftus24a.html", "pdf": "https://openreview.net/pdf?id=dBMLtuKH01", "openreview": "https://openreview.net/forum?id=dBMLtuKH01", "author_site": "Joshua Loftus", "tldr": "", "abstract": "Causal models and methods have great promise, but their progress has been stalled. Proposals using causality get squeezed between two opposing worldviews. Scientific perfectionism--an insistence on only using ``correct'' models--slows the adoption of causal methods in knowledge generating applications. Pushing in the opposite direction, the academic discipline of computer science prefers algorithms with no or few assumptions, and technologies based on automation and scalability are often selected for economic and business applications. We argue that these system-centric inductive biases should be replaced with a human-centric philosophy we refer to as scientific pragmatism. The machine learning community must strike the right balance to make space for the causal revolution to prosper.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joshua R. Loftus", "authorids": "~Joshua_R._Loftus1", "gender": "M", "homepage": "http://joshualoftus.com/", "dblp": "198/0795", "google_scholar": "SIbr3XUAAAAJ", "orcid": "0000-0002-2905-1632", "linkedin": "", "or_profile": "~Joshua_R._Loftus1", "aff": "London School of Economics", "aff_domain": "lse.ac.uk", "position": "Assistant Professor", "bibtex": "@inproceedings{\nloftus2024position,\ntitle={Position: The Causal Revolution Needs Scientific Pragmatism},\nauthor={Joshua R. 
Loftus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dBMLtuKH01}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 187474, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4203461023296212544&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 9, "email": "lse.ac.uk", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "London School of Economics", "aff_unique_dep": "", "aff_unique_url": "https://www.lse.ac.uk", "aff_unique_abbr": "LSE", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "A Mechanistic Understanding of Alignment Algorithms: A Case Study on DPO and Toxicity", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33565", "id": "dBqHGZPGZI", "proceeding": "https://proceedings.mlr.press/v235/lee24a.html", "pdf": "https://openreview.net/pdf?id=dBqHGZPGZI", "openreview": "https://openreview.net/forum?id=dBqHGZPGZI", "author_site": "Andrew Lee, Xiaoyan Bai, Itamar Pres, Martin Wattenberg, Jonathan K. Kummerfeld, Rada Mihalcea", "tldr": "", "abstract": "While alignment algorithms are commonly used to tune pre-trained language models towards user preferences, we lack explanations for the underlying mechanisms in which models become ``aligned'', thus making it difficult to explain phenomena like jailbreaks. In this work we study a popular algorithm, direct preference optimization (DPO), and the mechanisms by which it reduces toxicity. Namely, we first study how toxicity is represented and elicited in pre-trained language models (GPT2-medium, Llama2-7b). We then apply DPO with a carefully crafted pairwise dataset to reduce toxicity. We examine how the resulting models avert toxic outputs, and find that capabilities learned from pre-training are not removed, but rather bypassed. We use this insight to demonstrate a simple method to un-align the models, reverting them back to their toxic behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Lee;Xiaoyan Bai;Itamar Pres;Martin Wattenberg;Jonathan K. Kummerfeld;Rada Mihalcea", "authorids": "~Andrew_Lee2;~Xiaoyan_Bai1;presi@umich.edu;~Martin_Wattenberg1;~Jonathan_K._Kummerfeld2;~Rada_Mihalcea1", "gender": ";F;;M;;F", "homepage": ";https://elena-baixy.github.io/;;http://www.bewitched.com;;https://web.eecs.umich.edu/~mihalcea/", "dblp": ";63/3140;;w/MartinWattenberg;;m/RadaMihalcea", "google_scholar": "oQiCjnwAAAAJ;ic3BUhMAAAAJ;;pv54dqMAAAAJ;;https://scholar.google.com.tw/citations?user=UetM7FgAAAAJ", "orcid": ";;;;;0000-0002-0767-6703", "linkedin": ";elenabai/;;;;", "or_profile": "~Andrew_Lee2;~Xiaoyan_Bai1;presi@umich.edu;~Martin_Wattenberg1;~Jonathan_K._Kummerfeld2;~Rada_Mihalcea1", "aff": "University of Michigan;University of Michigan - Ann Arbor;;Google;;University of Michigan", "aff_domain": "umich.edu;umich.edu;;google.com;;umich.edu", "position": "PhD student;Undergrad student;;Principal Researcher;;Full Professor", "bibtex": "@inproceedings{\nlee2024a,\ntitle={A Mechanistic Understanding of Alignment Algorithms: A Case Study on {DPO} and Toxicity},\nauthor={Andrew Lee and Xiaoyan Bai and Itamar Pres and Martin Wattenberg and Jonathan K. 
Kummerfeld and Rada Mihalcea},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dBqHGZPGZI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 884395, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6168801330158049880&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "umich.edu;umich.edu;;google.com;;umich.edu", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Michigan;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.umich.edu;https://www.google.com", "aff_unique_abbr": "UM;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Ann Arbor;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Self-Supervised Interpretable End-to-End Learning via Latent Functional Modularity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33564", "id": "dFEeI51O5j", "proceeding": "https://proceedings.mlr.press/v235/seong24a.html", "pdf": "https://openreview.net/pdf?id=dFEeI51O5j", "openreview": "https://openreview.net/forum?id=dFEeI51O5j", "author_site": "Hyunki Seong, Hyunchul Shim", "tldr": "", "abstract": "We introduce MoNet, a novel functionally modular network for self-supervised and interpretable end-to-end learning. By leveraging its functional modularity with a latent-guided contrastive loss function, MoNet efficiently learns task-specific decision-making processes in latent space without requiring task-level supervision. Moreover, our method incorporates an online, post-hoc explainability approach that enhances the interpretability of end-to-end inferences without compromising sensorimotor control performance. In real-world indoor environments, MoNet demonstrates effective visual autonomous navigation, outperforming baseline models by 7% to 28% in task specificity analysis. We further explore the interpretability of our network through post-hoc analysis of perceptual saliency maps and latent decision vectors. This provides valuable insights into the incorporation of explainable artificial intelligence into robotic learning, encompassing both perceptual and behavioral perspectives. 
Supplementary materials are available at https://sites.google.com/view/monet-lgc.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyunki Seong;Hyunchul Shim", "authorids": "~Hyunki_Seong1;~Hyunchul_Shim1", "gender": "M;M", "homepage": "https://www.youtube.com/channel/UCjnZcH9To6W9fD_t3jFJ7lw;http://unmanned.kaist.ac.kr", "dblp": "283/0926;", "google_scholar": "DTNeW6kAAAAJ;-CKm5DEAAAAJ", "orcid": "0000-0002-7169-3006;", "linkedin": "hyunki-seong-hynkis/;", "or_profile": "~Hyunki_Seong1;~Hyunchul_Shim1", "aff": "KAIST, Korea Advanced Institute of Science & Technology;KAIST, Korea Advanced Institute of Science & Technology", "aff_domain": "ee.kaist.ac.kr;ee.kaist.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nseong2024selfsupervised,\ntitle={Self-Supervised Interpretable End-to-End Learning via Latent Functional Modularity},\nauthor={Hyunki Seong and Hyunchul Shim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dFEeI51O5j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4209055, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12324859928663648160&as_sdt=805&sciodt=0,3&hl=en", "gs_version_total": 7, "email": "ee.kaist.ac.kr;ee.kaist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Sign is Not a Remedy: Multiset-to-Multiset Message Passing for Learning on Heterophilic Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33563", "id": "dGDFZM018a", "proceeding": "https://proceedings.mlr.press/v235/liang24c.html", "pdf": "https://openreview.net/pdf?id=dGDFZM018a", "openreview": "https://openreview.net/forum?id=dGDFZM018a", "author_site": "Langzhang Liang, Sunwoo Kim, Kijung Shin, Zenglin Xu, Shirui Pan, Yuan Qi", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have gained significant attention as a powerful modeling and inference method, especially for homophilic graph-structured data. To empower GNNs in heterophilic graphs, where adjacent nodes exhibit dissimilar labels or features, Signed Message Passing (SMP) has been widely adopted. However, there is a lack of theoretical and empirical analysis regarding the limitations of SMP. In this work, we unveil the potential pitfalls of SMP and their remedies. We first identify two limitations of SMP: undesirable representation update for multi-hop neighbors and vulnerability against oversmoothing issues. To overcome these challenges, we propose a novel message-passing function called Multiset to Multiset GNN (M2M-GNN). 
Our theoretical analyses and extensive experiments demonstrate that M2M-GNN effectively alleviates the limitations of SMP, yielding superior performance in comparison with SMP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Langzhang Liang;Sunwoo Kim;Kijung Shin;Zenglin Xu;Shirui Pan;Yuan Qi", "authorids": "~Langzhang_Liang1;~Sunwoo_Kim4;~Kijung_Shin2;~Zenglin_Xu2;~Shirui_Pan1;~Yuan_Qi2", "gender": "M;M;M;;M;M", "homepage": "https://orcid.org/0000-0001-8919-0215;https://sites.google.com/view/sunwoo97;https://kijungs.github.io/;;;https://faculty.fudan.edu.cn/xuzenglin/en/index.htm", "dblp": "304/3069;16/3210.html;153/2052;91/8171;;68/1538", "google_scholar": "Gq2LVnIAAAAJ;fYxrC_EAAAAJ;https://scholar.google.co.kr/citations?user=Yp3Cz5AAAAAJ;https://scholar.google.com.au/citations?user=frWRJN4AAAAJ;;gF0H9nEAAAAJ", "orcid": "0000-0001-8919-0215;0009-0006-6002-169X;0000-0002-2872-1526;0000-0003-0794-527X;;0000-0001-5550-6461", "linkedin": ";;kijungshin/;;yuan-alan-qi-30ba1b4/;", "or_profile": "~Langzhang_Liang1;~Sunwoo_Kim4;~Kijung_Shin2;~Shirui_Pan1;~Yuan_Qi2;~Zenglin_Xu1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Griffith University;Fudan University;Harbin Institute of Technology Shenzhen", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;griffith.edu.au;fudan.edu.cn;hit.edu.cn", "position": "PhD student;MS student;Associate Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nliang2024sign,\ntitle={Sign is Not a Remedy: Multiset-to-Multiset Message Passing for Learning on Heterophilic Graphs},\nauthor={Langzhang Liang and Sunwoo Kim and Kijung Shin and Zenglin Xu and Shirui Pan and Yuan Qi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dGDFZM018a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 902838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10249431573463553724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;griffith.edu.au;fudan.edu.cn;hit.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Griffith University;Fudan University;Harbin Institute of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.griffith.edu.au;https://www.fudan.edu.cn;https://www.hit.edu.cn/", "aff_unique_abbr": "KAIST;Griffith;Fudan;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;1;2;2", "aff_country_unique": "South Korea;Australia;China" }, { "title": "Deep Functional Factor Models: Forecasting High-Dimensional Functional Time Series via Bayesian Nonparametric Factorization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33562", "id": "dHXKCyaIkp", "proceeding": "https://proceedings.mlr.press/v235/liu24aw.html", "pdf": "https://openreview.net/pdf?id=dHXKCyaIkp", "openreview": "https://openreview.net/forum?id=dHXKCyaIkp", "author_site": "Yirui Liu, Xinghao Qiao, Yulong Pei, Liying Wang", "tldr": "", "abstract": "This paper introduces the Deep Functional Factor Model (DF2M), a Bayesian nonparametric model designed for the analysis of high-dimensional functional time series. 
DF2M is built upon the Indian Buffet Process and the multi-task Gaussian Process, incorporating a deep kernel function that captures non-Markovian and nonlinear temporal dynamics. Unlike many black-box deep learning models, DF2M offers an explainable approach to utilizing neural networks by constructing a factor model and integrating deep neural networks within the kernel function. Additionally, we develop a computationally efficient variational inference algorithm to infer DF2M. Empirical results from four real-world datasets demonstrate that DF2M provides better explainability and superior predictive accuracy compared to conventional deep learning models for high-dimensional functional time series.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yirui Liu;Xinghao Qiao;Yulong Pei;Liying Wang", "authorids": "~Yirui_Liu1;~Xinghao_Qiao1;~Yulong_Pei1;~Liying_Wang3", "gender": "M;M;;", "homepage": "https://personal.lse.ac.uk/liuy110/;https://personal.lse.ac.uk/qiaox/;;https://sites.google.com/view/liyingwang2023/home", "dblp": "200/9650;184/0949.html;;75/3190", "google_scholar": "6n1vWWgAAAAJ;xqEF8PgAAAAJ;;", "orcid": ";;;", "linkedin": ";;;liying-wang-0296a3b2/", "or_profile": "~Yirui_Liu1;~Xinghao_Qiao1;~Yulong_Pei1;~Liying_Wang3", "aff": "J.P. Morgan Chase;London School of Economics;;University of Liverpool", "aff_domain": "jpmorgan.com;lse.ac.uk;;liverpool.ac.uk", "position": "Researcher;Associate Professor;;Lecturer", "bibtex": "@inproceedings{\nliu2024deep,\ntitle={Deep Functional Factor Models: Forecasting High-Dimensional Functional Time Series via Bayesian Nonparametric Factorization},\nauthor={Yirui Liu and Xinghao Qiao and Yulong Pei and Liying Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dHXKCyaIkp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1501536, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18170072087157404031&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "jpmorgan.com;lse.ac.uk;;liverpool.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "JPMorgan Chase & Co.;London School of Economics;University of Liverpool", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jpmorganchase.com;https://www.lse.ac.uk;https://www.liverpool.ac.uk", "aff_unique_abbr": "JPM;LSE;Liv Uni", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "In-context Vectors: Making In Context Learning More Effective and Controllable Through Latent Space Steering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33561", "id": "dJTChKgv3a", "proceeding": "https://proceedings.mlr.press/v235/liu24bx.html", "pdf": "https://openreview.net/pdf?id=dJTChKgv3a", "openreview": "https://openreview.net/forum?id=dJTChKgv3a", "author_site": "Sheng Liu, Haotian Ye, Lei Xing, James Zou", "tldr": "", "abstract": "Large language models (LLMs) demonstrate emergent in-context learning capabilities, where they adapt to new tasks based on example demonstrations. However, in-context learning has seen limited effectiveness in many settings, is difficult to quantitatively control and takes up context window space. 
To overcome these limitations, we propose an alternative approach that recasts in-context learning as in-context vectors (ICV). Using ICV involves two steps. We first use a forward pass on demonstration examples to create the in-context vector from the latent embedding of the LLM. This vector captures essential information about the intended task. On a new query, instead of adding demonstrations to the prompt, we shift the latent states of the LLM using the ICV. The ICV approach has several benefits: 1) it enables the LLM to more effectively follow the demonstration examples; 2) it's easy to control by adjusting the magnitude of the ICV; 3) it reduces the length of the prompt by removing the in-context demonstrations; 4) ICV is computationally much more efficient than fine-tuning. We demonstrate that ICV achieves better performance compared to standard in-context learning and fine-tuning on diverse tasks including safety, style transfer, role-playing and formatting. Moreover, we show that we can flexibly teach an LLM to simultaneously follow different types of instructions by simple vector arithmetic on the corresponding ICVs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sheng Liu;Haotian Ye;Lei Xing;James Y. Zou", "authorids": "~Sheng_Liu2;~Haotian_Ye1;~Lei_Xing1;~James_Y._Zou1", "gender": ";M;M;M", "homepage": "https://shengliu66.github.io/;https://haotianye.com;http://med.stanford.edu/xinglab.html;", "dblp": ";284/0539;;72/8399", "google_scholar": "rzhzR-cAAAAJ;VU4chlsAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Sheng_Liu2;~Haotian_Ye1;~Lei_Xing1;~James_Y._Zou1", "aff": "Stanford University;Stanford University;Stanford University;", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;", "position": "Postdoc;PhD student;Professor, Dept of Radiation Oncology,;", "bibtex": "@inproceedings{\nliu2024incontext,\ntitle={In-context Vectors: Making In Context Learning More Effective and Controllable Through Latent Space Steering},\nauthor={Sheng Liu and Haotian Ye and Lei Xing and James Y. 
Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dJTChKgv3a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 993660, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7947926275765560723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stanford.edu;stanford.edu;stanford.edu;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: A Safe Harbor for AI Evaluation and Red Teaming", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33560", "id": "dLojMSgSFW", "proceeding": "https://proceedings.mlr.press/v235/longpre24a.html", "pdf": "https://openreview.net/pdf?id=dLojMSgSFW", "openreview": "https://openreview.net/forum?id=dLojMSgSFW", "author_site": "Shayne Longpre, Sayash Kapoor, Kevin Klyman, Ashwin Ramaswami, Rishi Bommasani, Borhane Blili-Hamelin, Yangsibo Huang, Aviya Skowron, Zheng Xin Yong, Suhas Kotha, Yi Zeng, Weiyan Shi, Xianjun Yang, Reid Southen, Alex Robey, Patrick Chao, Diyi Yang, Ruoxi Jia, Daniel Kang, Alex Pentland, Arvind Narayanan, Percy Liang, Peter Henderson", "tldr": "", "abstract": "Independent evaluation and red teaming are critical for identifying the risks posed by generative AI systems. However, the terms of service and enforcement strategies used by prominent AI companies to deter model misuse disincentivize good faith safety evaluations. This causes some researchers to fear that conducting such research or releasing their findings will result in account suspensions or legal reprisal. Although some companies offer researcher access programs, they are an inadequate substitute for independent research access, as they have limited community representation, receive inadequate funding, and lack independence from corporate incentives. We propose that major generative AI developers commit to providing a legal and technical safe harbor, protecting public interest safety research and removing the threat of account suspensions or legal reprisal. These proposals emerged from our collective experience conducting safety, privacy, and trustworthiness research on generative AI systems, where norms and incentives could be better aligned with public interests, without exacerbating model misuse. 
We believe these commitments are a necessary step towards more inclusive and unimpeded community efforts to tackle the risks of generative AI.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shayne Longpre;Sayash Kapoor;Kevin Klyman;Ashwin Ramaswami;Rishi Bommasani;Borhane Blili-Hamelin;Yangsibo Huang;Aviya Skowron;Zheng Xin Yong;Suhas Kotha;Yi Zeng;Weiyan Shi;Xianjun Yang;Reid Southen;Alexander Robey;Patrick Chao;Diyi Yang;Ruoxi Jia;Daniel Kang;Alex Pentland;Arvind Narayanan;Percy Liang;Peter Henderson", "authorids": "~Shayne_Longpre1;~Sayash_Kapoor2;~Kevin_Klyman1;aramaswamis@gmail.com;~Rishi_Bommasani1;~Borhane_Blili-Hamelin1;~Yangsibo_Huang2;~Aviya_Skowron1;~Zheng_Xin_Yong1;~Suhas_Kotha1;~Yi_Zeng3;~Weiyan_Shi2;~Xianjun_Yang1;~Reid_Southen1;~Alexander_Robey1;~Patrick_Chao1;~Diyi_Yang2;~Ruoxi_Jia1;~Daniel_Kang1;~Alex_Pentland1;~Arvind_Narayanan1;~Percy_Liang1;~Peter_Henderson1", "gender": "M;M;M;;M;M;F;Non-Binary;M;M;M;F;M;;M;;F;;;M;;;M", "homepage": "https://www.shaynelongpre.com;https://www.cs.princeton.edu/~sayashk/;;;https://rishibommasani.github.io/;https://borhane.xyz/;https://hazelsuko07.github.io/yangsibo/;https://www.eleuther.ai/staff;https://yongzx.github.io;https://www.andrew.cmu.edu/user/suhask/;https://yizeng623.github.io/;https://sites.google.com/ucdavis.edu/wyshi/;;http://www.reidsouthenart.com;https://arobey1.github.io/;https://patrickrchao.github.io/;https://cs.stanford.edu/~diyiy/;https://ruoxijia.info/;https://ddkang.github.io/;https://www.media.mit.edu/people/sandy/overview/;https://www.cs.princeton.edu/~arvindn/;https://cs.stanford.edu/~pliang/;http://www.peterhenderson.co/", "dblp": "190/7024;;;;245/8673;;;344/3578;266/0855;312/5932.html;75/148;218/5722;37/10237;;242/9113;222/2677.html;70/11145;147/5355-1;40/6300.html;p/AlexPentland;08/3080.html;04/1701;h/PeterHenderson2", "google_scholar": "ADd_YfkAAAAJ;;PhN2CjMAAAAJ;;WMBXw1EAAAAJ;;NMPUDa0AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;slUNmHQAAAAJ;xj666rUAAAAJ;Tunh15sAAAAJ;;V5NWZc8AAAAJ;;j9jhYqQAAAAJ;JCrug-YAAAAJ;CpMjT0YAAAAJ;P4nfoKYAAAAJ;0Bi5CMgAAAAJ;pouyVyUAAAAJ;dy_JBs0AAAAJ", "orcid": ";;;;;0000-0002-9573-3332;;;;;0000-0002-6901-9194;;0000-0003-3318-8444;;;;;;;;;;", "linkedin": "shayne-redford-longpre/;;;;;borhane;;;;;chnyizeng/;;xianjun-yang-0062aa1a6/;;alexrobey/;;;;;;;;phende/", "or_profile": "~Shayne_Longpre1;~Sayash_Kapoor2;~Kevin_Klyman1;aramaswamis@gmail.com;~Rishi_Bommasani1;~Borhane_Blili-Hamelin1;~Yangsibo_Huang2;~Aviya_Skowron1;~Zheng_Xin_Yong1;~Suhas_Kotha1;~Yi_Zeng3;~Weiyan_Shi2;~Xianjun_Yang1;~Reid_Southen1;~Alexander_Robey1;~Patrick_Chao1;~Diyi_Yang2;~Ruoxi_Jia1;~Daniel_Kang1;~Alex_Pentland1;~Arvind_Narayanan1;~Percy_Liang1;~Peter_Henderson1", "aff": "Massachusetts Institute of Technology;Princeton University;Stanford University;;Stanford University;BABL AI;Princeton University;EleutherAI;Brown University;Carnegie Mellon University;Virginia Tech;Stanford University;University of California, Santa Barbara;;School of Engineering and Applied Science, University of Pennsylvania;The Wharton School, University of Pennsylvania;Stanford University;Virginia Tech;Department of Computer Science;Massachusetts Institute of Technology;Princeton University;Stanford University;Princeton University", "aff_domain": 
"mit.edu;princeton.edu;stanford.edu;;stanford.edu;babl.ai;princeton.edu;eleuther.ai;brown.edu;cmu.edu;vt.edu;stanford.edu;ucsb.edu;;seas.upenn.edu;wharton.upenn.edu;stanford.edu;vt.edu;cs.illinois.edu;mit.edu;princeton.edu;stanford.edu;princeton.edu", "position": "PhD student;PhD student;MS student;;PhD student;Researcher;PhD student;Researcher;PhD student;MS student;PhD student;Postdoc;PhD student;;PhD student;PhD student;Assistant Professor;Assistant Professor;Assistant Professor;Full Professor;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nlongpre2024position,\ntitle={Position: A Safe Harbor for {AI} Evaluation and Red Teaming},\nauthor={Shayne Longpre and Sayash Kapoor and Kevin Klyman and Ashwin Ramaswami and Rishi Bommasani and Borhane Blili-Hamelin and Yangsibo Huang and Aviya Skowron and Zheng Xin Yong and Suhas Kotha and Yi Zeng and Weiyan Shi and Xianjun Yang and Reid Southen and Alexander Robey and Patrick Chao and Diyi Yang and Ruoxi Jia and Daniel Kang and Alex Pentland and Arvind Narayanan and Percy Liang and Peter Henderson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dLojMSgSFW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 311993, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 23, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7925766917416791442&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "mit.edu;princeton.edu;stanford.edu;;stanford.edu;babl.ai;princeton.edu;eleuther.ai;brown.edu;cmu.edu;vt.edu;stanford.edu;ucsb.edu;;seas.upenn.edu;wharton.upenn.edu;stanford.edu;vt.edu;cs.illinois.edu;mit.edu;princeton.edu;stanford.edu;princeton.edu", "author_num": 23, "aff_unique_index": "0;1;2;2;3;1;4;5;6;7;2;8;9;9;2;7;10;0;1;2;1", "aff_unique_norm": "Massachusetts Institute of Technology;Princeton University;Stanford University;BABL AI;EleutherAI;Brown University;Carnegie Mellon University;Virginia Tech;University of California, Santa Barbara;University of Pennsylvania;Unknown Institution", "aff_unique_dep": ";;;;;;;;;School of Engineering and Applied Science;Department of Computer Science", "aff_unique_url": "https://web.mit.edu;https://www.princeton.edu;https://www.stanford.edu;;https://www.eleuther.ai;https://www.brown.edu;https://www.cmu.edu;https://www.vt.edu;https://www.ucsb.edu;https://www.upenn.edu;", "aff_unique_abbr": "MIT;Princeton;Stanford;;EleutherAI;Brown;CMU;VT;UCSB;UPenn;", "aff_campus_unique_index": "1;1;1;2;1;1", "aff_campus_unique": ";Stanford;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "Compositional Text-to-Image Generation with Dense Blob Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33559", "id": "dMOhgHNYAf", "proceeding": "https://proceedings.mlr.press/v235/nie24b.html", "pdf": "https://openreview.net/pdf?id=dMOhgHNYAf", "openreview": "https://openreview.net/forum?id=dMOhgHNYAf", "author_site": "Weili Nie, Sifei Liu, Morteza Mardani, Chao Liu, Benjamin Eckart, Arash Vahdat", "tldr": "", "abstract": "Existing text-to-image models struggle to follow complex text prompts, raising the need for extra grounding inputs for better controllability. 
In this work, we propose to decompose a scene into visual primitives - denoted as dense blob representations - that contain fine-grained details of the scene while being modular, human-interpretable, and easy-to-construct. Based on blob representations, we develop a blob-grounded text-to-image diffusion model, termed BlobGEN, for compositional generation. Particularly, we introduce a new masked cross-attention module to disentangle the fusion between blob representations and visual features. To leverage the compositionality of large language models (LLMs), we introduce a new in-context learning approach to generate blob representations from text prompts. Our extensive experiments show that BlobGEN achieves superior zero-shot generation quality and better layout-guided controllability on MS-COCO. When augmented by LLMs, our method exhibits superior numerical and spatial correctness on compositional image generation benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weili Nie;Sifei Liu;Morteza Mardani;Chao Liu;Benjamin Eckart;Arash Vahdat", "authorids": "~Weili_Nie1;~Sifei_Liu2;~Morteza_Mardani1;~Chao_Liu11;~Benjamin_Eckart1;~Arash_Vahdat3", "gender": "M;F;M;M;M;M", "homepage": "https://weilinie.github.io/;https://www.sifeiliu.net;http://web.stanford.edu/~morteza/;https://research.nvidia.com/labs/genair/author/chao-liu/;https://research.nvidia.com/person/ben-eckart;http://latentspace.cc/", "dblp": "147/4786;118/1301;74/258;;23/6784;92/8108", "google_scholar": "zW7BH7oAAAAJ;j4pcHV4AAAAJ;H7edsyEAAAAJ;8gAliWUAAAAJ;9PRX6q8AAAAJ;https://scholar.google.ca/citations?user=p9-nlRIAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Weili_Nie1;~Sifei_Liu2;~Morteza_Mardani1;~Chao_Liu11;~Benjamin_Eckart1;~Arash_Vahdat3", "aff": "NVIDIA;NVIDIA;;NVIDIA;NVIDIA;NVIDIA", "aff_domain": "nvidia.com;nvidia.com;;nvidia.com;nvidia.com;nvidia.com", "position": "Research Scientist;Researcher;;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nnie2024compositional,\ntitle={Compositional Text-to-Image Generation with Dense Blob Representations},\nauthor={Weili Nie and Sifei Liu and Morteza Mardani and Chao Liu and Benjamin Eckart and Arash Vahdat},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dMOhgHNYAf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7953336, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=455180204441304369&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "nvidia.com;nvidia.com;;nvidia.com;nvidia.com;nvidia.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scalable Wasserstein Gradient Flow for Generative Modeling through Unbalanced Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33558", "id": "dMhF96PfQi", "proceeding": "https://proceedings.mlr.press/v235/choi24a.html", "pdf": "https://openreview.net/pdf?id=dMhF96PfQi", "openreview": "https://openreview.net/forum?id=dMhF96PfQi", "author_site": "Jaemoo Choi, Jaewoong Choi, Myungjoo Kang", "tldr": "", "abstract": 
"Wasserstein gradient flow (WGF) describes the gradient dynamics of probability density within the Wasserstein space. WGF provides a promising approach for conducting optimization over the probability distributions. Numerically approximating the continuous WGF requires the time discretization method. The most well-known method for this is the JKO scheme. In this regard, previous WGF models employ the JKO scheme and parametrized transport map for each JKO step. However, this approach results in quadratic training complexity $O(K^2)$ with the number of JKO step $K$. This severely limits the scalability of WGF models. In this paper, we introduce a scalable WGF-based generative model, called Semi-dual JKO (S-JKO). Our model is based on the semi-dual form of the JKO step, derived from the equivalence between the JKO step and the Unbalanced Optimal Transport. Our approach reduces the training complexity to $O(K)$. We demonstrate that our model significantly outperforms existing WGF-based generative models, achieving FID scores of 2.62 on CIFAR-10 and 6.42 on CelebA-HQ-256, which are comparable to state-of-the-art image generative models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaemoo Choi;Jaewoong Choi;Myungjoo Kang", "authorids": "~Jaemoo_Choi1;~Jaewoong_Choi1;~Myungjoo_Kang1", "gender": "M;M;", "homepage": "https://github.com/JaemooC;;http://ncia.snu.ac.kr/", "dblp": "295/8916;63/11483;64/5657.html", "google_scholar": "Ba2G6sIAAAAJ;e4ZLjREAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jaemoo_Choi1;~Jaewoong_Choi1;~Myungjoo_Kang1", "aff": "Seoul National University;Korea Institute for Advanced Study;Seoul National University", "aff_domain": "snu.ac.kr;kias.re.kr;snu.ac.kr", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nchoi2024scalable,\ntitle={Scalable Wasserstein Gradient Flow for Generative Modeling through Unbalanced Optimal Transport},\nauthor={Jaemoo Choi and Jaewoong Choi and Myungjoo Kang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dMhF96PfQi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9508503, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=245439765582763743&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "snu.ac.kr;kias.re.kr;snu.ac.kr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Seoul National University;Korea Institute for Advanced Study", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;http://www.kaist.edu", "aff_unique_abbr": "SNU;KIAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Solving Poisson Equations using Neural Walk-on-Spheres", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33557", "id": "dQveBV9lZl", "proceeding": "https://proceedings.mlr.press/v235/nam24a.html", "pdf": "https://openreview.net/pdf?id=dQveBV9lZl", "openreview": "https://openreview.net/forum?id=dQveBV9lZl", "author_site": "Hong Chul Nam, Julius Berner, Anima Anandkumar", "tldr": "", "abstract": "We propose Neural Walk-on-Spheres (NWoS), a novel neural PDE solver for the efficient solution of high-dimensional Poisson equations. 
Leveraging stochastic representations and Walk-on-Spheres methods, we develop novel losses for neural networks based on the recursive solution of Poisson equations on spheres inside the domain. The resulting method is highly parallelizable and does not require spatial gradients for the loss. We provide a comprehensive comparison against competing methods based on PINNs, the Deep Ritz method, and (backward) stochastic differential equations. In several challenging, high-dimensional numerical examples, we demonstrate the superiority of NWoS in accuracy, speed, and computational costs. Compared to commonly used PINNs, our approach can reduce memory usage and errors by orders of magnitude. Furthermore, we apply NWoS to problems in PDE-constrained optimization and molecular dynamics to show its efficiency in practical applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hong Chul Nam;Julius Berner;Anima Anandkumar", "authorids": "~Hong_Chul_Nam1;~Julius_Berner1;~Anima_Anandkumar1", "gender": "M;M;F", "homepage": "https://www.alsemy.com;https://jberner.info/;http://tensorlab.cms.caltech.edu/users/anima/", "dblp": ";227/2217;", "google_scholar": ";73-D2jgAAAAJ;bEcLezcAAAAJ", "orcid": ";0000-0002-5648-648X;", "linkedin": ";julius-berner/;anima-anandkumar-35171b1/", "or_profile": "~Hong_Chul_Nam1;~Julius_Berner1;~anima_anandkumar1", "aff": "ETHZ - ETH Zurich;California Institute of Technology;California Institute of Technology", "aff_domain": "ethz.ch;caltech.edu;caltech.edu", "position": "Undergrad student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nnam2024solving,\ntitle={Solving Poisson Equations using Neural Walk-on-Spheres},\nauthor={Hong Chul Nam and Julius Berner and Anima Anandkumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dQveBV9lZl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1052990, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4601202868681343114&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ethz.ch;caltech.edu;caltech.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "ETH Zurich;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.caltech.edu", "aff_unique_abbr": "ETHZ;Caltech", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Adaptive Observation Cost Control for Variational Quantum Eigensolvers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33556", "id": "dSrdnhLS2h", "proceeding": "https://proceedings.mlr.press/v235/anders24a.html", "pdf": "https://openreview.net/pdf?id=dSrdnhLS2h", "openreview": "https://openreview.net/forum?id=dSrdnhLS2h", "author_site": "Christopher J. Anders, Kim A. Nicoli, Bingting Wu, Naima Borras, Samuele Pedrielli, Lena Funcke, Karl Jansen, Stefan K\u00fchn, Shinichi Nakajima", "tldr": "", "abstract": "The objective to be minimized in the variational quantum eigensolver (VQE) has a restricted form, which allows a specialized sequential minimal optimization (SMO) that requires only a few observations in each iteration. 
However, the SMO iteration is still costly due to the observation noise---one *observation* at a point typically requires averaging over hundreds to thousands of repeated quantum *measurement shots* to achieve a reasonable noise level. In this paper, we propose an adaptive cost control method, named *subspace in confident region* (SubsCoRe), for SMO. SubsCoRe uses a Gaussian process (GP) surrogate and requires it to have low uncertainty over the subspace being updated, so that optimization in each iteration is performed with guaranteed accuracy. Adaptive cost control is performed by setting the required accuracy according to the progress of the optimization, and identifying the minimum number of measurement shots, as well as their distribution, satisfying the SubsCoRe requirement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christopher J. Anders;Kim Andrea Nicoli;Bingting Wu;Naima Elosegui;Samuele Pedrielli;Lena Funcke;Karl Jansen;Stefan K\u00fchn;Shinichi Nakajima", "authorids": "~Christopher_J._Anders1;~Kim_Andrea_Nicoli1;wubingti@hu-berlin.de;naima.elosegui@gmail.com;samuele.pedrielli@campus.tu-berlin.de;~Lena_Funcke1;~Karl_Jansen1;~Stefan_K\u00fchn1;~Shinichi_Nakajima2", "gender": ";M;;;;;M;;M", "homepage": "https://cjanders.de;;;;;https://www.uni-bonn.de/de/forschung-lehre/forschungsprofil/transdisziplinaere-forschungsbereiche/tra-2-matter/clausius-professur-1;https://www-zeuthen.desy.de/~kjansen/;https://quantum-zeuthen.desy.de/;https://web.ml.tu-berlin.de/author/dr.-shinichi-nakajima/", "dblp": "243/2919;238/0997;;;;270/3670;;296/5765;97/6115.html", "google_scholar": "https://scholar.google.de/citations?user=9SIAzH4AAAAJ;0GzYud8AAAAJ;;;;VmwmvlEAAAAJ;;;hXSvID4AAAAJ", "orcid": "0000-0003-3295-8486;0000-0001-5933-1822;;;;0000-0001-5022-9506;;0000-0001-7693-350X;0000-0003-3970-4569", "linkedin": ";;;;;lena-funcke-003b0a127/;;;", "or_profile": "~Christopher_J._Anders1;~Kim_Andrea_Nicoli1;wubingti@hu-berlin.de;naima.elosegui@gmail.com;samuele.pedrielli@campus.tu-berlin.de;~Lena_Funcke1;~Karl_Jansen1;~Stefan_K\u00fchn1;~Shinichi_Nakajima2", "aff": "Technische Universit\u00e4t Berlin;Rheinische Friedrich-Wilhelms Universit\u00e4t Bonn;;;;Rheinische Friedrich-Wilhelms Universit\u00e4t Bonn;DESY;Deutsches Elektronen Synchrotron;BIFOLD, TU Berlin", "aff_domain": "tu-berlin.de;uni-bonn.de;;;;uni-bonn.de;desy.de;desy.de;tu-berlin.de", "position": "PhD student;Postdoc;;;;Assistant Professor;Principal Researcher;Researcher;Postdoc", "bibtex": "@inproceedings{\nanders2024adaptive,\ntitle={Adaptive Observation Cost Control for Variational Quantum Eigensolvers},\nauthor={Christopher J. 
Anders and Kim Andrea Nicoli and Bingting Wu and Naima Elosegui and Samuele Pedrielli and Lena Funcke and Karl Jansen and Stefan K{\\\"u}hn and Shinichi Nakajima},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dSrdnhLS2h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 934362, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8456773213337066075&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tu-berlin.de;uni-bonn.de;;;;uni-bonn.de;desy.de;desy.de;tu-berlin.de", "author_num": 9, "aff_unique_index": "0;1;1;2;3;0", "aff_unique_norm": "Technische Universit\u00e4t Berlin;Rheinische Friedrich-Wilhelms Universit\u00e4t Bonn;Deutsches Elektronen-Synchrotron;Deutsches Elektronen-Synchrotron (DESY)", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tu-berlin.de;https://www.uni-bonn.de/;https://www.desy.de;https://www.desy.de", "aff_unique_abbr": "TU Berlin;Uni Bonn;DESY;DESY", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Human vs. Generative AI in Content Creation Competition: Symbiosis or Conflict?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33555", "id": "dT6ZbSxh33", "proceeding": "https://proceedings.mlr.press/v235/yao24b.html", "pdf": "https://openreview.net/pdf?id=dT6ZbSxh33", "openreview": "https://openreview.net/forum?id=dT6ZbSxh33", "author_site": "Fan Yao, Chuanhao Li, Denis Nekipelov, Hongning Wang, Haifeng Xu", "tldr": "", "abstract": "The advent of generative AI (GenAI) technology is having a transformative impact on the content creation landscape, offering alternative approaches to produce diverse, good-quality content across media, thereby reshaping online ecosystems but also raising concerns about market over-saturation and the potential marginalization of human creativity. Our work introduces a competition model generalized from the Tullock contest to analyze the tension between human creators and GenAI. Our theory and simulations suggest that despite challenges, a stable equilibrium between human and AI-generated content is possible. 
Our work contributes to understanding the competitive dynamics in the content creation industry, offering insights into the future interplay between human creativity and technological advancements in GenAI.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fan Yao;Chuanhao Li;Denis Nekipelov;Hongning Wang;Haifeng Xu", "authorids": "~Fan_Yao2;~Chuanhao_Li1;~Denis_Nekipelov1;~Hongning_Wang1;~Haifeng_Xu1", "gender": "M;;M;M;M", "homepage": "https://github.com/MarcusYF/MarcusYF.github.io;https://cyrilli.github.io/;;http://www.cs.virginia.edu/~hw5x/;http://www.haifeng-xu.com/", "dblp": ";195/9947;;05/6545;04/1895", "google_scholar": "Vb4MZPMAAAAJ;w2ShljkAAAAJ;QB_fwL8AAAAJ;qkdvKNoAAAAJ;nLgg388AAAAJ", "orcid": "0009-0006-4764-4198;;;0000-0002-6524-9195;", "linkedin": ";;;;", "or_profile": "~Fan_Yao2;~Chuanhao_Li1;~Denis_Nekipelov1;~Hongning_Wang1;~Haifeng_Xu1", "aff": "Meta Facebook;Yale University;University of Virginia, Charlottesville;Tsinghua University;University of Chicago", "aff_domain": "meta.com;yale.edu;virginia.edu;tsinghua.edu.cn;cs.uchicago.edu", "position": "Intern;Postdoc;Associate Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nyao2024human,\ntitle={Human vs. Generative {AI} in Content Creation Competition: Symbiosis or Conflict?},\nauthor={Fan Yao and Chuanhao Li and Denis Nekipelov and Hongning Wang and Haifeng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dT6ZbSxh33}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 842515, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5052782162268621285&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "meta.com;yale.edu;virginia.edu;tsinghua.edu.cn;cs.uchicago.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Meta;Yale University;University of Virginia;Tsinghua University;University of Chicago", "aff_unique_dep": "Meta Platforms, Inc.;;;;", "aff_unique_url": "https://meta.com;https://www.yale.edu;https://www.virginia.edu;https://www.tsinghua.edu.cn;https://www.uchicago.edu", "aff_unique_abbr": "Meta;Yale;UVA;THU;UChicago", "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlottesville", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Contrasting Multiple Representations with the Multi-Marginal Matching Gap", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33554", "id": "dV9B9qFeGi", "proceeding": "https://proceedings.mlr.press/v235/piran24a.html", "pdf": "https://openreview.net/pdf?id=dV9B9qFeGi", "openreview": "https://openreview.net/forum?id=dV9B9qFeGi", "author_site": "Zoe Piran, Michal Klein, James Thornton, Marco Cuturi", "tldr": "", "abstract": "Learning meaningful representations of complex objects that can be seen through multiple ($k\\geq 3$) views or modalities is a core task in machine learning. Existing methods use losses originally intended for paired views, and extend them to $k$ views, either by instantiating $\\tfrac12k(k-1)$ loss-pairs, or by using reduced embeddings, following a *one vs. average-of-rest* strategy. We propose the multi-marginal matching gap (M3G), a loss that borrows tools from multi-marginal optimal transport (MM-OT) theory to simultaneously incorporate all $k$ views. 
Given a batch of $n$ points, each seen as a $k$-tuple of views subsequently transformed into $k$ embeddings, our loss contrasts the cost of matching these $n$ ground-truth $k$-tuples with the MM-OT polymatching cost, which seeks $n$ optimally arranged $k$-tuples chosen within these $n\\times k$ vectors. While the exponential complexity $O(n^k)$ of the MM-OT problem may seem daunting, we show in experiments that a suitable generalization of the Sinkhorn algorithm for that problem can scale to, e.g., $k=3\\sim 6$ views using mini-batches of size $64\\sim 128$. Our experiments demonstrate improved performance over multiview extensions of pairwise losses, for both self-supervised and multimodal tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zoe Piran;Michal Klein;James Thornton;marco cuturi", "authorids": "~Zoe_Piran1;~Michal_Klein1;~James_Thornton1;~marco_cuturi2", "gender": "F;M;;M", "homepage": "https://www.zoepiran.com/;https://github.com/michalk8;https://jtt94.github.io/;http://marcocuturi.net", "dblp": "267/2038;332/4607;;85/5102", "google_scholar": "BlDw0uIAAAAJ;zByzdzcAAAAJ;oFZHOwgAAAAJ;https://scholar.google.fr/citations?user=kQEydDMAAAAJ", "orcid": "0000-0003-0241-8948;0000-0002-2433-6380;;", "linkedin": ";michal-klein-148697165/;;", "or_profile": "~Zoe_Piran1;~Michal_Klein1;~James_Thornton1;~marco_cuturi2", "aff": "The Hebrew University of Jerusalem;Apple;Apple;Ensae ParisTech", "aff_domain": "mail.huji.ac.il;apple.com;apple.com;ensae.fr", "position": "PhD student;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\npiran2024contrasting,\ntitle={Contrasting Multiple Representations with the Multi-Marginal Matching Gap},\nauthor={Zoe Piran and Michal Klein and James Thornton and marco cuturi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dV9B9qFeGi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2176285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2077800502037595796&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mail.huji.ac.il;apple.com;apple.com;ensae.fr", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Hebrew University of Jerusalem;Apple;ENSAE ParisTech", "aff_unique_dep": ";Apple Inc.;", "aff_unique_url": "https://www.huji.ac.il;https://www.apple.com;https://www.ensae.fr", "aff_unique_abbr": "HUJI;Apple;Ensae", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Israel;United States;France" }, { "title": "A Unified View of FANOVA: A Comprehensive Bayesian Framework for Component Selection and Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33553", "id": "dV9QGostQk", "proceeding": "https://proceedings.mlr.press/v235/marnissi24a.html", "pdf": "https://openreview.net/pdf?id=dV9QGostQk", "openreview": "https://openreview.net/forum?id=dV9QGostQk", "author_site": "Yosra MARNISSI, Maxime Leiber", "tldr": "", "abstract": "This paper presents a comprehensive Bayesian framework for FANOVA models. We provide guidelines for tuning and practical implementation to improve scalability of learning and prediction. Our model is very flexible and can handle different levels of sparsity across and within decomposition orders, as well as among covariates. 
This flexibility enables the modeling of complex real-world data while enhancing interpretability. Additionally, it allows our model to unify diverse deterministic and Bayesian non-parametric approaches into a single equation, making comparisons and understanding easier. Notably, our model serves as the Bayesian counterpart of several deterministic methods, allowing uncertainty quantification. This general framework unlocks potential for novel model developments that have been previously overlooked, such as the proposed Dirichlet mixing model that addresses limitations of existing models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "yosra marnissi;Maxime Leiber", "authorids": "~yosra_marnissi1;~Maxime_Leiber1", "gender": "F;M", "homepage": "https://www.linkedin.com/in/yosra-marnissi-39428058/;", "dblp": "180/2860;327/3447", "google_scholar": "https://scholar.google.com.sg/citations?user=01CT2_oAAAAJ;tt22lqEAAAAJ", "orcid": ";", "linkedin": ";maxime-leiber/", "or_profile": "~yosra_marnissi1;~Maxime_Leiber1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmarnissi2024a,\ntitle={A Unified View of {FANOVA}: A Comprehensive Bayesian Framework for Component Selection and Estimation},\nauthor={yosra marnissi and Maxime Leiber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dV9QGostQk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2214275, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xdYOB-JPvg0J:scholar.google.com/&scioq=A+Unified+View+of+FANOVA:+A+Comprehensive+Bayesian+Framework+for+Component+Selection+and+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 3, "email": ";", "author_num": 2 }, { "title": "NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33552", "id": "dVhrnjZJad", "proceeding": "https://proceedings.mlr.press/v235/ju24b.html", "pdf": "https://openreview.net/pdf?id=dVhrnjZJad", "openreview": "https://openreview.net/forum?id=dVhrnjZJad", "author_site": "Zeqian Ju, Yuancheng Wang, Kai Shen, Xu Tan, Detai Xin, Dongchao Yang, Eric Liu, Yichong Leng, Kaitao Song, Siliang Tang, Zhizheng Wu, Tao Qin, Xiangyang Li, Wei Ye, Shikun Zhang, Jiang Bian, Lei He, Jinyu Li, sheng zhao", "tldr": "", "abstract": "While recent large-scale text-to-speech (TTS) models have achieved significant progress, they still fall short in speech quality, similarity, and prosody. Considering that speech intricately encompasses various attributes (e.g., content, prosody, timbre, and acoustic details) that pose significant challenges for generation, a natural idea is to factorize speech into individual subspaces representing different attributes and generate them individually. Motivated by this, we propose a TTS system with novel factorized diffusion models to generate natural speech in a zero-shot way. Specifically, 1) we design a neural codec with factorized vector quantization (FVQ) to disentangle the speech waveform into subspaces of content, prosody, timbre, and acoustic details; 2) we propose a factorized diffusion model, which generates attributes in each subspace following its corresponding prompt. 
With this factorization design, our method can effectively and efficiently model intricate speech with disentangled subspaces in a divide-and-conquer way. Experimental results show that our method outperforms state-of-the-art TTS systems on quality, similarity, prosody, and intelligibility.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zeqian Ju;Yuancheng Wang;Kai Shen;Xu Tan;Detai Xin;Dongchao Yang;Eric Liu;Yichong Leng;Kaitao Song;Siliang Tang;Zhizheng Wu;Tao Qin;Xiangyang Li;Wei Ye;Shikun Zhang;Jiang Bian;Lei He;Jinyu Li;sheng zhao", "authorids": "~Zeqian_Ju1;~Yuancheng_Wang1;~Kai_Shen2;~Xu_Tan1;~Detai_Xin1;~Dongchao_Yang1;~Eric_Liu1;~Yichong_Leng1;~Kaitao_Song1;~Siliang_Tang1;~Zhizheng_Wu1;~Tao_Qin1;~Xiangyang_Li4;~Wei_Ye2;~Shikun_Zhang2;~Jiang_Bian1;~Lei_He6;~Jinyu_Li1;~sheng_zhao1", "gender": "Not Specified;M;M;M;M;M;M;M;M;M;;M;;M;M;M;M;M;M", "homepage": ";https://hecheng0625.github.io/;;https://tan-xu.github.io/;;http://dongchaoyang.top;https://blog.csdn.net/parakpurple;;;https://person.zju.edu.cn/en/siliang;;https://www.microsoft.com/en-us/research/people/taoqin/;http://staff.ustc.edu.cn/~xiangyangli/;https://se.pku.edu.cn/kcl/weiye/;;https://sites.google.com/view/jiangbian;;https://www.microsoft.com/en-us/research/people/jinyli;https://www.aaai.org/ojs/index.php/AAAI/article/view/4642", "dblp": "262/3979;199/2310;;96/10484-3;277/3917.html;;;242/8492;222/2082;44/5693;;14/6841;l/XiangYangLi;09/5394-4;83/3715.html;09/851-2.html;;87/4873-1;", "google_scholar": "uN1JaDEAAAAJ;60uamz4AAAAJ;https://scholar.google.com/citations?hl=en;tob-U1oAAAAJ;DjLO4xkAAAAJ;WNiojyAAAAAJ;dIJFz4UAAAAJ;https://scholar.google.ae/citations?user=1jwteOQAAAAJ;https://scholar.google.com.hk/citations?user=LLk9dR8AAAAJ;8e7H3PcAAAAJ;;Bl4SRU0AAAAJ;JURtNb0AAAAJ;RgLGFMIAAAAJ;uiklLscAAAAJ;pZBEnY8AAAAJ;EKl9yY8AAAAJ;grUvupMAAAAJ;689bIIwAAAAJ", "orcid": ";;;0000-0001-5631-0639;0009-0007-1908-1137;;0000-0002-4150-0680;;;0000-0002-7356-9711;;;;;;0000-0002-9472-600X;;0000-0002-1089-9748;", "linkedin": ";;;;%E5%BE%B7%E6%B3%B0-%E8%BE%9B-b81167197/;;;;;siliang-tang-4734272a/;;;;;;jbian/;;;", "or_profile": "~Zeqian_Ju1;~Yuancheng_Wang1;~Kai_Shen2;~Xu_Tan1;~Detai_Xin1;~Dongchao_Yang1;~Eric_Liu1;~Yichong_Leng1;~Kaitao_Song1;~Siliang_Tang1;~Zhizheng_Wu1;~Tao_Qin1;~Xiangyang_Li4;~Wei_Ye2;~Shikun_Zhang2;~Jiang_Bian1;~Lei_He6;~Jinyu_Li1;~sheng_zhao1", "aff": "Microsoft;The Chinese University of Hong Kong, Shenzhen;Zhejiang University;Microsoft;Tokyo University, Tokyo Institute of Technology;Chinese University of Hong Kong;Microsoft;University of Science and Technology of China;Microsoft;Zhejiang University;;;University of Science and Technology of China;Peking University;Peking University;Microsoft;Microsoft;Microsoft;Microsoft", "aff_domain": "microsoft.com;cuhk.edu.cn;zju.edu.cn;microsoft.com;u-tokyo.ac.jp;cuhk.hk;microsoft.com;ustc.edu.cn;microsoft.com;zju.edu.cn;;;ustc.edu;pku.edu.cn;pku.edu.cn;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "position": "Intern;PhD student;PhD student;Principal Researcher;PhD student;PhD student;Principal Researcher;PhD student;Researcher;Full Professor;;;Full Professor;Associate Professor;Full Professor;Partner Research Manager;Principal Scientist Manager;Researcher;Researcher", "bibtex": "@inproceedings{\nju2024naturalspeech,\ntitle={NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models},\nauthor={Zeqian Ju and Yuancheng Wang and Kai Shen and Xu Tan and Detai Xin and Dongchao Yang and Eric Liu and Yichong Leng 
and Kaitao Song and Siliang Tang and Zhizheng Wu and Tao Qin and Xiangyang Li and Wei Ye and Shikun Zhang and Jiang Bian and Lei He and Jinyu Li and sheng zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dVhrnjZJad}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 611204, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 19, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11303548308295061158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "microsoft.com;cuhk.edu.cn;zju.edu.cn;microsoft.com;u-tokyo.ac.jp;cuhk.hk;microsoft.com;ustc.edu.cn;microsoft.com;zju.edu.cn;;;ustc.edu;pku.edu.cn;pku.edu.cn;microsoft.com;microsoft.com;microsoft.com;microsoft.com", "author_num": 19, "aff_unique_index": "0;1;2;0;3;1;0;4;0;2;4;5;5;0;0;0;0", "aff_unique_norm": "Microsoft;Chinese University of Hong Kong;Zhejiang University;Tokyo University;University of Science and Technology of China;Peking University", "aff_unique_dep": "Microsoft Corporation;;;;;", "aff_unique_url": "https://www.microsoft.com;https://www.cuhk.edu.cn;https://www.zju.edu.cn;https://www.u-tokyo.ac.jp;http://www.ustc.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "Microsoft;CUHK;ZJU;UT;USTC;Peking U", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Shenzhen;Tokyo;Hong Kong SAR", "aff_country_unique_index": "0;1;1;0;2;1;0;1;0;1;1;1;1;0;0;0;0", "aff_country_unique": "United States;China;Japan" }, { "title": "Stop Regressing: Training Value Functions via Classification for Scalable Deep RL", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33551", "id": "dVpFKfqF3R", "proceeding": "https://proceedings.mlr.press/v235/farebrother24a.html", "pdf": "https://openreview.net/pdf?id=dVpFKfqF3R", "openreview": "https://openreview.net/forum?id=dVpFKfqF3R", "author_site": "Jesse Farebrother, Jordi Orbay, Quan Vuong, Adrien Ali Taiga, Yevgen Chebotar, Ted Xiao, Alexander Irpan, Sergey Levine, Pablo Samuel Castro, Aleksandra Faust, Aviral Kumar, Rishabh Agarwal", "tldr": "", "abstract": "Value functions are an essential component of deep reinforcement learning (RL); they are typically trained via mean squared error regression to match bootstrapped target values. However, scaling value-based RL methods to large networks has proven challenging. This difficulty is in stark contrast to supervised learning: by leveraging a cross-entropy classification loss, supervised methods have scaled reliably to massive networks. Observing this discrepancy, in this paper, we investigate whether the scalability of deep RL can also be improved simply by using classification in place of regression for training value functions. We show that training value functions with categorical cross-entropy significantly enhances performance and scalability across various domains, including single-task RL on Atari 2600 games, multi-task RL on Atari with large-scale ResNets, robotic manipulation with Q-transformers, playing Chess without search, and a language-agent Wordle task with high-capacity Transformers, achieving *state-of-the-art results* on these domains. Through careful analysis, we show that categorical cross-entropy mitigates issues inherent to value-based RL, such as noisy targets and non-stationarity. 
We argue that shifting to categorical cross-entropy for training value functions can substantially improve the scalability of deep RL at little-to-no cost.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jesse Farebrother;Jordi Orbay;Quan Vuong;Adrien Ali Taiga;Yevgen Chebotar;Ted Xiao;Alex Irpan;Sergey Levine;Pablo Samuel Castro;Aleksandra Faust;Aviral Kumar;Rishabh Agarwal", "authorids": "~Jesse_Farebrother1;~Jordi_Orbay1;~Quan_Vuong2;~Adrien_Ali_Taiga1;~Yevgen_Chebotar1;~Ted_Xiao1;~Alex_Irpan1;~Sergey_Levine1;~Pablo_Samuel_Castro1;~Aleksandra_Faust1;~Aviral_Kumar2;~Rishabh_Agarwal2", "gender": "M;M;M;M;M;M;M;F;M;M;M;M", "homepage": "https://brosa.ca;https://quanvuong.github.io;;https://www.tedxiao.me;http://www.alexirpan.com;https://people.eecs.berkeley.edu/~svlevine/;https://psc-g.github.io/;http://www.afaust.info;https://aviralkumar2907.github.io/;https://agarwl.github.io;;", "dblp": "228/6862;;01/11424;198/0598;202/2063;80/7594;05/5455;135/8420;202/7961;;190/7835;", "google_scholar": "cA12XHcAAAAJ;NSWI3OwAAAAJ;ADkiClQAAAAJ;;;8R35rCwAAAAJ;https://scholar.google.ca/citations?user=jn5r6TsAAAAJ;RK72t68AAAAJ;;https://scholar.google.ca/citations?user=aH8AJu4AAAAJ;rRbCqtoAAAAJ;", "orcid": "0000-0002-5178-4947;;;;;;;0000-0002-3268-8685;;;;", "linkedin": "jessefarebro/;;;;;;pablo-samuel-castro-2113641b/;aleksandrafaust;;;;jorgeorbay", "or_profile": "~Jesse_Farebrother1;~Quan_Vuong2;~Yevgen_Chebotar1;~Ted_Xiao1;~Alex_Irpan1;~Sergey_Levine1;~Pablo_Samuel_Castro1;~Aleksandra_Faust1;~Aviral_Kumar2;~Rishabh_Agarwal2;~Ali_Adrien_Ali_Taiga1;~Jorge_Orbay1", "aff": "Google DeepMind;physical intelligence;Google;;Google DeepMind;Google;Google;Google Brain;Google DeepMind;Google DeepMind;Google Deepmind;Google", "aff_domain": "google.com;physicalintelligence.company;google.com;;google.com;google.com;google.com;google.com;google.com;google.com;google.com;google.com", "position": "Student Researcher;Researcher;Research Scientist;;Researcher;Research Scientist;Researcher;Principal Researcher;Researcher;Research Scientist;Researcher;Researcher", "bibtex": "@inproceedings{\nfarebrother2024stop,\ntitle={Stop Regressing: Training Value Functions via Classification for Scalable Deep {RL}},\nauthor={Jesse Farebrother and Jordi Orbay and Quan Vuong and Adrien Ali Taiga and Yevgen Chebotar and Ted Xiao and Alex Irpan and Sergey Levine and Pablo Samuel Castro and Aleksandra Faust and Aviral Kumar and Rishabh Agarwal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dVpFKfqF3R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4964765, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9964023227561411189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "google.com;physicalintelligence.company;google.com;;google.com;google.com;google.com;google.com;google.com;google.com;google.com;google.com", "author_num": 12, "aff_unique_index": "0;1;0;0;0;0;0;0;0;2;0", "aff_unique_norm": "Google;Physical Intelligence;DeepMind", "aff_unique_dep": "Google DeepMind;;DeepMind", "aff_unique_url": "https://deepmind.com;;https://deepmind.com", "aff_unique_abbr": "DeepMind;;DeepMind", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;2;0;2;2;2;0;0;0;2", "aff_country_unique": "United Kingdom;;United States" }, { "title": "Denoising Autoregressive 
Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33550", "id": "dW29JZj0G5", "proceeding": "https://proceedings.mlr.press/v235/li24d.html", "pdf": "https://openreview.net/pdf?id=dW29JZj0G5", "openreview": "https://openreview.net/forum?id=dW29JZj0G5", "author_site": "Yazhe Li, Jorg Bornschein, Ting Chen", "tldr": "", "abstract": "In this paper, we explore a new generative approach for learning visual representations. Our method, DARL, employs a decoder-only Transformer to predict image patches autoregressively. We find that training with Mean Squared Error (MSE) alone leads to strong representations. To enhance the image generation ability, we replace the MSE loss with the diffusion objective by using a denoising patch decoder. We show that the learned representation can be improved by using tailored noise schedules and longer training in larger models. Notably, the optimal schedule differs significantly from the typical ones used in standard image diffusion models. Overall, despite its simple architecture, DARL delivers performance remarkably close to state-of-the-art masked prediction models under the fine-tuning protocol. This marks an important step towards a unified model capable of both visual perception and generation, effectively combining the strengths of autoregressive and denoising diffusion models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yazhe Li;Jorg Bornschein;Ting Chen", "authorids": "~Yazhe_Li2;~Jorg_Bornschein1;~Ting_Chen1", "gender": ";M;M", "homepage": ";;", "dblp": "182/2163;13/8510;19/1766", "google_scholar": "lpswgyIAAAAJ;X7kZFnoAAAAJ;KoXUMbsAAAAJ", "orcid": ";0000-0002-3356-7922;", "linkedin": ";;", "or_profile": "~Yazhe_Li2;~Jorg_Bornschein1;~Ting_Chen1", "aff": "Google DeepMind;Google Deepmind;Google", "aff_domain": "deepmind.com;google.com;google.com", "position": "Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nli2024denoising,\ntitle={Denoising Autoregressive Representation Learning},\nauthor={Yazhe Li and Jorg Bornschein and Ting Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dW29JZj0G5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9044791, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3352907373391009891&as_sdt=1005&sciodt=0,4&hl=en", "gs_version_total": 6, "email": "deepmind.com;google.com;google.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google DeepMind;DeepMind", "aff_unique_url": "https://deepmind.com;https://deepmind.com", "aff_unique_abbr": "DeepMind;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Generalist Equivariant Transformer Towards 3D Molecular Interaction Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33549", "id": "dWxb80a0TW", "proceeding": "https://proceedings.mlr.press/v235/kong24b.html", "pdf": "https://openreview.net/pdf?id=dWxb80a0TW", "openreview": "https://openreview.net/forum?id=dWxb80a0TW", "author_site": "Xiangzhe Kong, Wenbing Huang, Yang Liu", "tldr": "", "abstract": "Many processes in biology and drug discovery involve various 3D interactions between 
molecules, such as protein-protein and protein-small molecule interactions. Given that different molecules are usually represented at different granularities, existing methods usually encode each type of molecule independently with a different model, making it difficult to learn the various underlying interaction physics. In this paper, we first propose to universally represent an arbitrary 3D complex as a geometric graph of sets, shedding light on encoding all types of molecules with one model. We then propose a Generalist Equivariant Transformer (GET) to effectively capture both domain-specific hierarchies and domain-agnostic interaction physics. To be specific, GET consists of a bilevel attention module, a feed-forward module and a layer normalization module, where each module is E(3) equivariant and specialized for handling sets of variable sizes. Notably, in contrast to conventional pooling-based hierarchical models, our GET is able to retain fine-grained information of all levels. Extensive experiments on the interactions between proteins, small molecules and RNA/DNAs verify the effectiveness and generalization capability of our proposed method across different domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiangzhe Kong;Wenbing Huang;Yang Liu", "authorids": "~Xiangzhe_Kong1;~Wenbing_Huang1;~Yang_Liu19", "gender": "M;M;M", "homepage": "https://kxz18.github.io/;https://gsai.ruc.edu.cn/english/wenbing_huang;http://nlp.csai.tsinghua.edu.cn/~ly/", "dblp": "293/7526;155/3181-1.html;51/3710-5", "google_scholar": "0oSFYmkAAAAJ;0yNkmO4AAAAJ;https://scholar.google.com.hk/citations?user=lVhoKNcAAAAJ", "orcid": ";;0000-0002-3087-242X", "linkedin": ";;", "or_profile": "~Xiangzhe_Kong1;~Wenbing_Huang1;~Yang_Liu19", "aff": "Tsinghua University;Renmin University of China;Tsinghua University", "aff_domain": "tsinghua.edu.cn;ruc.edu.cn;tsinghua.edu.cn", "position": "PhD student;Associate Professor;Professor", "bibtex": "@inproceedings{\nkong2024generalist,\ntitle={Generalist Equivariant Transformer Towards 3D Molecular Interaction Learning},\nauthor={Xiangzhe Kong and Wenbing Huang and Yang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dWxb80a0TW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4792759, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8111101484159480140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;ruc.edu.cn;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ruc.edu.cn", "aff_unique_abbr": "THU;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On The Statistical Complexity of Offline Decision-Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33548", "id": "dYDPcx78tm", "proceeding": "https://proceedings.mlr.press/v235/nguyen-tang24a.html", "pdf": "https://openreview.net/pdf?id=dYDPcx78tm", "openreview": "https://openreview.net/forum?id=dYDPcx78tm", "author_site": "Thanh Nguyen-Tang, Raman Arora", "tldr": "", "abstract": "We study the statistical complexity of offline decision-making with function approximation, establishing 
(near) minimax-optimal rates for stochastic contextual bandits and Markov decision processes. The performance limits are captured by the pseudo-dimension of the (value) function class and a new characterization of the behavior policy that *strictly* subsumes all the previous notions of data coverage in the offline decision-making literature. In addition, we seek to understand the benefits of using offline data in online decision-making and show nearly minimax-optimal rates in a wide range of regimes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thanh Nguyen-Tang;Raman Arora", "authorids": "~Thanh_Nguyen-Tang1;~Raman_Arora1", "gender": "M;M", "homepage": "http://www.cs.jhu.edu/~raman/Home.html;https://thanhnguyentang.github.io/", "dblp": ";287/5102.html", "google_scholar": "Spe0xdkAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-1917-2190", "linkedin": ";thanhnguyentang/", "or_profile": "~Raman_Arora1;~Thanh_Tang_Nguyen2", "aff": "Johns Hopkins University;Johns Hopkins University", "aff_domain": "jhu.edu;jhu.edu", "position": "Associate Professor;Postdoc", "bibtex": "@inproceedings{\nnguyen-tang2024on,\ntitle={On The Statistical Complexity of Offline Decision-Making},\nauthor={Thanh Nguyen-Tang and Raman Arora},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dYDPcx78tm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 508945, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=24101551850143487&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "jhu.edu;jhu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Understanding Reasoning Ability of Language Models From the Perspective of Reasoning Paths Aggregation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33547", "id": "dZsEOFUDew", "proceeding": "https://proceedings.mlr.press/v235/wang24a.html", "pdf": "https://openreview.net/pdf?id=dZsEOFUDew", "openreview": "https://openreview.net/forum?id=dZsEOFUDew", "author_site": "Xinyi Wang, Alfonso Amayuelas, Kexun Zhang, Liangming Pan, Wenhu Chen, William Wang", "tldr": "", "abstract": "Pre-trained language models (LMs) are able to perform complex reasoning without explicit fine-tuning. To understand how pre-training with a next-token prediction objective contributes to the emergence of such reasoning capability, we propose that we can view an LM as deriving new conclusions by aggregating indirect reasoning paths seen at pre-training time. We found this perspective effective in two important cases of reasoning: logic reasoning with knowledge graphs (KGs) and chain-of-thought (CoT) reasoning. More specifically, we formalize the reasoning paths as random walk paths on the knowledge/reasoning graphs. Analyses of learned LM distributions suggest that a weighted sum of relevant random walk path probabilities is a reasonable way to explain how LMs reason. 
Experiments and analysis on multiple KG and CoT datasets reveal the effect of training on random walk paths and suggest that augmenting unlabeled random walk reasoning paths can improve real-world multi-step reasoning performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyi Wang;Alfonso Amayuelas;Kexun Zhang;Liangming Pan;Wenhu Chen;William Yang Wang", "authorids": "~Xinyi_Wang2;~Alfonso_Amayuelas2;~Kexun_Zhang1;~Liangming_Pan1;~Wenhu_Chen3;~William_Yang_Wang2", "gender": "F;M;M;M;M;M", "homepage": "https://wangxinyilinda.github.io/;https://www.amayuelas.me/;https://zkx06111.github.io;https://liangmingpan.bio;https://wenhuchen.github.io/;https://www.cs.ucsb.edu/~william/", "dblp": ";281/7669;295/8815;186/9707;136/0957.html;08/9282", "google_scholar": "3vvbplcAAAAJ;https://scholar.google.dk/citations?user=QGQ2G28AAAAJ;;JcjjOTUAAAAJ;https://scholar.google.co.jp/citations?user=U8ShbhUAAAAJ;gf8Ms_8AAAAJ", "orcid": ";;;;;", "linkedin": "xinyi-wang-444385133/;alfonsoamayuelas/;;;;", "or_profile": "~Xinyi_Wang2;~Alfonso_Amayuelas2;~Kexun_Zhang1;~Liangming_Pan1;~wenhu_chen1;~William_Wang1", "aff": "International Business Machines;University of California, Santa Barbara;Carnegie Mellon University;University of California, Santa Barbara;University of Waterloo;UC Santa Barbara", "aff_domain": "ibm.com;ucsb.edu;cmu.edu;ucsb.edu;uwaterloo.ca;ucsb.edu", "position": "Intern;PhD student;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024understanding,\ntitle={Understanding Reasoning Ability of Language Models From the Perspective of Reasoning Paths Aggregation},\nauthor={Xinyi Wang and Alfonso Amayuelas and Kexun Zhang and Liangming Pan and Wenhu Chen and William Yang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dZsEOFUDew}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4855391, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16962228788580510400&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "ibm.com;ucsb.edu;cmu.edu;ucsb.edu;uwaterloo.ca;ucsb.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;3;1", "aff_unique_norm": "International Business Machines Corporation;University of California, Santa Barbara;Carnegie Mellon University;University of Waterloo", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ibm.com;https://www.ucsb.edu;https://www.cmu.edu;https://uwaterloo.ca", "aff_unique_abbr": "IBM;UCSB;CMU;UW", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Reparameterized Importance Sampling for Robust Variational Bayesian Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33546", "id": "da7MMwICjC", "proceeding": "https://proceedings.mlr.press/v235/long24a.html", "pdf": "https://openreview.net/pdf?id=da7MMwICjC", "openreview": "https://openreview.net/forum?id=da7MMwICjC", "author_site": "Yunfei Long, Zilin Tian, Liguo Zhang, Huosheng Xu", "tldr": "", "abstract": "Mean-field variational inference (MFVI) methods provide computationally cheap approximations to the posterior of Bayesian Neural Networks (BNNs) when compared to alternatives like MCMC. 
However, applying MFVI to BNNs encounters limitations due to the Monte Carlo sampling problem. This problem stems from two main issues. *First*, most samples do not accurately represent the most probable weights. *Second*, random sampling from variational distributions introduces high variance in gradient estimates, which can hinder the optimization process, leading to slow convergence or even failure. In this paper, we introduce a novel sampling method called *Reparameterized Importance Sampling* (RIS) to estimate the first moment in neural networks, reducing variance during feed-forward propagation. We begin by analyzing the generalized form of the optimal proposal distribution and presenting an inexpensive approximation. Next, we describe the sampling process from the proposal distribution as a transformation that combines exogenous randomness with the variational parameters. Our experimental results demonstrate the effectiveness of the proposed RIS method in three critical aspects: improved convergence, enhanced predictive performance, and successful uncertainty estimation for out-of-distribution data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunfei Long;Zilin Tian;Liguo Zhang;Huosheng Xu", "authorids": "~Yunfei_Long5;~Zilin_Tian1;~Liguo_Zhang1;~Huosheng_Xu2", "gender": "M;F;M;M", "homepage": "https://github.com/unclelongheu;https://github.com/ZiLinT;http://homepage.hrbeu.edu.cn/web/zhangliguo;https://github.com/huoshengXu", "dblp": ";;;", "google_scholar": ";;P4_SRzQAAAAJ;", "orcid": "0009-0009-1458-4514;0009-0001-6401-6314;0000-0002-3814-7783;", "linkedin": ";;;", "or_profile": "~Yunfei_Long5;~Zilin_Tian1;~Liguo_Zhang1;~Huosheng_Xu2", "aff": "Harbin Engineering University;Harbin Engineering University;Harbin Engineering University;Harbin Engineering University", "aff_domain": "hrbeu.edu.cn;hrbeu.edu.cn;hrbeu.edu.cn;hrbeu.edu.cn", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nlong2024reparameterized,\ntitle={Reparameterized Importance Sampling for Robust Variational Bayesian Neural Networks},\nauthor={Yunfei Long and Zilin Tian and Liguo Zhang and Huosheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=da7MMwICjC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 372151, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rNEXio30iMYJ:scholar.google.com/&scioq=Reparameterized+Importance+Sampling+for+Robust+Variational+Bayesian+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "hrbeu.edu.cn;hrbeu.edu.cn;hrbeu.edu.cn;hrbeu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Harbin Engineering University", "aff_unique_dep": "", "aff_unique_url": "http://www.heu.edu.cn", "aff_unique_abbr": "HEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with Vision-Language Benchmark", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33545", "id": "dbFEFHAD79", "proceeding": "https://proceedings.mlr.press/v235/chen24h.html", "pdf": "https://openreview.net/pdf?id=dbFEFHAD79", "openreview": "https://openreview.net/forum?id=dbFEFHAD79", "author_site": "Dongping Chen, Ruoxi Chen, Shilin Zhang, Yaochen 
Wang, Yinuo Liu, Huichi Zhou, Qihui Zhang, Yao Wan, Pan Zhou, Lichao Sun", "tldr": "", "abstract": "Multimodal Large Language Models (MLLMs) have gained significant attention recently, showing remarkable potential in artificial general intelligence. However, assessing the utility of MLLMs presents considerable challenges, primarily due to the absence of multimodal benchmarks that align with human preferences. Drawing inspiration from the concept of LLM-as-a-Judge within LLMs, this paper introduces a novel benchmark, termed MLLM-as-a-Judge, to assess the ability of MLLMs in assisting judges across diverse modalities, encompassing three distinct tasks: Scoring Evaluation, Pair Comparison, and Batch Ranking. Our study reveals that, while MLLMs demonstrate remarkable human-like discernment in Pair Comparisons, there is a significant divergence from human preferences in Scoring Evaluation and Batch Ranking tasks. Furthermore, a closer examination reveals persistent challenges in the evaluative capacities of LLMs, including diverse biases, hallucinatory responses, and inconsistencies in judgment, even in advanced models such as GPT-4V. These findings emphasize the pressing need for enhancements and further research before MLLMs can be regarded as fully reliable evaluators. In light of this, we advocate for additional efforts dedicated to supporting the continuous development within the domain of MLLM functioning as judges. The code and dataset are publicly available at our project homepage: https://mllm-judge.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongping Chen;Ruoxi Chen;Shilin Zhang;Yaochen Wang;Yinuo Liu;Huichi Zhou;Qihui Zhang;Yao Wan;Pan Zhou;Lichao Sun", "authorids": "~Dongping_Chen1;~Ruoxi_Chen1;~Shilin_Zhang2;~Yaochen_Wang1;~Yinuo_Liu1;~Huichi_Zhou1;~Qihui_Zhang1;~Yao_Wan2;~Pan_Zhou5;~Lichao_Sun1", "gender": "M;;F;M;F;;M;M;M;M", "homepage": "https://dongping-chen.github.io;;https://github.com/sRk8ARtDkQF;https://misaki-wang.github.io/;https://github.com/Norrrrrrr-lyn;;https://github.com/Mask-Hui;http://wanyao.me;http://faculty.hust.edu.cn/pzhou/zh_CN/index.htm;https://lichao-sun.github.io/", "dblp": "151/7051;;;;;;160/4750;167/0275.html;84/6614-1;121/0780-1.html", "google_scholar": ";;;;;;;c3MtqtMAAAAJ;cTpFPJgAAAAJ;WhGUE7AAAAAJ", "orcid": "0009-0009-9848-2557;;;;;;;0000-0001-6937-4180;;", "linkedin": ";;;;;;;;;lichao-sun-b273a290/", "or_profile": "~Dongping_Chen1;~Ruoxi_Chen1;~Shilin_Zhang2;~Yaochen_Wang1;~Yinuo_Liu1;~Huichi_Zhou1;~Qihui_Zhang1;~Yao_Wan2;~Pan_Zhou5;~Lichao_Sun1", "aff": "Huazhong University of Science and Technology;;South China University of Technology;Nanjing University of Posts and Telecommunications;Huazhong University of Science and Technology;;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Lehigh University", "aff_domain": "hust.edu.cn;;scut.edu.cn;njupt.edu.cn;hust.edu.cn;;hust.edu.cn;hust.edu.cn;hust.edu.cn;lehigh.edu", "position": "Undergrad student;;Undergrad student;Undergrad student;Undergrad student;;Intern;Assistant Professor;Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024mllmasajudge,\ntitle={{MLLM}-as-a-Judge: Assessing Multimodal {LLM}-as-a-Judge with Vision-Language Benchmark},\nauthor={Dongping Chen and Ruoxi Chen and Shilin Zhang and Yaochen Wang and Yinuo Liu and Huichi Zhou and Qihui Zhang and Yao Wan and Pan Zhou and Lichao Sun},\nbooktitle={Forty-first International 
Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dbFEFHAD79}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2684295, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6962079740663458368&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "hust.edu.cn;;scut.edu.cn;njupt.edu.cn;hust.edu.cn;;hust.edu.cn;hust.edu.cn;hust.edu.cn;lehigh.edu", "author_num": 10, "aff_unique_index": "0;1;2;0;0;0;0;3", "aff_unique_norm": "Huazhong University of Science and Technology;South China University of Technology;Nanjing University of Posts and Telecommunications;Lehigh University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hust.edu.cn;https://www.scut.edu.cn;http://www.njupt.edu.cn;https://www.lehigh.edu", "aff_unique_abbr": "HUST;SCUT;NJUPT;Lehigh", "aff_campus_unique_index": "1", "aff_campus_unique": ";Nanjing", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Equivariant Graph Neural Operator for Modeling 3D Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33544", "id": "dccRCYmL5x", "proceeding": "https://proceedings.mlr.press/v235/xu24j.html", "pdf": "https://openreview.net/pdf?id=dccRCYmL5x", "openreview": "https://openreview.net/forum?id=dccRCYmL5x", "author_site": "Minkai Xu, Jiaqi Han, Aaron Lou, Jean Kossaifi, Arvind Ramanathan, Kamyar Azizzadenesheli, Jure Leskovec, Stefano Ermon, Anima Anandkumar", "tldr": "", "abstract": "Modeling the complex three-dimensional (3D) dynamics of relational systems is an important problem in the natural sciences, with applications ranging from molecular simulations to particle mechanics. Machine learning methods have achieved good success by learning graph neural networks to model spatial interactions. However, these approaches do not faithfully capture temporal correlations since they only model next-step predictions. In this work, we propose Equivariant Graph Neural Operator (EGNO), a novel and principled method that directly models dynamics as trajectories instead of just next-step prediction. Different from existing methods, EGNO explicitly learns the temporal evolution of 3D dynamics where we formulate the dynamics as a function over time and learn neural operators to approximate it. To capture the temporal correlations while keeping the intrinsic SE(3)-equivariance, we develop equivariant temporal convolutions parameterized in the Fourier space and build EGNO by stacking the Fourier layers over equivariant networks. EGNO is the first operator learning framework that is capable of modeling solution dynamics functions over time while retaining 3D equivariance. Comprehensive experiments in multiple domains, including particle simulations, human motion capture, and molecular dynamics, demonstrate the significantly superior performance of EGNO against existing methods, thanks to the equivariant temporal modeling. 
Our code is available at https://github.com/MinkaiXu/egno.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minkai Xu;Jiaqi Han;Aaron Lou;Jean Kossaifi;Arvind Ramanathan;Kamyar Azizzadenesheli;Jure Leskovec;Stefano Ermon;Anima Anandkumar", "authorids": "~Minkai_Xu1;~Jiaqi_Han2;~Aaron_Lou1;~Jean_Kossaifi1;~Arvind_Ramanathan1;~Kamyar_Azizzadenesheli1;~Jure_Leskovec1;~Stefano_Ermon1;~Anima_Anandkumar1", "gender": "M;M;M;M;M;M;;M;F", "homepage": "https://minkaixu.com;https://hanjq17.github.io;https://aaronlou.com;http://jeankossaifi.com/;https://ramanathanlab.org;https://kamyar.page/;http://cs.stanford.edu/~jure/;http://cs.stanford.edu/~ermon/;http://tensorlab.cms.caltech.edu/users/anima/", "dblp": "257/3355;235/0412;232/3858;155/6766;;176/5584;l/JureLeskovec;47/8135;", "google_scholar": "https://scholar.google.com/citations?hl=en;AKppgMAAAAAJ;;https://scholar.google.co.uk/citations?user=hJS2TXwAAAAJ;;CxAS4SQAAAAJ;Q_kKkIUAAAAJ;;bEcLezcAAAAJ", "orcid": ";;;;;;0000-0002-5411-923X;;", "linkedin": ";;;;;;leskovec/;;anima-anandkumar-35171b1/", "or_profile": "~Minkai_Xu1;~Jiaqi_Han2;~Aaron_Lou1;~Jean_Kossaifi1;~Arvind_Ramanathan1;~Kamyar_Azizzadenesheli1;~Jure_Leskovec1;~Stefano_Ermon1;~anima_anandkumar1", "aff": "Stanford University;Computer Science Department, Stanford University;Stanford University;NVIDIA AI;Argonne National Laboratory;NVIDIA;Kumo.AI;Stanford University;California Institute of Technology", "aff_domain": "stanford.edu;cs.stanford.edu;stanford.edu;nvidia.com;anl.gov;nvidia.com;kumo.ai;stanford.edu;caltech.edu", "position": "PhD student;PhD student;PhD student;Researcher;Researcher;Researcher;Chief Scientist;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nxu2024equivariant,\ntitle={Equivariant Graph Neural Operator for Modeling 3D Dynamics},\nauthor={Minkai Xu and Jiaqi Han and Aaron Lou and Jean Kossaifi and Arvind Ramanathan and Kamyar Azizzadenesheli and Jure Leskovec and Stefano Ermon and Anima Anandkumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dccRCYmL5x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3188507, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10943342957164133565&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 10, "email": "stanford.edu;cs.stanford.edu;stanford.edu;nvidia.com;anl.gov;nvidia.com;kumo.ai;stanford.edu;caltech.edu", "author_num": 9, "aff_unique_index": "0;0;0;1;2;1;3;0;4", "aff_unique_norm": "Stanford University;NVIDIA;Argonne National Laboratory;Kumo.AI;California Institute of Technology", "aff_unique_dep": ";NVIDIA AI;;;", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com/en-us/research/;https://www.anl.gov;https://www.kumo.ai;https://www.caltech.edu", "aff_unique_abbr": "Stanford;NVIDIA;ANL;Kumo.AI;Caltech", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Stanford;;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Which Nodes Does GCN Fail? 
Enhancing GCN From the Node Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33543", "id": "dcwUGaK9sQ", "proceeding": "https://proceedings.mlr.press/v235/huang24t.html", "pdf": "https://openreview.net/pdf?id=dcwUGaK9sQ", "openreview": "https://openreview.net/forum?id=dcwUGaK9sQ", "author_site": "Jincheng Huang, Jialie SHEN, Xiaoshuang Shi, Xiaofeng Zhu", "tldr": "", "abstract": "The label smoothness assumption is at the core of Graph Convolutional Networks (GCNs): nodes in a local region have similar labels. Thus, GCN performs local feature smoothing operation to adhere to this assumption. However, there exist some nodes whose labels obtained by feature smoothing conflict with the label smoothness assumption. We find that the label smoothness assumption and the process of feature smoothing are both problematic on these nodes, and call these nodes out of GCN's control (OOC nodes). In this paper, first, we design the corresponding algorithm to locate the OOC nodes, then we summarize the characteristics of OOC nodes that affect their representation learning, and based on their characteristics, we present DaGCN, an efficient framework that can facilitate the OOC nodes. Extensive experiments verify the superiority of the proposed method and demonstrate that current advanced GCNs are improvements specifically on OOC nodes; the remaining nodes under GCN's control (UC nodes) are already optimally represented by vanilla GCN on most datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jincheng Huang;Jialie Shen;Xiaoshuang Shi;Xiaofeng Zhu", "authorids": "~Jincheng_Huang1;~Jialie_Shen2;~Xiaoshuang_Shi1;~Xiaofeng_Zhu7", "gender": "M;Not Specified;M;M", "homepage": "https://www.city.ac.uk/about/find-contact/staff-directory?query=Jialie+shen;http://plaza.ufl.edu/xsshi2015/;https://sites.google.com/site/seanzhuxf/;https://huangjc0429.github.io/JinchengHuang.github.io/", "dblp": "33/7046;87/10627;60/4671-1;68/1979-5.html", "google_scholar": "d3h-zScAAAAJ;BWGQt3YAAAAJ;https://scholar.google.com/citations?hl=en;BAgUbZsAAAAJ", "orcid": "0000-0002-4560-8509;;0000-0001-6840-0578;", "linkedin": ";;;", "or_profile": "~Jialie_Shen2;~Xiaoshuang_Shi1;~Xiaofeng_Zhu7;~Jin_Cheng_Huang1", "aff": "City, University of London;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China", "aff_domain": "city.ac.uk;uestc.edu.cn;uestc.edu.cn;uestc.edu.cn", "position": "Full Professor;Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nhuang2024on,\ntitle={On Which Nodes Does {GCN} Fail? 
Enhancing {GCN} From the Node Perspective},\nauthor={Jincheng Huang and Jialie Shen and Xiaoshuang Shi and Xiaofeng Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dcwUGaK9sQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1138856, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6752682195435106361&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "city.ac.uk;uestc.edu.cn;uestc.edu.cn;uestc.edu.cn", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "City, University of London;University of Electronic Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.city.ac.uk;https://www.uestc.edu.cn", "aff_unique_abbr": "City, University of London;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;China" }, { "title": "Deep Equilibrium Models are Almost Equivalent to Not-so-deep Explicit Models for High-dimensional Gaussian Mixtures", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33542", "id": "ddjRdm3wUW", "proceeding": "https://proceedings.mlr.press/v235/ling24a.html", "pdf": "https://openreview.net/pdf?id=ddjRdm3wUW", "openreview": "https://openreview.net/forum?id=ddjRdm3wUW", "author_site": "Zenan Ling, Longbo Li, Zhanbo Feng, YIXUAN ZHANG, Feng Zhou, Robert Qiu, Zhenyu Liao", "tldr": "", "abstract": "Deep equilibrium models (DEQs), as typical implicit neural networks, have demonstrated remarkable success on various tasks. There is, however, a lack of theoretical understanding of the connections and differences between implicit DEQs and explicit neural network models. In this paper, leveraging recent advances in random matrix theory (RMT), we perform an in-depth analysis on the eigenspectra of the conjugate kernel (CK) and neural tangent kernel (NTK) matrices for implicit DEQs, when the input data are drawn from a high-dimensional Gaussian mixture. We prove that, in this setting, the spectral behavior of these Implicit-CKs and NTKs depends on the DEQ activation function and initial weight variances, *but only via a system of four nonlinear equations*. As a direct consequence of this theoretical result, we demonstrate that a shallow explicit network can be carefully designed to produce the same CK or NTK as a given DEQ.
Although derived here for Gaussian mixture data, empirical results show that the proposed theory and design principles also apply to popular real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zenan Ling;Longbo Li;Zhanbo Feng;YIXUAN ZHANG;Feng Zhou;Robert C Qiu;Zhenyu Liao", "authorids": "~Zenan_Ling1;~Longbo_Li1;~Zhanbo_Feng1;~YIXUAN_ZHANG1;~Feng_Zhou9;~Robert_C_Qiu1;~Zhenyu_Liao1", "gender": "M;M;M;;;;M", "homepage": "https://scholar.google.com/citations?user=BabePTkAAAAJ&hl=zh-CN;https://github.com/StephenLi24;http://SadAngel.cn/;;;;https://zhenyu-liao.github.io/", "dblp": "183/7798;;234/7758;57/1240-6;;;49/10218-1", "google_scholar": "BabePTkAAAAJ;;;oHaa8jsAAAAJ;;;https://scholar.google.fr/citations?user=SPYhJV8AAAAJ", "orcid": ";;;0009-0005-0094-7143;;;0000-0002-1915-8559", "linkedin": ";;;;;;", "or_profile": "~Zenan_Ling1;~Longbo_Li1;~Zhanbo_Feng1;~YIXUAN_ZHANG1;~Feng_Zhou9;~Robert_C_Qiu1;~Zhenyu_Liao1", "aff": "Huazhong University of Science and Technology;Huazhong University of Science and Technology;Shanghai Jiaotong University;Hangzhou Dianzi University;;;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;hust.edu.cn;sjtu.edu.cn;hdu.edu.cn;;;hust.edu.cn", "position": "Researcher;MS student;PhD student;Lecturer;;;Associate Professor", "bibtex": "@inproceedings{\nling2024deep,\ntitle={Deep Equilibrium Models are Almost Equivalent to Not-so-deep Explicit Models for High-dimensional Gaussian Mixtures},\nauthor={Zenan Ling and Longbo Li and Zhanbo Feng and YIXUAN ZHANG and Feng Zhou and Robert C Qiu and Zhenyu Liao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ddjRdm3wUW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 702903, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4933289856893733313&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "hust.edu.cn;hust.edu.cn;sjtu.edu.cn;hdu.edu.cn;;;hust.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Huazhong University of Science and Technology;Shanghai Jiao Tong University;Hangzhou Dianzi University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.hust.edu.cn;https://www.sjtu.edu.cn;http://www.hdu.edu.cn/", "aff_unique_abbr": "HUST;SJTU;HGHDU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Consistent Long-Term Forecasting of Ergodic Dynamical Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33541", "id": "dfR6FU53qk", "proceeding": "https://proceedings.mlr.press/v235/kostic24a.html", "pdf": "https://openreview.net/pdf?id=dfR6FU53qk", "openreview": "https://openreview.net/forum?id=dfR6FU53qk", "author_site": "Vladimir Kostic, Karim Lounici, Prune Inzerilli, Pietro Novelli, Massimiliano Pontil", "tldr": "", "abstract": "We study the problem of forecasting the evolution of a function of the state (observable) of a discrete ergodic dynamical system over multiple time steps. The elegant theory of Koopman and transfer operators can be used to evolve any such function forward in time. However, their estimators are usually unreliable in long-term forecasting.
We show how classical techniques of eigenvalue deflation from operator theory and feature centering from statistics can be exploited to enhance standard estimators. We develop a novel technique to derive high probability bounds on powers of empirical estimators. Our approach, rooted in the _stability theory of non-normal operators_, allows us to establish uniform-in-time bounds for the forecasting error, which hold even on _infinite time horizons_. We further show that our approach can be seamlessly employed to forecast future state distributions from an initial one, with provably uniform error bounds. Numerical experiments illustrate the advantages of our approach in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vladimir R Kostic;Karim Lounici;Prune Inzerilli;Pietro Novelli;Massimiliano Pontil", "authorids": "~Vladimir_R_Kostic1;~Karim_Lounici1;prune.inzerilli@polytechnique.edu;~Pietro_Novelli1;~Massimiliano_Pontil4", "gender": "M;;;M;Not Specified", "homepage": "https://vladi-iit.github.io/;;;;https://www.iit.it/web/computational-statistics-and-machine-learning", "dblp": "94/879;;;318/3513;", "google_scholar": "66gV7SAAAAAJ;;;;lcOacs8AAAAJ", "orcid": ";;;0000-0003-1623-5659;0000-0001-9415-098X", "linkedin": "vladimir-kostic-77500652/;;;;", "or_profile": "~Vladimir_R_Kostic1;~Karim_Lounici1;prune.inzerilli@polytechnique.edu;~Pietro_Novelli1;~Massimiliano_Pontil4", "aff": "University of Novi Sad;;;Istituto Italiano di Tecnologia;University College London, University of London", "aff_domain": "uns.ac.rs;;;iit.it;ucl.ac.uk", "position": "Associate Professor;;;Postdoc;Full Professor", "bibtex": "@inproceedings{\nkostic2024consistent,\ntitle={Consistent Long-Term Forecasting of Ergodic Dynamical Systems},\nauthor={Vladimir R Kostic and Karim Lounici and Prune Inzerilli and Pietro Novelli and Massimiliano Pontil},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dfR6FU53qk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 766307, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4230014619395834293&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "uns.ac.rs;;;iit.it;ucl.ac.uk", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Novi Sad;Istituto Italiano di Tecnologia;University College London", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uns.ac.rs;https://www.iit.it;https://www.ucl.ac.uk", "aff_unique_abbr": "UNS;IIT;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Serbia;Italy;United Kingdom" }, { "title": "LQER: Low-Rank Quantization Error Reconstruction for LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33540", "id": "dh8k41g775", "proceeding": "https://proceedings.mlr.press/v235/zhang24j.html", "pdf": "https://openreview.net/pdf?id=dh8k41g775", "openreview": "https://openreview.net/forum?id=dh8k41g775", "author_site": "Cheng Zhang, Jianyi Cheng, George Constantinides, Yiren Zhao", "tldr": "", "abstract": "Post-training quantization of Large Language Models (LLMs) is challenging. In this work, we introduce **L**ow-rank **Q**uantization **E**rror **R**eduction (LQER), which combines quantization and low-rank approximation to recover the model capability.
LQER leverages an activation-induced scale matrix to drive the singular value distribution of quantization error towards a desirable distribution, which enables nearly-lossless W4A8 quantization on various LLMs and downstream tasks without the need for knowledge distillation, grid search, or gradient-based iterative optimization. Unlike existing methods, the computation pattern of LQER eliminates the need for specialized Scatter and Gather processes to collect high-precision weights from irregular memory locations. Our W4A8 LLMs achieve near-lossless performance on six popular downstream tasks, while using $1.36 \\times$ fewer hardware resources than the leading state-of-the-art method. We will open-source our framework at [https://github.com/ChengZhang-98/lqer](https://github.com/ChengZhang-98/lqer)", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cheng Zhang;Jianyi Cheng;George Anthony Constantinides;Yiren Zhao", "authorids": "~Cheng_Zhang21;jianyi.cheng@cl.cam.ac.uk;~George_Anthony_Constantinides1;~Yiren_Zhao2", "gender": "M;;M;M", "homepage": "https://chengzhang-98.github.io/blog/;;http://cas.ee.ic.ac.uk;https://aaronzhao.me", "dblp": ";;38/1966;https://dblp.uni-trier.de/pers/hd/z/Zhao:Yiren", "google_scholar": "6K-mHPoAAAAJ;;https://scholar.google.co.uk/citations?user=NTn1NJAAAAAJ;lOOmgEgAAAAJ", "orcid": ";;;", "linkedin": "chengzhang98/;;;yiren-aaron-zhao-baa8b5116/", "or_profile": "~Cheng_Zhang21;jianyi.cheng@cl.cam.ac.uk;~George_Anthony_Constantinides1;~Yiren_Zhao2", "aff": "Imperial College London;;Imperial College London;Imperial College London", "aff_domain": "ic.ac.uk;;imperial.ac.uk;ic.ac.uk", "position": "PhD student;;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024lqer,\ntitle={{LQER}: Low-Rank Quantization Error Reconstruction for {LLM}s},\nauthor={Cheng Zhang and Jianyi Cheng and George Anthony Constantinides and Yiren Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dh8k41g775}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 525926, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2817485279149663514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ic.ac.uk;;imperial.ac.uk;ic.ac.uk", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "ELTA: An Enhancer against Long-Tail for Aesthetics-oriented Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33539", "id": "dhrNfAJAH6", "proceeding": "https://proceedings.mlr.press/v235/liu24w.html", "pdf": "https://openreview.net/pdf?id=dhrNfAJAH6", "openreview": "https://openreview.net/forum?id=dhrNfAJAH6", "author_site": "Limin Liu, Shuai He, Anlong Ming, Rui Xie, Huadong Ma", "tldr": "", "abstract": "Real-world datasets often exhibit long-tailed distributions, compromising the generalization and fairness of learning-based models. 
This issue is particularly pronounced in Image Aesthetics Assessment (IAA) tasks, where such imbalance is difficult to mitigate due to a severe distribution mismatch between features and labels, as well as the great sensitivity of aesthetics to image variations. To address these issues, we propose an Enhancer against Long-Tail for Aesthetics-oriented models (ELTA). ELTA first utilizes a dedicated mixup technique to enhance minority feature representation in high-level space while preserving their intrinsic aesthetic qualities. Next, it aligns features and labels through a similarity consistency approach, effectively alleviating the distribution mismatch. Finally, ELTA adopts a specific strategy to refine the output distribution, thereby enhancing the quality of pseudo-labels. Experiments on four representative datasets (AVA, AADB, TAD66K, and PARA) show that our proposed ELTA achieves state-of-the-art performance by effectively mitigating the long-tailed issue in IAA datasets. Moreover, ELTA is designed with plug-and-play capabilities for seamless integration with existing methods. To our knowledge, this is the first contribution in the IAA community addressing the long-tail problem. All resources are available here.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Limin Liu;Shuai He;Anlong Ming;Rui Xie;Huadong Ma", "authorids": "~Limin_Liu1;~Shuai_He2;~Anlong_Ming1;~Rui_Xie5;~Huadong_Ma1", "gender": "M;M;M;M;M", "homepage": "https://github.com/PRAGMATISM-630;https://github.com/woshidandan;https://teacher.bupt.edu.cn/mal/en/index.htm;https://github.com/MIBXR;https://scs.bupt.edu.cn/", "dblp": ";;52/3276;86/2228;04/6217", "google_scholar": ";;y5kFLCwAAAAJ;;", "orcid": ";0000-0001-8817-0685;0000-0003-2952-7757;;", "linkedin": ";;;;", "or_profile": "~Limin_Liu1;~Shuai_He2;~Anlong_Ming1;~Rui_Xie5;~Huadong_Ma1", "aff": "Beijing University of Posts and Telecommunications;China, Beijing University of Posts and Telecommunications;Beijing University of Posts and Telecommunications;Beijing University of Posts and Telecommunications;Beijing University of Post and Telecommunication", "aff_domain": "bupt.edu.cn;bupt.edu.cn;bupt.edu.cn;bupt.edu.cn;bupt.edu.cn", "position": "MS student;PhD student;Full Professor;MS student;Full Professor", "bibtex": "@inproceedings{\nliu2024elta,\ntitle={{ELTA}: An Enhancer against Long-Tail for Aesthetics-oriented Models},\nauthor={Limin Liu and Shuai He and Anlong Ming and Rui Xie and Huadong Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dhrNfAJAH6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2424704, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17664186870106373933&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "bupt.edu.cn;bupt.edu.cn;bupt.edu.cn;bupt.edu.cn;bupt.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Beijing University of Posts and Telecommunications", "aff_unique_dep": "", "aff_unique_url": "http://www.bupt.edu.cn/", "aff_unique_abbr": "BUPT", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Efficient Adaptation in Mixed-Motive Environments via Hierarchical Opponent Modeling and Planning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33538",
"id": "disVlUOH4b", "proceeding": "https://proceedings.mlr.press/v235/huang24p.html", "pdf": "https://openreview.net/pdf?id=disVlUOH4b", "openreview": "https://openreview.net/forum?id=disVlUOH4b", "author_site": "Yizhe Huang, Anji Liu, Fanqi Kong, Yaodong Yang, Song-Chun Zhu, Xue Feng", "tldr": "", "abstract": "Despite the recent successes of multi-agent reinforcement learning (MARL) algorithms, efficiently adapting to co-players in mixed-motive environments remains a significant challenge. One feasible approach is to hierarchically model co-players' behavior based on inferring their characteristics. However, these methods often encounter difficulties in efficient reasoning and utilization of inferred information. To address these issues, we propose Hierarchical Opponent modeling and Planning (HOP), a novel multi-agent decision-making algorithm that enables few-shot adaptation to unseen policies in mixed-motive environments. HOP is hierarchically composed of two modules: an opponent modeling module that infers others' goals and learns corresponding goal-conditioned policies, and a planning module that employs Monte Carlo Tree Search (MCTS) to identify the best response. Our approach improves efficiency by updating beliefs about others' goals both across and within episodes and by using information from the opponent modeling module to guide planning. Experimental results demonstrate that in mixed-motive environments, HOP exhibits superior few-shot adaptation capabilities when interacting with various unseen agents, and excels in self-play scenarios. Furthermore, the emergence of social intelligence during our experiments underscores the potential of our approach in complex multi-agent environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yizhe Huang;Anji Liu;Fanqi Kong;Yaodong Yang;Song-Chun Zhu;Xue Feng", "authorids": "~Yizhe_Huang2;~Anji_Liu1;~Fanqi_Kong1;~Yaodong_Yang1;~Song-Chun_Zhu1;~Xue_Feng3", "gender": ";M;M;M;M;F", "homepage": ";https://liuanji.github.io/;https://github.com/kfq20;https://www.yangyaodong.com;https://zhusongchun.net/;", "dblp": ";227/8622;377/2991;170/1496-1;10/10313;", "google_scholar": "j5AxMFUAAAAJ;k_4zYecAAAAJ;Aj13PkAAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;https://scholar.google.com.tw/citations?user=Al8dyb4AAAAJ;", "orcid": "0000-0001-8722-7221;;;0000-0001-8132-5613;;0000-0002-7163-7274", "linkedin": ";anji-liu-7610b7190/;;yaodong-yang;;", "or_profile": "~Yizhe_Huang2;~Anji_Liu1;~Fanqi_Kong1;~Yaodong_Yang1;~Song-Chun_Zhu1;~Xue_Feng3", "aff": "Peking University;University of California, Los Angeles;Tsinghua University;Peking University;Peking University;Beijing Institute for General Artificial Intelligence", "aff_domain": "pku.edu.cn;ucla.edu;tsinghua.edu.cn;pku.edu.cn;pku.edu.cn;bigai.ai", "position": "PhD student;PhD student;Undergrad student;Assistant Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nhuang2024efficient,\ntitle={Efficient Adaptation in Mixed-Motive Environments via Hierarchical Opponent Modeling and Planning},\nauthor={Yizhe Huang and Anji Liu and Fanqi Kong and Yaodong Yang and Song-Chun Zhu and Xue Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=disVlUOH4b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4196640, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=17100846629261274635&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;ucla.edu;tsinghua.edu.cn;pku.edu.cn;pku.edu.cn;bigai.ai", "author_num": 6, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "Peking University;University of California, Los Angeles;Tsinghua University;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.ucla.edu;https://www.tsinghua.edu.cn;http://www.bigaiai.org/", "aff_unique_abbr": "Peking U;UCLA;THU;BIGAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Balanced Resonate-and-Fire Neurons", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33537", "id": "dkdilv4XD4", "proceeding": "https://proceedings.mlr.press/v235/higuchi24a.html", "pdf": "https://openreview.net/pdf?id=dkdilv4XD4", "openreview": "https://openreview.net/forum?id=dkdilv4XD4", "author_site": "Saya Higuchi, Sebastian Kairat, Sander Bohte, Sebastian Otte", "tldr": "", "abstract": "The resonate-and-fire (RF) neuron, introduced over two decades ago, is a simple, efficient, yet biologically plausible spiking neuron model, which can extract frequency patterns within the time domain due to its resonating membrane dynamics. However, previous RF formulations suffer from intrinsic shortcomings that limit effective learning and prevent exploiting the principled advantage of RF neurons. Here, we introduce the balanced RF (BRF) neuron, which alleviates some of the intrinsic limitations of vanilla RF neurons and demonstrates its effectiveness within recurrent spiking neural networks (RSNNs) on various sequence learning tasks. We show that networks of BRF neurons achieve overall higher task performance, produce only a fraction of the spikes, and require significantly fewer parameters as compared to modern RSNNs. Moreover, BRF-RSNN consistently provide much faster and more stable training convergence, even when bridging many hundreds of time steps during backpropagation through time (BPTT). 
These results underscore that our BRF-RSNN is a strong candidate for future large-scale RSNN architectures, further lines of research in SNN methodology, and more efficient hardware implementations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saya Higuchi;Sebastian Kairat;Sander Bohte;Sebastian Otte", "authorids": "~Saya_Higuchi1;sebastian.kairat@student.uni-tuebingen.de;~Sander_Bohte1;~Sebastian_Otte1", "gender": "F;;M;", "homepage": ";;https://www.cwi.nl/~sbohte;", "dblp": "371/2701;;15/5737;", "google_scholar": ";;https://scholar.google.nl/citations?user=zHlebkUAAAAJ;", "orcid": ";;0000-0002-7866-278X;", "linkedin": "saya-higuchi-745836274;;;", "or_profile": "~Saya_Higuchi1;sebastian.kairat@student.uni-tuebingen.de;~Sander_Bohte1;~Sebastian_Otte1", "aff": "Universit\u00e4t zu L\u00fcbeck;;Centrum voor Wiskunde en Informatica;", "aff_domain": "uni-luebeck.de;;cwi.nl;", "position": "PhD student;;Principal Researcher;", "bibtex": "@inproceedings{\nhiguchi2024balanced,\ntitle={Balanced Resonate-and-Fire Neurons},\nauthor={Saya Higuchi and Sebastian Kairat and Sander Bohte and Sebastian Otte},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dkdilv4XD4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4729456, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12480297780261914235&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 9, "email": "uni-luebeck.de;;cwi.nl;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of L\u00fcbeck;Centrum voor Wiskunde en Informatica", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-luebeck.de;https://www.cwi.nl/", "aff_unique_abbr": "UzL;CWI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Netherlands" }, { "title": "Causality Based Front-door Defense Against Backdoor Attack on Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33536", "id": "dmHHVcHFdM", "proceeding": "https://proceedings.mlr.press/v235/liu24bu.html", "pdf": "https://openreview.net/pdf?id=dmHHVcHFdM", "openreview": "https://openreview.net/forum?id=dmHHVcHFdM", "author_site": "Yiran Liu, Xiaoang Xu, Zhiyi Hou, Yang Yu", "tldr": "", "abstract": "We have developed a new framework based on the theory of causal inference to protect language models against backdoor attacks. Backdoor attackers can poison language models with different types of triggers, such as words, sentences, grammar, and style, enabling them to selectively modify the decision-making of the victim model. However, existing defense approaches are only effective when the backdoor attack form meets specific assumptions, making it difficult to counter diverse backdoor attacks. We propose a new defense framework **F**ront-door **A**djustment for **B**ackdoor **E**limination (FABE) based on causal reasoning that does not rely on assumptions about the form of triggers. This method effectively differentiates between spurious and legitimate associations by creating a 'front door' that maps out the actual causal relationships. The term 'front door' refers to a text that retains the semantic equivalence of the initial input, which is generated by an additional, fine-tuned language model, denoted as the defense model. 
Our defense experiments against various attack methods at the token, sentence, and syntactic levels reduced the attack success rate from 93.63% to 15.12%, improving the defense effect by 2.91 times compared to the best baseline result of 66.61%, achieving state-of-the-art results. Through ablation studies, we analyzed the effect of each module in FABE, demonstrating the importance of complying with the front-door criterion and front-door adjustment formula, which also explains why previous methods failed. Our code to reproduce the experiments is available at: https://github.com/lyr17/Frontdoor-Adjustment-Backdoor-Elimination.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiran Liu;Xiaoang Xu;Zhiyi Hou;Yang Yu", "authorids": "~Yiran_Liu1;~Xiaoang_Xu1;~Zhiyi_Hou1;~Yang_Yu13", "gender": "M;M;;M", "homepage": "https://github.com/xuxiaoang;https://www.zhihu.com/people/qi-tian-da-sheng-51-45;https://iiis.tsinghua.edu.cn/zh/yuy/;", "dblp": "384/4207;;;", "google_scholar": ";;;", "orcid": ";;;0000-0003-3450-7881", "linkedin": ";;;", "or_profile": "~Xiaoang_Xu1;~Zhiyi_Hou1;~Yu_Yang11;~Liu_Yiran2", "aff": "Harbin University of Science and Technology;Harbin Institute of Technology;;Tsinghua University", "aff_domain": "hrbust.edu.cn;stu.hit.edu.cn;;tsinghua.edu.cn", "position": "Undergrad student;Undergrad student;;PhD student", "bibtex": "@inproceedings{\nliu2024causality,\ntitle={Causality Based Front-door Defense Against Backdoor Attack on Language Models},\nauthor={Yiran Liu and Xiaoang Xu and Zhiyi Hou and Yang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dmHHVcHFdM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 610820, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15739209338132758555&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "hrbust.edu.cn;stu.hit.edu.cn;;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Harbin University of Science and Technology;Harbin Institute of Technology;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.hust.edu.cn;http://www.hit.edu.cn/;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HUST;HIT;THU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "ACPO: A Policy Optimization Algorithm for Average MDPs with Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33535", "id": "dmfvHU1LNF", "proceeding": "https://proceedings.mlr.press/v235/agnihotri24a.html", "pdf": "https://openreview.net/pdf?id=dmfvHU1LNF", "openreview": "https://openreview.net/forum?id=dmfvHU1LNF", "author_site": "Akhil Agnihotri, Rahul Jain, Haipeng Luo", "tldr": "", "abstract": "Reinforcement Learning (RL) for constrained MDPs (CMDPs) is an increasingly important problem for various applications. Often, the average criterion is more suitable than the discounted criterion. Yet, RL for average-CMDPs (ACMDPs) remains a challenging problem. Algorithms designed for discounted constrained RL problems often do not perform well for the average CMDP setting. In this paper, we introduce a new policy optimization with function approximation algorithm for constrained MDPs with the average criterion.
The Average-Constrained Policy Optimization (ACPO) algorithm is inspired by trust region-based policy optimization algorithms. We develop basic sensitivity theory for average CMDPs, and then use the corresponding bounds in the design of the algorithm. We provide theoretical guarantees on its performance, and through extensive experimental work in various challenging OpenAI Gym environments, show its superior empirical performance when compared to other state-of-the-art algorithms adapted for the ACMDPs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Akhil Agnihotri;Rahul Jain;Haipeng Luo", "authorids": "~Akhil_Agnihotri1;~Rahul_Jain1;~Haipeng_Luo1", "gender": "M;M;M", "homepage": "http://agnihotriakhil.github.io/;http://www.rahuljain.net;https://haipeng-luo.net/", "dblp": "248/8264;42/4430-2.html;62/2576", "google_scholar": "Kf1o27gAAAAJ;NIj18UQAAAAJ;ct2hw4UAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Akhil_Agnihotri1;~Rahul_Jain1;~Haipeng_Luo1", "aff": "University of Southern California;University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu;usc.edu", "position": "PhD student;Professor;Associate Professor", "bibtex": "@inproceedings{\nagnihotri2024acpo,\ntitle={{ACPO}: A Policy Optimization Algorithm for Average {MDP}s with Constraints},\nauthor={Akhil Agnihotri and Rahul Jain and Haipeng Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dmfvHU1LNF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4197754, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13592099661361270441&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "usc.edu;usc.edu;usc.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Exploration by Optimization with Hybrid Regularizers: Logarithmic Regret with Adversarial Robustness in Partial Monitoring", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33534", "id": "dplgaRn4Ae", "proceeding": "https://proceedings.mlr.press/v235/tsuchiya24a.html", "pdf": "https://openreview.net/pdf?id=dplgaRn4Ae", "openreview": "https://openreview.net/forum?id=dplgaRn4Ae", "author_site": "Taira Tsuchiya, Shinji Ito, Junya Honda", "tldr": "", "abstract": "Partial monitoring is a generic framework of online decision-making problems with limited feedback. To make decisions from such limited feedback, it is necessary to find an appropriate distribution for exploration. Recently, a powerful approach for this purpose, exploration by optimization (ExO), was proposed, which achieves optimal bounds in adversarial environments with follow-the-regularized-leader for a wide range of online decision-making problems. However, a naive application of ExO in stochastic environments significantly degrades regret bounds. To resolve this issue in locally observable games, we first establish a new framework and analysis for ExO with a hybrid regularizer. 
This development allows us to significantly improve existing regret bounds of best-of-both-worlds (BOBW) algorithms, which achieves nearly optimal bounds both in stochastic and adversarial environments. In particular, we derive a stochastic regret bound of $O(\\sum_{a \\neq a^*} k^2 m^2 \\log T / \\Delta_a)$, where $k$, $m$, and $T$ are the numbers of actions, observations and rounds, $a^*$ is an optimal action, and $\\Delta_a$ is the suboptimality gap for action $a$. This bound is roughly $\\Theta(k^2 \\log T)$ times smaller than existing BOBW bounds. In addition, for globally observable games, we provide a new BOBW algorithm with the first $O(\\log T)$ stochastic bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taira Tsuchiya;Shinji Ito;Junya Honda", "authorids": "~Taira_Tsuchiya1;~Shinji_Ito1;~Junya_Honda1", "gender": "M;M;M", "homepage": "https://tsuchhiii.github.io/;https://researchmap.jp/shinji_ito?lang=en;http://stat.sys.i.kyoto-u.ac.jp/honda/index.html", "dblp": "226/5536;49/852;56/9070", "google_scholar": "https://scholar.google.co.jp/citations?view_op=list_works;https://scholar.google.co.jp/citations?user=GX0V06wAAAAJ;https://scholar.google.co.jp/citations?user=Aw8OrxQAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Taira_Tsuchiya1;~Shinji_Ito1;~Junya_Honda1", "aff": "The University of Tokyo;NEC;Kyoto University", "aff_domain": "u-tokyo.ac.jp;nec.com;kyoto-u.ac.jp", "position": "Assistant Professor;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\ntsuchiya2024exploration,\ntitle={Exploration by Optimization with Hybrid Regularizers: Logarithmic Regret with Adversarial Robustness in Partial Monitoring},\nauthor={Taira Tsuchiya and Shinji Ito and Junya Honda},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dplgaRn4Ae}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 244238, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8764758004867074684&as_sdt=5,38&sciodt=0,38&hl=en", "gs_version_total": 8, "email": "u-tokyo.ac.jp;nec.com;kyoto-u.ac.jp", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Tokyo;NEC Corporation;Kyoto University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.nec.com;https://www.kyoto-u.ac.jp", "aff_unique_abbr": "UTokyo;NEC;Kyoto U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "An Improved Finite-time Analysis of Temporal Difference Learning with Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33533", "id": "dqdctVbSfs", "proceeding": "https://proceedings.mlr.press/v235/ke24b.html", "pdf": "https://openreview.net/pdf?id=dqdctVbSfs", "openreview": "https://openreview.net/forum?id=dqdctVbSfs", "author_site": "Zhifa Ke, Zaiwen Wen, Junyu Zhang", "tldr": "", "abstract": "Temporal difference (TD) learning algorithms with neural network function parameterization have well-established empirical success in many practical large-scale reinforcement learning tasks. However, theoretical understanding of these algorithms remains challenging due to the nonlinearity of the action-value approximation. 
In this paper, we develop an improved non-asymptotic analysis of the neural TD method with a general $L$-layer neural network. New proof techniques are developed and an improved $\\tilde{\\mathcal{O}}(\\epsilon^{-1})$ sample complexity is derived. To the best of our knowledge, this is the first finite-time analysis of neural TD that achieves an $\\tilde{\\mathcal{O}}(\\epsilon^{-1})$ complexity under the Markovian sampling, as opposed to the best known $\\tilde{\\mathcal{O}}(\\epsilon^{-2})$ complexity in the existing literature.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhifa Ke;Zaiwen Wen;Junyu Zhang", "authorids": "~Zhifa_Ke1;~Zaiwen_Wen1;~Junyu_Zhang1", "gender": "M;M;M", "homepage": ";http://bicmr.pku.edu.cn/~wenzw;", "dblp": ";26/8184;", "google_scholar": ";QfxrxDoAAAAJ;bsN1uT0AAAAJ", "orcid": ";;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAAEBpXh4Botlqkn9Kw5BRTy1SVZOkMTJbaeM;;", "or_profile": "~Zhifa_Ke1;~Zaiwen_Wen1;~Junyu_Zhang1", "aff": "Peking University;Peking University;National University of Singapore", "aff_domain": "pku.edu.cn;pku.edu.cn;nus.edu.sg", "position": "MS student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nke2024an,\ntitle={An Improved Finite-time Analysis of Temporal Difference Learning with Deep Neural Networks},\nauthor={Zhifa Ke and Zaiwen Wen and Junyu Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dqdctVbSfs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1047104, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14795575760907293739&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;pku.edu.cn;nus.edu.sg", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Peking University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "Peking U;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Offline Transition Modeling via Contrastive Energy Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33532", "id": "dqpg8jdA2w", "proceeding": "https://proceedings.mlr.press/v235/chen24w.html", "pdf": "https://openreview.net/pdf?id=dqpg8jdA2w", "openreview": "https://openreview.net/forum?id=dqpg8jdA2w", "author_site": "Ruifeng Chen, Chengxing Jia, Zefang Huang, Tian-Shuo Liu, Xu-Hui Liu, Yang Yu", "tldr": "", "abstract": "Learning a high-quality transition model is of great importance for sequential decision-making tasks, especially in offline settings. Nevertheless, the complex behaviors of transition dynamics in real-world environments pose challenges for the standard forward models because of their inductive bias towards smooth regressors, conflicting with the inherent nature of transitions such as discontinuity or large curvature. In this work, we propose to model the transition probability implicitly through a scalar-valued energy function, which enables not only flexible distribution prediction but also capturing complex transition behaviors. The Energy-based Transition Models (ETM) are shown to accurately fit the discontinuous transition functions and better generalize to out-of-distribution transition data.
Furthermore, we demonstrate that energy-based transition models improve the evaluation accuracy and significantly outperform other off-policy evaluation methods in the DOPE benchmark. Finally, we show that energy-based transition models also benefit reinforcement learning and outperform prior offline RL algorithms in D4RL Gym-Mujoco tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruifeng Chen;Chengxing Jia;Zefang Huang;Tian-Shuo Liu;Xu-Hui Liu;Yang Yu", "authorids": "~Ruifeng_Chen1;~Chengxing_Jia1;~Zefang_Huang1;~Tian-Shuo_Liu1;~Xu-Hui_Liu1;~Yang_Yu5", "gender": "M;M;M;M;;M", "homepage": "http://www.lamda.nju.edu.cn/chenrf/;http://www.lamda.nju.edu.cn/jiacx/;https://hzffrank.github.io;https://github.com/LTSure;http://www.lamda.nju.edu.cn/liuxh/;http://www.lamda.nju.edu.cn/yuy", "dblp": "https://dblp.uni-trier.de/pid/58/10097-3;;231/3939;;292/7577;46/2181-1", "google_scholar": ";;;;;PG2lDSwAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Ruifeng_Chen1;~Chengxing_Jia1;~Zefang_Huang1;~Tian-Shuo_Liu1;~Xu-Hui_Liu1;~Yang_Yu2", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing university;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;smail.nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;PhD student;Undergrad student;MS student;PhD student;Professor", "bibtex": "@inproceedings{\nchen2024offline,\ntitle={Offline Transition Modeling via Contrastive Energy Learning},\nauthor={Ruifeng Chen and Chengxing Jia and Zefang Huang and Tian-Shuo Liu and Xu-Hui Liu and Yang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dqpg8jdA2w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5058948, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2537124054256549109&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "nju.edu.cn;nju.edu.cn;smail.nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Does Label Smoothing Help Deep Partial Label Learning?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33531", "id": "drjjxmi2Ha", "proceeding": "https://proceedings.mlr.press/v235/gong24b.html", "pdf": "https://openreview.net/pdf?id=drjjxmi2Ha", "openreview": "https://openreview.net/forum?id=drjjxmi2Ha", "author_site": "Xiuwen Gong, Nitin Bisht, Guandong Xu", "tldr": "", "abstract": "Although deep partial label learning (deep PLL) classifiers have shown their competitive performance, they are heavily influenced by noisy false-positive labels, leading to poorer performance as the training progresses. Meanwhile, existing deep PLL research lacks a theoretical guarantee on the analysis of the correlation between label noise (or ambiguity degree) and classification performance. This paper addresses the above limitations with label smoothing (LS) from both theoretical and empirical aspects. In theory, we prove lower and upper bounds of the expected risk to show that label smoothing can help deep PLL.
We further derive the optimal smoothing rate to investigate the conditions, i.e., when label smoothing benefits deep PLL. In practice, we design a benchmark solution and a novel optimization algorithm called Label Smoothing-based Partial Label Learning (LS-PLL). Extensive experimental results on benchmark PLL datasets and various deep architectures validate that label smoothing does help deep PLL in improving classification performance and learning distinguishable representations, and the best results can be achieved when the empirical smoothing rate approximately matches the theoretically optimal smoothing rate. Code is publicly available at https://github.com/kalpiree/LS-PLL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiuwen Gong;Nitin Bisht;Guandong Xu", "authorids": "~Xiuwen_Gong2;nitin.bisht@student.uts.edu.au;~Guandong_Xu2", "gender": "F;;M", "homepage": ";;https://profiles.uts.edu.au/Guandong.Xu", "dblp": "160/9976;;https://dblp.uni-trier.de/pid/59/2340.html", "google_scholar": ";;https://scholar.google.com.au/citations?user=kcrdCq4AAAAJ", "orcid": "0000-0002-1078-1571;;0000-0003-4493-6663", "linkedin": ";;guandong-xu-7a560325/", "or_profile": "~Xiuwen_Gong2;nitin.bisht@student.uts.edu.au;~Guandong_Xu2", "aff": ";;University of Technology Sydney", "aff_domain": ";;uts.edu.au", "position": ";;Full Professor", "bibtex": "@inproceedings{\ngong2024does,\ntitle={Does Label Smoothing Help Deep Partial Label Learning?},\nauthor={Xiuwen Gong and Nitin Bisht and Guandong Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=drjjxmi2Ha}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 653687, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7464267710296608031&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";;uts.edu.au", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Technology Sydney", "aff_unique_dep": "", "aff_unique_url": "https://www.uts.edu.au", "aff_unique_abbr": "UTS", "aff_country_unique_index": "0", "aff_country_unique": "Australia" }, { "title": "Community-Invariant Graph Contrastive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33530", "id": "dskLpg8WFb", "proceeding": "https://proceedings.mlr.press/v235/tan24b.html", "pdf": "https://openreview.net/pdf?id=dskLpg8WFb", "openreview": "https://openreview.net/forum?id=dskLpg8WFb", "author_site": "Shiyin Tan, Dongyuan Li, Renhe Jiang, Ying Zhang, Manabu Okumura", "tldr": "", "abstract": "Graph augmentation has received great attention in recent years for graph contrastive learning (GCL) to learn well-generalized node/graph representations. However, mainstream GCL methods often favor randomly disrupting graphs for augmentation, which shows limited generalization and inevitably leads to the corruption of high-level graph information, i.e., the graph community. Moreover, current knowledge-based graph augmentation methods can only focus on either topology or node features, causing the model to lack robustness against various types of noise. To address these limitations, this research investigated the role of the graph community in graph augmentation and identified its crucial advantage for learnable graph augmentation.
Based on our observations, we propose a community-invariant GCL framework to maintain graph community structure during learnable graph augmentation. By maximizing the spectral changes, this framework unifies the constraints of both topology and feature augmentation, enhancing the model's robustness. Empirical evidence on 21 benchmark datasets demonstrates the exclusive merits of our framework. Code is released on Github (https://github.com/ShiyinTan/CI-GCL.git).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiyin Tan;Dongyuan Li;Renhe Jiang;Ying Zhang;Manabu Okumura", "authorids": "~Shiyin_Tan1;~Dongyuan_Li1;~Renhe_Jiang1;~Ying_Zhang16;~Manabu_Okumura2", "gender": "M;M;M;F;M", "homepage": "https://scholar.google.com/citations?user=0csQTusAAAAJ&hl=en;https://clearloveyuan.github.io/;https://www.renhejiang.com/;https://zhangying9128.github.io/;http://lr-www.pi.titech.ac.jp/wp/", "dblp": "283/3406;23/9793;213/1173;13/6769-65;79/125", "google_scholar": "0csQTusAAAAJ;Pgo9ZZ0AAAAJ;Yo2lwasAAAAJ;tbDNsHsAAAAJ;NpQMX_8AAAAJ", "orcid": "0000-0001-8316-2838;0000-0002-4462-3563;0000-0003-2593-4638;0009-0000-9627-8768;0009-0001-7730-1536", "linkedin": ";dongyuan-li-2471b726b/?originalSubdomain=jp;renhejiang/;;", "or_profile": "~Shiyin_Tan1;~Dongyuan_Li1;~Renhe_Jiang1;~Ying_Zhang16;~Manabu_Okumura1", "aff": "Tokyo Institute of Technology;Tokyo Institute of Technology;The University of Tokyo;Tokyo Institute of Technology, Tokyo Institute of Technology;Tokyo Institute of Technology, Tokyo Institute of Technology", "aff_domain": "titech.ac.jp;titech.ac.jp;u-tokyo.ac.jp;titech.ac.jp;titech.ac.jp", "position": "PhD student;PhD student;Lecturer;Postdoc;Full Professor", "bibtex": "@inproceedings{\ntan2024communityinvariant,\ntitle={Community-Invariant Graph Contrastive Learning},\nauthor={Shiyin Tan and Dongyuan Li and Renhe Jiang and Ying Zhang and Manabu Okumura},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dskLpg8WFb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3818374, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11579197140121036864&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "titech.ac.jp;titech.ac.jp;u-tokyo.ac.jp;titech.ac.jp;titech.ac.jp", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Tokyo Institute of Technology;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.titech.ac.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Titech;UTokyo", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Position: Automatic Environment Shaping is the Next Frontier in RL", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33529", "id": "dslUyy1rN4", "proceeding": "https://proceedings.mlr.press/v235/park24i.html", "pdf": "https://openreview.net/pdf?id=dslUyy1rN4", "openreview": "https://openreview.net/forum?id=dslUyy1rN4", "author_site": "Younghyo Park, Gabriel Margolis, Pulkit Agrawal", "tldr": "", "abstract": "Many roboticists dream of presenting a robot with a task in the evening and returning the next morning to find the robot capable of solving the task. What is preventing us from achieving this? 
Sim-to-real reinforcement learning (RL) has achieved impressive performance on challenging robotics tasks, but requires substantial human effort to set up the task in a way that is amenable to RL. It is our position that algorithmic improvements in policy optimization and other ideas should be guided towards resolving the primary bottleneck of shaping the training environment, i.e., designing observations, actions, rewards and simulation dynamics. Most practitioners tune not the RL algorithm but other environment parameters to obtain a desirable controller. We posit that scaling RL to diverse robotic tasks will only be achieved if the community focuses on automating environment shaping procedures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Younghyo Park;Gabriel B. Margolis;Pulkit Agrawal", "authorids": "~Younghyo_Park1;~Gabriel_B._Margolis1;~Pulkit_Agrawal1", "gender": ";M;M", "homepage": "https://younghyopark.me/;https://people.eecs.berkeley.edu/~pulkitag/;https://gmargo11.github.io/", "dblp": ";149/2672;305/0205", "google_scholar": ";UpZmJI0AAAAJ;Jzt5uNAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Younghyo_Park1;~Pulkit_Agrawal1;~Gabriel_B_Margolis1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;Assistant Professor;PhD Student", "bibtex": "@inproceedings{\npark2024position,\ntitle={Position: Automatic Environment Shaping is the Next Frontier in {RL}},\nauthor={Younghyo Park and Gabriel B. Margolis and Pulkit Agrawal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dslUyy1rN4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4061437, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13804184309622785733&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Feedback Efficient Online Fine-Tuning of Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33528", "id": "dtVlc9ybTm", "proceeding": "https://proceedings.mlr.press/v235/uehara24a.html", "pdf": "https://openreview.net/pdf?id=dtVlc9ybTm", "openreview": "https://openreview.net/forum?id=dtVlc9ybTm", "author_site": "Masatoshi Uehara, Yulai Zhao, Kevin Black, Ehsan Hajiramezanali, Gabriele Scalia, Nathaniel Diamant, Alex Tseng, Sergey Levine, Tommaso Biancalani", "tldr": "", "abstract": "Diffusion models excel at modeling complex data distributions, including those of images, proteins, and small molecules. However, in many cases, our goal is to model parts of the distribution that maximize certain properties: for example, we may want to generate images with high aesthetic quality, or molecules with high bioactivity. It is natural to frame this as a reinforcement learning (RL) problem, in which the objective is to finetune a diffusion model to maximize a reward function that corresponds to some property.
Even with access to online queries of the ground-truth reward function, efficiently discovering high-reward samples can be challenging: they might have a low probability in the initial distribution, and there might be many infeasible samples that do not even have a well-defined reward (e.g., unnatural images or physically impossible molecules). In this work, we propose a novel reinforcement learning procedure that efficiently explores on the manifold of feasible samples. We present a theoretical analysis providing a regret guarantee, as well as empirical validation across three domains: images, biological sequences, and molecules.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Masatoshi Uehara;Yulai Zhao;Kevin Black;Ehsan Hajiramezanali;Gabriele Scalia;Nathaniel Lee Diamant;Alex M Tseng;Sergey Levine;Tommaso Biancalani", "authorids": "~Masatoshi_Uehara1;~Yulai_Zhao1;~Kevin_Black2;~Ehsan_Hajiramezanali1;~Gabriele_Scalia1;~Nathaniel_Lee_Diamant1;~Alex_M_Tseng1;~Sergey_Levine1;~Tommaso_Biancalani1", "gender": "M;M;;M;;;M;M;Non-Binary", "homepage": "https://www.masatoshiuehara.com/;https://yulaizhao.com/;https://kevin.black;http://ehsanhajiramezanali.github.io/;;;https://people.eecs.berkeley.edu/~svlevine/;;https://alextseng.net/", "dblp": "225/6517;64/6357-2;66/9687;225/3486;201/9258;290/2075;80/7594;;", "google_scholar": "https://scholar.google.co.jp/citations?user=xuLKJboAAAAJ;r-mWYj0AAAAJ;axX7PCwAAAAJ;20I_DMoAAAAJ;MxeFvewAAAAJ;;8R35rCwAAAAJ;https://scholar.google.it/citations?user=s_qd9x0AAAAJ;", "orcid": "0000-0001-9017-3105;0000-0002-6930-3590;;;0000-0003-3305-9220;0000-0002-1738-304X;;;0000-0002-5191-1926", "linkedin": ";yulaizhao/;;ehsan-hajiramezanali-978a3b52/;gabriele-scalia;nathaniel-diamant-6b35b0106;;;", "or_profile": "~Masatoshi_Uehara1;~Yulai_Zhao1;~Kevin_Black2;~Ehsan_Hajiramezanali1;~Gabriele_Scalia1;~Nathaniel_Lee_Diamant1;~Sergey_Levine1;~Tommaso_Biancalani1;~Alex_Tseng1", "aff": "Genentech ;Princeton University;University of California, Berkeley;Genentech;Genentech;genentech;Google;Genentech;Genentech", "aff_domain": "gene.com;princeton.edu;berkeley.edu;gene.come;gene.com;gene.com;google.com;gene.com;gene.com", "position": "Researcher;PhD student;PhD student;Principal Researcher;Researcher;Researcher;Research Scientist;Director;Researcher", "bibtex": "@inproceedings{\nuehara2024feedback,\ntitle={Feedback Efficient Online Fine-Tuning of Diffusion Models},\nauthor={Masatoshi Uehara and Yulai Zhao and Kevin Black and Ehsan Hajiramezanali and Gabriele Scalia and Nathaniel Lee Diamant and Alex M Tseng and Sergey Levine and Tommaso Biancalani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dtVlc9ybTm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6736369, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3450289086867114746&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "gene.com;princeton.edu;berkeley.edu;gene.come;gene.com;gene.com;google.com;gene.com;gene.com", "author_num": 9, "aff_unique_index": "0;1;2;0;0;0;3;0;0", "aff_unique_norm": "Genentech;Princeton University;University of California, Berkeley;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.genentech.com;https://www.princeton.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "Genentech;Princeton;UC Berkeley;Google", 
"aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Repeat After Me: Transformers are Better than State Space Models at Copying", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33527", "id": "duRRoGeoQT", "proceeding": "https://proceedings.mlr.press/v235/jelassi24a.html", "pdf": "https://openreview.net/pdf?id=duRRoGeoQT", "openreview": "https://openreview.net/forum?id=duRRoGeoQT", "author_site": "Samy Jelassi, David Brandfonbrener, Sham Kakade, Eran Malach", "tldr": "", "abstract": "Transformers are the dominant architecture for sequence modeling, but there is growing interest in models that use a fixed-size latent state that does not depend on the sequence length, which we refer to as ''generalized state space models'' (GSSMs). In this paper we show that while GSSMs are promising in terms of inference-time efficiency, they are limited compared to transformer models on tasks that require copying from the input context. We start with a theoretical analysis of the simple task of string copying and prove that a two layer transformer can copy strings of exponential length while GSSMs are fundamentally limited by their fixed-size latent state. Empirically, we find that transformers outperform GSSMs in terms of efficiency and generalization on synthetic tasks that require copying the context. Finally, we evaluate pretrained large language models and find that transformer models dramatically outperform state space models at copying and retrieving information from context. Taken together, these results suggest a fundamental gap between transformers and GSSMs on tasks of practical interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samy Jelassi;David Brandfonbrener;Sham M. Kakade;eran malach", "authorids": "~Samy_Jelassi1;~David_Brandfonbrener1;~Sham_M._Kakade1;~eran_malach1", "gender": "M;M;M;M", "homepage": "https://sjelassi.github.io/;https://davidbrandfonbrener.github.io;https://shamulent.github.io;", "dblp": "222/3149;214/9461;s/SMKakade;202/2566", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=wb-DKCIAAAAJ;I15dUOwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Samy_Jelassi1;~David_Brandfonbrener1;~Sham_M._Kakade1;~eran_malach1", "aff": "Harvard University;Harvard University;Harvard University;Harvard University", "aff_domain": "harvard.edu;harvard.edu;harvard.edu;harvard.edu", "position": "Postdoc;Postdoc;Full Professor;Postdoc", "bibtex": "@inproceedings{\njelassi2024repeat,\ntitle={Repeat After Me: Transformers are Better than State Space Models at Copying},\nauthor={Samy Jelassi and David Brandfonbrener and Sham M. 
Kakade and eran malach},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=duRRoGeoQT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 836621, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13131620499995952578&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "harvard.edu;harvard.edu;harvard.edu;harvard.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Slot Abstractors: Toward Scalable Abstract Visual Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33526", "id": "duyl8sy8qV", "proceeding": "https://proceedings.mlr.press/v235/mondal24a.html", "pdf": "https://openreview.net/pdf?id=duyl8sy8qV", "openreview": "https://openreview.net/forum?id=duyl8sy8qV", "author_site": "Shanka Subhra Mondal, Jonathan Cohen, Taylor Webb", "tldr": "", "abstract": "Abstract visual reasoning is a characteristically human ability, allowing the identification of relational patterns that are abstracted away from object features, and the systematic generalization of those patterns to unseen problems. Recent work has demonstrated strong systematic generalization in visual reasoning tasks involving multi-object inputs, through the integration of slot-based methods used for extracting object-centric representations coupled with strong inductive biases for relational abstraction. However, this approach was limited to problems containing a single rule, and was not scalable to visual reasoning problems containing a large number of objects. Other recent work proposed Abstractors, an extension of Transformers that incorporates strong relational inductive biases, thereby inheriting the Transformer's scalability and multi-head architecture, but it has yet to be demonstrated how this approach might be applied to multi-object visual inputs. Here we combine the strengths of the above approaches and propose Slot Abstractors, an approach to abstract visual reasoning that can be scaled to problems involving a large number of objects and multiple relations among them. The approach displays state-of-the-art performance across four abstract visual reasoning tasks, as well as an abstract reasoning task involving real-world images.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shanka Subhra Mondal;Jonathan D. 
Cohen;Taylor Whittington Webb", "authorids": "~Shanka_Subhra_Mondal1;~Jonathan_D._Cohen1;~Taylor_Whittington_Webb1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/shankasubhramondal/;https://scholar.google.com/citations?user=WCmrJoQAAAAJ&hl=en;https://jdc.princeton.edu", "dblp": "241/7065;183/6144;31/5509-3", "google_scholar": "5V-xQYUAAAAJ;WCmrJoQAAAAJ;https://scholar.google.com.tw/citations?user=NCkkQAMAAAAJ", "orcid": ";;0000-0003-2316-0763", "linkedin": "shanka-subhra-mondal-057622147;;", "or_profile": "~Shanka_Subhra_Mondal1;~Taylor_Whittington_Webb1;~Jonathan_Cohen1", "aff": "Princeton University;University of California, Los Angeles;", "aff_domain": "princeton.edu;ucla.edu;", "position": "PhD student;Postdoc;", "bibtex": "@inproceedings{\nmondal2024slot,\ntitle={Slot Abstractors: Toward Scalable Abstract Visual Reasoning},\nauthor={Shanka Subhra Mondal and Jonathan D. Cohen and Taylor Whittington Webb},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=duyl8sy8qV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1533731, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6447062608524516838&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "princeton.edu;ucla.edu;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Princeton University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.ucla.edu", "aff_unique_abbr": "Princeton;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Robust Inverse Graphics via Probabilistic Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33525", "id": "dwWef5w2cR", "proceeding": "https://proceedings.mlr.press/v235/le24b.html", "pdf": "https://openreview.net/pdf?id=dwWef5w2cR", "openreview": "https://openreview.net/forum?id=dwWef5w2cR", "author_site": "Tuan Anh Le, Pavel Sountsov, Matthew Hoffman, Ben Lee, Brian Patton, Rif Saurous", "tldr": "", "abstract": "How do we infer a 3D scene from a single image in the presence of corruptions like rain, snow or fog? Straightforward domain randomization relies on knowing the family of corruptions ahead of time. Here, we propose a Bayesian approach---dubbed robust inverse graphics (RIG)---that relies on a strong scene prior and an uninformative uniform corruption prior, making it applicable to a wide range of corruptions. Given a single image, RIG performs posterior inference jointly over the scene and the corruption. We demonstrate this idea by training a neural radiance field (NeRF) scene prior and using a secondary NeRF to represent the corruptions over which we place an uninformative prior. RIG, trained only on clean data, outperforms depth estimators and alternative NeRF approaches that perform point estimation instead of full inference. The results hold for a number of scene prior architectures based on normalizing flows and diffusion models. For the latter, we develop reconstruction-guidance with auxiliary latents (ReGAL)---a diffusion conditioning algorithm that is applicable in the presence of auxiliary latent variables such as the corruption. 
RIG demonstrates how scene priors can be used beyond generation tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tuan Anh Le;Pavel Sountsov;Matthew Douglas Hoffman;Ben Lee;Brian Patton;Rif A. Saurous", "authorids": "~Tuan_Anh_Le1;~Pavel_Sountsov2;~Matthew_Douglas_Hoffman1;~Ben_Lee2;~Brian_Patton1;~Rif_A._Saurous1", "gender": "M;;M;M;M;M", "homepage": "https://www.tuananhle.co.uk;http://people.brandeis.edu/~sl157/;http://www.matthewdhoffman.com;;;", "dblp": "76/10097-1;;07/4433;;;186/7923", "google_scholar": "https://scholar.google.co.uk/citations?user=tkceMM0AAAAJ;;IeHKeGYAAAAJ;t8PgXFYAAAAJ;UxWSR3oAAAAJ;QNnjg7YAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Tuan_Anh_Le1;~Pavel_Sountsov2;~Matthew_Douglas_Hoffman1;~Ben_Lee2;~Brian_Patton1;~Rif_A._Saurous1", "aff": "Google Research;;Google;Google;Google;Google", "aff_domain": "google.com;;google.com;google.com;google.com;google.com", "position": "Research Scientist;;Research Scientist;Researcher;Software engineer;Engineer, Director", "bibtex": "@inproceedings{\nle2024robust,\ntitle={Robust Inverse Graphics via Probabilistic Inference},\nauthor={Tuan Anh Le and Pavel Sountsov and Matthew Douglas Hoffman and Ben Lee and Brian Patton and Rif A. Saurous},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dwWef5w2cR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9002611, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eq3fcie0keEJ:scholar.google.com/&scioq=Robust+Inverse+Graphics+via+Probabilistic+Inference&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "google.com;;google.com;google.com;google.com;google.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Imitation Learning from Purified Demonstrations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33524", "id": "dyfsPNuYCk", "proceeding": "https://proceedings.mlr.press/v235/wang24m.html", "pdf": "https://openreview.net/pdf?id=dyfsPNuYCk", "openreview": "https://openreview.net/forum?id=dyfsPNuYCk", "author_site": "Yunke Wang, Minjing Dong, Yukun Zhao, Bo Du, Chang Xu", "tldr": "", "abstract": "Imitation learning has emerged as a promising approach for addressing sequential decision-making problems, with the assumption that expert demonstrations are optimal. However, in real-world scenarios, demonstrations are often imperfect, leading to challenges in the effectiveness of imitation learning. While existing research has focused on optimizing with imperfect demonstrations, the training typically requires a certain proportion of optimal demonstrations to guarantee performance. To tackle these problems, we propose to purify the potential noise in imperfect demonstrations first, and subsequently conduct imitation learning from these purified demonstrations. Motivated by the success of diffusion models, we introduce a two-step purification via a diffusion process.
In the first step, we apply a forward diffusion process to smooth potential noise in imperfect demonstrations by introducing additional noise. Subsequently, a reverse generative process is utilized to recover the optimal demonstrations from the diffused ones. We provide theoretical evidence supporting our approach, demonstrating that the distance between the purified and optimal demonstration can be bounded. Empirical results on MuJoCo and RoboSuite demonstrate the effectiveness of our method across different aspects.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunke Wang;Minjing Dong;Yukun Zhao;Bo Du;Chang Xu", "authorids": "~Yunke_Wang1;~Minjing_Dong1;~Yukun_Zhao5;~Bo_Du3;~Chang_Xu4", "gender": "Not Specified;M;M;M;", "homepage": "https://yunke-wang.github.io/;https://www.cs.cityu.edu.hk/~minjdong/;https://maths.whu.edu.cn/;;https://sydney.edu.au/engineering/about/our-people/academic-staff/c-xu.html", "dblp": "165/9106;246/2900.html;;70/6443-1.html;97/2966-2", "google_scholar": "m4wbcOsAAAAJ;https://scholar.google.com.au/citations?user=gJJRqlsAAAAJ;;Shy1gnMAAAAJ;N4F_3eoAAAAJ", "orcid": "0009-0003-9796-530X;0009-0003-1717-818X;;;0000-0002-4756-0609", "linkedin": ";;;;", "or_profile": "~Yunke_Wang1;~Minjing_Dong1;~Yukun_Zhao5;~Bo_Du1;~Charles_Xu1", "aff": "Wuhan University;City University of Hong Kong;Wuhan University;Wuhan University;University of Sydney", "aff_domain": "whu.edu.cn;cityu.edu.hk;whu.edu.cn;whu.edu.cn;sydney.eud.au", "position": "PhD student;Assistant Professor;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024imitation,\ntitle={Imitation Learning from Purified Demonstrations},\nauthor={Yunke Wang and Minjing Dong and Yukun Zhao and Bo Du and Chang Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dyfsPNuYCk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2285999, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5332409431866289941&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "whu.edu.cn;cityu.edu.hk;whu.edu.cn;whu.edu.cn;sydney.eud.au", "author_num": 5, "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Wuhan University;City University of Hong Kong;University of Sydney", "aff_unique_dep": ";;", "aff_unique_url": "http://www.whu.edu.cn/;https://www.cityu.edu.hk;https://www.sydney.edu.au", "aff_unique_abbr": "WHU;CityU;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;Australia" }, { "title": "Discovering Bias in Latent Space: An Unsupervised Debiasing Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33523", "id": "dztd61efGy", "proceeding": "https://proceedings.mlr.press/v235/adila24a.html", "pdf": "https://openreview.net/pdf?id=dztd61efGy", "openreview": "https://openreview.net/forum?id=dztd61efGy", "author_site": "Dyah Adila, Shuai Zhang, Boran Han, Yuyang Wang", "tldr": "", "abstract": "The question-answering (QA) capabilities of foundation models are highly sensitive to prompt variations, rendering their performance susceptible to superficial, non-meaning-altering changes.
This vulnerability often stems from the model's preference or bias towards specific input characteristics, such as option position or superficial image features in multi-modal settings. We propose to rectify this bias directly in the model's internal representation. Our approach, SteerFair, finds the bias direction in the model's representation space and steers activation values away from it during inference. Specifically, we exploit the observation that bias often adheres to simple association rules, such as the spurious association between the first option and correctness likelihood. Next, we construct demonstrations of these rules from unlabeled samples and use them to identify the bias directions. We empirically show that SteerFair significantly reduces instruction-tuned model performance variance across prompt modifications on three benchmark tasks. Remarkably, our approach surpasses a supervised baseline with 100 labels by an average of 10.86% accuracy points and 12.95 score points and matches the performance with 500 labels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dyah Adila;Shuai Zhang;Boran Han;Bernie Wang", "authorids": "~Dyah_Adila1;~Shuai_Zhang7;~Boran_Han1;~Bernie_Wang1", "gender": "F;;;M", "homepage": ";;;http://web.mit.edu/~ywang02/www/", "dblp": ";;;43/8355-1", "google_scholar": ";;;IKUm624AAAAJ", "orcid": ";;;0000-0002-0291-7184", "linkedin": "dyahadila/;;;", "or_profile": "~Dyah_Adila1;~Shuai_Zhang7;~Boran_Han1;~Bernie_Wang1", "aff": "University of Wisconsin, Madison;;;Amazon", "aff_domain": "wisc.edu;;;amazon.com", "position": "PhD student;;;Principal Researcher", "bibtex": "@inproceedings{\nadila2024discovering,\ntitle={Discovering Bias in Latent Space: An Unsupervised Debiasing Approach},\nauthor={Dyah Adila and Shuai Zhang and Boran Han and Bernie Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dztd61efGy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1283795, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4649565446954952878&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "wisc.edu;;;amazon.com", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.wisc.edu;https://www.amazon.com", "aff_unique_abbr": "UW;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Positive Concave Deep Equilibrium Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33522", "id": "e0SKaKEEdr", "proceeding": "https://proceedings.mlr.press/v235/gabor24a.html", "pdf": "https://openreview.net/pdf?id=e0SKaKEEdr", "openreview": "https://openreview.net/forum?id=e0SKaKEEdr", "author_site": "Mateusz Gabor, Tomasz Piotrowski, Renato L. G. Cavalcante", "tldr": "", "abstract": "Deep equilibrium (DEQ) models are widely recognized as a memory efficient alternative to standard neural networks, achieving state-of-the-art performance in language modeling and computer vision tasks. These models solve a fixed point equation instead of explicitly computing the output, which sets them apart from standard neural networks. 
However, existing DEQ models often lack formal guarantees of the existence and uniqueness of the fixed point, and the convergence of the numerical scheme used for computing the fixed point is not formally established. As a result, DEQ models are potentially unstable in practice. To address these drawbacks, we introduce a novel class of DEQ models called positive concave deep equilibrium (pcDEQ) models. Our approach, which is based on nonlinear Perron-Frobenius theory, enforces nonnegative weights and activation functions that are concave on the positive orthant. By imposing these constraints, we can easily ensure the existence and uniqueness of the fixed point without relying on additional complex assumptions commonly found in the DEQ literature, such as those based on monotone operator theory in convex analysis. Furthermore, the fixed point can be computed with the standard fixed point algorithm, and we provide theoretical guarantees of its geometric convergence, which, in particular, simplifies the training process. Experiments demonstrate the competitiveness of our pcDEQ models against other implicit models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mateusz Gabor;Tomasz Piotrowski;Renato L. G. Cavalcante", "authorids": "~Mateusz_Gabor1;tpiotrowski@is.umk.pl;renato.cavalcante@hhi.fraunhofer.de", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": "z5JaKvYAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mateusz_Gabor1;tpiotrowski@is.umk.pl;renato.cavalcante@hhi.fraunhofer.de", "aff": "Technical University of Wroclaw;;", "aff_domain": "pwr.edu.pl;;", "position": "PhD student;;", "bibtex": "@inproceedings{\ngabor2024positive,\ntitle={Positive Concave Deep Equilibrium Models},\nauthor={Mateusz Gabor and Tomasz Piotrowski and Renato L. G. Cavalcante},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e0SKaKEEdr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 680237, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11079676652490498705&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "pwr.edu.pl;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Wroclaw University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.pwr.edu.pl", "aff_unique_abbr": "WUT", "aff_country_unique_index": "0", "aff_country_unique": "Poland" }, { "title": "Double Variance Reduction: A Smoothing Trick for Composite Optimization Problems without First-Order Gradient", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33521", "id": "e1jPdRJeo7", "proceeding": "https://proceedings.mlr.press/v235/di24b.html", "pdf": "https://openreview.net/pdf?id=e1jPdRJeo7", "openreview": "https://openreview.net/forum?id=e1jPdRJeo7", "author_site": "Hao Di, Haishan Ye, Yueling Zhang, Xiangyu Chang, Guang Dai, Ivor Tsang", "tldr": "", "abstract": "Variance reduction techniques are designed to decrease the sampling variance, thereby accelerating convergence rates of first-order (FO) and zeroth-order (ZO) optimization methods. However, in composite optimization problems, ZO methods encounter an additional variance called the coordinate-wise variance, which stems from the random gradient estimation. 
To reduce this variance, prior works require estimating all partial derivatives, essentially approximating FO information. This approach demands $\\mathcal{O}(d)$ function evaluations ($d$ is the dimension size), which incurs substantial computational costs and is prohibitive in high-dimensional scenarios. This paper proposes the Zeroth-order Proximal Double Variance Reduction ($\\texttt{ZPDVR}$) method, which utilizes the averaging trick to reduce both sampling and coordinate-wise variances. Compared to prior methods, $\\texttt{ZPDVR}$ relies solely on random gradient estimates, calls the stochastic zeroth-order oracle (SZO) in expectation $\\mathcal{O}(1)$ times per iteration, and achieves the optimal $\\mathcal{O}(d(n + \\kappa)\\log (\\frac{1}{\\epsilon}))$ SZO query complexity in the strongly convex and smooth setting, where $\\kappa$ represents the condition number and $\\epsilon$ is the desired accuracy. Empirical results validate $\\texttt{ZPDVR}$'s linear convergence and demonstrate its superior performance over other related methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Di;Haishan Ye;Yueling Zhang;Xiangyu Chang;Guang Dai;Ivor Tsang", "authorids": "~Hao_Di2;~Haishan_Ye2;~Yueling_Zhang3;~Xiangyu_Chang1;~Guang_Dai1;~Ivor_Tsang1", "gender": "M;M;M;M;M;F", "homepage": "https://conscien.top/;;;;https://www.a-star.edu.sg/cfar/about-cfar/management/prof-ivor-tsang;https://orcid.org/0009-0003-3414-4969", "dblp": "95/8967;162/0002.html;90/9705;;35/5873;", "google_scholar": ";;;;rJMOlVsAAAAJ;", "orcid": "0009-0004-1846-9787;;;0000-0002-3529-9087;;", "linkedin": ";;;;;", "or_profile": "~Hao_Di2;~Haishan_Ye2;~Xiangyu_Chang1;~Guang_Dai1;~Ivor_W_Tsang1;~Zhang_Yueling2", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;SGIT AI;A*STAR;Beijing Foreign Studies University", "aff_domain": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;sgcc.com.cn;cfar.a-star.edu.sg;bfsu.edu.cn", "position": "PhD student;Associate Professor;Associate Professor;Principal Researcher;Principal Researcher;Undergrad student", "bibtex": "@inproceedings{\ndi2024double,\ntitle={Double Variance Reduction: A Smoothing Trick for Composite Optimization Problems without First-Order Gradient},\nauthor={Hao Di and Haishan Ye and Yueling Zhang and Xiangyu Chang and Guang Dai and Ivor Tsang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e1jPdRJeo7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 466106, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UwVmfZzvmoUJ:scholar.google.com/&scioq=Double+Variance+Reduction:+A+Smoothing+Trick+for+Composite+Optimization+Problems+without+First-Order+Gradient&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;sgcc.com.cn;cfar.a-star.edu.sg;bfsu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "Xi'an Jiao Tong University;SGIT AI;Agency for Science, Technology and Research;Beijing Foreign Studies University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.xjtu.edu.cn;;https://www.a-star.edu.sg;http://www.bfsu.edu.cn", "aff_unique_abbr": "XJTU;;A*STAR;BFSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2;0", "aff_country_unique": "China;;Singapore" }, { "title": "Decoding Compressed Trust: Scrutinizing the 
Trustworthiness of Efficient LLMs Under Compression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33520", "id": "e3Dpq3WdMv", "proceeding": "https://proceedings.mlr.press/v235/hong24a.html", "pdf": "https://openreview.net/pdf?id=e3Dpq3WdMv", "openreview": "https://openreview.net/forum?id=e3Dpq3WdMv", "author_site": "Junyuan Hong, Jinhao Duan, Chenhui Zhang, Zhangheng Li, Chulin Xie, Kelsey Lieberman, James Diffenderfer, Brian Bartoldson, Ajay Jaiswal, Kaidi Xu, Bhavya Kailkhura, Dan Hendrycks, Dawn Song, Zhangyang \u201cAtlas\u201d Wang, Bo Li", "tldr": "", "abstract": "Compressing high-capability Large Language Models (LLMs) has emerged as a favored strategy for resource-efficient inference. While state-of-the-art (SoTA) compression methods boast impressive advancements in preserving benign task performance, the potential risks of compression in terms of safety and trustworthiness have been largely neglected. This study conducts the first, thorough evaluation of **three (3) leading LLMs** using **five (5) SoTA compression techniques** across **eight (8) trustworthiness dimensions**. Our experiments highlight the intricate interplay between compression and trustworthiness, revealing some interesting patterns. We find that quantization is currently a more effective approach than pruning in achieving efficiency and trustworthiness simultaneously. For instance, a 4-bit quantized model retains the trustworthiness of its original counterpart, but model pruning significantly degrades trustworthiness, even at 50% sparsity. Moreover, employing quantization within a moderate bit range could unexpectedly improve certain trustworthiness dimensions such as ethics and fairness. Conversely, extreme quantization to very low bit levels (3 bits) tends to reduce trustworthiness significantly. This increased risk cannot be uncovered by looking at benign performance alone, in turn mandating comprehensive trustworthiness evaluation in practice. These findings culminate in practical recommendations for simultaneously achieving high utility, efficiency, and trustworthiness in LLMs. Code and models are available at https://decoding-comp-trust.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junyuan Hong;Jinhao Duan;Chenhui Zhang;Zhangheng LI;Chulin Xie;Kelsey Lieberman;James Diffenderfer;Brian R.
Bartoldson;AJAY KUMAR JAISWAL;Kaidi Xu;Bhavya Kailkhura;Dan Hendrycks;Dawn Song;Zhangyang Wang;Bo Li", "authorids": "~Junyuan_Hong1;~Jinhao_Duan1;~Chenhui_Zhang2;~Zhangheng_LI2;~Chulin_Xie1;~Kelsey_Lieberman1;~James_Diffenderfer1;~Brian_R._Bartoldson1;~AJAY_KUMAR_JAISWAL1;~Kaidi_Xu1;~Bhavya_Kailkhura1;~Dan_Hendrycks1;~Dawn_Song1;~Zhangyang_Wang1;~Bo_Li19", "gender": "M;M;M;M;F;F;;M;M;M;;F;M;F;M", "homepage": "https://jyhong.gitlab.io/;https://jinhaoduan.github.io;https://www.danielz.ch/;;;;;https://ajay1994.github.io/;https://kaidixu.com/;https://people.llnl.gov/kailkhura1;;;https://vita-group.github.io;http://boli.cs.illinois.edu/;https://brianbartoldson.wordpress.com/", "dblp": "185/1316;282/2912;;;245/4284;243/2573;188/4110;30/9707;195/8175;132/8938;182/2504;s/DXSong;119/4026;50/3402-26;220/5475", "google_scholar": "7Cbv6doAAAAJ;aWeTAXYAAAAJ;UYxdrBsAAAAJ;https://scholar.google.co.uk/citations?user=NZCLqZMAAAAJ;WeJnzAgAAAAJ;D2UjU3oAAAAJ;nRr24_QAAAAJ;I783HxYAAAAJ;lYK0wlsAAAAJ;SQpJmOgAAAAJ;;;pxFyKAIAAAAJ;K8vJkTcAAAAJ;YdiZoJgAAAAJ", "orcid": "0000-0002-5718-5187;;0000-0003-3915-6099;;;;;;;;;;;;", "linkedin": ";;danielz02/;%E7%AB%A0%E6%81%92-%E6%9D%8E-b1b19711a/;;kelsey-l/;;;;;;;;;", "or_profile": "~Junyuan_Hong1;~Jinhao_Duan1;~Chenhui_Zhang2;~Zhangheng_LI2;~Chulin_Xie1;~Kelsey_Lieberman1;~James_Diffenderfer1;~AJAY_KUMAR_JAISWAL1;~Kaidi_Xu1;~Bhavya_Kailkhura1;~Dan_Hendrycks1;~Dawn_Song1;~Zhangyang_Wang1;~Bo_Li19;~Brian_R_Bartoldson1", "aff": "University of Texas at Austin;Drexel University;Massachusetts Institute of Technology;University of Texas at Austin;University of Illinois, Urbana Champaign;Duke University;Lawrence Livermore National Labs;University of Texas, Austin;Drexel University;Lawrence Livermore National Laboratory;Center for AI Safety;University of California, Berkeley;University of Texas at Austin;University of Illinois, Urbana Champaign;Lawrence Livermore National Labs", "aff_domain": "utexas.edu;drexel.edu;mit.edu;utexas.edu;illinois.edu;cs.duke.edu;llnl.gov;utexas.edu;drexel.edu;llnl.gov;safe.ai;berkeley.edu;utexas.edu;illinois.edu;llnl.gov", "position": "Postdoc;PhD student;PhD student;PhD student;PhD student;PhD student;Researcher;PhD student;Assistant Professor;Research Staff;Executive and Research Director;Full Professor;Associate Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nhong2024decoding,\ntitle={Decoding Compressed Trust: Scrutinizing the Trustworthiness of Efficient {LLM}s Under Compression},\nauthor={Junyuan Hong and Jinhao Duan and Chenhui Zhang and Zhangheng LI and Chulin Xie and Kelsey Lieberman and James Diffenderfer and Brian R. 
Bartoldson and AJAY KUMAR JAISWAL and Kaidi Xu and Bhavya Kailkhura and Dan Hendrycks and Dawn Song and Zhangyang Wang and Bo Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e3Dpq3WdMv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1915625, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 15, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13017637936149030359&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "utexas.edu;drexel.edu;mit.edu;utexas.edu;illinois.edu;cs.duke.edu;llnl.gov;utexas.edu;drexel.edu;llnl.gov;safe.ai;berkeley.edu;utexas.edu;illinois.edu;llnl.gov", "author_num": 15, "aff_unique_index": "0;1;2;0;3;4;5;0;1;5;6;7;0;3;5", "aff_unique_norm": "University of Texas at Austin;Drexel University;Massachusetts Institute of Technology;University of Illinois Urbana-Champaign;Duke University;Lawrence Livermore National Laboratory;Center for AI Safety;University of California, Berkeley", "aff_unique_dep": ";;;;;;;", "aff_unique_url": "https://www.utexas.edu;https://www.drexel.edu;https://web.mit.edu;https://illinois.edu;https://www.duke.edu;https://www.llnl.gov;https://www.centerforaisafety.org;https://www.berkeley.edu", "aff_unique_abbr": "UT Austin;Drexel;MIT;UIUC;Duke;LLNL;;UC Berkeley", "aff_campus_unique_index": "0;0;2;0;3;0;2", "aff_campus_unique": "Austin;;Urbana-Champaign;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33519", "id": "e3geukCBw6", "proceeding": "https://proceedings.mlr.press/v235/qian24a.html", "pdf": "https://openreview.net/pdf?id=e3geukCBw6", "openreview": "https://openreview.net/forum?id=e3geukCBw6", "author_site": "Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, Siliang Tang", "tldr": "", "abstract": "Large Language Models (LLMs) demonstrate remarkable proficiency in comprehending and handling text-based tasks. Many efforts are being made to transfer these attributes to the video modality; such models are termed Video-LLMs. However, existing Video-LLMs can only capture coarse-grained semantics and are unable to effectively handle tasks related to comprehension or localization of specific video segments. In light of these challenges, we propose Momentor, a Video-LLM capable of accomplishing fine-grained temporal understanding tasks. To support the training of Momentor, we design an automatic data generation engine to construct Moment-10M, a large-scale video instruction dataset with segment-level instruction data. We train Momentor on Moment-10M, enabling it to perform segment-level reasoning and localization.
Zero-shot evaluations on several tasks demonstrate that Momentor excels in fine-grained temporally grounded comprehension and localization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Long Qian;Juncheng Li;Yu Wu;Yaobo Ye;Hao Fei;Tat-Seng Chua;Yueting Zhuang;Siliang Tang", "authorids": "~Long_Qian2;~Juncheng_Li3;~Yu_Wu3;~Yaobo_Ye1;~Hao_Fei1;~Tat-Seng_Chua2;~Yueting_Zhuang1;~Siliang_Tang1", "gender": "M;M;M;M;M;M;M;M", "homepage": ";;https://yu-wu.net;https://github.com/loveofguoke;https://haofei.vip/;https://person.zju.edu.cn/yzhuang;https://person.zju.edu.cn/en/siliang;http://www.comp.nus.edu.sg/~chuats/", "dblp": ";182/7674-6;22/0-11;;81/3569-1;;44/5693;", "google_scholar": "o17W2pUAAAAJ;lm9s-QgAAAAJ;23SZHUwAAAAJ;https://scholar.google.com/citations?hl=en;YGDX46AAAAAJ;1RD7UJAAAAAJ;8e7H3PcAAAAJ;https://scholar.google.com.tw/citations?user=Z9DWCBEAAAAJ", "orcid": ";0000-0003-2258-1291;;;0000-0003-3026-6347;;0000-0002-7356-9711;0000-0001-6097-7807", "linkedin": ";;;;;;siliang-tang-4734272a/;", "or_profile": "~Long_Qian2;~Juncheng_Li3;~Yu_Wu3;~Yaobo_Ye1;~Hao_Fei1;~Yueting_Zhuang1;~Siliang_Tang1;~Tat-seng_Chua1", "aff": "Zhejiang University;National University of Singapore;Wuhan University;Zhejiang University;National University of Singapore;Zhejiang University;Zhejiang University;National University of Singapore", "aff_domain": "zju.edu.cn;nus.edu;whu.edu.cn;zju.edu.cn;nus.edu.sg;zju.edu.cn;zju.edu.cn;nus.edu.sg", "position": "MS student;Postdoc;Full Professor;Undergrad student;Postdoc;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nqian2024momentor,\ntitle={Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning},\nauthor={Long Qian and Juncheng Li and Yu Wu and Yaobo Ye and Hao Fei and Tat-Seng Chua and Yueting Zhuang and Siliang Tang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e3geukCBw6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5336004, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12111347250057203814&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;nus.edu;whu.edu.cn;zju.edu.cn;nus.edu.sg;zju.edu.cn;zju.edu.cn;nus.edu.sg", "author_num": 8, "aff_unique_index": "0;1;2;0;1;0;0;1", "aff_unique_norm": "Zhejiang University;National University of Singapore;Wuhan University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.nus.edu.sg;http://www.whu.edu.cn/", "aff_unique_abbr": "ZJU;NUS;WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Position: A Call for Embodied AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33518", "id": "e5admkWKgV", "proceeding": "https://proceedings.mlr.press/v235/paolo24a.html", "pdf": "https://openreview.net/pdf?id=e5admkWKgV", "openreview": "https://openreview.net/forum?id=e5admkWKgV", "author_site": "Giuseppe Paolo, Jonas Gonzalez-Billandon, Bal\u00e1zs K\u00e9gl", "tldr": "", "abstract": "We propose Embodied AI (E-AI) as the next fundamental step in the pursuit of Artificial General Intelligence (AGI), juxtaposing it against current AI advancements, particularly Large Language Models (LLMs). 
We traverse the evolution of the embodiment concept across diverse fields (philosophy, psychology, neuroscience, and robotics) to highlight how E-AI distinguishes itself from the classical paradigm of static learning. By broadening the scope of E-AI, we introduce a theoretical framework based on cognitive architectures, emphasizing perception, action, memory, and learning as essential components of an embodied agent. This framework is aligned with Friston\u2019s active inference principle, offering a comprehensive approach to E-AI development. Despite the progress made in the field of AI, substantial challenges, such as the formulation of a novel AI learning theory and the innovation of advanced hardware, persist. Our discussion lays down a foundational guideline for future E-AI research. Highlighting the importance of creating E-AI agents capable of seamless communication, collaboration, and coexistence with humans and other intelligent entities within real-world environments, we aim to steer the AI community towards addressing the multifaceted challenges and seizing the opportunities that lie ahead in the quest for AGI.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Giuseppe Paolo;Jonas Gonzalez-Billandon;Bal\u00e1zs K\u00e9gl", "authorids": "~Giuseppe_Paolo1;~Jonas_Gonzalez-Billandon1;~Bal\u00e1zs_K\u00e9gl2", "gender": ";M;M", "homepage": "https://www.giupaolo.com/;;https://scholar.google.com/citations?user=s0njcGgAAAAJ&hl=en&oi=ao", "dblp": "198/1004;234/2581;k/BalazsKegl.html", "google_scholar": "https://scholar.google.fr/citations?user=khT6tDsAAAAJ;https://scholar.google.ca/citations?user=2c1jTiYAAAAJ;s0njcGgAAAAJ", "orcid": "0000-0003-4201-5967;;", "linkedin": "gpaolo93/;jonas-gonzalez-billandon;balazskegl", "or_profile": "~Giuseppe_Paolo1;~Jonas_Gonzalez-Billandon1;~Balazs_Kegl1", "aff": "Huawei Technologies Ltd.;Huawei Technologies Ltd.;CNRS (on leave)", "aff_domain": "huawei.com;huawei.com;in2p3.fr", "position": "Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\npaolo2024position,\ntitle={Position: A Call for Embodied {AI}},\nauthor={Giuseppe Paolo and Jonas Gonzalez-Billandon and Bal{\\'a}zs K{\\'e}gl},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e5admkWKgV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 225645, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9204445466745119271&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "huawei.com;huawei.com;in2p3.fr", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Huawei;CNRS", "aff_unique_dep": "Huawei Technologies;", "aff_unique_url": "https://www.huawei.com;https://www.cnrs.fr", "aff_unique_abbr": "Huawei;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;France" }, { "title": "Adaptively Learning to Select-Rank in Online Platforms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33517", "id": "e5tA3Apbmy", "proceeding": "https://proceedings.mlr.press/v235/wang24l.html", "pdf": "https://openreview.net/pdf?id=e5tA3Apbmy", "openreview": "https://openreview.net/forum?id=e5tA3Apbmy", "author_site": "Jingyuan Wang, Perry Dong, Ying Jin, Ruohan Zhan, Zhengyuan Zhou", "tldr": "", "abstract": "Ranking algorithms are fundamental to various online 
platforms, from e-commerce sites to content streaming services. Our research addresses the challenge of adaptively ranking items from a candidate pool for heterogeneous users, a key component in personalizing user experience. We develop a user response model that considers diverse user preferences and the varying effects of item positions, aiming to optimize overall user satisfaction with the ranked list. We frame this problem within a contextual bandits framework, with each ranked list as an action. Our approach incorporates an upper confidence bound to adjust predicted user satisfaction scores and selects the ranking action that maximizes these adjusted scores, efficiently solved via maximum weight imperfect matching. We demonstrate that our algorithm achieves a cumulative regret bound of $O(d\\sqrt{NKT})$ for ranking $K$ out of $N$ items in a $d$-dimensional context space over $T$ rounds, under the assumption that user responses follow a generalized linear model. This regret alleviates dependence on the ambient action space, whose cardinality grows exponentially with $N$ and $K$ (thus rendering direct application of existing adaptive learning algorithms -- such as UCB or Thompson sampling -- infeasible). Experiments conducted on both simulated and real-world datasets demonstrate that our algorithm outperforms the baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingyuan Wang;Perry Dong;Ying Jin;Ruohan Zhan;Zhengyuan Zhou", "authorids": "~Jingyuan_Wang3;~Perry_Dong1;~Ying_Jin4;~Ruohan_Zhan1;~Zhengyuan_Zhou2", "gender": "F;;F;F;M", "homepage": ";;https://ying531.github.io/;https://ruohanzhan.github.io;https://scholar.google.com/citations?user=hiGI9v0AAAAJ&hl=en", "dblp": ";;https://dblp.org/rec/conf/icml/JinWL20;;125/5270", "google_scholar": ";;lT5KFUkAAAAJ;;", "orcid": ";;;;", "linkedin": "sharon-wang-baa05a163/;;;;", "or_profile": "~Jingyuan_Wang3;~Perry_Dong1;~Ying_Jin4;~Ruohan_Zhan1;~Zhengyuan_Zhou2", "aff": "New York University;;Stanford University;Hong Kong University of Science and Technology;New York University", "aff_domain": "nyu.edu;;stanford.edu;ust.hk;nyu.edu", "position": "PhD student;;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024adaptively,\ntitle={Adaptively Learning to Select-Rank in Online Platforms},\nauthor={Jingyuan Wang and Perry Dong and Ying Jin and Ruohan Zhan and Zhengyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e5tA3Apbmy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1801928, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_TslqHv4SE4J:scholar.google.com/&scioq=Adaptively+Learning+to+Select-Rank+in+Online+Platforms&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "nyu.edu;;stanford.edu;ust.hk;nyu.edu", "author_num": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "New York University;Stanford University;Hong Kong University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nyu.edu;https://www.stanford.edu;https://www.ust.hk", "aff_unique_abbr": "NYU;Stanford;HKUST", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;Hong Kong SAR", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Is Temperature Sample Efficient for Softmax Gaussian Mixture of Experts?", "status": "Poster",
"track": "main", "site": "https://icml.cc/virtual/2024/poster/33516", "id": "e76GrGhIgf", "proceeding": "https://proceedings.mlr.press/v235/nguyen24a.html", "pdf": "https://openreview.net/pdf?id=e76GrGhIgf", "openreview": "https://openreview.net/forum?id=e76GrGhIgf", "author_site": "Huy Nguyen, Pedram Akbarian, Nhat Ho", "tldr": "", "abstract": "Dense-to-sparse gating mixture of experts (MoE) has recently become an effective alternative to the well-known sparse MoE. Rather than fixing the number of activated experts as in the latter model, which could limit the investigation of potential experts, the former model utilizes the temperature to control the softmax weight distribution and the sparsity of the MoE during training in order to stabilize the expert specialization. Nevertheless, while there are previous attempts to theoretically comprehend the sparse MoE, a comprehensive analysis of the dense-to-sparse gating MoE has remained elusive. Therefore, we aim to explore the impacts of the dense-to-sparse gate on the maximum likelihood estimation under the Gaussian MoE in this paper. We demonstrate that due to interactions between the temperature and other model parameters via some partial differential equations, the convergence rates of parameter estimations are slower than any polynomial rates, and could be as slow as $\\mathcal{O}(1/\\log(n))$, where $n$ denotes the sample size. To address this issue, we propose using a novel activation dense-to-sparse gate, which routes the output of a linear layer to an activation function before delivering it to the softmax function. By imposing linear independence conditions on the activation function and its derivatives, we show that the parameter estimation rates are significantly improved to polynomial rates.
Finally, we conduct a simulation study to empirically validate our theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huy Nguyen;Pedram Akbarian;Nhat Ho", "authorids": "~Huy_Nguyen5;~Pedram_Akbarian1;~Nhat_Ho1", "gender": "M;M;M", "homepage": "https://huynm99.github.io/;https://pedakb.github.io/;https://nhatptnk8912.github.io/", "dblp": "48/6075;358/2800;203/4479", "google_scholar": "_YYwzhQAAAAJ;eg68QWIAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ", "orcid": ";;", "linkedin": "huy-nguyen-081199/;;nhat-pham-minh-ho-267b8164/", "or_profile": "~Huy_Nguyen5;~Pedram_Akbarian1;~Nhat_Ho1", "aff": "Microsoft AI;University of Texas at Austin;University of Texas, Austin", "aff_domain": "microsoft.com;utexas.edu;utexas.edu", "position": "Intern;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2024is,\ntitle={Is Temperature Sample Efficient for Softmax Gaussian Mixture of Experts?},\nauthor={Huy Nguyen and Pedram Akbarian and Nhat Ho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e76GrGhIgf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 934759, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3640363889487755243&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "microsoft.com;utexas.edu;utexas.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Microsoft;University of Texas at Austin", "aff_unique_dep": "Microsoft AI;", "aff_unique_url": "https://www.microsoft.com;https://www.utexas.edu", "aff_unique_abbr": "Microsoft;UT Austin", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Simple linear attention language models balance the recall-throughput tradeoff", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33515", "id": "e93ffDcpH3", "proceeding": "https://proceedings.mlr.press/v235/arora24a.html", "pdf": "https://openreview.net/pdf?id=e93ffDcpH3", "openreview": "https://openreview.net/forum?id=e93ffDcpH3", "author_site": "Simran Arora, Sabri Eyuboglu, Michael Zhang, Aman Timalsina, Silas Alberti, James Zou, Atri Rudra, Christopher Re", "tldr": "", "abstract": "Recent work has shown that attention-based language models excel at \"recall\", the ability to ground generations in tokens previously seen in context. However, the efficiency of attention-based models is bottle-necked during inference by the KV-cache's aggressive memory consumption. In this work, we explore whether we can improve language model efficiency (e.g. by reducing memory consumption) without compromising on recall. By applying experiments and theory to a broad set of architectures, we identify a key tradeoff between a model's recurrent state size and recall ability. We show that efficient alternatives to attention (e.g. H3, Mamba, RWKV) maintain a fixed-size recurrent state, but struggle at recall. We propose BASED a simple architecture combining linear and sliding window attention. By varying BASED window size and linear attention feature dimension, we can dial the state size and traverse the Pareto frontier of the recall-memory tradeoff curve, recovering the full quality of attention on one end and the small state size of attention-alternatives on the other. 
We train language models up to $1.3$b parameters and show that BASED matches the strongest sub-quadratic models (e.g. Mamba) in perplexity and outperforms them on real-world recall-intensive tasks by 10.36 accuracy points. We further develop IO-aware algorithms that enable BASED to provide 24\u00d7 higher throughput on language generation than FlashAttention-2, when generating 1024 tokens using 1.3b parameter models. Overall, BASED expands the Pareto frontier of the throughput-recall tradeoff space beyond prior architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simran Arora;Sabri Eyuboglu;Michael Zhang;Aman Timalsina;Silas Alberti;James Zou;Atri Rudra;Christopher Re", "authorids": "~Simran_Arora1;~Sabri_Eyuboglu1;~Michael_Zhang4;~Aman_Timalsina1;~Silas_Alberti2;~James_Zou1;~Atri_Rudra1;~Christopher_Re1", "gender": ";;M;M;M;;M;", "homepage": "https://scholar.google.com/citations?user=rGRsWH8AAAAJ&hl=en;http://www.sabrieyuboglu.com/;https://michaelzhang.xyz/;;https://silasalberti.com;;http://www.cse.buffalo.edu/faculty/atri/;", "dblp": "243/2342;298/7563;;;339/6742;;04/4980;", "google_scholar": ";;DG_asaIAAAAJ;https://scholar.google.com/citations?hl=en;;23ZXZvEAAAAJ;https://scholar.google.com.tw/citations?user=_e5H8IoAAAAJ;", "orcid": ";;;;;;;", "linkedin": ";;;;silasalberti/;;;", "or_profile": "~Simran_Arora1;~Sabri_Eyuboglu1;~Michael_Zhang4;~Aman_Timalsina1;~Silas_Alberti2;~James_Zou1;~Atri_Rudra1;~Christopher_Re1", "aff": "The Wharton School, University of Pennsylvania;Stanford University;Stanford University;Purdue University;Stanford University;Stanford University;State University of New York, Buffalo;", "aff_domain": "wharton.upenn.edu;stanford.edu;stanford.edu;purdue.edu;stanford.edu;stanford.edu;buffalo.edu;", "position": "Undergrad student;PhD student;PhD student;MS student;PhD student;Assistant Professor;Professor;", "bibtex": "@inproceedings{\narora2024simple,\ntitle={Simple linear attention language models balance the recall-throughput tradeoff},\nauthor={Simran Arora and Sabri Eyuboglu and Michael Zhang and Aman Timalsina and Silas Alberti and James Zou and Atri Rudra and Christopher Re},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=e93ffDcpH3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2497597, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7267916972102908939&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "wharton.upenn.edu;stanford.edu;stanford.edu;purdue.edu;stanford.edu;stanford.edu;buffalo.edu;", "author_num": 8, "aff_unique_index": "0;1;1;2;1;1;3", "aff_unique_norm": "University of Pennsylvania;Stanford University;Purdue University;State University of New York at Buffalo", "aff_unique_dep": "The Wharton School;;;", "aff_unique_url": "https://www.wharton.upenn.edu;https://www.stanford.edu;https://www.purdue.edu;https://www.buffalo.edu", "aff_unique_abbr": "UPenn Wharton;Stanford;Purdue;SUNY Buffalo", "aff_campus_unique_index": "1;1;1;1;2", "aff_campus_unique": ";Stanford;Buffalo", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Saliency strikes back: How filtering out high frequencies improves white-box explanations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33514", "id": "eC1OOpOGZW", "proceeding": 
"https://proceedings.mlr.press/v235/muzellec24a.html", "pdf": "https://openreview.net/pdf?id=eC1OOpOGZW", "openreview": "https://openreview.net/forum?id=eC1OOpOGZW", "author_site": "Sabine Muzellec, Thomas FEL, Victor Boutin, L\u00e9o And\u00e9ol, Rufin VanRullen, Thomas Serre", "tldr": "", "abstract": "Attribution methods correspond to a class of explainability methods (XAI) that aim to assess how individual inputs contribute to a model's decision-making process. We have identified a significant limitation in one type of attribution methods, known as ``white-box\" methods. Although highly efficient, as we will show, these methods rely on a gradient signal that is often contaminated by high-frequency artifacts. To overcome this limitation, we introduce a new approach called \"FORGrad\". This simple method effectively filters out these high-frequency artifacts using optimal cut-off frequencies tailored to the unique characteristics of each model architecture. Our findings show that FORGrad *consistently enhances* the performance of already existing white-box methods, enabling them to compete effectively with more accurate yet computationally demanding \"black-box\" methods. We anticipate that our research will foster broader adoption of simpler and more efficient white-box methods for explainability, offering a better balance between faithfulness and computational efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sabine Muzellec;Thomas FEL;Victor Boutin;L\u00e9o And\u00e9ol;Rufin VanRullen;Thomas Serre", "authorids": "~Sabine_Muzellec1;~Thomas_FEL1;~Victor_Boutin2;~L\u00e9o_And\u00e9ol1;~Rufin_VanRullen1;~Thomas_Serre1", "gender": ";M;M;;M;M", "homepage": ";https://thomasfel.me;;;https://rufinv.github.io;https://serre-lab.clps.brown.edu/", "dblp": ";274/2390;228/3333;;83/2121;", "google_scholar": ";1m5Mlx4AAAAJ;Z-YF5FsAAAAJ;;1pwyaYgAAAAJ;kZlPW4wAAAAJ", "orcid": ";;0000-0003-3372-5940;;0000-0002-3611-7716;", "linkedin": ";;;;;", "or_profile": "~Sabine_Muzellec1;~Thomas_FEL1;~Victor_Boutin2;~L\u00e9o_And\u00e9ol1;~Rufin_VanRullen1;~Thomas_Serre1", "aff": ";Brown University;Brown University;;CNRS;Universit\u00e9 de Toulouse", "aff_domain": ";brown.edu;brown.edu;;cnrs.fr;univ-toulouse.fr", "position": ";PhD student;Postdoc;;Research Director;Full Professor", "bibtex": "@inproceedings{\nmuzellec2024saliency,\ntitle={Saliency strikes back: How filtering out high frequencies improves white-box explanations},\nauthor={Sabine Muzellec and Thomas FEL and Victor Boutin and L{\\'e}o And{\\'e}ol and Rufin VanRullen and Thomas Serre},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eC1OOpOGZW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4356932, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3998032303628933652&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": ";brown.edu;brown.edu;;cnrs.fr;univ-toulouse.fr", "author_num": 6, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Brown University;Centre National de la Recherche Scientifique;Universit\u00e9 de Toulouse", "aff_unique_dep": ";;", "aff_unique_url": "https://www.brown.edu;https://www.cnrs.fr;https://www.univ-toulouse.fr", "aff_unique_abbr": "Brown;CNRS;UT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;France" }, { 
"title": "Improving Instruction Following in Language Models through Proxy-Based Uncertainty Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33513", "id": "eCCaHZKdl4", "proceeding": "https://proceedings.mlr.press/v235/lee24z.html", "pdf": "https://openreview.net/pdf?id=eCCaHZKdl4", "openreview": "https://openreview.net/forum?id=eCCaHZKdl4", "author_site": "JoonHo Lee, Jae Oh Woo, Juree Seok, Parisa Hassanzadeh, Wooseok Jang, JuYoun Son, Sima Didari, Baruch Gutow, Heng Hao, Hankyu Moon, Wenjun Hu, Yeong-Dae Kwon, Taehee Lee, Seungjai Min", "tldr": "", "abstract": "Assessing response quality to instructions in language models is vital but challenging due to the complexity of human language across different contexts. This complexity often results in ambiguous or inconsistent interpretations, making accurate assessment difficult. To address this issue, we propose a novel Uncertainty-aware Reward Model (URM) that introduces a robust uncertainty estimation for the quality of paired responses based on Bayesian approximation. Trained with preference datasets, our uncertainty-enabled proxy not only scores rewards for responses but also evaluates their inherent uncertainty. Empirical results demonstrate significant benefits of incorporating the proposed proxy into language model training. Our method boosts the instruction following capability of language models by refining data curation for training and improving policy optimization objectives, thereby surpassing existing methods by a large margin on benchmarks such as Vicuna and MT-bench. These findings highlight that our proposed approach substantially advances language model training and paves a new way of harnessing uncertainty within language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "JoonHo Lee;Jae Oh Woo;Juree Seok;Parisa Hassanzadeh;Wooseok Jang;JuYoun Son;Sima Didari;Baruch Gutow;Heng Hao;Hankyu Moon;Wenjun Hu;Yeong-Dae Kwon;Taehee Lee;Seungjai Min", "authorids": "~JoonHo_Lee1;~Jae_Oh_Woo1;~Juree_Seok1;~Parisa_Hassanzadeh1;~Wooseok_Jang4;~JuYoun_Son1;~Sima_Didari1;~Baruch_Gutow1;~Heng_Hao1;~Hankyu_Moon1;~Wenjun_Hu2;~Yeong-Dae_Kwon1;~Taehee_Lee1;~Seungjai_Min1", "gender": "M;M;F;F;M;F;F;M;F;;;;;M", "homepage": ";https://sites.google.com/site/jaeohwoo/;;;;https://naver.me/xpWm9XLY;;;;;;;;", "dblp": ";149/2599;;172/1276;;;;;;64/3302;;l/TaeheeLee2;;277/6514", "google_scholar": "63wzupQAAAAJ;;https://scholar.google.com/citations?view_op=list_works;GBlNK3IAAAAJ;;;;;;;rCG_IBgAAAAJ;SM-2GN4AAAAJ;;cEKyTVUAAAAJ", "orcid": "0000-0002-6577-298X;0000-0001-6799-6189;;;;;;;;;;;;0000-0002-7823-6860", "linkedin": "joonho-jeremy-lee/;;juree-seok-870b2a97;;wooseok-jang-904271146/;;simadidari/;baruch-gutow/;heng-hao-36a5b997/;;wenjunhuprofile;;seungjaimin/;", "or_profile": "~JoonHo_Lee1;~Jae_Oh_Woo1;~Juree_Seok1;~Parisa_Hassanzadeh1;~Wooseok_Jang4;~JuYoun_Son1;~Sima_Didari1;~Baruch_Gutow1;~Heng_Hao1;~Hankyu_Moon1;~Wenjun_Hu2;~Taehee_Lee1;~Seungjai_Min1;~Yeong_Dae_Kwon1", "aff": "Seoul National University of Science and Technology;Samsung;Samsung;J.P. 
Morgan Chase;Samsung;Samsung sds;Samsung;Samsung;Samsung SDSA;Samsung SDS Research America;;Samsung SDS;Samsung SDS;Samsung SDS", "aff_domain": "seoultech.ac.kr;samsung.com;samsung.com;jpmorgan.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;;samsung.com;samsung.com;samsung.com", "position": "PhD student;Researcher;Researcher;Researcher;Researcher;Researcher;Researcher;Researcher;Researcher;Research Scientist;;Engineering Director;Research Scientist;Vice President", "bibtex": "@inproceedings{\nlee2024improving,\ntitle={Improving Instruction Following in Language Models through Proxy-Based Uncertainty Estimation},\nauthor={JoonHo Lee and Jae Oh Woo and Juree Seok and Parisa Hassanzadeh and Wooseok Jang and JuYoun Son and Sima Didari and Baruch Gutow and Heng Hao and Hankyu Moon and Wenjun Hu and Yeong-Dae Kwon and Taehee Lee and Seungjai Min},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eCCaHZKdl4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2025845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 14, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aEVfwK4Q7vAJ:scholar.google.com/&scioq=Improving+Instruction+Following+in+Language+Models+through+Proxy-Based+Uncertainty+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "seoultech.ac.kr;samsung.com;samsung.com;jpmorgan.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;;samsung.com;samsung.com;samsung.com", "author_num": 14, "aff_unique_index": "0;1;1;2;1;1;1;1;1;1;1;1;1", "aff_unique_norm": "Seoul National University of Science and Technology;Samsung;JPMorgan Chase & Co.", "aff_unique_dep": ";Samsung;", "aff_unique_url": "https://www.snust.ac.kr;https://www.samsung.com;https://www.jpmorganchase.com", "aff_unique_abbr": "SNUST;Samsung;JPM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0;0;0;1;0;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Break the Sequential Dependency of LLM Inference Using Lookahead Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33512", "id": "eDjvSFOkXw", "proceeding": "https://proceedings.mlr.press/v235/fu24a.html", "pdf": "https://openreview.net/pdf?id=eDjvSFOkXw", "openreview": "https://openreview.net/forum?id=eDjvSFOkXw", "author_site": "Yichao Fu, Peter Bailis, Ion Stoica, Hao Zhang", "tldr": "", "abstract": "Autoregressive decoding of large language models (LLMs) is memory bandwidth bounded, resulting in high latency and significant wastes of the parallel processing power of modern accelerators. Existing methods for accelerating LLM decoding often require a draft model (e.g., speculative decoding), which is nontrivial to obtain and unable to generalize. In this paper, we introduce Lookahead decoding, an exact, parallel decoding algorithm that accelerates LLM decoding without needing auxiliary models or data stores. It allows trading per-step log(FLOPs) to reduce the number of total decoding steps, is more parallelizable on single or multiple modern accelerators, and is compatible with concurrent memory-efficient attention (e.g., FlashAttention). Our implementation of Lookahead decoding can speed up autoregressive decoding by up to 1.8x on MT-bench and 4x with strong scaling on multiple GPUs in code completion tasks. 
Our code is available at https://github.com/hao-ai-lab/LookaheadDecoding", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yichao Fu;Peter Bailis;Ion Stoica;Hao Zhang", "authorids": "~Yichao_Fu1;~Peter_Bailis2;~Ion_Stoica1;~Hao_Zhang2", "gender": ";M;M;M", "homepage": ";http://www.bailis.org/;http://people.eecs.berkeley.edu/~istoica/;https://cseweb.ucsd.edu/~haozhang/", "dblp": ";47/8816;s/IonStoica;55/2270-25", "google_scholar": ";qG1LVpQAAAAJ;vN-is70AAAAJ;H1d4BS8AAAAJ", "orcid": ";;;", "linkedin": ";;ionstoica;", "or_profile": "~Yichao_Fu1;~Peter_Bailis2;~Ion_Stoica1;~Hao_Zhang2", "aff": ";Stanford University;University of California, Berkeley;Carnegie Mellon University", "aff_domain": ";stanford.edu;berkeley.edu;cmu.edu", "position": ";Adjunct Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nfu2024break,\ntitle={Break the Sequential Dependency of {LLM} Inference Using Lookahead Decoding},\nauthor={Yichao Fu and Peter Bailis and Ion Stoica and Hao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eDjvSFOkXw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 925568, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12292302913779993353&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";stanford.edu;berkeley.edu;cmu.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Stanford University;University of California, Berkeley;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu;https://www.cmu.edu", "aff_unique_abbr": "Stanford;UC Berkeley;CMU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Stanford;Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Harnessing Neural Unit Dynamics for Effective and Scalable Class-Incremental Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33511", "id": "eDtty9ZCvt", "proceeding": "https://proceedings.mlr.press/v235/li24bk.html", "pdf": "https://openreview.net/pdf?id=eDtty9ZCvt", "openreview": "https://openreview.net/forum?id=eDtty9ZCvt", "author_site": "Depeng Li, Tianqi Wang, Junwei Chen, Wei Dai, Zhigang Zeng", "tldr": "", "abstract": "Class-incremental learning (CIL) aims to train a model to learn new classes from non-stationary data streams without forgetting old ones. In this paper, we propose a new kind of connectionist model by tailoring neural unit dynamics that adapt the behavior of neural networks for CIL. In each training session, it introduces a supervisory mechanism to guide network expansion whose growth size is compactly commensurate with the intrinsic complexity of a newly arriving task. This constructs a near-minimal network while allowing the model to expand its capacity when it cannot sufficiently hold new classes. At inference time, it automatically reactivates the required neural units to retrieve knowledge and leaves the remaining units inactivated to prevent interference. We name our model AutoActivator, which is effective and scalable. To gain insights into the neural unit dynamics, we theoretically analyze the model\u2019s convergence property via a universal approximation theorem on learning sequential mappings, which is under-explored in the CIL community.
Experiments show that our method achieves strong CIL performance in rehearsal-free and minimal-expansion settings with different backbones.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Depeng Li;Tianqi Wang;Junwei Chen;Wei Dai;Zhigang Zeng", "authorids": "~Depeng_Li1;~Tianqi_Wang4;~Junwei_Chen1;~Wei_Dai14;~Zhigang_Zeng1", "gender": "M;M;M;M;M", "homepage": ";https://github.com/luozhiqi-huster;https://ethenwillson.github.io/;;http://aia.hust.edu.cn/zhigangzeng/", "dblp": "90/4415-1;;;76/2897-4;85/1640", "google_scholar": "8IsEAO4AAAAJ;;;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-2281-7535;;;0000-0003-3057-7225;", "linkedin": ";;;;", "or_profile": "~Depeng_Li1;~Tianqi_Wang4;~Junwei_Chen1;~Wei_Dai14;~Zhigang_Zeng1", "aff": "Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology;China University of Mining Technology - Xuzhou;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;hust.edu.cn;hust.edu.cn;cumt.edu.cn;hust.edu.cn", "position": "PhD student;Undergrad student;MS student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024harnessing,\ntitle={Harnessing Neural Unit Dynamics for Effective and Scalable Class-Incremental Learning},\nauthor={Depeng Li and Tianqi Wang and Junwei Chen and Wei Dai and Zhigang Zeng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eDtty9ZCvt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 502655, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16669179910580356281&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "hust.edu.cn;hust.edu.cn;hust.edu.cn;cumt.edu.cn;hust.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Huazhong University of Science and Technology;China University of Mining Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;http://www.cumt.edu.cn/", "aff_unique_abbr": "HUST;CUMT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xuzhou", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Generalization Bounds for Heavy-Tailed SDEs through the Fractional Fokker-Planck Equation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33510", "id": "eFSppFiVYG", "proceeding": "https://proceedings.mlr.press/v235/dupuis24a.html", "pdf": "https://openreview.net/pdf?id=eFSppFiVYG", "openreview": "https://openreview.net/forum?id=eFSppFiVYG", "author_site": "Benjamin Dupuis, Umut Simsekli", "tldr": "", "abstract": "Understanding the generalization properties of heavy-tailed stochastic optimization algorithms has attracted increasing attention over the past years. While illuminating interesting aspects of stochastic optimizers by using heavy-tailed stochastic differential equations as proxies, prior works either provided expected generalization bounds, or introduced non-computable information theoretic terms. Addressing these drawbacks, in this work, we prove high-probability generalization bounds for heavy-tailed SDEs which do not contain any nontrivial information theoretic terms. 
To achieve this goal, we develop new proof techniques based on estimating the entropy flows associated with the so-called fractional Fokker-Planck equation (a partial differential equation that governs the evolution of the distribution of the corresponding heavy-tailed SDE). In addition to obtaining high-probability bounds, we show that our bounds have a better dependence on the dimension of parameters as compared to prior art. Our results further identify a phase transition phenomenon, which suggests that heavy tails can be either beneficial or harmful depending on the problem structure. We support our theory with experiments conducted in a variety of settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin Dupuis;Umut Simsekli", "authorids": "~Benjamin_Dupuis1;~Umut_Simsekli1", "gender": "M;M", "homepage": "https://benjidupuis.github.io;https://www.di.ens.fr/~simsekli/", "dblp": "294/9740;https://dblp.org/pers/s/Simsekli:Umut.html", "google_scholar": "r99oWgkAAAAJ;https://scholar.google.fr/citations?user=CuArAkgAAAAJ", "orcid": ";", "linkedin": "benjamin-dupuis-3b453a176/;", "or_profile": "~Benjamin_Dupuis1;~Umut_Simsekli1", "aff": "INRIA;INRIA", "aff_domain": "inria.fr;inria.fr", "position": "PhD student;Research Faculty", "bibtex": "@inproceedings{\ndupuis2024generalization,\ntitle={Generalization Bounds for Heavy-Tailed {SDE}s through the Fractional Fokker-Planck Equation},\nauthor={Benjamin Dupuis and Umut Simsekli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eFSppFiVYG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2619825, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=948768444083096752&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "inria.fr;inria.fr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Provably Efficient Exploration in Quantum Reinforcement Learning with Logarithmic Worst-Case Regret", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33509", "id": "eFvoL7BOny", "proceeding": "https://proceedings.mlr.press/v235/zhong24b.html", "pdf": "https://openreview.net/pdf?id=eFvoL7BOny", "openreview": "https://openreview.net/forum?id=eFvoL7BOny", "author_site": "Han Zhong, Jiachen Hu, Yecheng Xue, Tongyang Li, Liwei Wang", "tldr": "", "abstract": "While quantum reinforcement learning (RL) has attracted a surge of attention recently, its theoretical understanding is limited. In particular, it remains elusive how to design provably efficient quantum RL algorithms that can address the exploration-exploitation trade-off. To this end, we propose a novel UCRL-style algorithm that takes advantage of quantum computing for tabular Markov decision processes (MDPs) with $S$ states, $A$ actions, and horizon $H$, and establish an $\\mathcal{O}(\\mathrm{poly}(S, A, H, \\log T))$ worst-case regret for it, where $T$ is the number of episodes. Furthermore, we extend our results to quantum RL with linear function approximation, which is capable of handling problems with large state spaces. 
Specifically, we develop a quantum algorithm based on value target regression (VTR) for linear mixture MDPs with $d$-dimensional linear representation and prove that it enjoys $\\mathcal{O}(\\mathrm{poly}(d, H, \\log T))$ regret. Our algorithms are variants of UCRL/UCRL-VTR algorithms in classical RL, which also leverage a novel combination of lazy updating mechanisms and quantum estimation subroutines. This is the key to breaking the $\\Omega(\\sqrt{T})$-regret barrier in classical RL. To the best of our knowledge, this is the first work studying the online exploration in quantum RL with provable logarithmic worst-case regret.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Zhong;Jiachen Hu;Yecheng Xue;Tongyang Li;Liwei Wang", "authorids": "~Han_Zhong1;~Jiachen_Hu1;~Yecheng_Xue1;~Tongyang_Li1;~Liwei_Wang1", "gender": ";M;;M;M", "homepage": "https://hanzhong-ml.github.io/;https://nickhclos.github.io/;;https://www.tongyangli.com/;http://www.liweiwang-pku.com/", "dblp": "137/8096.html;239/5040;340/7132;142/1312;", "google_scholar": "Bk5q_pAAAAAJ;5GavKiQAAAAJ;https://scholar.google.com/;ny0ZgiQAAAAJ;VZHxoh8AAAAJ", "orcid": ";;;0000-0002-0338-413X;", "linkedin": ";;;;", "or_profile": "~Han_Zhong1;~Jiachen_Hu1;~Yecheng_Xue1;~Tongyang_Li1;~Liwei_Wang1", "aff": "Peking University;Peking University;Peking University;Peking University;Peking University", "aff_domain": "stu.pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nzhong2024provably,\ntitle={Provably Efficient Exploration in Quantum Reinforcement Learning with Logarithmic Worst-Case Regret},\nauthor={Han Zhong and Jiachen Hu and Yecheng Xue and Tongyang Li and Liwei Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eFvoL7BOny}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 442697, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7945288915797372320&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 8, "email": "stu.pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "OLLIE: Imitation Learning from Offline Pretraining to Online Finetuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33508", "id": "eG42XBhV9a", "proceeding": "https://proceedings.mlr.press/v235/yue24a.html", "pdf": "https://openreview.net/pdf?id=eG42XBhV9a", "openreview": "https://openreview.net/forum?id=eG42XBhV9a", "author_site": "Sheng Yue, Xingyuan Hua, Ju Ren, Sen Lin, Junshan Zhang, Yaoxue Zhang", "tldr": "", "abstract": "In this paper, we study offline-to-online Imitation Learning (IL) that pretrains an imitation policy from static demonstration data, followed by fast finetuning with minimal environmental interaction. 
We find the naive combination of existing offline IL and online IL methods tends to behave poorly in this context, because the initial discriminator (often used in online IL) operates randomly and discordantly against the policy initialization, leading to misguided policy optimization and *unlearning* of pretraining knowledge. To overcome this challenge, we propose a principled offline-to-online IL method, named OLLIE, that simultaneously learns a near-expert policy initialization along with an *aligned discriminator initialization*, which can be seamlessly integrated into online IL, achieving smooth and fast finetuning. Empirically, OLLIE consistently and significantly outperforms the baseline methods in **20** challenging tasks, from continuous control to vision-based domains, in terms of performance, demonstration efficiency, and convergence speed. This work may serve as a foundation for further exploration of pretraining and finetuning in the context of IL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sheng Yue;Xingyuan Hua;Ju Ren;Sen Lin;Junshan Zhang;Yaoxue Zhang", "authorids": "~Sheng_Yue1;xingyuanhua@bit.edu.cn;~Ju_Ren1;~Sen_Lin1;~Junshan_Zhang1;~Yaoxue_Zhang3", "gender": "M;;;;M;M", "homepage": "https://shaunyue.github.io;;;https://slin70.github.io/;https://faculty.engineering.ucdavis.edu/jzhang/;", "dblp": "236/3241;;;70/9499-1.html;59/1232.html;99/4094", "google_scholar": "n0Gjw_oAAAAJ;;;94-TbUsAAAAJ;UtAdFs8AAAAJ;q_76wvMAAAAJ", "orcid": "0009-0001-3416-8181;;;;;", "linkedin": ";;;;;", "or_profile": "~Sheng_Yue1;xingyuanhua@bit.edu.cn;~Ju_Ren1;~Sen_Lin1;~Junshan_Zhang1;~Yaoxue_Zhang3", "aff": "Tsinghua University;;;University of Houston;University of California, Davis;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;;uh.edu;ucdavis.edu;tsinghua.edu.cn", "position": "Postdoc;;;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyue2024ollie,\ntitle={{OLLIE}: Imitation Learning from Offline Pretraining to Online Finetuning},\nauthor={Sheng Yue and Xingyuan Hua and Ju Ren and Sen Lin and Junshan Zhang and Yaoxue Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eG42XBhV9a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8452011, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3228548393053648485&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "tsinghua.edu.cn;;;uh.edu;ucdavis.edu;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Tsinghua University;University of Houston;University of California, Davis", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.uh.edu;https://www.ucdavis.edu", "aff_unique_abbr": "THU;UH;UC Davis", "aff_campus_unique_index": "1", "aff_campus_unique": ";Davis", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Simplicity Bias of Two-Layer Networks beyond Linearly Separable Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33507", "id": "eGZH3HCuGm", "proceeding": "https://proceedings.mlr.press/v235/tsoy24a.html", "pdf": "https://openreview.net/pdf?id=eGZH3HCuGm", "openreview": "https://openreview.net/forum?id=eGZH3HCuGm", "author_site": "Nikita Tsoy, Nikola Konstantinov", "tldr": "", "abstract": "Simplicity bias, the propensity of 
deep models to over-rely on simple features, has been identified as a potential reason for limited out-of-distribution generalization of neural networks (Shah et al., 2020). Despite the important implications, this phenomenon has been theoretically confirmed and characterized only under strong dataset assumptions, such as linear separability (Lyu et al., 2021). In this work, we characterize simplicity bias for general datasets in the context of two-layer neural networks initialized with small weights and trained with gradient flow. Specifically, we prove that in the early training phases, network features cluster around a few directions that do not depend on the size of the hidden layer. Furthermore, for datasets with an XOR-like pattern, we precisely identify the learned features and demonstrate that simplicity bias intensifies during later training stages. These results indicate that features learned in the middle stages of training may be more useful for OOD transfer. We support this hypothesis with experiments on image data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikita Tsoy;Nikola Konstantinov", "authorids": "~Nikita_Tsoy1;~Nikola_Konstantinov1", "gender": ";M", "homepage": "https://insait.ai/nikita-tsoy/;https://nikolakon.github.io/", "dblp": "348/6414;217/1964", "google_scholar": "R6oTOHUAAAAJ;https://scholar.google.at/citations?user=0_lvOo8AAAAJ", "orcid": "0000-0001-8612-057X;", "linkedin": "nikita-tsoy-560010238/;", "or_profile": "~Nikita_Tsoy1;~Nikola_Konstantinov1", "aff": "Sofia University \"St. Kliment Ohridski\";Sofia University \"St. Kliment Ohridski\"", "aff_domain": "insait.ai;insait.ai", "position": "PhD student;Tenure-track faculty", "bibtex": "@inproceedings{\ntsoy2024simplicity,\ntitle={Simplicity Bias of Two-Layer Networks beyond Linearly Separable Data},\nauthor={Nikita Tsoy and Nikola Konstantinov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eGZH3HCuGm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1462053, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8419472221995565851&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 7, "email": "insait.ai;insait.ai", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Sofia University St. Kliment Ohridski", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-sofia.bg", "aff_unique_abbr": "SU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Bulgaria" }, { "title": "RoboMP$^2$: A Robotic Multimodal Perception-Planning Framework with Multimodal Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33506", "id": "eJFQROkaj0", "proceeding": "https://proceedings.mlr.press/v235/lv24a.html", "pdf": "https://openreview.net/pdf?id=eJFQROkaj0", "openreview": "https://openreview.net/forum?id=eJFQROkaj0", "author_site": "Qi Lv, Hao Li, Xiang Deng, Rui Shao, Michael Wang, Liqiang Nie", "tldr": "", "abstract": "Multimodal Large Language Models (MLLMs) have shown impressive reasoning abilities and general intelligence in various domains. This inspires researchers to train end-to-end MLLMs or utilize large models to generate policies with human-selected prompts for embodied agents.
However, these methods exhibit limited generalization capabilities on unseen tasks or scenarios, and overlook the multimodal environment information, which is critical for robots to make decisions. In this paper, we introduce a novel **Robo**tic **M**ultimodal **P**erception-**P**lanning (**RoboMP$^2$**) framework for robotic manipulation, which consists of a Goal-Conditioned Multimodal Preceptor (GCMP) and a Retrieval-Augmented Multimodal Planner (RAMP). Specifically, GCMP captures environment states by employing a tailored MLLM for embodied agents with the abilities of semantic reasoning and localization. RAMP utilizes a coarse-to-fine retrieval method to find the $k$ most-relevant policies as in-context demonstrations to enhance the planner. Extensive experiments demonstrate the superiority of RoboMP$^2$ on both the VIMA benchmark and real-world tasks, with around 10% improvement over the baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qi Lv;Hao Li;Xiang Deng;Rui Shao;Michael Y Wang;Liqiang Nie", "authorids": "~Qi_Lv1;~Hao_Li59;~Xiang_Deng1;~Rui_Shao1;~Michael_Y_Wang1;~Liqiang_Nie2", "gender": "M;M;;M;;M", "homepage": "https://github.com/Aopolin-Lv;https://github.com/1223haohao/li_hao;;https://rshaojimmy.github.io/;;https://liqiangnie.github.io/index.html", "dblp": ";;;;;92/8277", "google_scholar": ";D-8csxoAAAAJ;;https://scholar.google.com/citations?hl=en;;yywVMhUAAAAJ", "orcid": "0000-0002-8507-7167;;;0000-0003-0090-9604;;0000-0003-1476-0273", "linkedin": "qi-lv-%EF%BC%88%E5%90%95%E5%A5%87%EF%BC%89-075614311/;;;;;", "or_profile": "~Qi_Lv1;~Hao_Li59;~Xiang_Deng1;~Rui_Shao1;~Michael_Y_Wang1;~Liqiang_Nie2", "aff": "Harbin Institute of Technology;Harbin Institute of Technology;;Harbin Institute of Technology;;Shandong University", "aff_domain": "hit.edu.cn;stu.hit.edu.cn;;hit.edu.cn;;sdu.edu.cn", "position": "PhD student;MS student;;Full Professor;;Full Professor", "bibtex": "@inproceedings{\nlv2024robomp,\ntitle={Robo{MP}\\${\\textasciicircum}2\\$: A Robotic Multimodal Perception-Planning Framework with Multimodal Large Language Models},\nauthor={Qi Lv and Hao Li and Xiang Deng and Rui Shao and Michael Y Wang and Liqiang Nie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eJFQROkaj0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4009441, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2218358064760087056&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "hit.edu.cn;stu.hit.edu.cn;;hit.edu.cn;;sdu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Harbin Institute of Technology;Shandong University", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;http://www.sdu.edu.cn", "aff_unique_abbr": "HIT;SDU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Towards efficient deep spiking neural networks construction with spiking activity based pruning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33505", "id": "eMQyb1tvvc", "proceeding": "https://proceedings.mlr.press/v235/li24bz.html", "pdf": "https://openreview.net/pdf?id=eMQyb1tvvc", "openreview": "https://openreview.net/forum?id=eMQyb1tvvc", "author_site": "Yaxin Li, Qi Xu, Jiangrong Shen, Hongming Xu, Long Chen, Gang Pan",
"tldr": "", "abstract": "The emergence of deep and large-scale spiking neural networks (SNNs) exhibiting high performance across diverse complex datasets has led to a need for compressing network models due to the presence of a significant number of redundant structural units, aiming to more effectively leverage their low-power consumption and biological interpretability advantages. Currently, most model compression techniques for SNNs are based on unstructured pruning of individual connections, which requires specific hardware support. Hence, we propose a structured pruning approach based on the activity levels of convolutional kernels named Spiking Channel Activity-based (SCA) network pruning framework. Inspired by synaptic plasticity mechanisms, our method dynamically adjusts the network's structure by pruning and regenerating convolutional kernels during training, enhancing the model's adaptation to the current target task. While maintaining model performance, this approach refines the network architecture, ultimately reducing computational load and accelerating the inference process. This indicates that structured dynamic sparse learning methods can better facilitate the application of deep SNNs in low-power and high-efficiency scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaxin Li;Qi Xu;Jiangrong Shen;Hongming Xu;Long Chen;Gang Pan", "authorids": "~Yaxin_Li4;~Qi_Xu1;~Jiangrong_Shen1;~Hongming_Xu3;~Long_Chen18;~Gang_Pan1", "gender": ";M;F;M;M;", "homepage": ";https://www.researchgate.net/profile/Qi_Xu43;;https://xhm1014.github.io/index.html;https://iris.ucl.ac.uk/iris/browse/profile?upi=LCHEI54;", "dblp": "143/0251-3;;208/3564;150/7585-2;64/5725-19.html;", "google_scholar": ";dGEcAuYAAAAJ;3XK6COkAAAAJ;nErn9W8AAAAJ;J_v0xb8AAAAJ;", "orcid": "0000-0003-0160-8950;0000-0001-9245-5544;;0000-0002-1305-0010;0000-0001-8552-859X;", "linkedin": ";;;;;", "or_profile": "~Yaxin_Li4;~Qi_Xu1;~Jiangrong_Shen1;~Hongming_Xu3;~Long_Chen18;~Gang_Pan1", "aff": "Dalian University of Technology;School of Computer Science and Technology;Zhejiang University;Dalian University of Technology;Imperial College London;", "aff_domain": "dlut.edu.cn;dlut.edu.cn;zju.edu.cn;dlut.edu.cn;ic.ac.uk;", "position": "MS student;Associate Professor;Postdoc;Associate Professor;Postdoc;", "bibtex": "@inproceedings{\nli2024towards,\ntitle={Towards efficient deep spiking neural networks construction with spiking activity based pruning},\nauthor={Yaxin Li and Qi Xu and Jiangrong Shen and Hongming Xu and Long Chen and Gang Pan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eMQyb1tvvc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 936798, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8902417123251783381&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "dlut.edu.cn;dlut.edu.cn;zju.edu.cn;dlut.edu.cn;ic.ac.uk;", "author_num": 6, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Dalian University of Technology;School of Computer Science and Technology;Zhejiang University;Imperial College London", "aff_unique_dep": ";Computer Science and Technology;;", "aff_unique_url": "http://www.dlut.edu.cn/;;https://www.zju.edu.cn;https://www.imperial.ac.uk", "aff_unique_abbr": "DUT;;ZJU;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2", 
"aff_country_unique": "China;;United Kingdom" }, { "title": "Advancing DRL Agents in Commercial Fighting Games: Training, Integration, and Agent-Human Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33504", "id": "eN1T7I7OpZ", "proceeding": "https://proceedings.mlr.press/v235/zhang24v.html", "pdf": "https://openreview.net/pdf?id=eN1T7I7OpZ", "openreview": "https://openreview.net/forum?id=eN1T7I7OpZ", "author_site": "Chen Zhang, Qiang HE, Yuan Zhou, Elvis S. Liu, Hong Wang, Jian Zhao, Yang Wang", "tldr": "", "abstract": "Deep Reinforcement Learning (DRL) agents have demonstrated impressive success in a wide range of game genres. However, existing research primarily focuses on optimizing DRL competence rather than addressing the challenge of prolonged player interaction. In this paper, we propose a practical DRL agent system for fighting games named _Sh\u016bkai_, which has been successfully deployed to Naruto Mobile, a popular fighting game with over 100 million registered users. _Sh\u016bkai_ quantifies the state to enhance generalizability, introducing Heterogeneous League Training (HELT) to achieve balanced competence, generalizability, and training efficiency. Furthermore, _Sh\u016bkai_ implements specific rewards to align the agent's behavior with human expectations. _Sh\u016bkai_'s ability to generalize is demonstrated by its consistent competence across all characters, even though it was trained on only 13% of them. Additionally, HELT exhibits a remarkable 22% improvement in sample efficiency. _Sh\u016bkai_ serves as a valuable training partner for players in Naruto Mobile, enabling them to enhance their abilities and skills.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen Zhang;Qiang He;Yuan Zhou;Elvis S. Liu;Hong Wang;Jian Zhao;Yang Wang", "authorids": "~Chen_Zhang22;~Qiang_He1;~Yuan_Zhou9;~Elvis_S._Liu1;~Hong_Wang8;~Jian_Zhao7;~Yang_Wang32", "gender": "M;;M;M;M;M;M", "homepage": ";;;https://www.linkedin.com/in/wang-hong-47210523b/;;http://staff.ustc.edu.cn/~angyan/;", "dblp": ";;;;70/2932-18.html;;", "google_scholar": "l6Y2ZDYAAAAJ;;https://scholar.google.co.uk/citations?user=ZxRKSisAAAAJ;;n6zuurcAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;;0000-0003-4895-990X;0000-0002-6079-7053;0000-0001-9341-922X", "linkedin": ";https://www.linkedin.cn/incareer/in/\u5706-\u5468-6002b623a;;;;;", "or_profile": "~Qiang_He1;~Yuan_Zhou9;~Elvis_S._Liu1;~Hong_Wang8;~Jian_Zhao7;~Yang_Wang32;~chen_zhang15", "aff": "Ruhr-Universit\u00e4t Bochum;Tencent;Tencent;;polixir;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ruhr-uni-bochum.de;tencent.com;tencent.com;;polixir.ai;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;Researcher;Principal Researcher;;Researcher;Associate Professor;PhD student", "bibtex": "@inproceedings{\nzhang2024advancing,\ntitle={Advancing {DRL} Agents in Commercial Fighting Games: Training, Integration, and Agent-Human Alignment},\nauthor={Chen Zhang and Qiang He and Yuan Zhou and Elvis S. 
Liu and Hong Wang and Jian Zhao and Yang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eN1T7I7OpZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2586644, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7238636469024223854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ruhr-uni-bochum.de;tencent.com;tencent.com;;polixir.ai;ustc.edu.cn;ustc.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;2;3;3", "aff_unique_norm": "Ruhr-Universit\u00e4t Bochum;Tencent;Polixir;University of Science and Technology of China", "aff_unique_dep": ";Tencent Holdings Limited;;", "aff_unique_url": "https://www.ruhr-uni-bochum.de;https://www.tencent.com;;http://www.ustc.edu.cn", "aff_unique_abbr": "RUB;Tencent;;USTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Germany;China;" }, { "title": "Characteristic Guidance: Non-linear Correction for Diffusion Model at Large Guidance Scale", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33503", "id": "eOtjMYdGLt", "proceeding": "https://proceedings.mlr.press/v235/zheng24f.html", "pdf": "https://openreview.net/pdf?id=eOtjMYdGLt", "openreview": "https://openreview.net/forum?id=eOtjMYdGLt", "author_site": "Candi Zheng, Yuan LAN", "tldr": "", "abstract": "Popular guidance for denoising diffusion probabilistic models (DDPMs) linearly combines distinct conditional models together to provide enhanced control over samples. However, this approach overlooks nonlinear effects that become significant when the guidance scale is large. To address this issue, we propose characteristic guidance, a guidance method that provides first-principle non-linear correction for classifier-free guidance. Such correction forces the guided DDPMs to respect the Fokker-Planck (FP) equation of the diffusion process, in a way that is training-free and compatible with existing sampling methods.
Experiments show that characteristic guidance enhances semantic characteristics of prompts and mitigates irregularities in image generation, proving effective in diverse applications ranging from simulating magnet phase transitions to latent space sampling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Candi Zheng;Yuan Lan", "authorids": "~Candi_Zheng1;~Yuan_Lan1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "https://scholar.google.com.hk/citations?user=akzAFKcAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Candi_Zheng1;~Yuan_Lan1", "aff": "Hong Kong University of Science and Technology;", "aff_domain": "ust.hk;", "position": "Postdoc;", "bibtex": "@inproceedings{\nzheng2024characteristic,\ntitle={Characteristic Guidance: Non-linear Correction for Diffusion Model at Large Guidance Scale},\nauthor={Candi Zheng and Yuan Lan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eOtjMYdGLt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7111962, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14082549207289448273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "ust.hk;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Ensemble Pruning for Out-of-distribution Generalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33502", "id": "eP3vsbB5wW", "proceeding": "https://proceedings.mlr.press/v235/qiao24a.html", "pdf": "https://openreview.net/pdf?id=eP3vsbB5wW", "openreview": "https://openreview.net/forum?id=eP3vsbB5wW", "author_site": "Fengchun Qiao, Xi Peng", "tldr": "", "abstract": "Ensembles of deep neural networks have achieved great success in hedging against single-model failure under distribution shift. However, existing techniques suffer from producing redundant models, limiting predictive diversity and yielding compromised generalization performance. Existing ensemble pruning methods can only guarantee predictive diversity for in-distribution data, which may not transfer well to out-of-distribution (OoD) data. To address this gap, we propose a principled optimization framework for ensemble pruning under distribution shifts. Since the annotations of test data are not available, we explore relationships between prediction distributions of the models, encapsulated in a topology graph. By incorporating this topology into a combinatorial optimization framework, complementary models with high predictive diversity are selected with theoretical guarantees. Our approach is model-agnostic and can be applied on top of a broad spectrum of off-the-shelf ensembling methods for improved generalization performance. Experiments on common benchmarks demonstrate the superiority of our approach in both multi- and single-source OoD generalization.
The source code is publicly available at: https://github.com/joffery/TEP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fengchun Qiao;Xi Peng", "authorids": "~Fengchun_Qiao1;~Xi_Peng1", "gender": ";Not Specified", "homepage": "https://joffery.github.io/joffery/;https://deep-real.github.io/dr_xipeng.html", "dblp": "215/3373;149/7762-5", "google_scholar": "BY6zd_0AAAAJ;DWw4v0kAAAAJ", "orcid": "0000-0003-2714-2036;0000-0002-7772-001X", "linkedin": "fengchun-qiao-9148ba157/;xi-peng-74b540b6/", "or_profile": "~Fengchun_Qiao1;~Xi_Peng1", "aff": "University of Delaware;University of Delaware", "aff_domain": "udel.edu;udel.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nqiao2024ensemble,\ntitle={Ensemble Pruning for Out-of-distribution Generalization},\nauthor={Fengchun Qiao and Xi Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eP3vsbB5wW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3930246, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12113550055141062107&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "udel.edu;udel.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Delaware", "aff_unique_dep": "", "aff_unique_url": "https://www.udel.edu", "aff_unique_abbr": "UD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Few-shot Adaptation to Distribution Shifts By Mixing Source and Target Embeddings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33501", "id": "ePDnv4xESI", "proceeding": "https://proceedings.mlr.press/v235/xue24a.html", "pdf": "https://openreview.net/pdf?id=ePDnv4xESI", "openreview": "https://openreview.net/forum?id=ePDnv4xESI", "author_site": "Yihao Xue, Ali Payani, Yu Yang, Baharan Mirzasoleiman", "tldr": "", "abstract": "Pretrained machine learning models need to be adapted to distribution shifts when deployed in new target environments. When obtaining labeled data from the target distribution is expensive, few-shot adaptation with only a few examples from the target distribution becomes essential. In this work, we propose MixPro, a lightweight and highly data-efficient approach for few-shot adaptation. MixPro first generates a relatively large dataset by mixing (linearly combining) pre-trained embeddings of large source data with those of the few target examples. This process preserves important features of both source and target distributions, while mitigating the specific noise in the small target data. Then, it trains a linear classifier on the mixed embeddings to effectively adapt the model to the target distribution without overfitting the small target data. Theoretically, we demonstrate the advantages of MixPro over previous methods.
Our experiments, conducted across various model architectures on 8 datasets featuring different types of distribution shifts, reveal that MixPro can outperform baselines by as much as 7%, with only 2-4 target examples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihao Xue;Ali Payani;Yu Yang;Baharan Mirzasoleiman", "authorids": "~Yihao_Xue1;~Ali_Payani1;~Yu_Yang4;~Baharan_Mirzasoleiman1", "gender": ";M;F;F", "homepage": ";;https://sites.google.com/view/yuyang0901/home;http://web.cs.ucla.edu/~baharan/", "dblp": "271/2194;184/3921;16/4505-7;52/10075", "google_scholar": "vMHVm8MAAAAJ;9rHwD8wAAAAJ;KK6Yj4IAAAAJ;x63j7HEAAAAJ", "orcid": ";0000-0003-4054-2958;;", "linkedin": ";ali-payani-59267515;;", "or_profile": "~Yihao_Xue1;~Ali_Payani1;~Yu_Yang4;~Baharan_Mirzasoleiman1", "aff": "IBM, International Business Machines;Cisco;University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "us.ibm.com;cisco.com;ucla.edu;ucla.edu", "position": "Intern;Researcher;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxue2024fewshot,\ntitle={Few-shot Adaptation to Distribution Shifts By Mixing Source and Target Embeddings},\nauthor={Yihao Xue and Ali Payani and Yu Yang and Baharan Mirzasoleiman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ePDnv4xESI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 803902, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4271778378225898078&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "us.ibm.com;cisco.com;ucla.edu;ucla.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "International Business Machines;Cisco Systems;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ibm.com;https://www.cisco.com;https://www.ucla.edu", "aff_unique_abbr": "IBM;Cisco;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Decision Boundary based Out-of-Distribution Detector", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33500", "id": "eQaOb4r6YC", "proceeding": "https://proceedings.mlr.press/v235/liu24ax.html", "pdf": "https://openreview.net/pdf?id=eQaOb4r6YC", "openreview": "https://openreview.net/forum?id=eQaOb4r6YC", "author_site": "Litian Liu, Yao Qin", "tldr": "", "abstract": "Efficient and effective Out-of-Distribution (OOD) detection is essential for the safe deployment of AI systems. Existing feature space methods, while effective, often incur significant computational overhead due to their reliance on auxiliary models built from training features. In this paper, we propose a computationally-efficient OOD detector without using auxiliary models while still leveraging the rich information embedded in the feature space. Specifically, we detect OOD samples based on their feature distances to decision boundaries. To minimize computational cost, we introduce an efficient closed-form estimation, analytically proven to tightly lower bound the distance. Based on our estimation, we discover that In-Distribution (ID) features tend to be further from decision boundaries than OOD features. 
Additionally, ID and OOD samples are better separated when compared at equal deviation levels from the mean of training features. By regularizing the distances to decision boundaries based on feature deviation from the mean, we develop a hyperparameter-free, auxiliary model-free OOD detector. Our method matches or surpasses the effectiveness of state-of-the-art methods in extensive experiments while incurring negligible overhead in inference latency. Overall, our approach significantly improves the efficiency-effectiveness trade-off in OOD detection. Code is available at: https://github.com/litianliu/fDBD-OOD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Litian Liu;Yao Qin", "authorids": "~Litian_Liu1;~Yao_Qin1", "gender": "F;", "homepage": "https://litianliu.github.io/;https://yaoqin1.github.io", "dblp": ";66/10420-1", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works", "orcid": ";", "linkedin": ";", "or_profile": "~Litian_Liu1;~Yao_Qin1", "aff": "Qualcomm Inc, QualComm;Google", "aff_domain": "qti.qualcomm.com;google.com", "position": "Researcher;Researcher", "bibtex": "@inproceedings{\nliu2024fast,\ntitle={Fast Decision Boundary based Out-of-Distribution Detector},\nauthor={Litian Liu and Yao Qin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eQaOb4r6YC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2198134, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12294271328309147921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "qti.qualcomm.com;google.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Qualcomm Incorporated;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.qualcomm.com;https://www.google.com", "aff_unique_abbr": "Qualcomm;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Calibration Bottleneck: Over-compressed Representations are Less Calibratable", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33499", "id": "eRThYD9BGD", "proceeding": "https://proceedings.mlr.press/v235/wang24cm.html", "pdf": "https://openreview.net/pdf?id=eRThYD9BGD", "openreview": "https://openreview.net/forum?id=eRThYD9BGD", "author_site": "Deng-Bao Wang, Min-Ling Zhang", "tldr": "", "abstract": "Although deep neural networks have achieved remarkable success, they often exhibit a significant deficiency in reliable uncertainty calibration. This paper focuses on model calibratability, which assesses how amenable a model is to be well recalibrated post-hoc. We find that the widely used weight decay regularizer detrimentally affects model calibratability, subsequently leading to a decline in final calibration performance after post-hoc calibration. To identify the underlying causes leading to poor calibratability, we delve into the calibratability of intermediate features across the hidden layers. We observe a U-shaped trend in the calibratability of intermediate features from the bottom to the top layers, which indicates that over-compression of the top representation layers significantly hinders model calibratability.
Based on the observations, this paper introduces a weak classifier hypothesis, i.e., given a weak classification head that has not been over-trained, the representation module can be better learned to produce more calibratable features. Consequently, we propose a progressively layer-peeled training (PLP) method to exploit this hypothesis, thereby enhancing model calibratability. Our comparative experiments show the effectiveness of our method, which improves model calibration and also yields competitive predictive performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deng-Bao Wang;Min-Ling Zhang", "authorids": "~Deng-Bao_Wang1;~Min-Ling_Zhang2", "gender": "M;M", "homepage": "https://dengbaowang.github.io/;http://palm.seu.edu.cn/zhangml/", "dblp": "204/2255;84/271.html", "google_scholar": "QCA7j2cAAAAJ;uFHCIM0AAAAJ", "orcid": ";0000-0003-1880-5918", "linkedin": ";", "or_profile": "~Deng-Bao_Wang1;~Min-Ling_Zhang2", "aff": "Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nwang2024calibration,\ntitle={Calibration Bottleneck: Over-compressed Representations are Less Calibratable},\nauthor={Deng-Bao Wang and Min-Ling Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eRThYD9BGD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10199344, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15414488292460653855&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "seu.edu.cn;seu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Graph-enhanced Large Language Models in Asynchronous Plan Reasoning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33498", "id": "eVGpdivOnQ", "proceeding": "https://proceedings.mlr.press/v235/lin24k.html", "pdf": "https://openreview.net/pdf?id=eVGpdivOnQ", "openreview": "https://openreview.net/forum?id=eVGpdivOnQ", "author_site": "Fangru Lin, Emanuele La Malfa, Valentin Hofmann, Elle Michelle Yang, Anthony Cohn, Janet Pierrehumbert", "tldr": "", "abstract": "Planning is a fundamental property of human intelligence. Reasoning about asynchronous plans is challenging since it requires sequential and parallel planning to optimize time costs. Can large language models (LLMs) succeed at this task? Here, we present the first large-scale study investigating this question. We find that a representative set of closed and open-source LLMs, including GPT-4 and LLaMA-2, behave poorly when not supplied with illustrations about the task-solving process in our benchmark AsyncHow. We propose a novel technique called *Plan Like a Graph* (PLaG) that combines graphs with natural language prompts and achieves state-of-the-art results. We show that although PLaG can boost model performance, LLMs still suffer from drastic degradation when task complexity increases, highlighting the limits of utilizing LLMs for simulating digital devices. We see our study as an exciting step towards using LLMs as efficient autonomous agents. 
Our code and data are available at https://github.com/fangru-lin/graph-llm-asynchow-plan.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fangru Lin;Emanuele La Malfa;Valentin Hofmann;Elle Michelle Yang;Anthony G. Cohn;Janet B. Pierrehumbert", "authorids": "~Fangru_Lin1;~Emanuele_La_Malfa2;~Valentin_Hofmann1;~Elle_Michelle_Yang1;~Anthony_G._Cohn2;~Janet_B._Pierrehumbert1", "gender": "F;M;;F;F;M", "homepage": "http://fangru-lin.github.io;https://emanuelelm.github.io/;https://valentinhofmann.github.io/;https://www.elleismatic.com;https://eng.ox.ac.uk/people/janet-pierrehumbert/;https://eps.leeds.ac.uk/computing/staff/76/professor-anthony-tony-g-cohn-freng-flsw-ceng-citp", "dblp": "367/2200;276/0274;264/4665;367/9336;60/5814;c/AnthonyGCohn.html", "google_scholar": "LFS5dV0AAAAJ;4_91m08AAAAJ;bbHOPKwAAAAJ;T3HITrgAAAAJ;ebzKiiwAAAAJ;tal4mMkAAAAJ", "orcid": "0009-0003-6454-172X;0000-0002-6254-0470;;;0000-0002-5989-3574;0000-0002-7652-8907", "linkedin": "fangru-lin-oxford;;;elleyang;;tonycohn/?originalSubdomain=uk", "or_profile": "~Fangru_Lin1;~Emanuele_La_Malfa2;~Valentin_Hofmann1;~Elle_Michelle_Yang1;~Janet_B._Pierrehumbert1;~Anthony_Cohn1", "aff": "Microsoft;Department of Computer Science, University of Oxford;Allen Institute for Artificial Intelligence;University of Oxford;University of Oxford;University of Leeds", "aff_domain": "microsoft.com;cs.ox.ac.uk;allenai.org;cs.ox.ac.uk;ox.ac.uk;leeds.ac.uk", "position": "Intern;Postdoc;Postdoc;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlin2024graphenhanced,\ntitle={Graph-enhanced Large Language Models in Asynchronous Plan Reasoning},\nauthor={Fangru Lin and Emanuele La Malfa and Valentin Hofmann and Elle Michelle Yang and Anthony G. Cohn and Janet B. Pierrehumbert},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eVGpdivOnQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2629223, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9602688918856095301&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "microsoft.com;cs.ox.ac.uk;allenai.org;cs.ox.ac.uk;ox.ac.uk;leeds.ac.uk", "author_num": 6, "aff_unique_index": "0;1;2;1;1;3", "aff_unique_norm": "Microsoft;University of Oxford;Allen Institute for Artificial Intelligence;University of Leeds", "aff_unique_dep": "Microsoft Corporation;Department of Computer Science;;", "aff_unique_url": "https://www.microsoft.com;https://www.ox.ac.uk;https://allenai.org;https://www.leeds.ac.uk", "aff_unique_abbr": "Microsoft;Oxford;AI2;Leeds", "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;1;0;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "StrokeNUWA\u2014Tokenizing Strokes for Vector Graphic Synthesis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33497", "id": "eVlx8DaG9h", "proceeding": "https://proceedings.mlr.press/v235/tang24h.html", "pdf": "https://openreview.net/pdf?id=eVlx8DaG9h", "openreview": "https://openreview.net/forum?id=eVlx8DaG9h", "author_site": "Zecheng Tang, Chenfei Wu, Zekai Zhang, Minheng Ni, Shengming Yin, Yu Liu, Zhengyuan Yang, Lijuan Wang, Zicheng Liu, Juntao Li, Nan Duan", "tldr": "", "abstract": "To leverage LLMs for visual synthesis, traditional methods convert raster image information into discrete grid tokens through 
specialized visual modules, thereby disrupting the model\u2019s ability to capture the true semantic representation of visual scenes. This paper posits that an alternative representation of images, vector graphics, can effectively surmount this limitation by enabling a more natural and semantically coherent segmentation of the image information. Thus, we introduce StrokeNUWA, a pioneering work exploring a better visual representation, \"stroke\" tokens, on vector graphics, which are inherently rich in visual semantics, naturally compatible with LLMs, and highly compressed. Equipped with stroke tokens, StrokeNUWA can significantly surpass traditional LLM-based and optimization-based methods across various metrics in the vector graphic generation task. Moreover, StrokeNUWA achieves up to a $94\\times$ inference speedup over prior methods, with an exceptional SVG code compression ratio of 6.9%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zecheng Tang;Chenfei Wu;Zekai Zhang;Minheng Ni;Shengming Yin;Yu Liu;Zhengyuan Yang;Lijuan Wang;Zicheng Liu;Juntao Li;Nan Duan", "authorids": "~Zecheng_Tang1;~Chenfei_Wu2;~Zekai_Zhang4;~Minheng_Ni1;~Shengming_Yin1;yluiu@microsoft.com;~Zhengyuan_Yang1;~Lijuan_Wang1;~Zicheng_Liu1;~Juntao_Li2;~Nan_Duan1", "gender": "M;M;M;M;M;;M;F;M;M;M", "homepage": "https://zetangforward.github.io/;;;https://kodenii.github.io;https://shengming-yin.github.io/;;http://zhengyuan.info/;https://www.microsoft.com/en-us/research/people/lijuanw/;https://sites.google.com/view/zichengliu/home?pli=1;https://lijuntaopku.github.io/;https://nanduan.github.io/", "dblp": "326/0272;;;263/9969;340/8237;;163/9713;51/2527.html;l/ZichengLiu;;", "google_scholar": "HUDkBMUAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;-ybr4_cAAAAJ;rzaiNqIAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;cDcWXuIAAAAJ;bkALdvsAAAAJ;sZSygsYAAAAJ;Qaa6OxIAAAAJ", "orcid": "0009-0000-0075-8282;;;;;;;;0000-0001-5894-7828;0000-0002-6286-7529;", "linkedin": "%E6%B3%BD%E6%88%90-%E6%B1%A4-aa6333285/;;zekai-zhang-65924526a/;https://linkedin.com/in/minheng-ni-7b8a99146;shengming-yin-098490259/;;;;;;", "or_profile": "~Zecheng_Tang1;~Chenfei_Wu2;~Zekai_Zhang4;~Minheng_Ni1;~Shengming_Yin1;yluiu@microsoft.com;~Zhengyuan_Yang1;~Lijuan_Wang1;~Zicheng_Liu1;~Juntao_Li2;~Nan_Duan1", "aff": "Soochow University;Microsoft;Peking University;Microsoft;University of Science and Technology of China;;Microsoft;Microsoft;Microsoft;Soochow University, China;Microsoft Research Asia", "aff_domain": "suda.edu.cn;microsoft.com;pku.edu.cn;microsoft.com;ustc.edu.cn;;microsoft.com;microsoft.com;microsoft.com;suda.edu.cn;microsoft.com", "position": "PhD student;Researcher;PhD student;Research Intern;MS student;;Researcher;Principal Researcher;partner research manager;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\ntang2024strokenuwatokenizing,\ntitle={Stroke{NUWA}{\\textemdash}Tokenizing Strokes for Vector Graphic Synthesis},\nauthor={Zecheng Tang and Chenfei Wu and Zekai Zhang and Minheng Ni and Shengming Yin and Yu Liu and Zhengyuan Yang and Lijuan Wang and Zicheng Liu and Juntao Li and Nan Duan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eVlx8DaG9h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6479278, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 12, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=16931218216543252661&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "suda.edu.cn;microsoft.com;pku.edu.cn;microsoft.com;ustc.edu.cn;;microsoft.com;microsoft.com;microsoft.com;suda.edu.cn;microsoft.com", "author_num": 11, "aff_unique_index": "0;1;2;1;3;1;1;1;0;1", "aff_unique_norm": "Soochow University;Microsoft;Peking University;University of Science and Technology of China", "aff_unique_dep": ";Microsoft Corporation;;", "aff_unique_url": "https://www.soochow.edu.cn;https://www.microsoft.com;http://www.pku.edu.cn;http://www.ustc.edu.cn", "aff_unique_abbr": "Soochow U;Microsoft;Peking U;USTC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;1;0;1;1;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Novel Spectral Algorithms for the Partial Credit Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33496", "id": "eW0pZmziBH", "proceeding": "https://proceedings.mlr.press/v235/nguyen24k.html", "pdf": "https://openreview.net/pdf?id=eW0pZmziBH", "openreview": "https://openreview.net/forum?id=eW0pZmziBH", "author_site": "Duc Nguyen, Anderson Zhang", "tldr": "", "abstract": "The Partial Credit Model (PCM) of Andrich (1978) and Masters (1982) is a fundamental model within the psychometric literature with wide-ranging modern applications. It models the integer-valued response that a subject gives to an item where there is a natural notion of monotonic progress between consecutive response values, such as partial scores on a test and customer ratings of a product. In this paper, we introduce a novel, time-efficient and accurate statistical spectral algorithm for inference under the PCM model. We complement our algorithmic contribution with in-depth non-asymptotic statistical analysis, the first of its kind in the literature. We show that the spectral algorithm enjoys the optimal error guarantee under three different metrics, all under reasonable sampling assumptions. We leverage the efficiency of the spectral algorithm to propose a novel EM-based algorithm for learning mixtures of PCMs. We perform comprehensive experiments on synthetic and real-life datasets covering education testing, recommendation systems, and financial investment applications. 
We show that the proposed spectral algorithm is competitive with previously introduced algorithms in terms of accuracy while being orders of magnitude faster.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Duc Nguyen;Anderson Ye Zhang", "authorids": "~Duc_Nguyen3;~Anderson_Ye_Zhang1", "gender": "M;", "homepage": "https://dnguyen1196.github.io/;", "dblp": ";", "google_scholar": "ELbDvOsAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Duc_Nguyen3;~Anderson_Ye_Zhang1", "aff": "University of Pennsylvania;", "aff_domain": "seas.upenn.edu;", "position": "PhD student;", "bibtex": "@inproceedings{\nnguyen2024novel,\ntitle={Novel Spectral Algorithms for the Partial Credit Model},\nauthor={Duc Nguyen and Anderson Ye Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eW0pZmziBH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 574787, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:E5pI4PsBrfcJ:scholar.google.com/&scioq=Novel+Spectral+Algorithms+for+the+Partial+Credit+Model&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "seas.upenn.edu;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Towards Theoretical Understanding of Learning Large-scale Dependent Data via Random Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33495", "id": "eY4jrFe6Qc", "proceeding": "https://proceedings.mlr.press/v235/wang24e.html", "pdf": "https://openreview.net/pdf?id=eY4jrFe6Qc", "openreview": "https://openreview.net/forum?id=eY4jrFe6Qc", "author_site": "Chao Wang, Xin Bing, Xin HE, Caixing Wang", "tldr": "", "abstract": "Random feature (RF) mapping is an attractive and powerful technique for solving large-scale nonparametric regression. Yet, the existing theoretical analysis crucially relies on the i.i.d. assumption that individuals in the data are independent and identically distributed. It is still unclear whether learning accuracy would be compromised when the i.i.d. assumption is violated. This paper aims to provide theoretical understanding of the kernel ridge regression (KRR) with RFs for large-scale dependent data. Specifically, we consider two types of data dependence structure, namely, the $\\tau$-mixing process with exponential decay coefficient, and that with polynomial decay coefficient. Theoretically, we prove that the kernel ridge estimator with RFs achieves the minimax optimality under the exponential decay scenario, but yields a sub-optimal result under the polynomial decay case. Our analysis further reveals how the decay rate of the $\\tau$-mixing coefficient impacts the learning accuracy of the kernel ridge estimator with RFs. 
Extensive numerical experiments on both synthetic and real examples further validate our theoretical findings and support the effectiveness of the KRR with RFs in dealing with dependent data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chao Wang;Xin Bing;Xin HE;Caixing Wang", "authorids": "~Chao_Wang39;~Xin_Bing1;~Xin_HE6;~Caixing_Wang1", "gender": "M;;M;M", "homepage": "https://sites.coecis.cornell.edu/xinbing/;https://sites.google.com/view/guoqinghe;http://wangcaixing96.com/;https://github.com/wangchao-afk", "dblp": ";;;", "google_scholar": ";aduqO4EAAAAJ;SLEH6XYAAAAJ;", "orcid": ";;0009-0009-3068-6094;", "linkedin": ";;;", "or_profile": "~Xin_Bing1;~Xin_HE6;~Wang_Caixing1;~Wang_Chao2", "aff": "University of Toronto;Shanghai University of Finance and Economics;Shanghai University of Finance and Economics;Shanghai University of Finance and Economics", "aff_domain": "utoronto.ca;shufe.edu;shufe.edu.cn;sufe.edu", "position": "Assistant Professor;Associate Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nwang2024towards,\ntitle={Towards Theoretical Understanding of Learning Large-scale Dependent Data via Random Features},\nauthor={Chao Wang and Xin Bing and Xin HE and Caixing Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eY4jrFe6Qc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 604003, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4EoEU0WxbIoJ:scholar.google.com/&scioq=Towards+Theoretical+Understanding+of+Learning+Large-scale+Dependent+Data+via+Random+Features&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "utoronto.ca;shufe.edu;shufe.edu.cn;sufe.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Toronto;Shanghai University of Finance and Economics", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;http://www.sufe.edu.cn", "aff_unique_abbr": "U of T;SUFE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;China" }, { "title": "Learning-Rate-Free Stochastic Optimization over Riemannian Manifolds", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33494", "id": "eY98MVffrD", "proceeding": "https://proceedings.mlr.press/v235/dodd24a.html", "pdf": "https://openreview.net/pdf?id=eY98MVffrD", "openreview": "https://openreview.net/forum?id=eY98MVffrD", "author_site": "Daniel Dodd, Louis Sharrock, Chris Nemeth", "tldr": "", "abstract": "In recent years, interest in gradient-based optimization over Riemannian manifolds has surged. However, a significant challenge lies in the reliance on hyperparameters, especially the learning rate, which requires meticulous tuning by practitioners to ensure convergence at a suitable rate. In this work, we introduce innovative learning-rate-free algorithms for stochastic optimization over Riemannian manifolds, eliminating the need for hand-tuning and providing a more robust and user-friendly approach. We establish high probability convergence guarantees that are optimal, up to logarithmic factors, compared to the best-known optimally tuned rate in the deterministic setting. 
Our approach is validated through numerical experiments, demonstrating competitive performance against learning-rate-dependent algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Dodd;Louis Sharrock;Christopher Nemeth", "authorids": "~Daniel_Dodd1;~Louis_Sharrock1;~Christopher_Nemeth1", "gender": ";M;M", "homepage": "https://github.com/daniel-dodd;https://louissharrock.github.io/;http://www.lancs.ac.uk/~nemeth/", "dblp": "325/6223;304/5319;88/10513", "google_scholar": "https://scholar.google.com/citations?hl=en;O0xSdYcAAAAJ;https://scholar.google.co.uk/citations?user=17-Ze24AAAAJ", "orcid": ";0000-0003-1691-1215;0000-0002-9084-3866", "linkedin": ";louissharrock/;christopher-nemeth-815963233/", "or_profile": "~Daniel_Dodd1;~Louis_Sharrock1;~Christopher_Nemeth1", "aff": "Lancaster University;Lancaster University;Lancaster University", "aff_domain": "lancaster.ac.uk;lancaster.ac.uk;lancaster.ac.uk", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\ndodd2024learningratefree,\ntitle={Learning-Rate-Free Stochastic Optimization over Riemannian Manifolds},\nauthor={Daniel Dodd and Louis Sharrock and Christopher Nemeth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eY98MVffrD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8639882, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A_QTUr9qOzQJ:scholar.google.com/&scioq=Learning-Rate-Free+Stochastic+Optimization+over+Riemannian+Manifolds&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "lancaster.ac.uk;lancaster.ac.uk;lancaster.ac.uk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Lancaster University", "aff_unique_dep": "", "aff_unique_url": "https://www.lancaster.ac.uk", "aff_unique_abbr": "Lancaster", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Optimal Hessian/Jacobian-Free Nonconvex-PL Bilevel Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33493", "id": "eZiQWM5U0E", "proceeding": "https://proceedings.mlr.press/v235/huang24a.html", "pdf": "https://openreview.net/pdf?id=eZiQWM5U0E", "openreview": "https://openreview.net/forum?id=eZiQWM5U0E", "tldr": "", "abstract": "Bilevel optimization is widely applied in many machine learning tasks such as hyper-parameter learning, meta learning and reinforcement learning. Although many algorithms have recently been developed to solve bilevel optimization problems, they generally rely on (strongly) convex lower-level problems. More recently, some methods have been proposed to solve nonconvex-PL bilevel optimization problems, whose upper-level problems are possibly nonconvex and whose lower-level problems are also possibly nonconvex while satisfying the Polyak-\u0141ojasiewicz (PL) condition. However, these methods still have a high convergence complexity or a high computation complexity, such as requiring the computation of expensive Hessian/Jacobian matrices and their inverses. Thus, in this paper, we propose an efficient Hessian/Jacobian-free method (i.e., HJFBiO) with the optimal convergence complexity to solve nonconvex-PL bilevel problems. 
Theoretically, under some mild conditions, we prove that our HJFBiO method obtains an optimal convergence rate of $O(\\frac{1}{T})$, where $T$ denotes the number of iterations, and has an optimal gradient complexity of $O(\\epsilon^{-1})$ in finding an $\\epsilon$-stationary solution. We conduct numerical experiments on the bilevel PL game and the hyper-representation learning task to demonstrate the efficiency of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Feihu Huang", "authorids": "~Feihu_Huang1", "gender": "M", "homepage": "", "dblp": "169/6247", "google_scholar": "tRQwlHUAAAAJ", "orcid": "0000-0003-0806-6074", "linkedin": "", "or_profile": "~Feihu_Huang1", "aff": "Nanjing University of Aeronautics and Astronautics", "aff_domain": "nuaa.edu.cn", "position": "Full Professor", "bibtex": "@inproceedings{\nhuang2024optimal,\ntitle={Optimal Hessian/Jacobian-Free Nonconvex-{PL} Bilevel Optimization},\nauthor={Feihu Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eZiQWM5U0E}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 770392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5496986745197515186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nuaa.edu.cn", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics", "aff_unique_dep": "", "aff_unique_url": "http://www.nuaa.edu.cn", "aff_unique_abbr": "NUAA", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Position: Leverage Foundational Models for Black-Box Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33492", "id": "ea2MgKn3sV", "proceeding": "https://proceedings.mlr.press/v235/song24h.html", "pdf": "https://openreview.net/pdf?id=ea2MgKn3sV", "openreview": "https://openreview.net/forum?id=ea2MgKn3sV", "author_site": "Xingyou Song, Yingtao Tian, Robert Lange, Chansoo Lee, Yujin Tang, Yutian Chen", "tldr": "", "abstract": "Undeniably, Large Language Models (LLMs) have stirred an extraordinary wave of innovation in the machine learning research domain, resulting in substantial impact across diverse fields such as reinforcement learning, robotics, and computer vision. Their incorporation has been rapid and transformative, marking a significant paradigm shift in the field of machine learning research. However, the field of experimental design, grounded on black-box optimization, has been much less affected by such a paradigm shift, even though integrating LLMs with optimization presents a unique landscape ripe for exploration. In this position paper, we frame the field of black-box optimization around sequence-based foundation models and organize their relationship with previous literature. 
We discuss the most promising ways foundational language models can revolutionize optimization, which include harnessing the vast wealth of information encapsulated in free-form text to enrich task comprehension, utilizing highly flexible sequence models such as Transformers to engineer superior optimization strategies, and enhancing performance prediction over previously unseen search spaces.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingyou Song;Yingtao Tian;Robert Tjarko Lange;Chansoo Lee;Yujin Tang;Yutian Chen", "authorids": "~Xingyou_Song1;~Yingtao_Tian1;~Robert_Tjarko_Lange1;~Chansoo_Lee1;~Yujin_Tang1;~Yutian_Chen1", "gender": "M;;;M;M;", "homepage": "https://xingyousong.github.io/;https://alantian.net/;https://roberttlange.github.io/;;;http://yutianchen.com/", "dblp": "211/7623;180/5335;245/9152;137/3219;190/1177.html;95/7441-1", "google_scholar": "GnpHmO8AAAAJ;17Fe5K0AAAAJ;https://scholar.google.es/citations?user=cTrc3x4AAAAJ;;https://scholar.google.co.jp/citations?user=3czUzRYAAAAJ;fAWKizAAAAAJ", "orcid": ";;;;;", "linkedin": "xingyou-song-355629a1/;;;;;", "or_profile": "~Xingyou_Song1;~Yingtao_Tian1;~Robert_Tjarko_Lange1;~Chansoo_Lee1;~Yujin_Tang1;~Yutian_Chen1", "aff": "Google DeepMind;Google;TU Berlin;Google;Sakana AI;Google DeepMind", "aff_domain": "google.com;google.com;tu-berlin.de;google.com;sakana.ai;google.com", "position": "Senior Research Scientist;Research Scientist;PhD student;Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\nsong2024position,\ntitle={Position: Leverage Foundational Models for Black-Box Optimization},\nauthor={Xingyou Song and Yingtao Tian and Robert Tjarko Lange and Chansoo Lee and Yujin Tang and Yutian Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ea2MgKn3sV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1157526, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9893123578640149682&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "google.com;google.com;tu-berlin.de;google.com;sakana.ai;google.com", "author_num": 6, "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Google;Technische Universit\u00e4t Berlin;Sakana AI", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.tu-berlin.de;", "aff_unique_abbr": "DeepMind;TU Berlin;", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Mountain View;Berlin", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "United Kingdom;United States;Germany;" }, { "title": "Learning Adaptive and View-Invariant Vision Transformer for Real-Time UAV Tracking", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33491", "id": "eaNLvrP8n1", "proceeding": "https://proceedings.mlr.press/v235/li24ax.html", "pdf": "https://openreview.net/pdf?id=eaNLvrP8n1", "openreview": "https://openreview.net/forum?id=eaNLvrP8n1", "author_site": "Yongxin Li, Mengyuan Liu, You Wu, Xucheng Wang, Xiangyang Yang, Shuiwang Li", "tldr": "", "abstract": "Harnessing transformer-based models, visual tracking has made substantial strides. However, the sluggish performance of current trackers limits their practicality on devices with constrained computational capabilities, especially for real-time unmanned aerial vehicle (UAV) tracking. 
Addressing this challenge, in this work we introduce AVTrack, an adaptive computation framework tailored to selectively activate transformer blocks for real-time UAV tracking. Our novel Activation Module (AM) dynamically optimizes the ViT architecture, selectively engaging relevant components and enhancing inference efficiency without significantly compromising tracking performance. Moreover, we bolster the effectiveness of ViTs, particularly in addressing challenges arising from extreme changes in viewing angles commonly encountered in UAV tracking, by learning view-invariant representations through mutual information maximization. Extensive experiments on five tracking benchmarks affirm the effectiveness and versatility of our approach, positioning it as a state-of-the-art solution in visual tracking. Code is released at: https://github.com/wuyou3474/AVTrack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongxin Li;Mengyuan Liu;You Wu;Xucheng Wang;Xiangyang Yang;Shuiwang Li", "authorids": "~Yongxin_Li1;~Mengyuan_Liu3;~You_Wu8;~Xucheng_Wang1;~Xiangyang_Yang1;~Shuiwang_Li1", "gender": "M;M;M;M;M;M", "homepage": "https://github.com/lyx-hush;;;;https://github.com/xyyang317;", "dblp": ";;;323/8501;;160/6992.html", "google_scholar": ";a8isW34AAAAJ;;;;", "orcid": ";;0009-0001-4230-899X;;;0000-0002-4587-513X", "linkedin": ";;;;;", "or_profile": "~Yongxin_Li1;~Mengyuan_Liu3;~You_Wu8;~Xucheng_Wang1;~Xiangyang_Yang1;~Shuiwang_Li1", "aff": "Guilin University of Technology;Guilin University of Technology;Guilin University of Technology;Guilin University of Technology;Guilin University of Technology;Guilin University of Technology", "aff_domain": "glut.edu.cn;glut.edu.cn;glut.edu.cn;glut.edu.cn;glut.edu.cn;glut.edu.cn", "position": "MS student;bachelor;MS student;Undergrad student;MS student;Associate Professor", "bibtex": "@inproceedings{\nli2024learning,\ntitle={Learning Adaptive and View-Invariant Vision Transformer for Real-Time {UAV} Tracking},\nauthor={Yongxin Li and Mengyuan Liu and You Wu and Xucheng Wang and Xiangyang Yang and Shuiwang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eaNLvrP8n1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9538156, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3228123845403440027&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "glut.edu.cn;glut.edu.cn;glut.edu.cn;glut.edu.cn;glut.edu.cn;glut.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Guilin University of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.gut.edu.cn", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Leveraging Attractor Dynamics in Spatial Navigation for Better Language Parsing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33490", "id": "eapFRURALQ", "proceeding": "https://proceedings.mlr.press/v235/zou24a.html", "pdf": "https://openreview.net/pdf?id=eapFRURALQ", "openreview": "https://openreview.net/forum?id=eapFRURALQ", "author_site": "Xiaolong Zou, Xingxing Cao, Xiaojiao Yang, Bo Hong", "tldr": "", "abstract": "Increasing experimental evidence suggests that the human hippocampus, evolutionarily shaped by spatial navigation tasks, also plays an 
important role in language comprehension, indicating a shared computational mechanism for both functions. However, the specific relationship between the hippocampal formation's computational mechanism in spatial navigation and its role in language processing remains elusive. To investigate this question, we develop a prefrontal-hippocampal-entorhinal model (which we call PHE-trinity) that features two key aspects: 1) the use of a modular continuous attractor neural network to represent syntactic structure, akin to the grid network in the entorhinal cortex; 2) the creation of two separate input streams, mirroring the factorized structure-content representation found in the hippocampal formation. We evaluate our model on language command parsing tasks, specifically using the SCAN dataset. Our findings include: 1) attractor dynamics can facilitate systematic generalization and efficient learning from limited data; 2) through visualization and reverse engineering, we unravel a potential dynamic mechanism by which the grid network represents syntactic structure. Our research takes an initial step in uncovering the dynamic mechanism shared by spatial navigation and language information processing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaolong Zou;Xingxing Cao;Xiaojiao Yang;Bo Hong", "authorids": "~Xiaolong_Zou1;~Xingxing_Cao1;~Xiaojiao_Yang1;~Bo_Hong2", "gender": "M;;F;M", "homepage": "https://www.researchgate.net/profile/Ben_Zou2?ev=hdr_xprf&_sg=4_qbmTPFzK47T60qCvL9GWs71qZNjrkYWpvY4BV6W595esmy6xa1AXjbRoS2P3fk-Tb_-1RYvjTh_jz-ZgYhLfn-;;;http://neuro.med.tsinghua.edu.cn", "dblp": "135/8911;;;30/6939", "google_scholar": ";https://scholar.google.com/citations?hl=en;;qSIysB4AAAAJ", "orcid": ";0009-0001-5153-3734;0009-0009-9855-4397;0000-0003-2900-6791", "linkedin": ";;;", "or_profile": "~Xiaolong_Zou1;~Xingxing_Cao1;~Xiaojiao_Yang1;~Bo_Hong2", "aff": ";;;Department of Biomedical Engineering, Tsinghua University", "aff_domain": ";;;tsinghua.edu.cn", "position": ";;;Full Professor", "bibtex": "@inproceedings{\nzou2024leveraging,\ntitle={Leveraging Attractor Dynamics in Spatial Navigation for Better Language Parsing},\nauthor={Xiaolong Zou and Xingxing Cao and Xiaojiao Yang and Bo Hong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eapFRURALQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9397824, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GkomD8jYRTsJ:scholar.google.com/&scioq=Leveraging+Attractor+Dynamics+in+Spatial+Navigation+for+Better+Language+Parsing&hl=en&as_sdt=0,47", "gs_version_total": 3, "email": ";;;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Biomedical Engineering", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Harnessing Hierarchical Label Distribution Variations in Test Agnostic Long-tail Recognition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33489", "id": "ebt5BfRHcW", "proceeding": "https://proceedings.mlr.press/v235/yang24af.html", "pdf": "https://openreview.net/pdf?id=ebt5BfRHcW", "openreview": "https://openreview.net/forum?id=ebt5BfRHcW", "author_site": "Zhiyong Yang, Qianqian Xu, Zitai Wang, Sicong Li, Boyu Han, 
Shilong Bao, Xiaochun Cao, Qingming Huang", "tldr": "", "abstract": "This paper explores test-agnostic long-tail recognition, a challenging long-tail task where the test label distributions are unknown and arbitrarily imbalanced. We argue that the variation in these distributions can be broken down hierarchically into global and local levels. The global ones reflect a broad range of diversity, while the local ones typically arise from milder changes, often focused on a particular neighborhood. Traditional methods predominantly use a Mixture-of-Expert (MoE) approach, targeting a few fixed test label distributions that exhibit substantial global variations. However, the local variations are left unconsidered. To address this issue, we propose a new MoE strategy, $\\mathsf{DirMixE}$, which assigns experts to different Dirichlet meta-distributions of the label distribution, each targeting a specific aspect of local variations. Additionally, the diversity among these Dirichlet meta-distributions inherently captures global variations. This dual-level approach also leads to a more stable objective function, allowing us to better sample different test distributions in order to quantify the mean and variance of performance outcomes. Theoretically, we show that our proposed objective benefits from enhanced generalization by virtue of the variance-based regularization. Comprehensive experiments across multiple benchmarks confirm the effectiveness of $\\mathsf{DirMixE}$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyong Yang;Qianqian Xu;Zitai Wang;Sicong Li;Boyu Han;Shilong Bao;Xiaochun Cao;Qingming Huang", "authorids": "~Zhiyong_Yang1;~Qianqian_Xu2;~Zitai_Wang1;~Sicong_Li2;~Boyu_Han1;~Shilong_Bao1;~Xiaochun_Cao3;~Qingming_Huang1", "gender": "M;F;M;M;;M;M;", "homepage": "https://joshuaas.github.io/;http://vipl.ict.ac.cn/people/~qianqianxu;https://wang22ti.github.io;https://github.com/scongl;;https://statusrank.github.io/;https://scst.sysu.edu.cn/members/caoxiaochun.htm;", "dblp": "01/452-1.html;07/7627;251/3361;;;143/0246;39/3695;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=MjifS2MAAAAJ;45qZ_LcAAAAJ;;;https://scholar.google.com.hk/citations?user=5ZCgkQkAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0002-4409-4999;;0000-0003-4156-6417;;;;0000-0001-7141-708X;", "linkedin": ";;;;;;;", "or_profile": "~Zhiyong_Yang1;~Qianqian_Xu2;~Zitai_Wang1;~Sicong_Li2;~Boyu_Han1;~Shilong_Bao1;~Xiaochun_Cao3;~Qingming_Huang1", "aff": "University of Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;University of Chinese Academy of Sciences;Beihang University;;University of Chinese Academy of Sciences;SUN YAT-SEN UNIVERSITY;", "aff_domain": "ucas.ac.cb;ict.ac.cn;ucas.ac.cn;buaa.edu;;ucas.ac.cn;sysu.edu.cn;", "position": "Associate Professor;Full Professor;PhD student;Undergrad student;;PhD student;Full Professor;", "bibtex": "@inproceedings{\nyang2024harnessing,\ntitle={Harnessing Hierarchical Label Distribution Variations in Test Agnostic Long-tail Recognition},\nauthor={Zhiyong Yang and Qianqian Xu and Zitai Wang and Sicong Li and Boyu Han and Shilong Bao and Xiaochun Cao and Qingming Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ebt5BfRHcW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2623698, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, 
"gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12565567388009853686&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ucas.ac.cb;ict.ac.cn;ucas.ac.cn;buaa.edu;;ucas.ac.cn;sysu.edu.cn;", "author_num": 8, "aff_unique_index": "0;1;0;2;0;3", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;Beihang University;Sun Yat-sen University", "aff_unique_dep": ";Institute of Computing Technology;;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.ac.cn;http://www.buaa.edu.cn/;http://www.sysu.edu.cn", "aff_unique_abbr": "UCAS;CAS;BUAA;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "MF-CLR: Multi-Frequency Contrastive Learning Representation for Time Series", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33488", "id": "ecO7WOIlMD", "proceeding": "https://proceedings.mlr.press/v235/duan24b.html", "pdf": "https://openreview.net/pdf?id=ecO7WOIlMD", "openreview": "https://openreview.net/forum?id=ecO7WOIlMD", "author_site": "Jufang Duan, Wei Zheng, Yangzhou Du, Wenfa Wu, Haipeng Jiang, Hongsheng Qi", "tldr": "", "abstract": "Learning a decent representation from unlabeled time series is a challenging task, especially when the time series data is derived from diverse channels at different sampling rates. Our motivation stems from the financial domain, where sparsely labeled covariates are commonly collected at different frequencies, *e.g.*, daily stock market index, monthly unemployment rate and quarterly net revenue of a certain listed corporation. This paper presents **M**ulti-**F**requency **C**ontrastive **L**earning **R**epresentation (MF-CLR), aimed at learning a good representation of multi-frequency time series in a self-supervised paradigm by leveraging the ability of contrastive learning. MF-CLR introduces a hierarchical mechanism that spans across different frequencies along the feature dimension. Within each contrastive block, two groups of subseries with adjacent frequencies are embedded based on our proposed cross-frequency consistency. To validate the effectiveness of MF-CLR, we conduct extensive experiments on five downstream tasks, including long-term and short-term forecasting, classification, anomaly detection and imputation. 
Experimental evidence shows that MF-CLR delivers a leading performance in all the downstream tasks and keeps consistent performance across different target dataset scales in the transfer learning scenario.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jufang Duan;Wei Zheng;Yangzhou Du;Wenfa Wu;Haipeng Jiang;Hongsheng Qi", "authorids": "~Jufang_Duan2;~Wei_Zheng9;~Yangzhou_Du2;~Wenfa_Wu1;~Haipeng_Jiang1;~Hongsheng_Qi1", "gender": "M;F;M;M;Not Specified;M", "homepage": ";https://github.com/ZhengWr;;;https://www.lenovo.com.cn/;https://lenovo.com", "dblp": ";;40/951.html;;;", "google_scholar": ";;;;;", "orcid": "0000-0003-1825-7982;0000-0002-5881-0140;;0000-0002-4178-7378;0009-0004-5643-8439;", "linkedin": ";;;;;hong-sheng-qi-a542151/", "or_profile": "~Jufang_Duan2;~Wei_Zheng9;~Yangzhou_Du2;~Wenfa_Wu1;~Haipeng_Jiang1;~Hongsheng_Qi1", "aff": "Lenovo;Lenovo;Lenovo;Lenovo Research;;Xi'an University of Electronic Science and Technology", "aff_domain": "lenovo.com;lenovo.com;lenovo.com;lenovo.com;;xidian.edu.cn", "position": "Researcher;Researcher;Researcher;Researcher;;Lecturer", "bibtex": "@inproceedings{\nduan2024mfclr,\ntitle={{MF}-{CLR}: Multi-Frequency Contrastive Learning Representation for Time Series},\nauthor={Jufang Duan and Wei Zheng and Yangzhou Du and Wenfa Wu and Haipeng Jiang and Hongsheng Qi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ecO7WOIlMD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1245328, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dyy--XLKqxEJ:scholar.google.com/&scioq=MF-CLR:+Multi-Frequency+Contrastive+Learning+Representation+for+Time+Series&hl=en&as_sdt=0,14", "gs_version_total": 4, "email": "lenovo.com;lenovo.com;lenovo.com;lenovo.com;;xidian.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Lenovo Group Limited;Lenovo;Xi'an University of Electronic Science and Technology", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.lenovo.com;https://www.lenovo.com;http://www.xidian.edu.cn/", "aff_unique_abbr": "Lenovo;Lenovo;Xidian University", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Speech Self-Supervised Learning Using Diffusion Model Synthetic Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33487", "id": "ecnpYYHjt9", "proceeding": "https://proceedings.mlr.press/v235/gao24j.html", "pdf": "https://openreview.net/pdf?id=ecnpYYHjt9", "openreview": "https://openreview.net/forum?id=ecnpYYHjt9", "author_site": "Heting Gao, Kaizhi Qian, Junrui Ni, Chuang Gan, Mark Hasegawa-Johnson, Shiyu Chang, Yang Zhang", "tldr": "", "abstract": "While self-supervised learning (SSL) in speech has greatly reduced the reliance of speech processing systems on annotated corpora, the success of SSL still hinges on the availability of a large-scale unannotated corpus, which is still often impractical for many low-resource languages or under privacy concerns. Some existing work seeks to alleviate the problem by data augmentation, but most works are confined to introducing perturbations to real speech and do not introduce new variations in speech prosody, speakers, and speech content, which are important for SSL. 
Motivated by the recent finding that diffusion models have superior capabilities for modeling data distributions, we propose DiffS4L, a pretraining scheme that augments the limited unannotated data with synthetic data containing different levels of variation, generated by a diffusion model trained on the limited unannotated data. Finally, an SSL model is pre-trained on the real and the synthetic speech. Our experiments show that DiffS4L can significantly improve the performance of SSL models, such as reducing the WER of the HuBERT pretrained model by 6.26 percentage points in the English ASR task. Notably, we find that the synthetic speech with all levels of variation, i.e., new prosody, new speakers, and even new content (despite the new content being mostly babble), accounts for significant performance improvement. The code is available at github.com/Hertin/DiffS4L.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Heting Gao;Kaizhi Qian;Junrui Ni;Chuang Gan;Mark A. Hasegawa-Johnson;Shiyu Chang;Yang Zhang", "authorids": "~Heting_Gao1;~Kaizhi_Qian1;~Junrui_Ni1;~Chuang_Gan1;~Mark_A._Hasegawa-Johnson1;~Shiyu_Chang2;~Yang_Zhang3", "gender": "M;;M;M;M;Unspecified;M", "homepage": ";;;http://people.csail.mit.edu/ganchuang/;http://speechtechnology.web.illinois.edu;http://people.csail.mit.edu/chang87/;", "dblp": ";212/6254;249/5376;139/6993;70/3186;28/9988;06/6785-1", "google_scholar": "_mQ9NAgAAAAJ;;;PTeSCbIAAAAJ;18O0OAwAAAAJ;r21asW4AAAAJ;_-5PSgQAAAAJ", "orcid": "0000-0002-2857-3842;;0009-0004-1666-3842;;0000-0002-5631-2893;;", "linkedin": ";;junrui-ni-931051156/;;mark-hasegawa-johnson-21a86825/;;", "or_profile": "~Heting_Gao1;~Kaizhi_Qian1;~Junrui_Ni1;~Chuang_Gan1;~Mark_A._Hasegawa-Johnson1;~Shiyu_Chang2;~Yang_Zhang3", "aff": "University of Illinois, Urbana Champaign;International Business Machines;University of Illinois, Urbana Champaign;University of Massachusetts at Amherst;University of Illinois, Urbana Champaign;University of California, Santa Barbara;International Business Machines", "aff_domain": "uiuc.edu;ibm.com;illinous.edu;umass.edu;illinois.edu;ucsb.edu;ibm.com", "position": "PhD student;Researcher;PhD student;Assistant Professor;Full Professor;Assistant Professor;Research Staff Employee", "bibtex": "@inproceedings{\ngao2024speech,\ntitle={Speech Self-Supervised Learning Using Diffusion Model Synthetic Data},\nauthor={Heting Gao and Kaizhi Qian and Junrui Ni and Chuang Gan and Mark A. 
Hasegawa-Johnson and Shiyu Chang and Yang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ecnpYYHjt9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1688638, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10239750836211314748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uiuc.edu;ibm.com;illinous.edu;umass.edu;illinois.edu;ucsb.edu;ibm.com", "author_num": 7, "aff_unique_index": "0;1;0;2;0;3;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;International Business Machines Corporation;University of Massachusetts Amherst;University of California, Santa Barbara", "aff_unique_dep": ";;;", "aff_unique_url": "https://illinois.edu;https://www.ibm.com;https://www.umass.edu;https://www.ucsb.edu", "aff_unique_abbr": "UIUC;IBM;UMass Amherst;UCSB", "aff_campus_unique_index": "0;0;2;0;3", "aff_campus_unique": "Urbana-Champaign;;Amherst;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deletion-Anticipative Data Selection with a Limited Budget", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33486", "id": "ecvuJWE1YY", "proceeding": "https://proceedings.mlr.press/v235/sim24a.html", "pdf": "https://openreview.net/pdf?id=ecvuJWE1YY", "openreview": "https://openreview.net/forum?id=ecvuJWE1YY", "author_site": "Rachael Hwee Ling Sim, Jue Fan, Xiao Tian, Patrick Jaillet, Bryan Kian Hsiang Low", "tldr": "", "abstract": "Learners with a limited budget can use supervised data subset selection and active learning techniques to select a smaller training set and reduce the cost of acquiring data and training _machine learning_ (ML) models. However, the resulting high model performance, measured by a data utility function, may not be preserved when some data owners, enabled by the GDPR's right to erasure, request their data to be deleted from the ML model. This raises an important question for learners who are temporarily unable or unwilling to acquire data again: _During the initial data acquisition of a training set of size $k$, can we proactively maximize the data utility after future unknown deletions?_ We propose that the learner anticipates/estimates the probability that (i) each data owner in the feasible set will independently delete its data or (ii) a number of deletions occur out of $k$, and justify our proposal with concrete real-world use cases. Then, instead of directly maximizing the data utility function, the learner can maximize the expected or risk-averse post-deletion utility based on the anticipated probabilities. 
We further propose how to construct these _deletion-anticipative data selection_ ($\\texttt{DADS}$) maximization objectives so that they preserve monotone submodularity and the near-optimality of greedy solutions, and how to optimize these objectives; we then empirically evaluate $\\texttt{DADS}$' performance on real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rachael Hwee Ling Sim;Jue Fan;Xiao Tian;Patrick Jaillet;Bryan Kian Hsiang Low", "authorids": "~Rachael_Hwee_Ling_Sim1;~Jue_Fan1;~Xiao_Tian1;~Patrick_Jaillet1;~Bryan_Kian_Hsiang_Low1", "gender": ";F;M;M;M", "homepage": ";;https://snoidetx.github.io/Snoidepaedia/;http://web.mit.edu/jaillet/www/;http://www.comp.nus.edu.sg/~lowkh", "dblp": ";;;https://dblp.uni-trier.de/pers/hd/j/Jaillet:Patrick;97/4877", "google_scholar": ";x6ZbPaQAAAAJ;vOiEt8oAAAAJ;ND0FM6EAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";;0000-0003-3346-0313;0000-0002-8585-6566;", "linkedin": ";jue-fan-a66a541b6/;snoidetx/;patrick-jaillet-1260445/;", "or_profile": "~Rachael_Hwee_Ling_Sim1;~Jue_Fan1;~Xiao_Tian1;~Patrick_Jaillet1;~Bryan_Kian_Hsiang_Low1", "aff": ";National University of Singapore;National University of Singapore;Massachusetts Institute of Technology;National University of Singapore", "aff_domain": ";nus.edu;u.nus.edu;mit.edu;nus.edu.sg", "position": ";Undergrad student;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nsim2024deletionanticipative,\ntitle={Deletion-Anticipative Data Selection with a Limited Budget},\nauthor={Rachael Hwee Ling Sim and Jue Fan and Xiao Tian and Patrick Jaillet and Bryan Kian Hsiang Low},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ecvuJWE1YY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2849249, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dyAG4TywVKYJ:scholar.google.com/&scioq=Deletion-Anticipative+Data+Selection+with+a+Limited+Budget&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": ";nus.edu;u.nus.edu;mit.edu;nus.edu.sg", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "National University of Singapore;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://web.mit.edu", "aff_unique_abbr": "NUS;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Singapore;United States" }, { "title": "One Prompt is not Enough: Automated Construction of a Mixture-of-Expert Prompts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33485", "id": "edHLN40DWu", "proceeding": "https://proceedings.mlr.press/v235/wang24b.html", "pdf": "https://openreview.net/pdf?id=edHLN40DWu", "openreview": "https://openreview.net/forum?id=edHLN40DWu", "author_site": "Ruochen Wang, Sohyun An, Minhao Cheng, Tianyi Zhou, Sung Ju Hwang, Cho-Jui Hsieh", "tldr": "", "abstract": "Large Language Models (LLMs) exhibit strong generalization capabilities to novel tasks when prompted with language instructions and in-context demos. Since this ability sensitively depends on the quality of prompts, various methods have been explored to automate the instruction design. While these methods demonstrated promising results, they also restricted the searched prompt to one instruction. 
Such simplification significantly limits their capacity, as a single demo-free instruction might not be able to cover the entire complex problem space of the targeted task. To alleviate this issue, we adopt the Mixture-of-Expert paradigm and divide the problem space into a set of sub-regions; Each sub-region is governed by a specialized expert, equipped with both an instruction and a set of demos. A two-phase process is developed to construct the specialized expert for each region: (1) demo assignment: Inspired by the theoretical connection between in-context learning and kernel regression, we group demos into experts based on their semantic similarity; (2) instruction assignment: A region-based joint search of an instruction per expert complements the demos assigned to it, yielding a synergistic effect. The resulting method, codenamed Mixture-of-Prompts (MoP), achieves an average win rate of 81% against prior arts across several major benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruochen Wang;Sohyun An;Minhao Cheng;Tianyi Zhou;Sung Ju Hwang;Cho-Jui Hsieh", "authorids": "~Ruochen_Wang2;~Sohyun_An1;~Minhao_Cheng1;~Tianyi_Zhou1;~Sung_Ju_Hwang1;~Cho-Jui_Hsieh1", "gender": "M;F;M;M;;M", "homepage": "https://ruocwang.github.io/;https://cownowan.github.io/;https://cmhcbb.github.io/;https://tianyizhou.github.io/;;http://web.cs.ucla.edu/~chohsieh/index.html", "dblp": "33/120;348/6996;174/1717;88/8205-1;;14/2770", "google_scholar": "8fXrlRAAAAAJ;tW1jSXMAAAAJ;_LkC1yoAAAAJ;OKvgizMAAAAJ;;Wy89g4IAAAAJ", "orcid": ";;0000-0003-3965-4215;0000-0001-5348-0632;;", "linkedin": "ruochen-wang-1699b1113/;sohyunan0423;;tianyizhou;;", "or_profile": "~Ruochen_Wang2;~Sohyun_An1;~Minhao_Cheng1;~Tianyi_Zhou1;~Sung_Ju_Hwang1;~Cho-Jui_Hsieh1", "aff": "University of California, Los Angeles;Korea Advanced Institute of Science & Technology;Pennsylvania State University;University of Maryland, College Park;;University of California, Los Angeles", "aff_domain": "ucla.edu;kaist.ac.kr;psu.edu;umd.edu;;ucla.edu", "position": "PhD student;MS student;Assistant Professor;Assistant Professor;;Associate Professor", "bibtex": "@inproceedings{\nwang2024one,\ntitle={One Prompt is not Enough: Automated Construction of a Mixture-of-Expert Prompts},\nauthor={Ruochen Wang and Sohyun An and Minhao Cheng and Tianyi Zhou and Sung Ju Hwang and Cho-Jui Hsieh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=edHLN40DWu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4365714, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13974604052529899923&as_sdt=1005&sciodt=0,4&hl=en", "gs_version_total": 8, "email": "ucla.edu;kaist.ac.kr;psu.edu;umd.edu;;ucla.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "University of California, Los Angeles;Korea Advanced Institute of Science and Technology;Pennsylvania State University;University of Maryland", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ucla.edu;https://www.kaist.ac.kr;https://www.psu.edu;https://www/umd.edu", "aff_unique_abbr": "UCLA;KAIST;PSU;UMD", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Los Angeles;;College Park", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;South Korea" }, { "title": "Interaction-based Retrieval-augmented Diffusion Models for Protein-specific 3D Molecule 
Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33484", "id": "eejhD9FCP3", "proceeding": "https://proceedings.mlr.press/v235/huang24ab.html", "pdf": "https://openreview.net/pdf?id=eejhD9FCP3", "openreview": "https://openreview.net/forum?id=eejhD9FCP3", "author_site": "Zhilin Huang, Ling Yang, Xiangxin Zhou, Chujun Qin, Yijie Yu, Xiawu Zheng, Zikun Zhou, Wentao Zhang, Yu Wang, Wenming Yang", "tldr": "", "abstract": "Generating ligand molecules that bind to specific protein targets via generative models holds substantial promise for advancing structure-based drug design. Existing methods generate molecules from scratch without reference or template ligands, which poses challenges in model optimization and may yield suboptimal outcomes. To address this problem, we propose an innovative interaction-based retrieval-augmented diffusion model named IRDiff to facilitate target-aware molecule generation. IRDiff leverages a curated set of ligand references, i.e., those with desired properties such as high binding affinity, to steer the diffusion model towards synthesizing ligands that satisfy design criteria. Specifically, we utilize a protein-molecule interaction network (PMINet), which is pretrained with binding affinity signals to: (i) retrieve target-aware ligand molecules with high binding affinity to serve as references, and (ii) incorporate essential protein-ligand binding structures for steering molecular diffusion generation with two effective augmentation mechanisms, i.e., retrieval augmentation and self augmentation. Empirical studies on CrossDocked2020 dataset show IRDiff can generate molecules with more realistic 3D structures and achieve state-of-the-art binding affinities towards the protein targets, while maintaining proper molecular properties. 
The codes and models are available at https://github.com/YangLing0818/IRDiff", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhilin Huang;Ling Yang;Xiangxin Zhou;Chujun Qin;Yijie Yu;Xiawu Zheng;Zikun Zhou;Wentao Zhang;Yu Wang;Wenming Yang", "authorids": "~Zhilin_Huang1;~Ling_Yang1;~Xiangxin_Zhou1;~Chujun_Qin1;~Yijie_Yu1;~Xiawu_Zheng1;~Zikun_Zhou2;~Wentao_Zhang1;~Yu_Wang60;~Wenming_Yang1", "gender": "M;M;Not Specified;F;F;M;M;M;M;M", "homepage": "https://zerinhwang03.github.io/;https://yangling0818.github.io/;;https://dblp.org/pid/283/7033.html;;https://sites.google.com/view/zhengxiawu/%E9%A6%96%E9%A1%B5;https://sites.google.com/view/zikunzhou-homepage;;https://www.sigs.tsinghua.edu.cn/ywm_en/main.htm;https://zwt233.github.io/", "dblp": "266/8046.html;01/24-6.html;247/9275;283/7033;138/0399.html;222/7865;271/8084;02/5889-8;75/2339.html;41/3249-1.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=sIKujqAAAAAJ;eQgIWcQAAAAJ;;;jBgXocYAAAAJ;4A8SXMEAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=zh-CN;JE4VON0AAAAJ", "orcid": "0000-0003-3417-743X;0000-0003-1905-8053;;0009-0004-8223-873X;;0000-0002-6855-5403;0000-0002-2687-7762;0000-0003-4976-9366;0000-0002-2506-1286;0000-0002-7532-5550", "linkedin": ";;;;;;;;;", "or_profile": "~Zhilin_Huang1;~Ling_Yang1;~Xiangxin_Zhou1;~Chujun_Qin1;~Yijie_Yu1;~Xiawu_Zheng1;~Zikun_Zhou2;~Yu_Wang60;~Wenming_Yang1;~Zhang_wen_tao1", "aff": "Tsinghua University;Peking University;Institute of Automation, Chinese Academy of Sciences;China Southern Power Grid ;Tsinghua University;PengCheng Lab;Peng Cheng Laboratory;Peng Cheng Laboratory;Tsinghua University,;Peking University", "aff_domain": "mails.tsinghua.edu.cn;pku.edu.cn;ia.ac.cn;csg.cn;mail.tsinghua.edu.cn;pcl.ac.cn;pcl.ac.cn;pcl.ac.cn;tsinghua.edu.cn;pku.edu.cn", "position": "PhD student;PhD student;PhD student;Researcher;PhD student;Postdoc;Researcher;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024interactionbased,\ntitle={Interaction-based Retrieval-augmented Diffusion Models for Protein-specific 3D Molecule Generation},\nauthor={Zhilin Huang and Ling Yang and Xiangxin Zhou and Chujun Qin and Yijie Yu and Xiawu Zheng and Zikun Zhou and Wentao Zhang and Yu Wang and Wenming Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eejhD9FCP3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3968661, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6558146910393274830&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "mails.tsinghua.edu.cn;pku.edu.cn;ia.ac.cn;csg.cn;mail.tsinghua.edu.cn;pcl.ac.cn;pcl.ac.cn;pcl.ac.cn;tsinghua.edu.cn;pku.edu.cn", "author_num": 10, "aff_unique_index": "0;1;2;3;0;4;5;5;0;1", "aff_unique_norm": "Tsinghua University;Peking University;Chinese Academy of Sciences;China Southern Power Grid;Pengcheng Lab;Pengcheng Laboratory", "aff_unique_dep": ";;Institute of Automation;;;Peng Cheng Laboratory", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.pku.edu.cn;http://www.ia.cas.cn;http://www.csg.cn;;http://www.pcl.ac.cn", "aff_unique_abbr": "THU;Peking U;CAS;CSG;;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { 
"title": "Conformal Predictions under Markovian Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33483", "id": "efzkSbpyRw", "proceeding": "https://proceedings.mlr.press/v235/zheng24j.html", "pdf": "https://openreview.net/pdf?id=efzkSbpyRw", "openreview": "https://openreview.net/forum?id=efzkSbpyRw", "author_site": "Fr\u00e9d\u00e9ric Zheng, Alexandre Proutiere", "tldr": "", "abstract": "We study the split Conformal Prediction method when applied to Markovian data. We quantify the gap in terms of coverage induced by the correlations in the data (compared to exchangeable data). This gap strongly depends on the mixing properties of the underlying Markov chain, and we prove that it typically scales as $\\sqrt{t_\\mathrm{mix}\\ln(n)/n}$ (where $t_\\mathrm{mix}$ is the mixing time of the chain). We also derive upper bounds on the impact of the correlations on the size of the prediction set. Finally we present $K$-split CP, a method that consists in thinning the calibration dataset and that adapts to the mixing properties of the chain. Its coverage gap is reduced to $t_\\mathrm{mix}/(n\\ln(n))$ without really affecting the size of the prediction set. We finally test our algorithms on synthetic and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fr\u00e9d\u00e9ric Zheng;Alexandre Proutiere", "authorids": "~Fr\u00e9d\u00e9ric_Zheng1;~Alexandre_Proutiere1", "gender": ";M", "homepage": ";https://people.kth.se/~alepro/", "dblp": ";p/AlexandreProutiere", "google_scholar": ";g5sya5cAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Fr\u00e9d\u00e9ric_Zheng1;~Alexandre_Proutiere1", "aff": ";KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": ";kth.se", "position": ";Full Professor", "bibtex": "@inproceedings{\nzheng2024conformal,\ntitle={Conformal Predictions under Markovian Data},\nauthor={Fr{\\'e}d{\\'e}ric Zheng and Alexandre Proutiere},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=efzkSbpyRw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 878944, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:I7dA7pFINagJ:scholar.google.com/&scioq=Conformal+Predictions+under+Markovian+Data&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": ";kth.se", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "title": "Contrastive Learning for Clinical Outcome Prediction with Partial Data Sources", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33482", "id": "elCOPIm4Xw", "proceeding": "https://proceedings.mlr.press/v235/xia24e.html", "pdf": "https://openreview.net/pdf?id=elCOPIm4Xw", "openreview": "https://openreview.net/forum?id=elCOPIm4Xw", "author_site": "Xia, Jonathan Wilson, Benjamin Goldstein, Ricardo Henao", "tldr": "", "abstract": "The use of machine learning models to predict clinical outcomes from (longitudinal) electronic health record (EHR) data is becoming increasingly popular due to advances in deep architectures, representation learning, and the growing availability of large EHR datasets. 
Existing models generally assume access to the same data sources during both training and inference stages. However, this assumption is often challenged by the fact that real-world clinical datasets originate from various data sources (with distinct sets of covariates), which, though available for training (in a research or retrospective setting), are more realistically only partially available (a subset of such sets) for inference when deployed. So motivated, we introduce Contrastive Learning for clinical Outcome Prediction with Partial data Sources (CLOPPS), which trains encoders to capture information across different data sources and then leverages them to build classifiers restricting access to a single data source. This approach can be used with existing cross-sectional or longitudinal outcome classification models. We present experiments on two real-world datasets demonstrating that CLOPPS consistently outperforms strong baselines in several practical scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meng Xia;Jonathan Wilson;Benjamin Goldstein;Ricardo Henao", "authorids": "~Meng_Xia2;~Jonathan_Wilson2;~Benjamin_Goldstein1;~Ricardo_Henao1", "gender": "F;M;M;M", "homepage": ";https://scholars.duke.edu/person/jonathan.a.wilson;;http://rhenaog.github.io", "dblp": ";;120/9848;27/3207", "google_scholar": ";;;p_mm4-YAAAAJ", "orcid": "0000-0001-5062-8689;0000-0001-9984-4825;;0000-0003-4980-845X", "linkedin": ";;;", "or_profile": "~Meng_Xia2;~Jonathan_Wilson2;~Benjamin_Goldstein1;~Ricardo_Henao1", "aff": "Duke University;Duke University;;King Abdullah University of Science and Technology", "aff_domain": "duke.edu;dukehealth.org;;kaust.edu.sa", "position": "PhD student;Researcher;;Associate Professor", "bibtex": "@inproceedings{\nxia2024contrastive,\ntitle={Contrastive Learning for Clinical Outcome Prediction with Partial Data Sources},\nauthor={Meng Xia and Jonathan Wilson and Benjamin Goldstein and Ricardo Henao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=elCOPIm4Xw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7626111, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1296074708870062756&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "duke.edu;dukehealth.org;;kaust.edu.sa", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Duke University;King Abdullah University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.kaust.edu.sa", "aff_unique_abbr": "Duke;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Saudi Arabia" }, { "title": "NDOT: Neuronal Dynamics-based Online Training for Spiking Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33481", "id": "elF0QoBSFV", "proceeding": "https://proceedings.mlr.press/v235/jiang24a.html", "pdf": "https://openreview.net/pdf?id=elF0QoBSFV", "openreview": "https://openreview.net/forum?id=elF0QoBSFV", "author_site": "Haiyan Jiang, Giulia De Masi, Huan Xiong, Bin Gu", "tldr": "", "abstract": "Spiking Neural Networks (SNNs) are attracting great attention for their energy-efficient and fast-inference properties in neuromorphic computing.
However, the efficient training of deep SNNs poses challenges in gradient calculation due to the non-differentiability of their binary spike-generating activation functions. The widely used surrogate gradient (SG) method, combined with the back-propagation through time (BPTT), has shown considerable effectiveness. Yet, BPTT's process of unfolding and back-propagating along the computation graph requires storing intermediate information at all time-steps, resulting in huge memory consumption and failing to meet online requirements. In this work, we propose Neuronal Dynamics-based Online Training (NDOT) for SNNs, which uses the neuronal dynamics-based temporal dependency/sensitivity in gradient computation. NDOT enables forward-in-time learning by decomposing the full gradient into temporal and spatial gradients. To illustrate the intuition behind NDOT, we employ the Follow-the-Regularized-Leader (FTRL) algorithm. FTRL explicitly utilizes historical information and addresses limitations in instantaneous loss. Our proposed NDOT method accurately captures temporal dependencies through neuronal dynamics, functioning similarly to FTRL's explicit utilization of historical information. Experiments on CIFAR-10, CIFAR-100, and CIFAR10-DVS demonstrate the superior performance of our NDOT method on large-scale static and neuromorphic datasets within a small number of time steps. The code is available at https://github.com/HaiyanJiang/SNN-NDOT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haiyan Jiang;Giulia De Masi;Huan Xiong;Bin Gu", "authorids": "~Haiyan_Jiang1;~Giulia_De_Masi1;~Huan_Xiong1;~Bin_Gu1", "gender": "F;;M;M", "homepage": ";;https://scholar.google.com/citations?user=l4hm14MAAAAJ&hl=en;https://mbzuai.ac.ae/study/faculty/bin-gu/", "dblp": ";147/8719;;29/1758-1", "google_scholar": "vpHnhJsAAAAJ;G1K5hX0AAAAJ;l4hm14MAAAAJ;Vo8OgCgAAAAJ", "orcid": "0000-0002-4099-480X;0000-0003-3284-880X;;0000-0001-6049-1815", "linkedin": ";;;", "or_profile": "~Haiyan_Jiang1;~Giulia_De_Masi1;~Huan_Xiong1;~Bin_Gu1", "aff": "Mohamed bin Zayed University of Artificial Intelligence;Technology Innovation Institute;Harbin Institute of Technology;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "mbzuai.ac.ae;tii.ae;hit.edu.cn;mbzuai.ac.ae", "position": "Researcher;Principal Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\njiang2024ndot,\ntitle={{NDOT}: Neuronal Dynamics-based Online Training for Spiking Neural Networks},\nauthor={Haiyan Jiang and Giulia De Masi and Huan Xiong and Bin Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=elF0QoBSFV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 535109, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1438415420753793375&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "mbzuai.ac.ae;tii.ae;hit.edu.cn;mbzuai.ac.ae", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Technology Innovation Institute;Harbin Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://mbzuai.ac.ae;;http://www.hit.edu.cn/", "aff_unique_abbr": "MBZUAI;;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;2;0", "aff_country_unique": "United Arab Emirates;;China" }, { "title": 
"AttnLRP: Attention-Aware Layer-Wise Relevance Propagation for Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33480", "id": "emtXYlBrNF", "proceeding": "https://proceedings.mlr.press/v235/achtibat24a.html", "pdf": "https://openreview.net/pdf?id=emtXYlBrNF", "openreview": "https://openreview.net/forum?id=emtXYlBrNF", "author_site": "Reduan Achtibat, Sayed Mohammad Vakilzadeh Hatefi, Maximilian Dreyer, Aakriti Jain, Thomas Wiegand, Sebastian Lapuschkin, Wojciech Samek", "tldr": "", "abstract": "Large Language Models are prone to biased predictions and hallucinations, underlining the paramount importance of understanding their model-internal reasoning process. However, achieving faithful attributions for the entirety of a black-box transformer model and maintaining computational efficiency is an unsolved challenge. By extending the Layer-wise Relevance Propagation attribution method to handle attention layers, we address these challenges effectively. While partial solutions exist, our method is the first to faithfully and holistically attribute not only input but also latent representations of transformer models with the computational efficiency similar to a single backward pass. Through extensive evaluations against existing methods on LLaMa 2, Mixtral 8x7b, Flan-T5 and vision transformer architectures, we demonstrate that our proposed approach surpasses alternative methods in terms of faithfulness and enables the understanding of latent representations, opening up the door for concept-based explanations. We provide an LRP library at https://github.com/rachtibat/LRP-eXplains-Transformers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Reduan Achtibat;Sayed Mohammad Vakilzadeh Hatefi;Maximilian Dreyer;Aakriti Jain;Thomas Wiegand;Sebastian Lapuschkin;Wojciech Samek", "authorids": "~Reduan_Achtibat1;~Sayed_Mohammad_Vakilzadeh_Hatefi1;~Maximilian_Dreyer1;~Aakriti_Jain1;~Thomas_Wiegand1;~Sebastian_Lapuschkin1;~Wojciech_Samek1", "gender": "M;M;M;Not Specified;M;M;M", "homepage": ";;;https://linkedin.com/in/aakriti-j;https://iphome.hhi.de/wiegand/;http://iphome.hhi.de/lapuschkin/;http://iphome.hhi.de/samek/", "dblp": "322/1194;368/3080;301/9327;;w/ThomasWiegand.html;184/7883;79/9736", "google_scholar": "gStbosAAAAAJ;https://scholar.google.de/citations?user=FqxGe-MAAAAJ;https://scholar.google.de/citations?user=6Uz8sBQAAAAJ;;https://scholar.google.de/citations?user=VnF9QNgAAAAJ;https://scholar.google.de/citations?user=wpLQuroAAAAJ;7aQwO08AAAAJ", "orcid": ";0009-0006-8485-5808;;;0000-0002-1121-2581;0000-0002-0762-7258;", "linkedin": ";erfanhatefi?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=android_app;;;https://de.linkedin.com/in/thomas-wiegand-12379422;sebastian-lapuschkin/;", "or_profile": "~Reduan_Achtibat1;~Sayed_Mohammad_Vakilzadeh_Hatefi1;~Maximilian_Dreyer1;~Aakriti_Jain1;~Thomas_Wiegand1;~Sebastian_Lapuschkin1;~Wojciech_Samek1", "aff": "Fraunhofer Heinrich Hertz Institut;Fraunhofer HHI, Fraunhofer IAIS;Fraunhofer HHI;Fraunhofer HHI;Technische Universit\u00e4t Berlin;Fraunhofer HHI;Fraunhofer HHI", "aff_domain": "hhi.fraunhofer.de;hhi.fraunhofer.de;hhi.fraunhofer.de;hhi.fraunhofer.de;tu-berlin.de;hhi.fraunhofer.de;hhi.fraunhofer.de", "position": "PhD student;Researcher;PhD student;Researcher;Full Professor;Head of Explainable Artificial Intelligence;Assistant Professor", "bibtex": "@inproceedings{\nachtibat2024attnlrp,\ntitle={Attn{LRP}: Attention-Aware Layer-Wise Relevance Propagation 
for Transformers},\nauthor={Reduan Achtibat and Sayed Mohammad Vakilzadeh Hatefi and Maximilian Dreyer and Aakriti Jain and Thomas Wiegand and Sebastian Lapuschkin and Wojciech Samek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=emtXYlBrNF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6307164, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8894992340300760609&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "hhi.fraunhofer.de;hhi.fraunhofer.de;hhi.fraunhofer.de;hhi.fraunhofer.de;tu-berlin.de;hhi.fraunhofer.de;hhi.fraunhofer.de", "author_num": 7, "aff_unique_index": "0;1;0;0;2;0;0", "aff_unique_norm": "Fraunhofer Heinrich Hertz Institute;Fraunhofer HHI;Technische Universit\u00e4t Berlin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hhi.fraunhofer.de/;https://www.fraunhofer.de/en/institutes/hhi.html;https://www.tu-berlin.de", "aff_unique_abbr": "HHI;HHI;TU Berlin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Agnostic Learning of Mixed Linear Regressions with EM and AM Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33479", "id": "eo88noTbb5", "proceeding": "https://proceedings.mlr.press/v235/ghosh24b.html", "pdf": "https://openreview.net/pdf?id=eo88noTbb5", "openreview": "https://openreview.net/forum?id=eo88noTbb5", "author_site": "Avishek Ghosh, Arya Mazumdar", "tldr": "", "abstract": "Mixed linear regression is a well-studied problem in parametric statistics and machine learning. Given a set of samples, tuples of covariates and labels, the task of mixed linear regression is to find a small list of linear relationships that best fit the samples. Usually it is assumed that the label is generated stochastically by randomly selecting one of two or more linear functions, applying this chosen function to the covariates, and potentially introducing noise to the result. In that situation, the objective is to estimate the ground-truth linear functions up to some parameter error. The popular expectation maximization (EM) and alternating minimization (AM) algorithms have been previously analyzed for this. In this paper, we consider the more general problem of agnostic learning of mixed linear regression from samples, without such generative models. In particular, we show that the AM and EM algorithms, under standard conditions of separability and good initialization, lead to agnostic learning in mixed linear regression by converging to the population loss minimizers, for suitably defined loss functions. 
In some sense, this shows the strength of the AM and EM algorithms, which converge to ``optimal solutions'' even in the absence of realizable generative models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Avishek Ghosh;Arya Mazumdar", "authorids": "~Avishek_Ghosh2;~Arya_Mazumdar1", "gender": "M;M", "homepage": "https://sites.google.com/view/avishekghosh;http://www.cs.umass.edu/~arya", "dblp": "98/275;77/6050", "google_scholar": "8y0Dg5cAAAAJ;https://scholar.google.com.tw/citations?user=9tjQU1EAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Avishek_Ghosh2;~Arya_Mazumdar1", "aff": "Indian Institute of Technology, Bombay;University of California, San Diego", "aff_domain": "iitb.ac.in;ucsd.edu", "position": "Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nghosh2024agnostic,\ntitle={Agnostic Learning of Mixed Linear Regressions with {EM} and {AM} Algorithms},\nauthor={Avishek Ghosh and Arya Mazumdar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eo88noTbb5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 468975, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tlq0985tA60J:scholar.google.com/&scioq=Agnostic+Learning+of+Mixed+Linear+Regressions+with+EM+and+AM+Algorithms&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "iitb.ac.in;ucsd.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Indian Institute of Technology Bombay;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitb.ac.in;https://www.ucsd.edu", "aff_unique_abbr": "IIT Bombay;UCSD", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Bombay;San Diego", "aff_country_unique_index": "0;1", "aff_country_unique": "India;United States" }, { "title": "Asymptotically Optimal and Computationally Efficient Average Treatment Effect Estimation in A/B testing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33478", "id": "eqIGoEoI10", "proceeding": "https://proceedings.mlr.press/v235/deep24a.html", "pdf": "https://openreview.net/pdf?id=eqIGoEoI10", "openreview": "https://openreview.net/forum?id=eqIGoEoI10", "author_site": "VIKAS DEEP, Achal Bassamboo, Sandeep Juneja", "tldr": "", "abstract": "Motivated by practical applications in clinical trials and online platforms, we study A/B testing with the aim of estimating a confidence interval (CI) for the average treatment effect (ATE) using the minimum expected sample size. This CI should have a width at most $\\epsilon$ while ensuring that the probability of the CI not containing the true ATE is at most $\\delta$. To answer this, we first establish a lower bound on the expected sample size needed for any adaptive policy which constructs a CI of ATE with desired properties. Specifically, we prove that the lower bound is based on the solution to a max-min non-convex optimization problem for small $\\delta$. Tailoring the ``plug-in'' approach for the ATE problem, we construct an adaptive policy that is asymptotically optimal, i.e., matches the lower bound on the expected sample size for small $\\delta$. Interestingly, we find that, for small $\\epsilon$ and $\\delta$, the asymptotically optimal fraction of treatment assignment for A and B is proportional to the standard deviation of the outcome distributions of treatments A and B, respectively. 
However, as the proposed approach can be computationally intensive, we propose an alternative adaptive policy. This new policy, informed by insights from our lower bound analysis, is computationally efficient while remaining asymptotically optimal for small values of $\\epsilon$ and $\\delta$. Numerical comparisons demonstrate that both policies perform similarly across practical values of $\\epsilon$ and $\\delta$, offering efficient solutions for A/B testing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "VIKAS DEEP;Achal Bassamboo;Sandeep Kumar Juneja", "authorids": "~VIKAS_DEEP1;a-bassamboo@kellogg.northwestern.edu;~Sandeep_Kumar_Juneja1", "gender": "M;;M", "homepage": "https://kelloggphds.northwestern.edu/vikas-deep/;;http://www.tcs.tifr.res.in/~sandeepj/", "dblp": ";;98/5399.html", "google_scholar": "Ke1uGUsAAAAJ;;https://scholar.google.co.in/citations?user=Tfgv6VgAAAAJ", "orcid": ";;", "linkedin": ";;sandeep-juneja-937b44b/?originalSubdomain=in", "or_profile": "~VIKAS_DEEP1;a-bassamboo@kellogg.northwestern.edu;~Sandeep_Kumar_Juneja1", "aff": "Northwestern University;;Tata Institute of Fundamental Research", "aff_domain": "northwestern.edu;;tifr.res.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\ndeep2024asymptotically,\ntitle={Asymptotically Optimal and Computationally Efficient Average Treatment Effect Estimation in A/B testing},\nauthor={VIKAS DEEP and Achal Bassamboo and Sandeep Kumar Juneja},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eqIGoEoI10}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1391761, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9IpLC01Y7qgJ:scholar.google.com/&scioq=Asymptotically+Optimal+and+Computationally+Efficient+Average+Treatment+Effect+Estimation+in+A/B+testing&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "northwestern.edu;;tifr.res.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Northwestern University;Tata Institute of Fundamental Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.northwestern.edu;https://www.tifr.res.in", "aff_unique_abbr": "NU;TIFR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;India" }, { "title": "Image Fusion via Vision-Language Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33477", "id": "eqY64Z1rsT", "proceeding": "https://proceedings.mlr.press/v235/zhao24d.html", "pdf": "https://openreview.net/pdf?id=eqY64Z1rsT", "openreview": "https://openreview.net/forum?id=eqY64Z1rsT", "author_site": "Zixiang Zhao, Lilun Deng, Haowen Bai, Yukun Cui, Zhipeng Zhang, Yulun Zhang, Haotong Qin, Dongdong Chen, Jiangshe Zhang, Peng Wang, Luc Van Gool", "tldr": "", "abstract": "Image fusion integrates essential information from multiple images into a single composite, enhancing structures and textures, and refining imperfections. Existing methods predominantly focus on pixel-level and semantic visual features for recognition, but often overlook the deeper text-level semantic information beyond vision. Therefore, we introduce a novel fusion paradigm named image Fusion via vIsion-Language Model (FILM), which, for the first time, utilizes explicit textual information from source images to guide the fusion process.
Specifically, FILM generates semantic prompts from images and inputs them into ChatGPT for comprehensive textual descriptions. These descriptions are fused within the textual domain and guide the visual information fusion, enhancing feature extraction and contextual understanding, directed by textual semantic information via cross-attention. FILM has shown promising results in four image fusion tasks: infrared-visible, medical, multi-exposure, and multi-focus image fusion. We also propose a vision-language dataset containing ChatGPT-generated paragraph descriptions for the eight image fusion datasets across four fusion tasks, facilitating future research in vision-language model-based image fusion. Code and dataset are available at https://github.com/Zhaozixiang1228/IF-FILM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixiang Zhao;Lilun Deng;Haowen Bai;Yukun Cui;Zhipeng Zhang;Yulun Zhang;Haotong Qin;Dongdong Chen;Jiangshe Zhang;PENG WANG;Luc Van Gool", "authorids": "~Zixiang_Zhao1;~Lilun_Deng1;~Haowen_Bai1;~Yukun_Cui2;~Zhipeng_Zhang5;~Yulun_Zhang1;~Haotong_Qin1;~Dongdong_Chen4;~Jiangshe_Zhang1;~PENG_WANG15;~Luc_Van_Gool1", "gender": "M;M;M;M;;M;M;;M;M;", "homepage": "https://zhaozixiang1228.github.io/;https://github.com/Icondll;https://github.com/HaowenBai;https://github.com/Cuiyukun84;;http://yulunzhang.com/;https://htqin.github.io/;;https://gr.xjtu.edu.cn/en/web/jszhang;https://wangpengnorman.github.io/;", "dblp": "65/5420;354/8355;334/4082;219/1701;;166/2763-1.html;262/3626.html;;74/982-1;95/4442-15.html;61/5017", "google_scholar": "tUv_X8cAAAAJ;;;;;ORmLjWoAAAAJ;mK6n-KgAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?user=aPLp7pAAAAAJ;https://scholar.google.be/citations?user=TwMib_QAAAAJ", "orcid": ";;;;;0000-0002-2288-5079;;;;0000-0001-7689-3405;", "linkedin": ";;;;;yulun-zhang-1116b5b9/;;;;;", "or_profile": "~Zixiang_Zhao1;~Lilun_Deng1;~Haowen_Bai1;~Yukun_Cui2;~Zhipeng_Zhang5;~Yulun_Zhang1;~Haotong_Qin1;~Dongdong_Chen4;~Jiangshe_Zhang1;~PENG_WANG15;~Luc_Van_Gool1", "aff": "ETHZ - ETH Zurich;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;;Swiss Federal Institute of Technology;ETHZ - ETH Zurich;;Xi'an Jiaotong University;Northwestern Polytechnical University;KU Leuven", "aff_domain": "ethz.ch;xjtlu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;;ethz.ch;ethz.ch;;xjtu.edu.cn;nwpu.edu.cn;kuleuven.be", "position": "Postdoc;PhD student;PhD student;PhD student;;Postdoc;Postdoc;;Full Professor;Full Professor;Emeritus", "bibtex": "@inproceedings{\nzhao2024image,\ntitle={Image Fusion via Vision-Language Model},\nauthor={Zixiang Zhao and Lilun Deng and Haowen Bai and Yukun Cui and Zhipeng Zhang and Yulun Zhang and Haotong Qin and Dongdong Chen and Jiangshe Zhang and PENG WANG and Luc Van Gool},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eqY64Z1rsT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10022420, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13257570110177102766&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ethz.ch;xjtlu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;;ethz.ch;ethz.ch;;xjtu.edu.cn;nwpu.edu.cn;kuleuven.be", "author_num": 11, "aff_unique_index": "0;1;1;1;2;0;1;3;4", "aff_unique_norm": "ETH Zurich;Xi'an Jiao Tong University;Swiss Federal Institute of Technology;Northwestern 
Polytechnical University;Katholieke Universiteit Leuven", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ethz.ch;https://www.xjtu.edu.cn;https://www.ethz.ch;https://www.nwpu.edu.cn;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;XJTU;ETH Zurich;NWPU;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;0;1;1;2", "aff_country_unique": "Switzerland;China;Belgium" }, { "title": "Dense Reward for Free in Reinforcement Learning from Human Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33476", "id": "eyxVRMrZ4m", "proceeding": "https://proceedings.mlr.press/v235/chan24a.html", "pdf": "https://openreview.net/pdf?id=eyxVRMrZ4m", "openreview": "https://openreview.net/forum?id=eyxVRMrZ4m", "author_site": "Alexander Chan, Hao Sun, Samuel Holt, M van der Schaar", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) has been credited as the key advance that has allowed Large Language Models (LLMs) to effectively follow instructions and produce useful assistance. Classically, this involves generating completions from the LLM in response to a query before using a separate reward model to assign a score to the full completion. As an auto-regressive process, the LLM has to take many \u201cactions\u201d (selecting individual tokens) and only receives a single, sparse reward at the end of an episode, a setup that is known to be difficult to optimise in traditional reinforcement learning. In this work we leverage the fact that the reward model contains more information than just its scalar output; in particular, it calculates an attention map over tokens as part of the transformer architecture. We use these attention weights to redistribute the reward along the whole completion, effectively densifying the signal and highlighting the most important tokens, all without incurring extra computational cost or requiring any additional modelling. We demonstrate that, theoretically, this approach is equivalent to potential-based reward shaping, ensuring that the optimal policy remains unchanged. 
Empirically, we show that it stabilises training, accelerates the rate of learning, and, in practical cases, may lead to better local optima.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alex James Chan;Hao Sun;Samuel Holt;Mihaela van der Schaar", "authorids": "~Alex_James_Chan1;~Hao_Sun1;~Samuel_Holt1;~Mihaela_van_der_Schaar2", "gender": "M;M;;F", "homepage": "https://alexjchan.com;https://holarissun.github.io;https://samholt.github.io/;https://www.vanderschaar-lab.com", "dblp": "268/6948;SunLLZL19;322/3656;", "google_scholar": "yfy_BGIAAAAJ;7ZNoHJkAAAAJ;Ey5aInIAAAAJ;DZ3S--MAAAAJ", "orcid": ";;;", "linkedin": "alex-chan-040081131/;;;", "or_profile": "~Alex_James_Chan1;~Hao_Sun1;~Samuel_Holt1;~Mihaela_van_der_Schaar2", "aff": "Spotify;University of Cambridge;Google DeepMind;University of California, Los Angeles", "aff_domain": "spotify.com;cam.ac.uk;google.com;ucla.edu", "position": "Researcher;PhD student;Intern;Full Professor", "bibtex": "@inproceedings{\nchan2024dense,\ntitle={Dense Reward for Free in Reinforcement Learning from Human Feedback},\nauthor={Alex James Chan and Hao Sun and Samuel Holt and Mihaela van der Schaar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eyxVRMrZ4m}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4060452, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18224434425713577387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "spotify.com;cam.ac.uk;google.com;ucla.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Spotify;University of Cambridge;Google;University of California, Los Angeles", "aff_unique_dep": ";;Google DeepMind;", "aff_unique_url": "https://www.spotify.com;https://www.cam.ac.uk;https://deepmind.com;https://www.ucla.edu", "aff_unique_abbr": "Spotify;Cambridge;DeepMind;UCLA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Cambridge;Los Angeles", "aff_country_unique_index": "0;1;1;2", "aff_country_unique": "Sweden;United Kingdom;United States" }, { "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33475", "id": "f3TUipYU3U", "proceeding": "https://proceedings.mlr.press/v235/mazeika24a.html", "pdf": "https://openreview.net/pdf?id=f3TUipYU3U", "openreview": "https://openreview.net/forum?id=f3TUipYU3U", "author_site": "Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, Dan Hendrycks", "tldr": "", "abstract": "Automated red teaming holds substantial promise for uncovering and mitigating the risks associated with the malicious use of large language models (LLMs), yet the field lacks a standardized evaluation framework to rigorously assess new methods. To address this issue, we introduce HarmBench, a standardized evaluation framework for automated red teaming. We identify several desirable properties previously unaccounted for in red teaming evaluations and systematically design HarmBench to meet these criteria. Using HarmBench, we conduct a large-scale comparison of 18 red teaming methods and 33 target LLMs and defenses, yielding novel insights. 
We also introduce a highly efficient adversarial training method that greatly enhances LLM robustness across a wide range of attacks, demonstrating how HarmBench enables codevelopment of attacks and defenses. We open source HarmBench at https://github.com/centerforaisafety/HarmBench.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mantas Mazeika;Long Phan;Xuwang Yin;Andy Zou;Zifan Wang;Norman Mu;Elham Sakhaee;Nathaniel Li;Steven Basart;Bo Li;David Forsyth;Dan Hendrycks", "authorids": "~Mantas_Mazeika3;~Long_Phan1;~Xuwang_Yin2;~Andy_Zou1;~Zifan_Wang1;~Norman_Mu1;~Elham_Sakhaee3;~Nathaniel_Li1;~Steven_Basart1;~Bo_Li19;~David_Forsyth1;~Dan_Hendrycks1", "gender": "M;M;M;;M;M;F;M;M;F;M;", "homepage": "https://github.com/mmazeika;;https://xuwangyin.github.io/;;https://www.zifanw.net;;;https://nli0.github.io;http://stevenbas.art;http://boli.cs.illinois.edu/;https://cs.illinois.edu/directory/profile/daf;", "dblp": "215/4447;;125/2311;274/2362;;232/1821;;;245/2547;50/3402-26;f/DavidAForsyth;182/2504", "google_scholar": ";fVRQn4wAAAAJ;c425B6UAAAAJ;;HJOP3wMAAAAJ;UFlWdvUAAAAJ;;2XmBzbcAAAAJ;MzKvJhAAAAAJ;K8vJkTcAAAAJ;https://scholar.google.com.tw/citations?user=5H0arvkAAAAJ;", "orcid": ";;;;;;;;;;0000-0002-2278-0752;", "linkedin": ";long-phan-3110/;;andy-zou-09ba3616a/;zifan-wang-sail/;;elham-sakhaee;nli0/;xksteven/;;;", "or_profile": "~Mantas_Mazeika3;~Long_Phan1;~Xuwang_Yin2;~Andy_Zou1;~Zifan_Wang1;~Norman_Mu1;~Elham_Sakhaee3;~Nathaniel_Li1;~Steven_Basart1;~Bo_Li19;~David_Forsyth1;~Dan_Hendrycks1", "aff": "University of Illinois, Urbana-Champaign;Center for AI Safety;Center for AI Safety;Carnegie Mellon University;Center for AI Safety;University of California, Berkeley;;University of California, Berkeley;Center for AI Safety ;University of Illinois, Urbana Champaign;University of Illinois, Urbana-Champaign;Center for AI Safety", "aff_domain": "uiuc.edu;safe.ai;safe.ai;andrew.cmu.edu;safe.ai;berkeley.edu;;berkeley.edu;safe.ai;illinois.edu;uiuc.edu;safe.ai", "position": "PhD student;Research Engineer;Researcher;PhD student;Researcher;PhD student;;Undergrad student;Researcher;Assistant Professor;Full Professor;Executive and Research Director", "bibtex": "@inproceedings{\nmazeika2024harmbench,\ntitle={HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal},\nauthor={Mantas Mazeika and Long Phan and Xuwang Yin and Andy Zou and Zifan Wang and Norman Mu and Elham Sakhaee and Nathaniel Li and Steven Basart and Bo Li and David Forsyth and Dan Hendrycks},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=f3TUipYU3U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3050143, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 270, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16539780429070888458&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "uiuc.edu;safe.ai;safe.ai;andrew.cmu.edu;safe.ai;berkeley.edu;;berkeley.edu;safe.ai;illinois.edu;uiuc.edu;safe.ai", "author_num": 12, "aff_unique_index": "0;1;1;2;1;3;3;1;4;0;1", "aff_unique_norm": "University of Illinois;Center for AI Safety;Carnegie Mellon University;University of California, Berkeley;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;;;", "aff_unique_url": "https://illinois.edu;https://www.centerforaisafety.org;https://www.cmu.edu;https://www.berkeley.edu;https://illinois.edu", "aff_unique_abbr": "UIUC;;CMU;UC Berkeley;UIUC", 
"aff_campus_unique_index": "0;2;2;0;0", "aff_campus_unique": "Urbana-Champaign;;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Erasing the Bias: Fine-Tuning Foundation Models for Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33474", "id": "f47ZK6gy3I", "proceeding": "https://proceedings.mlr.press/v235/gan24a.html", "pdf": "https://openreview.net/pdf?id=f47ZK6gy3I", "openreview": "https://openreview.net/forum?id=f47ZK6gy3I", "author_site": "Kai Gan, Tong Wei", "tldr": "", "abstract": "Semi-supervised learning (SSL) has witnessed remarkable progress, resulting in the emergence of numerous method variations. However, practitioners often encounter challenges when attempting to deploy these methods due to their subpar performance. In this paper, we present a novel SSL approach named FineSSL that significantly addresses this limitation by adapting pre-trained foundation models. We identify the aggregated biases and cognitive deviation problems inherent in foundation models, and propose a simple yet effective solution by imposing balanced margin softmax and decoupled label smoothing. Through extensive experiments, we demonstrate that FineSSL sets a new state of the art for SSL on multiple benchmark datasets, reduces the training cost by over six times, and can seamlessly integrate various fine-tuning and modern SSL algorithms. The source code is available at https://github.com/Gank0078/FineSSL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Gan;Tong Wei", "authorids": "~Kai_Gan1;~Tong_Wei1", "gender": "M;M", "homepage": "https://palm.seu.edu.cn/weit/;https://cse.seu.edu.cn/", "dblp": "49/933-1;281/4015", "google_scholar": "EFCZuW4AAAAJ;", "orcid": "0000-0002-2766-8209;0009-0009-0286-907X", "linkedin": ";", "or_profile": "~Tong_Wei1;~Gan_Kai1", "aff": "Southeast University;Southeast University", "aff_domain": "seu.edu.cn;seu.edu.cn", "position": "Associate Professor;MS student", "bibtex": "@inproceedings{\ngan2024erasing,\ntitle={Erasing the Bias: Fine-Tuning Foundation Models for Semi-Supervised Learning},\nauthor={Kai Gan and Tong Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=f47ZK6gy3I}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 724130, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10317466894732073841&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "seu.edu.cn;seu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Data Poisoning Attacks against Conformal Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33473", "id": "f49AkFT5jf", "proceeding": "https://proceedings.mlr.press/v235/li24l.html", "pdf": "https://openreview.net/pdf?id=f49AkFT5jf", "openreview": "https://openreview.net/forum?id=f49AkFT5jf", "author_site": "Yangyi Li, Aobo Chen, Wei Qian, Chenxu Zhao, Divya Lidder, Mengdi Huai", "tldr": "", "abstract": "The efficient and theoretically sound uncertainty quantification is crucial for 
building trust in deep learning models. This has spurred a growing interest in conformal prediction (CP), a powerful technique that provides a model-agnostic and distribution-free method for obtaining conformal prediction sets with theoretical guarantees. However, the vulnerabilities of such CP methods with regard to dedicated data poisoning attacks have not been studied previously. To bridge this gap, for the first time, in this paper we propose a new class of black-box data poisoning attacks against CP, where the adversary aims to cause the desired manipulations of some specific examples' prediction uncertainty results (instead of misclassifications). Additionally, we design novel optimization frameworks for our proposed attacks. Further, we conduct extensive experiments to validate the effectiveness of our attacks in various settings (e.g., the full and split CP settings). Notably, our extensive experiments show that our attacks are more effective in manipulating uncertainty results than traditional poisoning attacks that aim at inducing misclassifications, and existing defenses against conventional attacks are ineffective against our proposed attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yangyi Li;Aobo Chen;Wei Qian;Chenxu Zhao;Divya Lidder;Mengdi Huai", "authorids": "~Yangyi_Li1;~Aobo_Chen1;~Wei_Qian5;~Chenxu_Zhao4;~Divya_Lidder1;~Mengdi_Huai1", "gender": ";M;M;F;F;M", "homepage": ";;;;https://mdhuai.github.io/;", "dblp": "326/2983;158/7501;;;150/8482;", "google_scholar": "evF1Es8AAAAJ;unVFhR8AAAAJ;n1gDJZQAAAAJ;;40ZYTzEAAAAJ;6J8ln3QAAAAJ", "orcid": ";;0000-0002-3298-9218;;0000-0001-6368-5973;", "linkedin": ";;;divya-lidder-041980213/;;chenxu-zhao-2b6590181/", "or_profile": "~Yangyi_Li1;~Aobo_Chen1;~Wei_Qian5;~Divya_Lidder1;~Mengdi_Huai1;~CHENXU_ZHAO2", "aff": "Iowa State University;Iowa State University;Iowa State University;Iowa State University;Iowa State University;Iowa State University", "aff_domain": "iastate.edu;iastate.edu;cs.iastate.edu;iastate.edu;iastate.edu;iastate.edu", "position": "PhD student;PhD student;PhD student;Undergrad student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nli2024data,\ntitle={Data Poisoning Attacks against Conformal Prediction},\nauthor={Yangyi Li and Aobo Chen and Wei Qian and Chenxu Zhao and Divya Lidder and Mengdi Huai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=f49AkFT5jf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 429152, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3509796549155389022&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "iastate.edu;iastate.edu;cs.iastate.edu;iastate.edu;iastate.edu;iastate.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Self-Composing Policies for Scalable Continual Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33472", "id": "f5gtX2VWSB", "proceeding": "https://proceedings.mlr.press/v235/malagon24a.html", "pdf": "https://openreview.net/pdf?id=f5gtX2VWSB", "openreview": 
"https://openreview.net/forum?id=f5gtX2VWSB", "author_site": "Mikel Malag\u00f3n, Josu Ceberio, Jose A Lozano", "tldr": "", "abstract": "This work introduces a growable and modular neural network architecture that naturally avoids catastrophic forgetting and interference in continual reinforcement learning. The structure of each module allows the selective combination of previous policies along with its internal policy accelerating the learning process on the current task. Unlike previous growing neural network approaches, we show that the number of parameters of the proposed approach grows linearly with respect to the number of tasks, and does not sacrifice plasticity to scale. Experiments conducted in benchmark continuous control and visual problems reveal that the proposed approach achieves greater knowledge transfer and performance than alternative methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mikel Malagon;Josu Ceberio;Jose A. Lozano", "authorids": "~Mikel_Malagon1;~Josu_Ceberio1;~Jose_A._Lozano1", "gender": "M;M;M", "homepage": "https://mikelma.srht.site/;http://www.sc.ehu.es/ccwbayes/members/jceberio/home/index.html;", "dblp": "239/8285;78/7651;", "google_scholar": "Vf3XURgAAAAJ;https://scholar.google.es/citations?user=yFknLj0AAAAJ;lhzoWpwAAAAJ", "orcid": "0000-0001-8246-9918;0000-0001-7120-6338;", "linkedin": ";;", "or_profile": "~Mikel_Malagon1;~Josu_Ceberio1;~Jose_A._Lozano1", "aff": "Universidad del Pa\u00eds Vasco;Universidad del Pa\u00eds Vasco;Basque Center for Applied Mathematics", "aff_domain": "ehu.eus;ehu.eus;bcamath.org", "position": "PhD student;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\nmalagon2024selfcomposing,\ntitle={Self-Composing Policies for Scalable Continual Reinforcement Learning},\nauthor={Mikel Malagon and Josu Ceberio and Jose A. Lozano},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=f5gtX2VWSB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6434727, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=607815899598752874&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ehu.eus;ehu.eus;bcamath.org", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Universidad del Pa\u00eds Vasco;Basque Center for Applied Mathematics", "aff_unique_dep": ";", "aff_unique_url": "https://www.ehu.eus/en;https://www.bcamath.org/", "aff_unique_abbr": "UPV/EHU;BCAM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Spain" }, { "title": "How Deep Do We Need: Accelerating Training and Inference of Neural ODEs via Control Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33471", "id": "f6QenZyyeP", "proceeding": "https://proceedings.mlr.press/v235/miao24a.html", "pdf": "https://openreview.net/pdf?id=f6QenZyyeP", "openreview": "https://openreview.net/forum?id=f6QenZyyeP", "author_site": "Keyan Miao, Konstantinos Gatsis", "tldr": "", "abstract": "Neural Ordinary Differential Equations (ODEs) have shown promise in learning continuous dynamics. However, their slow training and inference speed hinder wider applications. In this paper, we propose to optimize Neural ODEs from a spatial and temporal perspective, drawing inspiration from control theory. 
We aim to find a reasonable depth of the network, accelerating both training and inference while maintaining network performance. Two approaches are proposed. One reformulates training as a minimum-time optimal control problem directly in a single stage to search for the terminal time and network weights. The second approach uses pre-training coupled with a Lyapunov method in an initial stage, and then, in a second stage, introduces a safe terminal time updating mechanism in the forward direction. Experimental results demonstrate the effectiveness of both approaches in speeding up Neural ODEs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Keyan Miao;Konstantinos Gatsis", "authorids": "~Keyan_Miao1;~Konstantinos_Gatsis1", "gender": "F;", "homepage": "https://kymiao.github.io;https://kgatsis.github.io/", "dblp": ";", "google_scholar": ";YoOgmWEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Keyan_Miao1;~Konstantinos_Gatsis1", "aff": "University of Oxford;University of Southampton", "aff_domain": "ox.ac.uk;soton.ac.uk", "position": "PhD student;Lecturer", "bibtex": "@inproceedings{\nmiao2024how,\ntitle={How Deep Do We Need: Accelerating Training and Inference of Neural {ODE}s via Control Perspective},\nauthor={Keyan Miao and Konstantinos Gatsis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=f6QenZyyeP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1843989, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12477755857009734760&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ox.ac.uk;soton.ac.uk", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Oxford;University of Southampton", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.southampton.ac.uk", "aff_unique_abbr": "Oxford;Southampton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Amend to Alignment: Decoupled Prompt Tuning for Mitigating Spurious Correlation in Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33470", "id": "f8G2KSCSdp", "proceeding": "https://proceedings.mlr.press/v235/zhang24as.html", "pdf": "https://openreview.net/pdf?id=f8G2KSCSdp", "openreview": "https://openreview.net/forum?id=f8G2KSCSdp", "author_site": "Jie ZHANG, Xiaosong Ma, Song Guo, Peng Li, Wenchao Xu, Xueyang Tang, Zicong Hong", "tldr": "", "abstract": "Fine-tuning the learnable prompt for a pre-trained vision-language model (VLM), such as CLIP, has demonstrated exceptional efficiency in adapting to a broad range of downstream tasks. Existing prompt tuning methods for VLMs do not distinguish spurious features introduced by biased training data from invariant features, and employ a uniform alignment process when adapting to unseen target domains. This can impair the cross-modal feature alignment when the testing data significantly deviate from the distribution of the training data, resulting in a poor out-of-distribution (OOD) generalization performance. In this paper, we reveal that the prompt tuning failure in such OOD scenarios can be attributed to the undesired alignment between the textual and the spurious features. 
As a solution, we propose **CoOPood**, a fine-grained prompt tuning method that can discern the causal features and deliberately align the text modality with the invariant feature. Specifically, we design two independent contrastive phases using two lightweight projection layers during the alignment, each with different objectives: 1) pulling the text embedding closer to invariant image embedding and 2) pushing text embedding away from spurious image embedding. We have illustrated that **CoOPood** can serve as a general framework for VLMs and can be seamlessly integrated with existing prompt tuning methods. Extensive experiments on various OOD datasets demonstrate the performance superiority over state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie ZHANG;Xiaosong Ma;Song Guo;Peng Li;Wenchao Xu;Xueyang Tang;Zicong Hong", "authorids": "~Jie_ZHANG18;~Xiaosong_Ma4;~Song_Guo5;~Peng_Li20;~Wenchao_Xu1;~Xueyang_Tang1;~Zicong_Hong1", "gender": "F;M;M;M;;M;M", "homepage": "https://cugzj.github.io/zhangjie.github.io/;;https://cse.hkust.edu.hk/~songguo/;http://www.u-aizu.ac.jp/~pengli;;;https://zicongs-homepage.webflow.io/", "dblp": "84/6889-76;m/XiaosongMa;01/267-1;;;;", "google_scholar": "JRCNlI8AAAAJ;;https://scholar.google.com/citations?hl=en;BwVWb8UAAAAJ;;wAGIpRAAAAAJ;", "orcid": "0000-0002-8073-2118;0000-0001-7979-2183;;;;0000-0003-4284-9806;", "linkedin": ";;;;;;", "or_profile": "~Jie_ZHANG18;~Xiaosong_Ma4;~Song_Guo5;~Peng_Li20;~Wenchao_Xu1;~Xueyang_Tang1;~Zicong_Hong1", "aff": "The Hong Kong Polytechnic University;Hong Kong Polytechnic University;Department of Computer Science and Engineering, Hong Kong University of Science and Technology;University of Aizu;;The Hong Kong Polytechnic University;Hong Kong Polytechnic University", "aff_domain": "polyu.edu.hk;polyu.edu.hk;cse.ust.hk;u-aizu.ac.jp;;polyu.edu.hk;polyu.edu.hk", "position": "Postdoc;PhD student;Full Professor;Associate Professor;;PhD student;PhD student", "bibtex": "@inproceedings{\nzhang2024amend,\ntitle={Amend to Alignment: Decoupled Prompt Tuning for Mitigating Spurious Correlation in Vision-Language Models},\nauthor={Jie ZHANG and Xiaosong Ma and Song Guo and Peng Li and Wenchao Xu and Xueyang Tang and Zicong Hong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=f8G2KSCSdp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2327680, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5645075572006327649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "polyu.edu.hk;polyu.edu.hk;cse.ust.hk;u-aizu.ac.jp;;polyu.edu.hk;polyu.edu.hk", "author_num": 7, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Hong Kong Polytechnic University;Hong Kong University of Science and Technology;University of Aizu", "aff_unique_dep": ";Department of Computer Science and Engineering;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.ust.hk;https://www.u-aizu.ac.jp", "aff_unique_abbr": "PolyU;HKUST;UoA", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Japan" }, { "title": "Reshape and Adapt for Output Quantization (RAOQ): Quantization-aware Training for In-memory Computing Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33469", "id": "fM9xTkpAdu", 
"proceeding": "https://proceedings.mlr.press/v235/zhang24i.html", "pdf": "https://openreview.net/pdf?id=fM9xTkpAdu", "openreview": "https://openreview.net/forum?id=fM9xTkpAdu", "author_site": "Bonan Zhang, Chia-Yu Chen, Naveen Verma", "tldr": "", "abstract": "In-memory computing (IMC) has emerged as a promising solution to address both computation and data-movement challenges, by performing computation on data in-place directly in the memory array. IMC typically relies on analog operation, which makes analog-to-digital converters (ADCs) necessary, for converting results back to the digital domain. However, ADCs maintain computational efficiency by having limited precision, leading to substantial quantization errors in compute outputs. This work proposes RAOQ (Reshape and Adapt for Output Quantization) to overcome this issue, which comprises two classes of mechanisms including: 1) mitigating ADC quantization error by adjusting the statistics of activations and weights, through an activation-shifting approach (A-shift) and a weight reshaping technique (W-reshape); 2) adapting AI models to better tolerate ADC quantization through a bit augmentation method (BitAug), complemented by the introduction of ADC-LoRA, a low-rank approximation technique, to reduce the training overhead. RAOQ demonstrates consistently high performance across different scales and domains of neural network models for computer vision and natural language processing (NLP) tasks at various bit precisions, achieving state-of-the-art results with practical IMC implementations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bonan Zhang;Chia-Yu Chen;Naveen Verma", "authorids": "~Bonan_Zhang1;~Chia-Yu_Chen2;~Naveen_Verma1", "gender": ";M;M", "homepage": "https://ece.princeton.edu/people/bonan-zhang;https://www.linkedin.com/in/chia-yu-chen-82235723;https://nverma.princeton.edu/", "dblp": ";;", "google_scholar": ";;aX4-unUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Bonan_Zhang1;~Chia-Yu_Chen2;~Naveen_Verma1", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2024reshape,\ntitle={Reshape and Adapt for Output Quantization ({RAOQ}): Quantization-aware Training for In-memory Computing Systems},\nauthor={Bonan Zhang and Chia-Yu Chen and Naveen Verma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fM9xTkpAdu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1315526, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14635013488959972103&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": ";;", "author_num": 3 }, { "title": "Scale-Free Image Keypoints Using Differentiable Persistent Homology", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33468", "id": "fNJbcxhxRj", "proceeding": "https://proceedings.mlr.press/v235/barbarani24a.html", "pdf": "https://openreview.net/pdf?id=fNJbcxhxRj", "openreview": "https://openreview.net/forum?id=fNJbcxhxRj", "author_site": "Giovanni Barbarani, Francesco Vaccarino, Gabriele Trivigno, Marco Guerra, Gabriele Berton, Carlo Masone", "tldr": "", "abstract": "In computer vision, keypoint detection is a fundamental task, with applications spanning from robotics to image retrieval; however, existing learning-based methods suffer from scale dependency, and lack flexibility. 
This paper introduces a novel approach that leverages Morse theory and persistent homology, powerful tools rooted in algebraic topology. We propose a novel loss function based on the recent introduction of a notion of subgradient in persistent homology, paving the way towards topological learning. Our detector, MorseDet, is the first topology-based learning model for feature detection, which achieves competitive performance in keypoint repeatability and introduces a principled and theoretically robust approach to the problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Giovanni Barbarani;Francesco Vaccarino;Gabriele Trivigno;Marco Guerra;Gabriele Berton;Carlo Masone", "authorids": "~Giovanni_Barbarani1;~Francesco_Vaccarino1;~Gabriele_Trivigno1;~Marco_Guerra1;~Gabriele_Berton1;~Carlo_Masone1", "gender": "M;M;;M;M;M", "homepage": "https://it.linkedin.com/in/giovanni-barbarani-1b5361187;https://www.polito.it/en/staff?p=francesco.vaccarino;https://www.dauin.polito.it/it/personale/scheda/(nominativo)/gabriele.trivigno;https://www.marcoguerra192.github.io;;https://cmas1.github.io/", "dblp": ";;318/1059;;276/5303;26/8365", "google_scholar": ";https://scholar.google.it/citations?user=4XfzoZQAAAAJ;JXf_iToAAAAJ;;pc_rMSMAAAAJ;https://scholar.google.it/citations?user=cM3Iz_4AAAAJ", "orcid": ";0000-0002-0610-9168;;0000-0003-0033-3748;;0000-0002-1609-9338", "linkedin": ";francesco-vaccarino-67201b2/;gabriele-trivigno-7a1586168/;;;", "or_profile": "~Giovanni_Barbarani1;~Francesco_Vaccarino1;~Gabriele_Trivigno1;~Marco_Guerra1;~Gabriele_Berton1;~Carlo_Masone1", "aff": "Polytechnic Institute of Turin;Polytechnic Institute of Turin;Polytechnic Institute of Turin;Universit\u00e9 Grenoble Alpes;Carnegie Mellon University;Polytechnic Institute of Turin", "aff_domain": "polito.it;polito.it;polito.it;univ-grenoble-alpes.fr;cmu.edu;polito.it", "position": "MS student;Associate Professor;PhD student;Postdoc;Intern;Assistant Professor", "bibtex": "@inproceedings{\nbarbarani2024scalefree,\ntitle={Scale-Free Image Keypoints Using Differentiable Persistent Homology},\nauthor={Giovanni Barbarani and Francesco Vaccarino and Gabriele Trivigno and Marco Guerra and Gabriele Berton and Carlo Masone},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fNJbcxhxRj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6549390, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3230891814065938542&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": "polito.it;polito.it;polito.it;univ-grenoble-alpes.fr;cmu.edu;polito.it", "author_num": 6, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Polytechnic Institute of Turin;Universit\u00e9 Grenoble Alpes;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polito.it;https://www.univ-grenoble-alpes.fr;https://www.cmu.edu", "aff_unique_abbr": "Polito;UGA;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2;0", "aff_country_unique": "Italy;France;United States" }, { "title": "Video-of-Thought: Step-by-Step Video Reasoning from Perception to Cognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33467", "id": "fO31YAyNbI", "proceeding": "https://proceedings.mlr.press/v235/fei24a.html", "pdf": "https://openreview.net/pdf?id=fO31YAyNbI", 
"openreview": "https://openreview.net/forum?id=fO31YAyNbI", "author_site": "Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, Wynne Hsu", "tldr": "", "abstract": "Existing research of video understanding still struggles to achieve in-depth comprehension and reasoning in complex videos, primarily due to the under-exploration of two key bottlenecks: fine-grained spatial-temporal perceptive understanding and cognitive-level video scene comprehension. This paper bridges the gap by presenting a novel solution. We first introduce a novel video Multimodal Large Language Model (MLLM), MotionEpic, which achieves fine-grained pixel-level spatial-temporal video grounding by integrating video spatial-temporal scene graph (STSG) representation. Building upon MotionEpic, we then develop a Video-of-Thought (VoT) reasoning framework. VoT inherits the Chain-of-Thought (CoT) core, breaking down a complex task into simpler and manageable sub-problems, and addressing them step-by-step from a low-level pixel perception to high-level cognitive interpretation. Extensive experiments across various complex video QA benchmarks demonstrate that our overall framework strikingly boosts existing state-of-the-art. To our knowledge, this is the first attempt at successfully implementing the CoT technique for achieving human-level video reasoning, where we show great potential in extending it to a wider range of video understanding scenarios. Systems and codes will be open later.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Fei;Shengqiong Wu;Wei Ji;Hanwang Zhang;Meishan Zhang;Mong-Li Lee;Wynne Hsu", "authorids": "~Hao_Fei1;~Shengqiong_Wu2;~Wei_Ji1;~Hanwang_Zhang3;~Meishan_Zhang1;~Mong-Li_Lee1;~Wynne_Hsu1", "gender": "M;F;M;M;M;F;F", "homepage": "https://haofei.vip/;https://chocowu.github.io/;https://jiwei0523.github.io/;https://mreallab.github.io/index.html;https://zhangmeishan.github.io/;https://www.comp.nus.edu.sg/~leeml/;http://www.comp.nus.edu.sg/~whsu/", "dblp": "81/3569-1;274/7191;52/3220-8;79/8116.html;127/0273;l/MongLiLee;h/WynneHsu", "google_scholar": "YGDX46AAAAAJ;RJJLKR0AAAAJ;69OFB-AAAAAJ;YG0DFyYAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.tw/citations?user=_xFTK8wAAAAJ;https://scholar.google.com.tw/citations?user=ljyBjv8AAAAJ", "orcid": "0000-0003-3026-6347;0000-0001-6192-1194;0000-0002-8106-9768;;;0000-0002-9636-388X;0000-0002-4142-8893", "linkedin": ";;;;;;", "or_profile": "~Hao_Fei1;~Shengqiong_Wu2;~Wei_Ji1;~Hanwang_Zhang3;~Meishan_Zhang1;~Mong-Li_Lee1;~Wynne_Hsu1", "aff": "National University of Singapore;National University of Singapore;Nanjing University;Nanyang Technological University;Harbin Institute of Technology (Shenzhen), China;National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;u.nus.edu;nju.edu.cn;ntu.edu.sg;hit.edu.cn;nus.edu.sg;nus.edu.sg", "position": "Postdoc;PhD student;Associate Professor;Associate Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfei2024videoofthought,\ntitle={Video-of-Thought: Step-by-Step Video Reasoning from Perception to Cognition},\nauthor={Hao Fei and Shengqiong Wu and Wei Ji and Hanwang Zhang and Meishan Zhang and Mong-Li Lee and Wynne Hsu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fO31YAyNbI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1347644, "rating": "", "rating_avg": 0, "replies_avg": 
0, "authors#_avg": 7, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12557432622340577168&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 9, "email": "nus.edu.sg;u.nus.edu;nju.edu.cn;ntu.edu.sg;hit.edu.cn;nus.edu.sg;nus.edu.sg", "author_num": 7, "aff_unique_index": "0;0;1;2;3;0;0", "aff_unique_norm": "National University of Singapore;Nanjing University;Nanyang Technological University;Harbin Institute of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.nju.edu.cn;https://www.ntu.edu.sg;http://en.hhit.edu.cn/", "aff_unique_abbr": "NUS;Nanjing U;NTU;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0;1;0;0", "aff_country_unique": "Singapore;China" }, { "title": "Learning Low-dimensional Latent Dynamics from High-dimensional Observations: Non-asymptotics and Lower Bounds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33466", "id": "fOBas5H4Xc", "proceeding": "https://proceedings.mlr.press/v235/zhang24bh.html", "pdf": "https://openreview.net/pdf?id=fOBas5H4Xc", "openreview": "https://openreview.net/forum?id=fOBas5H4Xc", "author_site": "Yuyang Zhang, Shahriar Talebi, Na Li", "tldr": "", "abstract": "In this paper, we focus on learning a linear time-invariant (LTI) model with low-dimensional latent variables but high-dimensional observations. We provide an algorithm that recovers the high-dimensional features, i.e. column space of the observer, embeds the data into low dimensions and learns the low-dimensional model parameters. Our algorithm enjoys a sample complexity guarantee of order $\\tilde{\\mathcal{O}}(n/\\epsilon^2)$, where $n$ is the observation dimension. We further establish a fundamental lower bound indicating this complexity bound is optimal up to logarithmic factors and dimension-independent constants. We show that this inevitable linear factor of $n$ is due to the learning error of the observer's column space in the presence of high-dimensional noises. Extending our results, we consider a meta-learning problem inspired by various real-world applications, where the observer column space can be collectively learned from datasets of multiple LTI systems. 
An end-to-end algorithm is then proposed, facilitating the learning of LTI systems from a meta-dataset, which breaks the sample complexity lower bound in certain scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuyang Zhang;Shahriar Talebi;Na Li", "authorids": "~Yuyang_Zhang4;~Shahriar_Talebi2;~Na_Li3", "gender": "M;M;F", "homepage": ";https://shahriarta.github.io;https://nali.seas.harvard.edu/", "dblp": ";204/4214;", "google_scholar": "https://scholar.google.ca/citations?user=NiBKGakAAAAJ;https://scholar.google.com/citations?hl=en;qdGelXoAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yuyang_Zhang4;~Shahriar_Talebi2;~Na_Li3", "aff": "Harvard University;Harvard University;Harvard University", "aff_domain": "g.harvard.edu;harvard.edu;harvard.edu", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nzhang2024learning,\ntitle={Learning Low-dimensional Latent Dynamics from High-dimensional Observations: Non-asymptotics and Lower Bounds},\nauthor={Yuyang Zhang and Shahriar Talebi and Na Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fOBas5H4Xc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1242805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5769815821150030408&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "g.harvard.edu;harvard.edu;harvard.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Riemannian Accelerated Zeroth-order Algorithm: Improved Robustness and Lower Query Complexity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33465", "id": "fPwWfoyxL1", "proceeding": "https://proceedings.mlr.press/v235/he24h.html", "pdf": "https://openreview.net/pdf?id=fPwWfoyxL1", "openreview": "https://openreview.net/forum?id=fPwWfoyxL1", "author_site": "Chang He, Zhaoye Pan, Xiao Wang, Bo Jiang", "tldr": "", "abstract": "Optimization problems with access to only zeroth-order information of the objective function on Riemannian manifolds arise in various applications, spanning from statistical learning to robot learning. While various zeroth-order algorithms have been proposed in Euclidean space, they are not inherently designed to handle the challenging constraints imposed by Riemannian manifolds. The proper adaptation of zeroth-order techniques to Riemannian manifolds remained unknown until the pioneering work of (Li et al., 2023a). However, zeroth-order algorithms are widely observed to converge slowly and be unstable in practice. To alleviate these issues, we propose a Riemannian accelerated zeroth-order algorithm with improved robustness. Regarding efficiency, our accelerated algorithm has a function query complexity of $\\mathcal{O}(\\epsilon^{-7/4}d)$ for finding an $\\epsilon$-approximate first-order stationary point. By introducing a small perturbation, it exhibits a function query complexity of $\\tilde{\\mathcal{O}}(\\epsilon^{-7/4}d)$ for seeking a second-order stationary point with high probability, matching the state-of-the-art result in Euclidean space.
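For readers unfamiliar with the setting, the following sketch shows the classical two-point zeroth-order gradient estimator in Euclidean space, the building block that such algorithms adapt to manifolds; it is a generic illustration and implements neither the retraction steps nor the acceleration studied in the paper.

```python
# Generic two-point zeroth-order gradient estimator (Euclidean case), shown
# only to illustrate what "zeroth-order information" means here; the paper's
# algorithm additionally handles Riemannian geometry and acceleration.
import numpy as np

def zo_gradient(f, x, mu=1e-4, rng=np.random.default_rng(0)):
    u = rng.standard_normal(x.shape)
    u /= np.linalg.norm(u)  # random direction on the unit sphere
    return (x.size * (f(x + mu * u) - f(x)) / mu) * u

# Example: estimate the gradient of a quadratic at a random point.
# g = zo_gradient(lambda z: float(z @ z), np.ones(5))
```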
Moreover, we establish almost sure convergence in the asymptotic sense through the Stable Manifold Theorem. Regarding robustness, our algorithm requires larger smoothing parameters on the order of $\\tilde{\\mathcal{O}}(\\epsilon^{7/8}d^{-1/2})$, improving the existing result by a factor of $\\tilde{\\mathcal{O}}(\\epsilon^{3/4})$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chang He;Zhaoye Pan;Xiao Wang;Bo Jiang", "authorids": "~Chang_He1;~Zhaoye_Pan1;~Xiao_Wang4;~Bo_Jiang3", "gender": "M;M;;M", "homepage": ";https://muhoushaonian.github.io/;;https://sites.google.com/site/isyebojiang/", "dblp": ";;;", "google_scholar": "5LpHDYAAAAAJ;;;zsl_4FYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Chang_He1;~Zhaoye_Pan1;~Xiao_Wang4;~Bo_Jiang3", "aff": "Shanghai University of Finance and Economics;Shanghai University of Finance and Economics;;", "aff_domain": "sufe.edu;sufe.edu.cn;;", "position": "PhD student;MS student;;", "bibtex": "@inproceedings{\nhe2024riemannian,\ntitle={Riemannian Accelerated Zeroth-order Algorithm: Improved Robustness and Lower Query Complexity},\nauthor={Chang He and Zhaoye Pan and Xiao Wang and Bo Jiang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fPwWfoyxL1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1275869, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3621525091069814828&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "sufe.edu;sufe.edu.cn;;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Shanghai University of Finance and Economics", "aff_unique_dep": "", "aff_unique_url": "http://www.sufe.edu.cn", "aff_unique_abbr": "SUFE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Large Language Models Can Automatically Engineer Features for Few-Shot Tabular Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33464", "id": "fRG45xL1WT", "proceeding": "https://proceedings.mlr.press/v235/han24f.html", "pdf": "https://openreview.net/pdf?id=fRG45xL1WT", "openreview": "https://openreview.net/forum?id=fRG45xL1WT", "author_site": "Sungwon Han, Jinsung Yoon, Sercan Arik, Tomas Pfister", "tldr": "", "abstract": "Large Language Models (LLMs), with their remarkable ability to tackle challenging and unseen reasoning problems, hold immense potential for tabular learning, which is vital for many real-world applications. In this paper, we propose a novel in-context learning framework, FeatLLM, which employs LLMs as feature engineers to produce an input data set that is optimally suited for tabular predictions. The generated features are used to infer class likelihood with a simple downstream machine learning model, such as linear regression, yielding high-performance few-shot learning. The proposed FeatLLM framework only uses this simple predictive model with the discovered features at inference time. Compared to existing LLM-based approaches, FeatLLM eliminates the need to send queries to the LLM for each sample at inference time. Moreover, it requires only API-level access to LLMs and overcomes prompt size limitations.
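A minimal sketch of this inference pattern is given below, assuming a hypothetical one-time helper `query_llm_for_rules` (not the authors' released code): the LLM is queried once to obtain feature-engineering rules, a simple linear model is fit on the derived features, and per-sample inference needs no further LLM calls.

```python
# Minimal sketch of the FeatLLM-style pipeline described above (illustrative,
# not the authors' implementation). `query_llm_for_rules` is a hypothetical
# stand-in for a single API-level LLM call returning callable feature rules.
from sklearn.linear_model import LogisticRegression

def query_llm_for_rules(examples):
    raise NotImplementedError("one-time LLM query; returns list of row -> float rules")

def fit_featllm(rows, labels):
    rules = query_llm_for_rules(list(zip(rows, labels)))  # LLM used once, up front
    X = [[rule(r) for rule in rules] for r in rows]
    model = LogisticRegression().fit(X, labels)           # simple downstream model
    return rules, model

def predict(rules, model, row):
    # Inference reuses the cached rules; no per-sample LLM queries are needed.
    return model.predict([[rule(row) for rule in rules]])[0]
```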
As demonstrated across numerous tabular datasets from a wide range of domains, FeatLLM generates high-quality rules, significantly (10% on average) outperforming alternatives such as TabLLM and STUNT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sungwon Han;Jinsung Yoon;Sercan O Arik;Tomas Pfister", "authorids": "~Sungwon_Han1;~Jinsung_Yoon1;~Sercan_O_Arik1;~Tomas_Pfister1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/sungwon-han/;https://sites.google.com/corp/view/jinsungyoon;https://www.sercanarik.com/;http://tomas.pfister.fi", "dblp": "72/5688-1;173/5409.html;;14/8360", "google_scholar": "8zWgcFgAAAAJ;kiFd6A8AAAAJ;;ahSpJOAAAAAJ", "orcid": "0000-0002-1129-760X;;0000-0001-6333-1729;0009-0004-4088-8718", "linkedin": "sungwon-han-1bbb63133/;jinsung-yoon-bb7751b8;;", "or_profile": "~Sungwon_Han1;~Jinsung_Yoon1;~Sercan_O_Arik1;~Tomas_Pfister1", "aff": "Korea Advanced Institute of Science & Technology;Google;Google;Google", "aff_domain": "kaist.ac.kr;google.com;google.com;google.com", "position": "Integrated PhD student;Research Scientist;Research Scientist;Head of Research @ Cloud AI", "bibtex": "@inproceedings{\nhan2024large,\ntitle={Large Language Models Can Automatically Engineer Features for Few-Shot Tabular Learning},\nauthor={Sungwon Han and Jinsung Yoon and Sercan O Arik and Tomas Pfister},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fRG45xL1WT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1078737, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18114022116274410220&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.kaist.ac.kr;https://www.google.com", "aff_unique_abbr": "KAIST;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Graph Neural Networks Use Graphs When They Shouldn't", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33463", "id": "fSNHK7mu3j", "proceeding": "https://proceedings.mlr.press/v235/bechler-speicher24a.html", "pdf": "https://openreview.net/pdf?id=fSNHK7mu3j", "openreview": "https://openreview.net/forum?id=fSNHK7mu3j", "author_site": "Maya Bechler-Speicher, Ido Amos, Ran Gilad-Bachrach, Amir Globerson", "tldr": "", "abstract": "Predictions over graphs play a crucial role in various domains, including social networks and medicine. Graph Neural Networks (GNNs) have emerged as the dominant approach for learning on graph data. Although a graph structure is provided as input to the GNN, in some cases the best solution can be obtained by ignoring it. While GNNs have the ability to ignore the graph structure in such cases, it is not clear that they will. In this work, we show that GNNs actually tend to overfit the given graph structure in the sense that they use it even when a better solution can be obtained by ignoring it.
We analyze the implicit bias of gradient-descent learning of GNNs and prove that when the ground truth function does not use the graphs, GNNs are not guaranteed to learn a solution that ignores the graph, even with infinite data. We examine this phenomenon with respect to different graph distributions and find that regular graphs are more robust to this overfitting. We also prove that within the family of regular graphs, GNNs are guaranteed to extrapolate when learning with gradient descent. Finally, based on our empirical and theoretical findings, we demonstrate on real data how regular graphs can be leveraged to reduce graph overfitting and enhance performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maya Bechler-Speicher;Ido Amos;Ran Gilad-Bachrach;Amir Globerson", "authorids": "~Maya_Bechler-Speicher1;~Ido_Amos1;~Ran_Gilad-Bachrach2;~Amir_Globerson1", "gender": "Not Specified;;;M", "homepage": ";;http://mlwell.org;http://www.cs.tau.ac.il/~gamir/", "dblp": ";;g/RGiladBachrach;08/4162.html", "google_scholar": "https://scholar.google.co.il/citations?user=5Fj_AUoAAAAJ;;nnLiId8AAAAJ;https://scholar.google.com.tw/citations?user=5JserkUAAAAJ", "orcid": ";;0000-0002-4001-8307;", "linkedin": "maya-bechler-speicher-815103103/;;ranigb/;", "or_profile": "~Maya_Bechler-Speicher1;~Ido_Amos1;~Ran_Gilad-Bachrach2;~Amir_Globerson1", "aff": "Tel Aviv University;;Microsoft;Tel Aviv University", "aff_domain": "tau.ac.il;;microsoft.com;tau.ac.il", "position": "PhD student;;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nbechler-speicher2024graph,\ntitle={Graph Neural Networks Use Graphs When They Shouldn't},\nauthor={Maya Bechler-Speicher and Ido Amos and Ran Gilad-Bachrach and Amir Globerson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fSNHK7mu3j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2624906, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4128621615630680357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tau.ac.il;;microsoft.com;tau.ac.il", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Tel Aviv University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.tau.ac.il;https://www.microsoft.com", "aff_unique_abbr": "TAU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Boundary Exploration for Bayesian Optimization With Unknown Physical Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33462", "id": "fSnMqHZ8xr", "proceeding": "https://proceedings.mlr.press/v235/tian24g.html", "pdf": "https://openreview.net/pdf?id=fSnMqHZ8xr", "openreview": "https://openreview.net/forum?id=fSnMqHZ8xr", "author_site": "Yunsheng Tian, Ane Zuniga, Xinwei Zhang, Johannes P. D\u00fcrholt, Payel Das, Jie Chen, Wojciech Matusik, Mina Konakovic Lukovic", "tldr": "", "abstract": "Bayesian optimization has been successfully applied to optimize black-box functions where the number of evaluations is severely limited. However, in many real-world applications, it is hard or impossible to know in advance which designs are feasible due to some physical or system limitations.
These issues lead to an even more challenging problem of optimizing an unknown function with unknown constraints. In this paper, we observe that in such scenarios the optimal solution typically lies on the boundary between the feasible and infeasible regions of the design space, making the problem considerably more difficult than one with interior optima. Inspired by this observation, we propose BE-CBO, a new Bayesian optimization method that efficiently explores the boundary between feasible and infeasible designs. To identify the boundary, we learn the constraints with an ensemble of neural networks that outperforms standard Gaussian processes at capturing complex boundaries. Our method demonstrates superior performance against state-of-the-art methods through comprehensive experiments on synthetic and real-world benchmarks. Code available at: https://github.com/yunshengtian/BE-CBO", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunsheng Tian;Ane Zuniga;Xinwei Zhang;Johannes P. D\u00fcrholt;Payel Das;Jie Chen;Wojciech Matusik;Mina Konakovic Lukovic", "authorids": "~Yunsheng_Tian1;~Ane_Zuniga1;~Xinwei_Zhang1;~Johannes_P._D\u00fcrholt1;~Payel_Das1;~Jie_Chen1;~Wojciech_Matusik2;~Mina_Konakovic_Lukovic1", "gender": "M;;M;;F;;M;F", "homepage": "https://www.yunshengtian.com/;https://anezuniga.github.io/;https://564612540.github.io/;;;https://jiechenjiechen.github.io;https://cdfg.mit.edu/wojciech;http://people.csail.mit.edu/mina/", "dblp": "224/0723;;55/9870-1.html;;56/7926;92/6289-7;;", "google_scholar": "sf6RjM4AAAAJ;;uq46meMAAAAJ;;;Z-lkme8AAAAJ;https://scholar.google.com/citations?hl=en;32Q2ni8AAAAJ", "orcid": ";;0000-0001-7967-7150;;;;0000-0003-0212-5643;0000-0002-2895-0206", "linkedin": ";;;;;;wojciech-matusik-67238126/;", "or_profile": "~Yunsheng_Tian1;~Ane_Zuniga1;~Xinwei_Zhang1;~Johannes_P._D\u00fcrholt1;~Payel_Das1;~Jie_Chen1;~Wojciech_Matusik2;~Mina_Konakovic_Lukovic1", "aff": "International Business Machines;Massachusetts Institute of Technology;University of Southern California;;IBM, International Business Machines;International Business Machines;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "ibm.com;mit.edu;usc.edu;;us.ibm.com;ibm.com;mit.edu;mit.edu", "position": "Intern;PhD student;Postdoc;;Principal Researcher;Research Staff Member;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\ntian2024boundary,\ntitle={Boundary Exploration for Bayesian Optimization With Unknown Physical Constraints},\nauthor={Yunsheng Tian and Ane Zuniga and Xinwei Zhang and Johannes P.
D{\\\"u}rholt and Payel Das and Jie Chen and Wojciech Matusik and Mina Konakovic Lukovic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fSnMqHZ8xr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8926222, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TYja0fbuklQJ:scholar.google.com/&scioq=Boundary+Exploration+for+Bayesian+Optimization+With+Unknown+Physical+Constraints&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "ibm.com;mit.edu;usc.edu;;us.ibm.com;ibm.com;mit.edu;mit.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;0;1;1", "aff_unique_norm": "International Business Machines Corporation;Massachusetts Institute of Technology;University of Southern California;International Business Machines", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ibm.com;https://web.mit.edu;https://www.usc.edu;https://www.ibm.com", "aff_unique_abbr": "IBM;MIT;USC;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Beyond ELBOs: A Large-Scale Evaluation of Variational Methods for Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33461", "id": "fVg9YrSllr", "proceeding": "https://proceedings.mlr.press/v235/blessing24a.html", "pdf": "https://openreview.net/pdf?id=fVg9YrSllr", "openreview": "https://openreview.net/forum?id=fVg9YrSllr", "author_site": "Denis Blessing, Xiaogang Jia, Johannes Esslinger, Francisco Vargas, Gerhard Neumann", "tldr": "", "abstract": "Monte Carlo methods, Variational Inference, and their combinations play a pivotal role in sampling from intractable probability distributions. However, current studies lack a unified evaluation framework, relying on disparate performance measures and limited method comparisons across diverse tasks, complicating the assessment of progress and hindering the decision-making of practitioners. In response to these challenges, our work introduces a benchmark that evaluates sampling methods using a standardized task suite and a broad range of performance criteria. Moreover, we study existing metrics for quantifying mode collapse and introduce novel metrics for this purpose. 
Our findings provide insights into strengths and weaknesses of existing sampling methods, serving as a valuable reference for future developments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Denis Blessing;Xiaogang Jia;Johannes Esslinger;Francisco Vargas;Gerhard Neumann", "authorids": "~Denis_Blessing1;~Xiaogang_Jia1;~Johannes_Esslinger1;~Francisco_Vargas1;~Gerhard_Neumann2", "gender": "M;M;M;M;M", "homepage": ";https://xiaogangjia.github.io/Personal_Website/;;;https://alr.anthropomatik.kit.edu/", "dblp": "219/1435;23/10777;;79/7431-1;60/4878", "google_scholar": "https://scholar.google.de/citations?view_op=list_works;E7Tja9gAAAAJ;;;https://scholar.google.com.tw/citations?user=GL360kMAAAAJ", "orcid": ";;;;", "linkedin": ";;johannes-esslinger-0b5197211;;", "or_profile": "~Denis_Blessing1;~Xiaogang_Jia1;~Johannes_Esslinger1;~Francisco_Vargas1;~Gerhard_Neumann1", "aff": "Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;University of Cambridge;Karlsruhe Institute of Technology", "aff_domain": "kit.edu;kit.edu;kit.edu;cam.ac.uk;kit.edu", "position": "PhD student;PhD student;MS student;PhD student;Full Professor", "bibtex": "@inproceedings{\nblessing2024beyond,\ntitle={Beyond {ELBO}s: A Large-Scale Evaluation of Variational Methods for Sampling},\nauthor={Denis Blessing and Xiaogang Jia and Johannes Esslinger and Francisco Vargas and Gerhard Neumann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fVg9YrSllr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7487108, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15325031186159406366&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "kit.edu;kit.edu;kit.edu;cam.ac.uk;kit.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie;University of Cambridge;Karlsruhe Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kit.edu;https://www.cam.ac.uk;https://www.kit.edu", "aff_unique_abbr": "KIT;Cambridge;KIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Prompting is a Double-Edged Sword: Improving Worst-Group Robustness of Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33460", "id": "fdroxYsgzQ", "proceeding": "https://proceedings.mlr.press/v235/setlur24a.html", "pdf": "https://openreview.net/pdf?id=fdroxYsgzQ", "openreview": "https://openreview.net/forum?id=fdroxYsgzQ", "author_site": "Amrith Setlur, Saurabh Garg, Virginia Smith, Sergey Levine", "tldr": "", "abstract": "Machine learning models fail catastrophically under distribution shift, but a surprisingly effective way to empirically improve robustness to some types of shift (*e.g.*, Imagenet-A/C) is to use stronger open-vocabulary classifiers derived from foundation models. In this work, we first note that for shifts governed by spurious correlations (features spuriously correlated with the label on the training data, but not on test), the zero-shot and few-shot performance of foundation models is no better than ERM models, and remains unchanged when pretrained data/model size is scaled. 
Second, even in these situations, foundation models are quite accurate at predicting the value of the spurious feature. In a simplified setup, we theoretically analyze both of these findings. Specifically, we show that during contrastive pretraining, the simplicity bias of foundation models tends to result in the learning of features that mostly rely on the spurious attribute, compared to more robust features. We leverage these observations to propose Prompting for Robustness (PfR), which first uses foundation models to zero-shot predict the spurious attribute on labeled examples, and then learns a classifier with balanced performance across different groups of labels and spurious attribute. Across 5 vision and language tasks, we show that PfR's performance nearly equals that of an oracle algorithm (group DRO) that leverages human-labeled spurious attributes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amrith Setlur;Saurabh Garg;Virginia Smith;Sergey Levine", "authorids": "~Amrith_Setlur1;~Saurabh_Garg3;~Virginia_Smith1;~Sergey_Levine1", "gender": "M;M;F;M", "homepage": "http://ars22.github.io;http://saurabhgarg1996.github.io/;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "https://dblp.uni-trier.de/pers/hd/s/Setlur:Amrith;80/208;120/0921;80/7594", "google_scholar": "https://scholar.google.ru/citations?user=i7V1kJgAAAAJ;SAnJ1hIAAAAJ;;8R35rCwAAAAJ", "orcid": "0000-0002-7061-3094;;;", "linkedin": ";saurabh-garg-b680b5b8/;;", "or_profile": "~Amrith_Setlur1;~Saurabh_Garg3;~Virginia_Smith1;~Sergey_Levine1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Google", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;google.com", "position": "PhD student;PhD student;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nsetlur2024prompting,\ntitle={Prompting is a Double-Edged Sword: Improving Worst-Group Robustness of Foundation Models},\nauthor={Amrith Setlur and Saurabh Garg and Virginia Smith and Sergey Levine},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fdroxYsgzQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2483543, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12216105835016844011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "cmu.edu;cmu.edu;cmu.edu;google.com", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "MAGDi: Structured Distillation of Multi-Agent Interaction Graphs Improves Reasoning in Smaller Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33459", "id": "ffLblkoCw8", "proceeding": "https://proceedings.mlr.press/v235/chen24ah.html", "pdf": "https://openreview.net/pdf?id=ffLblkoCw8", "openreview": "https://openreview.net/forum?id=ffLblkoCw8", "author_site": "Justin Chih-Yao Chen, Swarnadeep Saha, Elias Stengel-Eskin, Mohit Bansal", "tldr": "", "abstract": "Multi-agent interactions between Large Language Model (LLM) agents have shown major improvements on diverse reasoning tasks.
However, these involve long generations from multiple models across several rounds, making them expensive. Moreover, these multi-agent approaches fail to provide a final, single model for efficient inference. To address this, we introduce MAGDi, a new method for structured distillation of the reasoning interactions between multiple LLMs into smaller LMs. MAGDi teaches smaller models by representing multi-agent interactions as graphs, augmenting a base student model with a graph encoder, and distilling knowledge using three objective functions: next-token prediction, a contrastive loss between correct and incorrect reasoning, and a graph-based objective to model the interaction structure. Experiments on seven widely used commonsense and math reasoning benchmarks show that MAGDi improves the reasoning capabilities of smaller models, outperforming several methods that distill from a single teacher and multiple teachers. Moreover, MAGDi also demonstrates an order of magnitude higher efficiency over its teachers. We conduct extensive analyses to show that MAGDi (1) enhances the generalizability to out-of-domain tasks, (2) scales positively with the size and strength of the base student model, and (3) obtains larger improvements (via our multi-teacher training) when applying self-consistency \u2013 an inference technique that relies on model diversity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Justin Chen;Swarnadeep Saha;Elias Stengel-Eskin;Mohit Bansal", "authorids": "~Justin_Chen1;~Swarnadeep_Saha2;~Elias_Stengel-Eskin1;~Mohit_Bansal2", "gender": "M;M;M;M", "homepage": "https://esteng.github.io;https://www.cs.unc.edu/~mbansal/;https://dinobby.github.io/;https://swarnahub.github.io/", "dblp": "212/6138;32/5243.html;248/8754.html;203/9296", "google_scholar": "gr_ZVSQAAAAJ;DN8QtscAAAAJ;https://scholar.google.com.tw/citations?user=ODoG9isAAAAJ;sY5SyBgAAAAJ", "orcid": "0000-0002-6689-505X;;0009-0006-4125-6418;", "linkedin": ";;;", "or_profile": "~Elias_Stengel-Eskin1;~Mohit_Bansal2;~Chih_Yao_Chen1;~Swarnadeep_Saha1", "aff": "University of North Carolina at Chapel Hill;University of North Carolina at Chapel Hill;University of North Carolina at Chapel Hill;Department of Computer Science, University of North Carolina, Chapel Hill", "aff_domain": "cs.unc.edu;unc.edu;unc.edu;cs.unc.edu", "position": "Postdoc;Full Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nchen2024magdi,\ntitle={{MAGD}i: Structured Distillation of Multi-Agent Interaction Graphs Improves Reasoning in Smaller Language Models},\nauthor={Justin Chen and Swarnadeep Saha and Elias Stengel-Eskin and Mohit Bansal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ffLblkoCw8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1745327, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16877099597860982025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.unc.edu;unc.edu;unc.edu;cs.unc.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of North Carolina", "aff_unique_dep": "", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Lessons from Generalization Error Analysis of Federated 
Learning: You May Communicate Less Often!", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33458", "id": "ffS0aYP6mk", "proceeding": "https://proceedings.mlr.press/v235/sefidgaran24a.html", "pdf": "https://openreview.net/pdf?id=ffS0aYP6mk", "openreview": "https://openreview.net/forum?id=ffS0aYP6mk", "author_site": "Milad Sefidgaran, Romain Chor, Abdellatif Zaidi, Yijun Wan", "tldr": "", "abstract": "We investigate the generalization error of statistical learning models in a Federated Learning (FL) setting. Specifically, we study the evolution of the generalization error with the number of communication rounds $R$ between $K$ clients and a parameter server (PS), i.e. the effect on the generalization error of how often the clients' local models are aggregated at PS. In our setup, the more the clients communicate with PS, the less data they use for local training in each round, such that the amount of training data per client is identical for distinct values of $R$. We establish PAC-Bayes and rate-distortion theoretic bounds on the generalization error that account explicitly for the effect of the number of rounds $R$, in addition to the number of participating devices $K$ and individual dataset size $n$. The bounds, which apply to a large class of loss functions and learning algorithms, appear to be the first of their kind for the FL setting. Furthermore, we apply our bounds to FL-type Support Vector Machines (FSVM) and derive (more) explicit bounds in this case. In particular, we show that the generalization bound of FSVM increases with $R$, suggesting that more frequent communication with PS diminishes the generalization power. This implies that the population risk decreases more slowly with $R$ than the empirical risk does. Moreover, our bound suggests that the generalization error of FSVM decreases faster than that of centralized learning by a factor of $\\mathcal{O}(\\sqrt{\\log(K)/K})$. Finally, we provide experimental results obtained using neural networks (ResNet-56) which show evidence that not only may our observations for FSVM hold more generally but also that the population risk may even start to increase beyond some value of $R$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Milad Sefidgaran;Romain Chor;Abdellatif Zaidi;Yijun Wan", "authorids": "~Milad_Sefidgaran1;~Romain_Chor1;~Abdellatif_Zaidi1;~Yijun_Wan1", "gender": "M;M;M;", "homepage": ";;http://www-syscom.univ-mlv.fr/~zaidi/;", "dblp": "56/9885.html;;07/3113;", "google_scholar": "https://scholar.google.com/citations?hl=en;VNjZ9WwAAAAJ;;", "orcid": ";;;", "linkedin": "milad-sefidgaran;romain-chor/;;", "or_profile": "~Milad_Sefidgaran1;~Romain_Chor1;~Abdellatif_Zaidi1;~Yijun_Wan1", "aff": "Huawei Technologies Ltd.
(Paris Research Center);Huawei Technologies France;Universit\u00e9 Gustave Eiffel;", "aff_domain": "huawei.com;huawei.com;univ-eiffel.fr;", "position": "Researcher;PhD student;Associate Professor;", "bibtex": "@inproceedings{\nsefidgaran2024lessons,\ntitle={Lessons from Generalization Error Analysis of Federated Learning: You May Communicate Less Often!},\nauthor={Milad Sefidgaran and Romain Chor and Abdellatif Zaidi and Yijun Wan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ffS0aYP6mk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1574614, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1111791673415573705&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "huawei.com;huawei.com;univ-eiffel.fr;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Huawei;Universit\u00e9 Gustave Eiffel", "aff_unique_dep": "Huawei Technologies Ltd.;", "aff_unique_url": "https://www.huawei.com;https://www.univ-gustave-eiffel.fr", "aff_unique_abbr": "Huawei;UGE", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris Research Center;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;France" }, { "title": "SFC: Achieve Accurate Fast Convolution under Low-precision Arithmetic", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33457", "id": "fgBWtOw66T", "proceeding": "https://proceedings.mlr.press/v235/he24m.html", "pdf": "https://openreview.net/pdf?id=fgBWtOw66T", "openreview": "https://openreview.net/forum?id=fgBWtOw66T", "author_site": "Liulu He, yufei zhao, rui gao, Yuan Du, Li Du", "tldr": "", "abstract": "Fast convolution algorithms, including Winograd and FFT, can efficiently accelerate convolution operations in deep models. However, these algorithms depend on high-precision arithmetic to maintain inference accuracy, which conflicts with model quantization. To resolve this conflict and further improve the efficiency of quantized convolution, we propose SFC, a new algebraic transform for fast convolution that extends the Discrete Fourier Transform (DFT) with symbolic computing, in which only additions are required to perform the transformation at specific transform points, avoiding the calculation of irrational numbers and reducing the requirement for precision. Additionally, we enhance convolution efficiency by introducing correction terms to convert invalid circular convolution outputs of the Fourier method into effective ones. The numerical error analysis is presented for the first time in this type of work and proves that our algorithms can provide a 3.68\u00d7 multiplication reduction for 3\u00d73 convolution, while the Winograd algorithm only achieves a 2.25\u00d7 reduction with similarly low numerical errors.
Experiments carried out on benchmarks and FPGA show that our new algorithms can further improve the computation efficiency of quantized models while maintaining accuracy, surpassing both the quantization-alone method and existing works on fast convolution quantization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liulu He;yufei zhao;rui gao;Yuan Du;Li Du", "authorids": "~Liulu_He1;662023230005@smail.nju.edu.cn;gaorui_seu@foxmail.com;~Yuan_Du2;~Li_Du5", "gender": "M;;;M;", "homepage": "https://iscl.nju.edu.cn/2c/80/c42983a601216/page.htm;;;https://ese.nju.edu.cn/dy_en/list.htm;", "dblp": "283/8409.html;;;26/8831;", "google_scholar": ";;;zyu8Qy4AAAAJ;", "orcid": ";;;0000-0002-5316-619X;", "linkedin": ";;;;", "or_profile": "~Liulu_He1;662023230005@smail.nju.edu.cn;gaorui_seu@foxmail.com;~Yuan_Du2;~Li_Du5", "aff": "Nanjing University;;;Nanjing University;", "aff_domain": "nju.edu.cn;;;nju.edu.cn;", "position": "PhD student;;;Associate Professor;", "bibtex": "@inproceedings{\nhe2024sfc,\ntitle={{SFC}: Achieve Accurate Fast Convolution under Low-precision Arithmetic},\nauthor={Liulu He and yufei zhao and rui gao and Yuan Du and Li Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fgBWtOw66T}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2996115, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16324884500485990524&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;;;nju.edu.cn;", "author_num": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "EDISON: Enhanced Dictionary-Induced Tensorized Incomplete Multi-View Clustering with Gaussian Error Rank Minimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33456", "id": "fiugPLSXjK", "proceeding": "https://proceedings.mlr.press/v235/gu24b.html", "pdf": "https://openreview.net/pdf?id=fiugPLSXjK", "openreview": "https://openreview.net/forum?id=fiugPLSXjK", "author_site": "Zhibin Gu, Zhendong Li, Songhe Feng", "tldr": "", "abstract": "This paper presents an efficient and scalable incomplete multi-view clustering method, referred to as Enhanced Dictionary-Induced tenSorized incomplete multi-view clustering with Gaussian errOr raNk minimization (EDISON). Specifically, EDISON employs an enhanced dictionary representation strategy as the foundation for inferring missing data and constructing anchor graphs, ensuring robustness to less-than-ideal data and maintaining high computational efficiency. Additionally, we introduce Gaussian error rank as a concise approximation of the true tensor rank, facilitating a comprehensive exploration of the diverse information encapsulated by various singular values in tensor data. Furthermore, we integrate a hyper-anchor graph Laplacian manifold regularization into the tensor representation, allowing for the simultaneous utilization of inter-view high-order correlations and intra-view local correlations.
Extensive experiments demonstrate the superiority of the EDISON model in both effectiveness and efficiency compared to SOTA methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhibin Gu;Zhendong Li;Songhe Feng", "authorids": "~Zhibin_Gu1;~Zhendong_Li2;~Songhe_Feng1", "gender": "M;M;M", "homepage": "https://guzhibin23.github.io/guzhibin23-github.io/;;http://faculty.bjtu.edu.cn/8407/", "dblp": "236/0821;;92/2415", "google_scholar": "-uBBqdYAAAAJ;;K5lqMYgAAAAJ", "orcid": "0000-0002-1085-9084;0009-0004-2560-6460;0000-0002-5922-9358", "linkedin": ";;", "or_profile": "~Zhibin_Gu1;~Zhendong_Li2;~Songhe_Feng1", "aff": "Beijing Jiaotong University;Beijing Jiaotong University;Beijing Jiaotong University", "aff_domain": "bjtu.edu.cn;bjtu.edu.cn;bjtu.edu.cn", "position": "PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\ngu2024edison,\ntitle={{EDISON}: Enhanced Dictionary-Induced Tensorized Incomplete Multi-View Clustering with Gaussian Error Rank Minimization},\nauthor={Zhibin Gu and Zhendong Li and Songhe Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fiugPLSXjK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1112614, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9885202881597528702&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "bjtu.edu.cn;bjtu.edu.cn;bjtu.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Beijing Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "http://www.njtu.edu.cn/en", "aff_unique_abbr": "BJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Pursuing Overall Welfare in Federated Learning through Sequential Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33455", "id": "foPMkomvk1", "proceeding": "https://proceedings.mlr.press/v235/hahn24a.html", "pdf": "https://openreview.net/pdf?id=foPMkomvk1", "openreview": "https://openreview.net/forum?id=foPMkomvk1", "author_site": "Seok-Ju Hahn, Gi-Soo Kim, Junghye Lee", "tldr": "", "abstract": "In traditional federated learning, a single global model cannot perform equally well for all clients. Therefore, the need to achieve *client-level fairness* in a federated system has been emphasized, which can be realized by modifying the static aggregation scheme for updating the global model to an adaptive one, in response to the local signals of the participating clients. Our work reveals that existing fairness-aware aggregation strategies can be unified into an online convex optimization framework, in other words, a central server's *sequential decision making* process. To enhance the decision making capability, we propose simple and intuitive improvements for suboptimal designs within existing methods, presenting $\\texttt{AAggFF}$. Considering practical requirements, we further tailor our method to the *cross-device* and the *cross-silo* settings, respectively. Theoretical analyses guarantee sublinear regret upper bounds for both settings: $\\mathcal{O}(\\sqrt{T \\log{K}})$ for the cross-device setting, and $\\mathcal{O}(K \\log{T})$ for the cross-silo setting, with $K$ clients and $T$ federation rounds.
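To make the sequential-decision-making view concrete, here is a generic multiplicative-weights sketch of fairness-aware aggregation. It illustrates the framework the abstract describes, not the $\texttt{AAggFF}$ update itself; the function name and learning rate are ours.

```python
# Generic sketch of fairness-aware aggregation as a server-side online
# decision: mixing weights over clients are updated multiplicatively from
# observed local losses. Illustrates the framework above, not AAggFF itself.
import numpy as np

def fair_aggregate(global_params, client_deltas, client_losses, weights, eta=0.1):
    # Upweight clients with larger losses so the next global model serves them better.
    weights = weights * np.exp(eta * np.asarray(client_losses))
    weights = weights / weights.sum()
    update = sum(w * d for w, d in zip(weights, client_deltas))
    return global_params + update, weights
```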
Extensive experiments demonstrate that the federated system equipped with $\\texttt{AAggFF}$ achieves a better degree of client-level fairness than existing methods in both practical settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seok-Ju Hahn;Gi-Soo Kim;Junghye Lee", "authorids": "~Seok-Ju_Hahn1;~Gi-Soo_Kim1;~Junghye_Lee1", "gender": "M;F;F", "homepage": "https://vaseline555.github.io;;https://d3m.snu.ac.kr/", "dblp": "251/3287;203/0732;234/8224", "google_scholar": "AKN5vg8AAAAJ;uW2qIcYAAAAJ;aHTONgIAAAAJ", "orcid": "0000-0003-4688-8802;;0000-0003-3221-1526", "linkedin": "seokju-hahn/;;", "or_profile": "~Seok-Ju_Hahn1;~Gi-Soo_Kim1;~Junghye_Lee1", "aff": "Ulsan National Institute of Science and Technology;Ulsan National Institute of Science and Technology;Seoul National University", "aff_domain": "unist.ac.kr;unist.ac.kr;snu.ac.kr", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhahn2024pursuing,\ntitle={Pursuing Overall Welfare in Federated Learning through Sequential Decision Making},\nauthor={Seok-Ju Hahn and Gi-Soo Kim and Junghye Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=foPMkomvk1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1089660, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hA1JRbYRkIQJ:scholar.google.com/&scioq=Pursuing+Overall+Welfare+in+Federated+Learning+through+Sequential+Decision+Making&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "unist.ac.kr;unist.ac.kr;snu.ac.kr", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Ulsan National Institute of Science and Technology;Seoul National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.unist.ac.kr;https://www.snu.ac.kr", "aff_unique_abbr": "UNIST;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Using AI Uncertainty Quantification to Improve Human Decision-Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33454", "id": "fowZNENcVJ", "proceeding": "https://proceedings.mlr.press/v235/marusich24a.html", "pdf": "https://openreview.net/pdf?id=fowZNENcVJ", "openreview": "https://openreview.net/forum?id=fowZNENcVJ", "author_site": "Laura Marusich, Jonathan Bakdash, Yan Zhou, Murat Kantarcioglu", "tldr": "", "abstract": "AI Uncertainty Quantification (UQ) has the potential to improve human decision-making beyond AI predictions alone by providing additional probabilistic information to users. The majority of past research on AI and human decision-making has concentrated on model explainability and interpretability, with little focus on understanding the potential impact of UQ on human decision-making. We evaluated the impact on human decision-making for instance-level UQ, calibrated using a strict scoring rule, in two online behavioral experiments. In the first experiment, our results showed that UQ was beneficial for decision-making performance compared to only AI predictions. In the second experiment, we found UQ had generalizable benefits for decision-making across a variety of representations for probabilistic information.
These results indicate that implementing high-quality, instance-level UQ for AI may improve decision-making with real systems compared to AI predictions alone.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Laura Marusich;Jonathan Bakdash;Yan Zhou;Murat Kantarcioglu", "authorids": "~Laura_Marusich1;~Jonathan_Bakdash1;~Yan_Zhou2;~Murat_Kantarcioglu1", "gender": ";M;;", "homepage": ";;;https://www.kantarcioglu.net", "dblp": ";31/4218.html;60/5157-1;36/195.html", "google_scholar": ";CIhOJGkAAAAJ;;https://scholar.google.com.tw/citations?user=qXb4xQMAAAAJ", "orcid": "0000-0002-3524-6110;0000-0002-1409-4779;;0000-0001-9795-9063", "linkedin": ";;;kantarcioglu/", "or_profile": "~Laura_Marusich1;~Jonathan_Bakdash1;~Yan_Zhou2;~Murat_Kantarcioglu1", "aff": "US DEVCOM Army Research Laboratory;UT Dallas;University of Texas, Dallas;Harvard University", "aff_domain": "arl.army.mil;cs.utdallas.edu;utdallas.edu;harvard.edu", "position": "Researcher;Researcher;Researcher;Faculty Associate", "bibtex": "@inproceedings{\nmarusich2024using,\ntitle={Using {AI} Uncertainty Quantification to Improve Human Decision-Making},\nauthor={Laura Marusich and Jonathan Bakdash and Yan Zhou and Murat Kantarcioglu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fowZNENcVJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5985206, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=958026395153612609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": "arl.army.mil;cs.utdallas.edu;utdallas.edu;harvard.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "US Army Research Laboratory;University of Texas at Dallas;Harvard University", "aff_unique_dep": "DEVCOM;;", "aff_unique_url": "https://www.arl.army.mil;https://www.utdallas.edu;https://www.harvard.edu", "aff_unique_abbr": "ARL;UT Dallas;Harvard", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Dallas", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33453", "id": "fq0NaiU8Ex", "proceeding": "https://proceedings.mlr.press/v235/yu24p.html", "pdf": "https://openreview.net/pdf?id=fq0NaiU8Ex", "openreview": "https://openreview.net/forum?id=fq0NaiU8Ex", "author_site": "Le Yu, Bowen Yu, Haiyang Yu, Fei Huang, Yongbin Li", "tldr": "", "abstract": "In this paper, we unveil that Language Models (LMs) can acquire new capabilities by assimilating parameters from homologous models without retraining or GPUs. We first introduce DARE to set most delta parameters (i.e., the disparity between fine-tuned and pre-trained parameters) to zero without affecting the abilities of Supervised Fine-Tuning (SFT) LMs, which randomly **D**rops delta parameters with a ratio $p$ **A**nd **RE**scales the remaining ones by $1 / (1 - p)$ to approximate the original embeddings. Then, we use DARE as a versatile plug-in to sparsify delta parameters of multiple SFT homologous models for mitigating parameter interference and merge them into a single model by parameter fusing.
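The drop-and-rescale step is simple enough to state in a few lines; the sketch below, consistent with the description above though not the authors' released code, applies DARE to a single delta tensor.

```python
# Sketch of DARE's drop-and-rescale on a delta-parameter tensor, following
# the description above (illustrative, not the reference implementation).
import torch

def dare(delta: torch.Tensor, p: float) -> torch.Tensor:
    # Keep each delta entry with probability 1 - p, then rescale survivors
    # by 1 / (1 - p) so the update is preserved in expectation.
    mask = torch.bernoulli(torch.full_like(delta, 1.0 - p))
    return delta * mask / (1.0 - p)

# Merging sketch: merged = pretrained + sum of dare(finetuned_k - pretrained, p)
# over the homologous fine-tuned models k.
```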
We experiment with encoder- and decoder-based LMs, showing that: (1) SFT delta parameter value ranges are typically small (within 0.002) with extreme redundancy, and DARE can effortlessly eliminate 90% or even 99% of them; (2) DARE can merge multiple task-specific LMs into one LM with diverse capabilities. Notably, this phenomenon is more pronounced in large-scale LMs, where the merged LM reveals the potential to surpass the performance of any source LM, providing a new discovery. We also utilize DARE to create a merged LM that ranks first among models with 7 billion parameters on the Open LLM Leaderboard.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Le Yu;Bowen Yu;Haiyang Yu;Fei Huang;Yongbin Li", "authorids": "~Le_Yu2;~Bowen_Yu3;~Haiyang_Yu3;~Fei_Huang1;~Yongbin_Li2", "gender": "M;M;M;M;M", "homepage": "https://yule-buaa.github.io/;https://yubowen-ph.github.io/;;https://yongbin-li.github.io/;https://sites.google.com/view/fei-huang", "dblp": "23/7122-4;95/10266-2.html;90/6643-3;;h/FeiHuang.html", "google_scholar": "-h_ehVsAAAAJ;oHoEp34AAAAJ;VhWV-1wAAAAJ;xF5VrokAAAAJ;9r98PpoAAAAJ", "orcid": "0000-0002-4908-3199;0000-0002-6804-1859;;;", "linkedin": ";;;;fei-huang-cas-cmu", "or_profile": "~Le_Yu2;~Bowen_Yu3;~Haiyang_Yu3;~Yongbin_Li2;~Fei_Huang2", "aff": "Beihang University;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group US", "aff_domain": "buaa.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "PhD student;Researcher;Researcher;Researcher;Senior Research Director", "bibtex": "@inproceedings{\nyu2024language,\ntitle={Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch},\nauthor={Le Yu and Bowen Yu and Haiyang Yu and Fei Huang and Yongbin Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fq0NaiU8Ex}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6553079, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 252, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1085930300417345788&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "buaa.edu.cn;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Beihang University;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.alibaba.com", "aff_unique_abbr": "BUAA;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Encodings for Prediction-based Neural Architecture Search", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33452", "id": "fqPH6ejwGi", "proceeding": "https://proceedings.mlr.press/v235/akhauri24a.html", "pdf": "https://openreview.net/pdf?id=fqPH6ejwGi", "openreview": "https://openreview.net/forum?id=fqPH6ejwGi", "author_site": "Yash Akhauri, Mohamed Abdelfattah", "tldr": "", "abstract": "Predictor-based methods have substantially enhanced Neural Architecture Search (NAS) optimization. The efficacy of these predictors is largely influenced by the method of encoding neural network architectures. 
While traditional encodings used an adjacency matrix describing the graph structure of a neural network, novel encodings embrace a variety of approaches from unsupervised pretraining of latent representations to vectors of zero-cost proxies. In this paper, we categorize and investigate neural encodings into three main types: structural, learned, and score-based. Furthermore, we extend these encodings and introduce *unified encodings*, which extend NAS predictors to multiple search spaces. Our analysis draws from experiments conducted on over 1.5 million neural network architectures on NAS spaces such as NASBench-101 (NB101), NB201, NB301, Network Design Spaces (NDS), and TransNASBench-101. Building on our study, we present our predictor **FLAN**: **Fl**ow **A**ttention for **N**AS. FLAN integrates critical insights on predictor design, transfer learning, and *unified encodings* to enable more than an order of magnitude cost reduction for training NAS accuracy predictors. Our implementation and encodings for all neural networks are open-sourced at https://github.com/abdelfattah-lab/flan_nas.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yash Akhauri;Mohamed S Abdelfattah", "authorids": "~Yash_Akhauri1;~Mohamed_S_Abdelfattah1", "gender": "M;M", "homepage": ";https://mohsaied.github.io/", "dblp": "241/9414;124/7095", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.ca/citations?user=q4wBpWAAAAAJ", "orcid": ";", "linkedin": ";mabdelfattah/", "or_profile": "~Yash_Akhauri1;~Mohamed_S_Abdelfattah1", "aff": "Cornell University;Cornell University", "aff_domain": "cornell.edu;cornell.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nakhauri2024encodings,\ntitle={Encodings for Prediction-based Neural Architecture Search},\nauthor={Yash Akhauri and Mohamed S Abdelfattah},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fqPH6ejwGi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4613673, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3146587467407181773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cornell.edu;cornell.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Differentially Private Bias-Term Fine-tuning of Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33451", "id": "fqeANcjBMT", "proceeding": "https://proceedings.mlr.press/v235/bu24c.html", "pdf": "https://openreview.net/pdf?id=fqeANcjBMT", "openreview": "https://openreview.net/forum?id=fqeANcjBMT", "author_site": "Zhiqi Bu, Yu-Xiang Wang, Sheng Zha, George Karypis", "tldr": "", "abstract": "We study the problem of differentially private (DP) fine-tuning of large pre-trained models \u2014 a recent privacy-preserving approach suitable for solving downstream tasks with sensitive data. Existing work has demonstrated that high accuracy is possible under strong privacy constraints, yet requires significant computational overhead or modifications to the network architecture.
We propose differentially private bias-term fine-tuning (DP-BiTFiT), which matches the state-of-the-art accuracy for DP algorithms and the efficiency of the standard BiTFiT. DP-BiTFiT is model agnostic (not modifying the network architecture), parameter efficient (only training about 0.1% of the parameters), and computation efficient (almost removing the overhead caused by DP, in both the time and space complexity). On a wide range of tasks, DP-BiTFiT is 2 - 30X faster and uses 2 - 8X less memory than DP full fine-tuning, even faster than the standard full fine-tuning. This amazing efficiency enables us to conduct DP fine-tuning on language and vision tasks with long-sequence texts and high-resolution images, which were computationally difficult using existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiqi Bu;Yu-Xiang Wang;Sheng Zha;George Karypis", "authorids": "~Zhiqi_Bu1;~Yu-Xiang_Wang1;~Sheng_Zha1;~George_Karypis1", "gender": "M;;M;M", "homepage": "https://sites.google.com/view/zhiqi-bu;http://www.cs.ucsb.edu/~yuxiangw/publications.html;https://github.com/szha;", "dblp": "245/2573;62/1637-3.html;218/5471;", "google_scholar": "MEvTLxIAAAAJ;HGNZ1fkAAAAJ;;ElqwScwAAAAJ", "orcid": ";;;", "linkedin": ";;shengzha/;", "or_profile": "~Zhiqi_Bu1;~Yu-Xiang_Wang1;~Sheng_Zha1;~George_Karypis1", "aff": "Amazon;UC Santa Barbara;Amazon;University of Minnesota, Minneapolis", "aff_domain": "amazon.com;ucsb.edu;amazon.com;umn.edu", "position": "Researcher;Assistant Professor;Researcher;Full Professor", "bibtex": "@inproceedings{\nbu2024differentially,\ntitle={Differentially Private Bias-Term Fine-tuning of Foundation Models},\nauthor={Zhiqi Bu and Yu-Xiang Wang and Sheng Zha and George Karypis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fqeANcjBMT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 675623, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10076035367100406984&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 10, "email": "amazon.com;ucsb.edu;amazon.com;umn.edu", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Amazon;University of California, Santa Barbara;University of Minnesota", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.ucsb.edu;https://www.minnesota.edu", "aff_unique_abbr": "Amazon;UCSB;UMN", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Santa Barbara;Minneapolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Probabilistic Inference in Language Models via Twisted Sequential Monte Carlo", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33450", "id": "frA0NNBS1n", "proceeding": "https://proceedings.mlr.press/v235/zhao24c.html", "pdf": "https://openreview.net/pdf?id=frA0NNBS1n", "openreview": "https://openreview.net/forum?id=frA0NNBS1n", "author_site": "Stephen Zhao, Rob Brekelmans, Alireza Makhzani, Roger Grosse", "tldr": "", "abstract": "Numerous capability and safety techniques of Large Language Models (LLMs), including RLHF, automated red-teaming, prompt engineering, and infilling, can be cast as sampling from an unnormalized target distribution defined by a given reward or potential function over the full sequence. 
In this work, we leverage the rich toolkit of Sequential Monte Carlo (SMC) for these probabilistic inference problems. In particular, we use learned twist functions to estimate the expected future value of the potential at each timestep, which enables us to focus inference-time computation on promising partial sequences. We propose a novel contrastive method for learning the twist functions, and establish connections with the rich literature of soft reinforcement learning. As a complementary application of our twisted SMC framework, we present methods for evaluating the accuracy of language model inference techniques using novel bidirectional SMC bounds on the log partition function. These bounds can be used to estimate the KL divergence between the inference and target distributions in both directions. We apply our inference evaluation techniques to show that twisted SMC is effective for sampling undesirable outputs from a pretrained model (a useful component of harmlessness training and automated red-teaming), generating reviews with varied sentiment, and performing infilling tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stephen Zhao;Rob Brekelmans;Alireza Makhzani;Roger Baker Grosse", "authorids": "~Stephen_Zhao1;~Rob_Brekelmans1;~Alireza_Makhzani1;~Roger_Baker_Grosse1", "gender": ";M;;M", "homepage": ";https://brekelma.github.io;http://www.alireza.ai/;http://www.cs.toronto.edu/~rgrosse/", "dblp": "269/9998.html;207/7856.html;122/5126.html;26/7058", "google_scholar": "2SXFnzQAAAAJ;M6ADg_UAAAAJ;B0KVWJEAAAAJ;xgQd1qgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Stephen_Zhao1;~Rob_Brekelmans1;~Alireza_Makhzani1;~Roger_Baker_Grosse1", "aff": "Department of Computer Science, University of Toronto;;Vector Institute;Vector Institute", "aff_domain": "cs.toronto.edu;;vectorinstitute.ai;vectorinstitute.ai", "position": "PhD student;;Researcher;Faculty Member", "bibtex": "@inproceedings{\nzhao2024probabilistic,\ntitle={Probabilistic Inference in Language Models via Twisted Sequential Monte Carlo},\nauthor={Stephen Zhao and Rob Brekelmans and Alireza Makhzani and Roger Baker Grosse},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=frA0NNBS1n}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 992463, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14930072017296319250&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "cs.toronto.edu;;vectorinstitute.ai;vectorinstitute.ai", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Toronto;Vector Institute", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "U of T;Vector Institute", "aff_campus_unique_index": "0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "On Mechanistic Knowledge Localization in Text-to-Image Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33449", "id": "fsVBsxjRER", "proceeding": "https://proceedings.mlr.press/v235/basu24b.html", "pdf": "https://openreview.net/pdf?id=fsVBsxjRER", "openreview": "https://openreview.net/forum?id=fsVBsxjRER", "author_site": "Samyadeep Basu, Keivan Rezaei, Priyatham Kattakinda, Vlad Morariu, Nanxuan Zhao, Ryan A 
Rossi, Varun Manjunatha, Soheil Feizi", "tldr": "", "abstract": "Identifying layers within text-to-image models which control visual attributes can facilitate efficient model editing through closed-form updates. Recent work leveraging causal tracing shows that early Stable-Diffusion variants confine knowledge primarily to the first layer of the CLIP text-encoder, while it diffuses throughout the UNet. Extending this framework, we observe that for recent models (e.g., SD-XL, DeepFloyd), causal tracing fails to pinpoint localized knowledge, highlighting challenges in model editing. To address this issue, we introduce the concept of mechanistic localization in text-to-image models, where knowledge about various visual attributes (e.g., \"style\", \"objects\", \"facts\") can be mechanistically localized to a small fraction of layers in the UNet, thus facilitating efficient model editing. We localize knowledge using our method LocoGen which measures the direct effect of intermediate layers to output generation by performing interventions in the cross-attention layers of the UNet. We then employ LocoEdit, a fast closed-form editing method across popular open-source text-to-image models (including the latest SD-XL) and explore the possibilities of neuron-level model editing. Using mechanistic localization, our work offers a better view of successes and failures in localization-based text-to-image model editing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samyadeep Basu;Keivan Rezaei;Priyatham Kattakinda;Vlad I Morariu;Nanxuan Zhao;Ryan A. Rossi;Varun Manjunatha;Soheil Feizi", "authorids": "~Samyadeep_Basu1;~Keivan_Rezaei1;~Priyatham_Kattakinda1;~Vlad_I_Morariu1;~Nanxuan_Zhao1;~Ryan_A._Rossi2;~Varun_Manjunatha1;~Soheil_Feizi2", "gender": "M;M;M;M;F;M;M;M", "homepage": "https://samyadeepbasu.github.io/;https://k1rezaei.github.io;https://priyathamkat.com/;https://research.adobe.com/person/vlad-morariu/;http://nxzhao.com;https://research.adobe.com/person/varun-manjunatha/;https://www.cs.umd.edu/~sfeizi/;http://ryanrossi.com", "dblp": "250/9138;339/7254;;27/6671;224/0709;https://dblp.org/pers/m/Manjunatha:Varun.html;57/2132;17/5085", "google_scholar": "6aRwDecAAAAJ;NsJKrKIAAAAJ;D9ebp-YAAAAJ;oyWpVa8AAAAJ;;nO-We6sAAAAJ;lptAmrMAAAAJ;_Dc6lbQAAAAJ", "orcid": ";;;;;;;0000-0001-9758-0635", "linkedin": ";keivan-rezaei-1b434680/;priyathamkat/;;;;;", "or_profile": "~Samyadeep_Basu1;~Keivan_Rezaei1;~Priyatham_Kattakinda1;~Vlad_I_Morariu1;~Nanxuan_Zhao1;~Varun_Manjunatha1;~Soheil_Feizi2;~Ryan_Rossi1", "aff": "Adobe Systems;University of Maryland, College Park;University of Maryland, College Park;Adobe;Adobe Research;Adobe Systems;University of Maryland, College Park;Adobe Research", "aff_domain": "adobe.com;umd.edu;umd.edu;adobe.com;adobe.com;adobe.com;umd.edu;adobe.com", "position": "Intern;PhD student;PhD student;Senior Research Scientist;Researcher;Research Scientist;Associate Professor;Senior Research Scientist", "bibtex": "@inproceedings{\nbasu2024on,\ntitle={On Mechanistic Knowledge Localization in Text-to-Image Generative Models},\nauthor={Samyadeep Basu and Keivan Rezaei and Priyatham Kattakinda and Vlad I Morariu and Nanxuan Zhao and Ryan A.
Rossi and Varun Manjunatha and Soheil Feizi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fsVBsxjRER}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5380545, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16326097257647319763&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "adobe.com;umd.edu;umd.edu;adobe.com;adobe.com;adobe.com;umd.edu;adobe.com", "author_num": 8, "aff_unique_index": "0;1;1;0;0;0;1;0", "aff_unique_norm": "Adobe;University of Maryland", "aff_unique_dep": "Adobe Systems Incorporated;", "aff_unique_url": "https://www.adobe.com;https://www.umd.edu", "aff_unique_abbr": "Adobe;UMD", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Universally Optimal Algorithms for A/B Testing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33448", "id": "ft5jK9uPgC", "proceeding": "https://proceedings.mlr.press/v235/wang24c.html", "pdf": "https://openreview.net/pdf?id=ft5jK9uPgC", "openreview": "https://openreview.net/forum?id=ft5jK9uPgC", "author_site": "Po-An Wang, Kaito Ariu, Alexandre Proutiere", "tldr": "", "abstract": "We study the problem of best-arm identification with fixed budget in stochastic multi-armed bandits with Bernoulli rewards. For the problem with two arms, also known as the A/B testing problem, we prove that there is no algorithm that (i) performs as well as the algorithm sampling each arm equally (referred to as the *uniform sampling* algorithm) in all instances, and that (ii) strictly outperforms uniform sampling on at least one instance. In short, there is no algorithm better than the uniform sampling algorithm. To establish this result, we first introduce the natural class of *consistent* and *stable* algorithms, and show that any algorithm that performs as well as the uniform sampling algorithm in all instances belongs to this class. The proof then proceeds by deriving a lower bound on the error rate satisfied by any consistent and stable algorithm, and by showing that the uniform sampling algorithm matches this lower bound. Our results provide a solution to the two open problems presented in (Qin, 2022). For the general problem with more than two arms, we provide a first set of results.
We characterize the asymptotic error rate of the celebrated Successive Rejects (SR) algorithm (Audibert et al., 2010) and show that, surprisingly, the uniform sampling algorithm outperforms the SR algorithm in some instances.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Po-An Wang;Kaito Ariu;Alexandre Proutiere", "authorids": "~Po-An_Wang1;~Kaito_Ariu1;~Alexandre_Proutiere1", "gender": ";M;M", "homepage": ";https://researchmap.jp/ariu?lang=en;https://people.kth.se/~alepro/", "dblp": "203/4451;229/7578;p/AlexandreProutiere", "google_scholar": "https://scholar.google.com.tw/citations?user=kzXIxFYAAAAJ;https://scholar.google.co.jp/citations?user=4zXjxhsAAAAJ;g5sya5cAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Po-An_Wang1;~Kaito_Ariu1;~Alexandre_Proutiere1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;CyberAgent, Inc.;KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se;cyberagent.co.jp;kth.se", "position": "PhD student;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nwang2024on,\ntitle={On Universally Optimal Algorithms for A/B Testing},\nauthor={Po-An Wang and Kaito Ariu and Alexandre Proutiere},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ft5jK9uPgC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1033459, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17903137174625707289&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 9, "email": "kth.se;cyberagent.co.jp;kth.se", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "KTH Royal Institute of Technology;CyberAgent", "aff_unique_dep": ";", "aff_unique_url": "https://www.kth.se;https://www.cyberagent.co.jp", "aff_unique_abbr": "KTH;CyberAgent", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Sweden;Japan" }, { "title": "SLEB: Streamlining LLMs through Redundancy Verification and Elimination of Transformer Blocks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33447", "id": "fuX4hyLPmO", "proceeding": "https://proceedings.mlr.press/v235/song24f.html", "pdf": "https://openreview.net/pdf?id=fuX4hyLPmO", "openreview": "https://openreview.net/forum?id=fuX4hyLPmO", "author_site": "Jiwon Song, Kyungseok Oh, Taesu Kim, Hyungjun Kim, Yulhwa Kim, jae-joon kim", "tldr": "", "abstract": "Large language models (LLMs) have proven to be highly effective across various natural language processing tasks. However, their large number of parameters poses significant challenges for practical deployment. Pruning, a technique aimed at reducing the size and complexity of LLMs, offers a potential solution by removing redundant components from the network. Despite the promise of pruning, existing methods often struggle to achieve substantial end-to-end LLM inference speedup. In this paper, we introduce SLEB, a novel approach designed to streamline LLMs by eliminating redundant transformer blocks. We choose the transformer block as the fundamental unit for pruning, because LLMs exhibit block-level redundancy with high similarity between the outputs of neighboring blocks. This choice allows us to effectively enhance the processing speed of LLMs.
Our experimental results demonstrate that SLEB outperforms previous LLM pruning methods in accelerating LLM inference while also maintaining superior perplexity and accuracy, making SLEB a promising technique for enhancing the efficiency of LLMs. The code is available at: https://github.com/jiwonsong-dev/SLEB.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiwon Song;Kyungseok Oh;Taesu Kim;Hyungjun Kim;Yulhwa Kim;jae-joon kim", "authorids": "~Jiwon_Song1;~Kyungseok_Oh1;~Taesu_Kim1;~Hyungjun_Kim2;~Yulhwa_Kim1;~jae-joon_kim1", "gender": "M;M;M;;;M", "homepage": ";;;;https://eic.skku.edu/;http://vlsi.snu.ac.kr", "dblp": ";;44/6997;;223/9434;", "google_scholar": "https://scholar.google.co.kr/citations?user=ysclwTEAAAAJ;;zzII2gsAAAAJ;pX2macYAAAAJ;VRkM404AAAAJ;Ee994T0AAAAJ", "orcid": ";;;0000-0001-8403-1557;0000-0003-3735-821X;", "linkedin": "jiwon-song-28b9b6184/;%EC%98%A4%EA%B2%BD%EC%84%9D-9b07612a0/;;;;", "or_profile": "~Jiwon_Song1;~Kyungseok_Oh1;~Taesu_Kim1;~Hyungjun_Kim2;~Yulhwa_Kim1;~jae-joon_kim1", "aff": "Seoul National University;Seoul National University;SqueezeBits Inc.;SqueezeBits Inc.;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;squeezebits.com;squeezebits.com;snu.ac.kr;snu.ac.kr", "position": "Undergrad student;PhD student;Researcher;CEO;Researcher;Full Professor", "bibtex": "@inproceedings{\nsong2024sleb,\ntitle={{SLEB}: Streamlining {LLM}s through Redundancy Verification and Elimination of Transformer Blocks},\nauthor={Jiwon Song and Kyungseok Oh and Taesu Kim and Hyungjun Kim and Yulhwa Kim and jae-joon kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fuX4hyLPmO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1957176, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=380348016741596514&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "snu.ac.kr;snu.ac.kr;squeezebits.com;squeezebits.com;snu.ac.kr;snu.ac.kr", "author_num": 6, "aff_unique_index": "0;0;1;1;0;0", "aff_unique_norm": "Seoul National University;SqueezeBits Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;", "aff_unique_abbr": "SNU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Inherent Trade-Offs between Diversity and Stability in Multi-Task Benchmarks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33446", "id": "fwxnHViGNj", "proceeding": "https://proceedings.mlr.press/v235/zhang24u.html", "pdf": "https://openreview.net/pdf?id=fwxnHViGNj", "openreview": "https://openreview.net/forum?id=fwxnHViGNj", "author_site": "Guanhua Zhang, Moritz Hardt", "tldr": "", "abstract": "We examine multi-task benchmarks in machine learning through the lens of social choice theory. We draw an analogy between benchmarks and electoral systems, where models are candidates and tasks are voters. This suggests a distinction between cardinal and ordinal benchmark systems. The former aggregate numerical scores into one model ranking; the latter aggregate rankings for each task. We apply Arrow's impossibility theorem to ordinal benchmarks to highlight the inherent limitations of ordinal systems, particularly their sensitivity to the inclusion of irrelevant models.
Inspired by Arrow's theorem, we empirically demonstrate a strong trade-off between diversity and sensitivity to irrelevant changes in existing multi-task benchmarks. Our result is based on new quantitative measures of diversity and sensitivity that we introduce. Sensitivity quantifies the impact that irrelevant changes to tasks have on a benchmark. Diversity captures the degree of disagreement in model rankings across tasks. We develop efficient approximation algorithms for both measures, as exact computation is computationally challenging. Through extensive experiments on seven cardinal benchmarks and eleven ordinal benchmarks, we demonstrate a clear trade-off between diversity and stability: The more diverse a multi-task benchmark, the more sensitive to trivial changes it is. Additionally, we show that the aggregated rankings of existing benchmarks are highly unstable under irrelevant changes. The codes and data are available at https://socialfoundations.github.io/benchbench/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guanhua Zhang;Moritz Hardt", "authorids": "~Guanhua_Zhang1;~Moritz_Hardt1", "gender": ";Not Specified", "homepage": ";http://mrtz.org/", "dblp": "171/0962.html;26/4683", "google_scholar": "_hrEN-sAAAAJ;adnTgaAAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Guanhua_Zhang1;~Moritz_Hardt1", "aff": "Max Planck Institute for Intelligent Systems, Max-Planck Institute;Max-Planck-Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "tuebingen.mpg.de;is.mpg.de", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\nzhang2024inherent,\ntitle={Inherent Trade-Offs between Diversity and Stability in Multi-Task Benchmarks},\nauthor={Guanhua Zhang and Moritz Hardt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fwxnHViGNj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 603444, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=750747885369528566&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "tuebingen.mpg.de;is.mpg.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Max-Planck-Institute for Intelligent Systems", "aff_unique_dep": "Intelligent Systems;Intelligent Systems", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.mpi-is.mpg.de", "aff_unique_abbr": "MPI-IS;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Feature Importance Disparities for Data Bias Investigations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33445", "id": "fywWm06IGn", "proceeding": "https://proceedings.mlr.press/v235/chang24a.html", "pdf": "https://openreview.net/pdf?id=fywWm06IGn", "openreview": "https://openreview.net/forum?id=fywWm06IGn", "author_site": "Peter Chang, Leor Fishman, Seth Neel", "tldr": "", "abstract": "It is widely held that one cause of downstream bias in classifiers is bias present in the training data. Rectifying such biases may involve context-dependent interventions such as training separate models on subgroups, removing features with bias in the collection process, or even conducting real-world experiments to ascertain sources of bias. 
Despite the need for such data bias investigations, few automated methods exist to assist practitioners in these efforts. In this paper, we present one such method that, given a dataset $X$ consisting of protected and unprotected features, outcomes $y$, and a regressor $h$ that predicts $y$ given $X$, outputs a tuple $(f_j, g)$, with the following property: $g$ corresponds to a subset of the training dataset $(X, y)$, such that the $j^{th}$ feature $f_j$ has much larger (or smaller) *influence* in the subgroup $g$ than on the dataset overall, which we call *feature importance disparity* (FID). We show across $4$ datasets and $4$ common feature importance methods of broad interest to the machine learning community that we can efficiently find subgroups with large FID values even over exponentially large subgroup classes, and in practice these groups correspond to subgroups with potentially serious bias issues as measured by standard fairness metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peter W Chang;Leor Fishman;Seth Neel", "authorids": "~Peter_W_Chang1;~Leor_Fishman1;~Seth_Neel2", "gender": "M;M;M", "homepage": ";;https://sethneel.com", "dblp": ";;188/6406", "google_scholar": ";;", "orcid": "0000-0003-3971-2630;;", "linkedin": "peter-chang-31718baa/;leor-f-63490785/;", "or_profile": "~Peter_W_Chang1;~Leor_Fishman1;~Seth_Neel1", "aff": "Harvard University;;Harvard University", "aff_domain": "harvard.edu;;harvard.edu", "position": "Researcher;;Assistant Professor", "bibtex": "@inproceedings{\nchang2024feature,\ntitle={Feature Importance Disparities for Data Bias Investigations},\nauthor={Peter W Chang and Leor Fishman and Seth Neel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fywWm06IGn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2312168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17237708404702624885&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "harvard.edu;;harvard.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "MOKD: Cross-domain Finetuning for Few-shot Classification via Maximizing Optimized Kernel Dependence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33444", "id": "fz9PaJNViP", "proceeding": "https://proceedings.mlr.press/v235/tian24b.html", "pdf": "https://openreview.net/pdf?id=fz9PaJNViP", "openreview": "https://openreview.net/forum?id=fz9PaJNViP", "author_site": "Hongduan Tian, Feng Liu, Tongliang Liu, Bo Du, Yiu Ming Cheung, Bo Han", "tldr": "", "abstract": "In cross-domain few-shot classification, the _nearest centroid classifier_ (NCC) aims to learn representations to construct a metric space where few-shot classification can be performed by measuring the similarities between samples and the prototype of each class. An intuition behind NCC is that each sample is pulled closer to the class centroid it belongs to while pushed away from those of other classes. However, in this paper, we find that there exist high similarities between NCC-learned representations of two samples from different classes.
In order to address this problem, we propose a bi-level optimization framework, _maximizing optimized kernel dependence_ (MOKD) to learn a set of class-specific representations that match the cluster structures indicated by labeled data of the given task. Specifically, MOKD first optimizes the kernel adopted in *Hilbert-Schmidt independence criterion* (HSIC) to obtain the optimized kernel HSIC (opt-HSIC) that can capture the dependence more precisely. Then, an optimization problem regarding the opt-HSIC is addressed to simultaneously maximize the dependence between representations and labels and minimize the dependence among all samples. Extensive experiments on Meta-Dataset demonstrate that MOKD can not only achieve better generalization performance on unseen domains in most cases but also learn better data representation clusters. The project repository of MOKD is available at: [https://github.com/tmlr-group/MOKD](https://github.com/tmlr-group/MOKD).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongduan Tian;Feng Liu;Tongliang Liu;Bo Du;Yiu-ming Cheung;Bo Han", "authorids": "~Hongduan_Tian1;~Feng_Liu2;~Tongliang_Liu1;~Bo_Du3;~Yiu-ming_Cheung1;~Bo_Han1", "gender": "M;M;M;;;", "homepage": "https://hongduantian.github.io/;https://fengliu90.github.io/index.html;https://tongliang-liu.github.io/;;;", "dblp": "270/0676;77/1318-3;150/6667;;;", "google_scholar": "07lUB9kAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;;;", "orcid": ";0000-0002-5005-9129;;;;", "linkedin": ";alexfengliu;;;;", "or_profile": "~Hongduan_Tian1;~Feng_Liu2;~Tongliang_Liu1;~Bo_Du3;~Yiu-ming_Cheung1;~Bo_Han1", "aff": "Hong Kong Baptist University;University of Melbourne;Mohamed bin Zayed University of Artificial Intelligence;;;", "aff_domain": "hkbu.edu.hk;unimelb.edu.au;mbzuai.ac.ae;;;", "position": "PhD student;Assistant Professor;Affiliated Associate Professor;;;", "bibtex": "@inproceedings{\ntian2024mokd,\ntitle={{MOKD}: Cross-domain Finetuning for Few-shot Classification via Maximizing Optimized Kernel Dependence},\nauthor={Hongduan Tian and Feng Liu and Tongliang Liu and Bo Du and Yiu-ming Cheung and Bo Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fz9PaJNViP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4229885, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7093572113554961132&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "hkbu.edu.hk;unimelb.edu.au;mbzuai.ac.ae;;;", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Hong Kong Baptist University;University of Melbourne;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.unimelb.edu.au;https://mbzuai.ac.ae", "aff_unique_abbr": "HKBU;UniMelb;MBZUAI", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;Australia;United Arab Emirates" }, { "title": "Latent variable model for high-dimensional point process with structured missingness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33443", "id": "g1Gf0hoPSz", "proceeding": "https://proceedings.mlr.press/v235/sinelnikov24a.html", "pdf": "https://openreview.net/pdf?id=g1Gf0hoPSz", "openreview": 
"https://openreview.net/forum?id=g1Gf0hoPSz", "author_site": "Maksim Sinelnikov, Manuel Haussmann, Harri L\u00e4hdesm\u00e4ki", "tldr": "", "abstract": "Longitudinal data are important in numerous fields, such as healthcare, sociology and seismology, but real-world datasets present notable challenges for practitioners because they can be high-dimensional, contain structured missingness patterns, and measurement time points can be governed by an unknown stochastic process. While various solutions have been suggested, the majority of them have been designed to account for only one of these challenges. In this work, we propose a flexible and efficient latent-variable model that is capable of addressing all these limitations. Our approach utilizes Gaussian processes to capture correlations between samples and their associated missingness masks as well as to model the underlying point process. We construct our model as a variational autoencoder together with deep neural network parameterised decoder and encoder models, and develop a scalable amortised variational inference approach for efficient model training. We demonstrate competitive performance using both simulated and real datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maksim Sinelnikov;Manuel Haussmann;Harri L\u00e4hdesm\u00e4ki", "authorids": "~Maksim_Sinelnikov2;~Manuel_Haussmann1;~Harri_L\u00e4hdesm\u00e4ki1", "gender": "M;;M", "homepage": ";https://manuelhaussmann.github.io/;https://research.cs.aalto.fi/csb/", "dblp": ";198/2433;85/4466", "google_scholar": "_D3TA-0AAAAJ;https://scholar.google.com/citations?hl=de;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "maksim-sinelnikov-24253b223/;;", "or_profile": "~Maksim_Sinelnikov2;~Manuel_Haussmann1;~Harri_L\u00e4hdesm\u00e4ki1", "aff": "Aalto University;University of Southern Denmark - SDU;Aalto University", "aff_domain": "aalto.fi;sdu.dk;aalto.fi", "position": "PhD student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\nsinelnikov2024latent,\ntitle={Latent variable model for high-dimensional point process with structured missingness},\nauthor={Maksim Sinelnikov and Manuel Haussmann and Harri L{\\\"a}hdesm{\\\"a}ki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=g1Gf0hoPSz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 897022, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7991949639519325672&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "email": "aalto.fi;sdu.dk;aalto.fi", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Aalto University;University of Southern Denmark", "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;https://www.sdu.dk", "aff_unique_abbr": "Aalto;SDU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Finland;Denmark" }, { "title": "Momentum for the Win: Collaborative Federated Reinforcement Learning across Heterogeneous Environments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33442", "id": "g43yUNWX4V", "proceeding": "https://proceedings.mlr.press/v235/wang24v.html", "pdf": "https://openreview.net/pdf?id=g43yUNWX4V", "openreview": "https://openreview.net/forum?id=g43yUNWX4V", "author_site": "Han Wang, Sihong He, Zhili Zhang, Fei Miao, James Anderson", 
"tldr": "", "abstract": "We explore a Federated Reinforcement Learning (FRL) problem where $N$ agents collaboratively learn a common policy without sharing their trajectory data. To date, existing FRL work has primarily focused on agents operating in the same or ``similar\" environments. In contrast, our problem setup allows for arbitrarily large levels of environment heterogeneity. To obtain the optimal policy which maximizes the average performance across all *potentially completely different* environments, we propose two algorithms: FedSVRPG-M and FedHAPG-M. In contrast to existing results, we demonstrate that both FedSVRPG-M and FedHAPG-M, both of which leverage momentum mechanisms, can exactly converge to a stationary point of the average performance function, regardless of the magnitude of environment heterogeneity. Furthermore, by incorporating the benefits of variance-reduction techniques or Hessian approximation, both algorithms achieve state-of-the-art convergence results, characterized by a sample complexity of $\\mathcal{O}\\left(\\epsilon^{-\\frac{3}{2}}/N\\right)$. Notably, our algorithms enjoy linear convergence speedups with respect to the number of agents, highlighting the benefit of collaboration among agents in finding a common policy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Han Wang;Sihong He;Zhili Zhang;Fei Miao;James Anderson", "authorids": "~Han_Wang14;~Sihong_He1;~Zhili_Zhang5;~Fei_Miao1;~James_Anderson6", "gender": ";F;M;F;", "homepage": "https://sites.google.com/view/han-wang/home;https://sihonghe.com/;;http://www.feimiao.org;http://www.columbia.edu/~ja3451/", "dblp": ";237/6086;;143/6002;", "google_scholar": "ALzWbZQAAAAJ;-zSd9V0AAAAJ;vu0InQcAAAAJ;fH2YF6YAAAAJ;https://scholar.google.co.uk/citations?user=rIX6oiMAAAAJ", "orcid": ";;;0000-0003-0066-4379;0000-0001-8210-6527", "linkedin": ";;;fei-miao-76964727/;", "or_profile": "~Han_Wang14;~Sihong_He1;~Zhili_Zhang5;~Fei_Miao1;~James_Anderson6", "aff": "Columbia University;University of Connecticut;University of Connecticut;University of Connecticut;Columbia University", "aff_domain": "columbia.edu;uconn.edu;uconn.edu;uconn.edu;columbia.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024momentum,\ntitle={Momentum for the Win: Collaborative Federated Reinforcement Learning across Heterogeneous Environments},\nauthor={Han Wang and Sihong He and Zhili Zhang and Fei Miao and James Anderson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=g43yUNWX4V}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2886417, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15806312712679287782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "columbia.edu;uconn.edu;uconn.edu;uconn.edu;columbia.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Columbia University;University of Connecticut", "aff_unique_dep": ";", "aff_unique_url": "https://www.columbia.edu;https://www.uconn.edu", "aff_unique_abbr": "Columbia;UConn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning to Predict Mutational Effects of Protein-Protein Interactions by Microenvironment-aware Hierarchical Prompt Learning", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33441", "id": "g89jAdrnAF", "proceeding": "https://proceedings.mlr.press/v235/wu24y.html", "pdf": "https://openreview.net/pdf?id=g89jAdrnAF", "openreview": "https://openreview.net/forum?id=g89jAdrnAF", "author_site": "Lirong Wu, Yijun Tian, Haitao Lin, Yufei Huang, Siyuan Li, Nitesh Chawla, Stan Z Li", "tldr": "", "abstract": "Protein-protein bindings play a key role in a variety of fundamental biological processes, and thus predicting the effects of amino acid mutations on protein-protein binding is crucial. To tackle the scarcity of annotated mutation data, pre-training with massive unlabeled data has emerged as a promising solution. However, this process faces a series of challenges: (1) complex higher-order dependencies among multiple (more than paired) structural scales have not yet been fully captured; (2) it is rarely explored how mutations alter the local conformation of the surrounding microenvironment; (3) pre-training is costly, both in data size and computational burden. In this paper, we first construct a hierarchical prompt codebook to record common microenvironmental patterns at different structural scales independently. Then, we develop a novel codebook pre-training task, namely masked microenvironment modeling, to model the joint distribution of each mutation with their residue types, angular statistics, and local conformational changes in the microenvironment. With the constructed prompt codebook, we encode the microenvironment around each mutation into multiple hierarchical prompts and combine them to flexibly provide information to wild-type and mutated protein complexes about their microenvironmental differences. Such a hierarchical prompt learning framework has demonstrated superior performance and training efficiency over state-of-the-art pre-training-based methods in mutation effect prediction and a case study of optimizing human antibodies against SARS-CoV-2.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lirong Wu;Yijun Tian;Haitao Lin;Yufei Huang;Siyuan Li;Nitesh V Chawla;Stan Z. 
Li", "authorids": "~Lirong_Wu1;~Yijun_Tian1;~Haitao_Lin2;~Yufei_Huang4;~Siyuan_Li6;~Nitesh_V_Chawla1;~Stan_Z._Li2", "gender": ";;M;M;M;M;M", "homepage": ";https://www.yijuntian.com/;;https://2021.igem.org/Team:ZJU-China;https://lupin1998.github.io/;http://niteshchawla.nd.edu;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "15/10330;234/9123-1;34/1040;68/1946-2;63/9705-2;c/NiteshVChawla.html;l/StanZLi", "google_scholar": "Tk7TrCoAAAAJ;dbaBgV0AAAAJ;o5A23qIAAAAJ;qmTjdwIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;hDLBEhkAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0003-2795-6080;;0009-0007-8184-4529;0000-0001-6806-2468;;", "linkedin": ";yijun-tian/;;;https://www.linkedin.cn/incareer/in/siyuan-li-lupin1998/;;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Lirong_Wu1;~Yijun_Tian1;~Haitao_Lin2;~Yufei_Huang4;~Siyuan_Li6;~Nitesh_Chawla1;~Stan_Z._Li1", "aff": "Westlake University;University of Notre Dame;Westlake University;Zhejiang University;Alibaba Group;University of Notre Dame;Westlake University", "aff_domain": "westlake.edu.cn;nd.edu;westlake.edu.cn;zju.edu.cn;alibaba-inc.com;nd.edu;westlake.edu.cn", "position": "PhD student;PhD student;PhD student;PhD student;Intern;Full Professor;Chair Professor", "bibtex": "@inproceedings{\nwu2024learning,\ntitle={Learning to Predict Mutational Effects of Protein-Protein Interactions by Microenvironment-aware Hierarchical Prompt Learning},\nauthor={Lirong Wu and Yijun Tian and Haitao Lin and Yufei Huang and Siyuan Li and Nitesh V Chawla and Stan Z. Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=g89jAdrnAF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1529866, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5251433062078905938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "westlake.edu.cn;nd.edu;westlake.edu.cn;zju.edu.cn;alibaba-inc.com;nd.edu;westlake.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;2;3;1;0", "aff_unique_norm": "Westlake University;University of Notre Dame;Zhejiang University;Alibaba Group", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.westlake.edu.cn;https://www.nd.edu;https://www.zju.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "WU;Notre Dame;ZJU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Symbolic Music Generation with Non-Differentiable Rule Guided Diffusion", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33440", "id": "g8AigOTNXL", "proceeding": "https://proceedings.mlr.press/v235/huang24g.html", "pdf": "https://openreview.net/pdf?id=g8AigOTNXL", "openreview": "https://openreview.net/forum?id=g8AigOTNXL", "author_site": "Yujia Huang, Adishree Ghatare, Yuanzhe Liu, ziniu hu, Qinsheng Zhang, Chandramouli Shama Sastry, Siddharth Gururani, Sageev Oore, Yisong Yue", "tldr": "", "abstract": "We study the problem of symbolic music generation (e.g., generating piano rolls), with a technical focus on non-differentiable rule guidance. 
Musical rules are often expressed in symbolic form on note characteristics, such as note density or chord progression, many of which are non-differentiable, which poses a challenge when using them for guided diffusion. We propose Stochastic Control Guidance (SCG), a novel guidance method that only requires forward evaluation of rule functions and can work with pre-trained diffusion models in a plug-and-play way, thus achieving training-free guidance for non-differentiable rules for the first time. Additionally, we introduce a latent diffusion architecture for symbolic music generation with high time resolution, which can be composed with SCG in a plug-and-play fashion. Compared to standard strong baselines in symbolic music generation, this framework demonstrates marked advancements in music quality and rule-based controllability, outperforming current state-of-the-art generators in a variety of settings. For detailed demonstrations, code and model checkpoints, please visit our [project website](https://scg-rule-guided-music.github.io/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujia Huang;Adishree Ghatare;Yuanzhe Liu;Ziniu Hu;Qinsheng Zhang;Chandramouli Shama Sastry;Siddharth Gururani;Sageev Oore;Yisong Yue", "authorids": "~Yujia_Huang1;adishree@caltech.edu;liuy72@rpi.edu;~Ziniu_Hu1;~Qinsheng_Zhang1;~Chandramouli_Shama_Sastry1;~Siddharth_Gururani1;~Sageev_Oore1;~Yisong_Yue1", "gender": "F;;;M;M;M;M;M;M", "homepage": "https://yjhuangcd.github.io;;;http://acbull.github.io;https://qsh-zh.github.io/;https://scholar.google.com/citations?user=yR5pPqAAAAAJ&hl=en;;;http://www.yisongyue.com", "dblp": ";;;180/5436;;223/6317;185/3616;67/4980;28/1244", "google_scholar": ";;;x6ct1CsAAAAJ;;;_C-H8_MAAAAJ;https://scholar.google.ca/citations?user=cI0dYX4AAAAJ;tEk4qo8AAAAJ", "orcid": ";;;;;;;;0000-0001-9127-1989", "linkedin": ";;;;;;;;yisongyue/", "or_profile": "~Yujia_Huang1;adishree@caltech.edu;liuy72@rpi.edu;~Ziniu_Hu1;~Qinsheng_Zhang1;~Chandramouli_Shama_Sastry1;~Siddharth_Gururani1;~Sageev_Oore1;~Yisong_Yue1", "aff": "California Institute of Technology;;;Deepmind;Georgia Institute of Technology;Vector Institute/Dalhousie University;NVIDIA;Vector Institute;California Institute of Technology", "aff_domain": "caltech.edu;;;deepmind.com;gatech.edu;dal.ca;nvidia.com;vectorinstitute.ai;caltech.edu", "position": "PhD student;;;Visiting Researcher;PhD student;PhD student;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nhuang2024symbolic,\ntitle={Symbolic Music Generation with Non-Differentiable Rule Guided Diffusion},\nauthor={Yujia Huang and Adishree Ghatare and Yuanzhe Liu and Ziniu Hu and Qinsheng Zhang and Chandramouli Shama Sastry and Siddharth Gururani and Sageev Oore and Yisong Yue},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=g8AigOTNXL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2382530, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14503961278697120377&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "caltech.edu;;;deepmind.com;gatech.edu;dal.ca;nvidia.com;vectorinstitute.ai;caltech.edu", "author_num": 9, "aff_unique_index": "0;1;2;3;4;5;0", "aff_unique_norm": "California Institute of Technology;DeepMind;Georgia Institute of Technology;Dalhousie University;NVIDIA;Vector Institute", "aff_unique_dep": ";;;Vector Institute;NVIDIA
Corporation;", "aff_unique_url": "https://www.caltech.edu;https://deepmind.com;https://www.gatech.edu;https://www.dal.ca;https://www.nvidia.com;https://vectorinstitute.ai/", "aff_unique_abbr": "Caltech;DeepMind;Georgia Tech;Dal;NVIDIA;Vector Institute", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;1;0;2;0;2;0", "aff_country_unique": "United States;United Kingdom;Canada" }, { "title": "Policy-conditioned Environment Models are More Generalizable", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33439", "id": "g9mYBdooPA", "proceeding": "https://proceedings.mlr.press/v235/chen24g.html", "pdf": "https://openreview.net/pdf?id=g9mYBdooPA", "openreview": "https://openreview.net/forum?id=g9mYBdooPA", "author_site": "Ruifeng Chen, Xiong-Hui Chen, Yihao Sun, Siyuan Xiao, Minhui Li, Yang Yu", "tldr": "", "abstract": "In reinforcement learning, it is crucial to have an accurate environment dynamics model to evaluate different policies' value in downstream tasks like offline policy optimization and policy evaluation. However, the learned model is known to be inaccurate in predictions when evaluating target policies different from data-collection policies. In this work, we found that utilizing policy representation for model learning, called policy-conditioned model (PCM) learning, is useful to mitigate the problem, especially when the offline dataset is collected from diversified behavior policies. The reason beyond that is in this case, PCM becomes a meta-dynamics model that is trained to be aware of and focus on the evaluation policies that on-the-fly adjust the model to be suitable to the evaluation policies\u2019 state-action distribution, thus improving the prediction accuracy. Based on that intuition, we propose an easy-to-implement yet effective algorithm of PCM for accurate model learning. We also give a theoretical analysis and experimental evidence to demonstrate the feasibility of reducing value gaps by adapting the dynamics model under different policies. 
Experiment results show that PCM outperforms the existing SOTA off-policy evaluation methods in the DOPE benchmark by a large margin, and derives significantly better policies in offline policy selection and model predictive control compared with the standard model learning method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruifeng Chen;Xiong-Hui Chen;Yihao Sun;Siyuan Xiao;Minhui Li;Yang Yu", "authorids": "~Ruifeng_Chen1;~Xiong-Hui_Chen1;~Yihao_Sun1;~Siyuan_Xiao1;~Minhui_Li1;~Yang_Yu5", "gender": "M;M;M;M;F;M", "homepage": "http://www.lamda.nju.edu.cn/chenrf/;http://www.lamda.nju.edu.cn/chenxh/;http://www.lamda.nju.edu.cn/sunyh/;https://www.lamda.nju.edu.cn/limh/;https://github.com/SiyuanXiao;http://www.lamda.nju.edu.cn/yuy", "dblp": "https://dblp.uni-trier.de/pid/58/10097-3;241/7938;;;;46/2181-1", "google_scholar": ";H5pguCYAAAAJ;pFNG8fMAAAAJ;;;PG2lDSwAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Ruifeng_Chen1;~Xiong-Hui_Chen1;~Yihao_Sun1;~Minhui_Li1;~Xiao_Siyuan1;~Yang_Yu2", "aff": "Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;PhD student;MS student;PhD student;Undergrad student;Professor", "bibtex": "@inproceedings{\nchen2024policyconditioned,\ntitle={Policy-conditioned Environment Models are More Generalizable},\nauthor={Ruifeng Chen and Xiong-Hui Chen and Yihao Sun and Siyuan Xiao and Minhui Li and Yang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=g9mYBdooPA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 756099, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4625810499312332718&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "SceneCraft: An LLM Agent for Synthesizing 3D Scenes as Blender Code", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33438", "id": "gAyzjHw2ml", "proceeding": "https://proceedings.mlr.press/v235/hu24g.html", "pdf": "https://openreview.net/pdf?id=gAyzjHw2ml", "openreview": "https://openreview.net/forum?id=gAyzjHw2ml", "author_site": "ziniu hu, Ahmet Iscen, Aashi Jain, Thomas Kipf, Yisong Yue, David Ross, Cordelia Schmid, Alireza Fathi", "tldr": "", "abstract": "This paper introduces SceneCraft, a Large Language Model (LLM) Agent converting text descriptions into Blender-executable Python scripts which render complex scenes with up to a hundred 3D assets. This process requires complex spatial planning and arrangement. We tackle these challenges through a combination of advanced abstraction, strategic planning, and library learning. SceneCraft first models a scene graph as a blueprint, detailing the spatial relationships among assets in the scene. SceneCraft then writes Python scripts based on this graph, translating relationships into numerical constraints for asset layout. 
Next, SceneCraft leverages the perceptual strengths of vision-language foundation models like GPT-V to analyze rendered images and iteratively refine the scene. On top of this process, SceneCraft features a library learning mechanism that compiles common script functions into a reusable library, facilitating continuous self-improvement without expensive LLM parameter tuning. Our evaluation demonstrates that SceneCraft surpasses existing LLM-based agents in rendering complex scenes, as shown by its adherence to constraints and favorable human assessments. We also showcase the broader application potential of SceneCraft by reconstructing detailed 3D scenes from the Sintel movie and guiding a video generative model with generated scenes as intermediary control signal.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziniu Hu;Ahmet Iscen;Aashi Jain;Thomas Kipf;Yisong Yue;David A Ross;Cordelia Schmid;Alireza Fathi", "authorids": "~Ziniu_Hu1;~Ahmet_Iscen3;~Aashi_Jain1;~Thomas_Kipf2;~Yisong_Yue1;~David_A_Ross1;~Cordelia_Schmid1;~Alireza_Fathi1", "gender": "M;M;F;M;F;M;M;M", "homepage": "http://acbull.github.io;;https://www.aashi7jain.com/;http://www.yisongyue.com;https://cordeliaschmid.github.io/;http://ai.stanford.edu/~alireza/;http://www.cs.toronto.edu/~dross/;http://tkipf.github.io/", "dblp": "180/5436;140/7520;196/0093;28/1244;s/CordeliaSchmid;70/3898;68/2171;186/8206", "google_scholar": "x6ct1CsAAAAJ;wIjyqzAAAAAJ;b00gnjoAAAAJ;tEk4qo8AAAAJ;IvqCXP4AAAAJ;luv0xMIAAAAJ;RqOzJR0AAAAJ;83HL5FwAAAAJ", "orcid": ";;;0000-0001-9127-1989;;;;", "linkedin": ";;aashi7jain/;yisongyue/;cordelia-schmid-47985a9;alireza-fathi-04338411/;;thomas-kipf-6b260410a", "or_profile": "~Ziniu_Hu1;~Ahmet_Iscen3;~Aashi_Jain1;~Yisong_Yue1;~Cordelia_Schmid1;~Alireza_Fathi1;~David_Alexander_Ross1;~Thomas_N._Kipf1", "aff": "Deepmind;Google;Google;California Institute of Technology;Inria;Google;Research, Google;Google", "aff_domain": "deepmind.com;google.com;google.com;caltech.edu;inria.fr;google.com;research.google.com;google.com", "position": "Visiting Researcher;Researcher;Researcher;Full Professor;Researcher;researcher;Software Engineer;Research Scientist", "bibtex": "@inproceedings{\nhu2024scenecraft,\ntitle={SceneCraft: An {LLM} Agent for Synthesizing 3D Scenes as Blender Code},\nauthor={Ziniu Hu and Ahmet Iscen and Aashi Jain and Thomas Kipf and Yisong Yue and David A Ross and Cordelia Schmid and Alireza Fathi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gAyzjHw2ml}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2589234, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12861113865771703625&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "deepmind.com;google.com;google.com;caltech.edu;inria.fr;google.com;research.google.com;google.com", "author_num": 8, "aff_unique_index": "0;1;1;2;3;1;1;1", "aff_unique_norm": "DeepMind;Google;California Institute of Technology;INRIA", "aff_unique_dep": ";Google;;", "aff_unique_url": "https://deepmind.com;https://www.google.com;https://www.caltech.edu;https://www.inria.fr", "aff_unique_abbr": "DeepMind;Google;Caltech;Inria", "aff_campus_unique_index": "1;1;2;1;1;1", "aff_campus_unique": ";Mountain View;Pasadena", "aff_country_unique_index": "0;1;1;1;2;1;1;1", "aff_country_unique": "United Kingdom;United States;France" }, { "title": "Can Gaussian Sketching 
Converge Faster on a Preconditioned Landscape?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33437", "id": "gB3E8IwQZy", "proceeding": "https://proceedings.mlr.press/v235/wang24ch.html", "pdf": "https://openreview.net/pdf?id=gB3E8IwQZy", "openreview": "https://openreview.net/forum?id=gB3E8IwQZy", "author_site": "Yilong Wang, Haishan Ye, Guang Dai, Ivor Tsang", "tldr": "", "abstract": "This paper focuses on large-scale optimization, which is ubiquitous in the big data era. Gradient sketching is an important technique in large-scale optimization. Specifically, the random coordinate descent algorithm is a gradient sketching method that uses a random sampling matrix as the sketching matrix. In this paper, we propose a novel gradient sketching method called GSGD (Gaussian Sketched Gradient Descent). Compared with classical gradient sketching methods such as random coordinate descent and SEGA (Hanzely et al., 2018), our GSGD does not require importance sampling but can achieve a fast convergence rate matching that of these methods with importance sampling. Furthermore, if the objective function has a non-smooth regularization term, our GSGD can also exploit the implicit structural information of the regularization term to achieve a fast convergence rate. Finally, our experimental results substantiate the effectiveness and efficiency of our algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yilong Wang;Haishan Ye;Guang Dai;Ivor Tsang", "authorids": "~Yilong_Wang3;~Haishan_Ye2;~Guang_Dai1;~Ivor_Tsang1", "gender": "M;M;M;M", "homepage": "https://longming321.github.io/;;;https://www.a-star.edu.sg/cfar/about-cfar/management/prof-ivor-tsang", "dblp": ";162/0002.html;;35/5873", "google_scholar": ";;;rJMOlVsAAAAJ", "orcid": ";;0000-0002-3529-9087;", "linkedin": ";;;", "or_profile": "~Yilong_Wang3;~Haishan_Ye2;~Guang_Dai1;~Ivor_W_Tsang1", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;SGIT AI;A*STAR", "aff_domain": "xjtu.edu.cn;xjtu.edu.cn;sgcc.com.cn;cfar.a-star.edu.sg", "position": "Undergrad student;Associate Professor;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\nwang2024can,\ntitle={Can Gaussian Sketching Converge Faster on a Preconditioned Landscape?},\nauthor={Yilong Wang and Haishan Ye and Guang Dai and Ivor Tsang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gB3E8IwQZy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 730586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17023911722759353839&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "xjtu.edu.cn;xjtu.edu.cn;sgcc.com.cn;cfar.a-star.edu.sg", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Xi'an Jiao Tong University;SGIT AI;Agency for Science, Technology and Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xjtu.edu.cn;;https://www.a-star.edu.sg", "aff_unique_abbr": "XJTU;;A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2", "aff_country_unique": "China;;Singapore" }, { "title": "Small-loss Adaptive Regret for Online Convex Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33436", "id": "gDQuupz8mm", "proceeding": "https://proceedings.mlr.press/v235/yang24l.html",
"pdf": "https://openreview.net/pdf?id=gDQuupz8mm", "openreview": "https://openreview.net/forum?id=gDQuupz8mm", "author_site": "Wenhao Yang, Wei Jiang, Yibo Wang, Ping Yang, Yao Hu, Lijun Zhang", "tldr": "", "abstract": "To deal with changing environments, adaptive regret has been proposed to minimize the regret over every interval. Previous studies have established a small-loss adaptive regret bound for general convex functions under the smoothness condition, offering the advantage of being much tighter than minimax rates for benign problems. However, it remains unclear whether similar bounds are attainable for other types of convex functions, such as exp-concave and strongly convex functions. In this paper, we first propose a novel algorithm that achieves a small-loss adaptive regret bound for exp-concave and smooth function. Subsequently, to address the limitation that existing algorithms can only handle one type of convex functions, we further design a universal algorithm capable of delivering small-loss adaptive regret bounds for general convex, exp-concave, and strongly convex functions simultaneously. That is challenging because the universal algorithm follows the meta-expert framework, and we need to ensure that upper bounds for both meta-regret and expert-regret are of small-loss types. Moreover, we provide a novel analysis demonstrating that our algorithms are also equipped with minimax adaptive regret bounds when functions are non-smooth.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenhao Yang;Wei Jiang;Yibo Wang;Ping Yang;Yao Hu;Lijun Zhang", "authorids": "~Wenhao_Yang3;~Wei_Jiang8;~Yibo_Wang2;jiadi@xiaohongshu.com;~Yao_Hu1;~Lijun_Zhang1", "gender": "M;M;;;M;", "homepage": "http://www.lamda.nju.edu.cn/yangwh/;http://www.lamda.nju.edu.cn/jiangw/?AspxAutoDetectCookieSupport=1;;;;", "dblp": "233/4699;;;;;", "google_scholar": "ycccau7cWYIC;;;;LIu7k7wAAAAJ;", "orcid": ";;;;0009-0006-1274-7111;", "linkedin": ";;;;;", "or_profile": "~Wenhao_Yang3;~Wei_Jiang8;~Yibo_Wang2;jiadi@xiaohongshu.com;~Yao_Hu1;~Lijun_Zhang1", "aff": "Nanjing University;Nanjing University;;;Zhejiang University of Technology;", "aff_domain": "nju.edu.cn;nju.edu.cn;;;zjut.edu.cn;", "position": "PhD student;PhD student;;;Researcher;", "bibtex": "@inproceedings{\nyang2024smallloss,\ntitle={Small-loss Adaptive Regret for Online Convex Optimization},\nauthor={Wenhao Yang and Wei Jiang and Yibo Wang and Ping Yang and Yao Hu and Lijun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gDQuupz8mm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 491975, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17068142220706607096&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nju.edu.cn;nju.edu.cn;;;zjut.edu.cn;", "author_num": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "Nanjing University;Zhejiang University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.zjut.edu.cn", "aff_unique_abbr": "Nanjing U;ZJUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Navigating Complexity: Toward Lossless Graph Condensation via Expanding Window Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33435", "id": "gE7qZurGH3", 
"proceeding": "https://proceedings.mlr.press/v235/zhang24cg.html", "pdf": "https://openreview.net/pdf?id=gE7qZurGH3", "openreview": "https://openreview.net/forum?id=gE7qZurGH3", "author_site": "Yuchen Zhang, Tianle Zhang, Kai Wang, Ziyao Guo, Yuxuan Liang, Xavier Bresson, Wei Jin, Yang You", "tldr": "", "abstract": "Graph condensation aims to reduce the size of a large-scale graph dataset by synthesizing a compact counterpart without sacrificing the performance of Graph Neural Networks (GNNs) trained on it, which has shed light on reducing the computational cost for training GNNs. Nevertheless, existing methods often fall short of accurately replicating the original graph for certain datasets, thereby failing to achieve the objective of lossless condensation. To understand this phenomenon, we investigate the potential reasons and reveal that the previous state-of-the-art trajectory matching method provides biased and restricted supervision signals from the original graph when optimizing the condensed one. This significantly limits both the scale and efficacy of the condensed graph. In this paper, we make the first attempt toward *lossless graph condensation* by bridging the previously neglected supervision signals. Specifically, we employ a curriculum learning strategy to train expert trajectories with more diverse supervision signals from the original graph, and then effectively transfer the information into the condensed graph with expanding window matching. Moreover, we design a loss function to further extract knowledge from the expert trajectories. Theoretical analysis justifies the design of our method and extensive experiments verify its superiority across different datasets. Code is released at https://github.com/NUS-HPC-AI-Lab/GEOM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Zhang;Tianle Zhang;Kai Wang;Ziyao Guo;Yuxuan Liang;Xavier Bresson;Wei Jin;Yang You", "authorids": "~Yuchen_Zhang8;~Tianle_Zhang4;~Kai_Wang8;~Ziyao_Guo1;~Yuxuan_Liang1;~Xavier_Bresson6;~Wei_Jin4;~Yang_You1", "gender": "M;M;M;M;M;M;;M", "homepage": "https://yuczhang.com/;;https://kaiwang960112.github.io/;https://github.com/GzyAftermath;https://yuxuanliang.com;https://www.comp.nus.edu.sg/cs/people/xaviercs/;http://www.cs.emory.edu/~wjin30/;https://www.comp.nus.edu.sg/~youy/", "dblp": ";;78/2022-36;309/6165;183/0977;95/378;66/2173-9;33/8167-1.html", "google_scholar": "Y2oqeP0AAAAJ;;i2II0XIAAAAJ;FlZSxJMAAAAJ;n9cODgcAAAAJ;https://scholar.google.com.sg/citations?hl=en;eWow24EAAAAJ;jF4dPZwAAAAJ", "orcid": ";0000-0003-1502-9730;0000-0002-1154-5175;;0000-0003-2817-7337;;;", "linkedin": ";;;;yoshall/;;;yang-you-0b92914b/", "or_profile": "~Yuchen_Zhang8;~Tianle_Zhang4;~Kai_Wang8;~Ziyao_Guo1;~Yuxuan_Liang1;~Xavier_Bresson6;~Wei_Jin4;~Yang_You1", "aff": "University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;National University of Singapore;Xidian University;The Hong Kong University of Science and Technology (Guangzhou);National University of Singapore;Emory University;National University of Singapore", "aff_domain": "uestc.edu.cn;cn.edu;u.nus.edu;xidian.edu.cn;hkust-gz.edu.cn;nus.edu.sg;emory.edu;nus.edu.sg", "position": "Undergrad student;Undergrad student;PhD student;MS student;Assistant Professor;Associate Professor;Assistant Professor;Professor", "bibtex": "@inproceedings{\nzhang2024navigating,\ntitle={Navigating Complexity: Toward Lossless Graph Condensation via Expanding Window Matching},\nauthor={Yuchen Zhang and 
Tianle Zhang and Kai Wang and Ziyao Guo and Yuxuan Liang and Xavier Bresson and Wei Jin and Yang You},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gE7qZurGH3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6609225, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6959979505600863249&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "uestc.edu.cn;cn.edu;u.nus.edu;xidian.edu.cn;hkust-gz.edu.cn;nus.edu.sg;emory.edu;nus.edu.sg", "author_num": 8, "aff_unique_index": "0;0;1;2;3;1;4;1", "aff_unique_norm": "University of Electronic Science and Technology of China;National University of Singapore;Xidian University;Hong Kong University of Science and Technology;Emory University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.nus.edu.sg;http://www.xidian.edu.cn/;https://www.ust.hk;https://www.emory.edu", "aff_unique_abbr": "UESTC;NUS;Xidian;HKUST;Emory", "aff_campus_unique_index": "1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0;0;1;0;0;1;2;1", "aff_country_unique": "China;Singapore;United States" }, { "title": "Learning to Intervene on Concept Bottlenecks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33434", "id": "gEbl6XNLK6", "proceeding": "https://proceedings.mlr.press/v235/steinmann24a.html", "pdf": "https://openreview.net/pdf?id=gEbl6XNLK6", "openreview": "https://openreview.net/forum?id=gEbl6XNLK6", "author_site": "David Steinmann, Wolfgang Stammer, Felix Friedrich, Kristian Kersting", "tldr": "", "abstract": "While deep learning models often lack interpretability, concept bottleneck models (CBMs) provide inherent explanations via their concept representations. Moreover, they allow users to perform interventional interactions on these concepts by updating the concept values and thus correcting the predictive output of the model. Up to this point, these interventions were typically applied to the model just once and then discarded. To rectify this, we present concept bottleneck memory models (CB2Ms), which keep a memory of past interventions. Specifically, CB2Ms leverage a two-fold memory to generalize interventions to appropriate novel situations, enabling the model to identify errors and reapply previous interventions. This way, a CB2M learns to automatically improve model performance from a few initially obtained interventions. If no prior human interventions are available, a CB2M can detect potential mistakes of the CBM bottleneck and request targeted interventions. Our experimental evaluations on challenging scenarios like handling distribution shifts and confounded data demonstrate that CB2Ms are able to successfully generalize interventions to unseen data and can indeed identify wrongly inferred concepts. 
Hence, CB2Ms are a valuable tool for users to provide interactive feedback on CBMs by guiding a user's interaction and requiring fewer interventions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Steinmann;Wolfgang Stammer;Felix Friedrich;Kristian Kersting", "authorids": "~David_Steinmann1;~Wolfgang_Stammer1;~Felix_Friedrich1;~Kristian_Kersting1", "gender": "M;M;;M", "homepage": ";https://ml-research.github.io/people/wstammer/;https://ml-research.github.io/people/ffriedrich/;http://www.ml.informatik.tu-darmstadt.de/", "dblp": ";256/5497;18/4626;40/3793", "google_scholar": ";66-aU5AAAAAJ;RfM9ud0AAAAJ;QY-earAAAAAJ", "orcid": "0000-0001-5823-2945;0000-0003-3793-8046;0000-0001-8387-793X;0000-0002-2873-9152", "linkedin": ";https://linkedin.com/in/wolfgang-stammer-7835a4207/en-us?trk=people-guest_people_search-card;;", "or_profile": "~David_Steinmann1;~Wolfgang_Stammer1;~Felix_Friedrich1;~Kristian_Kersting1", "aff": "Technische Universit\u00e4t Darmstadt;CS Department, TU Darmstadt;TU Darmstadt;TU Darmstadt", "aff_domain": "tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nsteinmann2024learning,\ntitle={Learning to Intervene on Concept Bottlenecks},\nauthor={David Steinmann and Wolfgang Stammer and Felix Friedrich and Kristian Kersting},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gEbl6XNLK6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 554343, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10644770077055091326&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "tu-darmstadt.de;cs.tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Causal-IQA: Towards the Generalization of Image Quality Assessment Based on Causal Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33433", "id": "gKPkipJ3gm", "proceeding": "https://proceedings.mlr.press/v235/zhong24e.html", "pdf": "https://openreview.net/pdf?id=gKPkipJ3gm", "openreview": "https://openreview.net/forum?id=gKPkipJ3gm", "author_site": "Yan Zhong, Xingyu Wu, Li Zhang, Chenxi Yang, Tingting Jiang", "tldr": "", "abstract": "Due to the high cost of Image Quality Assessment (IQA) datasets, achieving robust generalization remains challenging for prevalent deep learning-based IQA methods. To address this, we propose a novel end-to-end blind IQA method: Causal-IQA. Specifically, we first analyze the causal mechanisms in IQA tasks and construct a causal graph to understand the interplay and confounding effects between distortion types, image contents, and subjective human ratings. Then, by shifting the focus from correlations to causality, Causal-IQA aims to improve the estimation accuracy of image quality scores by mitigating the confounding effects using a causality-based optimization strategy.
This optimization strategy is implemented on the sample subsets constructed by a Counterfactual Division process based on the Backdoor Criterion. Extensive experiments illustrate the superiority of Causal-IQA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yan Zhong;Xingyu Wu;Li Zhang;Chenxi Yang;Tingting Jiang", "authorids": "~Yan_Zhong2;~Xingyu_Wu3;~Li_Zhang25;~Chenxi_Yang2;~Tingting_Jiang2", "gender": "M;M;;;F", "homepage": ";https://wuxingyu-ai.github.io/;;;http://www.vie.group/ttj", "dblp": "81/5094-1.html;143/0523.html;;;72/2833-1", "google_scholar": ";E10XSzEAAAAJ;;;p6RJZj0AAAAJ", "orcid": "0000-0003-0005-2620;0000-0002-8204-6197;;0000-0001-5058-5404;0000-0002-5372-0656", "linkedin": ";;;;", "or_profile": "~Yan_Zhong2;~Xingyu_Wu3;~Li_Zhang25;~Chenxi_Yang2;~Tingting_Jiang2", "aff": "Peking University;Hong Kong Polytechnic University;;Peking University;School of Computer Science, Peking University", "aff_domain": "pku.edu.cn;polyu.edu.hk;;pku.edu.cn;pku.edu.cn", "position": "PhD student;Postdoc;;PhD student;Associate Professor", "bibtex": "@inproceedings{\nzhong2024causaliqa,\ntitle={Causal-{IQA}: Towards the Generalization of Image Quality Assessment Based on Causal Inference},\nauthor={Yan Zhong and Xingyu Wu and Li Zhang and Chenxi Yang and Tingting Jiang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gKPkipJ3gm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8769966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=737355844345578347&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "pku.edu.cn;polyu.edu.hk;;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Peking University;Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "Peking U;PolyU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Beijing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "New Bounds on the Cohesion of Complete-link and Other Linkage Methods for Agglomerative Clustering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33432", "id": "gL5djEYLx2", "proceeding": "https://proceedings.mlr.press/v235/dasgupta24a.html", "pdf": "https://openreview.net/pdf?id=gL5djEYLx2", "openreview": "https://openreview.net/forum?id=gL5djEYLx2", "author_site": "Sanjoy Dasgupta, Eduardo Laber", "tldr": "", "abstract": "Linkage methods are among the most popular algorithms for hierarchical clustering. Despite their relevance, the current knowledge regarding the quality of the clustering produced by these methods is limited. Here, we improve the currently available bounds on the maximum diameter of the clustering obtained by complete-link for metric spaces. One of our new bounds, in contrast to the existing ones, allows us to separate complete-link from single-link in terms of approximation for the diameter, which corroborates the common perception that the former is more suitable than the latter when the goal is producing compact clusters. 
We also show that our techniques can be employed to derive upper bounds on the cohesion of a class of linkage methods that includes the quite popular average-link.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanjoy Dasgupta;Eduardo Sany Laber", "authorids": "~Sanjoy_Dasgupta3;~Eduardo_Sany_Laber1", "gender": "M;", "homepage": "http://www-di.inf.puc-rio.br/~laber/;http://www.cs.ucsd.edu/~dasgupta", "dblp": "49/5557;34/5967", "google_scholar": "https://scholar.google.com.br/citations?hl=pt-BR;", "orcid": "0000-0002-9025-8333;", "linkedin": ";", "or_profile": "~Eduardo_Sany_Laber1;~Sanjoy_Dasgupta2", "aff": "Pontificia Universidade Catolica, Rio de Janeiro, Brazil;University of California, San Diego", "aff_domain": "puc-rio.br;ucsd.edu", "position": "Associate Professor;Full Professor", "bibtex": "@inproceedings{\ndasgupta2024new,\ntitle={New Bounds on the Cohesion of Complete-link and Other Linkage Methods for Agglomerative Clustering},\nauthor={Sanjoy Dasgupta and Eduardo Sany Laber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gL5djEYLx2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 523845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9831922110082848041&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "puc-rio.br;ucsd.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pontifical Catholic University of Rio de Janeiro;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "http://www.puc-rio.br/;https://www.ucsd.edu", "aff_unique_abbr": "PUC-Rio;UCSD", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Rio de Janeiro;San Diego", "aff_country_unique_index": "0;1", "aff_country_unique": "Brazil;United States" }, { "title": "Stochastic Gradient Flow Dynamics of Test Risk and its Exact Solution for Weak Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33431", "id": "gPBMkJG7bt", "proceeding": "https://proceedings.mlr.press/v235/veiga24a.html", "pdf": "https://openreview.net/pdf?id=gPBMkJG7bt", "openreview": "https://openreview.net/forum?id=gPBMkJG7bt", "author_site": "Rodrigo Veiga, Anastasia Remizova, Nicolas Macris", "tldr": "", "abstract": "We investigate the test risk of a continuous time stochastic gradient flow dynamics in learning theory. Using a path integral formulation we provide, in the regime of small learning rate, a general formula for computing the difference between test risk curves of pure gradient and stochastic gradient flows. We apply the general theory to a simple model of weak features, which displays the double descent phenomenon, and explicitly compute the corrections brought about by the added stochastic term in the dynamics, as a function of time and model parameters. 
The analytical results are compared to simulations of discrete time stochastic gradient descent and show good agreement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rodrigo Veiga;Anastasia Remizova;Nicolas Macris", "authorids": "~Rodrigo_Veiga1;~Anastasia_Remizova1;~Nicolas_Macris1", "gender": "M;F;M", "homepage": "https://rodsveiga.github.io/;;", "dblp": "313/2696-1;302/1093;47/5851", "google_scholar": "https://scholar.google.com.tr/citations?user=SIzY0mgAAAAJ;ZaJy9J4AAAAJ;", "orcid": "0000-0002-6835-4871;;0000-0003-2189-7411", "linkedin": "rodrigo-soares-veiga;aremizova;", "or_profile": "~Rodrigo_Veiga1;~Anastasia_Remizova1;~Nicolas_Macris1", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne;Ecole Polytechnique Federale Lausanne", "aff_domain": "epfl.ch;epfl.ch;epfl.ch", "position": "Postdoc;PhD student;Associate Professor", "bibtex": "@inproceedings{\nveiga2024stochastic,\ntitle={Stochastic Gradient Flow Dynamics of Test Risk and its Exact Solution for Weak Features},\nauthor={Rodrigo Veiga and Anastasia Remizova and Nicolas Macris},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gPBMkJG7bt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1704175, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7534341799804475201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "epfl.ch;epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "EPFL;Ecole Polytechnique Federale de Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Discovering Environments with XRM", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33430", "id": "gPStP3FSY9", "proceeding": "https://proceedings.mlr.press/v235/pezeshki24a.html", "pdf": "https://openreview.net/pdf?id=gPStP3FSY9", "openreview": "https://openreview.net/forum?id=gPStP3FSY9", "author_site": "Mohammad Pezeshki, Diane Bouchacourt, Mark Ibrahim, Nicolas Ballas, Pascal Vincent, David Lopez-Paz", "tldr": "", "abstract": "Environment annotations are essential for the success of many out-of-distribution (OOD) generalization methods. Unfortunately, these are costly to obtain and often limited by human annotators' biases. To achieve robust generalization, it is essential to develop algorithms for automatic environment discovery within datasets. Current proposals, which divide examples based on their training error, suffer from one fundamental problem. These methods introduce hyper-parameters and early-stopping criteria, which require a validation set with human-annotated environments, the very information subject to discovery. In this paper, we propose Cross-Risk Minimization (XRM) to address this issue. XRM trains twin networks, each learning from one random half of the training data, while imitating confident held-out mistakes made by its sibling. XRM provides a recipe for hyper-parameter tuning, does not require early-stopping, and can discover environments for all training and validation data. 
Algorithms built on top of XRM environments achieve oracle worst-group-accuracy, addressing a long-standing challenge in OOD generalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammad Pezeshki;Diane Bouchacourt;Mark Ibrahim;Nicolas Ballas;Pascal Vincent;David Lopez-Paz", "authorids": "~Mohammad_Pezeshki1;~Diane_Bouchacourt3;~Mark_Ibrahim1;~Nicolas_Ballas1;~Pascal_Vincent1;~David_Lopez-Paz2", "gender": "M;;;M;;F", "homepage": "https://mohammadpz.github.io/;https://markibrahim.me/;;http://www.iro.umontreal.ca/~vincentp;http://lopezpaz.org;https://dianebouchacourt.github.io/", "dblp": "139/0888;180/5660;120/9066;43/861;74/10481;176/1498", "google_scholar": "HT85tXsAAAAJ;AqYyoCMAAAAJ;euUV4iUAAAAJ;WBCKQMsAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Mohammad_Pezeshki1;~Mark_Ibrahim1;~Nicolas_Ballas1;~Pascal_Vincent1;~David_Lopez-Paz2;~Diane_Nicole_Bouchacourt1", "aff": "Meta Facebook;Facebook AI Research (FAIR) Meta;Meta;Facebook A.I. Research;Meta Facebook;Meta AI Research", "aff_domain": "fb.com;ai.facebook.com;meta.com;fb.com;fb.com;meta.com", "position": "Postdoc;Researcher;Researcher;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\npezeshki2024discovering,\ntitle={Discovering Environments with {XRM}},\nauthor={Mohammad Pezeshki and Diane Bouchacourt and Mark Ibrahim and Nicolas Ballas and Pascal Vincent and David Lopez-Paz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gPStP3FSY9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1889214, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12498502238921331189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "fb.com;ai.facebook.com;meta.com;fb.com;fb.com;meta.com", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: A Roadmap to Pluralistic Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33429", "id": "gQpBnRHwxM", "proceeding": "https://proceedings.mlr.press/v235/sorensen24a.html", "pdf": "https://openreview.net/pdf?id=gQpBnRHwxM", "openreview": "https://openreview.net/forum?id=gQpBnRHwxM", "author_site": "Taylor Sorensen, Jared Moore, Jillian Fisher, Mitchell Gordon, Niloofar Mireshghallah, Christopher Rytting, Andre Ye, Liwei Jiang, Ximing Lu, Nouha Dziri, Tim Althoff, Yejin Choi", "tldr": "", "abstract": "With increased power and prevalence of AI systems, it is ever more critical that AI systems are designed to serve *all*, i.e., people with diverse values and perspectives. However, aligning models to serve *pluralistic* human values remains an open research question. In this piece, we propose a roadmap to pluralistic alignment, specifically using large language models as a test bed. 
We identify and formalize three possible ways to define and operationalize pluralism in AI systems: 1) *Overton pluralistic* models that present a spectrum of reasonable responses; 2) *Steerably pluralistic* models that can steer to reflect certain perspectives; and 3) *Distributionally pluralistic* models that are well-calibrated to a given population in distribution. We also formalize and discuss three possible classes of *pluralistic benchmarks*: 1) *Multi-objective* benchmarks, 2) *Trade-off steerable* benchmarks that incentivize models to steer to arbitrary trade-offs, and 3) *Jury-pluralistic* benchmarks that explicitly model diverse human ratings. We use this framework to argue that current alignment techniques may be fundamentally limited for pluralistic AI; indeed, we highlight empirical evidence, both from our own experiments and from other work, that standard alignment procedures might *reduce* distributional pluralism in models, motivating the need for further research on pluralistic alignment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taylor Sorensen;Jared Moore;Jillian Fisher;Mitchell L Gordon;Niloofar Mireshghallah;Christopher Michael Rytting;Andre Ye;Liwei Jiang;Ximing Lu;Nouha Dziri;Tim Althoff;Yejin Choi", "authorids": "~Taylor_Sorensen1;~Jared_Moore1;~Jillian_Fisher1;~Mitchell_L_Gordon1;~Niloofar_Mireshghallah1;~Christopher_Michael_Rytting1;~Andre_Ye1;~Liwei_Jiang2;~Ximing_Lu1;~Nouha_Dziri2;~Tim_Althoff2;~Yejin_Choi1", "gender": "M;;F;M;;M;;F;F;;M;F", "homepage": "https://tsor13.github.io;;http://jfisher52.github.io;http://mgordon.me;;https://chrisrytting.github.io/;https://andre-ye.github.io;https://liweijiang.me;https://gloriaximinglu.github.io/;;https://althoff.cs.uw.edu/;https://yejinc.github.io/", "dblp": "294/0706;;336/3238;144/5660.html;;303/4521;354/8393;;24/10879;;119/1352;89/579-1", "google_scholar": "https://scholar.google.com/citations?hl=en;;Gnk0E_QAAAAJ;pMkqt6sAAAAJ;;6Gq1WzQAAAAJ;;lcPsDgUAAAAJ;https://scholar.google.com/citations?hl=en;;yc4nBNgAAAAJ;vhP-tlcAAAAJ", "orcid": "0000-0002-3251-3527;;;;;0000-0002-7373-9741;;;;;0000-0003-4793-2289;", "linkedin": "sorensen-taylor/;;jillianrosefisher/;;;chris-rytting-594889ba/;;;;;timalthoff/;", "or_profile": "~Taylor_Sorensen1;~Jared_Moore1;~Jillian_Fisher1;~Mitchell_L_Gordon1;~Niloofar_Mireshghallah1;~Christopher_Michael_Rytting1;~Andre_Ye1;~Liwei_Jiang2;~Ximing_Lu1;~Nouha_Dziri2;~Tim_Althoff2;~Yejin_Choi1", "aff": "Google DeepMind;;University of Washington;University of Washington;;;University of Washington;University of Washington;University of Washington;;Department of Computer Science, University of Washington;Department of Computer Science, University of Washington", "aff_domain": "deepmind.com;;uw.edu;washington.edu;;;uw.edu;washington.edu;cs.washington.edu;;cs.washington.edu;cs.washington.edu", "position": "Intern;;PhD student;Postdoc;;;Undergrad student;PhD student;PhD student;;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nsorensen2024position,\ntitle={Position: A Roadmap to Pluralistic Alignment},\nauthor={Taylor Sorensen and Jared Moore and Jillian Fisher and Mitchell L Gordon and Niloofar Mireshghallah and Christopher Michael Rytting and Andre Ye and Liwei Jiang and Ximing Lu and Nouha Dziri and Tim Althoff and Yejin Choi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gQpBnRHwxM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3384922, "rating": "", 
"rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "deepmind.com;;uw.edu;washington.edu;;;uw.edu;washington.edu;cs.washington.edu;;cs.washington.edu;cs.washington.edu", "author_num": 12, "aff_unique_index": "0;1;1;1;1;1;1;1", "aff_unique_norm": "Google;University of Washington", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.washington.edu", "aff_unique_abbr": "DeepMind;UW", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;1;1;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Applying language models to algebraic topology: generating simplicial cycles using multi-labeling in Wu's formula", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33428", "id": "gQz30hTkRE", "proceeding": "https://proceedings.mlr.press/v235/brilliantov24a.html", "pdf": "https://openreview.net/pdf?id=gQz30hTkRE", "openreview": "https://openreview.net/forum?id=gQz30hTkRE", "author_site": "Kirill Brilliantov, Fedor Pavutnitskiy, Dmitrii A. Pasechniuk, German Magai", "tldr": "", "abstract": "Computing homotopy groups of spheres has long been a fundamental objective in algebraic topology. Various theoretical and algorithmic approaches have been developed to tackle this problem. In this paper we take a step towards the goal of comprehending the group-theoretic structure of the generators of these homotopy groups by leveraging the power of machine learning. Specifically, in the simplicial group setting of Wu's formula, we reformulate the problem of generating simplicial cycles as a problem of sampling from the intersection of algorithmic datasets related to Dyck languages. 
We present and evaluate language modelling approaches that employ multi-label information for input sequences, along with the necessary group-theoretic toolkit and non-neural baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kirill Brilliantov;Fedor Pavutnitskiy;Dmitry Pasechnyuk;German Magai", "authorids": "~Kirill_Brilliantov1;~Fedor_Pavutnitskiy1;~Dmitry_Pasechnyuk1;~German_Magai1", "gender": "M;M;M;M", "homepage": "https://github.com/kibrq;;http://dmivilensky.ru/;https://www.hse.ru/en/org/persons/364631586", "dblp": "350/5533;285/5395;242/6650;318/9190", "google_scholar": "thgwrhYAAAAJ;;yUfa6X8AAAAJ;", "orcid": ";0000-0002-8676-6941;0000-0002-1208-1659;", "linkedin": ";;;german-magai-0b7a69233/", "or_profile": "~Kirill_Brilliantov1;~Fedor_Pavutnitskiy1;~Dmitry_Pasechnyuk1;~German_Magai1", "aff": "ETHZ - ETH Zurich;Beijing Institute of Mathematical Sciences and Applications;Mohamed bin Zayed University of Artificial Intelligence;Higher School of Economics", "aff_domain": "ethz.ch;bimsa.cn;mbzuai.ac.ae;hse.ru", "position": "MS student;Assistant Professor;Researcher;PhD student", "bibtex": "@inproceedings{\nbrilliantov2024applying,\ntitle={Applying language models to algebraic topology: generating simplicial cycles using multi-labeling in Wu's formula},\nauthor={Kirill Brilliantov and Fedor Pavutnitskiy and Dmitry Pasechnyuk and German Magai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gQz30hTkRE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 960001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ri0V03MH7o8J:scholar.google.com/&scioq=Applying+language+models+to+algebraic+topology:+generating+simplicial+cycles+using+multi-labeling+in+Wu%27s+formula&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "ethz.ch;bimsa.cn;mbzuai.ac.ae;hse.ru", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "ETH Zurich;Beijing Institute of Mathematical Sciences and Applications;Mohamed bin Zayed University of Artificial Intelligence;Higher School of Economics", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ethz.ch;;https://mbzuai.ac.ae;https://www.hse.ru", "aff_unique_abbr": "ETHZ;;MBZUAI;HSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Switzerland;China;United Arab Emirates;Russian Federation" }, { "title": "Representing Molecules as Random Walks Over Interpretable Grammars", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33427", "id": "gS3nc9iUrH", "proceeding": "https://proceedings.mlr.press/v235/sun24c.html", "pdf": "https://openreview.net/pdf?id=gS3nc9iUrH", "openreview": "https://openreview.net/forum?id=gS3nc9iUrH", "author_site": "Michael Sun, Minghao Guo, Weize Yuan, Veronika Thost, Crystal Owens, Aristotle Grosz, Sharvaa Selvan, Katelyn Zhou, Hassan Mohiuddin, Benjamin Pedretti, Zachary Smith, Jie Chen, Wojciech Matusik", "tldr": "", "abstract": "Recent research in molecular discovery has primarily been devoted to small, drug-like molecules, leaving many similarly important applications in material design without adequate technology. These applications often rely on more complex molecular structures with fewer examples that are carefully designed using known substructures. 
We propose a data-efficient and interpretable model for representing and reasoning over such molecules in terms of graph grammars that explicitly describe the hierarchical design space, with motifs as the design basis. We present a novel representation in the form of random walks over the design space, which facilitates both molecule generation and property prediction. We demonstrate clear advantages over existing methods in terms of performance, efficiency, and synthesizability of predicted molecules, and we provide detailed insights into the method's chemical interpretability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Sun;Minghao Guo;Weize Yuan;Veronika Thost;Crystal Elaine Owens;Aristotle Franklin Grosz;Sharvaa Selvan;Katelyn Zhou;Hassan Mohiuddin;Benjamin J Pedretti;Zachary P Smith;Jie Chen;Wojciech Matusik", "authorids": "~Michael_Sun1;~Minghao_Guo1;~Weize_Yuan1;~Veronika_Thost1;~Crystal_Elaine_Owens1;~Aristotle_Franklin_Grosz1;~Sharvaa_Selvan1;~Katelyn_Zhou1;~Hassan_Mohiuddin1;~Benjamin_J_Pedretti1;zpsmith@mit.edu;~Jie_Chen1;~Wojciech_Matusik2", "gender": "M;M;F;F;F;M;M;F;M;M;;;M", "homepage": "https://michaelsun.tech;https://www.minghaoguo.com/;;https://mitibmwatsonailab.mit.edu/people/veronika-thost/;https://www.crystalowens.com/;;;;;;;https://jiechenjiechen.github.io;https://cdfg.mit.edu/wojciech", "dblp": ";145/0008/;;132/3874;;;;;;;;92/6289-7;", "google_scholar": "https://scholar.google.com/citations?hl=en;Hq2unJcAAAAJ;SAAd6VwAAAAJ;TyScgJ0AAAAJ;LvkKN_oAAAAJ;;;;;https://scholar.google.com/citations?hl=en;;Z-lkme8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0003-4984-1532;0000-0002-2433-7025;;;;;;;;0000-0003-0212-5643", "linkedin": "michael-sun-1610b2155/;;;;crystaleowens/;aristotlefgrosz/;sharvaa-selvan/;kzh23/;hassan-mohiuddin-0462b51aa/;;;;wojciech-matusik-67238126/", "or_profile": "~Michael_Sun1;~Minghao_Guo1;~Weize_Yuan1;~Veronika_Thost1;~Crystal_Elaine_Owens1;~Aristotle_Franklin_Grosz1;~Sharvaa_Selvan1;~Katelyn_Zhou1;~Hassan_Mohiuddin1;~Benjamin_J_Pedretti1;zpsmith@mit.edu;~Jie_Chen1;~Wojciech_Matusik2", "aff": "Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science;Massachusetts Institute of Technology;Massachusetts Institute of Technology;IBM Research;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Wellesley College;Massachusetts Institute of Technology;Massachusetts Institute of Technology;;International Business Machines;Massachusetts Institute of Technology", "aff_domain": "csail.mit.edu;mit.edu;mit.edu;ibm.com;mit.edu;mit.edu;mit.edu;wellesley.edu;mit.edu;mit.edu;;ibm.com;mit.edu", "position": "PhD student;PhD student;PhD student;Research Scientist;Postdoc;PhD student;Undergrad student;Undergrad student;Undergrad student;Postdoc;;Research Staff Member;Full Professor", "bibtex": "@inproceedings{\nsun2024representing,\ntitle={Representing Molecules as Random Walks Over Interpretable Grammars},\nauthor={Michael Sun and Minghao Guo and Weize Yuan and Veronika Thost and Crystal Elaine Owens and Aristotle Franklin Grosz and Sharvaa Selvan and Katelyn Zhou and Hassan Mohiuddin and Benjamin J Pedretti and Zachary P Smith and Jie Chen and Wojciech Matusik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gS3nc9iUrH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8960690, "rating": "", "rating_avg": 0,
"replies_avg": 0, "authors#_avg": 13, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3644662736031241711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "csail.mit.edu;mit.edu;mit.edu;ibm.com;mit.edu;mit.edu;mit.edu;wellesley.edu;mit.edu;mit.edu;;ibm.com;mit.edu", "author_num": 13, "aff_unique_index": "0;0;0;1;0;0;0;2;0;0;3;0", "aff_unique_norm": "Massachusetts Institute of Technology;IBM;Wellesley College;International Business Machines Corporation", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;IBM Research;;", "aff_unique_url": "https://www.csail.mit.edu;https://www.ibm.com/research;https://www.wellesley.edu;https://www.ibm.com", "aff_unique_abbr": "CSAIL;IBM;Wellesley;IBM", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Quasi-Monte Carlo Features for Kernel Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33426", "id": "gSMUjrkRRk", "proceeding": "https://proceedings.mlr.press/v235/huang24w.html", "pdf": "https://openreview.net/pdf?id=gSMUjrkRRk", "openreview": "https://openreview.net/forum?id=gSMUjrkRRk", "author_site": "ZHEN HUANG, Jiajin Sun, Yian Huang", "tldr": "", "abstract": "Random features (Rahimi & Recht, 2007), based on Monte Carlo (MC) method, is one of the most popular approximation techniques to accelerate kernel methods. We show for a class of kernels, including Gaussian kernels, quasi-Monte Carlo (QMC) methods can be used in place of MC to improve the approximation error from $O_P(1/\\sqrt{M})$ to $O(1/M)$ (up to logarithmic factors), for estimating both the kernel function itself and the associated integral operator, where $M$ is the number of features being used. Furthermore, we demonstrate the advantage of QMC features in the case of kernel ridge regression, where theoretically, fewer random features suffice to guarantee the same convergence rate of the excess risk. 
In practice, the QMC kernel approximation approach is easily implementable and shows superior performance, as supported by the empirical evidence provided in the paper.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhen Huang;Jiajin Sun;Yian Huang", "authorids": "~Zhen_Huang7;~Jiajin_Sun1;~Yian_Huang1", "gender": "M;;", "homepage": ";https://sites.google.com/view/jiajin-sun/home;", "dblp": ";;", "google_scholar": "xbz5YwoAAAAJ;;", "orcid": ";;", "linkedin": ";;yian-huang-53b43a226/", "or_profile": "~Zhen_Huang7;~Jiajin_Sun1;~Yian_Huang1", "aff": "Columbia University;Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;columbia.edu", "position": "PhD student;PhD student;PhD student", "bibtex": "@inproceedings{\nhuang2024quasimonte,\ntitle={Quasi-Monte Carlo Features for Kernel Approximation},\nauthor={Zhen Huang and Jiajin Sun and Yian Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gSMUjrkRRk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 692502, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6638509453657056479&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "columbia.edu;columbia.edu;columbia.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Subsampling is not Magic: Why Large Batch Sizes Work for Differentially Private Stochastic Optimisation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33425", "id": "gTBjkJvadC", "proceeding": "https://proceedings.mlr.press/v235/raisa24a.html", "pdf": "https://openreview.net/pdf?id=gTBjkJvadC", "openreview": "https://openreview.net/forum?id=gTBjkJvadC", "author_site": "Ossi R\u00e4is\u00e4, Joonas J\u00e4lk\u00f6, Antti Honkela", "tldr": "", "abstract": "We study how the batch size affects the total gradient variance in differentially private stochastic gradient descent (DP-SGD), seeking a theoretical explanation for the usefulness of large batch sizes. As DP-SGD is the basis of modern DP deep learning, its properties have been widely studied, and recent works have empirically found large batch sizes to be beneficial. However, theoretical explanations of this benefit are currently heuristic at best. We first observe that the total gradient variance in DP-SGD can be decomposed into subsampling-induced and noise-induced variances. We then prove that in the limit of an infinite number of iterations, the effective noise-induced variance is invariant to the batch size. The remaining subsampling-induced variance decreases with larger batch sizes, so large batches reduce the effective total gradient variance. We confirm numerically that the asymptotic regime is relevant in practical settings when the batch size is not small, and find that outside the asymptotic regime, the total gradient variance decreases even more with large batch sizes. 
We also find a sufficient condition that implies that large batch sizes similarly reduce effective DP noise variance for one iteration of DP-SGD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ossi R\u00e4is\u00e4;Joonas J\u00e4lk\u00f6;Antti Honkela", "authorids": "~Ossi_R\u00e4is\u00e41;~Joonas_J\u00e4lk\u00f61;~Antti_Honkela1", "gender": "M;M;M", "homepage": ";;https://www.cs.helsinki.fi/u/ahonkela/", "dblp": "296/0031;188/5963;h/AnttiHonkela", "google_scholar": "https://scholar.google.fi/citations?user=FpmQ-jcAAAAJ;;XsyLs6AAAAAJ", "orcid": ";;0000-0001-9193-8093", "linkedin": "ossi-r%C3%A4is%C3%A4-749502139/;;", "or_profile": "~Ossi_R\u00e4is\u00e41;~Joonas_J\u00e4lk\u00f61;~Antti_Honkela1", "aff": "University of Helsinki;University of Helsinki;University of Helsinki", "aff_domain": "helsinki.fi;helsinki.fi;helsinki.fi", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nr{\\\"a}is{\\\"a}2024subsampling,\ntitle={Subsampling is not Magic: Why Large Batch Sizes Work for Differentially Private Stochastic Optimisation},\nauthor={Ossi R{\\\"a}is{\\\"a} and Joonas J{\\\"a}lk{\\\"o} and Antti Honkela},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gTBjkJvadC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 462987, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2560146539489686805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "helsinki.fi;helsinki.fi;helsinki.fi", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Helsinki", "aff_unique_dep": "", "aff_unique_url": "https://www.helsinki.fi", "aff_unique_abbr": "UH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Finland" }, { "title": "VNN: Verification-Friendly Neural Networks with Hard Robustness Guarantees", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33424", "id": "gUFufRkzjV", "proceeding": "https://proceedings.mlr.press/v235/baninajjar24a.html", "pdf": "https://openreview.net/pdf?id=gUFufRkzjV", "openreview": "https://openreview.net/forum?id=gUFufRkzjV", "author_site": "Anahita Baninajjar, Ahmed Rezine, Amir Aminifar", "tldr": "", "abstract": "Machine learning techniques often lack formal correctness guarantees, evidenced by the widespread adversarial examples that plague most deep-learning applications. This lack of formal guarantees has resulted in several research efforts that aim at verifying Deep Neural Networks (DNNs), with a particular focus on safety-critical applications. However, formal verification techniques still face major scalability and precision challenges. The over-approximation introduced during the formal verification process to tackle the scalability challenge often results in inconclusive analysis. To address this challenge, we propose a novel framework to generate Verification-Friendly Neural Networks (VNNs). We present a post-training optimization framework to achieve a balance between preserving prediction performance and verification-friendliness. Our proposed framework results in VNNs that are comparable to the original DNNs in terms of prediction performance, while amenable to formal verification techniques.
This essentially enables us to establish robustness for more VNNs than their DNN counterparts in a time-efficient manner.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anahita Baninajjar;Ahmed Rezine;Amir Aminifar", "authorids": "~Anahita_Baninajjar1;ahmed.rezine@liu.se;~Amir_Aminifar1", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbaninajjar2024vnn,\ntitle={{VNN}: Verification-Friendly Neural Networks with Hard Robustness Guarantees},\nauthor={Anahita Baninajjar and Ahmed Rezine and Amir Aminifar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gUFufRkzjV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 381010, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14115434246167155142&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": ";;", "author_num": 3 }, { "title": "Long Range Propagation on Continuous-Time Dynamic Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33423", "id": "gVg8V9isul", "proceeding": "https://proceedings.mlr.press/v235/gravina24a.html", "pdf": "https://openreview.net/pdf?id=gVg8V9isul", "openreview": "https://openreview.net/forum?id=gVg8V9isul", "author_site": "Alessio Gravina, Giulio Lovisotto, Claudio Gallicchio, Davide Bacciu, Claas Grohnfeldt", "tldr": "", "abstract": "Learning Continuous-Time Dynamic Graphs (C-TDGs) requires accurately modeling spatio-temporal information on streams of irregularly sampled events. While many methods have been proposed recently, we find that most message passing-, recurrent- or self-attention-based methods perform poorly on *long-range* tasks. These tasks require correlating information that occurred "far" away from the current event, either spatially (higher-order node information) or along the time dimension (events that occurred in the past). To address long-range dependencies, we introduce Continuous-Time Graph Anti-Symmetric Network (CTAN). Grounded within the ordinary differential equations framework, our method is designed for efficient propagation of information. In this paper, we show how CTAN's (i) long-range modeling capabilities are substantiated by theoretical findings and how (ii) its empirical performance on synthetic long-range benchmarks and real-world benchmarks is superior to other methods.
Our results demonstrate CTAN's ability to propagate long-range information in C-TDGs and motivate the inclusion of long-range tasks as part of temporal graph model evaluation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alessio Gravina;Giulio Lovisotto;Claudio Gallicchio;Davide Bacciu;Claas Grohnfeldt", "authorids": "~Alessio_Gravina1;~Giulio_Lovisotto2;~Claudio_Gallicchio1;~Davide_Bacciu1;~Claas_Grohnfeldt1", "gender": ";M;M;M;", "homepage": "http://pages.di.unipi.it/gravina/;https://giuliolovisotto.github.io/;https://sites.google.com/site/cgallicch/;http://pages.di.unipi.it/bacciu/;", "dblp": ";;41/9473;07/6626;", "google_scholar": "oAzxkbYAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.it/citations?user=1d5n2WkAAAAJ;https://scholar.google.de/citations?user=wcbFVEQAAAAJ", "orcid": "0000-0001-5526-2479;;;0000-0001-5213-2468;", "linkedin": "alessio-gravina/;;claudio-gallicchio-05a47038/;bacciu/;", "or_profile": "~Alessio_Gravina1;~Giulio_Lovisotto2;~Claudio_Gallicchio1;~Davide_Bacciu1;~Claas_Grohnfeldt1", "aff": "University of Pisa;;University of Pisa;University of Pisa;Huawei Technologies Ltd.", "aff_domain": "unipi.it;;unipi.it;unipi.it;huawei.com", "position": "PhD student;;Assistant Professor;Full Professor;Principal Research Engineer", "bibtex": "@inproceedings{\ngravina2024long,\ntitle={Long Range Propagation on Continuous-Time Dynamic Graphs},\nauthor={Alessio Gravina and Giulio Lovisotto and Claudio Gallicchio and Davide Bacciu and Claas Grohnfeldt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gVg8V9isul}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1686340, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10928045338189865422&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "unipi.it;;unipi.it;unipi.it;huawei.com", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Pisa;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.unipi.it;https://www.huawei.com", "aff_unique_abbr": "UNIP;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Italy;China" }, { "title": "Iterated Denoising Energy Matching for Sampling from Boltzmann Densities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33422", "id": "gVjMwLDFoQ", "proceeding": "https://proceedings.mlr.press/v235/akhound-sadegh24a.html", "pdf": "https://openreview.net/pdf?id=gVjMwLDFoQ", "openreview": "https://openreview.net/forum?id=gVjMwLDFoQ", "author_site": "Tara Akhound-Sadegh, Jarrid Rector-Brooks, Joey Bose, Sarthak Mittal, Pablo Lemos, Chenghao Liu, Marcin Sendera, Siamak Ravanbakhsh, Gauthier Gidel, Yoshua Bengio, Nikolay Malkin, Alexander Tong", "tldr": "", "abstract": "Efficiently generating statistically independent samples from an unnormalized probability distribution, such as equilibrium samples of many-body systems, is a foundational problem in science. In this paper, we propose Iterated Denoising Energy Matching (iDEM), an iterative algorithm that uses a novel stochastic score matching objective leveraging solely the energy function and its gradient---and no data samples---to train a diffusion-based sampler.
Specifically, iDEM alternates between (I) sampling regions of high model density from a diffusion-based sampler and (II) using these samples in our stochastic matching objective to further improve the sampler. iDEM is scalable to high dimensions, as the inner matching objective is *simulation-free* and requires no MCMC samples. Moreover, by leveraging the fast mode mixing behavior of diffusion, iDEM smooths out the energy landscape enabling efficient exploration and learning of an amortized sampler. We evaluate iDEM on a suite of tasks ranging from standard synthetic energy functions to invariant $n$-body particle systems. We show that the proposed approach achieves state-of-the-art performance on all metrics and trains $2-5\\times$ faster, making it the first method to train using energy on the challenging $55$-particle Lennard-Jones system.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tara Akhound-Sadegh;Jarrid Rector-Brooks;Joey Bose;Sarthak Mittal;Pablo Lemos;Cheng-Hao Liu;Marcin Sendera;Siamak Ravanbakhsh;Gauthier Gidel;Yoshua Bengio;Nikolay Malkin;Alexander Tong", "authorids": "~Tara_Akhound-Sadegh1;~Jarrid_Rector-Brooks2;~Joey_Bose1;~Sarthak_Mittal1;~Pablo_Lemos1;~Cheng-Hao_Liu1;~Marcin_Sendera1;~Siamak_Ravanbakhsh1;~Gauthier_Gidel1;~Yoshua_Bengio1;~Nikolay_Malkin1;~Alexander_Tong1", "gender": ";M;M;M;M;M;M;;M;M;;", "homepage": "https://sites.google.com/view/taraakhound-sadegh/home;;https://joeybose.github.io/;https://sarthmit.github.io/;https://pablo-lemos.github.io;https://pchliu.github.io/;;;https://gauthiergidel.github.io/;http://yoshuabengio.org;;https://alextong.net", "dblp": ";230/4010;174/3372;228/8275;313/2645;;220/9876;;188/6326;56/953;;153/9296", "google_scholar": "RHDoTkkAAAAJ;gxRPZh4AAAAJ;ybPyI7IAAAAJ;FGGgTrcAAAAJ;AklQTTsAAAAJ;iVJGx0cAAAAJ;https://scholar.google.pl/citations?user=ScNBRmQAAAAJ;;https://scholar.google.fr/citations?user=bDrXQPUAAAAJ;kukA0LcAAAAJ;;CS80pt4AAAAJ", "orcid": ";;;;0000-0002-4728-8473;0000-0001-7923-6806;0000-0002-8741-6919;;;;;0000-0002-2031-4096", "linkedin": "tara-akhound-sadegh-574748101/;;;;;chenghao-peter-liu/;marcin-sendera-976516123/;;;yoshuabengio/?originalSubdomain=ca;;atong01/", "or_profile": "~Tara_Akhound-Sadegh1;~Jarrid_Rector-Brooks2;~Joey_Bose1;~Sarthak_Mittal1;~Pablo_Lemos1;~Cheng-Hao_Liu1;~Marcin_Sendera1;~Siamak_Ravanbakhsh1;~Gauthier_Gidel1;~Yoshua_Bengio1;~Nikolay_Malkin1;~Alexander_Tong1", "aff": "McGill University;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal;University of Oxford;University of Montreal;Universit\u00e9 de Montr\u00e9al;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Jagiellonian University;;Mila - Quebec Artificial Intelligence Institute;University of Montreal;;Universit\u00e9 de Montr\u00e9al", "aff_domain": "mcgill.ca;mila.umontreal.ca;oxford.ac.uk;umontreal.ca;umontreal.ca;mila.umontreal.ca;uj.edu.pl;;mila.quebec;umontreal.ca;;umontreal.ca", "position": "PhD student;PhD student;Postdoc;PhD student;Postdoc;PhD student intern;PhD student;;Assistant Professor;Full Professor;;Postdoc", "bibtex": "@inproceedings{\nakhound-sadegh2024iterated,\ntitle={Iterated Denoising Energy Matching for Sampling from Boltzmann Densities},\nauthor={Tara Akhound-Sadegh and Jarrid Rector-Brooks and Joey Bose and Sarthak Mittal and Pablo Lemos and Cheng-Hao Liu and Marcin Sendera and Siamak Ravanbakhsh and Gauthier Gidel and Yoshua Bengio and Nikolay Malkin and Alexander 
Tong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gVjMwLDFoQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8967333, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6701361690026759984&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 7, "email": "mcgill.ca;mila.umontreal.ca;oxford.ac.uk;umontreal.ca;umontreal.ca;mila.umontreal.ca;uj.edu.pl;;mila.quebec;umontreal.ca;;umontreal.ca", "author_num": 12, "aff_unique_index": "0;1;2;1;3;1;4;5;1;3", "aff_unique_norm": "McGill University;University of Montreal;University of Oxford;Universit\u00e9 de Montr\u00e9al;Jagiellonian University;Quebec Artificial Intelligence Institute", "aff_unique_dep": ";Montreal Institute for Learning Algorithms;;;;Artificial Intelligence", "aff_unique_url": "https://www.mcgill.ca;https://www.umontreal.ca;https://www.ox.ac.uk;https://www.umontreal.ca;https://www.uj.edu.pl;https://mila.quebec", "aff_unique_abbr": "McGill;UM;Oxford;UdeM;UJ;Mila", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;1;0;0;0;2;0;0;0", "aff_country_unique": "Canada;United Kingdom;Poland" }, { "title": "Accelerating Federated Learning with Quick Distributed Mean Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33421", "id": "gWEwIlZrbQ", "proceeding": "https://proceedings.mlr.press/v235/ben-basat24a.html", "pdf": "https://openreview.net/pdf?id=gWEwIlZrbQ", "openreview": "https://openreview.net/forum?id=gWEwIlZrbQ", "author_site": "Ran Ben Basat, Shay Vargaftik, Amit Portnoy, Gil Einziger, Yaniv Ben Itzhak, Michael Mitzenmacher", "tldr": "", "abstract": "Distributed Mean Estimation (DME), in which $n$ clients communicate vectors to a parameter server that estimates their average, is a fundamental building block in communication-efficient federated learning. In this paper, we improve on previous DME techniques that achieve the optimal $O(1/n)$ Normalized Mean Squared Error (NMSE) guarantee by asymptotically improving the complexity for either encoding or decoding (or both). To achieve this, we formalize the problem in a novel way that allows us to use off-the-shelf mathematical solvers to design the quantization. 
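For intuition about the DME building block itself, a generic unbiased quantization scheme already shows the shape of the problem. The sketch below uses simple 1-bit stochastic rounding; it is an illustration only, not the solver-designed QUIC-FL quantizer, and all names are assumptions.

```python
# A minimal Distributed Mean Estimation sketch: each client sends a cheap,
# unbiased 1-bit quantization of its vector; the server averages the
# per-client unbiased estimates, so the error shrinks as O(1/n).
import numpy as np

def client_encode(v, rng):
    lo, hi = v.min(), v.max()
    p = (v - lo) / (hi - lo + 1e-12)        # position of each coord in [lo, hi]
    bits = rng.random(v.shape) < p          # stochastic rounding: E[bit] = p
    return lo, hi, bits

def server_decode(msgs):
    est = [lo + bits * (hi - lo) for lo, hi, bits in msgs]  # unbiased per client
    return np.mean(est, axis=0)

rng = np.random.default_rng(0)
d, n = 1000, 64
clients = [rng.normal(size=d) for _ in range(n)]
true_mean = np.mean(clients, axis=0)
est = server_decode([client_encode(v, rng) for v in clients])
nmse = np.sum((est - true_mean) ** 2) / np.mean([np.sum(v ** 2) for v in clients])
print(f"NMSE with n={n} clients: {nmse:.4f}")
```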
Using various datasets and training tasks, we demonstrate how QUIC-FL, the proposed algorithm, achieves state-of-the-art accuracy with faster encoding and decoding times than other DME methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ran Ben-Basat;shay vargaftik;Amit Portnoy;Gil Einziger;Yaniv Ben-Itzhak;Michael Mitzenmacher", "authorids": "~Ran_Ben-Basat1;~shay_vargaftik1;~Amit_Portnoy1;~Gil_Einziger1;~Yaniv_Ben-Itzhak1;~Michael_Mitzenmacher1", "gender": ";;;M;M;M", "homepage": "https://bbasat.com;;;;;", "dblp": "140/7690;;159/8806;139/7090;75/7855;74/838", "google_scholar": "6G61qDwAAAAJ;;https://scholar.google.co.il/citations?user=2umZKagAAAAJ;;https://scholar.google.co.il/citations?user=6YWAONwAAAAJ;e8aRmAsAAAAJ", "orcid": ";;0000-0001-6491-5814;;;", "linkedin": ";;amit-portnoy-75060766;;yaniv-ben-itzhak-5889307/;", "or_profile": "~Ran_Ben-Basat1;~shay_vargaftik1;~Amit_Portnoy1;~Gil_Einziger1;~Yaniv_Ben-Itzhak1;~Michael_Mitzenmacher1", "aff": "University College London;;Microsoft;;VMware;Harvard University", "aff_domain": "ucl.ac.uk;;microsoft.com;;vmware.com;harvard.edu", "position": "Assistant Professor;;Researcher;;Researcher;Full Professor", "bibtex": "@inproceedings{\nben-basat2024accelerating,\ntitle={Accelerating Federated Learning with Quick Distributed Mean Estimation},\nauthor={Ran Ben-Basat and shay vargaftik and Amit Portnoy and Gil Einziger and Yaniv Ben-Itzhak and Michael Mitzenmacher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gWEwIlZrbQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1632031, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17693468417672484679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ucl.ac.uk;;microsoft.com;;vmware.com;harvard.edu", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University College London;Microsoft;VMware, Inc.;Harvard University", "aff_unique_dep": ";Microsoft Corporation;;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.microsoft.com;https://www.vmware.com;https://www.harvard.edu", "aff_unique_abbr": "UCL;Microsoft;VMware;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Quality-Weighted Vendi Scores And Their Application To Diverse Experimental Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33420", "id": "gbD9MAc9p0", "proceeding": "https://proceedings.mlr.press/v235/nguyen24d.html", "pdf": "https://openreview.net/pdf?id=gbD9MAc9p0", "openreview": "https://openreview.net/forum?id=gbD9MAc9p0", "author_site": "Quan Nguyen, Adji Bousso Dieng", "tldr": "", "abstract": "Experimental design techniques such as active search and Bayesian optimization are widely used in the natural sciences for data collection and discovery. However, existing techniques tend to favor exploitation over exploration of the search space, which causes them to get stuck in local optima. This _collapse_ problem prevents experimental design algorithms from yielding diverse high-quality data. In this paper, we extend the Vendi scores\u2014a family of interpretable similarity-based diversity metrics\u2014to account for quality. 
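The Vendi score itself is the exponential of the entropy of the eigenvalues of a normalized similarity matrix, an "effective number" of distinct items. The sketch below also shows one natural quality weighting (average quality times diversity); the RBF kernel and the exact form of the weighting are assumptions here, so the paper's precise definition may differ in detail.

```python
# A minimal sketch of the Vendi score and a quality-weighted variant.
import numpy as np

def vendi_score(X, bandwidth=1.0):
    """exp(entropy of eigenvalues of K/n): effective number of distinct items."""
    n = X.shape[0]
    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    K = np.exp(-d2 / (2 * bandwidth ** 2))   # PSD similarity matrix, K_ii = 1
    lam = np.linalg.eigvalsh(K / n)          # eigenvalues sum to 1
    lam = lam[lam > 1e-12]
    return float(np.exp(-(lam * np.log(lam)).sum()))

def quality_weighted_vendi(X, quality, bandwidth=1.0):
    # Assumed weighting: average quality times diversity.
    return float(np.mean(quality)) * vendi_score(X, bandwidth)

rng = np.random.default_rng(0)
clustered = rng.normal(size=(20, 2)) * 0.01  # near-duplicates: score near 1
spread = rng.normal(size=(20, 2)) * 5.0      # diverse points: score near n
q = rng.random(20)
print(vendi_score(clustered), vendi_score(spread))
print(quality_weighted_vendi(spread, q))
```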
We then leverage these *quality-weighted Vendi scores* to tackle experimental design problems across various applications, including drug discovery, materials discovery, and reinforcement learning. We found that quality-weighted Vendi scores allow us to construct policies for experimental design that flexibly balance quality and diversity, and ultimately assemble rich and diverse sets of high-performing data points. Our algorithms led to a 70%\u2013170% increase in the number of effective discoveries compared to baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Quan Nguyen;Adji Bousso Dieng", "authorids": "~Quan_Nguyen2;~Adji_Bousso_Dieng1", "gender": "M;F", "homepage": "https://krisnguyen135.github.io/;https://vertaix.princeton.edu/", "dblp": ";188/6478", "google_scholar": "NhVuN_EAAAAJ;ZCniP_MAAAAJ", "orcid": ";0000-0001-5687-3554", "linkedin": "quan-m-nguyen/;diengadji45", "or_profile": "~Quan_Nguyen2;~Adji_Bousso_Dieng1", "aff": "Washington University, St. Louis;Princeton University", "aff_domain": "wustl.edu;princeton.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2024qualityweighted,\ntitle={Quality-Weighted Vendi Scores And Their Application To Diverse Experimental Design},\nauthor={Quan Nguyen and Adji Bousso Dieng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gbD9MAc9p0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1497658, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1645546611847520905&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 7, "email": "wustl.edu;princeton.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Washington University in St. Louis;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://wustl.edu;https://www.princeton.edu", "aff_unique_abbr": "WUSTL;Princeton", "aff_campus_unique_index": "0", "aff_campus_unique": "St. Louis;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "First-Order Manifold Data Augmentation for Regression Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33419", "id": "geajNKab7g", "proceeding": "https://proceedings.mlr.press/v235/kaufman24a.html", "pdf": "https://openreview.net/pdf?id=geajNKab7g", "openreview": "https://openreview.net/forum?id=geajNKab7g", "author_site": "Ilya Kaufman, Omri Azencot", "tldr": "", "abstract": "Data augmentation (DA) methods tailored to specific domains generate synthetic samples by applying transformations that are appropriate for the characteristics of the underlying data domain, such as rotations on images and time warping on time series data. In contrast, *domain-independent* approaches, e.g. *mixup*, are applicable to various data modalities, and as such they are general and versatile. While regularizing classification tasks via DA is a well-explored research topic, the effect of DA on regression problems received less attention. To bridge this gap, we study the problem of domain-independent augmentation for regression, and we introduce *FOMA*: a new data-driven domain-independent data augmentation method. Essentially, our approach samples new examples from the tangent planes of the train distribution. 
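One way to picture tangent-plane sampling is through the SVD of a batch: keep the dominant directions and randomly shrink the rest. The sketch below is a loose illustration of that idea under stated assumptions (the split index `k`, the Beta-distributed shrink factor, and the joint treatment of inputs and targets are illustrative choices, not the paper's exact procedure).

```python
# A minimal sketch of first-order manifold augmentation: damp the batch's
# trailing singular values so new samples stay close to the tangent plane
# spanned by the dominant directions.
import numpy as np

def foma_like_augment(X, Y, k, rng):
    Z = np.concatenate([X, Y], axis=1)            # inputs and targets jointly
    U, S, Vt = np.linalg.svd(Z, full_matrices=False)
    lam = rng.beta(1.0, 1.0)                      # random shrink factor in [0, 1]
    S_aug = S.copy()
    S_aug[k:] *= lam                              # keep top-k directions, damp the rest
    Z_aug = (U * S_aug) @ Vt                      # reconstruct the perturbed batch
    return Z_aug[:, :X.shape[1]], Z_aug[:, X.shape[1]:]

rng = np.random.default_rng(0)
X, Y = rng.normal(size=(32, 10)), rng.normal(size=(32, 1))
X_aug, Y_aug = foma_like_augment(X, Y, k=4, rng=rng)
print(X_aug.shape, Y_aug.shape)
```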
Augmenting data in this way aligns with the network's tendency to capture the dominant features of its input signals. We evaluate *FOMA* on in-distribution generalization and out-of-distribution robustness benchmarks, and we show that it improves the generalization of several neural architectures. We also find that strong baselines based on *mixup* are less effective than our approach. Our code is publicly available at https://github.com/azencot-group/FOMA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ilya Kaufman;Omri Azencot", "authorids": "~Ilya_Kaufman1;~Omri_Azencot1", "gender": ";Unspecified", "homepage": ";http://omriazencot.com", "dblp": ";132/3985.html", "google_scholar": ";https://scholar.google.co.il/citations?user=MEGuRmAAAAAJ", "orcid": ";", "linkedin": "ilya-kaufman-2bb698149;omri-azencot-a8812417/", "or_profile": "~Ilya_Kaufman1;~Omri_Azencot1", "aff": "Ben-Gurion University of the Negev;Ben-Gurion University of the Negev", "aff_domain": "bgu.ac.il;bgu.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkaufman2024firstorder,\ntitle={First-Order Manifold Data Augmentation for Regression Learning},\nauthor={Ilya Kaufman and Omri Azencot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=geajNKab7g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 857786, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2832144823095488987&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "bgu.ac.il;bgu.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Ben-Gurion University of the Negev", "aff_unique_dep": "", "aff_unique_url": "https://www.bgu.ac.il", "aff_unique_abbr": "BGU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Weak-to-Strong Generalization: Eliciting Strong Capabilities With Weak Supervision", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33418", "id": "ghNRg2mEgN", "proceeding": "https://proceedings.mlr.press/v235/burns24b.html", "pdf": "https://openreview.net/pdf?id=ghNRg2mEgN", "openreview": "https://openreview.net/forum?id=ghNRg2mEgN", "author_site": "Collin Burns, Pavel Izmailov, Jan Kirchner, Bowen Baker, Leo Gao, Leopold Aschenbrenner, Yining Chen, Adrien Ecoffet, Manas Joglekar, Jan Leike, Ilya Sutskever, Jeffrey K Wu", "tldr": "", "abstract": "Widely used alignment techniques, such as reinforcement learning from human feedback (RLHF), rely on the ability of humans to supervise model behavior---for example, to evaluate whether a model faithfully followed instructions or generated safe outputs. However, future superhuman models will behave in complex ways too difficult for humans to reliably evaluate; humans will only be able to *weakly supervise* superhuman models. We study an analogy to this problem: can weak model supervision elicit the full capabilities of a much stronger model? We test this using a range of pretrained language models in the GPT-4 family on natural language processing (NLP), chess, and reward modeling tasks. We find that when we naively finetune strong pretrained models on labels generated by a weak model, they consistently perform better than their weak supervisors, a phenomenon we call *weak-to-strong generalization*. 
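The setup can be emulated end-to-end on synthetic data: a noisy "weak supervisor" labels the training set, a student is fit on those labels only, and the gap between the supervisor and a ground-truth-trained ceiling is measured via PGR (performance gap recovered). Random label flips as a stand-in for weak-model errors, and logistic regression as the student, are simplifying assumptions.

```python
# A minimal weak-to-strong sketch: a student trained only on an 80%-accurate
# supervisor's labels recovers much of the gap to a ground-truth ceiling,
# because the supervisor's mistakes are inconsistent while the signal is not.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n, d = 6000, 20
X = rng.normal(size=(n, d))
y = (X @ rng.normal(size=d) > 0).astype(int)
Xtr, ytr, Xte, yte = X[:4000], y[:4000], X[4000:], y[4000:]

flip = rng.random(4000) < 0.2
weak_labels = np.where(flip, 1 - ytr, ytr)          # weak supervision, ~80% accurate

student = LogisticRegression(max_iter=1000).fit(Xtr, weak_labels)
ceiling = LogisticRegression(max_iter=1000).fit(Xtr, ytr)

weak_acc = 0.8                                       # the supervisor's label accuracy
w2s_acc = (student.predict(Xte) == yte).mean()
ceil_acc = (ceiling.predict(Xte) == yte).mean()
pgr = (w2s_acc - weak_acc) / (ceil_acc - weak_acc)   # performance gap recovered
print(f"student={w2s_acc:.3f} ceiling={ceil_acc:.3f} PGR={pgr:.2f}")
```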
However, we are still far from recovering the full capabilities of strong models with naive finetuning alone, suggesting that techniques like RLHF may scale poorly to superhuman models without further work. We find that simple methods can often significantly improve weak-to-strong generalization: for example, when finetuning GPT-4 with a GPT-2-level supervisor and an auxiliary confidence loss, we can recover close to GPT-3.5-level performance on NLP tasks. Our results suggest that it is feasible to make empirical progress today on a fundamental challenge of aligning superhuman models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Collin Burns;Pavel Izmailov;Jan Hendrik Kirchner;Bowen Baker;Leo Gao;Leopold Aschenbrenner;Yining Chen;Adrien Ecoffet;Manas Joglekar;Jan Leike;Ilya Sutskever;Jeffrey Wu", "authorids": "~Collin_Burns1;~Pavel_Izmailov1;~Jan_Hendrik_Kirchner1;~Bowen_Baker2;~Leo_Gao1;leopold@openai.com;~Yining_Chen1;~Adrien_Ecoffet1;manas@openai.com;~Jan_Leike1;~Ilya_Sutskever2;~Jeffrey_Wu1", "gender": "M;M;M;M;;;F;;;M;M;M", "homepage": "http://collinpburns.com/;https://izmailovpavel.github.io/;https://universalprior.substack.com/;https://bowenbaker.github.io;https://leogao.dev;;;;;https://jan.leike.name;https://www.cs.utoronto.ca/~ilya/;https://www.wuthejeff.com/", "dblp": ";190/7218;322/1096;190/7159;279/3125;;;;;https://dblp.uni-trier.de/pers/hd/l/Leike:Jan;;85/2082-3", "google_scholar": "JGS2xjkAAAAJ;https://scholar.google.ru/citations?user=AXxTpGUAAAAJ;B027ViYAAAAJ;bMfPYdYAAAAJ;r6mBY50AAAAJ;;4a6iPeUAAAAJ;;;beiWcokAAAAJ;;", "orcid": ";;;;;;;;;;;", "linkedin": "collin-burns/;;;;;;;;;;;", "or_profile": "~Collin_Burns1;~Pavel_Izmailov1;~Jan_Hendrik_Kirchner1;~Bowen_Baker2;~Leo_Gao1;leopold@openai.com;~Yining_Chen1;~Adrien_Ecoffet1;manas@openai.com;~Jan_Leike1;~Ilya_Sutskever2;~Jeffrey_Wu1", "aff": "OpenAI;OpenAI;OpenAI;OpenAI;OpenAI;;OpenAI;;;OpenAI;OpenAI;OpenAI", "aff_domain": "openai.com;openai.com;openai.com;openai.com;openai.com;;openai.com;;;openai.com;openai.com;openai.com", "position": "Researcher;Researcher;Researcher;Research Scientist;Researcher;;Researcher;;;Alignment Team Lead;Researcher;Researcher", "bibtex": "@inproceedings{\nburns2024weaktostrong,\ntitle={Weak-to-Strong Generalization: Eliciting Strong Capabilities With Weak Supervision},\nauthor={Collin Burns and Pavel Izmailov and Jan Hendrik Kirchner and Bowen Baker and Leo Gao and Leopold Aschenbrenner and Yining Chen and Adrien Ecoffet and Manas Joglekar and Jan Leike and Ilya Sutskever and Jeffrey Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ghNRg2mEgN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1949302, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 260, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5282626994675281986&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9, "email": "openai.com;openai.com;openai.com;openai.com;openai.com;;openai.com;;;openai.com;openai.com;openai.com", "author_num": 12, "aff_unique_index": "0;0;0;0;0;0;0;0;0", "aff_unique_norm": "OpenAI", "aff_unique_dep": "", "aff_unique_url": "https://openai.com", "aff_unique_abbr": "OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "PolySketchFormer: Fast Transformers via Sketching Polynomial Kernels", "status": "Poster", "track": "main", 
"site": "https://icml.cc/virtual/2024/poster/33417", "id": "ghYrfdJfjK", "proceeding": "https://proceedings.mlr.press/v235/kacham24a.html", "pdf": "https://openreview.net/pdf?id=ghYrfdJfjK", "openreview": "https://openreview.net/forum?id=ghYrfdJfjK", "author_site": "Praneeth Kacham, Vahab Mirrokni, Peilin Zhong", "tldr": "", "abstract": "The quadratic time and memory complexity inherent to self-attention mechanisms, with respect to sequence length, presents a critical computational bottleneck in the training and deployment of large-scale Transformer-based language models. Recent theoretical results indicate the intractability of sub-quadratic softmax attention approximation under reasonable complexity assumptions. This paper addresses this challenge by first demonstrating that polynomial attention with high degree can effectively replace softmax without sacrificing model quality. Next, we develop polynomial sketching techniques from numerical linear algebra to achieve linear-time polynomial attention with approximation guarantees. Crucially, our approach achieves this speedup without requiring the sparsification of attention matrices. We also present a block-based algorithm to apply causal masking efficiently. Combining these techniques, we provide *PolySketchFormer*, a practical linear-time Transformer architecture for language modeling that offers provable guarantees. We validate PolySketchFormer empirically by training language models capable of handling long contexts. These experiments utilize both synthetic and real-world datasets (PG19, Wikipedia and C4) on Google Cloud TPUs. For context lengths of 32k and GPT-2 style models, our model achieves 2x speedup in training compared to FlashAttention of the fastest configuration, with no observed degradation in quality across our experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Praneeth Kacham;Vahab Mirrokni;Peilin Zhong", "authorids": "~Praneeth_Kacham1;~Vahab_Mirrokni2;~Peilin_Zhong1", "gender": "M;M;M", "homepage": "https://www.praneethkacham.com;https://people.csail.mit.edu/mirrokni/Welcome.html;http://www.cs.columbia.edu/~peilin/", "dblp": "255/5684;m/VahabSMirrokni;148/9632", "google_scholar": "hKhPmTkAAAAJ;opbZfw0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Praneeth_Kacham1;~Vahab_Mirrokni2;~Peilin_Zhong1", "aff": "Carnegie Mellon University;Google Research;Google", "aff_domain": "cmu.edu;google.com;google.com", "position": "PhD student;VP, Google Fellow;Researcher", "bibtex": "@inproceedings{\nkacham2024polysketchformer,\ntitle={PolySketchFormer: Fast Transformers via Sketching Polynomial Kernels},\nauthor={Praneeth Kacham and Vahab Mirrokni and Peilin Zhong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ghYrfdJfjK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 683918, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4478834473034085616&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "cmu.edu;google.com;google.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.cmu.edu;https://research.google", "aff_unique_abbr": "CMU;Google Research", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Can AI Assistants Know What They Don't Know?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33416", "id": "girxGkdECL", "proceeding": "https://proceedings.mlr.press/v235/cheng24i.html", "pdf": "https://openreview.net/pdf?id=girxGkdECL", "openreview": "https://openreview.net/forum?id=girxGkdECL", "author_site": "Qinyuan Cheng, Tianxiang Sun, Xiangyang Liu, Wenwei Zhang, Zhangyue Yin, Shimin Li, Linyang Li, Zhengfu He, Kai Chen, Xipeng Qiu", "tldr": "", "abstract": "AI assistants powered by Large Language Models (LLMs) have demonstrated impressive performance in various tasks. However, LLMs still make factual errors in knowledge-intensive tasks such as open-domain question answering. These untruthful responses from AI assistants can pose significant risks in practical applications. Therefore, in this paper, we ask the question **Can AI assistants know what they don't know and express this awareness through natural language?** To investigate this, we construct a model-specific \"I don't know\" (Idk) dataset. This dataset includes Supervised Fine-tuning data and preference data, categorizing questions based on whether the assistant knows or does not know the answers. Then, we align the assistant with its corresponding Idk dataset using different alignment methods, including Supervised Fine-tuning and preference optimization. Experimental results show that, after alignment with the Idk dataset, the assistant is more capable of declining to answer questions outside its knowledge scope. The assistant aligned with the Idk dataset shows significantly higher truthfulness than the original assistant.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qinyuan Cheng;Tianxiang Sun;Xiangyang Liu;Wenwei Zhang;Zhangyue Yin;Shimin Li;Linyang Li;Zhengfu He;Kai Chen;Xipeng Qiu", "authorids": "~Qinyuan_Cheng1;~Tianxiang_Sun1;~Xiangyang_Liu3;~Wenwei_Zhang1;~Zhangyue_Yin1;~Shimin_Li1;~Linyang_Li1;~Zhengfu_He2;~Kai_Chen4;~Xipeng_Qiu1", "gender": "M;M;M;M;M;M;M;M;M;M", "homepage": "https://xiami2019.github.io/;https://txsun1997.github.io/;;https://zhangwenwei.cn;https://yinzhangyue.github.io/;;https://github.com/LinyangLee;;https://chenkai.site/;https://xpqiu.github.io/", "dblp": "331/9838;254/1189;;;314/5418;;228/8051;321/1124;181/2839-26;69/1395", "google_scholar": "nu_iPXAAAAAJ;puHFkM0AAAAJ;https://scholar.google.com.hk/citations?user=U8QD9mwAAAAJ;QDXADSEAAAAJ;9gRQqSkAAAAJ;0xxkGjMAAAAJ;T6eEqcMAAAAJ;mou-vPwAAAAJ;https://scholar.google.com.hk/citations?user=eGD0b7IAAAAJ;Pq4Yp_kAAAAJ", "orcid": ";;;0000-0002-2748-4514;;;;;0000-0002-6820-2325;0000-0001-7163-5247", "linkedin": "https://www.linkedin.cn/injobs/in/qinyuan-cheng-5168951ab;;;wenweizhang-b9769a124/;zhangyue-yin-083286288/;;;;;", "or_profile": "~Qinyuan_Cheng1;~Tianxiang_Sun1;~Xiangyang_Liu3;~Wenwei_Zhang1;~Zhangyue_Yin1;~Shimin_Li1;~Linyang_Li1;~Zhengfu_He2;~Kai_Chen4;~Xipeng_Qiu1", "aff": "Fudan University;Fudan University;Fudan University;Shanghai AI Laboratory;Fudan University;Fudan University;Fudan University;Fudan University;Shanghai AI Laboratory;Fudan University", "aff_domain": "fudan.edu.cn;fudan.edu;fudan.edu.cn;pjlab.org.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;pjlab.org.cn;fudan.edu.cn", "position": "PhD student;PhD student;PhD student;Researcher;PhD student;PhD student;PhD student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\ncheng2024can,\ntitle={Can {AI} 
Assistants Know What They Don't Know?},\nauthor={Qinyuan Cheng and Tianxiang Sun and Xiangyang Liu and Wenwei Zhang and Zhangyue Yin and Shimin Li and Linyang Li and Zhengfu He and Kai Chen and Xipeng Qiu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=girxGkdECL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 567769, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2847161660744118386&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "fudan.edu.cn;fudan.edu;fudan.edu.cn;pjlab.org.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;pjlab.org.cn;fudan.edu.cn", "author_num": 10, "aff_unique_index": "0;0;0;1;0;0;0;0;1;0", "aff_unique_norm": "Fudan University;Shanghai AI Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.fudan.edu.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "Fudan;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "SelfIE: Self-Interpretation of Large Language Model Embeddings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33415", "id": "gjgRKbdYR7", "proceeding": "https://proceedings.mlr.press/v235/chen24ao.html", "pdf": "https://openreview.net/pdf?id=gjgRKbdYR7", "openreview": "https://openreview.net/forum?id=gjgRKbdYR7", "author_site": "Haozhe Chen, Carl Vondrick, Chengzhi Mao", "tldr": "", "abstract": "How do large language models (LLMs) obtain their answers? The ability to explain and control an LLM\u2019s reasoning process is key for reliability, transparency, and future model developments. We propose SelfIE (Self-Interpretation of Embeddings), a framework that enables LLMs to interpret their own embeddings in natural language by leveraging their ability to respond to inquiries about a given passage. Capable of interpreting open-world concepts in the hidden embeddings, SelfIE reveals LLM internal reasoning in cases such as making ethical decisions, internalizing prompt injection, and recalling harmful knowledge. SelfIE\u2019s text descriptions on hidden embeddings open avenues to control LLM reasoning. We propose Supervised Control, which allows editing open-ended concepts while requiring gradient computation of only an individual layer. 
We extend RLHF to hidden embeddings and propose Reinforcement Control, which erases harmful knowledge from an LLM without supervision targets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haozhe Chen;Carl Vondrick;Chengzhi Mao", "authorids": "~Haozhe_Chen2;~Carl_Vondrick2;~Chengzhi_Mao2", "gender": "M;M;M", "homepage": ";http://www.cs.columbia.edu/~vondrick/;http://www.cs.columbia.edu/~mcz/", "dblp": ";26/8610;", "google_scholar": ";3MzhkFIAAAAJ;pTTEiHUAAAAJ", "orcid": ";;", "linkedin": "haozhe-chen/;;", "or_profile": "~Haozhe_Chen2;~Carl_Vondrick2;~Chengzhi_Mao2", "aff": "Columbia University;Columbia University;Mila - Quebec Artificial Intelligence Institute", "aff_domain": "columbia.edu;columbia.edu;mila.quebec", "position": "Undergrad student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024selfie,\ntitle={Self{IE}: Self-Interpretation of Large Language Model Embeddings},\nauthor={Haozhe Chen and Carl Vondrick and Chengzhi Mao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gjgRKbdYR7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3779215, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2575383889661528703&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "columbia.edu;columbia.edu;mila.quebec", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Columbia University;Quebec Artificial Intelligence Institute", "aff_unique_dep": ";Artificial Intelligence", "aff_unique_url": "https://www.columbia.edu;https://mila.quebec", "aff_unique_abbr": "Columbia;Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Canada" }, { "title": "VisionGraph: Leveraging Large Multimodal Models for Graph Theory Problems in Visual Context", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33414", "id": "gjoUXwuZdy", "proceeding": "https://proceedings.mlr.press/v235/li24ab.html", "pdf": "https://openreview.net/pdf?id=gjoUXwuZdy", "openreview": "https://openreview.net/forum?id=gjoUXwuZdy", "author_site": "yunxin li, Baotian Hu, Haoyuan Shi, Wei Wang, Longyue Wang, Min Zhang", "tldr": "", "abstract": "Large Multimodal Models (LMMs) have achieved impressive success in visual reasoning, particularly in visual mathematics. However, problem-solving capabilities in graph theory remain less explored for LMMs, despite being a crucial aspect of mathematical reasoning that requires an accurate understanding of graphical structures and multi-step reasoning on visual graphs. To step forward in this direction, we are the first to design a benchmark named **VisionGraph**, used to explore the capabilities of advanced LMMs in solving multimodal graph theory problems. It encompasses eight complex graph problem tasks, from connectivity to shortest path problems. Subsequently, we present a Description-Program-Reasoning (DPR) chain to enhance the logical accuracy of reasoning processes through graphical structure description generation and algorithm-aware multi-step reasoning. 
Our extensive study shows that 1) GPT-4V outperforms Gemini Pro in multi-step graph reasoning; 2) All LMMs exhibit inferior perception accuracy for graphical structures, whether in zero/few-shot settings or with supervised fine-tuning (SFT), which further affects problem-solving performance; 3) DPR significantly improves the multi-step graph reasoning capabilities of LMMs and the GPT-4V (DPR) agent achieves SOTA performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "yunxin li;Baotian Hu;Haoyuan Shi;Wei Wang;Longyue Wang;Min Zhang", "authorids": "~yunxin_li1;~Baotian_Hu1;~Haoyuan_Shi2;~Wei_Wang57;~Longyue_Wang3;~Min_Zhang9", "gender": "M;M;M;M;M;M", "homepage": "https://yunxinli.github.io;;https://github.com/HaoyuanShi;;http://longyuewang.com/;https://zhangmin-nlp-ai.github.io/", "dblp": "11/2484;155/1902;;;127/3421;83/5342-5", "google_scholar": "U98QY0QAAAAJ;5NiJ1VoAAAAJ;;https://scholar.google.com.hk/citations?user=B-pQ7nAAAAAJ;r1ctChkAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0001-7490-684X;;;0000-0002-9062-6183;", "linkedin": ";;;;vincentwang0229/;", "or_profile": "~yunxin_li1;~Baotian_Hu1;~Haoyuan_Shi2;~Wei_Wang57;~Longyue_Wang3;~Min_Zhang9", "aff": "Harbin Institute of Technology;Harbin Institute of Technology, Shenzhen;Jilin University;SUN YAT-SEN UNIVERSITY;Tencent AI Lab;Harbin Institute of Technology, Shenzhen", "aff_domain": "hit.edu.cn;hhit.edu.cn;jlu.edu.cn;sysu.edu.cn;tencent.com;hit.edu.cn", "position": "PhD student;Associate Professor;Undergrad student;Postdoc;Senior Researcher;Full Professor", "bibtex": "@inproceedings{\nli2024visiongraph,\ntitle={VisionGraph: Leveraging Large Multimodal Models for Graph Theory Problems in Visual Context},\nauthor={yunxin li and Baotian Hu and Haoyuan Shi and Wei Wang and Longyue Wang and Min Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gjoUXwuZdy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5493247, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6815209149365274338&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "hit.edu.cn;hhit.edu.cn;jlu.edu.cn;sysu.edu.cn;tencent.com;hit.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;2;3;0", "aff_unique_norm": "Harbin Institute of Technology;Jilin University;Sun Yat-sen University;Tencent", "aff_unique_dep": ";;;Tencent AI Lab", "aff_unique_url": "http://www.hit.edu.cn/;http://www.jlu.edu.cn;http://www.sysu.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "HIT;JLU;SYSU;Tencent AI Lab", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Harbin;Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Characterizing Large Language Model Geometry Helps Solve Toxicity Detection and Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33413", "id": "glfcwSsks8", "proceeding": "https://proceedings.mlr.press/v235/balestriero24a.html", "pdf": "https://openreview.net/pdf?id=glfcwSsks8", "openreview": "https://openreview.net/forum?id=glfcwSsks8", "author_site": "Randall Balestriero, Romain Cosentino, Sarath Shekkizhar", "tldr": "", "abstract": "Large Language Models (LLMs) drive current AI breakthroughs despite very little being known about their internal representations. 
In this work, we propose to shed light on LLMs' inner mechanisms through the lens of geometry. In particular, we develop in closed form $(i)$ the intrinsic dimension in which the Multi-Head Attention embeddings are constrained to exist and $(ii)$ the partition and per-region affine mappings of the feedforward (MLP) network of LLMs' layers. Our theoretical findings further enable the design of novel principled solutions applicable to state-of-the-art LLMs. First, we show that, through our geometric understanding, we can bypass LLMs' RLHF protection by controlling the embedding's intrinsic dimension through informed prompt manipulation. Second, we derive interpretable geometrical features that can be extracted from any (pre-trained) LLM, providing a rich abstract representation of its inputs. We observe that these features are sufficient to help solve toxicity detection, and even allow the identification of various types of toxicity. Our results demonstrate how, even in large-scale regimes, exact theoretical results can answer practical questions in LLMs. Code: https://github.com/RandallBalestriero/SplineLLM", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Randall Balestriero;Romain Cosentino;Sarath Shekkizhar", "authorids": "~Randall_Balestriero1;~Romain_Cosentino2;~Sarath_Shekkizhar1", "gender": "M;M;", "homepage": "https://randallbalestriero.github.io/;;http://shekkizh.github.io", "dblp": "175/5364;;251/3334", "google_scholar": "S1x_xqcAAAAJ;YTY02q0AAAAJ;dWd8dQQAAAAJ", "orcid": ";;", "linkedin": "randallbalestriero/;;", "or_profile": "~Randall_Balestriero1;~Romain_Cosentino2;~Sarath_Shekkizhar1", "aff": "Citadel;;University of Southern California", "aff_domain": "citadel.com;;usc.edu", "position": "Researcher;;PhD student", "bibtex": "@inproceedings{\nbalestriero2024characterizing,\ntitle={Characterizing Large Language Model Geometry Helps Solve Toxicity Detection and Generation},\nauthor={Randall Balestriero and Romain Cosentino and Sarath Shekkizhar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=glfcwSsks8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5938227, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15639890844187061000&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "citadel.com;;usc.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Citadel;University of Southern California", "aff_unique_dep": ";", "aff_unique_url": "https://www.citadel.edu;https://www.usc.edu", "aff_unique_abbr": "Citadel;USC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "StackSight: Unveiling WebAssembly through Large Language Models and Neurosymbolic Chain-of-Thought Decompilation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33412", "id": "gn5AsHIIwb", "proceeding": "https://proceedings.mlr.press/v235/fang24e.html", "pdf": "https://openreview.net/pdf?id=gn5AsHIIwb", "openreview": "https://openreview.net/forum?id=gn5AsHIIwb", "author_site": "Weike Fang, Zhejian Zhou, Junzhou He, Weihang Wang", "tldr": "", "abstract": "WebAssembly enables near-native execution in web applications and is increasingly adopted for tasks that demand high performance and robust security. 
However, its assembly-like syntax, implicit stack machine, and low-level data types make it extremely difficult for human developers to understand, spurring the need for effective WebAssembly reverse engineering techniques. In this paper, we propose StackSight, a novel neurosymbolic approach that combines Large Language Models (LLMs) with advanced program analysis to decompile complex WebAssembly code into readable C++ snippets. StackSight visualizes and tracks virtual stack alterations via a static analysis algorithm and then applies chain-of-thought prompting to harness the LLM's complex reasoning capabilities. Evaluation results show that StackSight significantly improves WebAssembly decompilation. Our user study also demonstrates that code snippets generated by StackSight have significantly higher win rates and enable a better grasp of code semantics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weike Fang;Zhejian Zhou;Junzhou He;Weihang Wang", "authorids": "~Weike_Fang1;~Zhejian_Zhou1;~Junzhou_He1;~Weihang_Wang3", "gender": "M;;Not Specified;", "homepage": ";;https://jz2000.de;", "dblp": "368/8478;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;;", "orcid": "0000-0002-1803-6912;;;", "linkedin": "weikefang/;;;", "or_profile": "~Weike_Fang1;~Zhejian_Zhou1;~Junzhou_He1;~Weihang_Wang3", "aff": "University of Southern California;;University of Southern California;", "aff_domain": "usc.edu;;usc.edu;", "position": "PhD student;;MS student;", "bibtex": "@inproceedings{\nfang2024stacksight,\ntitle={StackSight: Unveiling WebAssembly through Large Language Models and Neurosymbolic Chain-of-Thought Decompilation},\nauthor={Weike Fang and Zhejian Zhou and Junzhou He and Weihang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gn5AsHIIwb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1778395, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15346928856044942570&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "usc.edu;;usc.edu;", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Fast, Scalable, Warm-Start Semidefinite Programming with Spectral Bundling and Sketching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33411", "id": "gqA8ZHO0j8", "proceeding": "https://proceedings.mlr.press/v235/angell24a.html", "pdf": "https://openreview.net/pdf?id=gqA8ZHO0j8", "openreview": "https://openreview.net/forum?id=gqA8ZHO0j8", "author_site": "Rico Angell, Andrew McCallum", "tldr": "", "abstract": "While semidefinite programming (SDP) has traditionally been limited to moderate-sized problems, recent algorithms augmented with matrix sketching techniques have enabled solving larger SDPs. However, these methods achieve scalability at the cost of an increase in the number of necessary iterations, resulting in slower convergence as the problem size grows. 
Furthermore, they require iteration-dependent parameter schedules that prohibit effective utilization of warm-start initializations important in practical applications with incrementally-arriving data or mixed-integer programming. We present Unified Spectral Bundling with Sketching (USBS), a provably correct, fast and scalable algorithm for solving massive SDPs that can leverage a warm-start initialization to further accelerate convergence. Our proposed algorithm is a spectral bundle method for solving general SDPs containing both equality and inequality constraints. Moreover, when augmented with an optional matrix sketching technique, our algorithm achieves the dramatically improved scalability of previous work while sustaining convergence speed. We empirically demonstrate the effectiveness of our method across multiple applications, with and without warm-starting. For example, USBS provides a 500x speed-up over the state-of-the-art scalable SDP solver on an instance with over 2 billion decision variables. We make our implementation in pure JAX publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rico Angell;Andrew McCallum", "authorids": "~Rico_Angell1;~Andrew_McCallum1", "gender": "M;M", "homepage": "https://people.cs.umass.edu/~rangell/;http://www.cs.umass.edu/~mccallum", "dblp": "184/9716;m/AndrewMcCallum", "google_scholar": "https://scholar.google.com/citations?hl=en;yILa1y0AAAAJ", "orcid": ";0009-0004-5487-2848", "linkedin": ";andrew-mccallum-a412", "or_profile": "~Rico_Angell1;~Andrew_McCallum1", "aff": "University of Massachusetts Amherst;University of Massachusetts Amherst", "aff_domain": "cs.umass.edu;cs.umass.edu", "position": "PhD student;Distinguished Professor", "bibtex": "@inproceedings{\nangell2024fast,\ntitle={Fast, Scalable, Warm-Start Semidefinite Programming with Spectral Bundling and Sketching},\nauthor={Rico Angell and Andrew McCallum},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gqA8ZHO0j8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2951317, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2305445104512135118&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "cs.umass.edu;cs.umass.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "LAGMA: LAtent Goal-guided Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33410", "id": "gtYdvSGMYV", "proceeding": "https://proceedings.mlr.press/v235/na24b.html", "pdf": "https://openreview.net/pdf?id=gtYdvSGMYV", "openreview": "https://openreview.net/forum?id=gtYdvSGMYV", "author_site": "Hyungho Na, IL CHUL MOON", "tldr": "", "abstract": "In cooperative multi-agent reinforcement learning (MARL), agents collaborate to achieve common goals, such as defeating enemies and scoring a goal. However, learning goal-reaching paths toward such a semantic goal takes a considerable amount of time in complex tasks and the trained model often fails to find such paths. 
To address this, we present LAtent Goal-guided Multi-Agent reinforcement learning (LAGMA), which generates a goal-reaching trajectory in latent space and provides a latent goal-guided incentive to transitions toward this reference trajectory. LAGMA consists of three major components: (a) quantized latent space constructed via a modified VQ-VAE for efficient sample utilization, (b) goal-reaching trajectory generation via extended VQ codebook, and (c) latent goal-guided intrinsic reward generation to encourage transitions towards the sampled goal-reaching path. The proposed method is evaluated on StarCraft II, under both dense and sparse reward settings, and on Google Research Football. Empirical results show further performance improvement over state-of-the-art baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyungho Na;Il-chul Moon", "authorids": "~Hyungho_Na1;~Il-chul_Moon1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nna2024lagma,\ntitle={{LAGMA}: {LA}tent Goal-guided Multi-Agent Reinforcement Learning},\nauthor={Hyungho Na and Il-chul Moon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gtYdvSGMYV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9257981, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11208917341665551127&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": ";", "author_num": 2 }, { "title": "Generalized Preference Optimization: A Unified Approach to Offline Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33409", "id": "gu3nacA9AH", "proceeding": "https://proceedings.mlr.press/v235/tang24b.html", "pdf": "https://openreview.net/pdf?id=gu3nacA9AH", "openreview": "https://openreview.net/forum?id=gu3nacA9AH", "author_site": "Yunhao Tang, Zhaohan Guo, Zeyu Zheng, Daniele Calandriello, REMI MUNOS, Mark Rowland, Pierre Richemond, Michal Valko, Bernardo Avila Pires, Bilal Piot", "tldr": "", "abstract": "Offline preference optimization allows fine-tuning large models directly from offline data, and has proved effective in recent alignment practices. We propose generalized preference optimization (GPO), a family of offline losses parameterized by a general class of convex functions. GPO enables a unified view over preference optimization, encompassing existing algorithms such as DPO, IPO and SLiC as special cases, while naturally introducing new variants. The GPO framework also sheds light on how offline algorithms enforce regularization, through the design of the convex function that defines the loss. Our analysis and experiments reveal the connections and subtle differences between the offline regularization and the KL divergence regularization intended by the canonical RLHF formulation. In a controlled setting akin to Gao et al. (2023), we also show that different GPO variants achieve similar trade-offs between regularization and performance, though the optimal values of the hyper-parameters might differ as predicted by theory. 
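This unified view admits a compact expression: a single loss f(beta * rho) on the policy/reference log-ratio margin rho between the chosen and rejected responses, with the convex function f selecting the variant. A minimal sketch follows; the constants in the hinge and squared rows are illustrative assumptions, so consult the paper for the exact parameterizations.

```python
# A minimal sketch of the GPO family: swap the convex function f to move
# between DPO-, SLiC-, and IPO-style offline preference losses.
import numpy as np

def gpo_loss(logp_w, logp_l, ref_w, ref_l, f, beta=0.1):
    # rho: difference of log-ratios (policy vs reference), chosen minus rejected.
    rho = (logp_w - ref_w) - (logp_l - ref_l)
    return f(beta * rho).mean()

variants = {
    "DPO  (logistic)": lambda t: np.log1p(np.exp(-t)),   # -log sigmoid(t)
    "SLiC (hinge)":    lambda t: np.maximum(0.0, 1.0 - t),
    "IPO  (squared)":  lambda t: (t - 0.5) ** 2,
}

rng = np.random.default_rng(0)
logp_w, logp_l = rng.normal(-10, 2, 256), rng.normal(-12, 2, 256)
ref_w, ref_l = rng.normal(-11, 2, 256), rng.normal(-11, 2, 256)
for name, f in variants.items():
    print(name, gpo_loss(logp_w, logp_l, ref_w, ref_l, f))
```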
In all, our results present new algorithmic toolkits and empirical insights to alignment practitioners.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunhao Tang;Zhaohan Daniel Guo;Zeyu Zheng;Daniele Calandriello;Remi Munos;Mark Rowland;Pierre Harvey Richemond;Michal Valko;Bernardo Avila Pires;Bilal Piot", "authorids": "~Yunhao_Tang1;~Zhaohan_Daniel_Guo1;~Zeyu_Zheng1;~Daniele_Calandriello1;~Remi_Munos1;~Mark_Rowland1;~Pierre_Harvey_Richemond1;~Michal_Valko1;~Bernardo_Avila_Pires1;~Bilal_Piot1", "gender": "M;M;M;M;M;M;M;M;M;M", "homepage": "https://robintyh1.github.io;;http://www-personal.umich.edu/~zeyu/;;http://researchers.lille.inria.fr/~munos/;http://sites.google.com/view/markrowland;;https://misovalko.github.io/research.html;;", "dblp": "210/2229;160/9943;48/7883;129/1542;69/6815;86/4090;200/8842;03/5455;124/8971;", "google_scholar": ";fxr_9oQAAAAJ;;;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=-0U84zMAAAAJ;;jrazNCQAAAAJ;WpAH4iUAAAAJ;https://scholar.google.fr/citations?user=fqxNUREAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;michalvalko/;;", "or_profile": "~Yunhao_Tang1;~Zhaohan_Daniel_Guo1;~Zeyu_Zheng1;~Daniele_Calandriello1;~Remi_Munos1;~Mark_Rowland1;~Pierre_Harvey_Richemond1;~Michal_Valko1;~Bernardo_Avila_Pires1;~Bilal_Piot1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Imperial College London;Meta;Google DeepMind;University Lille", "aff_domain": "deepmind.com;deepmind.com;deepmind.com;deepmind.com;google.com;google.com;imperial.ac.uk;meta.com;google.com;univ-lille1.fr", "position": "Research Scientist;Research Scientist;Research Scientist;Researcher;Research scientist;Research Scientist;Visiting Researcher;Principal Researcher;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\ntang2024generalized,\ntitle={Generalized Preference Optimization: A Unified Approach to Offline Alignment},\nauthor={Yunhao Tang and Zhaohan Daniel Guo and Zeyu Zheng and Daniele Calandriello and Remi Munos and Mark Rowland and Pierre Harvey Richemond and Michal Valko and Bernardo Avila Pires and Bilal Piot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gu3nacA9AH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1637268, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16220126631894221059&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "deepmind.com;deepmind.com;deepmind.com;deepmind.com;google.com;google.com;imperial.ac.uk;meta.com;google.com;univ-lille1.fr", "author_num": 10, "aff_unique_index": "0;0;0;0;0;0;1;2;0;3", "aff_unique_norm": "Google;Imperial College London;Meta;University of Lille", "aff_unique_dep": "Google DeepMind;;Meta Platforms, Inc.;", "aff_unique_url": "https://deepmind.com;https://www.imperial.ac.uk;https://meta.com;https://www.univ-lille.fr", "aff_unique_abbr": "DeepMind;ICL;Meta;ULille", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;2", "aff_country_unique": "United Kingdom;United States;France" }, { "title": "Equivariance via Minimal Frame Averaging for More Symmetries and Efficiency", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33408", "id": "guFsTBXsov", "proceeding": "https://proceedings.mlr.press/v235/lin24i.html", "pdf": 
"https://openreview.net/pdf?id=guFsTBXsov", "openreview": "https://openreview.net/forum?id=guFsTBXsov", "author_site": "Yuchao Lin, Jacob Helwig, Shurui Gui, Shuiwang Ji", "tldr": "", "abstract": "We consider achieving equivariance in machine learning systems via frame averaging. Current frame averaging methods involve a costly sum over large frames or rely on sampling-based approaches that only yield approximate equivariance. Here, we propose Minimal Frame Averaging (MFA), a mathematical framework for constructing provably minimal frames that are exactly equivariant. The general foundations of MFA also allow us to extend frame averaging to more groups than previously considered, including the Lorentz group for describing symmetries in space-time, and the unitary group for complex-valued domains. Results demonstrate the efficiency and effectiveness of encoding symmetries via MFA across a diverse range of tasks, including $n$-body simulation, top tagging in collider physics, and relaxed energy prediction. Our code is available at https://github.com/divelab/MFA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchao Lin;Jacob Helwig;Shurui Gui;Shuiwang Ji", "authorids": "~Yuchao_Lin1;~Jacob_Helwig1;~Shurui_Gui1;~Shuiwang_Ji1", "gender": "M;M;M;M", "homepage": "https://kruskallin.github.io/;https://jacobhelwig.github.io/;https://cm-bf.github.io;http://people.tamu.edu/~sji", "dblp": "322/5499;349/0477.html;272/0674.html;84/6405", "google_scholar": ";NtqpyUAAAAAJ;U4AjtOkAAAAJ;BZGj6sAAAAAJ", "orcid": ";0000-0001-7718-7449;;0000-0002-4205-4563", "linkedin": ";jacob-helwig/;;shuiwang-ji-9a040715/", "or_profile": "~Yuchao_Lin1;~Jacob_Helwig1;~Shurui_Gui1;~Shuiwang_Ji1", "aff": "Texas A&M;Texas A&M University - College Station;Texas A&M University;Texas A&M University", "aff_domain": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "position": "PhD student;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nlin2024equivariance,\ntitle={Equivariance via Minimal Frame Averaging for More Symmetries and Efficiency},\nauthor={Yuchao Lin and Jacob Helwig and Shurui Gui and Shuiwang Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=guFsTBXsov}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 839925, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9090327534243774591&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tamu.edu;tamu.edu;tamu.edu;tamu.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Q-Probe: A Lightweight Approach to Reward Maximization for Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33407", "id": "gxOQEMRbRa", "proceeding": "https://proceedings.mlr.press/v235/li24ae.html", "pdf": "https://openreview.net/pdf?id=gxOQEMRbRa", "openreview": "https://openreview.net/forum?id=gxOQEMRbRa", "author_site": "Kenneth Li, Samy Jelassi, Hugh Zhang, Sham Kakade, Martin Wattenberg, David Brandfonbrener", "tldr": "", "abstract": "We present an approach called Q-probing to adapt a pre-trained language model to maximize a 
task-specific reward function. At a high level, Q-probing sits between heavier approaches such as finetuning and lighter approaches such as few-shot prompting, but can also be combined with either. The idea is to learn a simple linear function on a model's embedding space that can be used to reweight candidate completions. We theoretically show that this sampling procedure is equivalent to a KL-constrained maximization of the Q-probe as the number of samples increases. To train the Q-probes, we consider either reward modeling or a class of novel direct policy learning objectives based on importance-weighted policy gradients. With this technique, we see gains in domains with ground-truth rewards (code generation) as well as implicit rewards defined by preference data, even outperforming finetuning in data-limited regimes. Moreover, a Q-probe can be trained on top of an API since it only assumes access to sampling and embeddings. Code: [https://github.com/likenneth/q_probe](https://github.com/likenneth/q_probe).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kenneth Li;Samy Jelassi;Hugh Zhang;Sham M. Kakade;Martin Wattenberg;David Brandfonbrener", "authorids": "~Kenneth_Li1;~Samy_Jelassi1;~Hugh_Zhang1;~Sham_M._Kakade1;~Martin_Wattenberg1;~David_Brandfonbrener1", "gender": ";M;;M;M;M", "homepage": "https://likenneth.github.io/;https://sjelassi.github.io/;;https://shamulent.github.io;http://www.bewitched.com;https://davidbrandfonbrener.github.io", "dblp": "75/6627-12;222/3149;239/4076;s/SMKakade;w/MartinWattenberg;214/9461", "google_scholar": "v0GItgwAAAAJ;;;https://scholar.google.com.tw/citations?user=wb-DKCIAAAAJ;pv54dqMAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Kenneth_Li1;~Samy_Jelassi1;~Hugh_Zhang1;~Sham_M._Kakade1;~Martin_Wattenberg1;~David_Brandfonbrener1", "aff": "Harvard University;Harvard University;Harvard University;Harvard University;Google;Harvard University", "aff_domain": "harvard.edu;harvard.edu;harvard.edu;harvard.edu;google.com;harvard.edu", "position": "PhD student;Postdoc;PhD student;Full Professor;Principal Researcher;Postdoc", "bibtex": "@inproceedings{\nli2024qprobe,\ntitle={Q-Probe: A Lightweight Approach to Reward Maximization for Language Models},\nauthor={Kenneth Li and Samy Jelassi and Hugh Zhang and Sham M.
Kakade and Martin Wattenberg and David Brandfonbrener},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gxOQEMRbRa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 745663, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5230380006686869825&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "harvard.edu;harvard.edu;harvard.edu;harvard.edu;google.com;harvard.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Harvard University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.harvard.edu;https://www.google.com", "aff_unique_abbr": "Harvard;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Expert Proximity as Surrogate Rewards for Single Demonstration Imitation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33406", "id": "gzis9n5r7e", "proceeding": "https://proceedings.mlr.press/v235/chiang24a.html", "pdf": "https://openreview.net/pdf?id=gzis9n5r7e", "openreview": "https://openreview.net/forum?id=gzis9n5r7e", "author_site": "Chia-Cheng Chiang, Li-Cheng Lan, Wei-Fang Sun, Chien Feng, Cho-Jui Hsieh, Chun-Yi Lee", "tldr": "", "abstract": "In this paper, we focus on single-demonstration imitation learning (IL), a practical approach for real-world applications where acquiring multiple expert demonstrations is costly or infeasible and the ground truth reward function is not available. In contrast to typical IL settings with multiple demonstrations, single-demonstration IL involves an agent having access to only one expert trajectory. We highlight the issue of sparse reward signals in this setting and propose to mitigate this issue through our Transition Discriminator-based IL (TDIL) method. TDIL is an IRL method designed to address reward sparsity by introducing a denser surrogate reward function that considers environmental dynamics. This surrogate reward function encourages the agent to navigate towards states that are proximal to expert states. In practice, TDIL trains a transition discriminator to differentiate between valid and non-valid transitions in a given environment to compute the surrogate rewards.
The experiments demonstrate that TDIL outperforms existing IL approaches and achieves expert-level performance in the single-demonstration IL setting across five widely adopted MuJoCo benchmarks as well as the \"Adroit Door\" robotic environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chia-Cheng Chiang;Li-Cheng Lan;Wei-Fang Sun;Chien Feng;Cho-Jui Hsieh;Chun-Yi Lee", "authorids": "~Chia-Cheng_Chiang1;~Li-Cheng_Lan1;~Wei-Fang_Sun1;~Chien_Feng2;~Cho-Jui_Hsieh1;~Chun-Yi_Lee1", "gender": "M;M;;M;M;M", "homepage": "https://lan-lc.github.io/;;;http://web.cs.ucla.edu/~chohsieh/index.html;https://elsalab.ai;", "dblp": "200/8672;275/9039;;14/2770;36/3668;", "google_scholar": "https://scholar.google.com.tw/citations?view_op=list_works;TgMlVRUAAAAJ;;Wy89g4IAAAAJ;https://scholar.google.com.tw/citations?user=5mYNdo0AAAAJ;", "orcid": ";;;;0000-0002-4680-4800;", "linkedin": ";;chien-feng-56300320b;;;%E5%AE%B6%E4%B8%9E-%E6%B1%9F-0a637017a/", "or_profile": "~Li-Cheng_Lan1;~Wei-Fang_Sun1;~Chien_Feng2;~Cho-Jui_Hsieh1;~Chun-Yi_Lee1;~Stanley_Chiang1", "aff": "University of California, Los Angeles;NVIDIA AI Technology Center;Department of Computer Science, National Tsing Hua University, National Tsinghua University;University of California, Los Angeles;National Tsing Hua University;National Tsing Hua University", "aff_domain": "ucla.edu;nvidia.com;cs.nthu.edu.tw;ucla.edu;nthu.edu.tw;nthu.edu.tw", "position": "PhD student;Researcher;MS student;Associate Professor;Full Professor;MS student", "bibtex": "@inproceedings{\nchiang2024expert,\ntitle={Expert Proximity as Surrogate Rewards for Single Demonstration Imitation Learning},\nauthor={Chia-Cheng Chiang and Li-Cheng Lan and Wei-Fang Sun and Chien Feng and Cho-Jui Hsieh and Chun-Yi Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gzis9n5r7e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3256462, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YDYF5TX8JXYJ:scholar.google.com/&scioq=Expert+Proximity+as+Surrogate+Rewards+for+Single+Demonstration+Imitation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "ucla.edu;nvidia.com;cs.nthu.edu.tw;ucla.edu;nthu.edu.tw;nthu.edu.tw", "author_num": 6, "aff_unique_index": "0;1;2;0;3;3", "aff_unique_norm": "University of California, Los Angeles;NVIDIA;National Tsinghua University;National Tsing Hua University", "aff_unique_dep": ";NVIDIA AI Technology Center;Department of Computer Science;", "aff_unique_url": "https://www.ucla.edu;https://www.nvidia.com/en-us/research/;https://www.tsinghua.edu.cn;https://www.nthu.edu.tw", "aff_unique_abbr": "UCLA;NVIDIA;THU;NTHU", "aff_campus_unique_index": "0;0;2;2", "aff_campus_unique": "Los Angeles;;Taiwan", "aff_country_unique_index": "0;0;1;0;1;1", "aff_country_unique": "United States;China" }, { "title": "Adaptive Sampling of k-Space in Magnetic Resonance for Rapid Pathology Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33405", "id": "h2uBuQvpp8", "proceeding": "https://proceedings.mlr.press/v235/yen24a.html", "pdf": "https://openreview.net/pdf?id=h2uBuQvpp8", "openreview": "https://openreview.net/forum?id=h2uBuQvpp8", "author_site": "Chen-Yu Yen, raghav singhal, Umang Sharma, Rajesh Ranganath, Sumit Chopra, Lerrel Pinto", "tldr": "", "abstract": "Magnetic Resonance (MR) imaging, despite its 
proven diagnostic utility, remains an inaccessible imaging modality for disease surveillance at the population level. A major factor rendering MR inaccessible is lengthy scan times. An MR scanner collects measurements associated with the underlying anatomy in the Fourier space, also known as the k-space. Creating a high-fidelity image requires collecting large quantities of such measurements, increasing the scan time. Traditionally, to accelerate an MR scan, image reconstruction from under-sampled k-space data is the method of choice. However, recent works show the feasibility of bypassing image reconstruction and directly learning to detect disease from a sparser learned subset of the k-space measurements. In this work, we propose Adaptive Sampling for MR (ASMR), a sampling method that learns an adaptive policy to sequentially select k-space samples to optimize for target disease detection. On 6 out of 8 pathology classification tasks spanning the Knee, Brain, and Prostate MR scans, ASMR reaches within 2% of the performance of a fully sampled classifier while using only 8% of the k-space, as well as outperforming prior state-of-the-art work in k-space sampling such as EMRT, LOUPE, and DPS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen-Yu Yen;Raghav Singhal;Umang Sharma;Rajesh Ranganath;Sumit Chopra;Lerrel Pinto", "authorids": "~Chen-Yu_Yen1;~Raghav_Singhal1;us453@nyu.edu;~Rajesh_Ranganath2;~Sumit_Chopra1;~Lerrel_Pinto1", "gender": "M;;;;M;M", "homepage": ";;;;https://www.spchopra.net;https://www.lerrelpinto.com/", "dblp": ";;;97/7057;68/4681;168/8304", "google_scholar": "OqRO_psAAAAJ;8IWpqtcAAAAJ;;;https://scholar.google.com/citations?hl=en;pmVPj94AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Chen-Yu_Yen1;~Raghav_Singhal1;us453@nyu.edu;~Rajesh_Ranganath2;~Sumit_Chopra1;~Lerrel_Pinto1", "aff": "New York University;New York University;;New York University;NYU Grossman School of Medicine;New York University", "aff_domain": "nyu.edu;nyu.edu;;nyu.edu;nyulangone.org;cs.nyu.edu", "position": "PhD student;PhD student;;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nyen2024adaptive,\ntitle={Adaptive Sampling of k-Space in Magnetic Resonance for Rapid Pathology Prediction},\nauthor={Chen-Yu Yen and Raghav Singhal and Umang Sharma and Rajesh Ranganath and Sumit Chopra and Lerrel Pinto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=h2uBuQvpp8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2826536, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17650401410015216156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nyu.edu;nyu.edu;;nyu.edu;nyulangone.org;cs.nyu.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "New York University;New York University Grossman School of Medicine", "aff_unique_dep": ";School of Medicine", "aff_unique_url": "https://www.nyu.edu;https://med.nyu.edu", "aff_unique_abbr": "NYU;NYU Grossman SOM", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Infinite-Horizon Distributionally Robust Regret-Optimal Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33404", "id": "h3SGdpI4Ta", "proceeding":
"https://proceedings.mlr.press/v235/kargin24a.html", "pdf": "https://openreview.net/pdf?id=h3SGdpI4Ta", "openreview": "https://openreview.net/forum?id=h3SGdpI4Ta", "author_site": "Taylan Kargin, Joudi Hajar, Vikrant Malik, Babak Hassibi", "tldr": "", "abstract": "We study the infinite-horizon distributionally robust (DR) control of linear systems with quadratic costs, where disturbances have unknown, possibly time-correlated distribution within a Wasserstein-2 ambiguity set. We aim to minimize the worst-case expected regret\u2014the excess cost of a causal policy compared to a non-causal one with access to future disturbance. Though the optimal policy lacks a finite-order state-space realization (i.e., it is non-rational), it can be characterized by a finite-dimensional parameter. Leveraging this, we develop an efficient frequency-domain algorithm to compute this optimal control policy and present a convex optimization method to construct a near-optimal state-space controller that approximates the optimal non-rational controller in the $\\mathit{H}_\\infty$-norm. This approach avoids solving a computationally expensive semi-definite program (SDP) that scales with the time horizon in the finite-horizon setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taylan Kargin;Joudi Hajar;Vikrant Malik;Babak Hassibi", "authorids": "~Taylan_Kargin1;~Joudi_Hajar1;vmalik@caltech.edu;~Babak_Hassibi1", "gender": "M;F;;M", "homepage": "https://tkargin.github.io;;;https://ee.caltech.edu/people/hassibi", "dblp": "322/5556;;;09/1803.html", "google_scholar": "5VpXWyIAAAAJ;Rff6VZsAAAAJ;;1XoZPhEAAAAJ", "orcid": "0000-0001-6744-654X;;;", "linkedin": "taylan-kargin/;;;babak-hassibi-2853614/", "or_profile": "~Taylan_Kargin1;~Joudi_Hajar1;vmalik@caltech.edu;~Babak_Hassibi1", "aff": "California Institute of Technology;California Institute of Technology;;California Institute of Technology", "aff_domain": "caltech.edu;caltech.edu;;caltech.edu", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nkargin2024infinitehorizon,\ntitle={Infinite-Horizon Distributionally Robust Regret-Optimal Control},\nauthor={Taylan Kargin and Joudi Hajar and Vikrant Malik and Babak Hassibi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=h3SGdpI4Ta}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2152762, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7599773057556115587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "caltech.edu;caltech.edu;;caltech.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "California Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.caltech.edu", "aff_unique_abbr": "Caltech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "SiBBlInGS: Similarity-driven Building-Block Inference using Graphs across States", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33403", "id": "h8aTi32tul", "proceeding": "https://proceedings.mlr.press/v235/mudrik24a.html", "pdf": "https://openreview.net/pdf?id=h8aTi32tul", "openreview": "https://openreview.net/forum?id=h8aTi32tul", "author_site": "Noga Mudrik, Gal Mishne, Adam Charles", "tldr": "", "abstract": "Time series data 
across scientific domains are often collected under distinct states (e.g., tasks), wherein latent processes (e.g., biological factors) create complex inter- and intra-state variability. A key approach to capture this complexity is to uncover fundamental interpretable units within the data, Building Blocks (BBs), which modulate their activity and adjust their structure across observations. Existing methods for identifying BBs in multi-way data often overlook inter- vs. intra-state variability, produce uninterpretable components, or do not align with properties of real-world data, such as missing samples and sessions of different duration. Here, we present a framework for Similarity-driven Building Block Inference using Graphs across States (SiBBlInGS). SiBBlInGS offers a graph-based dictionary learning approach for discovering sparse BBs along with their temporal traces, based on co-activity patterns and inter- vs. intra-state relationships. Moreover, SiBBlInGS captures per-trial temporal variability and controlled cross-state structural BB adaptations, identifies state-specific vs. state-invariant components, and accommodates variability in the number and duration of observed sessions across states. We demonstrate SiBBlInGS's ability to reveal insights into complex phenomena as well as its robustness to noise and missing samples through several synthetic and real-world examples, including web search and neural data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Noga Mudrik;Gal Mishne;Adam Shabti Charles", "authorids": "~Noga_Mudrik1;~Gal_Mishne1;~Adam_Shabti_Charles1", "gender": ";F;M", "homepage": ";http://mishne.ucsd.edu/;https://www.bme.jhu.edu/ascharles/", "dblp": ";125/3214;04/10257", "google_scholar": ";KrwpdXYAAAAJ;c8RKLp0AAAAJ", "orcid": ";0000-0002-5287-3626;", "linkedin": ";;", "or_profile": "~Noga_Mudrik1;~Gal_Mishne1;~Adam_Shabti_Charles1", "aff": ";University of California, San Diego;Johns Hopkins University", "aff_domain": ";ucsd.edu;jhu.edu", "position": ";Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nmudrik2024sibblings,\ntitle={Si{BB}lIn{GS}: Similarity-driven Building-Block Inference using Graphs across States},\nauthor={Noga Mudrik and Gal Mishne and Adam Shabti Charles},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=h8aTi32tul}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9515053, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=389601076059086212&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";ucsd.edu;jhu.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, San Diego;Johns Hopkins University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.jhu.edu", "aff_unique_abbr": "UCSD;JHU", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Discovering Features with Synergistic Interactions in Multiple Views", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33402", "id": "hFEgae0od4", "proceeding": "https://proceedings.mlr.press/v235/kim24ag.html", "pdf": "https://openreview.net/pdf?id=hFEgae0od4", "openreview": "https://openreview.net/forum?id=hFEgae0od4", "author_site": "Chohee Kim, M van der Schaar, Changhee 
Lee", "tldr": "", "abstract": "Discovering features with synergistic interactions in multi-view data, that provide more information gain when considered together than when considered separately, is particularly valuable. This fosters a more comprehensive understanding of the target outcome from diverse perspectives (views). However, despite the increasing opportunities presented by multi-view data, surprisingly little attention has been paid to uncovering these crucial interactions. To address this gap, we formally define the problem of selecting synergistic and non-synergistic feature subsets in multi-view data, leveraging an information-theoretic concept known as interaction information. To this end, we introduce a novel deep learning-based feature selection method that identifies different interactions across multiple views, employing a Bernoulli relaxation technique to solve this intractable subset searching problem. Experiments on synthetic, semi-synthetic, and real-world multi-view datasets demonstrate that our model discovers relevant feature subsets with synergistic and non-synergistic interactions, achieving remarkable similarity to the ground truth. Furthermore, we corroborate the discovered features with supporting medical and scientific literature, underscoring its utility in elucidating complex dependencies and interactions in multi-view data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chohee Kim;Mihaela van der Schaar;Changhee Lee", "authorids": "~Chohee_Kim1;~Mihaela_van_der_Schaar2;~Changhee_Lee1", "gender": "F;F;", "homepage": ";https://www.vanderschaar-lab.com;", "dblp": ";;", "google_scholar": ";DZ3S--MAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "chohee-kim-9b5064b7/;;", "or_profile": "~Chohee_Kim1;~Mihaela_van_der_Schaar2;~Changhee_Lee1", "aff": "Chung-Ang University;University of California, Los Angeles;ChungAng University", "aff_domain": "cau.ac.kr;ucla.edu;cau.ac.kr", "position": "MS student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nkim2024discovering,\ntitle={Discovering Features with Synergistic Interactions in Multiple Views},\nauthor={Chohee Kim and Mihaela van der Schaar and Changhee Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hFEgae0od4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 577361, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:F44CseC87zoJ:scholar.google.com/&scioq=Discovering+Features+with+Synergistic+Interactions+in+Multiple+Views&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "cau.ac.kr;ucla.edu;cau.ac.kr", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Chung-Ang University;University of California, Los Angeles;Chungang University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.cau.ac.kr;https://www.ucla.edu;http://www.cau.ac.kr", "aff_unique_abbr": "CAU;UCLA;CAU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "Keep the Momentum: Conservation Laws beyond Euclidean Gradient Flows", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33401", "id": "hG6gddAKnJ", "proceeding": "https://proceedings.mlr.press/v235/marcotte24a.html", "pdf": 
"https://openreview.net/pdf?id=hG6gddAKnJ", "openreview": "https://openreview.net/forum?id=hG6gddAKnJ", "author_site": "Sibylle Marcotte, R\u00e9mi Gribonval, Gabriel Peyr\u00e9", "tldr": "", "abstract": "Conservation laws are well-established in the context of Euclidean gradient flow dynamics, notably for linear or ReLU neural network training. Yet, their existence and principles for non-Euclidean geometries and momentum-based dynamics remain largely unknown. In this paper, we characterize \"all\" conservation laws in this general setting. In stark contrast to the case of gradient flows, we prove that the conservation laws for momentum-based dynamics exhibit temporal dependence. Additionally, we often observe a \"conservation loss\" when transitioning from gradient flow to momentum dynamics. Specifically, for linear networks, our framework allows us to identify all momentum conservation laws, which are less numerous than in the gradient flow case except in sufficiently over-parameterized regimes. With ReLU networks, no conservation law remains. This phenomenon also manifests in non-Euclidean metrics, used e.g. for Nonnegative Matrix Factorization (NMF): all conservation laws can be determined in the gradient flow context, yet none persists in the momentum case.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sibylle Marcotte;R\u00e9mi Gribonval;Gabriel Peyr\u00e9", "authorids": "~Sibylle_Marcotte1;~R\u00e9mi_Gribonval1;~Gabriel_Peyr\u00e92", "gender": "F;;M", "homepage": ";;http://gpeyre.com/", "dblp": "291/4809;;65/1759", "google_scholar": ";;https://scholar.google.fr/citations?user=KqA1dYcAAAAJ", "orcid": "0000-0002-2238-2973;;", "linkedin": ";;", "or_profile": "~Sibylle_Marcotte1;~R\u00e9mi_Gribonval1;~Gabriel_Peyr\u00e92", "aff": "Ecole Normale Sup\u00e9rieure de Paris;;CNRS", "aff_domain": "ens.fr;;cnrs.fr", "position": "PhD student;;Researcher", "bibtex": "@inproceedings{\nmarcotte2024keep,\ntitle={Keep the Momentum: Conservation Laws beyond Euclidean Gradient Flows},\nauthor={Sibylle Marcotte and R{\\'e}mi Gribonval and Gabriel Peyr{\\'e}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hG6gddAKnJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 756903, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zWV-m1B8d7cJ:scholar.google.com/&scioq=Keep+the+Momentum:+Conservation+Laws+beyond+Euclidean+Gradient+Flows&hl=en&as_sdt=0,44", "gs_version_total": 8, "email": "ens.fr;;cnrs.fr", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure de Paris;Centre National de la Recherche Scientifique", "aff_unique_dep": ";", "aff_unique_url": "https://www.ens.fr;https://www.cnrs.fr", "aff_unique_abbr": "ENS Paris;CNRS", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Multiply Robust Estimation for Local Distribution Shifts with Multiple Domains", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33400", "id": "hJaWoU3Emh", "proceeding": "https://proceedings.mlr.press/v235/wilkins-reeves24a.html", "pdf": "https://openreview.net/pdf?id=hJaWoU3Emh", "openreview": "https://openreview.net/forum?id=hJaWoU3Emh", "author_site": "Steven Wilkins-Reeves, Xu Chen, Qi Ma, christine agarwal, Aude Hofleitner", "tldr": "", 
"abstract": "Distribution shifts are ubiquitous in real-world machine learning applications, posing a challenge to the generalization of models trained on one data distribution to another. We focus on scenarios where data distributions vary across multiple segments of the entire population and only make local assumptions about the differences between training and test (deployment) distributions within each segment. We propose a two-stage multiply robust estimation method to improve model performance on each individual segment for tabular data analysis. The method involves fitting a linear combination of the based models, learned using clusters of training data from multiple segments, followed by a refinement step for each segment. Our method is designed to be implemented with commonly used off-the-shelf machine learning models. We establish theoretical guarantees on the generalization bound of the method on the test risk. With extensive experiments on synthetic and real datasets, we demonstrate that the proposed method substantially improves over existing alternatives in prediction accuracy and robustness on both regression and classification tasks. We also assess its effectiveness on a user city prediction dataset from Meta.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Steven Wilkins-Reeves;Xu Chen;Qi Ma;christine agarwal;Aude Hofleitner", "authorids": "~Steven_Wilkins-Reeves1;~Xu_Chen23;qima@meta.com;~christine_agarwal1;~Aude_Hofleitner1", "gender": "M;M;;;F", "homepage": "https://stevejwr.github.io/;;;;https://research.facebook.com/people/hofleitner-aude/", "dblp": ";;;;76/11044", "google_scholar": ";;;;w-xdg4sAAAAJ", "orcid": "0000-0001-5492-6966;;;;", "linkedin": ";xuchen0202/;;christinekuang/;audehofleitner/", "or_profile": "~Steven_Wilkins-Reeves1;~Xu_Chen23;qima@meta.com;~christine_agarwal1;~Aude_Hofleitner1", "aff": ";Meta;;Meta Facebook;Meta", "aff_domain": ";meta.com;;fb.com;meta.com", "position": ";Research Scientist;;Researcher;Researcher", "bibtex": "@inproceedings{\nwilkins-reeves2024multiply,\ntitle={Multiply Robust Estimation for Local Distribution Shifts with Multiple Domains},\nauthor={Steven Wilkins-Reeves and Xu Chen and Qi Ma and christine agarwal and Aude Hofleitner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hJaWoU3Emh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1349808, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3703589544601924405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";meta.com;;fb.com;meta.com", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Hyperbolic Active Learning for Semantic Segmentation under Domain Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33399", "id": "hKdJPMQvew", "proceeding": "https://proceedings.mlr.press/v235/franco24a.html", "pdf": "https://openreview.net/pdf?id=hKdJPMQvew", "openreview": "https://openreview.net/forum?id=hKdJPMQvew", "author_site": "Luca Franco, Paolo Mandica, Konstantinos Kallidromitis, Devin Guillory, Yu-Teng Li, Trevor Darrell, Fabio Galasso", 
"tldr": "", "abstract": "We introduce a hyperbolic neural network approach to pixel-level active learning for semantic segmentation. Analysis of the data statistics leads to a novel interpretation of the hyperbolic radius as an indicator of data scarcity. In HALO (Hyperbolic Active Learning Optimization), for the first time, we propose the use of epistemic uncertainty as a data acquisition strategy, following the intuition of selecting data points that are the least known. The hyperbolic radius, complemented by the widely-adopted prediction entropy, effectively approximates epistemic uncertainty. We perform extensive experimental analysis based on two established synthetic-to-real benchmarks, i.e. GTAV $\\rightarrow$ Cityscapes and SYNTHIA $\\rightarrow$ Cityscapes. Additionally, we test HALO on Cityscape $\\rightarrow$ ACDC for domain adaptation under adverse weather conditions, and we benchmark both convolutional and attention-based backbones. HALO sets a new state-of-the-art in active learning for semantic segmentation under domain shift and it is the first active learning approach that surpasses the performance of supervised domain adaptation while using only a small portion of labels (i.e., 1%).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Franco;Paolo Mandica;Konstantinos Kallidromitis;Devin Guillory;Yu-Teng Li;Trevor Darrell;Fabio Galasso", "authorids": "~Luca_Franco1;~Paolo_Mandica1;~Konstantinos_Kallidromitis1;~Devin_Guillory1;~Yu-Teng_Li1;~Trevor_Darrell2;~Fabio_Galasso1", "gender": "M;M;M;M;M;M;M", "homepage": "https://fraluca.github.io/;https://paolomandica.github.io/;https://github.com/KKallidromitis;https://www.devinguillory.com/;;https://fgalasso.bitbucket.io/;https://people.eecs.berkeley.edu/~trevor/", "dblp": "304/2582;342/3996;;188/1061;;48/3897;d/TrevorDarrell", "google_scholar": "https://scholar.google.com/citations?hl=it;https://scholar.google.com/citations?hl=en;5EuNtbQAAAAJ;t4dSV4YAAAAJ;;https://scholar.google.de/citations?user=2gSuGBEAAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": "0000-0003-0107-6755;0000-0002-4493-2497;;;;0000-0003-1875-7813;", "linkedin": "luca-franco-968819196/;paolo-mandica/;kkall/;devin-guillory-78528958/;yutengli/;fabio-galasso-61141b32/;", "or_profile": "~Luca_Franco1;~Paolo_Mandica1;~Konstantinos_Kallidromitis1;~Devin_Guillory1;~Yu-Teng_Li1;~Fabio_Galasso1;~trevor_darrell1", "aff": "University of Roma \"La Sapienza\";University of Roma \"La Sapienza\";Panasonic;University of California, Berkeley;Adobe Systems;University of Roma \"La Sapienza\";Electrical Engineering & Computer Science Department", "aff_domain": "uniroma1.it;uniroma1.it;us.panasonic.com;berkeley.edu;adobe.com;uniroma1.it;eecs.berkeley.edu", "position": "PhD student;PhD student;AI Research Engineer;PhD student;Researcher;Associate Professor;Professor", "bibtex": "@inproceedings{\nfranco2024hyperbolic,\ntitle={Hyperbolic Active Learning for Semantic Segmentation under Domain Shift},\nauthor={Luca Franco and Paolo Mandica and Konstantinos Kallidromitis and Devin Guillory and Yu-Teng Li and Trevor Darrell and Fabio Galasso},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hKdJPMQvew}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7117084, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9416679968650332046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uniroma1.it;uniroma1.it;us.panasonic.com;berkeley.edu;adobe.com;uniroma1.it;eecs.berkeley.edu", "author_num": 7, "aff_unique_index": "0;0;1;2;3;0;4", "aff_unique_norm": "University of Rome La Sapienza;Panasonic Corporation;University of California, Berkeley;Adobe;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";;;Adobe Systems Incorporated;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.uniroma1.it;https://www.panasonic.com;https://www.berkeley.edu;https://www.adobe.com;", "aff_unique_abbr": "La Sapienza;Panasonic;UC Berkeley;Adobe;", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Rome;;Berkeley", "aff_country_unique_index": "0;0;1;2;2;0", "aff_country_unique": "Italy;Japan;United States;" }, { "title": "Exploration-Driven Policy Optimization in RLHF: Theoretical Insights on Efficient Data Utilization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33398", "id": "hLGxDYo0eF", "proceeding": "https://proceedings.mlr.press/v235/du24i.html", "pdf": "https://openreview.net/pdf?id=hLGxDYo0eF", "openreview": "https://openreview.net/forum?id=hLGxDYo0eF", "author_site": "Yihan Du, Anna Winnicki, Gal Dalal, Shie Mannor, R Srikant", "tldr": "", "abstract": "Reinforcement Learning from Human Feedback (RLHF) has achieved impressive empirical successes while relying on a small amount of human feedback. However, there is limited theoretical justification for this phenomenon. Additionally, most recent studies focus on value-based algorithms despite the recent empirical successes of policy-based algorithms. In this work, we consider an RLHF algorithm based on policy optimization (PO-RLHF). The algorithm is based on the popular Policy Cover-Policy Gradient (PC-PG) algorithm, which assumes knowledge of the reward function. In PO-RLHF, knowledge of the reward function is not assumed and the algorithm relies on trajectory-based comparison feedback to infer the reward function. We provide performance bounds for PO-RLHF with low query complexity, which provides insight into why a small amount of human feedback may be sufficient to get good performance with RLHF. A key novelty is our trajectory-level elliptical potential analysis technique used to infer reward function parameters when comparison queries rather than reward observations are used. We provide and analyze algorithms in two settings: linear and neural function approximation, PG-RLHF and NN-PG-RLHF, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yihan Du;Anna Winnicki;Gal Dalal;Shie Mannor;R. 
Srikant", "authorids": "~Yihan_Du2;~Anna_Winnicki1;~Gal_Dalal2;~Shie_Mannor2;~R._Srikant1", "gender": "F;;M;;M", "homepage": "https://yihandu.github.io/;https://sites.google.com/view/annawinnicki;https://shie.net.technion.ac.il;;", "dblp": "231/1919;;20/1669;s/RSrikant;166/1605", "google_scholar": "https://scholar.google.pl/citations?user=_RSr3vUAAAAJ;;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ;;https://scholar.google.co.il/citations?user=NfJiSMMAAAAJ", "orcid": ";;;;0000-0002-3166-4251", "linkedin": ";;;;galdalal/", "or_profile": "~Yihan_Du2;~Anna_Winnicki1;~Shie_Mannor2;~R._Srikant1;~Gal_Dalal1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;Technion - Israel Institute of Technology, Technion;University of Illinois, Urbana Champaign;NVIDIA", "aff_domain": "illinois.edu;uiuc.edu;technion.il;illinois.edu;nvidia.com", "position": "Postdoc;PhD student;Full Professor;Full Professor;Senior Research Scientist", "bibtex": "@inproceedings{\ndu2024explorationdriven,\ntitle={Exploration-Driven Policy Optimization in {RLHF}: Theoretical Insights on Efficient Data Utilization},\nauthor={Yihan Du and Anna Winnicki and Gal Dalal and Shie Mannor and R. Srikant},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hLGxDYo0eF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 977054, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15636805553987597697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "illinois.edu;uiuc.edu;technion.il;illinois.edu;nvidia.com", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;Technion - Israel Institute of Technology;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "https://illinois.edu;https://www.technion.ac.il;https://www.nvidia.com", "aff_unique_abbr": "UIUC;Technion;NVIDIA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Israel" }, { "title": "Harmony in Diversity: Merging Neural Networks with Canonical Correlation Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33397", "id": "hLuNVjRnY3", "proceeding": "https://proceedings.mlr.press/v235/horoi24a.html", "pdf": "https://openreview.net/pdf?id=hLuNVjRnY3", "openreview": "https://openreview.net/forum?id=hLuNVjRnY3", "author_site": "Stefan Horoi, Albert Manuel Orozco Camacho, Eugene Belilovsky, Guy Wolf", "tldr": "", "abstract": "Combining the predictions of multiple trained models through ensembling is generally a good way to improve accuracy by leveraging the different learned features of the models, however it comes with high computational and storage costs. Model fusion, the act of merging multiple models into one by combining their parameters reduces these costs but doesn't work as well in practice. Indeed, neural network loss landscapes are high-dimensional and non-convex and the minima found through learning are typically separated by high loss barriers. Numerous recent works have been focused on finding permutations matching one network features to the features of a second one, lowering the loss barrier on the linear path between them in parameter space. 
However, permutations are restrictive since they assume a one-to-one mapping between the different models' neurons exists. We propose a new model merging algorithm, CCA Merge, which is based on Canonical Correlation Analysis and aims to maximize the correlations between linear combinations of the model features. We show that our alignment method leads to better performance than past methods when averaging models trained on the same or differing data splits. We also extend this analysis into the harder setting where more than two models are merged, and we find that CCA Merge works significantly better than past methods. Our code is publicly available at https://github.com/shoroi/align-n-merge", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stefan Horoi;Albert Manuel Orozco Camacho;Eugene Belilovsky;Guy Wolf", "authorids": "~Stefan_Horoi1;~Albert_Manuel_Orozco_Camacho1;~Eugene_Belilovsky1;~Guy_Wolf1", "gender": ";M;M;M", "homepage": ";https://alorozco53.github.io;http://eugenium.github.io;http://guywolf.org", "dblp": "256/5511;186/7164;42/11445;120/1308", "google_scholar": "https://scholar.google.fr/citations?user=jUm5G6sAAAAJ;zYXzEisAAAAJ;https://scholar.google.fr/citations?user=CffJDoEAAAAJ;g0k3SjcAAAAJ", "orcid": "0000-0003-2951-2600;;;0000-0002-6740-059X", "linkedin": ";https://linkedin.com/in/alorozco53;;", "or_profile": "~Stefan_Horoi1;~Albert_Manuel_Orozco_Camacho1;~Eugene_Belilovsky1;~Guy_Wolf1", "aff": "Universit\u00e9 de Montr\u00e9al;Concordia University;Concordia University, Montreal;University of Montreal", "aff_domain": "umontreal.ca;concordia.ca;concordia.ca;umontreal.ca", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhoroi2024harmony,\ntitle={Harmony in Diversity: Merging Neural Networks with Canonical Correlation Analysis},\nauthor={Stefan Horoi and Albert Manuel Orozco Camacho and Eugene Belilovsky and Guy Wolf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hLuNVjRnY3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 893610, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10998758066288917354&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "umontreal.ca;concordia.ca;concordia.ca;umontreal.ca", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Concordia University;University of Montreal", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umontreal.ca;https://www.concordia.ca;https://www.umontreal.ca", "aff_unique_abbr": "UdeM;Concordia;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Subequivariant Reinforcement Learning in 3D Multi-Entity Physical Environments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33396", "id": "hQpUhySEJi", "proceeding": "https://proceedings.mlr.press/v235/chen24aq.html", "pdf": "https://openreview.net/pdf?id=hQpUhySEJi", "openreview": "https://openreview.net/forum?id=hQpUhySEJi", "author_site": "Runfa Chen, Ling Wang, Yu Du, Tianrui Xue, Fuchun Sun, Jianwei Zhang, Wenbing Huang", "tldr": "", "abstract": "Learning policies for multi-entity systems in 3D environments is far more complicated than in single-entity scenarios, due to the exponential expansion of
the global state space as the number of entities increases. One potential solution for alleviating the exponential complexity is dividing the global space into independent local views that are invariant to transformations including translations and rotations. To this end, this paper proposes *Subequivariant Hierarchical Neural Networks* (SHNN) to facilitate multi-entity policy learning. In particular, SHNN first dynamically decouples the global space into local entity-level graphs via task assignment. Second, it leverages subequivariant message passing over the local entity-level graphs to devise local reference frames, remarkably compressing the representation redundancy, particularly in gravity-affected environments. Furthermore, to overcome the limitations of existing benchmarks in capturing the subtleties of multi-entity systems under the Euclidean symmetry, we propose the *Multi-entity Benchmark* (MEBEN), a new suite of environments tailored for exploring a wide range of multi-entity reinforcement learning. Extensive experiments demonstrate significant advancements of SHNN on the proposed benchmarks compared to existing methods. Comprehensive ablations are conducted to verify the indispensability of task assignment and subequivariance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Runfa Chen;Ling Wang;Yu Du;Tianrui Xue;Fuchun Sun;Jianwei Zhang;Wenbing Huang", "authorids": "~Runfa_Chen1;~Ling_Wang4;~Yu_Du8;~Tianrui_Xue1;~Fuchun_Sun1;~Jianwei_Zhang2;~Wenbing_Huang1", "gender": "M;;F;M;M;M;M", "homepage": ";;;;https://www.cs.tsinghua.edu.cn/info/1121/3555.htm;https://tams.informatik.uni-hamburg.de/people/zhang/;https://gsai.ruc.edu.cn/english/wenbing_huang", "dblp": "260/0853.html;;;;;z/JianweiZhang1;155/3181-1.html", "google_scholar": "WaeyhikAAAAJ;;;;;;0yNkmO4AAAAJ", "orcid": "0000-0002-1078-289X;0000-0003-2565-7095;0009-0005-4003-5018;;;;", "linkedin": ";;;tianrui-xue-10b3b7264/;;;", "or_profile": "~Runfa_Chen1;~Ling_Wang4;~Yu_Du8;~Tianrui_Xue1;~Fuchun_Sun1;~Jianwei_Zhang2;~Wenbing_Huang1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;New York University;Tsinghua University;Universit\u00e4t Hamburg;Renmin University of China", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;nyu.edu;cs.tsinghua.edu.cn;uni-hamburg.de;ruc.edu.cn", "position": "PhD student;PhD student;MS student;Undergrad student;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2024subequivariant,\ntitle={Subequivariant Reinforcement Learning in 3D Multi-Entity Physical Environments},\nauthor={Runfa Chen and Ling Wang and Yu Du and Tianrui Xue and Fuchun Sun and Jianwei Zhang and Wenbing Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hQpUhySEJi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2540153, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2QRLHKyY0sIJ:scholar.google.com/&scioq=Subequivariant+Reinforcement+Learning+in+3D+Multi-Entity+Physical+Environments&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;nyu.edu;cs.tsinghua.edu.cn;uni-hamburg.de;ruc.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;1;0;2;3", "aff_unique_norm": "Tsinghua University;New York University;University of Hamburg;Renmin University of China", "aff_unique_dep": ";;;",
"aff_unique_url": "https://www.tsinghua.edu.cn;https://www.nyu.edu;https://www.uni-hamburg.de;http://www.ruc.edu.cn", "aff_unique_abbr": "THU;NYU;UHH;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;2;0", "aff_country_unique": "China;United States;Germany" }, { "title": "Chasing Convex Functions with Long-term Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33395", "id": "hRBdOHVn7y", "proceeding": "https://proceedings.mlr.press/v235/lechowicz24a.html", "pdf": "https://openreview.net/pdf?id=hRBdOHVn7y", "openreview": "https://openreview.net/forum?id=hRBdOHVn7y", "author_site": "Adam Lechowicz, Nicolas Christianson, Bo Sun, Noman Bashir, Mohammad Hajiesmaili, Adam Wierman, Prashant Shenoy", "tldr": "", "abstract": "We introduce and study a family of online metric problems with long-term constraints. In these problems, an online player makes decisions $\\mathbf{x}_t$ in a metric space $(X,d)$ to simultaneously minimize their hitting cost $f_t(\\mathbf{x}_t)$ and switching cost as determined by the metric. Over the time horizon $T$, the player must satisfy a long-term demand constraint $\\sum_t c(\\mathbf{x}_t) \\geq 1$, where $c(\\mathbf{x}_t)$ denotes the fraction of demand satisfied at time $t$. Such problems can find a wide array of applications to online resource allocation in sustainable energy/computing systems. We devise optimal competitive and learning-augmented algorithms for the case of bounded hitting cost gradients and weighted $\\ell_1$ metrics, and further show that our proposed algorithms perform well in numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adam Lechowicz;Nicolas Christianson;Bo Sun;Noman Bashir;Mohammad Hajiesmaili;Adam Wierman;Prashant Shenoy", "authorids": "~Adam_Lechowicz1;~Nicolas_Christianson1;~Bo_Sun8;~Noman_Bashir1;~Mohammad_Hajiesmaili1;~Adam_Wierman1;~Prashant_Shenoy1", "gender": "Non-Binary;;;M;M;M;M", "homepage": "https://adamlechowicz.github.io;https://nicochristianson.com/;;https://noman-bashir.github.io/;https://groups.cs.umass.edu/hajiesmaili/;https://adamwierman.com/;https://people.cs.umass.edu/~shenoy/", "dblp": "307/5199;322/8648;;146/7819;49/7911;56/4447;s/PrashantJShenoy", "google_scholar": "fZ2-jm0AAAAJ;XS2UFA8AAAAJ;;cndPOVYAAAAJ;XCGuYKIAAAAJ;4OvOdSgAAAAJ;https://scholar.google.com.tw/citations?user=TciP6mcAAAAJ", "orcid": "0000-0002-7774-9939;0000-0001-8330-8964;;0000-0001-9304-910X;;0000-0002-5923-0199;", "linkedin": ";;;noman-bashir-8120038b/;;adam-wierman-a529474/;", "or_profile": "~Adam_Lechowicz1;~Nicolas_Christianson1;~Bo_Sun8;~Noman_Bashir1;~Mohammad_Hajiesmaili1;~Adam_Wierman1;~Prashant_Shenoy1", "aff": "University of Massachusetts Amherst;California Institute of Technology;;Massachusetts Institute of Technology;College of Information and Computer Science, University of Massachusetts, Amherst;California Institute of Technology;Department of Computer Science, University of Massachusetts at Amherst", "aff_domain": "cs.umass.edu;caltech.edu;;mit.edu;cics.umass.edu;caltech.edu;cs.umass.edu", "position": "PhD student;PhD student;;Researcher;Assistant Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nlechowicz2024chasing,\ntitle={Chasing Convex Functions with Long-term Constraints},\nauthor={Adam Lechowicz and Nicolas Christianson and Bo Sun and Noman Bashir and Mohammad Hajiesmaili and Adam Wierman and Prashant Shenoy},\nbooktitle={Forty-first International Conference on 
Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hRBdOHVn7y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 969777, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7517147307381096674&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 11, "email": "cs.umass.edu;caltech.edu;;mit.edu;cics.umass.edu;caltech.edu;cs.umass.edu", "author_num": 7, "aff_unique_index": "0;1;2;0;1;0", "aff_unique_norm": "University of Massachusetts Amherst;California Institute of Technology;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umass.edu;https://www.caltech.edu;https://web.mit.edu", "aff_unique_abbr": "UMass Amherst;Caltech;MIT", "aff_campus_unique_index": "0;1;0;1;0", "aff_campus_unique": "Amherst;Pasadena;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Progressive Inference: Explaining Decoder-Only Sequence Classification Models Using Intermediate Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33394", "id": "hRX1o7FBhT", "proceeding": "https://proceedings.mlr.press/v235/kariyappa24a.html", "pdf": "https://openreview.net/pdf?id=hRX1o7FBhT", "openreview": "https://openreview.net/forum?id=hRX1o7FBhT", "author_site": "Sanjay Kariyappa, Freddy Lecue, Saumitra Mishra, Christopher Pond, Daniele Magazzeni, Manuela Veloso", "tldr": "", "abstract": "This paper proposes Progressive inference--a framework to explain the predictions of decoder-only transformer models trained to perform sequence classification tasks. Our work is based on the insight that the classification head of a decoder-only model can be used to make intermediate predictions by evaluating them at different points in the input sequence. Due to the masked attention mechanism used in decoder-only models, these intermediate predictions only depend on the tokens seen before the inference point, allowing us to obtain the model's prediction on a masked input sub-sequence, with negligible computational overheads. We develop two methods to provide sub-sequence level attributions using this core insight. First, we propose Single Pass-Progressive Inference (SP-PI) to compute attributions by simply taking the difference between intermediate predictions. Second, we exploit a connection with Kernel SHAP to develop Multi Pass-Progressive Inference (MP-PI); this uses intermediate predictions from multiple masked versions of the input to compute higher-quality attributions that approximate SHAP values. 
We perform studies on several text classification datasets to demonstrate that our proposal provides better explanations compared to prior work, both in the single-pass and multi-pass settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanjay Kariyappa;Freddy Lecue;Saumitra Mishra;Christopher Pond;Daniele Magazzeni;Manuela Veloso", "authorids": "~Sanjay_Kariyappa1;~Freddy_Lecue1;~Saumitra_Mishra1;christopher.pond@jpmorgan.com;~Daniele_Magazzeni1;~Manuela_Veloso1", "gender": "M;;M;;M;F", "homepage": "https://sanjaykariyappa.github.io/;http://www-sop.inria.fr/members/Freddy.Lecue/;https://sites.google.com/site/saumitramishrac4dm/;;https://nms.kcl.ac.uk/daniele.magazzeni/;https://www.cs.cmu.edu/~mmv/", "dblp": "223/6062;02/3657.html;208/1387;;14/4672;v/ManuelaMVeloso", "google_scholar": "qd9U-h4AAAAJ;https://scholar.google.ca/citations?user=GLByS4gAAAAJ;https://scholar.google.co.uk/citations?user=On6E6ogAAAAJ;;;https://scholar.google.com.tw/citations?user=2FbkAzYAAAAJ", "orcid": ";;;;;", "linkedin": "sanjay-kariyappa-74583924/;freddylecue/;;;;", "or_profile": "~Sanjay_Kariyappa1;~Freddy_Lecue1;~Saumitra_Mishra1;christopher.pond@jpmorgan.com;~Daniele_Magazzeni1;~Manuela_Veloso1", "aff": "J.P. Morgan Chase;INRIA;J.P. Morgan Chase;;;School of Computer Science, Carnegie Mellon University", "aff_domain": "jpmorganchase.com;inria.fr;jpmorgan.com;;;cs.cmu.edu", "position": "Researcher;Full Professor;Researcher;;;Full Professor", "bibtex": "@inproceedings{\nkariyappa2024progressive,\ntitle={Progressive Inference: Explaining Decoder-Only Sequence Classification Models Using Intermediate Predictions},\nauthor={Sanjay Kariyappa and Freddy Lecue and Saumitra Mishra and Christopher Pond and Daniele Magazzeni and Manuela Veloso},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hRX1o7FBhT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3058572, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1568297795130054329&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "jpmorganchase.com;inria.fr;jpmorgan.com;;;cs.cmu.edu", "author_num": 6, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "JPMorgan Chase & Co.;INRIA;Carnegie Mellon University", "aff_unique_dep": ";;School of Computer Science", "aff_unique_url": "https://www.jpmorganchase.com;https://www.inria.fr;https://www.cmu.edu", "aff_unique_abbr": "JPM;INRIA;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;France" }, { "title": "From Biased Selective Labels to Pseudo-Labels: An Expectation-Maximization Framework for Learning from Biased Decisions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33393", "id": "hTiNFCNxM1", "proceeding": "https://proceedings.mlr.press/v235/chang24e.html", "pdf": "https://openreview.net/pdf?id=hTiNFCNxM1", "openreview": "https://openreview.net/forum?id=hTiNFCNxM1", "author_site": "Trenton Chang, Jenna Wiens", "tldr": "", "abstract": "Selective labels occur when label observations are subject to a decision-making process; e.g., diagnoses that depend on the administration of laboratory tests. 
We study a clinically-inspired selective label problem called disparate censorship, where labeling biases vary across subgroups and unlabeled individuals are imputed as \u201cnegative\u201d (i.e., no diagnostic test = no illness). Machine learning models naively trained on such labels could amplify labeling bias. Inspired by causal models of selective labels, we propose Disparate Censorship Expectation-Maximization (DCEM), an algorithm for learning in the presence of disparate censorship. We theoretically analyze how DCEM mitigates the effects of disparate censorship on model performance. We validate DCEM on synthetic data, showing that it improves bias mitigation (area between ROC curves) without sacrificing discriminative performance (AUC) compared to baselines. We achieve similar results in a sepsis classification task using clinical data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Trenton Chang;Jenna Wiens", "authorids": "~Trenton_Chang1;~Jenna_Wiens1", "gender": ";F", "homepage": ";http://www-personal.umich.edu/~wiensj/", "dblp": ";63/10451", "google_scholar": ";fvEfKxkAAAAJ", "orcid": ";0000-0002-1057-7722", "linkedin": ";", "or_profile": "~Trenton_Chang1;~Jenna_Wiens1", "aff": ";University of Michigan Ann Arbor", "aff_domain": ";umich.edu", "position": ";Associate Professor", "bibtex": "@inproceedings{\nchang2024from,\ntitle={From Biased Selective Labels to Pseudo-Labels: An Expectation-Maximization Framework for Learning from Biased Decisions},\nauthor={Trenton Chang and Jenna Wiens},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hTiNFCNxM1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2164073, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LiwXd3g6XskJ:scholar.google.com/&scioq=From+Biased+Selective+Labels+to+Pseudo-Labels:+An+Expectation-Maximization+Framework+for+Learning+from+Biased+Decisions&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": ";umich.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Mind the Boundary: Coreset Selection via Reconstructing the Decision Boundary", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33392", "id": "hWng0GXeE4", "proceeding": "https://proceedings.mlr.press/v235/yang24b.html", "pdf": "https://openreview.net/pdf?id=hWng0GXeE4", "openreview": "https://openreview.net/forum?id=hWng0GXeE4", "author_site": "Shuo Yang, Zhe Cao, Sheng Guo, Ruiheng Zhang, Ping Luo, Shengping Zhang, Liqiang Nie", "tldr": "", "abstract": "Existing paradigms of pushing the state of the art require exponentially more training data in many fields. Coreset selection seeks to mitigate this growing demand by identifying the most efficient subset of training data. In this paper, we delve into geometry-based coreset methods and provide a preliminary theoretical link between the geometry of the data distribution and models' generalization capability.
Leveraging these theoretical insights, we propose a novel coreset construction method by selecting training samples to reconstruct the decision boundary of a deep neural network learned on the full dataset. Extensive experiments across various popular benchmarks demonstrate the superiority of our method over multiple competitors. For the first time, our method achieves a 50% data pruning rate on the ImageNet-1K dataset while sacrificing less than 1% in accuracy. Additionally, we showcase and analyze the remarkable cross-architecture transferability of the coresets derived from our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuo Yang;Zhe Cao;Sheng Guo;Ruiheng Zhang;Ping Luo;Shengping Zhang;Liqiang Nie", "authorids": "~Shuo_Yang5;zhe.cao@bit.edu.cn;~Sheng_Guo4;~Ruiheng_Zhang1;~Ping_Luo2;~Shengping_Zhang1;~Liqiang_Nie2", "gender": "M;;;M;;M;M", "homepage": "https://faculty.hitsz.edu.cn/yangshuo;;;;;http://homepage.hit.edu.cn/zhangshengping;https://liqiangnie.github.io/index.html", "dblp": "78/1102-6;;;;;60/1866;92/8277", "google_scholar": "mVtxxCkAAAAJ;;;lPEuNiQAAAAJ;;hMNsT8sAAAAJ;yywVMhUAAAAJ", "orcid": ";;;0000-0002-5460-7196;;;0000-0003-1476-0273", "linkedin": ";;;;;;", "or_profile": "~Shuo_Yang5;zhe.cao@bit.edu.cn;~Sheng_Guo4;~Ruiheng_Zhang1;~Ping_Luo2;~Shengping_Zhang1;~Liqiang_Nie2", "aff": "University of Hong Kong;;;Beijing Institute of Technology;;Harbin Institute of Technology;Shandong University", "aff_domain": "hku.hk;;;bit.edu.cn;;hit.edu.cn;sdu.edu.cn", "position": "Postdoc;;;Associate Professor;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024mind,\ntitle={Mind the Boundary: Coreset Selection via Reconstructing the Decision Boundary},\nauthor={Shuo Yang and Zhe Cao and Sheng Guo and Ruiheng Zhang and Ping Luo and Shengping Zhang and Liqiang Nie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hWng0GXeE4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2597785, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13005479732750105611&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "hku.hk;;;bit.edu.cn;;hit.edu.cn;sdu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Hong Kong;Beijing Institute of Technology;Harbin Institute of Technology;Shandong University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;http://www.bit.edu.cn/;http://www.hit.edu.cn/;http://www.sdu.edu.cn", "aff_unique_abbr": "HKU;BIT;HIT;SDU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Hong Kong SAR;;Harbin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Near-Optimal Regret in Linear MDPs with Aggregate Bandit Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33391", "id": "hXQOO6VsxH", "proceeding": "https://proceedings.mlr.press/v235/cassel24a.html", "pdf": "https://openreview.net/pdf?id=hXQOO6VsxH", "openreview": "https://openreview.net/forum?id=hXQOO6VsxH", "author_site": "Asaf Cassel, Haipeng Luo, Aviv Rosenberg, Dmitry Sotnikov", "tldr": "", "abstract": "In many real-world applications, it is hard to provide a reward signal in each step of a Reinforcement Learning (RL) process and more natural to give feedback when an episode ends. 
To this end, we study the recently proposed model of RL with Aggregate Bandit Feedback (RL-ABF), where the agent only observes the sum of rewards at the end of an episode instead of each reward individually. Prior work studied RL-ABF only in tabular settings, where the number of states is assumed to be small. In this paper, we extend ABF to linear function approximation and develop two efficient algorithms with near-optimal regret guarantees: a value-based optimistic algorithm built on a new randomization technique with a Q-functions ensemble, and a policy optimization algorithm that uses a novel hedging scheme over the ensemble.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Asaf Cassel;Haipeng Luo;Aviv Rosenberg;Dmitry Sotnikov", "authorids": "~Asaf_Cassel1;~Haipeng_Luo1;~Aviv_Rosenberg1;~Dmitry_Sotnikov1", "gender": "M;M;M;M", "homepage": "https://haipeng-luo.net/;https://sites.google.com/view/aviv-rosenberg/home;;", "dblp": "62/2576;225/9369-2;65/852.html;222/3222", "google_scholar": "ct2hw4UAAAAJ;https://scholar.google.co.il/citations?user=cg8_-foAAAAJ;https://scholar.google.co.il/citations?user=lVQv_TAAAAAJ;vhIydFkAAAAJ", "orcid": ";;;", "linkedin": ";aviv-rosenberg-2a6222149/;dmitry-sotnikov-b64b9920;", "or_profile": "~Haipeng_Luo1;~Aviv_Rosenberg1;~Dmitry_Sotnikov1;~Asaf_Benjamin_Cassel1", "aff": "University of Southern California;Google Research;Amazon;Tel Aviv University", "aff_domain": "usc.edu;google.com;amazon.com;tau.ac.il", "position": "Associate Professor;Researcher;Researcher;PhD student", "bibtex": "@inproceedings{\ncassel2024nearoptimal,\ntitle={Near-Optimal Regret in Linear {MDP}s with Aggregate Bandit Feedback},\nauthor={Asaf Cassel and Haipeng Luo and Aviv Rosenberg and Dmitry Sotnikov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hXQOO6VsxH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 485940, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16289397962632064739&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "usc.edu;google.com;amazon.com;tau.ac.il", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Southern California;Google;Amazon;Tel Aviv University", "aff_unique_dep": ";Google Research;Amazon.com, Inc.;", "aff_unique_url": "https://www.usc.edu;https://research.google;https://www.amazon.com;https://www.tau.ac.il", "aff_unique_abbr": "USC;Google Research;Amazon;TAU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Los Angeles;Mountain View;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Israel" }, { "title": "GaLore: Memory-Efficient LLM Training by Gradient Low-Rank Projection", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33390", "id": "hYHsrKDiX7", "proceeding": "https://proceedings.mlr.press/v235/zhao24s.html", "pdf": "https://openreview.net/pdf?id=hYHsrKDiX7", "openreview": "https://openreview.net/forum?id=hYHsrKDiX7", "author_site": "Jiawei Zhao, Zhenyu Zhang, Beidi Chen, Zhangyang \u201cAtlas\u201d Wang, Anima Anandkumar, Yuandong Tian", "tldr": "", "abstract": "Training Large Language Models (LLMs) presents significant memory challenges, predominantly due to the growing size of weights and optimizer states. 
Common memory-reduction approaches, such as low-rank adaptation (LoRA), add a trainable low-rank matrix to the frozen pre-trained weight in each layer, reducing trainable parameters and optimizer states. However, such approaches typically underperform training with full-rank weights in both pre-training and fine-tuning stages since they limit the parameter search to a low-rank subspace and alter the training dynamics, and further, may require a full-rank warm start. In this work, we propose Gradient Low-Rank Projection (GaLore), a training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods such as LoRA. Our approach reduces memory usage by up to 65.5% in optimizer states while maintaining both efficiency and performance for pre-training on LLaMA 1B and 7B architectures on the C4 dataset with up to 19.7B tokens, and on fine-tuning RoBERTa on GLUE tasks. Our 8-bit GaLore further reduces optimizer memory by up to 82.5% and total training memory by 63.3%, compared to a BF16 baseline. Notably, we demonstrate, for the first time, the feasibility of pre-training a 7B model on consumer GPUs with 24GB memory (e.g., NVIDIA RTX 4090) without model parallelism, checkpointing, or offloading strategies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiawei Zhao;Zhenyu Zhang;Beidi Chen;Zhangyang Wang;Anima Anandkumar;Yuandong Tian", "authorids": "~Jiawei_Zhao2;~Zhenyu_Zhang4;~Beidi_Chen1;~Zhangyang_Wang1;~Anima_Anandkumar1;~Yuandong_Tian1", "gender": "M;M;F;M;M;F", "homepage": "https://jiaweizhao.com/;https://zhenyu.gallery;https://www.andrew.cmu.edu/user/beidic/;https://vita-group.github.io;http://yuandong-tian.com;http://tensorlab.cms.caltech.edu/users/anima/", "dblp": ";01/1844-15;192/1339;119/4026;t/YuandongTian;", "google_scholar": ";ZLyJRxoAAAAJ;;pxFyKAIAAAAJ;0mgEF28AAAAJ;bEcLezcAAAAJ", "orcid": ";;;;0000-0003-4202-4847;", "linkedin": ";zhenyu-allen-zhang-a9b1391a3/;;;yuandongtian;anima-anandkumar-35171b1/", "or_profile": "~Jiawei_Zhao2;~Zhenyu_Zhang4;~Beidi_Chen1;~Zhangyang_Wang1;~Yuandong_Tian1;~anima_anandkumar1", "aff": "California Institute of Technology;University of Texas at Austin;Meta Facebook;University of Texas at Austin;Meta AI (FAIR);California Institute of Technology", "aff_domain": "caltech.edu;utexas.edu;fb.com;utexas.edu;meta.com;caltech.edu", "position": "PhD student;PhD student;Researcher;Associate Professor;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nzhao2024galore,\ntitle={GaLore: Memory-Efficient {LLM} Training by Gradient Low-Rank Projection},\nauthor={Jiawei Zhao and Zhenyu Zhang and Beidi Chen and Zhangyang Wang and Anima Anandkumar and Yuandong Tian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hYHsrKDiX7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 749048, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 158, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11250439635839948184&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "caltech.edu;utexas.edu;fb.com;utexas.edu;meta.com;caltech.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;2;0", "aff_unique_norm": "California Institute of Technology;University of Texas at Austin;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.caltech.edu;https://www.utexas.edu;https://meta.com", "aff_unique_abbr": "Caltech;UT Austin;Meta",
"aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Pasadena;Austin;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Referee Can Play: An Alternative Approach to Conditional Generation via Model Inversion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33389", "id": "hZ0fWhgVch", "proceeding": "https://proceedings.mlr.press/v235/liu24aa.html", "pdf": "https://openreview.net/pdf?id=hZ0fWhgVch", "openreview": "https://openreview.net/forum?id=hZ0fWhgVch", "author_site": "Xuantong Liu, Tianyang Hu, Wenjia Wang, Kenji Kawaguchi, Yuan Yao", "tldr": "", "abstract": "As a dominant force in text-to-image generation tasks, Diffusion Probabilistic Models (DPMs) face a critical challenge in controllability, struggling to adhere strictly to complex, multi-faceted instructions. In this work, we aim to address this alignment challenge for conditional generation tasks. First, we provide an alternative view of state-of-the-art DPMs as a way of inverting advanced Vision-Language Models (VLMs). With this formulation, we naturally propose a training-free approach that bypasses the conventional sampling process associated with DPMs. By directly optimizing images with the supervision of discriminative VLMs, the proposed method can potentially achieve a better text-image alignment. As proof of concept, we demonstrate the pipeline with the pre-trained BLIP-2 model and identify several key designs for improved image generation. To further enhance the image fidelity, a Score Distillation Sampling module of Stable Diffusion is incorporated. By carefully balancing the two components during optimization, our method can produce high-quality images with near state-of-the-art performance on T2I-Compbench. 
The code is available at https://github.com/Pepper-lll/VLMinv.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuantong LIU;Tianyang Hu;Wenjia Wang;Kenji Kawaguchi;Yuan Yao", "authorids": "~Xuantong_LIU1;~Tianyang_Hu1;~Wenjia_Wang2;~Kenji_Kawaguchi1;~Yuan_Yao1", "gender": "F;M;M;;Unspecified", "homepage": ";https://hu-tianyang.github.io/;https://www.wenjia-w.com/;https://ml.comp.nus.edu.sg/#members;https://yao-lab.github.io/", "dblp": ";170/2551;;;25/4120-11.html", "google_scholar": ";mlA_3r0AAAAJ;EKS1sO0AAAAJ;aLl3rYoAAAAJ;OOlHr-wAAAAJ", "orcid": "0000-0002-0960-1078;;;;0000-0001-5814-1162", "linkedin": ";;;;", "or_profile": "~Xuantong_LIU1;~Tianyang_Hu1;~Wenjia_Wang2;~Kenji_Kawaguchi1;~Yuan_Yao1", "aff": "Hong Kong University of Science and Technology;Huawei Noah's Ark Lab;HKUST (GZ);National University of Singapore;Hong Kong University of Science and Technology", "aff_domain": "ust.hk;huawei.com;hkust-gz.edu.cn;nus.edu;ust.hk", "position": "PhD student;Researcher;Assistant Professor;Presidential Young Professor;Full Professor", "bibtex": "@inproceedings{\nliu2024referee,\ntitle={Referee Can Play: An Alternative Approach to Conditional Generation via Model Inversion},\nauthor={Xuantong LIU and Tianyang Hu and Wenjia Wang and Kenji Kawaguchi and Yuan Yao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hZ0fWhgVch}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8475620, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4656281529217000526&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ust.hk;huawei.com;hkust-gz.edu.cn;nus.edu;ust.hk", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei;National University of Singapore", "aff_unique_dep": ";Noah's Ark Lab;", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com;https://www.nus.edu.sg", "aff_unique_abbr": "HKUST;Huawei;NUS", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Hong Kong SAR;;Guangzhou", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;Singapore" }, { "title": "Hard Tasks First: Multi-Task Reinforcement Learning Through Task Scheduling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33388", "id": "haUOhXo70o", "proceeding": "https://proceedings.mlr.press/v235/cho24d.html", "pdf": "https://openreview.net/pdf?id=haUOhXo70o", "openreview": "https://openreview.net/forum?id=haUOhXo70o", "author_site": "MYUNG-SIK CHO, Jong Eui Park, Suyoung Lee, Youngchul Sung", "tldr": "", "abstract": "Multi-task reinforcement learning (RL) faces the significant challenge of varying task difficulties, often leading to negative transfer when simpler tasks overshadow the learning of more complex ones. To overcome this challenge, we propose a novel algorithm, Scheduled Multi-Task Training (SMT), that strategically prioritizes more challenging tasks, thereby enhancing overall learning efficiency. SMT introduces a dynamic task prioritization strategy, underpinned by an effective metric for assessing task difficulty. This metric ensures an efficient and targeted allocation of training resources, significantly improving learning outcomes. 
Additionally, SMT incorporates a reset mechanism that periodically reinitializes key network parameters to mitigate the simplicity bias, further enhancing the adaptability and robustness of the learning process across diverse tasks. The efficacy of SMT's scheduling method is validated by significant performance improvements on challenging Meta-World benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Myungsik Cho;Jongeui Park;Suyoung Lee;Youngchul Sung", "authorids": "~Myungsik_Cho1;~Jongeui_Park1;~Suyoung_Lee4;~Youngchul_Sung1", "gender": "M;M;M;M", "homepage": ";;https://sites.google.com/view/youngchulsung;https://suyoung-lee.github.io/", "dblp": "233/3959;295/5486;17/6798;31/4163", "google_scholar": "https://scholar.google.com/citations?hl=en;;-9D2k3UAAAAJ;CWbdBy8AAAAJ", "orcid": ";0000-0003-2845-6053;0000-0003-4536-6690;", "linkedin": ";jongeui-park-ab0a91138/;;", "or_profile": "~Myungsik_Cho1;~Jongeui_Park1;~Youngchul_Sung1;~Su_Young_Lee1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\ncho2024hard,\ntitle={Hard Tasks First: Multi-Task Reinforcement Learning Through Task Scheduling},\nauthor={Myungsik Cho and Jongeui Park and Suyoung Lee and Youngchul Sung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=haUOhXo70o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3454734, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7591568766414056591&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 5, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Online Adaptive Anomaly Thresholding with Confidence Sequences", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33387", "id": "hbsKxUEreL", "proceeding": "https://proceedings.mlr.press/v235/sun24h.html", "pdf": "https://openreview.net/pdf?id=hbsKxUEreL", "openreview": "https://openreview.net/forum?id=hbsKxUEreL", "author_site": "Sophia Sun, Abishek Sankararaman, Balakrishnan Narayanaswamy", "tldr": "", "abstract": "Selecting appropriate thresholds for anomaly detection in online, unsupervised settings is a challenging task, especially in the presence of data distribution shifts. Addressing these challenges is critical in many practical large-scale systems, such as infrastructure monitoring and network intrusion detection.
This paper proposes an algorithm that connects online thresholding with constructing confidence sequences, achieving (1) adaptive online threshold selection robust to distribution shifts, (2) statistical guarantees on false positive and false negative rates without any distributional assumptions, and (3) improved performance when given relevant offline data to warm-start the online algorithm, while having bounded degradation if the offline data is irrelevant. We complement our theoretical results with empirical evidence that our method outperforms commonly used baselines across synthetic and real-world datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sophia Huiwen Sun;Abishek Sankararaman;Balakrishnan Murali Narayanaswamy", "authorids": "~Sophia_Huiwen_Sun1;~Abishek_Sankararaman1;~Balakrishnan_Murali_Narayanaswamy1", "gender": "F;M;M", "homepage": "https://huiwenn.github.io/;http://abishek90.github.io/;https://sites.google.com/site/muralibalki/", "dblp": "319/6856;https://dblp.uni-trier.de/pers/hd/s/Sankararaman:Abishek;12/5012", "google_scholar": "https://scholar.google.com/citations?hl=en;3T9FHn0AAAAJ;mKzKZfUAAAAJ", "orcid": ";;0009-0006-4377-8295", "linkedin": ";;", "or_profile": "~Sophia_Huiwen_Sun1;~Abishek_Sankararaman1;~Murali_Balakrishnan1", "aff": "University of California, San Diego, University of California, San Diego;Amazon;Amazon", "aff_domain": "eng.ucsd.edu;amazon.com;amazon.com", "position": "PhD student;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nsun2024online,\ntitle={Online Adaptive Anomaly Thresholding with Confidence Sequences},\nauthor={Sophia Huiwen Sun and Abishek Sankararaman and Balakrishnan Murali Narayanaswamy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hbsKxUEreL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1452001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rcAkkQLWk90J:scholar.google.com/&scioq=Online+Adaptive+Anomaly+Thresholding+with+Confidence+Sequences&hl=en&as_sdt=0,33", "gs_version_total": 8, "email": "eng.ucsd.edu;amazon.com;amazon.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, San Diego;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.ucsd.edu;https://www.amazon.com", "aff_unique_abbr": "UCSD;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Peeking with PEAK: Sequential, Nonparametric Composite Hypothesis Tests for Means of Multiple Data Streams", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33386", "id": "hcASxFvmZ5", "proceeding": "https://proceedings.mlr.press/v235/cho24a.html", "pdf": "https://openreview.net/pdf?id=hcASxFvmZ5", "openreview": "https://openreview.net/forum?id=hcASxFvmZ5", "author_site": "Brian Cho, Kyra Gan, Nathan Kallus", "tldr": "", "abstract": "We propose a novel nonparametric sequential test for composite hypotheses for means of multiple data streams. Our proposed method, peeking with expectation-based averaged capital (PEAK), builds upon the testing-by-betting framework and provides a non-asymptotic $\\alpha$-level test across any stopping time.
Our contributions are two-fold: (1) we propose a novel betting scheme and provide theoretical guarantees on type-I error control, power, and asymptotic growth rate/$e$-power in the setting of a single data stream; (2) we introduce PEAK, a generalization of this betting scheme to multiple streams, that (i) avoids using wasteful union bounds via averaging, (ii) is a test of power one under mild regularity conditions on the sampling scheme of the streams, and (iii) reduces computational overhead when applying the testing-as-betting approaches for pure-exploration bandit problems. We illustrate the practical benefits of PEAK using both synthetic and real-world HeartSteps datasets. Our experiments show that PEAK provides up to an 85% reduction in the number of samples before stopping compared to existing stopping rules for pure-exploration bandit problems, and matches the performance of state-of-the-art sequential tests while improving upon computational complexity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Brian M Cho;Kyra Gan;Nathan Kallus", "authorids": "~Brian_M_Cho1;~Kyra_Gan1;~Nathan_Kallus1", "gender": "M;;", "homepage": "https://bcho.page/;;http://nathankallus.com/", "dblp": "32/3261-1.html;;142/2900", "google_scholar": "https://scholar.google.co.jp/citations?user=9k0bfB0AAAAJ;;K2WfIlsAAAAJ", "orcid": "0000-0003-3558-0415;;0000-0003-1672-0507", "linkedin": "brian-cho-5a7876172/;;", "or_profile": "~Brian_M_Cho1;~Kyra_Gan1;~Nathan_Kallus1", "aff": "Cornell University;;Cornell University", "aff_domain": "cornell.edu;;cornell.edu", "position": "PhD student;;Associate Professor", "bibtex": "@inproceedings{\ncho2024peeking,\ntitle={Peeking with {PEAK}: Sequential, Nonparametric Composite Hypothesis Tests for Means of Multiple Data Streams},\nauthor={Brian M Cho and Kyra Gan and Nathan Kallus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hcASxFvmZ5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1206679, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1676653440557456416&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cornell.edu;;cornell.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Grokking Group Multiplication with Cosets", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33385", "id": "hcQfTsVnBo", "proceeding": "https://proceedings.mlr.press/v235/stander24a.html", "pdf": "https://openreview.net/pdf?id=hcQfTsVnBo", "openreview": "https://openreview.net/forum?id=hcQfTsVnBo", "author_site": "Dashiell Stander, Qinan Yu, Honglu Fan, Stella Biderman", "tldr": "", "abstract": "The complex and unpredictable nature of deep neural networks prevents their safe use in many high-stakes applications. There have been many techniques developed to interpret deep neural networks, but all have substantial limitations. Algorithmic tasks have proven to be a fruitful test ground for interpreting a neural network end-to-end. 
Building on previous work, we completely reverse engineer fully connected one-hidden layer networks that have ``grokked'' the arithmetic of the permutation groups $S_5$ and $S_6$. The models discover the true subgroup structure of the full group and converge on neural circuits that decompose the group arithmetic using the permutation group's subgroups. We relate how we reverse engineered the model's mechanisms and confirmed our theory was a faithful description of the circuit's functionality. We also draw attention to current challenges in conducting interpretability research by comparing our work to Chughtai et al. (2023) which alleges to find a different algorithm for this same problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dashiell Stander;Qinan Yu;Honglu Fan;Stella Biderman", "authorids": "~Dashiell_Stander1;~Qinan_Yu1;~Honglu_Fan1;~Stella_Biderman1", "gender": "M;F;Not Specified;F", "homepage": ";https://www.linkedin.com/in/qinan-yu-9b50471b2/;https://honglu.fan;http://www.stellabiderman.com", "dblp": ";;;239/5641", "google_scholar": ";;XqlOVeAAAAAJ;bO7H0DAAAAAJ", "orcid": "0000-0002-0029-8930;;;0000-0001-8228-1042", "linkedin": ";;;stellabiderman", "or_profile": "~Dashiell_Stander1;~Qinan_Yu1;~Honglu_Fan1;~Stella_Biderman1", "aff": "MATS;Brown University;University of Geneva;Booz Allen Hamilton", "aff_domain": "matsprogram.org;brown.edu;unige.ch;boozallen.com", "position": "Researcher;Undergrad student;Postdoc;Industry researcher", "bibtex": "@inproceedings{\nstander2024grokking,\ntitle={Grokking Group Multiplication with Cosets},\nauthor={Dashiell Stander and Qinan Yu and Honglu Fan and Stella Biderman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hcQfTsVnBo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 694472, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8394001954915087235&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "matsprogram.org;brown.edu;unige.ch;boozallen.com", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "MATS;Brown University;University of Geneva;Booz Allen Hamilton", "aff_unique_dep": ";;;", "aff_unique_url": ";https://www.brown.edu;https://www.unige.ch;https://www.boozallen.com", "aff_unique_abbr": ";Brown;UNIGE;BAH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1", "aff_country_unique": ";United States;Switzerland" }, { "title": "Position: Machine Learning-powered Assessments of the EU Digital Services Act Aid Quantify Policy Impacts on Online Harms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33384", "id": "hdpv6mall8", "proceeding": "https://proceedings.mlr.press/v235/bonel24a.html", "pdf": "https://openreview.net/pdf?id=hdpv6mall8", "openreview": "https://openreview.net/forum?id=hdpv6mall8", "author_site": "Eleonora Bonel, Luca Nannini, Davide Bassi, Michele Maggini", "tldr": "", "abstract": "While machine learning shows promise in automated knowledge generation, current techniques such as large language models and micro-targeted influence operations can be exploited for harmful purposes like the proliferation of disinformation. The European Union's Digital Services Act (DSA) is an exemplary policy response addressing these harms generated by online platforms. 
In this regard, it necessitates a comprehensive evaluation of its impact on curbing the harmful downstream effects of these opaque practices. Despite their harmful applications, we argue that machine learning techniques offer immense, yet under-exploited, potential for unraveling the impacts of regulations like the DSA. Following an analysis that reveals possible limitations in the DSA's provisions, we call for resolute efforts to address methodological barriers around appropriate data access, isolating marginal regulatory effects, and facilitating generalization across different contexts. Given the identified advantages of data-driven approaches to regulatory delivery, we advocate for machine learning research to help quantify the policy impacts on online harms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eleonora Bonel;Luca Nannini;Davide Bassi;Michele Joshua Maggini", "authorids": "eleonora.bonel@sciencespo.fr;~Luca_Nannini1;davide.bassi@usc.es;michelejoshua.maggini@usc.es", "gender": ";M;;", "homepage": ";https://www.linkedin.com/in/luca-nannini/;;", "dblp": ";;;", "google_scholar": ";fXKTm9AAAAAJ;;", "orcid": ";0000-0002-4733-9760;;", "linkedin": ";luca-nannini/;;", "or_profile": "eleonora.bonel@sciencespo.fr;~Luca_Nannini1;davide.bassi@usc.es;michelejoshua.maggini@usc.es", "aff": ";Universidad de Santiago de Compostela;;", "aff_domain": ";usc.es;;", "position": ";PhD student;;", "bibtex": "@inproceedings{\nbonel2024position,\ntitle={Position: Machine Learning-powered Assessments of the {EU} Digital Services Act Aid Quantify Policy Impacts on Online Harms},\nauthor={Eleonora Bonel and Luca Nannini and Davide Bassi and Michele Joshua Maggini},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hdpv6mall8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 224899, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4jlPJuPOnzUJ:scholar.google.com/&scioq=Position:+Machine+Learning-powered+Assessments+of+the+EU+Digital+Services+Act+Aid+Quantify+Policy+Impacts+on+Online+Harms&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": ";usc.es;;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "Universidad de Santiago de Compostela", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.es", "aff_unique_abbr": "USC", "aff_country_unique_index": "0", "aff_country_unique": "Spain" }, { "title": "Craftax: A Lightning-Fast Benchmark for Open-Ended Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33383", "id": "hg4wXlrQCV", "proceeding": "https://proceedings.mlr.press/v235/matthews24a.html", "pdf": "https://openreview.net/pdf?id=hg4wXlrQCV", "openreview": "https://openreview.net/forum?id=hg4wXlrQCV", "author_site": "Michael Matthews, Michael Beukman, Benjamin Ellis, Mikayel Samvelyan, Matthew T Jackson, Samuel Coward, Jakob Foerster", "tldr": "", "abstract": "Benchmarks play a crucial role in the development and analysis of reinforcement learning (RL) algorithms. We identify that existing benchmarks used for research into open-ended learning fall into one of two categories. Either they are too slow for meaningful research to be performed without enormous computational resources, like Crafter, NetHack and Minecraft, or they are not complex enough to pose a significant challenge, like Minigrid and Procgen. 
To remedy this, we first present Craftax-Classic: a ground-up rewrite of Crafter in JAX that runs up to 250x faster than the Python-native original. A run of PPO using 1 billion environment interactions finishes in under an hour using only a single GPU and averages 90% of the optimal reward. To provide a more compelling challenge we present the main Craftax benchmark, a significant extension of the Crafter mechanics with elements inspired from NetHack. Solving Craftax requires deep exploration, long term planning and memory, as well as continual adaptation to novel situations as more of the world is discovered. We show that existing methods including global and episodic exploration, as well as unsupervised environment design fail to make material progress on the benchmark. We therefore believe that Craftax can for the first time allow researchers to experiment in a complex, open-ended environment with limited computational resources.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Matthews;Michael Beukman;Benjamin Ellis;Mikayel Samvelyan;Matthew Thomas Jackson;Samuel Coward;Jakob Nicolaus Foerster", "authorids": "~Michael_Matthews4;~Michael_Beukman1;~Benjamin_Ellis1;~Mikayel_Samvelyan1;~Matthew_Thomas_Jackson1;~Samuel_Coward1;~Jakob_Nicolaus_Foerster1", "gender": "M;;M;M;M;;M", "homepage": "https://www.mtmatthews.com/;;http://whirl.cs.ox.ac.uk/pages/people/ben.html;https://www.samvelyan.com/;https://matthewtjackson.com;;https://www.jakobfoerster.com", "dblp": "217/7784.html;;;170/0101;331/5748;;176/5095", "google_scholar": "https://scholar.google.com/citations?authuser=1;;;2Qs19WAAAAAJ;SdGawnwAAAAJ;;6z4lQzMAAAAJ", "orcid": ";;;0009-0001-6748-8755;;;", "linkedin": "michael-matthews-b7a5b7158/;;;samvelyan;matthew-t-jackson/;;", "or_profile": "~Michael_Matthews4;~Michael_Beukman1;~Benjamin_Ellis1;~Mikayel_Samvelyan1;~Matthew_Thomas_Jackson1;~Samuel_Coward1;~Jakob_Nicolaus_Foerster1", "aff": "University of Oxford;;Department of Computer Science, University of Oxford;Meta (FAIR);Wayve;;University of Oxford, University of Oxford", "aff_domain": "ox.ac.uk;;cs.ox.ac.uk;fb.com;wayve.ai;;eng.ox.ac.uk", "position": "PhD student;;PhD student;Research Assistant;Intern;;Associate Professor", "bibtex": "@inproceedings{\nmatthews2024craftax,\ntitle={Craftax: A Lightning-Fast Benchmark for Open-Ended Reinforcement Learning},\nauthor={Michael Matthews and Michael Beukman and Benjamin Ellis and Mikayel Samvelyan and Matthew Thomas Jackson and Samuel Coward and Jakob Nicolaus Foerster},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hg4wXlrQCV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5783694, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7210243718299116628&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ox.ac.uk;;cs.ox.ac.uk;fb.com;wayve.ai;;eng.ox.ac.uk", "author_num": 7, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Oxford;Meta;Wayve", "aff_unique_dep": ";FAIR;", "aff_unique_url": "https://www.ox.ac.uk;https://meta.org;https://www.wayve.ai", "aff_unique_abbr": "Oxford;Meta;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Privacy Profiles for Private Selection", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33382", "id": "hgHQvrvwH9", "proceeding": "https://proceedings.mlr.press/v235/koskela24a.html", "pdf": "https://openreview.net/pdf?id=hgHQvrvwH9", "openreview": "https://openreview.net/forum?id=hgHQvrvwH9", "author_site": "Antti Koskela, Rachel Redberg, Yu-Xiang Wang", "tldr": "", "abstract": "Private selection mechanisms (e.g., Report Noisy Max, Sparse Vector) are fundamental primitives of differentially private (DP) data analysis with wide applications to private query release, voting, and hyperparameter tuning. Recent work (Liu and Talwar, 2019; Papernot and Steinke, 2022) has made significant progress in both generalizing private selection mechanisms and tightening their privacy analysis using modern numerical privacy accounting tools, e.g., R\u00e9nyi DP. But R\u00e9nyi DP is known to be lossy when $(\\epsilon,\\delta)$-DP is ultimately needed, and there is a trend to close the gap by directly handling privacy profiles, i.e., $\\delta$ as a function of $\\epsilon$ or its equivalent dual form known as $f$-DPs. In this paper, we work out an easy-to-use recipe that bounds the privacy profiles of ReportNoisyMax and PrivateTuning using the privacy profiles of the base algorithms they corral. Numerically, our approach improves over the RDP-based accounting in all regimes of interest and leads to substantial benefits in end-to-end private learning experiments. Our analysis also suggests new distributions, e.g., binomial distribution for randomizing the number of rounds that leads to more substantial improvements in certain regimes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antti Koskela;Rachel Emily Redberg;Yu-Xiang Wang", "authorids": "~Antti_Koskela1;~Rachel_Emily_Redberg1;~Yu-Xiang_Wang1", "gender": "M;F;", "homepage": ";;http://www.cs.ucsb.edu/~yuxiangw/publications.html", "dblp": "124/9273;259/2266;62/1637-3.html", "google_scholar": "https://scholar.google.fi/citations?hl=fi;;HGNZ1fkAAAAJ", "orcid": ";0000-0001-5592-7186;", "linkedin": ";rachel-redberg-08026a45;", "or_profile": "~Antti_Koskela1;~Rachel_Emily_Redberg1;~Yu-Xiang_Wang1", "aff": "Nokia Bell Labs;Northeastern University;UC Santa Barbara", "aff_domain": "nokia-bell-labs.com;northeastern.edu;ucsb.edu", "position": "Researcher;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nkoskela2024privacy,\ntitle={Privacy Profiles for Private Selection},\nauthor={Antti Koskela and Rachel Emily Redberg and Yu-Xiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hgHQvrvwH9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 592197, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3175418238996942615&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "nokia-bell-labs.com;northeastern.edu;ucsb.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Nokia Bell Labs;Northeastern University;University of California, Santa Barbara", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nokialabs.com;https://www.northeastern.edu;https://www.ucsb.edu", "aff_unique_abbr": "Nokia Bell Labs;NEU;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ExCP: Extreme LLM Checkpoint Compression via Weight-Momentum Joint Shrinking", "status": 
"Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33381", "id": "hlvKd7Vdxm", "proceeding": "https://proceedings.mlr.press/v235/li24m.html", "pdf": "https://openreview.net/pdf?id=hlvKd7Vdxm", "openreview": "https://openreview.net/forum?id=hlvKd7Vdxm", "author_site": "Wenshuo Li, Xinghao Chen, Han Shu, Yehui Tang, Yunhe Wang", "tldr": "", "abstract": "Large language models (LLM) have recently attracted significant attention in the field of artificial intelligence. However, the training process of these models poses significant challenges in terms of computational and storage capacities, thus compressing checkpoints has become an urgent problem. In this paper, we propose a novel Extreme Checkpoint Compression (ExCP) framework, which significantly reduces the required storage of training checkpoints while achieving nearly lossless performance. We first calculate the residuals of adjacent checkpoints to obtain the essential but sparse information for higher compression ratio. To further excavate the redundancy parameters in checkpoints, we then propose a weight-momentum joint shrinking method to utilize another important information during the model optimization, i.e., momentum. In particular, we exploit the information of both model and optimizer to discard as many parameters as possible while preserving critical information to ensure optimal performance. Furthermore, we utilize non-uniform quantization to further compress the storage of checkpoints. We extensively evaluate our proposed ExCP framework on several models ranging from 410M to 7B parameters and demonstrate significant storage reduction while maintaining strong performance. For instance, we achieve approximately $70\\times$ compression for the Pythia-410M model, with the final performance being as accurate as the original model on various downstream tasks. 
Codes will be available at https://github.com/Gaffey/ExCP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenshuo Li;Xinghao Chen;Han Shu;Yehui Tang;Yunhe Wang", "authorids": "~Wenshuo_Li2;~Xinghao_Chen1;~Han_Shu1;~Yehui_Tang1;~Yunhe_Wang1", "gender": "M;M;M;M;M", "homepage": ";;;;https://www.wangyunhe.site/", "dblp": "188/0318.html;30/4937-1;30/4042;244/9659;63/8217-1", "google_scholar": "XxaX0hkAAAAJ;tuGWUVIAAAAJ;https://scholar.google.com/citations?hl=en;TkSZQ6gAAAAJ;https://scholar.google.com.sg/citations?user=isizOkYAAAAJ", "orcid": ";0000-0002-2102-8235;;;0000-0002-0142-509X", "linkedin": ";;;;", "or_profile": "~Wenshuo_Li2;~Xinghao_Chen1;~Han_Shu1;~Yehui_Tang1;~Yunhe_Wang1", "aff": "Huawei Technologies Ltd.;Huawei Noah's Ark Lab;Huawei Technologies;Huawei Technologies Ltd.;Huawei Noah's Ark Lab", "aff_domain": "huawei.com;huawei.com;huawei.com;huawei.com;huawei.com", "position": "Researcher;Researcher;research engineer;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nli2024excp,\ntitle={Ex{CP}: Extreme {LLM} Checkpoint Compression via Weight-Momentum Joint Shrinking},\nauthor={Wenshuo Li and Xinghao Chen and Han Shu and Yehui Tang and Yunhe Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hlvKd7Vdxm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1589724, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5158293089681821170&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "huawei.com;huawei.com;huawei.com;huawei.com;huawei.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei Technologies", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Theoretical Guarantees for Variational Inference with Fixed-Variance Mixture of Gaussians", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33380", "id": "hnqlgwcRxb", "proceeding": "https://proceedings.mlr.press/v235/huix24a.html", "pdf": "https://openreview.net/pdf?id=hnqlgwcRxb", "openreview": "https://openreview.net/forum?id=hnqlgwcRxb", "author_site": "Tom Huix, Anna Korba, Alain Oliviero Durmus, Eric Moulines", "tldr": "", "abstract": "Variational inference (VI) is a popular approach in Bayesian inference that looks for the best approximation of the posterior distribution within a parametric family, minimizing a loss that is (typically) the reverse Kullback-Leibler (KL) divergence. Despite its empirical success, the theoretical properties of VI have only recently received attention, and existing results are restricted to the Gaussian case. This research paper aims to contribute to the theoretical study of VI in the non-Gaussian case by investigating the setting of Mixture of Gaussians with fixed covariance. In this view, VI over this specific family can be cast as the minimization of a Mollified relative entropy, i.e., the KL between the convolution (with respect to a Gaussian kernel) of an atomic measure supported on Diracs, where the support of the atomic measure corresponds to the localization of the Gaussian components, and the target distribution.
Hence, solving variational inference is equivalent to optimizing the positions of the Diracs (the particles), which can be done through gradient descent and takes the form of an interacting particle system. We study two sources of error in variational inference in this context. The first is an optimization result: a descent lemma establishing that the algorithm decreases the objective at each iteration. The second is an approximation error that upper bounds the mollified relative entropy between an optimal finite mixture and the target distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tom Huix;Anna Korba;Alain Oliviero Durmus;Eric Moulines", "authorids": "~Tom_Huix1;~Anna_Korba2;~Alain_Oliviero_Durmus1;~Eric_Moulines1", "gender": "M;;M;M", "homepage": ";;;", "dblp": ";182/8959.html;54/2358;01/11275", "google_scholar": ";https://scholar.google.fr/citations?user=dbH6E3kAAAAJ;https://scholar.google.fr/citations?user=_XE1LvQAAAAJ;", "orcid": ";;0000-0002-2058-0693;", "linkedin": "tom-huix/;;;", "or_profile": "~Tom_Huix1;~Anna_Korba2;~Eric_Moulines1;~Alain_Durmus1", "aff": "\u00c9cole Polytechnique;Ensae ParisTech;Ecole polytechnique;\u00c9cole Polytechnique", "aff_domain": "polytechnique.fr;ensae.fr;polytechnique.edu;polytechnique.fr", "position": "PhD student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuix2024theoretical,\ntitle={Theoretical Guarantees for Variational Inference with Fixed-Variance Mixture of Gaussians},\nauthor={Tom Huix and Anna Korba and Alain Oliviero Durmus and Eric Moulines},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hnqlgwcRxb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 505575, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7982817056736012096&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "polytechnique.fr;ensae.fr;polytechnique.edu;polytechnique.fr", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Ecole Polytechnique;ENSAE ParisTech", "aff_unique_dep": ";", "aff_unique_url": "https://www.polytechnique.edu;https://www.ensae.fr", "aff_unique_abbr": "X;Ensae", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Behavior Generation with Latent Actions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33379", "id": "hoVwecMqV5", "proceeding": "https://proceedings.mlr.press/v235/lee24y.html", "pdf": "https://openreview.net/pdf?id=hoVwecMqV5", "openreview": "https://openreview.net/forum?id=hoVwecMqV5", "author_site": "Seungjae Lee, Yibin Wang, Haritheja Etukuru, H. Jin Kim, Mahi Shafiullah, Lerrel Pinto", "tldr": "", "abstract": "Generative modeling of complex behaviors from labeled datasets has been a longstanding problem in decision-making. Unlike language or image generation, decision-making requires modeling actions \u2013 continuous-valued vectors that are multimodal in their distribution, potentially drawn from uncurated sources, where generation errors can compound in sequential prediction. A recent class of models called Behavior Transformers (BeT) addresses this by discretizing actions using k-means clustering to capture different modes.
However, k-means struggles to scale to high-dimensional action spaces or long sequences and lacks gradient information; as a result, BeT suffers in modeling long-range actions. In this work, we present Vector-Quantized Behavior Transformer (VQ-BeT), a versatile model for behavior generation that handles multimodal action prediction, conditional generation, and partial observations. VQ-BeT augments BeT by tokenizing continuous actions with a hierarchical vector quantization module. Across seven environments including simulated manipulation, autonomous driving, and robotics, VQ-BeT improves on state-of-the-art models such as BeT and Diffusion Policies. Importantly, we demonstrate VQ-BeT\u2019s improved ability to capture behavior modes while accelerating inference speed 5\u00d7 over Diffusion Policies. Videos can be found at https://sjlee.cc/vq-bet/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seungjae Lee;Yibin Wang;Haritheja Etukuru;H. Jin Kim;Nur Muhammad Mahi Shafiullah;Lerrel Pinto", "authorids": "~Seungjae_Lee2;~Yibin_Wang1;~Haritheja_Etukuru1;~H._Jin_Kim1;~Nur_Muhammad_Mahi_Shafiullah1;~Lerrel_Pinto1", "gender": ";M;;F;M;M", "homepage": "https://sjlee.cc;https://wyb929.github.io;https://haritheja.com;http://larr.snu.ac.kr;https://www.lerrelpinto.com/;https://mahis.life", "dblp": ";;;91/5753;168/8304;308/1737", "google_scholar": "hpR9h74AAAAJ;UOW4otcAAAAJ;;TLQUwIMAAAAJ;pmVPj94AAAAJ;vAOw6aQAAAAJ", "orcid": ";;;;;0000-0003-3617-1293", "linkedin": ";;;;;", "or_profile": "~Seungjae_Lee2;~Yibin_Wang1;~Haritheja_Etukuru1;~H._Jin_Kim1;~Lerrel_Pinto1;~Nur_Muhammad_Shafiullah1", "aff": "Seoul National University;New York University;New York University;Seoul National University;New York University;New York University", "aff_domain": "snu.ac.kr;nyu.edu;nyu.edu;snu.ac.kr;cs.nyu.edu;nyu.edu", "position": "MS student;MS student;Undergrad student;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nlee2024behavior,\ntitle={Behavior Generation with Latent Actions},\nauthor={Seungjae Lee and Yibin Wang and Haritheja Etukuru and H.
Jin Kim and Nur Muhammad Mahi Shafiullah and Lerrel Pinto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hoVwecMqV5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7627956, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7916660172945782012&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;nyu.edu;nyu.edu;snu.ac.kr;cs.nyu.edu;nyu.edu", "author_num": 6, "aff_unique_index": "0;1;1;0;1;1", "aff_unique_norm": "Seoul National University;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.nyu.edu", "aff_unique_abbr": "SNU;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Nearest Neighbour Score Estimators for Diffusion Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33378", "id": "hqNz4LDuhn", "proceeding": "https://proceedings.mlr.press/v235/niedoba24a.html", "pdf": "https://openreview.net/pdf?id=hqNz4LDuhn", "openreview": "https://openreview.net/forum?id=hqNz4LDuhn", "author_site": "Matthew Niedoba, Dylan Green, Saeid Naderiparizi, Vasileios Lioutas, Jonathan Lavington, Xiaoxuan Liang, Yunpeng Liu, Ke Zhang, Setareh Dabiri, Adam Scibior, Berend Zwartsenberg, Frank Wood", "tldr": "", "abstract": "Score function estimation is the cornerstone of both training and sampling from diffusion generative models. Despite this fact, the most commonly used estimators are either biased neural network approximations or high variance Monte Carlo estimators based on the conditional score. We introduce a novel nearest neighbour score function estimator which utilizes multiple samples from the training set to dramatically decrease estimator variance. We leverage our low variance estimator in two compelling applications. Training consistency models with our estimator, we report a significant increase in both convergence speed and sample quality. In diffusion models, we show that our estimator can replace a learned network for probability-flow ODE integration, opening promising new avenues of future research. 
Code will be released upon paper acceptance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthew Niedoba;Dylan Green;Saeid Naderiparizi;Vasileios Lioutas;Jonathan Wilder Lavington;Xiaoxuan Liang;Yunpeng Liu;Ke Zhang;Setareh Dabiri;Adam Scibior;Berend Zwartsenberg;Frank Wood", "authorids": "~Matthew_Niedoba2;~Dylan_Green1;~Saeid_Naderiparizi1;~Vasileios_Lioutas1;~Jonathan_Wilder_Lavington1;~Xiaoxuan_Liang2;~Yunpeng_Liu1;~Ke_Zhang23;~Setareh_Dabiri2;~Adam_Scibior1;~Berend_Zwartsenberg1;~Frank_Wood2", "gender": "M;M;M;M;M;F;M;;;;M;M", "homepage": ";;https://www.cs.ubc.ca/~saeidnp/;http://www.vlioutas.com/;https://wilderlavington.github.io/;;;;;https://www.cs.ubc.ca/~ascibior/;https://bzwartsenberg.github.io/;http://www.robots.ox.ac.uk/~fwood/", "dblp": "243/2863;;244/9611;224/6571;282/4019;;02/8137-7.html;;;167/6446;;44/4750", "google_scholar": "uSl2vYwAAAAJ;;Ubt0dYYAAAAJ;2jhOrwoAAAAJ;;;;;;https://scholar.google.co.uk/citations?user=Gpw8Z0cAAAAJ;;d4yNzXIAAAAJ", "orcid": ";;;;;;;;;;;", "linkedin": ";dylangreen90/;saeidnp;vasileioslioutas/;;xiaoxuan-liang-4451a4171/;larry-liu-323b51126/;zhangk15/;;;;frank-wood-43529114?trk=hp-identity-name", "or_profile": "~Matthew_Niedoba2;~Dylan_Green1;~Saeid_Naderiparizi1;~Vasileios_Lioutas1;~Jonathan_Wilder_Lavington1;~Xiaoxuan_Liang2;~Yunpeng_Liu1;~Ke_Zhang23;~Setareh_Dabiri2;~Adam_Scibior1;~Berend_Zwartsenberg1;~Frank_Wood2", "aff": "Inverted AI;University of British Columbia;University of British Columbia;University of British Columbia;;University of British Columbia;University of British Columbia;Inverted AI;;Inverted AI;Inverted AI;University of British Columbia", "aff_domain": "inverted.ai;cs.ubc.ca;ubc.ca;ubc.ca;;cs.ubc.ca;cs.ubc.ca;inverted.ai;;inverted.ai;inverted.ai;cs.ubc.ca", "position": "Researcher;PhD student;PhD student;PhD student;;PhD student;PhD student;Researcher;;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nniedoba2024nearest,\ntitle={Nearest Neighbour Score Estimators for Diffusion Generative Models},\nauthor={Matthew Niedoba and Dylan Green and Saeid Naderiparizi and Vasileios Lioutas and Jonathan Wilder Lavington and Xiaoxuan Liang and Yunpeng Liu and Ke Zhang and Setareh Dabiri and Adam Scibior and Berend Zwartsenberg and Frank Wood},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hqNz4LDuhn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3629198, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18186586412618692876&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "inverted.ai;cs.ubc.ca;ubc.ca;ubc.ca;;cs.ubc.ca;cs.ubc.ca;inverted.ai;;inverted.ai;inverted.ai;cs.ubc.ca", "author_num": 12, "aff_unique_index": "0;1;1;1;1;1;0;0;0;1", "aff_unique_norm": "Inverted AI;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.inverted.ai;https://www.ubc.ca", "aff_unique_abbr": "Inverted AI;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1;0;0;0;1", "aff_country_unique": "United States;Canada" }, { "title": "Stochastic positional embeddings improve masked image modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33377", "id": "hr8OXXMb7a", "proceeding": "https://proceedings.mlr.press/v235/bar24a.html", "pdf": "https://openreview.net/pdf?id=hr8OXXMb7a", 
"openreview": "https://openreview.net/forum?id=hr8OXXMb7a", "author_site": "Amir Bar, Florian Bordes, Assaf Shocher, Mahmoud Assran, Pascal Vincent, Nicolas Ballas, Trevor Darrell, Amir Globerson, Yann LeCun", "tldr": "", "abstract": "Masked Image Modeling (MIM) is a promising self-supervised learning approach that enables learning from unlabeled images. Despite its recent success, learning good representations through MIM remains challenging because it requires predicting the right semantic content in accurate locations. For example, given an incomplete picture of a dog, we can guess that there is a tail, but we cannot determine its exact location. In this work, we propose to incorporate location uncertainty to MIM by using stochastic positional embeddings (StoP). Specifically, we condition the model on stochastic masked token positions drawn from a gaussian distribution. We show that using StoP reduces overfitting to location features and guides the model toward learning features that are more robust to location uncertainties. Quantitatively, using StoP improves downstream MIM performance on a variety of downstream tasks. For example, linear probing on ImageNet using ViT-B is improved by $+1.7\\%$, and by $2.5\\%$ for ViT-H using 1% of the data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amir Bar;Florian Bordes;Assaf Shocher;Mido Assran;Pascal Vincent;Nicolas Ballas;Trevor Darrell;Amir Globerson;Yann LeCun", "authorids": "~Amir_Bar1;~Florian_Bordes1;~Assaf_Shocher1;~Mido_Assran1;~Pascal_Vincent1;~Nicolas_Ballas1;~Trevor_Darrell2;~Amir_Globerson1;~Yann_LeCun1", "gender": "M;M;M;M;;M;M;M;M", "homepage": "http://amirbar.net;;https://assafshocher.github.io/;http://www.iro.umontreal.ca/~vincentp;;http://www.cs.tau.ac.il/~gamir/;http://yann.lecun.com;http://www.midoassran.ca/;https://people.eecs.berkeley.edu/~trevor/", "dblp": "73/11011;194/9862;211/8006;43/861;120/9066;08/4162.html;l/YannLeCun;216/2717;d/TrevorDarrell", "google_scholar": "L__n1LUAAAAJ;OADfWhUAAAAJ;https://scholar.google.co.il/citations?user=ndRmNK8AAAAJ;WBCKQMsAAAAJ;euUV4iUAAAAJ;https://scholar.google.com.tw/citations?user=5JserkUAAAAJ;WLN3QrAAAAAJ;gcQTTvkAAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": ";;;;;;;0000-0001-9159-8447;", "linkedin": ";florianbordes;;;;;;;", "or_profile": "~Amir_Bar1;~Florian_Bordes1;~Assaf_Shocher1;~Pascal_Vincent1;~Nicolas_Ballas1;~Amir_Globerson1;~Yann_LeCun1;~Mahmoud_Assran1;~trevor_darrell1", "aff": "Tel Aviv University;Meta;University of California, Berkeley;Facebook A.I. 
Research;Meta;Tel Aviv University;New York University;Meta;Electrical Engineering & Computer Science Department", "aff_domain": "tau.ac.il;meta.com;berkeley.edu;fb.com;meta.com;tau.ac.il;nyu.edu;meta.com;eecs.berkeley.edu", "position": "PhD student;Researcher;Postdoc;Research Scientist;Researcher;Associate Professor;Full Professor;Research Scientist;Professor", "bibtex": "@inproceedings{\nbar2024stochastic,\ntitle={Stochastic positional embeddings improve masked image modeling},\nauthor={Amir Bar and Florian Bordes and Assaf Shocher and Mido Assran and Pascal Vincent and Nicolas Ballas and Trevor Darrell and Amir Globerson and Yann LeCun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hr8OXXMb7a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2895987, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10336787508099031308&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "tau.ac.il;meta.com;berkeley.edu;fb.com;meta.com;tau.ac.il;nyu.edu;meta.com;eecs.berkeley.edu", "author_num": 9, "aff_unique_index": "0;1;2;1;1;0;3;1;4", "aff_unique_norm": "Tel Aviv University;Meta;University of California, Berkeley;New York University;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";Meta Platforms, Inc.;;;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.tau.ac.il;https://meta.com;https://www.berkeley.edu;https://www.nyu.edu;", "aff_unique_abbr": "TAU;Meta;UC Berkeley;NYU;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1;1;1;0;1;1", "aff_country_unique": "Israel;United States;" }, { "title": "Truly No-Regret Learning in Constrained MDPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33376", "id": "hrWte3nlzr", "proceeding": "https://proceedings.mlr.press/v235/muller24b.html", "pdf": "https://openreview.net/pdf?id=hrWte3nlzr", "openreview": "https://openreview.net/forum?id=hrWte3nlzr", "author_site": "Adrian M\u00fcller, Pragnya Alatur, Volkan Cevher, Giorgia Ramponi, Niao He", "tldr": "", "abstract": "Constrained Markov decision processes (CMDPs) are a common way to model safety constraints in reinforcement learning. State-of-the-art methods for efficiently solving CMDPs are based on primal-dual algorithms. For these algorithms, all currently known regret bounds allow for *error cancellations* --- one can compensate for a constraint violation in one round with a strict constraint satisfaction in another. This makes the online learning process unsafe since it only guarantees safety for the final (mixture) policy but not during learning. As Efroni et al. (2020) pointed out, it is an open question whether primal-dual algorithms can provably achieve sublinear regret if we do not allow error cancellations. In this paper, we give the first affirmative answer. We first generalize a result on last-iterate convergence of regularized primal-dual schemes to CMDPs with multiple constraints. Building upon this insight, we propose a model-based primal-dual algorithm to learn in an unknown CMDP. 
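The distinction between the two regret notions is easy to state in code. Below is a small numerical sketch (our illustration, not the paper's algorithm): with error cancellations, strict satisfaction in one round offsets a violation in another; without cancellations, every violation counts, so safety must hold throughout learning.

```python
import numpy as np

# Sketch of the two notions of constraint regret; `violations[t]` is the
# signed constraint violation of the policy played in round t (positive =
# constraint violated, negative = strict satisfaction).
rng = np.random.default_rng(0)
violations = rng.normal(0.0, 1.0, size=1000)

# Cancellations allowed: slack in some rounds offsets violations in others.
cancelling_regret = max(violations.sum(), 0.0)

# No cancellations: every violation counts toward the regret.
strict_regret = np.maximum(violations, 0.0).sum()

print(cancelling_regret <= strict_regret)  # always True
```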
We prove that our algorithm achieves sublinear regret without error cancellations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adrian M\u00fcller;Pragnya Alatur;Volkan Cevher;Giorgia Ramponi;Niao He", "authorids": "~Adrian_M\u00fcller2;~Pragnya_Alatur1;~Volkan_Cevher1;~Giorgia_Ramponi1;~Niao_He3", "gender": "M;F;M;F;", "homepage": "https://adrianlmueller.github.io;;http://lions.epfl.ch;https://gioramponi.github.io/;http://people.inf.ethz.ch/niaohe", "dblp": "25/4454-2;236/5721;70/5301;186/4493;https://dblp.uni-trier.de/pers/h/He:Niao.html", "google_scholar": ";B9FSj24AAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ;xbIAH5gAAAAJ;iNcA81MAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Adrian_M\u00fcller2;~Pragnya_Alatur1;~Volkan_Cevher1;~Giorgia_Ramponi1;~Niao_He1", "aff": "ETHZ - ETH Zurich;Department of Computer Science, ETHZ - ETH Zurich;Amazon Development Center Germany;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;inf.ethz.ch;amazon.de;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Amazon Scholar;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nm{\\\"u}ller2024truly,\ntitle={Truly No-Regret Learning in Constrained {MDP}s},\nauthor={Adrian M{\\\"u}ller and Pragnya Alatur and Volkan Cevher and Giorgia Ramponi and Niao He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hrWte3nlzr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1667446, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2195265774705051101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ethz.ch;inf.ethz.ch;amazon.de;ethz.ch;ethz.ch", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "ETH Zurich;Amazon;Swiss Federal Institute of Technology", "aff_unique_dep": ";Development Center;", "aff_unique_url": "https://www.ethz.ch;https://www.amazon.de;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;Amazon;ETH Zurich", "aff_campus_unique_index": "1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "Prompt-tuning Latent Diffusion Models for Inverse Problems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33375", "id": "hrwIndai8e", "proceeding": "https://proceedings.mlr.press/v235/chung24b.html", "pdf": "https://openreview.net/pdf?id=hrwIndai8e", "openreview": "https://openreview.net/forum?id=hrwIndai8e", "author_site": "Hyungjin Chung, Jong Chul YE, Peyman Milanfar, Mauricio Delbracio", "tldr": "", "abstract": "We propose a new method for solving imaging inverse problems using text-to-image latent diffusion models as general priors. Existing methods using latent diffusion models for inverse problems typically rely on simple null text prompts, which can lead to suboptimal performance. To improve upon this, we introduce a method for prompt tuning, which jointly optimizes the text embedding on-the-fly while running the reverse diffusion. This allows us to generate images that are more faithful to the diffusion prior. Specifically, our approach involves a unified optimization framework that simultaneously considers the prompt, latent, and pixel values through alternating minimization. 
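A minimal sketch of such an alternating scheme is below; `denoiser`, `decode`, and `degradation` are hypothetical stand-ins for the latent diffusion components and the known forward operator, not the P2L implementation.

```python
import torch

# Toy stand-ins (assumed, not the released P2L code):
def denoiser(z_t, t, prompt_emb):          # predicts a clean latent z0
    return z_t - 0.1 * t * prompt_emb.mean()

def decode(z0):                            # latent -> pixel space
    return torch.tanh(z0)

def degradation(x):                        # known forward operator (e.g. blur)
    return 0.5 * x

y = torch.randn(16)                        # observed measurement
z_t = torch.randn(16)                      # current noisy latent
prompt_emb = torch.zeros(8, requires_grad=True)
opt = torch.optim.Adam([prompt_emb], lr=1e-2)

for t in reversed(range(1, 50)):
    # prompt step: tune the text embedding for measurement consistency
    for _ in range(3):
        z0_hat = denoiser(z_t, t, prompt_emb)
        loss = ((degradation(decode(z0_hat)) - y) ** 2).sum()
        opt.zero_grad(); loss.backward(); opt.step()
    # latent step: one reverse-diffusion update under the tuned prompt
    with torch.no_grad():
        z_t = denoiser(z_t, t, prompt_emb) + 0.01 * torch.randn(16)
```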
This significantly diminishes image artifacts - a major problem when using latent diffusion models instead of pixel-based diffusion ones. Our method, called P2L, outperforms both pixel- and latent-diffusion model-based inverse problem solvers on a variety of tasks, such as super-resolution, deblurring, and inpainting. Furthermore, P2L demonstrates remarkable scalability to higher resolutions without artifacts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyungjin Chung;Jong Chul Ye;Peyman Milanfar;Mauricio Delbracio", "authorids": "~Hyungjin_Chung1;~Jong_Chul_Ye1;~Peyman_Milanfar1;~Mauricio_Delbracio1", "gender": "M;M;M;M", "homepage": "https://www.hj-chung.com/;https://bispl.weebly.com/;http://www.milanfar.org;", "dblp": "262/0382;15/5613;48/6882;90/10811", "google_scholar": "https://scholar.google.co.kr/citations?user=KdchEyoAAAAJ;HNMjoNEAAAAJ;iGzDl8IAAAAJ;lDDm920AAAAJ", "orcid": "0000-0003-3202-0893;;;", "linkedin": "hyungjin-chung-060b42148/;;;", "or_profile": "~Hyungjin_Chung1;~Jong_Chul_Ye1;~Peyman_Milanfar1;~Mauricio_Delbracio1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Google;Google", "aff_domain": "kaist.ac.kr;kaist.ac.kr;google.com;google.com", "position": "PhD student;Full Professor;Distinguished Scientist;Research Scientist", "bibtex": "@inproceedings{\nchung2024prompttuning,\ntitle={Prompt-tuning Latent Diffusion Models for Inverse Problems},\nauthor={Hyungjin Chung and Jong Chul Ye and Peyman Milanfar and Mauricio Delbracio},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hrwIndai8e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8657271, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13591335290787932252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "kaist.ac.kr;kaist.ac.kr;google.com;google.com", "author_num": 4, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.kaist.ac.kr;https://www.google.com", "aff_unique_abbr": "KAIST;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "RMIB: Representation Matching Information Bottleneck for Matching Text Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33374", "id": "hsHIxrnrMx", "proceeding": "https://proceedings.mlr.press/v235/pan24f.html", "pdf": "https://openreview.net/pdf?id=hsHIxrnrMx", "openreview": "https://openreview.net/forum?id=hsHIxrnrMx", "author_site": "Haihui Pan, zhifang Liao, Wenrui Xie, Kun Han", "tldr": "", "abstract": "Recent studies have shown that matching the domains of text representations helps improve the generalization ability of asymmetrical domains text matching tasks. This requires that the distributions of text representations be as similar as possible, as in matching across heterogeneous data domains, in order to make the data indistinguishable after feature extraction. However, how to match the distributions of text representations remains an open question, and the role of this distribution matching is still unclear. 
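One simple way to "narrow" two text representations toward a common prior, in the variational-IB spirit described next, is to penalize each representation's KL divergence to a standard normal alongside the task loss; the sketch below is our illustration, not the RMIB reference code.

```python
import torch
import torch.nn.functional as F

def kl_to_std_normal(mu, logvar):
    # KL( N(mu, exp(logvar)) || N(0, I) ), averaged over the batch
    return 0.5 * (mu.pow(2) + logvar.exp() - 1.0 - logvar).sum(-1).mean()

B, D = 32, 64
mu_a, logvar_a = torch.randn(B, D), torch.zeros(B, D)  # representation of text A
mu_b, logvar_b = torch.randn(B, D), torch.zeros(B, D)  # representation of text B
logits = (mu_a * mu_b).sum(-1)                         # interaction between texts
labels = torch.randint(0, 2, (B,)).float()             # match / no-match targets

beta = 1e-3  # weight of the distribution-matching terms (assumed value)
loss = F.binary_cross_entropy_with_logits(logits, labels) + beta * (
    kl_to_std_normal(mu_a, logvar_a) + kl_to_std_normal(mu_b, logvar_b))
```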
In this work, we explicitly narrow the distribution of text representations by matching them with the same prior distribution. We theoretically prove that narrowing the distribution of text representations in asymmetrical domains text matching is equivalent to optimizing the information bottleneck (IB). However, although the interaction between text representations plays an important role in asymmetrical domains text matching, IB does not restrict this interaction. Therefore, we propose the adequacy of interaction and the incompleteness of a single text representation on the basis of IB and obtain the representation matching information bottleneck (RMIB). We theoretically prove that the constraints on text representations in RMIB are equivalent to maximizing the mutual information between text representations on the premise that the task information is given. On four text matching models and five text matching datasets, we verify that RMIB can improve the performance of asymmetrical domains text matching. Our experimental code is available at https://github.com/chenxingphh/rmib.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haihui Pan;Zhifang Liao;Wenrui Xie;Kun Han", "authorids": "~Haihui_Pan1;~Zhifang_Liao1;~Wenrui_Xie1;~Kun_Han2", "gender": "M;F;;M", "homepage": "https://github.com/chenxingphh;https://faculty.csu.edu.cn/liaozhifang/en/index.htm;https://sm1les.com;", "dblp": "255/5454.html;42/2640.html;294/5512;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;kun-han-ab1ab516", "or_profile": "~Haihui_Pan1;~Zhifang_Liao1;~Wenrui_Xie1;~Kun_Han2", "aff": "Cheetah Mobile;Central South University;Baidu;", "aff_domain": "cmcm.com;csu.edu.cn;baidu.com;", "position": "Researcher;Full Professor;Researcher;", "bibtex": "@inproceedings{\npan2024rmib,\ntitle={{RMIB}: Representation Matching Information Bottleneck for Matching Text Representations},\nauthor={Haihui Pan and Zhifang Liao and Wenrui Xie and Kun Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hsHIxrnrMx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8QbJs5Zkvi4J:scholar.google.com/&scioq=RMIB:+Representation+Matching+Information+Bottleneck+for+Matching+Text+Representations&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "cmcm.com;csu.edu.cn;baidu.com;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Cheetah Mobile;Central South University;Baidu", "aff_unique_dep": ";;Baidu, Inc.", "aff_unique_url": "https://www.cheetahmobile.com;https://www.csu.edu.cn;https://www.baidu.com", "aff_unique_abbr": "Cheetah Mobile;CSU;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On the Tractability of SHAP Explanations under Markovian Distributions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33373", "id": "htq0FbPOsY", "proceeding": "https://proceedings.mlr.press/v235/marzouk24a.html", "pdf": "https://openreview.net/pdf?id=htq0FbPOsY", "openreview": "https://openreview.net/forum?id=htq0FbPOsY", "author_site": "Reda Marzouk, De la Higuera", "tldr": "", "abstract": "Thanks to its solid theoretical foundation, the SHAP framework is arguably one of the most widely utilized frameworks for local
explainability of ML models. Despite its popularity, its exact computation is known to be very challenging, proven to be NP-Hard in various configurations. Recent works have unveiled positive complexity results regarding the computation of the SHAP score for specific model families, encompassing decision trees, random forests, and some classes of boolean circuits. Yet, all these positive results hinge on the assumption of feature independence, often simplistic in real-world scenarios. In this article, we investigate the computational complexity of the SHAP score by relaxing this assumption and introducing a Markovian perspective. We show that, under the Markovian assumption, computing the SHAP score for the class of Weighted automata, Disjoint DNFs and Decision Trees can be performed in polynomial time, offering a first positive complexity result for the problem of SHAP score computation that transcends the limitations of the feature independence assumption.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Reda Marzouk;Colin de La Higuera", "authorids": "~Reda_Marzouk1;cdlh@univ-nantes.fr", "gender": "M;", "homepage": "https://www.univ-nantes.fr/mohamed-reda-marzouk;", "dblp": "262/3782.html;", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Reda_Marzouk1;cdlh@univ-nantes.fr", "aff": "Universit\u00e9 de Nantes;", "aff_domain": "univ-nantes.fr;", "position": "PhD student;", "bibtex": "@inproceedings{\nmarzouk2024on,\ntitle={On the Tractability of {SHAP} Explanations under Markovian Distributions},\nauthor={Reda Marzouk and Colin de La Higuera},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=htq0FbPOsY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 499752, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8976234194602132443&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "univ-nantes.fr;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Universit\u00e9 de Nantes", "aff_unique_dep": "", "aff_unique_url": "https://www.univ-nantes.fr", "aff_unique_abbr": "Univ. Nantes", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "title": "Energy-Guided Diffusion Sampling for Offline-to-Online Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33372", "id": "hunSEjeCPE", "proceeding": "https://proceedings.mlr.press/v235/liu24ao.html", "pdf": "https://openreview.net/pdf?id=hunSEjeCPE", "openreview": "https://openreview.net/forum?id=hunSEjeCPE", "author_site": "Xu-Hui Liu, Tian-Shuo Liu, Shengyi Jiang, Ruifeng Chen, Zhilong Zhang, Xinwei Chen, Yang Yu", "tldr": "", "abstract": "Combining offline and online reinforcement learning (RL) techniques is indeed crucial for achieving efficient and safe learning where data acquisition is expensive. Existing methods replay offline data directly in the online phase, resulting in a significant challenge of data distribution shift and subsequently causing inefficiency in online fine-tuning. To address this issue, we introduce an innovative approach, **E**nergy-guided **DI**ffusion **S**ampling (EDIS), which utilizes a diffusion model to extract prior knowledge from the offline dataset and employs energy functions to distill this knowledge for enhanced data generation in the online phase. 
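As a rough picture of energy-guided generation (our sketch with stand-in functions, not the released EDIS code), one can run Langevin steps whose drift combines the diffusion prior's score with the gradient of the learned energy:

```python
import torch

def prior_score(x):                  # stand-in for the diffusion model's score
    return -x                        # score of a standard normal prior

def energy(x):                       # stand-in for the learned energy function
    return ((x - 1.0) ** 2).sum(-1)

x = torch.randn(128, 4)
step = 0.01
for _ in range(500):
    x = x.detach().requires_grad_(True)
    grad_e, = torch.autograd.grad(energy(x).sum(), x)
    # Langevin dynamics on the energy-tilted prior p(x) * exp(-E(x))
    x = x + step * (prior_score(x) - grad_e) \
          + (2 * step) ** 0.5 * torch.randn_like(x)
# x now approximates samples re-weighted toward low-energy regions
```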
The theoretical analysis demonstrates that EDIS exhibits reduced suboptimality compared to solely utilizing online data or directly reusing offline data. EDIS is a plug-in approach and can be combined with existing methods in the offline-to-online RL setting. By applying EDIS to the off-the-shelf methods Cal-QL and IQL, we observe a notable 20% average improvement in empirical performance on MuJoCo, AntMaze, and Adroit environments. Code is available at https://github.com/liuxhym/EDIS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xu-Hui Liu;Tian-Shuo Liu;Shengyi Jiang;Ruifeng Chen;Zhilong Zhang;Xinwei Chen;Yang Yu", "authorids": "~Xu-Hui_Liu1;~Tian-Shuo_Liu1;~Shengyi_Jiang2;~Ruifeng_Chen1;~Zhilong_Zhang2;~Xinwei_Chen3;~Yang_Yu5", "gender": ";M;M;M;M;;", "homepage": "http://www.lamda.nju.edu.cn/liuxh/;https://github.com/LTSure;http://www.lamda.nju.edu.cn/jiangsy;http://www.lamda.nju.edu.cn/chenrf/;http://www.lamda.nju.edu.cn/zhangzl/;;", "dblp": "292/7577;;67/3929;https://dblp.uni-trier.de/pid/58/10097-3;;;", "google_scholar": ";;;;;;", "orcid": ";;0000-0002-4443-0753;;;;", "linkedin": ";;;;;;", "or_profile": "~Xu-Hui_Liu1;~Tian-Shuo_Liu1;~Shengyi_Jiang2;~Ruifeng_Chen1;~Zhilong_Zhang2;~Xinwei_Chen3;~Yang_Yu5", "aff": "Nanjing University;Nanjing university;The University of Hong Kong;Nanjing University;Nanjing University;;", "aff_domain": "nju.edu.cn;nju.edu.cn;hku.hk;nju.edu.cn;nju.edu.cn;;", "position": "PhD student;MS student;PhD student;PhD student;MS student;;", "bibtex": "@inproceedings{\nliu2024energyguided,\ntitle={Energy-Guided Diffusion Sampling for Offline-to-Online Reinforcement Learning},\nauthor={Xu-Hui Liu and Tian-Shuo Liu and Shengyi Jiang and Ruifeng Chen and Zhilong Zhang and Xinwei Chen and Yang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hunSEjeCPE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 601192, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:A39tVU9KYjMJ:scholar.google.com/&scioq=Energy-Guided+Diffusion+Sampling+for+Offline-to-Online+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;hku.hk;nju.edu.cn;nju.edu.cn;;", "author_num": 7, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Nanjing University;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.nju.edu.cn;https://www.hku.hk", "aff_unique_abbr": "Nanjing U;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "LLM and Simulation as Bilevel Optimizers: A New Paradigm to Advance Physical Scientific Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33371", "id": "hz8cFsdz7P", "proceeding": "https://proceedings.mlr.press/v235/ma24m.html", "pdf": "https://openreview.net/pdf?id=hz8cFsdz7P", "openreview": "https://openreview.net/forum?id=hz8cFsdz7P", "author_site": "Pingchuan Ma, Johnson Tsun-Hsuan Wang, Minghao Guo, Zhiqing Sun, Josh Tenenbaum, Daniela Rus, Chuang Gan, Wojciech Matusik", "tldr": "", "abstract": "Large Language Models have recently gained significant attention in scientific discovery for their extensive knowledge and advanced reasoning capabilities. 
However, they encounter challenges in effectively simulating observational feedback and grounding it with language to propel advancements in physical scientific discovery. Conversely, human scientists undertake scientific discovery by formulating hypotheses, conducting experiments, and revising theories through observational analysis. Inspired by this, we propose to enhance the knowledge-driven, abstract reasoning abilities of LLMs with the computational strength of simulations. We introduce Scientific Generative Agent (SGA), a bilevel optimization framework: LLMs act as knowledgeable and versatile thinkers, proposing scientific hypotheses and reasoning about discrete components, such as physics equations or molecule structures; meanwhile, simulations function as experimental platforms, providing observational feedback and optimizing via differentiability for continuous parts, such as physical parameters. We conduct extensive experiments to demonstrate our framework's efficacy in constitutive law discovery and molecular design, unveiling novel solutions that differ from conventional human expectations yet remain coherent upon analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pingchuan Ma;Tsun-Hsuan Wang;Minghao Guo;Zhiqing Sun;Joshua B. Tenenbaum;Daniela Rus;Chuang Gan;Wojciech Matusik", "authorids": "~Pingchuan_Ma3;~Tsun-Hsuan_Wang2;~Minghao_Guo1;~Zhiqing_Sun1;~Joshua_B._Tenenbaum1;~Daniela_Rus1;~Chuang_Gan1;~Wojciech_Matusik2", "gender": "M;M;M;M;;F;M;M", "homepage": "https://people.csail.mit.edu/pcma;https://zswang666.github.io/;https://www.minghaoguo.com/;https://www.cs.cmu.edu/~zhiqings/;;https://www.csail.mit.edu/person/daniela-rus;http://people.csail.mit.edu/ganchuang/;https://cdfg.mit.edu/wojciech", "dblp": "215/4446-2;217/1809.html;145/0008/;211/7692;t/JoshuaBTenenbaum;r/DanielaRus;139/6993;", "google_scholar": "EtCZmkwAAAAJ;xE3WSuYAAAAJ;Hq2unJcAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;PTeSCbIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;0000-0003-0212-5643", "linkedin": ";;;zhiqing-sun-5781b3100/;;;;wojciech-matusik-67238126/", "or_profile": "~Pingchuan_Ma3;~Tsun-Hsuan_Wang2;~Minghao_Guo1;~Zhiqing_Sun1;~Joshua_B._Tenenbaum1;~Daniela_Rus1;~Chuang_Gan1;~Wojciech_Matusik2", "aff": "Massachusetts Institute of Technology;Liquid AI;Massachusetts Institute of Technology;Carnegie Mellon University;Massachusetts Institute of Technology;Massachusetts Institute of Technology;University of Massachusetts at Amherst;Massachusetts Institute of Technology", "aff_domain": "mit.edu;liquid.ai;mit.edu;cs.cmu.edu;mit.edu;mit.edu;umass.edu;mit.edu", "position": "PhD student;Researcher;PhD student;PhD student;Professor;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nma2024llm,\ntitle={{LLM} and Simulation as Bilevel Optimizers: A New Paradigm to Advance Physical Scientific Discovery},\nauthor={Pingchuan Ma and Tsun-Hsuan Wang and Minghao Guo and Zhiqing Sun and Joshua B. 
Tenenbaum and Daniela Rus and Chuang Gan and Wojciech Matusik},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hz8cFsdz7P}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1155831, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3250181951531442633&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;liquid.ai;mit.edu;cs.cmu.edu;mit.edu;mit.edu;umass.edu;mit.edu", "author_num": 8, "aff_unique_index": "0;1;0;2;0;0;3;0", "aff_unique_norm": "Massachusetts Institute of Technology;Liquid AI;Carnegie Mellon University;University of Massachusetts Amherst", "aff_unique_dep": ";;;", "aff_unique_url": "https://web.mit.edu;;https://www.cmu.edu;https://www.umass.edu", "aff_unique_abbr": "MIT;;CMU;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;0;0;0;0;0;0", "aff_country_unique": "United States;Unknown" }, { "title": "Self-Correcting Self-Consuming Loops for Generative Model Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33370", "id": "i0nVanexij", "proceeding": "https://proceedings.mlr.press/v235/gillman24a.html", "pdf": "https://openreview.net/pdf?id=i0nVanexij", "openreview": "https://openreview.net/forum?id=i0nVanexij", "author_site": "Nate Gillman, Michael Freeman, Daksh Aggarwal, Chia-Hong HSU, Calvin Luo, Yonglong Tian, Chen Sun", "tldr": "", "abstract": "As synthetic data becomes higher quality and proliferates on the internet, machine learning models are increasingly trained on a mix of human- and machine-generated data. Despite the success stories of using synthetic data for representation learning, using synthetic data for generative model training creates ``self-consuming loops'' which may lead to training instability or even collapse, unless certain conditions are met. Our paper aims to stabilize self-consuming generative model training. Our theoretical results demonstrate that by introducing an idealized correction function, which maps a data point to be more likely under the true data distribution, self-consuming loops can be made *exponentially* more stable. We then propose self-correction functions, which rely on expert knowledge (e.g. the laws of physics programmed in a simulator), and aim to approximate the idealized corrector automatically and at scale. 
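The stabilizing effect of a corrector is visible even in a one-dimensional toy loop (our sketch, not the paper's human-motion experiments): a Gaussian "model" retrained on a mix of real data and its own corrected samples stays anchored to the true distribution.

```python
import numpy as np

rng = np.random.default_rng(0)
real = rng.normal(0.0, 1.0, size=2000)   # true data: N(0, 1)
mu, sigma = real.mean(), real.std()

def correct(x, strength=0.5, true_mean=0.0):
    # idealized corrector: shift samples to be more likely under the truth
    return x - strength * (x.mean() - true_mean)

for generation in range(20):
    synthetic = rng.normal(mu, sigma, size=2000)        # model consumes itself
    data = np.concatenate([real, correct(synthetic)])   # corrected mixture
    mu, sigma = data.mean(), data.std()                 # "retrain" the model

print(mu, sigma)  # remains close to (0, 1) rather than drifting
```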
We empirically validate the effectiveness of self-correcting self-consuming loops on the challenging human motion synthesis task, and observe that it successfully avoids model collapse, even when the ratio of synthetic data to real data is as high as 100%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nate Gillman;Michael Freeman;Daksh Aggarwal;Chia-Hong HSU;Calvin Luo;Yonglong Tian;Chen Sun", "authorids": "~Nate_Gillman1;michael_freeman@alumni.brown.edu;~Daksh_Aggarwal1;~Chia-Hong_HSU1;~Calvin_Luo2;~Yonglong_Tian1;~Chen_Sun1", "gender": ";;M;M;M;;M", "homepage": "https://www.nategillman.com/;;https://dakshces.github.io;https://swimmincatt35.github.io;https://calvinyluo.com/;http://people.csail.mit.edu/yonglong/;https://chensun.me", "dblp": "299/8280;;368/5329;21/10870;;151/6328;01/6072-2", "google_scholar": "twg9zD0AAAAJ;;uP-gVQYAAAAJ;1xEUDBcAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=OsP7JHAAAAAJ;vQa7heEAAAAJ", "orcid": ";;;;;;", "linkedin": "ngillman/;;;;;;", "or_profile": "~Nate_Gillman1;michael_freeman@alumni.brown.edu;~Daksh_Aggarwal1;~Chia-Hong_HSU1;~Calvin_Luo2;~Yonglong_Tian1;~Chen_Sun1", "aff": "Brown University;;Brown University;Brown University;Brown University;Google;Google", "aff_domain": "brown.edu;;brown.edu;brown.edu;brown.edu;google.com;google.com", "position": "PhD student;;PhD student;MS student;PhD student;Researcher;Research Scientist", "bibtex": "@inproceedings{\ngillman2024selfcorrecting,\ntitle={Self-Correcting Self-Consuming Loops for Generative Model Training},\nauthor={Nate Gillman and Michael Freeman and Daksh Aggarwal and Chia-Hong HSU and Calvin Luo and Yonglong Tian and Chen Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=i0nVanexij}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9718152, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12358886438333460244&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "brown.edu;;brown.edu;brown.edu;brown.edu;google.com;google.com", "author_num": 7, "aff_unique_index": "0;0;0;0;1;1", "aff_unique_norm": "Brown University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.brown.edu;https://www.google.com", "aff_unique_abbr": "Brown;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Auto-Regressive Next-Token Predictors are Universal Learners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33369", "id": "i56plqPpEa", "proceeding": "https://proceedings.mlr.press/v235/malach24a.html", "pdf": "https://openreview.net/pdf?id=i56plqPpEa", "openreview": "https://openreview.net/forum?id=i56plqPpEa", "author_site": "Eran Malach", "tldr": "", "abstract": "Large language models display remarkable capabilities in logical and mathematical reasoning, allowing them to solve complex tasks. Interestingly, these abilities emerge in networks trained on the simple task of next-token prediction. In this work, we present a theoretical framework for studying auto-regressive next-token predictors. 
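The flavor of the result can be seen in a toy experiment (ours, not the paper's setup): a purely linear next-token predictor trained on sequences with an intermediate chain-of-thought token, here two-digit addition written as [a, b, SEP, a+b].

```python
import torch
import torch.nn.functional as F

V, CTX = 21, 3                                    # vocab 0..20 (20 = SEP token)
W = torch.zeros(CTX * V, V, requires_grad=True)   # linear predictor: sole params
opt = torch.optim.SGD([W], lr=0.5)

def encode(ctx):                                  # one-hot context -> flat vector
    return F.one_hot(torch.tensor(ctx), V).float().flatten()

data = [([a, b, 20], a + b) for a in range(10) for b in range(10)]
xs = torch.stack([encode(c) for c, _ in data])
ys = torch.tensor([y for _, y in data])

for _ in range(500):                              # full-batch gradient descent
    loss = F.cross_entropy(xs @ W, ys)
    opt.zero_grad(); loss.backward(); opt.step()

acc = (xs @ W).argmax(-1).eq(ys).float().mean()   # rises toward 1.0 on this toy task
```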
We demonstrate that even simple models such as linear next-token predictors, trained on Chain-of-Thought (CoT) data, can approximate any function efficiently computed by a Turing machine. We introduce a new complexity measure---length complexity---which measures the number of intermediate tokens in a CoT sequence required to approximate some target function, and analyze the interplay between length complexity and other notions of complexity. Finally, we show experimentally that simple next-token predictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs), display non-trivial performance on text generation and arithmetic tasks. Our results demonstrate that the power of today's LLMs can be attributed, to a great extent, to the auto-regressive next-token training scheme, and not necessarily to a particular choice of architecture.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "eran malach", "authorids": "~eran_malach1", "gender": "M", "homepage": "", "dblp": "202/2566", "google_scholar": "I15dUOwAAAAJ", "orcid": "", "linkedin": "", "or_profile": "~eran_malach1", "aff": "Harvard University", "aff_domain": "harvard.edu", "position": "Postdoc", "bibtex": "@inproceedings{\nmalach2024autoregressive,\ntitle={Auto-Regressive Next-Token Predictors are Universal Learners},\nauthor={eran malach},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=i56plqPpEa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 450454, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8708889587357615912&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "harvard.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Rapid Learning without Catastrophic Forgetting in the Morris Water Maze", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33368", "id": "i9C4Kwm56G", "proceeding": "https://proceedings.mlr.press/v235/wang24ab.html", "pdf": "https://openreview.net/pdf?id=i9C4Kwm56G", "openreview": "https://openreview.net/forum?id=i9C4Kwm56G", "author_site": "Raymond L Wang, Jaedong Hwang, Akhilan Boopathy, Ila R. Fiete", "tldr": "", "abstract": "Animals can swiftly adapt to novel tasks, while maintaining proficiency on previously trained tasks. This contrasts starkly with machine learning models, which struggle with these capabilities. We first propose a new task, the sequential Morris Water Maze (sWM), which extends a widely used task in the psychology and neuroscience fields and requires both rapid and continual learning. It has frequently been hypothesized that inductive biases from brains could help build better ML systems, but the addition of constraints typically hurts rather than helps ML performance. 
We draw inspiration from biology to show that combining 1) a content-addressable heteroassociative memory based on the entorhinal-hippocampal circuit with grid cells that retain shared across-environment structural representations and hippocampal cells that acquire environment-specific information; 2) a spatially invariant convolutional network architecture for rapid adaptation across unfamiliar environments; and 3) the ability to perform remapping, which orthogonalizes internal representations; leads to good generalization, rapid learning, and continual learning without forgetting, respectively. Our model outperforms ANN baselines from continual learning contexts applied to the task. It retains knowledge of past environments while rapidly acquiring the skills to navigate new ones, thereby addressing the seemingly opposing challenges of quick knowledge transfer and sustaining proficiency in previously learned tasks. These biologically motivated results may point the way toward ML algorithms with similar properties.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Raymond Wang;Jaedong Hwang;Akhilan Boopathy;Ila R Fiete", "authorids": "~Raymond_Wang1;~Jaedong_Hwang1;~Akhilan_Boopathy1;~Ila_R_Fiete1", "gender": ";M;M;F", "homepage": ";https://jd730.github.io/;;https://fietelab.mit.edu/", "dblp": ";239/1982;230/8358;", "google_scholar": ";https://scholar.google.co.kr/citations?user=bITgqEUAAAAJ;;uE-CihIAAAAJ", "orcid": ";;;0000-0003-4738-2539", "linkedin": "raymond-w2/;;;", "or_profile": "~Raymond_Wang1;~Jaedong_Hwang1;~Akhilan_Boopathy1;~Ila_R_Fiete1", "aff": "University of California, Berkeley;Massachusetts Institute of Technology;Amazon;Massachusetts Institute of Technology", "aff_domain": "berkeley.edu;mit.edu;amazon.com;mit.edu", "position": "Undergrad student;PhD student;Intern;Professor", "bibtex": "@inproceedings{\nwang2024rapid,\ntitle={Rapid Learning without Catastrophic Forgetting in the Morris Water Maze},\nauthor={Raymond Wang and Jaedong Hwang and Akhilan Boopathy and Ila R Fiete},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=i9C4Kwm56G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5442963, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16521589547593330114&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "berkeley.edu;mit.edu;amazon.com;mit.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu;https://www.amazon.com", "aff_unique_abbr": "UC Berkeley;MIT;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Effectiveness of Supervision in Asymmetric Non-Contrastive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33367", "id": "iC8l9DI1ZX", "proceeding": "https://proceedings.mlr.press/v235/oh24a.html", "pdf": "https://openreview.net/pdf?id=iC8l9DI1ZX", "openreview": "https://openreview.net/forum?id=iC8l9DI1ZX", "author_site": "Jeongheon Oh, Kibok Lee", "tldr": "", "abstract": "Supervised contrastive representation learning has been shown to be effective in various transfer 
learning scenarios. However, while asymmetric non-contrastive learning (ANCL) often outperforms its contrastive learning counterpart in self-supervised representation learning, the extension of ANCL to supervised scenarios is less explored. To bridge the gap, we study ANCL for supervised representation learning, coined SupSiam and SupBYOL, leveraging labels in ANCL to achieve better representations. The proposed supervised ANCL framework improves representation learning while avoiding collapse. Our analysis reveals that providing supervision to ANCL reduces intra-class variance, and the contribution of supervision should be adjusted to achieve the best performance. Experiments demonstrate the superiority of supervised ANCL across various datasets and tasks. The code is available at: https://github.com/JH-Oh-23/Sup-ANCL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeongheon Oh;Kibok Lee", "authorids": "~Jeongheon_Oh1;~Kibok_Lee1", "gender": ";M", "homepage": "https://github.com/alan2013-github;https://ml.yonsei.ac.kr/", "dblp": ";157/3147", "google_scholar": ";6wwWRdEAAAAJ", "orcid": ";0000-0001-6995-7327", "linkedin": ";", "or_profile": "~Jeongheon_Oh1;~Kibok_Lee1", "aff": "Yonsei University;Yonsei University", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr", "position": "MS student;Assistant Professor", "bibtex": "@inproceedings{\noh2024on,\ntitle={On the Effectiveness of Supervision in Asymmetric Non-Contrastive Learning},\nauthor={Jeongheon Oh and Kibok Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iC8l9DI1ZX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2248303, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5408553163003614142&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "yonsei.ac.kr;yonsei.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Yonsei University", "aff_unique_dep": "", "aff_unique_url": "https://www.yonsei.ac.kr", "aff_unique_abbr": "Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Understanding Adam Optimizer via Online Learning of Updates: Adam is FTRL in Disguise", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33366", "id": "iE2lMjeXRR", "proceeding": "https://proceedings.mlr.press/v235/ahn24b.html", "pdf": "https://openreview.net/pdf?id=iE2lMjeXRR", "openreview": "https://openreview.net/forum?id=iE2lMjeXRR", "author_site": "Kwangjun Ahn, Zhiyu Zhang, Yunbum Kook, Yan Dai", "tldr": "", "abstract": "Despite the success of the Adam optimizer in practice, the theoretical understanding of its algorithmic components still remains limited. In particular, most existing analyses of Adam show the convergence rate that can be simply achieved by non-adaptive algorithms like SGD. In this work, we provide a different perspective based on online learning that underscores the importance of Adam's algorithmic components. Inspired by Cutkosky et al. (2023), we consider the framework called online learning of updates/increments, where we choose the updates/increments of an optimizer based on an online learner. With this framework, the design of a good optimizer is reduced to the design of a good online learner. 
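A compact sketch of this viewpoint (our illustration; Adam's bias correction omitted): each round's increment is computed from discounted gradient sums, which can be read as the closed-form solution of an FTRL problem with an adaptive quadratic regularizer.

```python
import numpy as np

def adam_increment(grads, beta1=0.9, beta2=0.999, lr=1e-3, eps=1e-8):
    m = v = 0.0
    for g in grads:                       # discounted (exponentially weighted) sums
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g * g
    # FTRL reading: argmin_d  m*d + (sqrt(v) / (2*lr)) * d**2  =  -lr * m / sqrt(v)
    return -lr * m / (np.sqrt(v) + eps)

grads = [0.5, -0.2, 0.1, 0.4]
x = 0.0
for t in range(1, len(grads) + 1):
    x += adam_increment(grads[:t])        # the online learner picks each increment
```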
Our main observation is that Adam corresponds to a principled online learning framework called Follow-the-Regularized-Leader (FTRL). Building on this observation, we study the benefits of its algorithmic components from the online learning perspective.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kwangjun Ahn;Zhiyu Zhang;Yunbum Kook;Yan Dai", "authorids": "~Kwangjun_Ahn2;~Zhiyu_Zhang1;~Yunbum_Kook1;~Yan_Dai1", "gender": ";;;M", "homepage": "http://kjahn.mit.edu/;https://zhiyuzz.github.io/;https://yunbum-kook.github.io/;https://yandaichn.github.io/", "dblp": ";45/6271-3;;132/2047-2", "google_scholar": "z94iNtgAAAAJ;5KHfVTQAAAAJ;mWASLKEAAAAJ;gkG4z3IAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kwangjun_Ahn2;~Zhiyu_Zhang1;~Yunbum_Kook1;~Yan_Dai1", "aff": "Massachusetts Institute of Technology;Harvard University;Georgia Institute of Technology;Tsinghua University", "aff_domain": "mit.edu;harvard.edu;gatech.edu;tsinghua.edu.cn", "position": "PhD student;Postdoc;PhD student;Undergrad student", "bibtex": "@inproceedings{\nahn2024understanding,\ntitle={Understanding Adam Optimizer via Online Learning of Updates: Adam is {FTRL} in Disguise},\nauthor={Kwangjun Ahn and Zhiyu Zhang and Yunbum Kook and Yan Dai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iE2lMjeXRR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 509671, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10704314501418309571&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "mit.edu;harvard.edu;gatech.edu;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University;Georgia Institute of Technology;Tsinghua University", "aff_unique_dep": ";;;", "aff_unique_url": "https://web.mit.edu;https://www.harvard.edu;https://www.gatech.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MIT;Harvard;Georgia Tech;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Gibbs Sampling of Continuous Potentials on a Quantum Computer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33365", "id": "iGMTxygzcJ", "proceeding": "https://proceedings.mlr.press/v235/motamedi24a.html", "pdf": "https://openreview.net/pdf?id=iGMTxygzcJ", "openreview": "https://openreview.net/forum?id=iGMTxygzcJ", "author_site": "Arsalan Motamedi, Pooya Ronagh", "tldr": "", "abstract": "Gibbs sampling from continuous real-valued functions is a challenging problem of interest in machine learning. Here we leverage quantum Fourier transforms to build a quantum algorithm for this task when the function is periodic. We use the quantum algorithms for solving linear ordinary differential equations to solve the Fokker\u2013Planck equation and prepare a quantum state encoding the Gibbs distribution. We show that the efficiency of interpolation and differentiation of these functions on a quantum computer depends on the rate of decay of the Fourier coefficients of the Fourier transform of the function. We view this property as a concentration of measure in the Fourier domain, and also provide functional analytic conditions for it. 
Our algorithm makes zeroth-order queries to a quantum oracle of the function and achieves polynomial quantum speedups in mean estimation in the Gibbs measure for generic non-convex periodic functions. At high temperatures the algorithm also allows for exponentially improved precision in sampling from Morse functions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arsalan Motamedi;Pooya Ronagh", "authorids": "~Arsalan_Motamedi1;~Pooya_Ronagh1", "gender": "M;", "homepage": "https://arsalan-motamedi.github.io/;https://pooya-git.github.io/", "dblp": ";168/8714.html", "google_scholar": ";hDMv54oAAAAJ", "orcid": ";0000-0002-9591-9727", "linkedin": ";", "or_profile": "~Arsalan_Motamedi1;~Pooya_Ronagh1", "aff": "University of Waterloo;1QB Information Technologies (1QBit)", "aff_domain": "uwaterloo.ca;1qbit.com", "position": "MS student;Principal Researcher", "bibtex": "@inproceedings{\nmotamedi2024gibbs,\ntitle={Gibbs Sampling of Continuous Potentials on a Quantum Computer},\nauthor={Arsalan Motamedi and Pooya Ronagh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iGMTxygzcJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 926806, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5708032088192637319&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "uwaterloo.ca;1qbit.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Waterloo;1QB Information Technologies", "aff_unique_dep": ";", "aff_unique_url": "https://uwaterloo.ca;https://www.1qbit.com", "aff_unique_abbr": "UW;1QBit", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "CoLoRA: Continuous low-rank adaptation for reduced implicit neural modeling of parameterized partial differential equations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33364", "id": "iHSgfGob9j", "proceeding": "https://proceedings.mlr.press/v235/berman24b.html", "pdf": "https://openreview.net/pdf?id=iHSgfGob9j", "openreview": "https://openreview.net/forum?id=iHSgfGob9j", "author_site": "Jules Berman, Benjamin Peherstorfer", "tldr": "", "abstract": "This work introduces reduced models based on Continuous Low Rank Adaptation (CoLoRA) that pre-train neural networks for a given partial differential equation and then continuously adapt low-rank weights in time to rapidly predict the evolution of solution fields at new physics parameters and new initial conditions. The adaptation can be either purely data-driven or via an equation-driven variational approach that provides Galerkin-optimal approximations. Because CoLoRA approximates solution fields locally in time, the rank of the weights can be kept small, which means that only a few training trajectories are required offline so that CoLoRA is well suited for data-scarce regimes. 
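A minimal CoLoRA-style layer might look as follows (a sketch of the idea, not the authors' code): the pre-trained weight and low-rank factors are frozen, and only a small vector of rank-wise coefficients is adapted continuously in time.

```python
import torch

class CoLoRALinear(torch.nn.Module):
    def __init__(self, d_in, d_out, rank=4):
        super().__init__()
        self.W0 = torch.nn.Parameter(torch.randn(d_out, d_in) / d_in ** 0.5)
        self.A = torch.nn.Parameter(torch.randn(d_out, rank) / d_out ** 0.5)
        self.B = torch.nn.Parameter(torch.randn(rank, d_in) / d_in ** 0.5)
        self.alpha = torch.nn.Parameter(torch.zeros(rank))  # adapted over time t

    def forward(self, x):
        W = self.W0 + self.A @ torch.diag(self.alpha) @ self.B  # low-rank update
        return x @ W.T

layer = CoLoRALinear(8, 8)
for p in (layer.W0, layer.A, layer.B):  # online phase: adapt only alpha
    p.requires_grad_(False)
```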
Predictions with CoLoRA are orders of magnitude faster than with classical methods and their accuracy and parameter efficiency are higher than those of other neural network approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jules Berman;Benjamin Peherstorfer", "authorids": "~Jules_Berman1;~Benjamin_Peherstorfer2", "gender": "M;", "homepage": ";https://cims.nyu.edu/~pehersto/", "dblp": "308/1410;96/8557", "google_scholar": "g44S1mAAAAAJ;C81WhlkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jules_Berman1;~Benjamin_Peherstorfer2", "aff": "New York University;New York University", "aff_domain": "nyu.edu;nyu.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nberman2024colora,\ntitle={CoLo{RA}: Continuous low-rank adaptation for reduced implicit neural modeling of parameterized partial differential equations},\nauthor={Jules Berman and Benjamin Peherstorfer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iHSgfGob9j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 505920, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13065751062606327201&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "nyu.edu;nyu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Sampling-based Multi-dimensional Recalibration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33363", "id": "iJWeK2snMH", "proceeding": "https://proceedings.mlr.press/v235/chung24a.html", "pdf": "https://openreview.net/pdf?id=iJWeK2snMH", "openreview": "https://openreview.net/forum?id=iJWeK2snMH", "author_site": "Youngseog Chung, Ian Char, Jeff Schneider", "tldr": "", "abstract": "Calibration of probabilistic forecasts in the regression setting has been widely studied in the single dimensional case, where the output variables are assumed to be univariate. In many problem settings, however, the output variables are multi-dimensional, and in the presence of dependence across the output dimensions, measuring calibration and performing recalibration for each dimension separately can be both misleading and detrimental. In this work, we focus on representing predictive uncertainties via samples, and propose a recalibration method which accounts for the joint distribution across output dimensions to produce calibrated samples. Based on the concept of highest density regions (HDR), we define the notion of HDR calibration, and show that our recalibration method produces samples which are HDR calibrated. 
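A sample-based check of HDR calibration can be sketched as follows (our illustration under a kernel-density assumption, not the authors' implementation): estimate how deep each observed outcome sits in its predictive sample cloud, then test whether these levels are uniform.

```python
import numpy as np

rng = np.random.default_rng(0)

def kde_density(points, queries, bw=0.3):
    # simple Gaussian-kernel density estimate (assumed bandwidth)
    d2 = ((queries[:, None, :] - points[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * bw ** 2)).mean(axis=1)

levels = []
for _ in range(200):
    samples = rng.normal(size=(500, 2))   # predictive samples, 2-D output
    y = rng.normal(size=(1, 2))           # observed outcome
    dens = kde_density(samples, samples)
    dens_y = kde_density(samples, y)[0]
    levels.append((dens > dens_y).mean()) # mass with density above that of y

# HDR calibrated => the levels are approximately Uniform[0, 1]
print(np.quantile(levels, [0.25, 0.5, 0.75]))
```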
We demonstrate the performance of our method and the quality of the recalibrated samples on a suite of benchmark datasets in multi-dimensional regression, a real-world dataset in modeling plasma dynamics during nuclear fusion reactions, and on a decision-making application in forecasting demand.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youngseog Chung;Ian Char;Jeff Schneider", "authorids": "~Youngseog_Chung1;~Ian_Char1;~Jeff_Schneider1", "gender": "M;M;", "homepage": "https://youngseogchung.github.io/;http://ianchar.com;https://www.cs.cmu.edu/~schneide", "dblp": "255/7039;157/7519;38/247", "google_scholar": "mZNSMjUAAAAJ;3SDKldkAAAAJ;3bSbb20AAAAJ", "orcid": ";;0000-0002-5080-9073", "linkedin": ";;jeff-schneider-1593b322/", "or_profile": "~Youngseog_Chung1;~Ian_Char1;~Jeff_Schneider1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;cs.cmu.edu", "position": "PhD student;PhD student;Researcher", "bibtex": "@inproceedings{\nchung2024samplingbased,\ntitle={Sampling-based Multi-dimensional Recalibration},\nauthor={Youngseog Chung and Ian Char and Jeff Schneider},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iJWeK2snMH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 949453, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13173353313395737196&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "andrew.cmu.edu;cmu.edu;cs.cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning from Students: Applying t-Distributions to Explore Accurate and Efficient Formats for LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33362", "id": "iJlPJsTw2B", "proceeding": "https://proceedings.mlr.press/v235/dotzel24a.html", "pdf": "https://openreview.net/pdf?id=iJlPJsTw2B", "openreview": "https://openreview.net/forum?id=iJlPJsTw2B", "author_site": "Jordan Dotzel, Yuzong Chen, Bahaa Kotb, Sushma Prasad, Gang Wu, Sheng Li, Mohamed Abdelfattah, Zhiru Zhang", "tldr": "", "abstract": "The increasing size of large language models (LLMs) traditionally requires low-precision integer formats to meet strict latency and power demands. Yet recently, alternative formats such as Normal Float (NF4) have increased model accuracy at the cost of increased chip area. In this work, we first conduct a large-scale analysis of LLM weights and activations across 30 networks and conclude that most distributions follow a Student's t-distribution. We then derive a new theoretically optimal format, Student Float (SF4), that improves over NF4 across modern LLMs, for example increasing the average accuracy on LLaMA2-7B by 0.76% across tasks. Using this format as a high-accuracy reference, we then propose augmenting E2M1 with two variants of *supernormal* support for higher model accuracy. Finally, we explore the quality and efficiency frontier across 11 datatypes by evaluating their model accuracy and hardware complexity. 
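The construction can be pictured as follows (our sketch, not the exact SF4 recipe): place the sixteen 4-bit code levels at quantiles of a Student's t-distribution, in the same way NF4 places them at quantiles of a normal; the degrees of freedom below are an assumed value.

```python
import numpy as np
from scipy.stats import t as student_t

nu = 4.0                                  # assumed degrees of freedom
probs = (np.arange(16) + 0.5) / 16        # evenly spaced quantile targets
levels = student_t.ppf(probs, df=nu)
levels = levels / np.abs(levels).max()    # normalize levels to [-1, 1]

def quantize(w):
    idx = np.abs(w[..., None] - levels).argmin(-1)  # nearest code level
    return levels[idx]

w = 0.1 * np.random.standard_t(nu, size=1024)       # t-distributed "weights"
scale = np.abs(w).max()
w_hat = scale * quantize(w / scale)
print(np.mean((w - w_hat) ** 2))                    # quantization error
```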
We discover a Pareto curve composed of INT4, E2M1, and E2M1 with supernormal support, which offers a continuous tradeoff between model accuracy and chip area. For example, E2M1 with supernormal support increases the accuracy of Phi-2 by up to 2.19% with 1.22% area overhead, enabling more LLM-based applications to be run at four bits. The supporting code is hosted at https://github.com/cornell-zhang/llm-datatypes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jordan Dotzel;Yuzong Chen;Bahaa Kotb;Sushma Prasad;Gang Wu;Sheng Li;Mohamed S Abdelfattah;Zhiru Zhang", "authorids": "~Jordan_Dotzel1;yc2367@cornell.edu;~Bahaa_Kotb1;sushmahp@google.com;~Gang_Wu7;~Sheng_Li18;~Mohamed_S_Abdelfattah1;~Zhiru_Zhang2", "gender": "M;;M;;;;M;M", "homepage": ";;;;;https://research.google/people/107913/;https://mohsaied.github.io/;https://www.csl.cornell.edu/~zhiruz", "dblp": "230/4364;;;;;;124/7095;81/4227", "google_scholar": "5H-MYAoAAAAJ;;;;;;https://scholar.google.ca/citations?user=q4wBpWAAAAAJ;https://scholar.google.com.tw/citations?user=x05pUHsAAAAJ", "orcid": ";;;;;;;", "linkedin": "dotzel/;;bahaa-kotb/;;;;mabdelfattah/;", "or_profile": "~Jordan_Dotzel1;yc2367@cornell.edu;~Bahaa_Kotb1;sushmahp@google.com;~Gang_Wu7;~Sheng_Li18;~Mohamed_S_Abdelfattah1;~Zhiru_Zhang2", "aff": "Cornell University;;Cornell University;;;Google;Cornell University;Cornell University", "aff_domain": "cornell.edu;;cornell.edu;;;google.com;cornell.edu;cornell.edu", "position": "PhD student;;Undergrad student;;;Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\ndotzel2024learning,\ntitle={Learning from Students: Applying t-Distributions to Explore Accurate and Efficient Formats for {LLM}s},\nauthor={Jordan Dotzel and Yuzong Chen and Bahaa Kotb and Sushma Prasad and Gang Wu and Sheng Li and Mohamed S Abdelfattah and Zhiru Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iJlPJsTw2B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2781877, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9268770398526483410&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "cornell.edu;;cornell.edu;;;google.com;cornell.edu;cornell.edu", "author_num": 8, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Cornell University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.cornell.edu;https://www.google.com", "aff_unique_abbr": "Cornell;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Benefits of Reusing Batches for Gradient Descent in Two-Layer Networks: Breaking the Curse of Information and Leap Exponents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33361", "id": "iKkFruh4d5", "proceeding": "https://proceedings.mlr.press/v235/dandi24a.html", "pdf": "https://openreview.net/pdf?id=iKkFruh4d5", "openreview": "https://openreview.net/forum?id=iKkFruh4d5", "author_site": "Yatin Dandi, Emanuele Troiani, Luca Arnaboldi, Luca Pesce, Lenka Zdeborova, FLORENT KRZAKALA", "tldr": "", "abstract": "We investigate the training dynamics of two-layer neural networks when learning multi-index target functions. 
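A toy version of the comparison (ours, not the paper's DMFT analysis): the same two-layer network trained with one pass per batch versus several passes on each batch, on a single-index target whose higher information exponent tends to make single-pass learning slow.

```python
import torch

def train(passes_per_batch, d=20, width=64, steps=50, seed=0):
    torch.manual_seed(seed)
    w_star = torch.randn(d); w_star /= w_star.norm()
    net = torch.nn.Sequential(torch.nn.Linear(d, width), torch.nn.ReLU(),
                              torch.nn.Linear(width, 1))
    opt = torch.optim.SGD(net.parameters(), lr=0.1)
    for _ in range(steps):
        X = torch.randn(256, d)
        z = X @ w_star
        y = (z ** 3 - 3 * z).unsqueeze(-1)   # third Hermite polynomial target
        for _ in range(passes_per_batch):    # re-use the same batch
            loss = ((net(X) - y) ** 2).mean()
            opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

print(train(passes_per_batch=1), train(passes_per_batch=4))
```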
We focus on multi-pass gradient descent (GD) that reuses the batches multiple times and show that it significantly changes the conclusion about which functions are learnable compared to single-pass gradient descent. In particular, multi-pass GD with finite stepsize is found to overcome the limitations of gradient flow and single-pass GD given by the information exponent (Ben Arous et al., 2021) and leap exponent (Abbe et al., 2023) of the target function. We show that upon re-using batches, the network achieves in just two time steps an overlap with the target subspace even for functions not satisfying the staircase property (Abbe et al., 2021). We characterize the (broad) class of functions efficiently learned in finite time. The proof of our results is based on the analysis of the Dynamical Mean-Field Theory (DMFT). We further provide a closed-form description of the dynamical process of the low-dimensional projections of the weights, and numerical experiments illustrating the theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yatin Dandi;Emanuele Troiani;Luca Arnaboldi;Luca Pesce;Lenka Zdeborova;Florent Krzakala", "authorids": "~Yatin_Dandi1;~Emanuele_Troiani2;~Luca_Arnaboldi2;~Luca_Pesce1;~Lenka_Zdeborova1;~Florent_Krzakala1", "gender": "M;M;M;M;F;", "homepage": "https://yatindandi.github.io/;;https://arnaboldi.lu/;https://lucpoisson.github.io;http://artax.karlin.mff.cuni.cz/~zdebl9am/;http://Krzakala.org", "dblp": "255/6032;267/5270;205/8141-2;321/1650;27/6064.html;25/1282", "google_scholar": "UiEzYkMAAAAJ;https://scholar.google.fr/citations?user=Gh0snLcAAAAJ;A-4QdoQAAAAJ;praGYvoAAAAJ;https://scholar.google.fr/citations?user=gkCjy_UAAAAJ;https://scholar.google.fr/citations?user=3jDeUlMAAAAJ", "orcid": ";0000-0003-0968-7585;0009-0001-9739-8849;;;0000-0003-2313-2578", "linkedin": ";;;;;", "or_profile": "~Yatin_Dandi1;~Emanuele_Troiani2;~Luca_Arnaboldi2;~Luca_Pesce1;~Lenka_Zdeborova1;~Florent_Krzakala1", "aff": "EPFL - EPF Lausanne;School of Computer and Communication Sciences, EPFL - EPF Lausanne;EPFL - EPF Lausanne;EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;Swiss Federal Institute of Technology Lausanne", "aff_domain": "epfl.ch;ic.epfl.ch;epfl.ch;epfl.ch;epfl.ch;epfl.ch", "position": "PhD student;PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ndandi2024the,\ntitle={The Benefits of Reusing Batches for Gradient Descent in Two-Layer Networks: Breaking the Curse of Information and Leap Exponents},\nauthor={Yatin Dandi and Emanuele Troiani and Luca Arnaboldi and Luca Pesce and Lenka Zdeborova and Florent Krzakala},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iKkFruh4d5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 921460, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7496602946519644196&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "epfl.ch;ic.epfl.ch;epfl.ch;epfl.ch;epfl.ch;epfl.ch", "author_num": 6, "aff_unique_index": "0;0;0;0;1;1", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": 
"Switzerland" }, { "title": "Debating with More Persuasive LLMs Leads to More Truthful Answers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33360", "id": "iLCZtl7FTa", "proceeding": "https://proceedings.mlr.press/v235/khan24a.html", "pdf": "https://openreview.net/pdf?id=iLCZtl7FTa", "openreview": "https://openreview.net/forum?id=iLCZtl7FTa", "author_site": "Akbir Khan, John Hughes, Dan Valentine, Laura Ruis, Kshitij Sachan, Ansh Radhakrishnan, Edward Grefenstette, Samuel Bowman, Tim Rockt\u00e4schel, Ethan Perez", "tldr": "", "abstract": "Common methods for aligning large language models (LLMs) with desired behaviour heavily rely on human-labelled data. However, as models grow increasingly sophisticated, they will surpass human expertise, and the role of human evaluation will evolve into non-experts overseeing experts. In anticipation of this, we ask: can weaker models assess the correctness of stronger models? We investigate this question in an analogous setting, where stronger models (experts) possess the necessary information to answer questions and weaker models (non-experts) lack this information. The method we evaluate is *debate*, where two LLM experts each argue for a different answer, and a non-expert selects the answer. We find that debate consistently helps both non-expert models and humans answer questions, achieving 76% and 88% accuracy respectively (naive baselines obtain 48% and 60%). Furthermore, optimising expert debaters for persuasiveness in an unsupervised manner improves non-expert ability to identify the truth in debates. Our results provide encouraging empirical evidence for the viability of aligning models with debate in the absence of ground truth.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Akbir Khan;John Hughes;Dan Valentine;Laura Ruis;Kshitij Sachan;Ansh Radhakrishnan;Edward Grefenstette;Samuel R. Bowman;Tim Rockt\u00e4schel;Ethan Perez", "authorids": "~Akbir_Khan1;~John_Hughes4;~Dan_Valentine1;~Laura_Ruis1;~Kshitij_Sachan1;~Ansh_Radhakrishnan1;~Edward_Grefenstette1;~Samuel_R._Bowman1;~Tim_Rockt\u00e4schel1;~Ethan_Perez1", "gender": "M;M;M;;M;;M;;;M", "homepage": "https://akbir.dev;https://www.jplhughes.com/;https://github.com/valedan;;http://kshitijsachan.com;;http://egrefen.com/;;;http://ethanperez.net", "dblp": ";;;;;;http://dblp.uni-trier.de/pers/hd/g/Grefenstette:Edward;;;192/1812", "google_scholar": "https://scholar.google.com/citations?hl=en;XswwnxUAAAAJ;;;;;https://scholar.google.co.uk/citations?user=ezllEwMAAAAJ;;;https://scholar.google.ca/citations?user=za0-taQAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";jplhughes/;;;http://linkedin.com/in/kshitij-sachan-70bb1615b/;ansh-radhakrishnan-291753184;;;;https://linkedin.com/in/ethanjperez", "or_profile": "~Akbir_Khan1;~John_Hughes4;~Dan_Valentine1;~Laura_Ruis1;~Kshitij_Sachan1;~Ansh_Radhakrishnan1;~Edward_Grefenstette1;~Samuel_R._Bowman1;~Tim_Rockt\u00e4schel1;~Ethan_Perez1", "aff": ";Speechmatics;Seri Mats;;;;Google DeepMind;;;New York University", "aff_domain": ";speechmatics.com;serimats.org;;;;deepmind.com;;;nyu.edu", "position": ";Researcher;Researcher;;;;Principal Researcher;;;Researcher", "bibtex": "@inproceedings{\nkhan2024debating,\ntitle={Debating with More Persuasive {LLM}s Leads to More Truthful Answers},\nauthor={Akbir Khan and John Hughes and Dan Valentine and Laura Ruis and Kshitij Sachan and Ansh Radhakrishnan and Edward Grefenstette and Samuel R. 
Bowman and Tim Rockt{\\\"a}schel and Ethan Perez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iLCZtl7FTa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4566274, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7666605230711268626&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";speechmatics.com;serimats.org;;;;deepmind.com;;;nyu.edu", "author_num": 10, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Speechmatics;Seri Mats;Google;New York University", "aff_unique_dep": ";;Google DeepMind;", "aff_unique_url": "https://www.speechmatics.com;;https://deepmind.com;https://www.nyu.edu", "aff_unique_abbr": ";;DeepMind;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2", "aff_country_unique": "United Kingdom;;United States" }, { "title": "CogDPM: Diffusion Probabilistic Models via Cognitive Predictive Coding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33359", "id": "iLSgF7jMtI", "proceeding": "https://proceedings.mlr.press/v235/chen24o.html", "pdf": "https://openreview.net/pdf?id=iLSgF7jMtI", "openreview": "https://openreview.net/forum?id=iLSgF7jMtI", "author_site": "Kaiyuan Chen, Xingzhuo Guo, Yu Zhang, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Predictive Coding (PC) is a theoretical framework in cognitive science suggesting that the human brain processes cognition through spatiotemporal prediction of the visual world. Existing studies have developed spatiotemporal prediction neural networks based on PC theory, emulating its two core mechanisms: correcting predictions from residuals and hierarchical learning. However, these models do not show improved prediction skill on real-world forecasting tasks, and they ignore the Precision Weighting mechanism of PC theory. Precision weighting posits that the brain allocates more attention to signals with lower precision, contributing to the cognitive ability of human brains. This work introduces the Cognitive Diffusion Probabilistic Models (CogDPM), which demonstrate the connection between diffusion probabilistic models and PC theory. CogDPM features a precision estimation method based on the hierarchical sampling capabilities of diffusion models and allocates guidance with precision weights estimated from the inherent properties of diffusion models. We experimentally show that the precision weights are an estimator of the model's predictability on rigid-body and fluid-motion datasets. We also apply CogDPM to real-world prediction tasks using the U.K. precipitation and ERA surface wind datasets.
Our results demonstrate that CogDPM outperforms both existing domain-specific operational models and general deep prediction models by providing more skillful forecasts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaiyuan Chen;Xingzhuo Guo;Yu Zhang;Jianmin Wang;Mingsheng Long", "authorids": "~Kaiyuan_Chen3;~Xingzhuo_Guo1;~Yu_Zhang76;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;M;M;M;M", "homepage": ";;;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": ";;;06/3456-1.html;74/9023", "google_scholar": ";Cbinj9QAAAAJ;;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;_MjXpXkAAAAJ", "orcid": "0009-0009-2933-0621;;;0000-0001-6841-7943;0000-0002-5412-9120", "linkedin": ";;%E6%98%B1-%E5%BC%A0-1229122b1/;;", "or_profile": "~Kaiyuan_Chen3;~Xingzhuo_Guo1;~Yu_Zhang76;~Jianmin_Wang1;~Mingsheng_Long2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "cs.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;PhD student;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2024cogdpm,\ntitle={Cog{DPM}: Diffusion Probabilistic Models via Cognitive Predictive Coding},\nauthor={Kaiyuan Chen and Xingzhuo Guo and Yu Zhang and Jianmin Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iLSgF7jMtI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5168199, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16526954426189757337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "DeepPolar: Inventing Nonlinear Large-Kernel Polar Codes via Deep Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33358", "id": "iLfk2CwEHA", "proceeding": "https://proceedings.mlr.press/v235/hebbar24a.html", "pdf": "https://openreview.net/pdf?id=iLfk2CwEHA", "openreview": "https://openreview.net/forum?id=iLfk2CwEHA", "author_site": "Ashwin Hebbar, Sravan Kumar Ankireddy, Hyeji Kim, Sewoong Oh, Pramod Viswanath", "tldr": "", "abstract": "Progress in designing channel codes has been driven by human ingenuity and, fittingly, has been sporadic. Polar codes, developed on the foundation of Arikan\u2019s polarization kernel, represent the latest breakthrough in coding theory and have emerged as the state-of-the-art error-correction code for short-to-medium block length regimes. In an effort to automate the invention of good channel codes, especially in this regime, we explore a novel, non-linear generalization of Polar codes, which we call DeepPolar codes. DeepPolar codes extend the conventional Polar coding framework by utilizing a larger kernel size and parameterizing these kernels and matched decoders through neural networks.
Our results demonstrate that these data-driven codes effectively leverage the benefits of a larger kernel size, resulting in enhanced reliability when compared to both existing neural codes and conventional Polar codes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "S Ashwin Hebbar;Sravan Kumar Ankireddy;Hyeji Kim;Sewoong Oh;Pramod Viswanath", "authorids": "~S_Ashwin_Hebbar1;~Sravan_Kumar_Ankireddy1;~Hyeji_Kim1;~Sewoong_Oh3;~Pramod_Viswanath2", "gender": "M;M;;;M", "homepage": "https://www.ashwinhebbar.com;https://sravan-ankireddy.github.io;;;http://pramodv.ece.illinois.edu", "dblp": ";321/0836;;;", "google_scholar": "EnNwKLgAAAAJ;j34sU94AAAAJ;;;lPycXNcAAAAJ", "orcid": "0000-0002-9339-850X;;;;", "linkedin": ";;;;", "or_profile": "~S_Ashwin_Hebbar1;~Sravan_Kumar_Ankireddy1;~Hyeji_Kim1;~Sewoong_Oh3;~Pramod_Viswanath2", "aff": "Princeton University;University of Texas at Austin;;;University of Illinois, Urbana Champaign", "aff_domain": "princeton.edu;utexas.edu;;;illinois.edu", "position": "PhD student;PhD student;;;Full Professor", "bibtex": "@inproceedings{\nhebbar2024deeppolar,\ntitle={DeepPolar: Inventing Nonlinear Large-Kernel Polar Codes via Deep Learning},\nauthor={S Ashwin Hebbar and Sravan Kumar Ankireddy and Hyeji Kim and Sewoong Oh and Pramod Viswanath},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iLfk2CwEHA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1652055, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12720196888596786875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "princeton.edu;utexas.edu;;;illinois.edu", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Princeton University;University of Texas at Austin;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.utexas.edu;https://illinois.edu", "aff_unique_abbr": "Princeton;UT Austin;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Austin;Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Block Acceleration Without Momentum: On Optimal Stepsizes of Block Gradient Descent for Least-Squares", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33357", "id": "iLyUEPZ0fR", "proceeding": "https://proceedings.mlr.press/v235/peng24f.html", "pdf": "https://openreview.net/pdf?id=iLyUEPZ0fR", "openreview": "https://openreview.net/forum?id=iLyUEPZ0fR", "author_site": "Liangzu Peng, Wotao Yin", "tldr": "", "abstract": "Block coordinate descent is a powerful algorithmic template suitable for big data optimization. This template admits a lot of variants including block gradient descent (BGD), which performs gradient descent on a selected block of variables, while keeping other variables fixed. For a very long time, the stepsize for each block has tacitly been set to one divided by the block-wise Lipschitz smoothness constant, imitating the vanilla stepsize rule for gradient descent (GD). However, such a choice for BGD has not yet been able to theoretically justify its empirical superiority over GD, as existing convergence rates for BGD have worse constants than GD in the deterministic cases. 
To discover such theoretical justification, we set up a simple environment where we consider BGD applied to least-squares with two blocks of variables. Assuming the data matrix corresponding to each block is orthogonal, we find optimal stepsizes of BGD in closed form, which provably lead to asymptotic convergence rates twice as fast as GD with Polyak's momentum; this means, under that orthogonality assumption, one can accelerate BGD just by tuning stepsizes, without adding any momentum. An application that satisfies this assumption is *generalized alternating projection* between two subspaces, and applying our stepsizes to it improves the prior convergence rate that was once claimed, slightly inaccurately, to be optimal. The main proof idea is to minimize, in stepsize variables, the spectral radius of a matrix that controls convergence rates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liangzu Peng;Wotao Yin", "authorids": "~Liangzu_Peng2;~Wotao_Yin1", "gender": "M;M", "homepage": "https://liangzu.github.io/;http://wotaoyin.com", "dblp": "228/7974;76/2265", "google_scholar": "A39MlcYAAAAJ;kpQGGFUAAAAJ", "orcid": "0000-0003-0708-7543;0000-0001-6697-9731", "linkedin": ";", "or_profile": "~Liangzu_Peng2;~Wotao_Yin1", "aff": "University of Pennsylvania;Alibaba Group US", "aff_domain": "upenn.edu;alibaba-inc.com", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\npeng2024block,\ntitle={Block Acceleration Without Momentum: On Optimal Stepsizes of Block Gradient Descent for Least-Squares},\nauthor={Liangzu Peng and Wotao Yin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iLyUEPZ0fR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 635242, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16130815730858761459&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "upenn.edu;alibaba-inc.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Pennsylvania;Alibaba Group", "aff_unique_dep": ";", "aff_unique_url": "https://www.upenn.edu;https://www.alibaba.com", "aff_unique_abbr": "UPenn;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Adaptive Hierarchical Certification for Segmentation using Randomized Smoothing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33356", "id": "iOEReiiTit", "proceeding": "https://proceedings.mlr.press/v235/anani24a.html", "pdf": "https://openreview.net/pdf?id=iOEReiiTit", "openreview": "https://openreview.net/forum?id=iOEReiiTit", "author_site": "Alaa Anani, Tobias Lorenz, Bernt Schiele, Mario Fritz", "tldr": "", "abstract": "Certification for machine learning means proving that no adversarial sample can evade a model within a range under certain conditions, a necessity for safety-critical domains. Common certification methods for segmentation use a flat set of fine-grained classes, leading to high abstain rates due to model uncertainty across many classes.
We propose a novel, more practical setting, which certifies pixels within a multi-level hierarchy, and adaptively relaxes the certification to a coarser level for unstable components that classic methods would abstain from, effectively lowering the abstain rate whilst providing more certified, semantically meaningful information. We mathematically formulate the problem setup, introduce an adaptive hierarchical certification algorithm and prove the correctness of its guarantees. Since certified accuracy does not take the loss of information into account for coarser classes, we introduce the Certified Information Gain ($\\mathrm{CIG}$) metric, which is proportional to the class granularity level. Our extensive experiments on the datasets Cityscapes, PASCAL-Context, ACDC and COCO-Stuff demonstrate that our adaptive algorithm achieves a higher $\\mathrm{CIG}$ and lower abstain rate compared to the current state-of-the-art certification method. Our code can be found here: [https://github.com/AlaaAnani/adaptive-certify](https://github.com/AlaaAnani/adaptive-certify).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alaa Anani;Tobias Lorenz;Bernt Schiele;Mario Fritz", "authorids": "~Alaa_Anani1;~Tobias_Lorenz1;~Bernt_Schiele1;~Mario_Fritz1", "gender": "F;M;M;M", "homepage": "https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/people/alaa-anani;https://www.t-lorenz.com/;http://www.mpi-inf.mpg.de/~schiele;https://cispa.saarland/group/fritz/", "dblp": "368/7744;25/6006-2;s/BerntSchiele;", "google_scholar": "eJgMcksAAAAJ;gf-aMd0AAAAJ;https://scholar.google.de/citations?user=z76PBfYAAAAJ;https://scholar.google.de/citations?user=4V1nNm4AAAAJ", "orcid": ";0000-0003-4369-2644;0000-0001-9683-5237;", "linkedin": "aaanani/;;;", "or_profile": "~Alaa_Anani1;~Tobias_Lorenz1;~Bernt_Schiele1;~Mario_Fritz1", "aff": "Saarland Informatics Campus, Max-Planck Institute;CISPA Helmholtz Center for Information Security;Max Planck Institute for Informatics, Saarland Informatics Campus;Saarland University", "aff_domain": "mpi-inf.mpg.de;cispa.de;mpi-inf.mpg.de;uni-saarland.de", "position": "MS student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nanani2024adaptive,\ntitle={Adaptive Hierarchical Certification for Segmentation using Randomized Smoothing},\nauthor={Alaa Anani and Tobias Lorenz and Bernt Schiele and Mario Fritz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iOEReiiTit}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10134550, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nQoGYPhrc70J:scholar.google.com/&scioq=Adaptive+Hierarchical+Certification+for+Segmentation+using+Randomized+Smoothing&hl=en&as_sdt=0,5", "gs_version_total": 10, "email": "mpi-inf.mpg.de;cispa.de;mpi-inf.mpg.de;uni-saarland.de", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Max-Planck Institute;CISPA Helmholtz Center for Information Security;Max Planck Institute for Informatics;Saarland University", "aff_unique_dep": "Informatics;;;", "aff_unique_url": "https://www.mpi-sws.org;https://www.cispa.de/;https://mpi-inf.mpg.de;https://www.uni-saarland.de", "aff_unique_abbr": "MPI-SWS;CISPA;MPII;UdS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Saarland;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Triplet
Interaction Improves Graph Transformers: Accurate Molecular Graph Learning with Triplet Graph Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33355", "id": "iPFuWc1TV2", "proceeding": "https://proceedings.mlr.press/v235/hussain24a.html", "pdf": "https://openreview.net/pdf?id=iPFuWc1TV2", "openreview": "https://openreview.net/forum?id=iPFuWc1TV2", "author_site": "Md Shamim Hussain, Mohammed Zaki, Dharmashankar Subramanian", "tldr": "", "abstract": "Graph transformers typically lack third-order interactions, limiting their geometric understanding which is crucial for tasks like molecular geometry prediction. We propose the Triplet Graph Transformer (TGT) that enables direct communication between pairs within a 3-tuple of nodes via novel triplet attention and aggregation mechanisms. TGT is applied to molecular property prediction by first predicting interatomic distances from 2D graphs and then using these distances for downstream tasks. A novel three-stage training procedure and stochastic inference further improve training efficiency and model performance. Our model achieves new state-of-the-art (SOTA) results on open challenge benchmarks PCQM4Mv2 and OC20 IS2RE. We also obtain SOTA results on QM9, MOLPCBA, and LIT-PCBA molecular property prediction benchmarks via transfer learning. We also demonstrate the generality of TGT with SOTA results on the traveling salesman problem (TSP).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Md Shamim Hussain;Mohammed J Zaki;Dharmashankar Subramanian", "authorids": "~Md_Shamim_Hussain1;~Mohammed_J_Zaki1;~Dharmashankar_Subramanian1", "gender": "M;M;M", "homepage": "https://shamim-hussain.github.io;http://www.cs.rpi.edu/~zaki;http://researcher.watson.ibm.com/researcher/view.php?person=us-dharmash", "dblp": "232/1798;z/MohammedJaveedZaki.html;", "google_scholar": "hc97XqQAAAAJ;https://scholar.google.com/scholar?q=zaki,+mj;j54RzcEAAAAJ", "orcid": "0000-0002-0832-913X;0000-0003-4711-0234;", "linkedin": "md-shamim-hussain-344611b3/;mohammed-j-zaki/;", "or_profile": "~Md_Shamim_Hussain1;~Mohammed_J_Zaki1;~Dharmashankar_Subramanian1", "aff": "Rensselaer Polytechnic Institute;Rensselaer Polytechnic Institute;International Business Machines", "aff_domain": "rpi.edu;rpi.edu;ibm.com", "position": "PhD student;Professor;Principal Researcher", "bibtex": "@inproceedings{\nhussain2024triplet,\ntitle={Triplet Interaction Improves Graph Transformers: Accurate Molecular Graph Learning with Triplet Graph Transformers},\nauthor={Md Shamim Hussain and Mohammed J Zaki and Dharmashankar Subramanian},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iPFuWc1TV2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2581667, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15325514114557429852&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "rpi.edu;rpi.edu;ibm.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Rensselaer Polytechnic Institute;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.rpi.edu;https://www.ibm.com", "aff_unique_abbr": "RPI;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Beyond the Calibration Point: Mechanism 
Comparison in Differential Privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33354", "id": "iQTElQbAqo", "proceeding": "https://proceedings.mlr.press/v235/kaissis24a.html", "pdf": "https://openreview.net/pdf?id=iQTElQbAqo", "openreview": "https://openreview.net/forum?id=iQTElQbAqo", "author_site": "Georgios Kaissis, Stefan Kolek, Borja de Balle Pigem, Jamie Hayes, Daniel Rueckert", "tldr": "", "abstract": "In differentially private (DP) machine learning, the privacy guarantees of DP mechanisms are often reported and compared on the basis of a single $(\\varepsilon, \\delta)$-pair. This practice overlooks that DP guarantees can vary substantially even between mechanisms sharing a given $(\\varepsilon, \\delta)$, and potentially introduces privacy vulnerabilities which can remain undetected. This motivates the need for robust, rigorous methods for comparing DP guarantees in such cases. Here, we introduce the $\\Delta$-divergence between mechanisms which quantifies the worst-case excess privacy vulnerability of choosing one mechanism over another in terms of $(\\varepsilon, \\delta)$, $f$-DP and in terms of a newly presented Bayesian interpretation. Moreover, as a generalisation of the Blackwell theorem, it is endowed with strong decision-theoretic foundations. Through application examples, we show that our techniques can facilitate informed decision-making and reveal gaps in the current understanding of privacy risks, as current practices in DP-SGD often result in choosing mechanisms with high excess privacy vulnerabilities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Georgios Kaissis;Stefan Kolek;Borja Balle;Jamie Hayes;Daniel Rueckert", "authorids": "~Georgios_Kaissis1;~Stefan_Kolek1;~Borja_Balle2;~Jamie_Hayes1;~Daniel_Rueckert2", "gender": ";M;;;M", "homepage": ";https://skmda37.github.io/;https://borjaballe.github.io/;;https://aim-lab.io/author/daniel-ruckert/", "dblp": ";304/2478;https://dblp.uni-trier.de/pers/b/Balle:Borja.html;;69/2478", "google_scholar": ";7umQNF8AAAAJ;;;https://scholar.google.co.uk/citations?user=H0O0WnQAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Georgios_Kaissis1;~Stefan_Kolek1;~Borja_Balle2;~Jamie_Hayes1;~Daniel_Rueckert2", "aff": ";Institut f\u00fcr Mathematik;Google DeepMind;University College London;Imperial College London", "aff_domain": ";lmu.de;google.com;ucl.ac.uk;imperial.ac.uk", "position": ";PhD student;Research scientist;PhD student;Full Professor", "bibtex": "@inproceedings{\nkaissis2024beyond,\ntitle={Beyond the Calibration Point: Mechanism Comparison in Differential Privacy},\nauthor={Georgios Kaissis and Stefan Kolek and Borja Balle and Jamie Hayes and Daniel Rueckert},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iQTElQbAqo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1095648, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14494117780401317348&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": ";lmu.de;google.com;ucl.ac.uk;imperial.ac.uk", "author_num": 5, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Institut f\u00fcr Mathematik;Google;University College London;Imperial College London", "aff_unique_dep": "Mathematics Department;Google DeepMind;;", "aff_unique_url": ";https://deepmind.com;https://www.ucl.ac.uk;https://www.imperial.ac.uk", 
"aff_unique_abbr": ";DeepMind;UCL;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Learning Decision Policies with Instrumental Variables through Double Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33353", "id": "iRcmqXZjeK", "proceeding": "https://proceedings.mlr.press/v235/shao24d.html", "pdf": "https://openreview.net/pdf?id=iRcmqXZjeK", "openreview": "https://openreview.net/forum?id=iRcmqXZjeK", "author_site": "Bill Daqian Shao, Ashkan Soleymani, Francesco Quinzan, Marta Kwiatkowska", "tldr": "", "abstract": "A common issue in learning decision-making policies in data-rich settings is spurious correlations in the offline dataset, which can be caused by hidden confounders. Instrumental variable (IV) regression, which utilises a key uncounfounded variable called the instrument, is a standard technique for learning causal relationships between confounded action, outcome and context variables. Most recent IV regression algorithms use a two-stage approach, where a deep neural network (DNN) estimator learnt in the first stage is directly plugged into the second stage, in which another DNN is used to estimate the causal effect. Naively plugging the estimator can cause heavy bias in the second stage, especially when regularisation bias is present in the first stage estimator. We propose DML-IV, a non-linear IV regression method that reduces the bias in two-stage IV regressions and effectively learns high-performing policies. We derive a novel learning objective to reduce bias and design the DML-IV algorithm following the double/debiased machine learning (DML) framework. The learnt DML-IV estimator has strong convergence rate and $O(N^{-1/2})$ suboptimality guarantees that match those when the dataset is unconfounded. 
DML-IV outperforms state-of-the-art IV regression methods on IV regression benchmarks and learns high-performing policies in the presence of instruments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daqian Shao;Ashkan Soleymani;Francesco Quinzan;Marta Kwiatkowska", "authorids": "~Daqian_Shao1;~Ashkan_Soleymani1;~Francesco_Quinzan1;~Marta_Kwiatkowska1", "gender": "M;M;;F", "homepage": ";https://ashkansoleymani.lids.mit.edu/;;http://www.cs.ox.ac.uk/people/marta.kwiatkowska/", "dblp": "346/0819.html;270/3353.html;;k/MartaZKwiatkowska", "google_scholar": "2IPWzigAAAAJ;omHTV3MAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;0000-0001-9022-7599", "linkedin": "bill-shao-daqian;;;", "or_profile": "~Daqian_Shao1;~Ashkan_Soleymani1;~Francesco_Quinzan1;~Marta_Kwiatkowska1", "aff": "Department of Computer Science;Massachusetts Institute of Technology;;Department of Computer Science", "aff_domain": "cs.ox.ac.uk;mit.edu;;cs.ox.ac.uk", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nshao2024learning,\ntitle={Learning Decision Policies with Instrumental Variables through Double Machine Learning},\nauthor={Daqian Shao and Ashkan Soleymani and Francesco Quinzan and Marta Kwiatkowska},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iRcmqXZjeK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 786024, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18415278925606339066&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "cs.ox.ac.uk;mit.edu;;cs.ox.ac.uk", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Unknown Institution;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": ";https://web.mit.edu", "aff_unique_abbr": ";MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";United States" }, { "title": "Model Alignment as Prospect Theoretic Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33352", "id": "iUwHnoENnl", "proceeding": "https://proceedings.mlr.press/v235/ethayarajh24a.html", "pdf": "https://openreview.net/pdf?id=iUwHnoENnl", "openreview": "https://openreview.net/forum?id=iUwHnoENnl", "author_site": "Kawin Ethayarajh, Winnie Xu, Niklas Muennighoff, Dan Jurafsky, Douwe Kiela", "tldr": "", "abstract": "Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive random variables in a biased but well-defined manner (1992); for example, humans are famously loss-averse. We show that objectives for aligning LLMs with human feedback implicitly incorporate many of these biases---the success of these objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed to them belonging to a family of loss functions that we call $\\textit{human-aware losses}$ (HALOs). However, the utility functions these methods attribute to humans still differ from those in the prospect theory literature. Using a Kahneman-Tversky model of human utility, we propose a HALO that directly maximizes the utility of generations instead of maximizing the log-likelihood of preferences, as current methods do. 
We call this approach KTO, and it matches or exceeds the performance of preference-based methods at scales from 1B to 30B, despite only learning from a binary signal of whether an output is desirable. More broadly, our work suggests that there is no one HALO that is universally superior; the best loss depends on the inductive biases most appropriate for a given setting, an oft-overlooked consideration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kawin Ethayarajh;Winnie Xu;Niklas Muennighoff;Dan Jurafsky;Douwe Kiela", "authorids": "~Kawin_Ethayarajh1;~Winnie_Xu1;~Niklas_Muennighoff1;~Dan_Jurafsky1;~Douwe_Kiela1", "gender": "M;F;M;M;M", "homepage": "https://kawine.github.io/;https://winniexu.ca;https://muennighoff.github.io/;http://web.stanford.edu/~jurafsky/;https://douwekiela.github.io", "dblp": "198/6540.html;285/6560;281/6745;31/985;136/9140", "google_scholar": "7SUV6rQAAAAJ;k4l-zNYAAAAJ;Me0IoRMAAAAJ;uZg9l58AAAAJ;Q0piorUAAAAJ", "orcid": ";;;;", "linkedin": ";https://linkedin.com/in/winnie-xu;niklasmuennighoff/;;", "or_profile": "~Kawin_Ethayarajh1;~Winnie_Xu1;~Niklas_Muennighoff1;~Dan_Jurafsky1;~Douwe_Kiela1", "aff": "Stanford University;;Allen Institute for Artificial Intelligence;Stanford University;Stanford University", "aff_domain": "stanford.edu;;allenai.org;stanford.edu;stanford.edu", "position": "PhD student;;Researcher;Full Professor;Adjunct Professor", "bibtex": "@inproceedings{\nethayarajh2024model,\ntitle={Model Alignment as Prospect Theoretic Optimization},\nauthor={Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iUwHnoENnl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 943204, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13580731695868703616&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "stanford.edu;;allenai.org;stanford.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stanford University;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://allenai.org", "aff_unique_abbr": "Stanford;AI2", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Locally Interdependent Multi-Agent MDP: Theoretical Framework for Decentralized Agents with Dynamic Dependencies", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33351", "id": "iYYA5zDoCm", "proceeding": "https://proceedings.mlr.press/v235/deweese24a.html", "pdf": "https://openreview.net/pdf?id=iYYA5zDoCm", "openreview": "https://openreview.net/forum?id=iYYA5zDoCm", "author_site": "Alex DeWeese, Guannan Qu", "tldr": "", "abstract": "Many multi-agent systems in practice are decentralized and have dynamically varying dependencies. There has been a lack of attempts in the literature to analyze these systems theoretically. In this paper, we propose and theoretically analyze a decentralized model with dynamically varying dependencies called the Locally Interdependent Multi-Agent MDP. This model can represent problems in many disparate domains such as cooperative navigation, obstacle avoidance, and formation control. 
Despite the intractability that general partially observable multi-agent systems suffer from, we propose three closed-form policies that are theoretically near-optimal in this setting and are scalable to compute and store. Consequently, we reveal a fundamental property of Locally Interdependent Multi-Agent MDPs: the partially observable decentralized solution is exponentially close to the fully observable solution with respect to the visibility radius. We then discuss extensions of our closed-form policies to further improve tractability. We conclude by providing simulations to investigate some long-horizon behaviors of our closed-form policies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alex DeWeese;Guannan Qu", "authorids": "~Alex_DeWeese2;~Guannan_Qu1", "gender": "M;", "homepage": ";https://www.guannanqu.com/", "dblp": ";", "google_scholar": ";oFIXoy8AAAAJ", "orcid": ";", "linkedin": "alex-deweese-7313b9208;", "or_profile": "~Alex_DeWeese2;~Guannan_Qu1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ndeweese2024locally,\ntitle={Locally Interdependent Multi-Agent {MDP}: Theoretical Framework for Decentralized Agents with Dynamic Dependencies},\nauthor={Alex DeWeese and Guannan Qu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iYYA5zDoCm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 494157, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1586842624522009511&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cmu.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Diffuse, Sample, Project: Plug-And-Play Controllable Graph Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33350", "id": "ia0Z8d1DbY", "proceeding": "https://proceedings.mlr.press/v235/sharma24b.html", "pdf": "https://openreview.net/pdf?id=ia0Z8d1DbY", "openreview": "https://openreview.net/forum?id=ia0Z8d1DbY", "author_site": "Kartik Sharma, Srijan Kumar, Rakshit Trivedi", "tldr": "", "abstract": "Diffusion models lend transformative capabilities to the graph generation task, yet controlling the properties of the generated graphs remains challenging. Recent approaches augment support for controlling soft, differentiable properties, but they fail to handle user-specified hard constraints that are non-differentiable. This often results in vague control, unsuitable for applications like drug discovery that demand satisfaction of precise constraints, e.g., the maximum number of bonds. To address this, we formalize the problem of controlled graph generation and introduce PRODIGY (PROjected DIffusion for controlled Graph Generation), an innovative plug-and-play approach enabling the generation of graphs with precise control, from any pre-trained diffusion model. PRODIGY employs a novel operator to project the samples at each diffusion step onto the specified constrained space.
For a large class of practical constraints and a variety of graphs, our extensive experiments demonstrate that PRODIGY empowers state-of-the-art continuous and discrete diffusion models to produce graphs meeting specific, hard constraints. Our approach achieves up to 100% constraint satisfaction for non-attributed and molecular graphs, under a variety of constraints, marking a significant step forward in precise, interpretable graph generation. Code is provided on the project webpage: [https://prodigy-diffusion.github.io/](https://prodigy-diffusion.github.io/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kartik Sharma;Srijan Kumar;Rakshit Trivedi", "authorids": "~Kartik_Sharma1;~Srijan_Kumar1;~Rakshit_Trivedi1", "gender": "M;M;", "homepage": "https://ksartik.github.io;https://faculty.cc.gatech.edu/~srijan/;", "dblp": "121/2254;131/9628;", "google_scholar": "XL3fNAoAAAAJ;kqfLNK8AAAAJ;", "orcid": ";0000-0002-5796-3532;", "linkedin": ";srijankr/;", "or_profile": "~Kartik_Sharma1;~Srijan_Kumar1;~Rakshit_Trivedi1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;", "aff_domain": "gatech.edu;gatech.edu;", "position": "PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nsharma2024diffuse,\ntitle={Diffuse, Sample, Project: Plug-And-Play Controllable Graph Generation},\nauthor={Kartik Sharma and Srijan Kumar and Rakshit Trivedi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ia0Z8d1DbY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8240647, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7189061613545710208&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 4, "email": "gatech.edu;gatech.edu;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Gated Linear Attention Transformers with Hardware-Efficient Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33349", "id": "ia5XvxFUJT", "proceeding": "https://proceedings.mlr.press/v235/yang24ab.html", "pdf": "https://openreview.net/pdf?id=ia5XvxFUJT", "openreview": "https://openreview.net/forum?id=ia5XvxFUJT", "author_site": "Songlin Yang, Bailin Wang, Yikang Shen, Rameswar Panda, Yoon Kim", "tldr": "", "abstract": "Transformers with linear attention allow for efficient parallel training but can simultaneously be formulated as an RNN with 2D (matrix-valued) hidden states, thus enjoying linear-time inference complexity. However, linear attention generally underperforms ordinary softmax attention. Moreover, current implementations of linear attention lack I/O-awareness and are thus slower than highly optimized implementations of softmax attention. This work describes a hardware-efficient algorithm for linear attention that trades off memory movement against parallelizability. The resulting implementation, dubbed FlashLinearAttention, is faster than FlashAttention-2 as a standalone layer even on short sequence lengths (e.g., 1K). We then generalize this algorithm to a more expressive variant of linear attention with data-dependent gates. 
When used as a replacement for the standard attention layer in Transformers, the resulting gated linear attention (GLA) Transformer is found to perform competitively against the LLaMA-architecture Transformer as well as recent linear-time-inference baselines such as RetNet and Mamba on moderate-scale language modeling experiments. The GLA Transformer is especially effective at length generalization, enabling a model trained on 2K to generalize to sequences longer than 20K without significant perplexity degradation. In terms of training speed, the GLA Transformer has higher throughput than a similarly-sized Mamba model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songlin Yang;Bailin Wang;Yikang Shen;Rameswar Panda;Yoon Kim", "authorids": "~Songlin_Yang1;~Bailin_Wang3;~Yikang_Shen1;~Rameswar_Panda1;~Yoon_Kim1", "gender": "F;M;M;;M", "homepage": "https://sustcsonglin.github.io;;https://rpand002.github.io/;https://people.csail.mit.edu/yoonkim/;https://berlino.github.io/", "dblp": ";152/8226;126/0986;;218/7334", "google_scholar": "1chlis0AAAAJ;qff5rRYAAAAJ;_ySuu6gAAAAJ;n_ts4eYAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Songlin_Yang1;~Yikang_Shen1;~Rameswar_Panda1;~Yoon_Kim1;~bailin_wang1", "aff": "Massachusetts Institute of Technology;International Business Machines;MIT-IBM Watson AI Lab;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;ibm.com;ibm.com;mit.edu;mit.edu", "position": "PhD student;Researcher;Research Scientist;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nyang2024gated,\ntitle={Gated Linear Attention Transformers with Hardware-Efficient Training},\nauthor={Songlin Yang and Bailin Wang and Yikang Shen and Rameswar Panda and Yoon Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ia5XvxFUJT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 603975, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=406655288648630318&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 9, "email": "mit.edu;ibm.com;ibm.com;mit.edu;mit.edu", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com", "aff_unique_abbr": "MIT;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Modeling Caption Diversity in Contrastive Vision-Language Pretraining", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33348", "id": "iaV2fU6Dif", "proceeding": "https://proceedings.mlr.press/v235/lavoie24a.html", "pdf": "https://openreview.net/pdf?id=iaV2fU6Dif", "openreview": "https://openreview.net/forum?id=iaV2fU6Dif", "author_site": "Samuel Lavoie, Polina Kirichenko, Mark Ibrahim, Mahmoud Assran, Andrew Wilson, Aaron Courville, Nicolas Ballas", "tldr": "", "abstract": "There are a thousand ways to caption an image. Contrastive Language Pretraining (CLIP), on the other hand, works by mapping an image and its caption to a single vector -- limiting how well CLIP-like models can represent the diverse ways to describe an image.
In this work, we introduce Llip, Latent Language Image Pretraining, which models the diversity of captions that could match an image. Llip's vision encoder outputs a set of visual features that are mixed into a final representation by conditioning on information derived from the text. We show that Llip outperforms non-contextualized baselines like CLIP and SigLIP on a variety of tasks even with large-scale encoders. Llip improves zero-shot classification by an average of 2.9% across zero-shot classification benchmarks with a ViT-G/14 encoder. Specifically, Llip attains a zero-shot top-1 accuracy of 83.5% on ImageNet, outperforming a similarly sized CLIP by 1.4%. We also demonstrate a 6.0% improvement in zero-shot retrieval on MS-COCO. We provide a comprehensive analysis of the components introduced by the method and demonstrate that Llip leads to richer visual representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel Lavoie;Polina Kirichenko;Mark Ibrahim;Mido Assran;Andrew Gordon Wilson;Aaron Courville;Nicolas Ballas", "authorids": "~Samuel_Lavoie1;~Polina_Kirichenko1;~Mark_Ibrahim1;~Mido_Assran1;~Andrew_Gordon_Wilson1;~Aaron_Courville3;~Nicolas_Ballas1", "gender": "F;;Not Specified;;;M;M", "homepage": "https://polkirichenko.github.io/;https://markibrahim.me/;https://cims.nyu.edu/~andrewgw;;;http://www.midoassran.ca/;http://example.com", "dblp": "239/8699;180/5660;65/10453;56/1688;120/9066;216/2717;225/6508", "google_scholar": "05uQHIgAAAAJ;AqYyoCMAAAAJ;https://scholar.google.com.tw/citations?user=twWX2LIAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;euUV4iUAAAAJ;gcQTTvkAAAAJ;", "orcid": ";;;;;0000-0001-9159-8447;", "linkedin": "polkirichenko/;;;;;;", "or_profile": "~Polina_Kirichenko1;~Mark_Ibrahim1;~Andrew_Gordon_Wilson1;~Aaron_Courville3;~Nicolas_Ballas1;~Mahmoud_Assran1;~Samuel_Lavoie-Marchildon1", "aff": "New York University;Facebook AI Research (FAIR) Meta;New York University;Universit\u00e9 de Montr\u00e9al;Meta;Meta;University of Montreal", "aff_domain": "nyu.edu;ai.facebook.com;nyu.edu; ;meta.com;meta.com;umontreal.ca", "position": "PhD student;Researcher;Associate Professor;Assistant Professor;Researcher;Research Scientist;PhD student", "bibtex": "@inproceedings{\nlavoie2024modeling,\ntitle={Modeling Caption Diversity in Contrastive Vision-Language Pretraining},\nauthor={Samuel Lavoie and Polina Kirichenko and Mark Ibrahim and Mido Assran and Andrew Gordon Wilson and Aaron Courville and Nicolas Ballas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iaV2fU6Dif}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2084631, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9099297459332565530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nyu.edu;ai.facebook.com;nyu.edu; ;meta.com;meta.com;umontreal.ca", "author_num": 7, "aff_unique_index": "0;1;0;2;1;1;3", "aff_unique_norm": "New York University;Meta;Universit\u00e9 de Montr\u00e9al;University of Montreal", "aff_unique_dep": ";Facebook AI Research;;", "aff_unique_url": "https://www.nyu.edu;https://www.meta.com;https://www.umontreal.ca;https://wwwumontreal.ca", "aff_unique_abbr": "NYU;Meta AI;UdeM;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;1", "aff_country_unique": "United States;Canada" }, { "title": "DiracDiffusion:
Denoising and Incremental Reconstruction with Assured Data-Consistency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33347", "id": "ibwxzYCep9", "proceeding": "https://proceedings.mlr.press/v235/fabian24b.html", "pdf": "https://openreview.net/pdf?id=ibwxzYCep9", "openreview": "https://openreview.net/forum?id=ibwxzYCep9", "author_site": "Zalan Fabian, Berk Tinaz, Mahdi Soltanolkotabi", "tldr": "", "abstract": "Diffusion models have established a new state of the art in a multitude of computer vision tasks, including image restoration. Diffusion-based inverse problem solvers generate reconstructions of exceptional visual quality from heavily corrupted measurements. However, in what is widely known as the perception-distortion trade-off, the price of perceptually appealing reconstructions is often paid in declined distortion metrics, such as PSNR. Distortion metrics measure faithfulness to the observation, a crucial requirement in inverse problems. In this work, we propose a novel framework for inverse problem solving; namely, we assume that the observation comes from a stochastic degradation process that gradually degrades and noises the original clean image. We learn to reverse the degradation process in order to recover the clean image. Our technique maintains consistency with the original measurement throughout the reverse process, and allows for great flexibility in trading off perceptual quality for improved distortion metrics and sampling speedup via early-stopping. We demonstrate the efficiency of our method on different high-resolution datasets and inverse problems, achieving great improvements over other state-of-the-art diffusion-based methods with respect to both perceptual and distortion metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zalan Fabian;Berk Tinaz;Mahdi Soltanolkotabi", "authorids": "~Zalan_Fabian1;~Berk_Tinaz1;~Mahdi_Soltanolkotabi1", "gender": "M;M;M", "homepage": "https://z-fabian.github.io/;https://berktinaz.github.io/;http://www-bcf.usc.edu/~soltanol/", "dblp": "192/2874;275/8488;75/6691", "google_scholar": "5EKjsXQAAAAJ;gzIzOtAAAAAJ;narJyMAAAAAJ", "orcid": ";;", "linkedin": ";berk-tinaz/;", "or_profile": "~Zalan_Fabian1;~Berk_Tinaz1;~Mahdi_Soltanolkotabi1", "aff": "University of Southern California;Amazon;University of Southern California", "aff_domain": "usc.edu;amazon.com;usc.edu", "position": "Postdoc;Intern;Associate Professor", "bibtex": "@inproceedings{\nfabian2024diracdiffusion,\ntitle={DiracDiffusion: Denoising and Incremental Reconstruction with Assured Data-Consistency},\nauthor={Zalan Fabian and Berk Tinaz and Mahdi Soltanolkotabi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ibwxzYCep9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6374941, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7508433423845953381&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "usc.edu;amazon.com;usc.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Southern California;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.usc.edu;https://www.amazon.com", "aff_unique_abbr": "USC;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States"
}, { "title": "Best of Both Worlds Guarantees for Smoothed Online Quadratic Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33346", "id": "icijMMWwdG", "proceeding": "https://proceedings.mlr.press/v235/bhuyan24a.html", "pdf": "https://openreview.net/pdf?id=icijMMWwdG", "openreview": "https://openreview.net/forum?id=icijMMWwdG", "author_site": "Neelkamal Bhuyan, Debankur Mukherjee, Adam Wierman", "tldr": "", "abstract": "We study the smoothed online quadratic optimization (SOQO) problem where, at each round $t$, a player plays an action $x_t$ in response to a quadratic hitting cost and an additional squared $\\ell_2$-norm cost for switching actions. This problem class has strong connections to a wide range of application domains including smart grid management, adaptive control, and data center management, where switching-efficient algorithms are highly sought after. We study the SOQO problem in both adversarial and stochastic settings, and in this process, perform the first stochastic analysis of this class of problems. We provide the online optimal algorithm when the minimizers of the hitting cost function evolve as a general stochastic process, which, for the case of martingale process, takes the form of a *distribution-agnostic dynamic interpolation algorithm* that we call Lazy Adaptive Interpolation (LAI). Next, we present the stochastic-adversarial trade-off by proving an $\\Omega(T)$ expected regret for the adversarial optimal algorithm in the literature (ROBD) with respect to LAI and, a sub-optimal competitive ratio for LAI in the adversarial setting. Finally, we present a best-of-both-worlds algorithm that obtains a robust adversarial performance while simultaneously achieving a near-optimal stochastic performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Neelkamal Bhuyan;Debankur Mukherjee;Adam Wierman", "authorids": "~Neelkamal_Bhuyan1;~Debankur_Mukherjee1;~Adam_Wierman1", "gender": "M;M;M", "homepage": "https://sites.gatech.edu/neelkamalbhuyan/;https://www.debankur-mukherjee.com/;https://adamwierman.com/", "dblp": ";;56/4447", "google_scholar": "s8tRDL4AAAAJ;https://scholar.google.com/citations?hl=en;4OvOdSgAAAAJ", "orcid": "0009-0007-5662-2780;0000-0003-1678-4893;0000-0002-5923-0199", "linkedin": "neelkamal-bhuyan;;adam-wierman-a529474/", "or_profile": "~Neelkamal_Bhuyan1;~Debankur_Mukherjee1;~Adam_Wierman1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;California Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;caltech.edu", "position": "PhD student;Assistant Professor;Professor", "bibtex": "@inproceedings{\nbhuyan2024best,\ntitle={Best of Both Worlds Guarantees for Smoothed Online Quadratic Optimization},\nauthor={Neelkamal Bhuyan and Debankur Mukherjee and Adam Wierman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=icijMMWwdG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 644380, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11895281084927280548&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "gatech.edu;gatech.edu;caltech.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Georgia Institute of Technology;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.gatech.edu;https://www.caltech.edu", "aff_unique_abbr": "Georgia Tech;Caltech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Evaluating Model Bias Requires Characterizing its Mistakes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33345", "id": "idyUNsoZ75", "proceeding": "https://proceedings.mlr.press/v235/albuquerque24a.html", "pdf": "https://openreview.net/pdf?id=idyUNsoZ75", "openreview": "https://openreview.net/forum?id=idyUNsoZ75", "author_site": "Isabela Albuquerque, Jessica Schrouff, David Warde-Farley, Taylan Cemgil, Sven Gowal, Olivia Wiles", "tldr": "", "abstract": "The ability to properly benchmark model performance in the face of spurious correlations is important to both build better predictors and increase confidence that models are operating as intended. We demonstrate that characterizing (as opposed to simply quantifying) model mistakes across subgroups is pivotal to properly reflect model biases, which are ignored by standard metrics such as worst-group accuracy or accuracy gap. Inspired by the hypothesis testing framework, we introduce SkewSize, a principled and flexible metric that captures bias from mistakes in a model's predictions. It can be used in multi-class settings or generalised to the open vocabulary setting of generative models. SkewSize is an aggregation of the effect size of the interaction between two categorical variables: the spurious variable representing the bias attribute the model's prediction. We demonstrate the utility of SkewSize in multiple settings including: standard vision models trained on synthetic data, vision models trained on ImageNet, and large scale vision-and-language models from the BLIP-2 family. 
In each case, the proposed SkewSize is able to highlight biases not captured by other metrics, while also providing insights on the impact of recently proposed techniques, such as instruction tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Isabela Albuquerque;Jessica Schrouff;David Warde-Farley;Ali Taylan Cemgil;Sven Gowal;Olivia Wiles", "authorids": "~Isabela_Albuquerque1;~Jessica_Schrouff1;~David_Warde-Farley1;~Ali_Taylan_Cemgil2;~Sven_Gowal2;~Olivia_Wiles1", "gender": "F;F;M;;M;M", "homepage": ";;;;https://www.cmpe.boun.edu.tr/~cemgil/;", "dblp": "210/2719;96/9449;71/9421;194/3191;41/6613;75/8368", "google_scholar": ";https://scholar.google.co.uk/citations?user=2YWm2nMAAAAJ;https://scholar.google.ca/citations?user=MOgfm8oAAAAJ;https://scholar.google.co.uk/citations?user=XQzHJSgAAAAJ;X3ZFZ7AAAAAJ;", "orcid": ";0000-0003-4992-3183;;;http://orcid.org/0000-0003-4463-8455;", "linkedin": ";jessica-schrouff/;;;;", "or_profile": "~Isabela_Albuquerque1;~Jessica_Schrouff1;~David_Warde-Farley1;~Olivia_Wiles1;~ali_taylan_cemgil1;~Sven_Gowal1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google;Bogazici University;Google DeepMind", "aff_domain": "deepmind.com;google.com;google.com;google.com;boun.edu.tr;google.com", "position": "Researcher;Senior Researcher;Research Scientist;Researcher;Full Professor;Research Engineer", "bibtex": "@inproceedings{\nalbuquerque2024evaluating,\ntitle={Evaluating Model Bias Requires Characterizing its Mistakes},\nauthor={Isabela Albuquerque and Jessica Schrouff and David Warde-Farley and Ali Taylan Cemgil and Sven Gowal and Olivia Wiles},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=idyUNsoZ75}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1110540, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6422897898317597494&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": "deepmind.com;google.com;google.com;google.com;boun.edu.tr;google.com", "author_num": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Google;Bogazici University", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.boun.edu.tr", "aff_unique_abbr": "DeepMind;BU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;2;0", "aff_country_unique": "United Kingdom;United States;T\u00fcrkiye" }, { "title": "On the Unexpected Effectiveness of Reinforcement Learning for Sequential Recommendation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33344", "id": "ie3vXkMvRY", "proceeding": "https://proceedings.mlr.press/v235/silva24b.html", "pdf": "https://openreview.net/pdf?id=ie3vXkMvRY", "openreview": "https://openreview.net/forum?id=ie3vXkMvRY", "author_site": "\u00c1lvaro Labarca Silva, Denis Parra, Rodrigo A Toro Icarte", "tldr": "", "abstract": "In recent years, Reinforcement Learning (RL) has shown great promise in session-based recommendation. Sequential models that use RL have reached state-of-the-art performance for the Next-item Prediction (NIP) task. 
This result is intriguing, as the NIP task only evaluates how well the system can correctly recommend the next item to the user, while the goal of RL is to find a policy that optimizes rewards in the long term -- sometimes at the expense of suboptimal short-term performance. Then, how can RL improve the system's performance on short-term metrics? This article investigates this question by exploring proxy learning objectives, which we identify as goals RL models might be following, and thus could explain the performance boost. We found that RL -- when used as an auxiliary loss -- promotes the learning of embeddings that capture information about the user's previously interacted items. Subsequently, we replaced the RL objective with a straightforward auxiliary loss designed to predict the number of items the user interacted with. This substitution results in performance gains comparable to RL. These findings pave the way to improve performance and understanding of RL methods for recommender systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "\u00c1lvaro Labarca Silva;Denis Parra;Rodrigo Toro Icarte", "authorids": "~\u00c1lvaro_Labarca_Silva1;~Denis_Parra1;~Rodrigo_Toro_Icarte1", "gender": "M;M;M", "homepage": ";https://dparra.sitios.ing.uc.cl/;http://www.cs.toronto.edu/~rntoro/", "dblp": "323/4767;09/7458;200/8660", "google_scholar": "gQf-V9wAAAAJ;aQ9TkcIAAAAJ;https://scholar.google.ca/citations?user=W9DykFMAAAAJ", "orcid": ";0000-0001-9878-8761;0000-0002-7734-099X", "linkedin": ";denisparra/;", "or_profile": "~\u00c1lvaro_Labarca_Silva1;~Denis_Parra1;~Rodrigo_Toro_Icarte1", "aff": "Pontificia Universidad Catolica de Chile;Pontificia Universidad Catolica de Chile;Pontificia Universidad Catolica de Chile", "aff_domain": "uc.cl;puc.cl;uc.cl", "position": "MS student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nsilva2024on,\ntitle={On the Unexpected Effectiveness of Reinforcement Learning for Sequential Recommendation},\nauthor={{\\'A}lvaro Labarca Silva and Denis Parra and Rodrigo Toro Icarte},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ie3vXkMvRY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 350572, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15196936426482936535&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "uc.cl;puc.cl;uc.cl", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Pontificia Universidad Catolica de Chile", "aff_unique_dep": "", "aff_unique_url": "https://www.puc.cl", "aff_unique_abbr": "PUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Chile" }, { "title": "Private Gradient Descent for Linear Regression: Tighter Error Bounds and Instance-Specific Uncertainty Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33343", "id": "igRAPavrrS", "proceeding": "https://proceedings.mlr.press/v235/brown24a.html", "pdf": "https://openreview.net/pdf?id=igRAPavrrS", "openreview": "https://openreview.net/forum?id=igRAPavrrS", "author_site": "Gavin Brown, Krishnamurthy Dvijotham, Georgina Evans, Daogao Liu, Adam Smith, Abhradeep Guha Thakurta", "tldr": "", "abstract": "We provide an improved analysis of standard differentially private gradient descent for linear regression under the squared 
error loss. Under modest assumptions on the input, we characterize the distribution of the iterate at each time step. Our analysis leads to new results on the algorithm's accuracy: for a proper fixed choice of hyperparameters, the sample complexity depends only linearly on the dimension of the data. This matches the dimension-dependence of the (non-private) ordinary least squares estimator as well as that of recent private algorithms that rely on sophisticated adaptive gradient-clipping schemes (Varshney et al., 2022; Liu et al., 2023). Our analysis of the iterates' distribution also allows us to construct confidence intervals for the empirical optimizer which adapt automatically to the variance of the algorithm on a particular data set. We validate our theorems through experiments on synthetic data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gavin R Brown;Krishnamurthy Dj Dvijotham;Georgina Evans;Daogao Liu;Adam Smith;Abhradeep Guha Thakurta", "authorids": "~Gavin_R_Brown1;~Krishnamurthy_Dj_Dvijotham1;~Georgina_Evans1;~Daogao_Liu1;~Adam_Smith1;~Abhradeep_Guha_Thakurta1", "gender": ";;M;M;M;M", "homepage": "https://homes.cs.washington.edu/~grbrown/;https://sites.google.com/corp/view/georginaevans;https://daogaoliu.github.io/;http://cs-people.bu.edu/ads22;https://athakurta.squarespace.com/;http://dvij.github.io", "dblp": "93/2057-3.html;;245/4078;04/5072;31/8315;16/8758", "google_scholar": "gBkMDloAAAAJ;Hqs-x-wAAAAJ;auA3AaQAAAAJ;fkGi-JMAAAAJ;1rV69hMAAAAJ;BUtloecAAAAJ", "orcid": ";;;;;", "linkedin": ";georgina-evans-9b9639b9/;;;;", "or_profile": "~Gavin_R_Brown1;~Georgina_Evans1;~Daogao_Liu1;~Adam_Smith1;~Abhradeep_Guha_Thakurta1;~Krishnamurthy_Dvijotham2", "aff": "Boston University, Boston University;Google;University of Washington, Seattle;Google;Google;Google DeepMind", "aff_domain": "bu.edu;google.com;uw.edu;google.com;google.com;google.com", "position": "PhD student;Researcher;PhD student;Researcher;Senior Research Scientist;Researcher", "bibtex": "@inproceedings{\nbrown2024private,\ntitle={Private Gradient Descent for Linear Regression: Tighter Error Bounds and Instance-Specific Uncertainty Estimation},\nauthor={Gavin R Brown and Krishnamurthy Dj Dvijotham and Georgina Evans and Daogao Liu and Adam Smith and Abhradeep Guha Thakurta},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=igRAPavrrS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 786451, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12907926592313466563&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "bu.edu;google.com;uw.edu;google.com;google.com;google.com", "author_num": 6, "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "Boston University;Google;University of Washington", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.bu.edu;https://www.google.com;https://www.washington.edu", "aff_unique_abbr": "BU;Google;UW", "aff_campus_unique_index": "0;1;2;1;1", "aff_campus_unique": "Boston;Mountain View;Seattle;", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Unified Generation, Reconstruction, and Representation: Generalized Diffusion with Adaptive Latent Encoding-Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33342", "id": "igRjCCAz2a", "proceeding": 
"https://proceedings.mlr.press/v235/liu24bh.html", "pdf": "https://openreview.net/pdf?id=igRjCCAz2a", "openreview": "https://openreview.net/forum?id=igRjCCAz2a", "author_site": "Guangyi Liu, Yu Wang, Zeyu Feng, Qiyu Wu, Liping Tang, Yuan Gao, Zhen Li, Shuguang Cui, Julian McAuley, Zichao Yang, Eric Xing, Zhiting Hu", "tldr": "", "abstract": "The vast applications of deep generative models are anchored in three core capabilities---*generating* new instances, *reconstructing* inputs, and learning compact *representations*---across various data types, such as discrete text/protein sequences and continuous images. Existing model families, like variational autoencoders (VAEs), generative adversarial networks (GANs), autoregressive models, and (latent) diffusion models, generally excel in specific capabilities and data types but fall short in others. We introduce *Generalized* ***E****ncoding*-***D****ecoding ****D****iffusion ****P****robabilistic ****M****odels* (EDDPMs) which integrate the core capabilities for broad applicability and enhanced performance. EDDPMs generalize the Gaussian noising-denoising in standard diffusion by introducing parameterized encoding-decoding. Crucially, EDDPMs are compatible with the well-established diffusion model objective and training recipes, allowing effective learning of the encoder-decoder parameters *jointly* with diffusion. By choosing appropriate encoder/decoder (e.g., large language models), EDDPMs naturally apply to different data types. Extensive experiments on text, proteins, and images demonstrate the flexibility to handle diverse data and tasks and the strong improvement over various existing models. Code is available at https://github.com/guangyliu/EDDPM .", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guangyi Liu;Yu Wang;Zeyu Feng;Qiyu Wu;Liping Tang;Yuan Gao;Zhen Li;Shuguang Cui;Julian McAuley;Zichao Yang;Eric P. 
Xing;Zhiting Hu", "authorids": "~Guangyi_Liu1;~Yu_Wang24;~Zeyu_Feng2;~Qiyu_Wu2;~Liping_Tang2;~Yuan_Gao11;~Zhen_Li6;~Shuguang_Cui1;~Julian_McAuley1;~Zichao_Yang1;~Eric_Xing1;~Zhiting_Hu3", "gender": ";M;M;;F;M;;M;M;M;M;M", "homepage": ";https://wangyu-ustc.github.io/;;;;https://www.linkedin.com/in/rab0na/;;https://sse.cuhk.edu.cn/en/content/1415;http://cseweb.ucsd.edu/~jmcauley/;;http://www.cs.cmu.edu/~epxing/;http://zhiting.ucsd.edu", "dblp": ";;;;;;;48/4914;29/3483;07/8707;36/3855;134/4031", "google_scholar": ";https://scholar.google.com/citations?hl=en;;;;;;https://scholar.google.com.hk/citations?user=1o_qvR0AAAAJ;icbo4M0AAAAJ;https://scholar.google.co.uk/citations?user=siCYLcUAAAAJ;https://scholar.google.com.tw/citations?user=5pKTRxEAAAAJ;N7_xhHoAAAAJ", "orcid": ";;;;;;;0000-0003-2608-775X;0000-0003-0955-7588;;;", "linkedin": ";;zeyu-feng-1800831a1;;%E4%B8%BD%E5%B9%B3-%E5%94%90-51972419a;rab0na/;;;;;;", "or_profile": "~Guangyi_Liu1;~Yu_Wang24;~Zeyu_Feng2;~Qiyu_Wu2;~Liping_Tang2;~Yuan_Gao11;~Zhen_Li6;~Shuguang_Cui1;~Julian_McAuley1;~Zichao_Yang1;~Eric_Xing1;~Zhiting_Hu3", "aff": ";University of California, San Diego;University of California, San Diego;;Mohamed bin Zayed University of Artificial Intelligence;Computer Science Department, Stanford University;;The Chinese University of Hong Kong, Shenzhen;University of California, San Diego, University of California, San Diego;;School of Computer Science, Carnegie Mellon University;Amazon", "aff_domain": ";ucsd.edu;ucsd.edu;;mbzuai.ac.ae;cs.stanford.edu;;cuhk.edu.cn;eng.ucsd.edu;;cs.cmu.edu;amazon.com", "position": ";PhD student;Master Student;;NLP Engineer;MS student;;Full Professor;Full Professor;;Full Professor;Researcher", "bibtex": "@inproceedings{\nliu2024unified,\ntitle={Unified Generation, Reconstruction, and Representation: Generalized Diffusion with Adaptive Latent Encoding-Decoding},\nauthor={Guangyi Liu and Yu Wang and Zeyu Feng and Qiyu Wu and Liping Tang and Yuan Gao and Zhen Li and Shuguang Cui and Julian McAuley and Zichao Yang and Eric P. 
Xing and Zhiting Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=igRjCCAz2a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3298910, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17396188301688411708&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";ucsd.edu;ucsd.edu;;mbzuai.ac.ae;cs.stanford.edu;;cuhk.edu.cn;eng.ucsd.edu;;cs.cmu.edu;amazon.com", "author_num": 12, "aff_unique_index": "0;0;1;2;3;0;4;5", "aff_unique_norm": "University of California, San Diego;Mohamed bin Zayed University of Artificial Intelligence;Stanford University;Chinese University of Hong Kong;Carnegie Mellon University;Amazon", "aff_unique_dep": ";;Computer Science Department;;School of Computer Science;Amazon.com, Inc.", "aff_unique_url": "https://www.ucsd.edu;https://mbzuai.ac.ae;https://www.stanford.edu;https://www.cuhk.edu.cn;https://www.cmu.edu;https://www.amazon.com", "aff_unique_abbr": "UCSD;MBZUAI;Stanford;CUHK;CMU;Amazon", "aff_campus_unique_index": "0;0;2;3;0;4", "aff_campus_unique": "San Diego;;Stanford;Shenzhen;Pittsburgh", "aff_country_unique_index": "0;0;1;0;2;0;0;0", "aff_country_unique": "United States;United Arab Emirates;China" }, { "title": "Causal Customer Churn Analysis with Low-rank Tensor Block Hazard Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33341", "id": "ihv6pWuILN", "proceeding": "https://proceedings.mlr.press/v235/gao24q.html", "pdf": "https://openreview.net/pdf?id=ihv6pWuILN", "openreview": "https://openreview.net/forum?id=ihv6pWuILN", "author_site": "Chenyin Gao, ZHIMING ZHANG, Shu Yang", "tldr": "", "abstract": "This study introduces an innovative method for analyzing the impact of various interventions on customer churn, using the potential outcomes framework. We present a new causal model, the tensorized latent factor block hazard model, which incorporates tensor completion methods for a principled causal analysis of customer churn. A crucial element of our approach is the formulation of a 1-bit tensor completion for the parameter tensor. This captures hidden customer characteristics and temporal elements from churn records, effectively addressing the binary nature of churn data and its time-monotonic trends. Our model also uniquely categorizes interventions by their similar impacts, enhancing the precision and practicality of implementing customer retention strategies. For computational efficiency, we apply a projected gradient descent algorithm combined with spectral clustering. We lay down the theoretical groundwork for our model, including its non-asymptotic properties. 
The efficacy and superiority of our model are further validated through comprehensive experiments on both simulated and real-world applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenyin Gao;Zhiming Zhang;Shu Yang", "authorids": "~Chenyin_Gao1;zhangzm05@gmail.com;~Shu_Yang4", "gender": "M;;F", "homepage": "https://gaochenyin.github.io/Personal-Website/Chenyin_Gao_CV;;https://shuyang.wordpress.ncsu.edu/", "dblp": ";;", "google_scholar": "https://scholar.google.com/citations?hl=en;;ySYRNQMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Chenyin_Gao1;zhangzm05@gmail.com;~Shu_Yang4", "aff": "North Carolina State University;;North Carolina State University", "aff_domain": "ncsu.edu;;ncsu.edu", "position": "PhD student;;Associate Professor", "bibtex": "@inproceedings{\ngao2024causal,\ntitle={Causal Customer Churn Analysis with Low-rank Tensor Block Hazard Model},\nauthor={Chenyin Gao and Zhiming Zhang and Shu Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ihv6pWuILN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 605521, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BJRMrDynMt8J:scholar.google.com/&scioq=Causal+Customer+Churn+Analysis+with+Low-rank+Tensor+Block+Hazard+Model&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "ncsu.edu;;ncsu.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "North Carolina State University", "aff_unique_dep": "", "aff_unique_url": "https://www.ncsu.edu", "aff_unique_abbr": "NCSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Mol-AE: Auto-Encoder Based Molecular Representation Learning With 3D Cloze Test Objective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33340", "id": "inEuvSg0y1", "proceeding": "https://proceedings.mlr.press/v235/yang24al.html", "pdf": "https://openreview.net/pdf?id=inEuvSg0y1", "openreview": "https://openreview.net/forum?id=inEuvSg0y1", "author_site": "Junwei Yang, Kangjie Zheng, Siyu Long, Zaiqing Nie, Ming Zhang, Xinyu Dai, Wei-Ying Ma, Hao Zhou", "tldr": "", "abstract": "3D molecular representation learning has gained tremendous interest and achieved promising performance in various downstream tasks. A series of recent approaches follow a prevalent framework: an encoder-only model coupled with a coordinate denoising objective. However, through a series of analytical experiments, we prove that the encoder-only model with coordinate denoising objective exhibits inconsistency between pre-training and downstream objectives, as well as issues with disrupted atomic identifiers. To address these two issues, we propose Mol-AE for molecular representation learning, an auto-encoder model using positional encoding as atomic identifiers. We also propose a new training objective named 3D Cloze Test to make the model learn better atom spatial relationships from real molecular substructures. 
Empirical results demonstrate that Mol-AE achieves a large margin performance gain compared to the current state-of-the-art 3D molecular modeling approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junwei Yang;Kangjie Zheng;Siyu Long;Zaiqing Nie;Ming Zhang;Xinyu Dai;Wei-Ying Ma;Hao Zhou", "authorids": "~Junwei_Yang2;~Kangjie_Zheng1;~Siyu_Long1;~Zaiqing_Nie2;~Ming_Zhang5;~Xinyu_Dai1;~Wei-Ying_Ma2;~Hao_Zhou5", "gender": "M;M;M;F;M;M;M;M", "homepage": "https://scholar.google.com/citations?user=n8kbAwQAAAAJ&hl=en;https://longlongman.github.io;https://air.tsinghua.edu.cn/en/info/1046/1192.htm;https://cs.pku.edu.cn/info/1080/1371.htm;http://cs.nju.edu.cn/daixinyu;https://air.tsinghua.edu.cn/en/info/1046/1189.htm;https://zhouh.github.io/;https://github.com/yjwtheonly", "dblp": ";234/9275;n/ZaiqingNie;73/1844-4;39/5815;m/WYMa.html;63/778-12;", "google_scholar": "n8kbAwQAAAAJ;aOfk1hsAAAAJ;;LbzoQBsAAAAJ;https://scholar.google.com/citations?hl=en;SToCbu8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;kbGJGvsAAAAJ", "orcid": ";0000-0002-9944-4837;0000-0002-1134-2343;0000-0002-9809-3430;;;;", "linkedin": ";siyulong;;;;wei-ying-ma-16a0171/;;", "or_profile": "~Kangjie_Zheng1;~Siyu_Long1;~Zaiqing_Nie2;~Ming_Zhang5;~Xinyu_Dai1;~Wei-Ying_Ma2;~Hao_Zhou5;~junwei_yang1", "aff": "Tsinghua University;Nanjing University;Tsinghua University;Peking University;Nanjing University;Tsinghua University;Tsinghua University;Peking University", "aff_domain": "tsinghua.edu.cn;nju.edu.cn;tsinghua.edu.cn;pku.edu.cn;nju.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;pku.edu.cn", "position": "Intern;PhD student;Full Professor;Full Professor;Full Professor;Full Professor;Associate Professor;PhD Student", "bibtex": "@inproceedings{\nyang2024molae,\ntitle={Mol-{AE}: Auto-Encoder Based Molecular Representation Learning With 3D Cloze Test Objective},\nauthor={Junwei Yang and Kangjie Zheng and Siyu Long and Zaiqing Nie and Ming Zhang and Xinyu Dai and Wei-Ying Ma and Hao Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=inEuvSg0y1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3395917, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12863865037424775495&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;nju.edu.cn;tsinghua.edu.cn;pku.edu.cn;nju.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;pku.edu.cn", "author_num": 8, "aff_unique_index": "0;1;0;2;1;0;0;2", "aff_unique_norm": "Tsinghua University;Nanjing University;Peking University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.nju.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "THU;Nanjing U;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "The Expressive Power of Path-Based Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33339", "id": "io1XSRtcO8", "proceeding": "https://proceedings.mlr.press/v235/graziani24a.html", "pdf": "https://openreview.net/pdf?id=io1XSRtcO8", "openreview": "https://openreview.net/forum?id=io1XSRtcO8", "author_site": "Caterina Graziani, Tamara Drucks, Fabian Jogl, Monica Bianchini, franco scarselli, Thomas G\u00e4rtner", "tldr": "", "abstract": "We systematically investigate the expressive power of 
path-based graph neural networks. While it has been shown that path-based graph neural networks can achieve strong empirical results, an investigation into their expressive power is lacking. Therefore, we propose PATH-WL, a general class of color refinement algorithms based on paths and shortest path distance information. We show that PATH-WL is incomparable to a wide range of expressive graph neural networks, can count cycles, and achieves strong empirical results on the notoriously difficult family of strongly regular graphs. Our theoretical results indicate that PATH-WL forms a new hierarchy of highly expressive graph neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Caterina Graziani;Tamara Drucks;Fabian Jogl;Monica Bianchini;franco scarselli;Thomas G\u00e4rtner", "authorids": "~Caterina_Graziani1;~Tamara_Drucks1;~Fabian_Jogl1;~Monica_Bianchini1;~franco_scarselli1;~Thomas_G\u00e4rtner2", "gender": "F;F;M;F;M;M", "homepage": ";https://informatics.tuwien.ac.at/people/tamara-drucks;https://fjo.gl/;https://www.unisi.it/ugov/person/9664;;https://thomasgaertner.org/", "dblp": "134/0476;;292/7003;;71/2155;https://dblp.uni-trier.de/pers/hd/g/G=auml=rtner_0001:Thomas", "google_scholar": "OMVOd6gAAAAJ;0vCiYA4AAAAJ;;tniAcpAAAAAJ;https://scholar.google.it/citations?user=lD59_7EAAAAJ;sOI8QyoAAAAJ", "orcid": "0000-0002-7606-9405;;;0000-0002-8206-8142;0000-0003-1307-0772;0000-0001-5985-9213", "linkedin": ";;;;;", "or_profile": "~Caterina_Graziani1;~Tamara_Drucks1;~Fabian_Jogl1;~Monica_Bianchini1;~franco_scarselli1;~Thomas_G\u00e4rtner2", "aff": "University of Siena;NII, Tokyo Institute of Technology;TU Wien;University of Siena;University of Siena;TU Wien", "aff_domain": "unisi.it;nii.ac.jp;tuwien.ac.at;unisi.it;unisi.it;tuwien.ac.at", "position": "PhD student;Intern;PhD student;Associate Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ngraziani2024the,\ntitle={The Expressive Power of Path-Based Graph Neural Networks},\nauthor={Caterina Graziani and Tamara Drucks and Fabian Jogl and Monica Bianchini and franco scarselli and Thomas G{\\\"a}rtner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=io1XSRtcO8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1761571, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15346684841985212036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "unisi.it;nii.ac.jp;tuwien.ac.at;unisi.it;unisi.it;tuwien.ac.at", "author_num": 6, "aff_unique_index": "0;1;2;0;0;2", "aff_unique_norm": "University of Siena;Tokyo Institute of Technology;Technische Universit\u00e4t Wien", "aff_unique_dep": ";;", "aff_unique_url": "https://www.unisi.it;https://www.titech.ac.jp;https://www.tuwien.ac.at", "aff_unique_abbr": "UniSi;Titech;TU Wien", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;1;2;0;0;2", "aff_country_unique": "Italy;Japan;Austria" }, { "title": "Statistical Properties of Robust Satisficing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33338", "id": "iqAyWVLUEO", "proceeding": "https://proceedings.mlr.press/v235/li24cc.html", "pdf": "https://openreview.net/pdf?id=iqAyWVLUEO", "openreview": "https://openreview.net/forum?id=iqAyWVLUEO", "author_site": "zhiyi li, Yunbei Xu, Ruohan Zhan", "tldr": "", "abstract": "The Robust 
Satisficing (RS) model is an emerging approach to robust optimization, offering streamlined procedures and robust generalization across various applications. However, the statistical theory of RS remains unexplored in the literature. This paper fills in the gap by comprehensively analyzing the theoretical properties of the RS model. Notably, the RS structure offers a more straightforward path to deriving statistical guarantees compared to the seminal Distributionally Robust Optimization (DRO), resulting in a richer set of results. In particular, we establish two-sided confidence intervals for the optimal loss without the need to solve a minimax optimization problem explicitly. We further provide finite-sample generalization error bounds for the RS optimizer. Importantly, our results extend to scenarios involving distribution shifts, where discrepancies exist between the sampling and target distributions. Our numerical experiments show that the RS model consistently outperforms the baseline empirical risk minimization in small-sample regimes and under distribution shifts. Furthermore, compared to the DRO model, the RS model exhibits lower sensitivity to hyperparameter tuning, highlighting its practicability for robustness considerations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "zhiyi li;Yunbei Xu;Ruohan Zhan", "authorids": "~zhiyi_li3;~Yunbei_Xu1;~Ruohan_Zhan1", "gender": "M;M;F", "homepage": "https://github.com/lzy1900011745/homepage;https://yunbeixu.github.io/;https://ruohanzhan.github.io", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~zhiyi_li3;~Yunbei_Xu1;~Ruohan_Zhan1", "aff": "Peking University;Massachusetts Institute of Technology;Hong Kong University of Science and Technology", "aff_domain": "pku.edu;mit.edu;ust.hk", "position": "Undergrad student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nli2024statistical,\ntitle={Statistical Properties of Robust Satisficing},\nauthor={zhiyi li and Yunbei Xu and Ruohan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iqAyWVLUEO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9044991, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1815778390504324926&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 5, "email": "pku.edu;mit.edu;ust.hk", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Peking University;Massachusetts Institute of Technology;Hong Kong University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://web.mit.edu;https://www.ust.hk", "aff_unique_abbr": "Peking U;MIT;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "Residual-Conditioned Optimal Transport: Towards Structure-Preserving Unpaired and Paired Image Restoration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33337", "id": "irBHPlknxP", "proceeding": "https://proceedings.mlr.press/v235/tang24d.html", "pdf": "https://openreview.net/pdf?id=irBHPlknxP", "openreview": "https://openreview.net/forum?id=irBHPlknxP", "author_site": "Xiaole Tang, Hu Xin, Xiang Gu, Jian Sun", "tldr": "", "abstract": "Deep learning-based image restoration methods generally struggle with faithfully 
preserving the structures of the original image. In this work, we propose a novel Residual-Conditioned Optimal Transport (RCOT) approach, which models image restoration as an optimal transport (OT) problem for both unpaired and paired settings, introducing the transport residual as a unique degradation-specific cue for both the transport cost and the transport map. Specifically, we first formalize a Fourier residual-guided OT objective by incorporating the degradation-specific information of the residual into the transport cost. We further design the transport map as a two-pass RCOT map that comprises a base model and a refinement process, in which the transport residual is computed by the base model in the first pass and then encoded as a degradation-specific embedding to condition the second-pass restoration. By duality, the RCOT problem is transformed into a minimax optimization problem, which can be solved by adversarially training neural networks. Extensive experiments on multiple restoration tasks show that RCOT achieves competitive performance in terms of both distortion measures and perceptual quality, restoring images with more faithful structures as compared with state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaole Tang;Xin Hu;Xiang Gu;Jian Sun", "authorids": "~Xiaole_Tang1;~Xin_Hu1;~Xiang_Gu1;~Jian_Sun1", "gender": "M;F;M;M", "homepage": ";;https://xjtu-xgu.github.io/xianggu/;https://gr.xjtu.edu.cn/en/web/jiansun/publications", "dblp": "331/0808;;57/7710-5;68/4942-9.html", "google_scholar": "CwvKZ8QAAAAJ;https://scholar.google.com.hk/citations?user=zt9JYkkAAAAJ;51GDv0EAAAAJ;SSgNWOMAAAAJ", "orcid": "0009-0003-8070-0591;;;", "linkedin": ";https://www.linkedin.cn/injobs/in/\u946b-\u80e1-2b373b154;;", "or_profile": "~Xiaole_Tang1;~Xin_Hu1;~Xiang_Gu1;~Jian_Sun1", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University", "aff_domain": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn", "position": "PhD student;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\ntang2024residualconditioned,\ntitle={Residual-Conditioned Optimal Transport: Towards Structure-Preserving Unpaired and Paired Image Restoration},\nauthor={Xiaole Tang and Xin Hu and Xiang Gu and Jian Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=irBHPlknxP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6916144, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6566965064830615312&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Xi'an Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Position: What Can Large Language Models Tell Us about Time Series Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33336", "id": "iroZNDxFJZ", "proceeding": "https://proceedings.mlr.press/v235/jin24i.html", "pdf": "https://openreview.net/pdf?id=iroZNDxFJZ", "openreview": "https://openreview.net/forum?id=iroZNDxFJZ", "author_site": "Ming Jin, Yi-Fan 
Zhang, Wei Chen, Kexin Zhang, Yuxuan Liang, Bin Yang, Jindong Wang, Shirui Pan, Qingsong Wen", "tldr": "", "abstract": "Time series analysis is essential for comprehending the complexities inherent in various real-world systems and applications. Although large language models (LLMs) have recently made significant strides, the development of artificial general intelligence (AGI) equipped with time series analysis capabilities remains in its nascent phase. Most existing time series models heavily rely on domain knowledge and extensive model tuning, predominantly focusing on prediction tasks. In this paper, we argue that current LLMs have the potential to revolutionize time series analysis, thereby promoting efficient decision-making and advancing towards a more universal form of time series analytical intelligence. Such advancement could unlock a wide range of possibilities, including time series modality switching and question answering. We encourage researchers and practitioners to recognize the potential of LLMs in advancing time series analysis and emphasize the need for trust in these related efforts. Furthermore, we detail the seamless integration of time series analysis with existing LLM technologies and outline promising avenues for future research.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ming Jin;YiFan Zhang;Wei Chen;Kexin Zhang;Yuxuan Liang;Bin Yang;Jindong Wang;Shirui Pan;Qingsong Wen", "authorids": "~Ming_Jin3;~YiFan_Zhang8;~Wei_Chen50;~Kexin_Zhang3;~Yuxuan_Liang1;~Bin_Yang4;~Jindong_Wang1;~Shirui_Pan1;~Qingsong_Wen2", "gender": "M;M;M;M;M;;M;M;M", "homepage": "https://mingjin.dev/;https://onedean.github.io/;https://www.zhihu.com/people/zhang-ke-xin-16-57;https://yuxuanliang.com;https://faculty.ecnu.edu.cn/_s37/yb2/main.psp;;https://jd92.wang/;https://sites.google.com/site/qingsongwen8/;https://yfzhang114.github.io/", "dblp": "34/3870-5;;119/0668-4;183/0977;77/377-2;91/8171;19/2969-1;27/561;", "google_scholar": "I2xvKaIAAAAJ;RCfQIcQAAAAJ;;n9cODgcAAAAJ;qjBQhoUAAAAJ;https://scholar.google.com.au/citations?user=frWRJN4AAAAJ;hBZ_tKsAAAAJ;vjPJvwYAAAAJ;lUnt8X4AAAAJ", "orcid": "0000-0002-6833-4811;;0000-0003-1968-8004;0000-0003-2817-7337;0000-0002-1658-1079;0000-0003-0794-527X;0000-0002-4833-0880;0000-0003-4516-2524;0000-0002-6227-0183", "linkedin": ";;;yoshall/;;;jindong-wang/;qingsong-wen-22814156/;", "or_profile": "~Ming_Jin3;~Wei_Chen50;~Kexin_Zhang3;~Yuxuan_Liang1;~Bin_Yang4;~Shirui_Pan1;~Jindong_Wang4;~Qingsong_Wen1;~yifan_zhang7", "aff": "Griffith University;Hong Kong University of Science and Technology, Guangzhou;Zhejiang University;The Hong Kong University of Science and Technology (Guangzhou);Aalborg University;Griffith University;Microsoft Research;Squirrel Ai Learning;Institute of automation, Chinese academy of science", "aff_domain": "griffith.edu.au;hkust-gz.edu.cn;zju.edu.cn;hkust-gz.edu.cn;aau.dk;griffith.edu.au;microsoft.com;squirrelai.com;nlpr.ia.ac.cn", "position": "Assistant Professor;PhD student;Postdoc;Assistant Professor;Full Professor;Full Professor;Researcher;Principal Researcher;PhD student", "bibtex": "@inproceedings{\njin2024position,\ntitle={Position: What Can Large Language Models Tell Us about Time Series Analysis},\nauthor={Ming Jin and YiFan Zhang and Wei Chen and Kexin Zhang and Yuxuan Liang and Bin Yang and Jindong Wang and Shirui Pan and Qingsong Wen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iroZNDxFJZ}\n}", "github": "", 
"project": "", "reviewers": "", "pdf_size": 1185335, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13973291020949636089&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "griffith.edu.au;hkust-gz.edu.cn;zju.edu.cn;hkust-gz.edu.cn;aau.dk;griffith.edu.au;microsoft.com;squirrelai.com;nlpr.ia.ac.cn", "author_num": 9, "aff_unique_index": "0;1;2;1;3;0;4;5;6", "aff_unique_norm": "Griffith University;Hong Kong University of Science and Technology;Zhejiang University;Aalborg University;Microsoft;Squirrel Ai Learning;Chinese Academy of Sciences", "aff_unique_dep": ";;;;Microsoft Research;;Institute of Automation", "aff_unique_url": "https://www.griffith.edu.au;https://www.ust.hk;https://www.zju.edu.cn;https://www.aau.dk;https://www.microsoft.com/en-us/research;https://www.squirrelai.com/;http://www.ia.cas.cn", "aff_unique_abbr": "Griffith;HKUST;ZJU;AAU;MSR;;CAS", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Guangzhou", "aff_country_unique_index": "0;1;1;1;2;0;3;1;1", "aff_country_unique": "Australia;China;Denmark;United States" }, { "title": "EvGGS: A Collaborative Learning Framework for Event-based Generalizable Gaussian Splatting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33335", "id": "isUSVgS7W1", "proceeding": "https://proceedings.mlr.press/v235/wang24w.html", "pdf": "https://openreview.net/pdf?id=isUSVgS7W1", "openreview": "https://openreview.net/forum?id=isUSVgS7W1", "author_site": "Jiaxu Wang, Junhao He, Ziyi Zhang, Mingyuan Sun, Jingkai SUN, Renjing Xu", "tldr": "", "abstract": "Event cameras offer promising advantages such as high dynamic range and low latency, making them well-suited for challenging lighting conditions and fast-moving scenarios. However, reconstructing 3D scenes from raw event streams is difficult because event data is sparse and does not carry absolute color information. To release its potential in 3D reconstruction, we propose the first event-based generalizable 3D reconstruction framework, which reconstructs scenes as 3D Gaussians from only event input in a feedforward manner and can generalize to unseen cases without any retraining. This framework includes a depth estimation module, an intensity reconstruction module, and a Gaussian regression module. These submodules connect in a cascading manner, and we collaboratively train them with a designed joint loss to make them mutually promote. To facilitate related studies, we build a novel event-based 3D dataset with various material objects and calibrated labels of greyscale images, depth maps, camera poses, and silhouettes. Experiments show models that have jointly trained significantly outperform those trained individually. 
Our approach performs better than all baselines in reconstruction quality and depth/intensity prediction, with satisfactory rendering speed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaxu Wang;Junhao He;Ziyi Zhang;Mingyuan Sun;Jingkai SUN;Renjing Xu", "authorids": "~Jiaxu_Wang1;~Junhao_He2;~Ziyi_Zhang6;~Mingyuan_Sun1;~Jingkai_SUN1;~Renjing_Xu1", "gender": ";M;F;M;M;", "homepage": ";;https://github.com/Zerory1;https://myuansun.github.io;https://github.com/Greatsjk;", "dblp": ";;;;;", "google_scholar": ";s2pC5LAAAAAJ;;https://scholar.google.com/citations?hl=en;;", "orcid": ";0009-0004-2215-1261;;;;", "linkedin": ";junhao-h-951b63190/;;;;", "or_profile": "~Jiaxu_Wang1;~Junhao_He2;~Ziyi_Zhang6;~Mingyuan_Sun1;~Jingkai_SUN1;~Renjing_Xu1", "aff": ";the Hong Kong University of Science and Technology (Guangzhou);Hong Kong University of Science and Technology (Guangzhou);Northeastern University;Hong Kong University of Science and Technology;", "aff_domain": ";hkust-gz.edu.cn;hkust-gz.edu.cn;neu.edu.cn;hkust.edu;", "position": ";Intern;PhD student;MS student;MS student;", "bibtex": "@inproceedings{\nwang2024evggs,\ntitle={Ev{GGS}: A Collaborative Learning Framework for Event-based Generalizable Gaussian Splatting},\nauthor={Jiaxu Wang and Junhao He and Ziyi Zhang and Mingyuan Sun and Jingkai SUN and Renjing Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=isUSVgS7W1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8175762, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1601854868259753290&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": ";hkust-gz.edu.cn;hkust-gz.edu.cn;neu.edu.cn;hkust.edu;", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.northeastern.edu", "aff_unique_abbr": "HKUST;NEU", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Guangzhou;Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Learning from Integral Losses in Physics Informed Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33334", "id": "itDhUBY2xf", "proceeding": "https://proceedings.mlr.press/v235/saleh24a.html", "pdf": "https://openreview.net/pdf?id=itDhUBY2xf", "openreview": "https://openreview.net/forum?id=itDhUBY2xf", "author_site": "Ehsan Saleh, Saba Ghaffari, Timothy Bretl, Luke Olson, Matthew West", "tldr": "", "abstract": "This work proposes a solution for the problem of training physics-informed networks under partial integro-differential equations. These equations require an infinite or a large number of neural evaluations to construct a single residual for training. As a result, accurate evaluation may be impractical, and we show that naive approximations at replacing these integrals with unbiased estimates lead to biased loss functions and solutions. To overcome this bias, we investigate three types of potential solutions: the deterministic sampling approaches, the double-sampling trick, and the delayed target method.
We consider three classes of PDEs for benchmarking: one defining Poisson problems with singular charges and weak solutions of up to 10 dimensions, another involving weak solutions on electromagnetic fields and a Maxwell equation, and a third one defining a Smoluchowski coagulation problem. Our numerical results confirm the existence of the aforementioned bias in practice and also show that our proposed delayed target approach can lead to accurate solutions with comparable quality to ones estimated with a large-sample-size integral. Our implementation is open-source and available at https://github.com/ehsansaleh/btspinn.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ehsan Saleh;Saba Ghaffari;Tim Bretl;Luke Olson;Matthew West", "authorids": "~Ehsan_Saleh1;~Saba_Ghaffari1;~Tim_Bretl1;~Luke_Olson1;~Matthew_West1", "gender": ";F;M;M;", "homepage": ";;http://bretl.csl.illinois.edu/;http://lukeo.cs.illinois.edu/;http://lagrange.mechse.illinois.edu", "dblp": ";;29/2834;06/8561.html;", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=ab_0lGcAAAAJ;o43oc6AAAAAJ;", "orcid": ";;;0000-0002-5283-6104;0000-0002-7605-0050", "linkedin": ";saba-ghaffari-171a3356/;;;", "or_profile": "~Ehsan_Saleh1;~Saba_Ghaffari1;~Tim_Bretl1;~Luke_Olson1;~Matthew_West1", "aff": ";University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": ";illinois.edu;illinois.edu;illinois.edu;illinois.edu", "position": ";PhD student;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nsaleh2024learning,\ntitle={Learning from Integral Losses in Physics Informed Neural Networks},\nauthor={Ehsan Saleh and Saba Ghaffari and Tim Bretl and Luke Olson and Matthew West},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=itDhUBY2xf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2409056, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8952720426583083926&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";illinois.edu;illinois.edu;illinois.edu;illinois.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "AI Alignment with Changing and Influenceable Reward Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33333", "id": "itYGbe0Cs1", "proceeding": "https://proceedings.mlr.press/v235/carroll24a.html", "pdf": "https://openreview.net/pdf?id=itYGbe0Cs1", "openreview": "https://openreview.net/forum?id=itYGbe0Cs1", "author_site": "Micah Carroll, Davis Foote, Anand Siththaranjan, Stuart Russell, Anca Dragan", "tldr": "", "abstract": "Existing AI alignment approaches assume that preferences are static, which is unrealistic: our preferences change, and may even be influenced by our interactions with AI systems themselves.
To clarify the consequences of incorrectly assuming static preferences, we introduce Dynamic Reward Markov Decision Processes (DR-MDPs), which explicitly model preference changes and the AI's influence on them. We show that despite its convenience, the static-preference assumption may undermine the soundness of existing alignment techniques, leading them to implicitly reward AI systems for influencing user preferences in ways users may not truly want. We then explore potential solutions. First, we offer a unifying perspective on how an agent's optimization horizon may partially help reduce undesirable AI influence. Then, we formalize different notions of AI alignment that account for preference change from the outset. Comparing the strengths and limitations of 8 such notions of alignment, we find that they all either err towards causing undesirable AI influence, or are overly risk-averse, suggesting that a straightforward solution to the problems of changing preferences may not exist. As there is no avoiding grappling with changing preferences in real-world settings, this makes it all the more important to handle these issues with care, balancing risks and capabilities. We hope our work can provide conceptual clarity and constitute a first step towards AI alignment practices which explicitly account for (and contend with) the changing and influenceable nature of human preferences.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Micah Carroll;Davis Foote;Anand Siththaranjan;Stuart Russell;Anca Dragan", "authorids": "~Micah_Carroll1;~Davis_Foote1;~Anand_Siththaranjan1;~Stuart_Russell1;~Anca_Dragan1", "gender": "M;;M;M;F", "homepage": "https://micahcarroll.github.io/;;;https://people.eecs.berkeley.edu/~russell/;http://www.ancadragan.com/", "dblp": "250/9080;190/7201;;;", "google_scholar": "MeNbzgIAAAAJ;;qYXPDjQAAAAJ;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;", "orcid": "0000-0002-0716-8071;;;;", "linkedin": "micah-carroll/;;;;", "or_profile": "~Micah_Carroll1;~Davis_Foote1;~Anand_Siththaranjan1;~Stuart_Russell1;~Anca_Dragan1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;Researcher;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\ncarroll2024ai,\ntitle={{AI} Alignment with Changing and Influenceable Reward Functions},\nauthor={Micah Carroll and Davis Foote and Anand Siththaranjan and Stuart Russell and Anca Dragan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=itYGbe0Cs1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2355930, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11480470323782886337&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 12, "email": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { 
"title": "Visual Transformer with Differentiable Channel Selection: An Information Bottleneck Inspired Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33332", "id": "iup9NElHji", "proceeding": "https://proceedings.mlr.press/v235/wang24ak.html", "pdf": "https://openreview.net/pdf?id=iup9NElHji", "openreview": "https://openreview.net/forum?id=iup9NElHji", "author_site": "Yancheng Wang, Ping Li, Yingzhen Yang", "tldr": "", "abstract": "Self-attention and transformers have been widely used in deep learning. Recent efforts have been devoted to incorporating transformer blocks into different types of neural architectures, including those with convolutions, leading to various visual transformers for computer vision tasks. In this paper, we propose a novel and compact transformer block, Transformer with Differentiable Channel Selection, or DCS-Transformer. DCS-Transformer features channel selection in the computation of the attention weights and the input/output features of the MLP in the transformer block. Our DCS-Transformer is compatible with many popular and compact transformer networks, such as MobileViT and EfficientViT, and it reduces the FLOPs of the visual transformers while maintaining or even improving the prediction accuracy. In the experiments, we replace all the transformer blocks in MobileViT and EfficientViT with DCS-Transformer blocks, leading to DCS-Transformer networks with different backbones. The DCS-Transformer is motivated by reduction of Information Bottleneck, and a novel variational upper bound for the IB loss which can be optimized by SGD is derived and incorporated into the training loss of the network with DCS-Transformer. Extensive results on image classification and object detection evidence that DCS-Transformer renders compact and efficient visual transformers with comparable or much better prediction accuracy than the original visual transformers. 
The code of DCS-Transformer is available at https://github.com/Statistical-Deep-Learning/DCS-Transformer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yancheng Wang;Ping Li;Yingzhen Yang", "authorids": "~Yancheng_Wang2;~Ping_Li3;~Yingzhen_Yang1", "gender": "M;M;M", "homepage": ";http://www.stat.rutgers.edu/home/pingli/;http://yingzhenyang.com", "dblp": ";62/5860-1;66/3838.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;", "orcid": ";;", "linkedin": ";;yingzhen-yang-9b869122", "or_profile": "~Yancheng_Wang2;~Ping_Li3;~Yingzhen_Yang1", "aff": "Arizona State University;LinkedIn;Arizona State University", "aff_domain": "asu.edu;linkedin.com;asu.edu", "position": "PhD student;Engineer;Assistant Professor", "bibtex": "@inproceedings{\nwang2024visual,\ntitle={Visual Transformer with Differentiable Channel Selection: An Information Bottleneck Inspired Approach},\nauthor={Yancheng Wang and Ping Li and Yingzhen Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iup9NElHji}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7222135, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:r-UPoEdjLugJ:scholar.google.com/&scioq=Visual+Transformer+with+Differentiable+Channel+Selection:+An+Information+Bottleneck+Inspired+Approach&hl=en&as_sdt=0,6", "gs_version_total": 6, "email": "asu.edu;linkedin.com;asu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Arizona State University;LinkedIn Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.linkedin.com", "aff_unique_abbr": "ASU;LinkedIn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Differentiability and Optimization of Multiparameter Persistent Homology", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33331", "id": "ixdfvnO0uy", "proceeding": "https://proceedings.mlr.press/v235/scoccola24a.html", "pdf": "https://openreview.net/pdf?id=ixdfvnO0uy", "openreview": "https://openreview.net/forum?id=ixdfvnO0uy", "author_site": "Luis Scoccola, Siddharth Setlur, David Loiseaux, Mathieu Carri\u00e8re, Steve Oudot", "tldr": "", "abstract": "Real-valued functions on geometric data---such as node attributes on a graph---can be optimized using descriptors from persistent homology, allowing the user to incorporate topological terms in the loss function. When optimizing a single real-valued function (the one-parameter setting), there is a canonical choice of descriptor for persistent homology: the barcode. The operation mapping a real-valued function to its barcode is differentiable almost everywhere, and the convergence of gradient descent for losses using barcodes is relatively well understood. When optimizing a vector-valued function (the multiparameter setting), there is no unique choice of descriptor for multiparameter persistent homology, and many distinct descriptors have been proposed. This calls for the development of a general framework for differentiability and optimization that applies to a wide range of multiparameter homological descriptors. In this article, we develop such a framework and show that it encompasses well-known descriptors of different flavors, such as signed barcodes and the multiparameter persistence landscape. 
We complement the theory with numerical experiments supporting the idea that optimizing multiparameter homological descriptors can lead to improved performance compared to optimizing one-parameter descriptors, even when using the simplest and most efficiently computable multiparameter descriptors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luis Scoccola;Siddharth Setlur;David Loiseaux;Mathieu Carri\u00e8re;Steve Oudot", "authorids": "~Luis_Scoccola1;~Siddharth_Setlur1;~David_Loiseaux1;~Mathieu_Carri\u00e8re1;~Steve_Oudot1", "gender": ";M;M;;M", "homepage": ";https://siddharthsetlur.github.io;https://davidlapous.github.io/;https://mathieucarriere.github.io/website/;https://geometrica.saclay.inria.fr/team/Steve.Oudot/", "dblp": ";;322/2006;167/1015;28/6883", "google_scholar": ";xaSqxksAAAAJ;oAjKKKcAAAAJ;;", "orcid": ";;0009-0003-5559-3712;;", "linkedin": ";;david-loiseaux/;;", "or_profile": "~Luis_Scoccola1;~Siddharth_Setlur1;~David_Loiseaux1;~Mathieu_Carri\u00e8re1;~Steve_Oudot1", "aff": ";ETHZ - ETH Zurich;INRIA;INRIA;\u00c9cole Polytechnique", "aff_domain": ";ethz.ch;inria.fr;inria.fr;polytechnique.fr", "position": ";MS student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nscoccola2024differentiability,\ntitle={Differentiability and Optimization of Multiparameter Persistent Homology},\nauthor={Luis Scoccola and Siddharth Setlur and David Loiseaux and Mathieu Carri{\\`e}re and Steve Oudot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ixdfvnO0uy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1298726, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11937984997888352276&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "email": ";ethz.ch;inria.fr;inria.fr;polytechnique.fr", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "ETH Zurich;INRIA;Ecole Polytechnique", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.inria.fr;https://www.polytechnique.edu", "aff_unique_abbr": "ETHZ;INRIA;X", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Switzerland;France" }, { "title": "Distributional Bellman Operators over Mean Embeddings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33330", "id": "j2pLfsBm4J", "proceeding": "https://proceedings.mlr.press/v235/wenliang24a.html", "pdf": "https://openreview.net/pdf?id=j2pLfsBm4J", "openreview": "https://openreview.net/forum?id=j2pLfsBm4J", "author_site": "Li Kevin Wenliang, Gregoire Deletang, Matthew Aitchison, Marcus Hutter, Anian Ruoss, Arthur Gretton, Mark Rowland", "tldr": "", "abstract": "We propose a novel algorithmic framework for distributional reinforcement learning, based on learning finite-dimensional mean embeddings of return distributions. The framework reveals a wide variety of new dynamic programming and temporal-difference algorithms that rely on the sketch Bellman operator, which updates mean embeddings with simple linear-algebraic computations. We provide asymptotic convergence theory, and examine the empirical performance of the algorithms on a suite of tabular tasks. 
Further, we show that this approach can be straightforwardly combined with deep reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Li Kevin Wenliang;Gregoire Deletang;Matthew Aitchison;Marcus Hutter;Anian Ruoss;Arthur Gretton;Mark Rowland", "authorids": "~Li_Kevin_Wenliang1;~Gregoire_Deletang1;~Matthew_Aitchison1;~Marcus_Hutter1;~Anian_Ruoss1;~Arthur_Gretton1;~Mark_Rowland1", "gender": ";M;;M;M;M;M", "homepage": "https://kevin-w-li.github.io/;;http://www.hutter1.net/;;http://www.gatsby.ucl.ac.uk/~gretton/;http://sites.google.com/view/markrowland;http://gdeletang.com/", "dblp": "255/7009;;h/MarcusHutter;259/2083;56/2574;86/4090;277/0588", "google_scholar": "https://scholar.google.co.uk/citations?user=MW45NMEAAAAJ;81URpqMAAAAJ;https://scholar.google.com.tw/citations?user=7hmCntEAAAAJ;gFkwD3kAAAAJ;OUv7J6QAAAAJ;https://scholar.google.co.uk/citations?user=-0U84zMAAAAJ;OgVNoSkAAAAJ", "orcid": ";;0000-0002-3263-4097;;;;", "linkedin": ";;hutter1/;anian-ruoss;;;gr%C3%A9goire-del%C3%A9tang-4a1900128/", "or_profile": "~Li_Kevin_Wenliang1;~Matthew_Aitchison1;~Marcus_Hutter1;~Anian_Ruoss1;~Arthur_Gretton1;~Mark_Rowland1;~Gregoire_Detetang1", "aff": "Google DeepMind;Australian National University;Australian National University;Google DeepMind;University College London;Google DeepMind;", "aff_domain": "deepmind.com;anu.edu.au;anu.edu.au;deepmind.com;ucl.ac.uk;google.com;", "position": "Researcher;PhD student;Full Professor;Researcher;Professor;Research Scientist;", "bibtex": "@inproceedings{\nwenliang2024distributional,\ntitle={Distributional Bellman Operators over Mean Embeddings},\nauthor={Li Kevin Wenliang and Gregoire Deletang and Matthew Aitchison and Marcus Hutter and Anian Ruoss and Arthur Gretton and Mark Rowland},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j2pLfsBm4J}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1170120, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10050873653996038853&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "deepmind.com;anu.edu.au;anu.edu.au;deepmind.com;ucl.ac.uk;google.com;", "author_num": 7, "aff_unique_index": "0;1;1;0;2;0", "aff_unique_norm": "Google;Australian National University;University College London", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.anu.edu.au;https://www.ucl.ac.uk", "aff_unique_abbr": "DeepMind;ANU;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0", "aff_country_unique": "United Kingdom;Australia" }, { "title": "On Multi-Armed Bandit with Impatient Arms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33329", "id": "j35VcooKG8", "proceeding": "https://proceedings.mlr.press/v235/shao24b.html", "pdf": "https://openreview.net/pdf?id=j35VcooKG8", "openreview": "https://openreview.net/forum?id=j35VcooKG8", "author_site": "Yuming Shao, Zhixuan Fang", "tldr": "", "abstract": "In this paper, we investigate a Multi-Armed Bandit (MAB) setting where an arm exits the game if the algorithm continuously neglects it. This setup is motivated by real-world scenarios, such as online advertising and crowdsourcing, where arms only gain benefits after being pulled by the algorithm. 
We identify the intrinsic hardness of this problem and limitations in existing approaches. We propose the FC-SE algorithm, with expected regret upper bounds, as our solution to this problem. As an extension, we further allow new arms to enter after the game starts and design the FC-Entry algorithm with performance guarantees for this setup. Finally, we conduct experiments to validate our theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuming Shao;Zhixuan Fang", "authorids": "~Yuming_Shao1;~Zhixuan_Fang1", "gender": "M;M", "homepage": "https://www.researchgate.net/profile/Yuming-Shao-5;https://people.iiis.tsinghua.edu.cn/~fang/", "dblp": ";179/2243", "google_scholar": ";0N4s3CAAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Yuming_Shao1;~Zhixuan_Fang1", "aff": "Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nshao2024on,\ntitle={On Multi-Armed Bandit with Impatient Arms},\nauthor={Yuming Shao and Zhixuan Fang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j35VcooKG8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 915220, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1139468931241200848&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "MILP-FBGen: LP/MILP Instance Generation with Feasibility/Boundedness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33328", "id": "j4HtfTqr0f", "proceeding": "https://proceedings.mlr.press/v235/zhang24p.html", "pdf": "https://openreview.net/pdf?id=j4HtfTqr0f", "openreview": "https://openreview.net/forum?id=j4HtfTqr0f", "author_site": "Yahong Zhang, Chenchen Fan, Donghui Chen, Congrui Li, Wenli Ouyang, Mingda Zhu, Junchi Yan", "tldr": "", "abstract": "Machine learning (ML) has been actively adopted in Linear Programming (LP) and Mixed-Integer Linear Programming (MILP), whose potential is hindered by instance scarcity. Current synthetic instance generation methods often fall short in closely mirroring the distribution of original datasets or ensuring the feasibility and boundedness of the generated data \u2014 a critical requirement for obtaining reliable supervised labels in model training. In this paper, we present a diffusion-based LP/MILP instance generative framework called MILP-FBGen. It strikes a balance between structural similarity and novelty while maintaining feasibility/boundedness via a meticulously designed structure-preserving generation module and a feasibility/boundedness-constrained sampling module. Our method shows superiority on two fronts: 1) preservation of key properties (hardness, feasibility, and boundedness) of LP/MILP instances, and 2) enhanced performance on downstream tasks. 
Extensive studies show a two-fold superiority: our method ensures higher distributional similarity and 100% feasibility on both easy and hard datasets, surpassing current state-of-the-art techniques.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yahong Zhang;Chenchen Fan;Donghui Chen;Congrui Li;Wenli Ouyang;Mingda Zhu;Junchi Yan", "authorids": "~Yahong_Zhang2;~Chenchen_Fan1;~Donghui_Chen2;~Congrui_Li1;~Wenli_Ouyang1;~Mingda_Zhu1;~Junchi_Yan2", "gender": ";;;;;M;", "homepage": ";https://www.scholat.com/cfan;;;;https://www.linkedin.com/in/%E6%98%8E%E8%BE%BE-%E6%9C%B1-4b0043105;", "dblp": ";;;;243/3171;;", "google_scholar": ";;;;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;0009-0001-5138-0336;;;", "linkedin": ";;;;wenli-ouyang-07891641;;", "or_profile": "~Yahong_Zhang2;~Chenchen_Fan1;~Donghui_Chen2;~Congrui_Li1;~Wenli_Ouyang1;~Mingda_Zhu1;~Junchi_Yan2", "aff": ";;;Lenovo Group Limited;Lenovo Research ;Lenovo;", "aff_domain": ";;;lenovo.com;lenovo.com;lenovo.com;", "position": ";;;Researcher;Researcher;Researcher;", "bibtex": "@inproceedings{\nzhang2024milpfbgen,\ntitle={{MILP}-{FBG}en: {LP}/{MILP} Instance Generation with Feasibility/Boundedness},\nauthor={Yahong Zhang and Chenchen Fan and Donghui Chen and Congrui Li and Wenli Ouyang and Mingda Zhu and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j4HtfTqr0f}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 632438, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=759215721394494192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";;;lenovo.com;lenovo.com;lenovo.com;", "author_num": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "Lenovo Group Limited;Lenovo", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.lenovo.com;https://www.lenovo.com", "aff_unique_abbr": "Lenovo;Lenovo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "FADAS: Towards Federated Adaptive Asynchronous Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33327", "id": "j56JAd29uH", "proceeding": "https://proceedings.mlr.press/v235/wang24bv.html", "pdf": "https://openreview.net/pdf?id=j56JAd29uH", "openreview": "https://openreview.net/forum?id=j56JAd29uH", "author_site": "Yujia Wang, Shiqiang Wang, Songtao Lu, Jinghui Chen", "tldr": "", "abstract": "Federated learning (FL) has emerged as a widely adopted training paradigm for privacy-preserving machine learning. While the SGD-based FL algorithms have demonstrated considerable success in the past, there is a growing trend towards adopting adaptive federated optimization methods, particularly for the training of large-scale models. However, the conventional synchronous aggregation design poses a significant challenge to the practical deployment of those adaptive federated optimization methods, particularly in the presence of straggler clients. To fill this research gap, this paper introduces federated adaptive asynchronous optimization, named FADAS, a novel method that incorporates asynchronous updates into adaptive federated optimization with provable guarantees. 
To further enhance the efficiency and resilience of our proposed method in scenarios with significant asynchronous delays, we also extend FADAS with a delay-adaptive learning adjustment strategy. We rigorously establish the convergence rate of the proposed algorithms and empirical results demonstrate the superior performance of FADAS over other asynchronous FL baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujia Wang;Shiqiang Wang;Songtao Lu;Jinghui Chen", "authorids": "~Yujia_Wang3;~Shiqiang_Wang1;~Songtao_Lu1;~Jinghui_Chen1", "gender": ";M;M;M", "homepage": "https://yujiaw98.github.io/;https://shiqiang.wang;https://songtaogithub.github.io/;https://jinghuichen.github.io/", "dblp": ";87/5094-1;05/2887;67/5633", "google_scholar": "0DwROiMAAAAJ;kA_vmOcAAAAJ;LRsjX7kAAAAJ;mKia7Y4AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yujia_Wang3;~Shiqiang_Wang1;~Songtao_Lu1;~Jinghui_Chen1", "aff": "Pennsylvania State University;IBM, International Business Machines;IBM Thomas J. Watson Research Center;Pennsylvania State University", "aff_domain": "psu.edu;us.ibm.com;ibm.com;psu.edu", "position": "PhD student;Research Staff Member;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nwang2024fadas,\ntitle={{FADAS}: Towards Federated Adaptive Asynchronous Optimization},\nauthor={Yujia Wang and Shiqiang Wang and Songtao Lu and Jinghui Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j56JAd29uH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 747618, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10969652620364327073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "psu.edu;us.ibm.com;ibm.com;psu.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Pennsylvania State University;International Business Machines;IBM", "aff_unique_dep": ";;Research", "aff_unique_url": "https://www.psu.edu;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "PSU;IBM;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Fundamental Limitations of LLM Censorship Necessitate New Approaches", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33326", "id": "j5csKrtyAe", "proceeding": "https://proceedings.mlr.press/v235/glukhov24a.html", "pdf": "https://openreview.net/pdf?id=j5csKrtyAe", "openreview": "https://openreview.net/forum?id=j5csKrtyAe", "author_site": "David Glukhov, Ilia Shumailov, Yarin Gal, Nicolas Papernot, Vardan Papyan", "tldr": "", "abstract": "Large language models (LLMs) have exhibited impressive capabilities in comprehending complex instructions. However, their blind adherence to provided instructions has led to concerns regarding risks of malicious use. Existing defence mechanisms, such as model fine-tuning or output censorship methods have proven to be fallible at ensuring that LLMs do not return semantically impermissible responses. We present fundamental limitations of verifying the semantic properties of LLM outputs and identifying compositional threats, illustrating inherent challenges of current approaches to censoring LLM outputs. 
Specifically, we demonstrate that semantic censorship can be perceived as an undecidable problem, and that semantic properties of LLM outputs can become impossible to verify when the LLM is capable of providing \"encrypted\" outputs. We further show that the challenges of censorship can extend beyond just semantic censorship, as attackers can reconstruct impermissible outputs from a collection of permissible ones. Consequently, we call for a re-evaluation of the problem of censorship and its goals, stressing the need for new definitions and approaches to censorship. In addition, we provide an initial attempt toward achieving this goal through syntactic censorship, drawing from a security perspective to design censorship methods that can provide guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Glukhov;Ilia Shumailov;Yarin Gal;Nicolas Papernot;Vardan Papyan", "authorids": "~David_Glukhov1;~Ilia_Shumailov1;~Yarin_Gal1;~Nicolas_Papernot1;~Vardan_Papyan1", "gender": "M;;M;M;Unspecified", "homepage": "http://www.cs.toronto.edu/~dglukhov/;http://www.cs.ox.ac.uk/people/yarin.gal/website//;https://www.papernot.fr;https://sites.google.com/view/vardan-papyan;https://www.cl.cam.ac.uk/~is410/", "dblp": ";67/9076;162/1405;173/9783;213/8587", "google_scholar": ";https://scholar.google.co.uk/citations?user=SIayDoQAAAAJ;cGxq0cMAAAAJ;https://scholar.google.co.il/citations?user=VrE-Gd4AAAAJ;https://scholar.google.co.uk/citations?hl=en", "orcid": ";;;;", "linkedin": ";;nicolaspapernot;;ilia-shumailov/", "or_profile": "~David_Glukhov1;~Yarin_Gal1;~Nicolas_Papernot1;~Vardan_Papyan1;~I_Shumailov1", "aff": "University of Toronto;University of Oxford;Google;University of Toronto;Google DeepMind", "aff_domain": "utoronto.ca;ox.ac.uk;google.com;toronto.edu;google.com", "position": "MS student;Associate Professor;Research Scientist;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nglukhov2024position,\ntitle={Position: Fundamental Limitations of {LLM} Censorship Necessitate New Approaches},\nauthor={David Glukhov and Ilia Shumailov and Yarin Gal and Nicolas Papernot and Vardan Papyan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j5csKrtyAe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 614372, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15697213918679775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "utoronto.ca;ox.ac.uk;google.com;toronto.edu;google.com", "author_num": 5, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "University of Toronto;University of Oxford;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.utoronto.ca;https://www.ox.ac.uk;https://www.google.com", "aff_unique_abbr": "U of T;Oxford;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "Canada;United Kingdom;United States" }, { "title": "Uniform Memory Retrieval with Larger Capacity for Modern Hopfield Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33325", "id": "j5wf1NNhFs", "proceeding": "https://proceedings.mlr.press/v235/wu24i.html", "pdf": "https://openreview.net/pdf?id=j5wf1NNhFs", "openreview": "https://openreview.net/forum?id=j5wf1NNhFs", "author_site": "Dennis Wu, Jerry Yao-Chieh Hu, Teng-Yun Hsiao, Han Liu", "tldr": 
"", "abstract": "We propose a two-stage optimization formulation for the memory retrieval dynamics of modern Hopfield models, termed $\\mathtt{U\\text{-}Hop}$. Our key contribution is a learnable feature map $\\Phi$ which transforms the Hopfield energy function into a kernel space. This transformation ensures convergence between the local minima of energy and the fixed points of retrieval dynamics within the kernel space. Consequently, the kernel norm induced by $\\Phi$ serves as a novel similarity measure. It utilizes the stored memory patterns as learning data to enhance memory capacity across all modern Hopfield models. Specifically, we accomplish this by constructing a separation loss $\\mathcal{L}_\\Phi$ that separates the local minima of kernelized energy by separating stored memory patterns in kernel space. Methodologically, $\\mathtt{U\\text{-}Hop}$ memory retrieval process consists of: **(Stage I:)** minimizing separation loss for a more uniformed memory (local minimum) distribution, followed by **(Stage II:)** standard Hopfield energy minimization for memory retrieval. This results in significant reduction of possible meta-stable states in the Hopfield energy function, thus preventing memory confusion. Empirically, with real-world datasets, we demonstrate that $\\mathtt{U\\text{-}Hop}$ outperforms all existing modern Hopfield models and SOTA similarity measures, achieving a substantial margin in both associative memory retrieval and deep learning tasks. Code is available at [GitHub](https://github.com/MAGICS-LAB/UHop); future updates are on [arXiv](https://arxiv.org/abs/2404.03827).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dennis Wu;Jerry Yao-Chieh Hu;Teng-Yun Hsiao;Han Liu", "authorids": "~Dennis_Wu1;~Jerry_Yao-Chieh_Hu1;~Teng-Yun_Hsiao1;~Han_Liu4", "gender": ";;M;", "homepage": ";;https://www.facebook.com/profile.php?id=100010781706668&mibextid=ZbWKwL;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Dennis_Wu1;~Jerry_Yao-Chieh_Hu1;~Teng-Yun_Hsiao1;~Han_Liu4", "aff": ";;National Taiwan University;Northwestern University", "aff_domain": ";;ntu.edu.tw;u.northwestern.edu", "position": ";;Undergrad student;Associate Professor", "bibtex": "@inproceedings{\nwu2024uniform,\ntitle={Uniform Memory Retrieval with Larger Capacity for Modern Hopfield Models},\nauthor={Dennis Wu and Jerry Yao-Chieh Hu and Teng-Yun Hsiao and Han Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j5wf1NNhFs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9802070, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8352411172297377175&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;ntu.edu.tw;u.northwestern.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "National Taiwan University;Northwestern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.tw;https://www.northwestern.edu", "aff_unique_abbr": "NTU;NU", "aff_campus_unique_index": "0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Hybrid Neural Representations for Spherical Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33324", "id": "j6QZy90B93", "proceeding": 
"https://proceedings.mlr.press/v235/kim24i.html", "pdf": "https://openreview.net/pdf?id=j6QZy90B93", "openreview": "https://openreview.net/forum?id=j6QZy90B93", "author_site": "Hyomin Kim, Yunhui Jang, Jaeho Lee, Sungsoo Ahn", "tldr": "", "abstract": "In this paper, we study hybrid neural representations for spherical data, a domain of increasing relevance in scientific research. In particular, our work focuses on weather and climate data as well as cosmic microwave background (CMB) data. Although previous studies have delved into coordinate-based neural representations for spherical signals, they often fail to capture the intricate details of highly nonlinear signals. To address this limitation, we introduce a novel approach named Hybrid Neural Representations for Spherical data (HNeR-S). Our main idea is to use spherical feature-grids to obtain positional features which are combined with a multi-layer perceptron to predict the target signal. We consider feature-grids with equirectangular and hierarchical equal area isolatitude pixelization structures that align with weather data and CMB data, respectively. We extensively verify the effectiveness of our HNeR-S for regression, super-resolution, temporal interpolation, and compression tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyomin Kim;Yunhui Jang;Jaeho Lee;Sungsoo Ahn", "authorids": "~Hyomin_Kim4;~Yunhui_Jang1;~Jaeho_Lee3;~Sungsoo_Ahn1", "gender": "Not Specified;F;M;M", "homepage": "http://icecream126.github.io/;https://yunhuijang.github.io;https://jaeho-lee.github.io;https://sungsooahn.super.site/", "dblp": ";;78/6080-1;90/5164", "google_scholar": ";https://scholar.google.co.kr/citations?user=mYHCTYQAAAAJ;t91zoQMAAAAJ;XTenHs0AAAAJ", "orcid": ";;;", "linkedin": "hyomin-kim-27a004179/;;;", "or_profile": "~Hyomin_Kim4;~Yunhui_Jang1;~Jaeho_Lee3;~Sungsoo_Ahn1", "aff": "POSTECH;Pohang University of Science and Technology;Pohang University of Science and Technology;Pohang University of Science and Technology", "aff_domain": "postech.ac.kr;postech.edu;postech.ac.kr;postech.ac.kr", "position": "MS student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nkim2024hybrid,\ntitle={Hybrid Neural Representations for Spherical Data},\nauthor={Hyomin Kim and Yunhui Jang and Jaeho Lee and Sungsoo Ahn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j6QZy90B93}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7493371, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1215068217247226864&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "postech.ac.kr;postech.edu;postech.ac.kr;postech.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Pohang University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.postech.ac.kr", "aff_unique_abbr": "POSTECH", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Discovering Multiple Solutions from a Single Task in Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33323", "id": "j6rG1ETRyu", "proceeding": "https://proceedings.mlr.press/v235/osa24a.html", "pdf": "https://openreview.net/pdf?id=j6rG1ETRyu", "openreview": 
"https://openreview.net/forum?id=j6rG1ETRyu", "author_site": "Takayuki Osa, Tatsuya Harada", "tldr": "", "abstract": "Recent studies on online reinforcement learning (RL) have demonstrated the advantages of learning multiple behaviors from a single task, as in the case of few-shot adaptation to a new environment. Although this approach is expected to yield similar benefits in offline RL, appropriate methods for learning multiple solutions have not been fully investigated in previous studies. In this study, we therefore addressed the problem of finding multiple solutions from a single task in offline RL. We propose algorithms that can learn multiple solutions in offline RL, and empirically investigate their performance. Our experimental results show that the proposed algorithm learns multiple qualitatively and quantitatively distinctive solutions in offline RL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Takayuki Osa;Tatsuya Harada", "authorids": "~Takayuki_Osa1;~Tatsuya_Harada1", "gender": "M;M", "homepage": ";https://www.mi.t.u-tokyo.ac.jp/harada/", "dblp": "27/1571;14/5849", "google_scholar": "https://scholar.google.co.jp/citations?user=LqVev6MAAAAJ;https://scholar.google.com/citations?hl=ja", "orcid": ";", "linkedin": ";", "or_profile": "~Takayuki_Osa1;~Tatsuya_Harada1", "aff": "The University of Tokyo;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp", "position": "Associate Professor;Full Professor", "bibtex": "@inproceedings{\nosa2024discovering,\ntitle={Discovering Multiple Solutions from a Single Task in Offline Reinforcement Learning},\nauthor={Takayuki Osa and Tatsuya Harada},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=j6rG1ETRyu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1814772, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13273951799224738017&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "u-tokyo.ac.jp;u-tokyo.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Tight Partial Identification of Causal Effects with Marginal Distribution of Unmeasured Confounders", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33322", "id": "jEWpcEyuUl", "proceeding": "https://proceedings.mlr.press/v235/zhang24a.html", "pdf": "https://openreview.net/pdf?id=jEWpcEyuUl", "openreview": "https://openreview.net/forum?id=jEWpcEyuUl", "tldr": "", "abstract": "Partial identification (PI) presents a significant challenge in causal inference due to the incomplete measurement of confounders. Given that obtaining auxiliary variables of confounders is not always feasible and relies on untestable assumptions, researchers are encouraged to explore the internal information of latent confounders without external assistance. However, these prevailing PI results often lack precise mathematical measurement from observational data or assume that the information pertaining to confounders falls within extreme scenarios. In our paper, we reassess the significance of the marginal confounder distribution in PI. 
We refrain from imposing additional restrictions on the marginal confounder distribution, such as entropy or mutual information. Instead, we establish the closed-form tight PI for any possible P(U) in the discrete case. Furthermore, we establish the if and only if criterion for discerning whether the marginal confounder information leads to non-vanilla PI regions. This reveals a fundamental negative result wherein the marginal confounder information minimally contributes to PI as the confounder\u2019s cardinality increases. Our theoretical findings are supported by experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiheng Zhang", "authorids": "~Zhiheng_Zhang1", "gender": "M", "homepage": "https://scholar.google.com/citations?hl=en&user=YVXrszoAAAAJ", "dblp": "172/9230.html", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-6767-7487", "linkedin": "", "or_profile": "~Zhiheng_Zhang1", "aff": "Tsinghua University, Institute for Interdisciplinary Information Sciences (IIIS)", "aff_domain": "tsinghua.edu.cn", "position": "PhD student", "bibtex": "@inproceedings{\nzhang2024tight,\ntitle={Tight Partial Identification of Causal Effects with Marginal Distribution of Unmeasured Confounders},\nauthor={Zhiheng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jEWpcEyuUl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8955911, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZnyWtN8hjOkJ:scholar.google.com/&scioq=Tight+Partial+Identification+of+Causal+Effects+with+Marginal+Distribution+of+Unmeasured+Confounders&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "tsinghua.edu.cn", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences (IIIS)", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "Tsinghua", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Cross-view Masked Diffusion Transformers for Person Image Synthesis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33321", "id": "jEoIkNkqyc", "proceeding": "https://proceedings.mlr.press/v235/pham24b.html", "pdf": "https://openreview.net/pdf?id=jEoIkNkqyc", "openreview": "https://openreview.net/forum?id=jEoIkNkqyc", "author_site": "Trung Pham, Kang Zhang, Chang Yoo", "tldr": "", "abstract": "We present X-MDPT ($\\underline{Cross}$-view $\\underline{M}$asked $\\underline{D}$iffusion $\\underline{P}$rediction $\\underline{T}$ransformers), a novel diffusion model designed for pose-guided human image generation. X-MDPT distinguishes itself by employing masked diffusion transformers that operate on latent patches, a departure from the commonly-used Unet structures in existing works. The model comprises three key modules: 1) a denoising diffusion Transformer, 2) an aggregation network that consolidates conditions into a single vector for the diffusion process, and 3) a mask cross-prediction module that enhances representation learning with semantic information from the reference image. X-MDPT demonstrates scalability, improving FID, SSIM, and LPIPS with larger models. 
Despite its simple design, our model outperforms state-of-the-art approaches on the DeepFashion dataset while exhibiting efficiency in terms of training parameters, training time, and inference speed. Our compact 33MB model achieves an FID of 7.42, surpassing a prior Unet latent diffusion approach (FID 8.07) using only $11\\times$ fewer parameters. Our best model surpasses the pixel-based diffusion with $\\frac{2}{3}$ of the parameters and achieves $5.43 \\times$ faster inference. The code is available at https://github.com/trungpx/xmdpt.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Trung X. Pham;Kang Zhang;Chang D. Yoo", "authorids": "~Trung_X._Pham1;~Kang_Zhang6;~Chang_D._Yoo1", "gender": "M;M;M", "homepage": "https://trungpx.github.io/;;https://sanctusfactory.com/family.php", "dblp": "228/6857;29/177-8;31/7819", "google_scholar": "4DkPIIAAAAAJ;nj19btQAAAAJ;gFWgUQEAAAAJ", "orcid": "0000-0003-4177-7054;0000-0003-2761-9383;0000-0002-0756-7179", "linkedin": ";;", "or_profile": "~Trung_X._Pham1;~Kang_Zhang6;~Chang_D._Yoo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\npham2024crossview,\ntitle={Cross-view Masked Diffusion Transformers for Person Image Synthesis},\nauthor={Trung X. Pham and Kang Zhang and Chang D. Yoo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jEoIkNkqyc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8772581, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14518275561702508505&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Executable Code Actions Elicit Better LLM Agents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33320", "id": "jJ9BoXAfFa", "proceeding": "https://proceedings.mlr.press/v235/wang24h.html", "pdf": "https://openreview.net/pdf?id=jJ9BoXAfFa", "openreview": "https://openreview.net/forum?id=jJ9BoXAfFa", "author_site": "Xingyao Wang, Yangyi Chen, Lifan Yuan, Yizhe Zhang, Yunzhu Li, Hao Peng, Heng Ji", "tldr": "", "abstract": "Large Language Model (LLM) agents, capable of performing a broad range of actions, such as invoking tools and controlling robots, show great potential in tackling real-world challenges. LLM agents are typically prompted to produce actions by generating JSON or text in a pre-defined format, which is usually limited by constrained action space (e.g., the scope of pre-defined tools) and restricted flexibility (e.g., inability to compose multiple tools). This work proposes to use executable Python **code** to consolidate LLM agents' **act**ions into a unified action space (**CodeAct**). 
Integrated with a Python interpreter, CodeAct can execute code actions and dynamically revise prior actions or emit new actions upon new observations through multi-turn interactions. Our extensive analysis of 17 LLMs on API-Bank and a newly curated benchmark shows that CodeAct outperforms widely used alternatives (up to 20% higher success rate). The encouraging performance of CodeAct motivates us to build an open-source LLM agent that interacts with environments by executing interpretable code and collaborates with users using natural language. To this end, we collect an instruction-tuning dataset CodeActInstruct that consists of 7k multi-turn interactions using CodeAct. We show that it can be used with existing data to improve models in agent-oriented tasks without compromising their general capability. CodeActAgent, finetuned from Llama2 and Mistral, is integrated with a Python interpreter and uniquely tailored to perform sophisticated tasks (e.g., model training) using existing libraries and to autonomously self-debug.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingyao Wang;Yangyi Chen;Lifan Yuan;Yizhe Zhang;Yunzhu Li;Hao Peng;Heng Ji", "authorids": "~Xingyao_Wang1;~Yangyi_Chen1;~Lifan_Yuan1;~Yizhe_Zhang2;~Yunzhu_Li1;~Hao_Peng4;~Heng_Ji3", "gender": "M;M;;M;M;;F", "homepage": "https://xwang.dev;https://yangyi-chen.github.io/;;https://dreasysnail.github.io;https://yunzhuli.github.io/;;http://blender.cs.illinois.edu/hengji.html", "dblp": "264/9892;05/10083;;132/4966-2.html;182/1831;;", "google_scholar": "F7qq3YcAAAAJ;https://scholar.google.com/citations?hl=en;;WDVMfggAAAAJ;WlA92lcAAAAJ;;z7GCqT4AAAAJ", "orcid": "0000-0002-3483-8624;;;;;;", "linkedin": ";yangyi-chen-4006a11b2/;;;;;", "or_profile": "~Xingyao_Wang1;~Yangyi_Chen1;~Lifan_Yuan1;~Yizhe_Zhang2;~Yunzhu_Li1;~Hao_Peng4;~Heng_Ji3", "aff": "University of Illinois Urbana-Champaign;Department of Computer Science, University of Illinois at Urbana-Champaign;;Apple;University of Illinois Urbana-Champaign;;University of Illinois, Urbana-Champaign", "aff_domain": "cs.illinois.edu;cs.illinois.edu;;apple.com;illinois.edu;;uiuc.edu", "position": "PhD student;PhD student;;Researcher;Assistant Professor;;Full Professor", "bibtex": "@inproceedings{\nwang2024executable,\ntitle={Executable Code Actions Elicit Better {LLM} Agents},\nauthor={Xingyao Wang and Yangyi Chen and Lifan Yuan and Yizhe Zhang and Yunzhu Li and Hao Peng and Heng Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jJ9BoXAfFa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4071731, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14417798504674200556&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": "cs.illinois.edu;cs.illinois.edu;;apple.com;illinois.edu;;uiuc.edu", "author_num": 7, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;Apple;University of Illinois", "aff_unique_dep": ";Apple Inc.;", "aff_unique_url": "https://illinois.edu;https://www.apple.com;https://illinois.edu", "aff_unique_abbr": "UIUC;Apple;UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Non-clairvoyant Scheduling with Partial Predictions", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33319", "id": "jJLcXGB2uA", "proceeding": "https://proceedings.mlr.press/v235/benomar24a.html", "pdf": "https://openreview.net/pdf?id=jJLcXGB2uA", "openreview": "https://openreview.net/forum?id=jJLcXGB2uA", "author_site": "Ziyad Benomar, Vianney Perchet", "tldr": "", "abstract": "The non-clairvoyant scheduling problem has gained new interest within learning-augmented algorithms, where the decision-maker is equipped with predictions without any quality guarantees. In practical settings, access to predictions may be reduced to specific instances, due to cost or data limitations. Our investigation focuses on scenarios where predictions for only $B$ job sizes out of $n$ are available to the algorithm. We first establish near-optimal lower bounds and algorithms in the case of perfect predictions. Subsequently, we present a learning-augmented algorithm satisfying the robustness, consistency, and smoothness criteria, and revealing a novel tradeoff between consistency and smoothness inherent in the scenario with a restricted number of predictions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyad Benomar;Vianney Perchet", "authorids": "~Ziyad_Benomar1;~Vianney_Perchet3", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Ziyad_Benomar1;~Vianney_Perchet3", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbenomar2024nonclairvoyant,\ntitle={Non-clairvoyant Scheduling with Partial Predictions},\nauthor={Ziyad Benomar and Vianney Perchet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jJLcXGB2uA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 692733, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10139152540854836791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": ";", "author_num": 2 }, { "title": "Catapults in SGD: spikes in the training loss and their impact on generalization through feature learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33318", "id": "jJmGl01S4l", "proceeding": "https://proceedings.mlr.press/v235/zhu24h.html", "pdf": "https://openreview.net/pdf?id=jJmGl01S4l", "openreview": "https://openreview.net/forum?id=jJmGl01S4l", "author_site": "Libin Zhu, Chaoyue Liu, Adityanarayanan Radhakrishnan, Misha Belkin", "tldr": "", "abstract": "In this paper, we first present an explanation regarding the common occurrence of spikes in the training loss when neural networks are trained with stochastic gradient descent (SGD). We provide evidence that the spikes in the training loss of SGD are \"catapults\", an optimization phenomenon originally observed in GD with large learning rates in Lewkowycz et al. (2020). We empirically show that these catapults occur in a low-dimensional subspace spanned by the top eigenvectors of the tangent kernel, for both GD and SGD. Second, we posit an explanation for how catapults lead to better generalization by demonstrating that catapults increase feature learning by increasing alignment with the Average Gradient Outer Product (AGOP) of the true predictor. 
Furthermore, we demonstrate that a smaller batch size in SGD induces a larger number of catapults, thereby improving AGOP alignment and test performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Libin Zhu;Chaoyue Liu;Adityanarayanan Radhakrishnan;Mikhail Belkin", "authorids": "~Libin_Zhu1;~Chaoyue_Liu2;~Adityanarayanan_Radhakrishnan1;~Mikhail_Belkin1", "gender": "M;M;M;", "homepage": ";https://cliu212.github.io/;https://aditradha.com/;http://misha.belkin-wang.org/", "dblp": "260/0355;191/6684-1;;", "google_scholar": "hyTGiUcAAAAJ;sRjoMX0AAAAJ;jd7_Ed0AAAAJ;Iwd9DdkAAAAJ", "orcid": ";;;", "linkedin": ";;aditradha/;", "or_profile": "~Libin_Zhu1;~Chaoyue_Liu2;~Adityanarayanan_Radhakrishnan1;~Misha_Belkin1", "aff": "University of California, San Diego;University of California, San Diego;;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;;ucsd.edu", "position": "PhD student;Postdoc;;Professor", "bibtex": "@inproceedings{\nzhu2024catapults,\ntitle={Catapults in {SGD}: spikes in the training loss and their impact on generalization through feature learning},\nauthor={Libin Zhu and Chaoyue Liu and Adityanarayanan Radhakrishnan and Mikhail Belkin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jJmGl01S4l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2699424, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17431620403131145365&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "ucsd.edu;ucsd.edu;;ucsd.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ERQ: Error Reduction for Post-Training Quantization of Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33317", "id": "jKUWlgra9b", "proceeding": "https://proceedings.mlr.press/v235/zhong24a.html", "pdf": "https://openreview.net/pdf?id=jKUWlgra9b", "openreview": "https://openreview.net/forum?id=jKUWlgra9b", "author_site": "Yunshan Zhong, Jiawei Hu, You Huang, Yuxin Zhang, Rongrong Ji", "tldr": "", "abstract": "Post-training quantization (PTQ) for vision transformers (ViTs) has garnered significant attention due to its efficiency in compressing models. However, existing methods typically overlook the intricate interdependence between quantized weight and activation, leading to considerable quantization error. In this paper, we propose ERQ, a two-step PTQ approach meticulously crafted to sequentially reduce the quantization error arising from activation and weight quantization. ERQ first introduces Activation quantization error reduction (Aqer) that strategically formulates the minimization of activation quantization error as a Ridge Regression problem, tackling it by updating weights with full-precision. Subsequently, ERQ introduces Weight quantization error reduction (Wqer) that adopts an iterative approach to mitigate the quantization error induced by weight quantization. 
In each iteration, an empirically derived, efficient proxy is employed to refine the rounding directions of quantized weights, coupled with a Ridge Regression solver to curtail weight quantization error. Experimental results attest to the effectiveness of our approach. Notably, ERQ surpasses the state-of-the-art GPTQ by 22.36% in accuracy for W3A4 ViT-S.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunshan Zhong;Jiawei Hu;You Huang;Yuxin Zhang;Rongrong Ji", "authorids": "~Yunshan_Zhong1;~Jiawei_Hu4;~You_Huang1;~Yuxin_Zhang3;~Rongrong_Ji5", "gender": "M;;M;;M", "homepage": ";;;;http://mac.xmu.edu.cn/rrji-en.html", "dblp": "239/4066;;214/9824;03/7346-2;86/5681", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;WYmFVEMAAAAJ;6IeJLJoAAAAJ;", "orcid": ";;;0000-0002-4409-7030;", "linkedin": ";;you-huang-5075251b6/;;", "or_profile": "~Yunshan_Zhong1;~Jiawei_Hu4;~You_Huang1;~Yuxin_Zhang3;~Rongrong_Ji5", "aff": "Xiamen University;;Xiamen University;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "position": "PhD student;;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhong2024erq,\ntitle={{ERQ}: Error Reduction for Post-Training Quantization of Vision Transformers},\nauthor={Yunshan Zhong and Jiawei Hu and You Huang and Yuxin Zhang and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jKUWlgra9b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 868244, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8955361080802785021&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "xmu.edu.cn;;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Xiamen University", "aff_unique_dep": "", "aff_unique_url": "https://www.xmu.edu.cn", "aff_unique_abbr": "XMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Evaluation of LLMs on Syntax-Aware Code Fill-in-the-Middle Tasks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33316", "id": "jKYyFbH8ap", "proceeding": "https://proceedings.mlr.press/v235/gong24f.html", "pdf": "https://openreview.net/pdf?id=jKYyFbH8ap", "openreview": "https://openreview.net/forum?id=jKYyFbH8ap", "author_site": "Linyuan Gong, Sida Wang, Mostafa Elhoushi, Alvin Cheung", "tldr": "", "abstract": "We introduce **S**yntax-**A**ware **F**ill-**i**n-the-**M**iddle (SAFIM), a new benchmark for evaluating Large Language Models (LLMs) on the code Fill-in-the-Middle (FIM) task. This benchmark focuses on syntax-aware completions of program structures such as code blocks and conditional expressions, and includes 17,720 examples from multiple programming languages, sourced from recent code submissions after April 2022 to minimize data contamination. SAFIM provides a robust framework with various prompt designs and novel syntax-aware post-processing techniques, facilitating accurate and fair comparisons across LLMs. Our comprehensive evaluation of 15 LLMs shows that FIM pretraining not only enhances FIM proficiency but also improves Left-to-Right (L2R) inference using LLMs. Our findings challenge conventional beliefs and suggest that pretraining methods and data quality have more impact than model size. 
SAFIM thus serves as a foundational platform for future research in effective pretraining strategies for code LLMs. The evaluation toolkit and dataset are available at https://github.com/gonglinyuan/safim, and the leaderboard is available at https://safimbenchmark.com.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Linyuan Gong;Sida Wang;Mostafa Elhoushi;Alvin Cheung", "authorids": "~Linyuan_Gong1;~Sida_Wang2;~Mostafa_Elhoushi1;~Alvin_Cheung2", "gender": "M;M;M;", "homepage": "https://gonglinyuan.com;https://www.sidaw.xyz;;", "dblp": "213/8172;153/9609;157/6350;", "google_scholar": "w5A4QPQAAAAJ;XUI4PMEAAAAJ;https://scholar.google.ca/citations?user=y_cwSKAAAAAJ;", "orcid": ";;0000-0001-6172-4510;", "linkedin": ";;mostafaelhoushi/;", "or_profile": "~Linyuan_Gong1;~Sida_Wang2;~Mostafa_Elhoushi1;~Alvin_Cheung2", "aff": "University of California, Berkeley;Meta Facebook;Meta;", "aff_domain": "berkeley.edu;fb.com;meta.com;", "position": "PhD student;Research Scientist;Researcher;", "bibtex": "@inproceedings{\ngong2024evaluation,\ntitle={Evaluation of {LLM}s on Syntax-Aware Code Fill-in-the-Middle Tasks},\nauthor={Linyuan Gong and Sida Wang and Mostafa Elhoushi and Alvin Cheung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jKYyFbH8ap}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 482646, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9420261034881456800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "berkeley.edu;fb.com;meta.com;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Berkeley;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "BetterV: Controlled Verilog Generation with Discriminative Guidance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33315", "id": "jKnW7r7de1", "proceeding": "https://proceedings.mlr.press/v235/pei24e.html", "pdf": "https://openreview.net/pdf?id=jKnW7r7de1", "openreview": "https://openreview.net/forum?id=jKnW7r7de1", "author_site": "Zehua Pei, Huiling Zhen, Mingxuan Yuan, Yu Huang, Bei Yu", "tldr": "", "abstract": "Due to the growing complexity of modern Integrated Circuits (ICs), there is a need for automated circuit design methods. Recent years have seen increasing research in hardware design language generation to facilitate the design process. In this work, we propose a Verilog generation framework, BetterV, which fine-tunes large language models (LLMs) on processed domain-specific datasets and incorporates generative discriminators for guidance on particular design demands. Verilog modules are collected, filtered, and processed from the internet to form a clean and abundant dataset. Instruct-tuning methods are specially designed to fine-tune the LLMs to understand knowledge about Verilog. Furthermore, data are augmented to enrich the training set and are also used to train a generative discriminator on particular downstream tasks, providing guidance for the LLMs to optimize Verilog implementation. 
BetterV has the ability to generate syntactically and functionally correct Verilog, outperforming GPT-4 on the VerilogEval benchmark. With the help of task-specific generative discriminators, BetterV achieves remarkable improvements on various electronic design automation (EDA) downstream tasks, including netlist node reduction for synthesis and verification runtime reduction with Boolean Satisfiability (SAT) solving.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zehua PEI;Huiling Zhen;Mingxuan Yuan;Yu Huang;Bei Yu", "authorids": "~Zehua_PEI1;huilingzhen2017@gmail.com;~Mingxuan_Yuan1;huangyu61@hisilicon.com;~Bei_Yu2", "gender": ";;M;;M", "homepage": ";;;;http://www.cse.cuhk.edu.hk/~byu/index.html", "dblp": ";;74/2356;;28/4556-1.html", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;tGneTm4AAAAJ", "orcid": ";;0000-0002-2236-8784;;0000-0001-6406-4810", "linkedin": ";;;;yubei/", "or_profile": "~Zehua_PEI1;huilingzhen2017@gmail.com;~Mingxuan_Yuan1;huangyu61@hisilicon.com;~Bei_Yu2", "aff": ";;Huawei Technologies Ltd.;;Department of Computer Science and Engineering, The Chinese University of Hong Kong", "aff_domain": ";;huawei.com;;cse.cuhk.edu.hk", "position": ";;Researcher;;Associate Professor", "bibtex": "@inproceedings{\npei2024betterv,\ntitle={BetterV: Controlled Verilog Generation with Discriminative Guidance},\nauthor={Zehua PEI and Huiling Zhen and Mingxuan Yuan and Yu Huang and Bei Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jKnW7r7de1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 546890, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7073733403401451791&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";;huawei.com;;cse.cuhk.edu.hk", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Huawei;Chinese University of Hong Kong", "aff_unique_dep": "Huawei Technologies;Department of Computer Science and Engineering", "aff_unique_url": "https://www.huawei.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Huawei;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Averaging $n$-step Returns Reduces Variance in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33314", "id": "jM9A3Kz6Ki", "proceeding": "https://proceedings.mlr.press/v235/daley24a.html", "pdf": "https://openreview.net/pdf?id=jM9A3Kz6Ki", "openreview": "https://openreview.net/forum?id=jM9A3Kz6Ki", "author_site": "Brett Daley, Martha White, Marlos C. Machado", "tldr": "", "abstract": "Multistep returns, such as $n$-step returns and $\\lambda$-returns, are commonly used to improve the sample efficiency of reinforcement learning (RL) methods. The variance of the multistep returns becomes the limiting factor in their length; looking too far into the future increases variance and reverses the benefits of multistep learning. In our work, we demonstrate the ability of compound returns\u2014weighted averages of $n$-step returns\u2014to reduce variance. We prove for the first time that any compound return with the same contraction modulus as a given $n$-step return has strictly lower variance. 
We additionally prove that this variance-reduction property improves the finite-sample complexity of temporal-difference learning under linear function approximation. Because general compound returns can be expensive to implement, we introduce two-bootstrap returns which reduce variance while remaining efficient, even when using minibatched experience replay. We conduct experiments showing that compound returns often increase the sample efficiency of $n$-step deep RL agents like DQN and PPO.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Brett Daley;Martha White;Marlos C. Machado", "authorids": "~Brett_Daley1;~Martha_White1;~Marlos_C._Machado1", "gender": "M;F;M", "homepage": "https://brett-daley.github.io/;http://marthawhite.ca;https://webdocs.cs.ualberta.ca/~machado/", "dblp": "157/3749;60/7057;21/10949", "google_scholar": "PP2_bZ8AAAAJ;t5zdD_IAAAAJ;https://scholar.google.ca/citations?user=xf_n4xUAAAAJ", "orcid": "0000-0002-6402-0751;0000-0002-5356-2950;", "linkedin": "brettdaley/;;cholodovskis/", "or_profile": "~Brett_Daley1;~Martha_White1;~Marlos_C._Machado1", "aff": "Meta;University of Alberta;University of Alberta", "aff_domain": "meta.com;ualberta.ca;ualberta.ca", "position": "Intern;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\ndaley2024averaging,\ntitle={Averaging \\$n\\$-step Returns Reduces Variance in Reinforcement Learning},\nauthor={Brett Daley and Martha White and Marlos C. Machado},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jM9A3Kz6Ki}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 809511, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3832196438240114292&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "meta.com;ualberta.ca;ualberta.ca", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Meta;University of Alberta", "aff_unique_dep": "Meta Platforms, Inc.;", "aff_unique_url": "https://meta.com;https://www.ualberta.ca", "aff_unique_abbr": "Meta;UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Canada" }, { "title": "How Transformers Learn Causal Structure with Gradient Descent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33313", "id": "jNM4imlHZv", "proceeding": "https://proceedings.mlr.press/v235/nichani24a.html", "pdf": "https://openreview.net/pdf?id=jNM4imlHZv", "openreview": "https://openreview.net/forum?id=jNM4imlHZv", "author_site": "Eshaan Nichani, Alex Damian, Jason Lee", "tldr": "", "abstract": "The incredible success of transformers on sequence modeling tasks can be largely attributed to the self-attention mechanism, which allows information to be transferred between different parts of a sequence. Self-attention allows transformers to encode causal structure which makes them particularly suitable for sequence modeling. However, the process by which transformers learn such causal structure via gradient-based training algorithms remains poorly understood. To better understand this process, we introduce an in-context learning task that requires learning latent causal structure. We prove that gradient descent on a simplified two-layer transformer learns to solve this task by encoding the latent causal graph in the first attention layer. 
The key insight of our proof is that the gradient of the attention matrix encodes the mutual information between tokens. As a consequence of the data processing inequality, the largest entries of this gradient correspond to edges in the latent causal graph. As a special case, when the sequences are generated from in-context Markov chains, we prove that transformers learn an induction head (Olsson et al., 2022). We confirm our theoretical findings by showing that transformers trained on our in-context learning task are able to recover a wide variety of causal structures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eshaan Nichani;Alex Damian;Jason D. Lee", "authorids": "~Eshaan_Nichani1;~Alex_Damian1;~Jason_D._Lee1", "gender": ";M;M", "homepage": "https://eshaannichani.com/;https://web.math.princeton.edu/~ad27/;https://jasondlee88.github.io/", "dblp": "260/6510;;88/3262", "google_scholar": ";YvHcBcEAAAAJ;GR_DsT0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Eshaan_Nichani1;~Alex_Damian1;~Jason_D._Lee1", "aff": "Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nnichani2024how,\ntitle={How Transformers Learn Causal Structure with Gradient Descent},\nauthor={Eshaan Nichani and Alex Damian and Jason D. Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jNM4imlHZv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2726658, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8915243371016059702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu;princeton.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Partially Stochastic Infinitely Deep Bayesian Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33312", "id": "jNab9mXEyj", "proceeding": "https://proceedings.mlr.press/v235/calvo-ordonez24a.html", "pdf": "https://openreview.net/pdf?id=jNab9mXEyj", "openreview": "https://openreview.net/forum?id=jNab9mXEyj", "author_site": "Sergio Calvo Ordo\u00f1ez, Matthieu Meunier, Francesco Piatti, Yuantao Shi", "tldr": "", "abstract": "In this paper, we present Partially Stochastic Infinitely Deep Bayesian Neural Networks, a novel family of architectures that integrates partial stochasticity into the framework of infinitely deep neural networks. Our new class of architectures is designed to improve the computational efficiency of existing architectures at training and inference time. To do this, we leverage the advantages of partial stochasticity in the infinite-depth limit which include the benefits of full stochasticity e.g. robustness, uncertainty quantification, and memory efficiency, whilst mitigating their limitations around computational complexity. We present a variety of architectural configurations, offering flexibility in network design including different methods for weight partition. 
We also provide mathematical guarantees on the expressivity of our models by establishing that our network family qualifies as Universal Conditional Distribution Approximators. Lastly, empirical evaluations across multiple tasks show that our proposed architectures achieve better downstream task performance and uncertainty quantification than their counterparts while being significantly more efficient. The code can be found at https://github.com/Sergio20f/part_stoch_inf_deep", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sergio Calvo Ordo\u00f1ez;Matthieu Meunier;Francesco Piatti;YUANTAO SHI", "authorids": "~Sergio_Calvo_Ordo\u00f1ez1;~Matthieu_Meunier1;~Francesco_Piatti1;~YUANTAO_SHI1", "gender": "M;M;M;M", "homepage": "https://www.maths.ox.ac.uk/people/sergio.calvoordonez;;;", "dblp": ";;;", "google_scholar": "https://scholar.google.co.uk/citations?user=j39UsOEAAAAJ;;;", "orcid": ";;;", "linkedin": "sergio-calvo-ordonez/;matthieu-meunier-280086204/;francesco-piatti-801b44195;yuantao-shi", "or_profile": "~Sergio_Calvo_Ordo\u00f1ez1;~Matthieu_Meunier1;~Francesco_Piatti1;~YUANTAO_SHI1", "aff": "University of Oxford;University of Oxford;Imperial College London;University of Oxford", "aff_domain": "maths.ox.ac.uk;oxford.ac.uk;imperial.ac.uk;ox.ac.uk", "position": "PhD student;PhD student;PhD student;PhD student", "bibtex": "@inproceedings{\nordo{\\~n}ez2024partially,\ntitle={Partially Stochastic Infinitely Deep Bayesian Neural Networks},\nauthor={Sergio Calvo Ordo{\\~n}ez and Matthieu Meunier and Francesco Piatti and YUANTAO SHI},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jNab9mXEyj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3243339, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=773821969158258039&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "maths.ox.ac.uk;oxford.ac.uk;imperial.ac.uk;ox.ac.uk", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Oxford;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.imperial.ac.uk", "aff_unique_abbr": "Oxford;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Fast Timing-Conditioned Latent Audio Diffusion", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33311", "id": "jOlO8t1xdx", "proceeding": "https://proceedings.mlr.press/v235/evans24a.html", "pdf": "https://openreview.net/pdf?id=jOlO8t1xdx", "openreview": "https://openreview.net/forum?id=jOlO8t1xdx", "author_site": "Zach Evans, CJ Carr, Josiah Taylor, Scott Hawley, Jordi Pons", "tldr": "", "abstract": "Generating long-form 44.1kHz stereo audio from text prompts can be computationally demanding. Further, most previous works do not tackle that music and sound effects naturally vary in their duration. Our research focuses on the efficient generation of long-form, variable-length stereo music and sounds at 44.1kHz using text prompts with a generative model. It is based on latent diffusion, with its latent defined by a fully-convolutional variational autoencoder. The generative model is conditioned on text prompts as well as timing embeddings, allowing for fine control over both the content and length of the generated music and sounds. 
It is capable of rendering stereo signals of up to 95 sec at 44.1kHz in 8 sec on an A100 GPU. Despite its compute efficiency and fast inference, the proposed model is one of the best in two public text-to-music and -audio benchmarks and, differently from state-of-the-art models, can generate music with structure and stereo sounds.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zach Evans;CJ Carr;Josiah Taylor;Scott H. Hawley;Jordi Pons", "authorids": "~Zach_Evans1;cj@stability.ai;josiah@stability.ai;~Scott_H._Hawley1;~Jordi_Pons1", "gender": "M;;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": "zach-evans-b3121984/;;;;", "or_profile": "~Zach_Evans1;cj@stability.ai;josiah@stability.ai;~Scott_H._Hawley1;~Jordi_Pons1", "aff": "Stability AI;;;;", "aff_domain": "stability.ai;;;;", "position": "Researcher;;;;", "bibtex": "@inproceedings{\nevans2024fast,\ntitle={Fast Timing-Conditioned Latent Audio Diffusion},\nauthor={Zach Evans and CJ Carr and Josiah Taylor and Scott H. Hawley and Jordi Pons},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jOlO8t1xdx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 641626, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5599278197106168575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "stability.ai;;;;", "author_num": 5, "aff_unique_index": "0", "aff_unique_norm": "Stability AI", "aff_unique_dep": "", "aff_unique_url": "https://stability.ai", "aff_unique_abbr": "Stability AI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Learning the Target Network in Function Space", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33310", "id": "jP1zeEqHli", "proceeding": "https://proceedings.mlr.press/v235/asadi24a.html", "pdf": "https://openreview.net/pdf?id=jP1zeEqHli", "openreview": "https://openreview.net/forum?id=jP1zeEqHli", "author_site": "Kavosh Asadi, Yao Liu, Shoham Sabach, Ming Yin, Rasool Fakoor", "tldr": "", "abstract": "We focus on the task of learning the value function in the reinforcement learning (RL) setting. This task is often solved by updating a pair of online and target networks while ensuring that the parameters of these two networks are equivalent. We propose Lookahead-Replicate (LR), a new value-function approximation algorithm that is agnostic to this parameter-space equivalence. Instead, the LR algorithm is designed to maintain an equivalence between the two networks in the function space. This value-based equivalence is obtained by employing a new target-network update. We show that LR leads to a convergent behavior in learning the value function. 
We also present empirical results demonstrating that LR-based target-network updates significantly improve deep RL on the Atari benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kavosh Asadi;Yao Liu;Shoham Sabach;Ming Yin;Rasool Fakoor", "authorids": "~Kavosh_Asadi1;~Yao_Liu1;~Shoham_Sabach1;~Ming_Yin4;~Rasool_Fakoor1", "gender": ";M;M;M;M", "homepage": "http://cs.brown.edu/~kasadiat/;http://yao-liu.com/;https://ssabach.net.technion.ac.il/;https://mingyin0312.github.io;http://rasoolfa.github.io", "dblp": "192/1404;64/424-9.html;;89/453.html;123/2447", "google_scholar": "-2qyBJEAAAAJ;umAny5UAAAAJ;https://scholar.google.ca/citations?user=42D12TkAAAAJ;ncBRYIUAAAAJ;nVsOPtQAAAAJ", "orcid": ";;;0000-0001-6458-0751;", "linkedin": ";;;;rasool-fakoor-695b5845/", "or_profile": "~Kavosh_Asadi1;~Yao_Liu1;~Shoham_Sabach1;~Ming_Yin4;~Rasool_Fakoor1", "aff": "Amazon;Amazon;Technion - Israel Institute of Technology, Technion;Princeton University;Amazon Web Services", "aff_domain": "amazon.com;amazon.com;technion.ac.il;princeton.edu;amazon.com", "position": "Researcher;Researcher;Associate Professor;Postdoc;Researcher", "bibtex": "@inproceedings{\nasadi2024learning,\ntitle={Learning the Target Network in Function Space},\nauthor={Kavosh Asadi and Yao Liu and Shoham Sabach and Ming Yin and Rasool Fakoor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jP1zeEqHli}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1790084, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5342310593431099132&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "amazon.com;amazon.com;technion.ac.il;princeton.edu;amazon.com", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Amazon;Technion - Israel Institute of Technology;Princeton University", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.technion.ac.il;https://www.princeton.edu", "aff_unique_abbr": "Amazon;Technion;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Israel" }, { "title": "Training Greedy Policy for Proposal Batch Selection in Expensive Multi-Objective Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33309", "id": "jP8mf34iCW", "proceeding": "https://proceedings.mlr.press/v235/lee24w.html", "pdf": "https://openreview.net/pdf?id=jP8mf34iCW", "openreview": "https://openreview.net/forum?id=jP8mf34iCW", "author_site": "Deokjae Lee, Hyun Oh Song, Kyunghyun Cho", "tldr": "", "abstract": "Active learning is increasingly adopted for expensive multi-objective combinatorial optimization problems, but it involves a challenging subset selection problem, optimizing the batch acquisition score that quantifies the goodness of a batch for evaluation. Due to the excessively large search space of the subset selection problem, prior methods optimize the batch acquisition on the latent space, which has discrepancies with the actual space, or optimize individual acquisition scores without considering the dependencies among candidates in a batch instead of directly optimizing the batch acquisition. 
To manage the vast search space, a simple and effective approach is the greedy method, which decomposes the problem into smaller subproblems, yet it has difficulty in parallelization since each subproblem depends on the outcome from the previous ones. To this end, we introduce a novel greedy-style subset selection algorithm that optimizes batch acquisition directly on the combinatorial space by sequential greedy sampling from the greedy policy, specifically trained to address all greedy subproblems concurrently. Notably, our experiments on the red fluorescent proteins design task show that our proposed method achieves the baseline performance in 1.69x fewer queries, demonstrating its efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Deokjae Lee;Hyun Oh Song;Kyunghyun Cho", "authorids": "~Deokjae_Lee1;~Hyun_Oh_Song1;~Kyunghyun_Cho1", "gender": "M;M;M", "homepage": "https://badeok0716.github.io;https://mllab.snu.ac.kr/hyunoh;http://kyunghyuncho.me", "dblp": "https://dblp.org/rec/conf/aistats/JeongLASS22;05/10781;41/9736", "google_scholar": "G8JsnZAAAAAJ;ScoZZPsAAAAJ;https://scholar.google.fi/citations?user=0RAmmIAAAAAJ", "orcid": ";;", "linkedin": ";hyun-oh-song-5a39b03;", "or_profile": "~Deokjae_Lee1;~Hyun_Oh_Song1;~Kyunghyun_Cho1", "aff": "Seoul National University;Seoul National University;Genentech", "aff_domain": "snu.ac.kr;snu.ac.kr;gene.com", "position": "PhD student;Associate Professor;Senior Director of Frontier Research", "bibtex": "@inproceedings{\nlee2024training,\ntitle={Training Greedy Policy for Proposal Batch Selection in Expensive Multi-Objective Combinatorial Optimization},\nauthor={Deokjae Lee and Hyun Oh Song and Kyunghyun Cho},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jP8mf34iCW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 613592, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bQ4oNz880S8J:scholar.google.com/&scioq=Training+Greedy+Policy+for+Proposal+Batch+Selection+in+Expensive+Multi-Objective+Combinatorial+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "snu.ac.kr;snu.ac.kr;gene.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Seoul National University;Genentech", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.genentech.com", "aff_unique_abbr": "SNU;Genentech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Learning Divergence Fields for Shift-Robust Graph Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33308", "id": "jPaEOH56JB", "proceeding": "https://proceedings.mlr.press/v235/wu24v.html", "pdf": "https://openreview.net/pdf?id=jPaEOH56JB", "openreview": "https://openreview.net/forum?id=jPaEOH56JB", "author_site": "Qitian Wu, Fan Nie, Chenxiao Yang, Junchi Yan", "tldr": "", "abstract": "Real-world data generation often involves certain geometries (e.g., graphs) that induce instance-level interdependence. This characteristic makes the generalization of learning models more difficult due to the intricate interdependent patterns that impact data-generative distributions and can vary from training to testing. 
In this work, we propose a geometric diffusion model with learnable divergence fields for the challenging generalization problem with interdependent data. We generalize the diffusion equation with stochastic diffusivity at each time step, which aims to capture the multi-faceted information flows among interdependent data. Furthermore, we derive a new learning objective through causal inference, which can guide the model to learn generalizable patterns of interdependence that are insensitive across domains. Regarding practical implementation, we introduce three model instantiations that can be considered as the generalized versions of GCN, GAT, and Transformers, respectively, which possess advanced robustness against distribution shifts. We demonstrate their promising efficacy for out-of-distribution generalization on diverse real-world datasets. Source codes are available at https://github.com/fannie1208/GLIND.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qitian Wu;Fan Nie;Chenxiao Yang;Junchi Yan", "authorids": "~Qitian_Wu1;~Fan_Nie1;~Chenxiao_Yang1;~Junchi_Yan2", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Qitian_Wu1;~Fan_Nie1;~Chenxiao_Yang1;~Junchi_Yan2", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwu2024learning,\ntitle={Learning Divergence Fields for Shift-Robust Graph Representations},\nauthor={Qitian Wu and Fan Nie and Chenxiao Yang and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jPaEOH56JB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1355285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vddShVrmwQkJ:scholar.google.com/&scioq=Learning+Divergence+Fields+for+Shift-Robust+Graph+Representations&hl=en&as_sdt=0,47", "gs_version_total": 6, "email": ";;;", "author_num": 4 }, { "title": "Accurate LoRA-Finetuning Quantization of LLMs via Information Retention", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33307", "id": "jQ92egz5Ym", "proceeding": "https://proceedings.mlr.press/v235/qin24b.html", "pdf": "https://openreview.net/pdf?id=jQ92egz5Ym", "openreview": "https://openreview.net/forum?id=jQ92egz5Ym", "author_site": "Haotong Qin, Xudong Ma, Xingyu Zheng, Xiaoyang Li, Yang Zhang, Shouda Liu, Jie Luo, Xianglong Liu, Michele Magno", "tldr": "", "abstract": "The LoRA-finetuning quantization of LLMs has been extensively studied to obtain accurate yet compact LLMs for deployment on resource-constrained hardware. However, existing methods cause the quantized LLM to severely degrade and even fail to benefit from the finetuning of LoRA. This paper proposes a novel IR-QLoRA for pushing quantized LLMs with LoRA to be highly accurate through information retention. The proposed IR-QLoRA mainly relies on two technologies derived from the perspective of unified information: (1) statistics-based Information Calibration Quantization allows the quantized parameters of LLM to retain original information accurately; (2) finetuning-based Information Elastic Connection makes LoRA utilize elastic representation transformation with diverse information. 
Comprehensive experiments show that IR-QLoRA can significantly improve accuracy across LLaMA and LLaMA2 families under 2-4 bit-widths, e.g., 4-bit LLaMA-7B achieves 1.4% improvement on MMLU compared with the state-of-the-art methods. The significant performance gain requires only a tiny 0.31% additional time consumption, revealing the satisfactory efficiency of our IR-QLoRA. We highlight that IR-QLoRA enjoys excellent versatility, compatible with various frameworks (e.g., NormalFloat and Integer quantization) and brings general accuracy gains. The code is available at https://github.com/htqin/ir-qlora .", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haotong Qin;Xudong Ma;Xingyu Zheng;Xiaoyang Li;Yang Zhang;Shouda Liu;Jie Luo;Xianglong Liu;Michele Magno", "authorids": "~Haotong_Qin1;~Xudong_Ma3;~Xingyu_Zheng1;lixiaoyang.x@bytedance.com;~Yang_Zhang21;liushouda@bytedance.com;~Jie_Luo5;~Xianglong_Liu3;~Michele_Magno1", "gender": "M;M;;;M;;M;;M", "homepage": "https://htqin.github.io/;https://macaronlin.github.io/;https://github.com/Xingyu-Zheng;;https://scholar.google.com/citations?view_op=list_works&hl=en&user=Zyko2wwAAAAJ;;https://www.linkedin.com/in/luo-jie-32847334;;https://ee.ethz.ch/the-department/people-a-z/person-detail.michele-magno.html", "dblp": "262/3626.html;19/2951;;;;;29/186-4;;", "google_scholar": "mK6n-KgAAAAJ;3_zPktkAAAAJ;ISXNTf8AAAAJ;;;;gwJTOVgAAAAJ;;ytj7UUcAAAAJ", "orcid": ";;0009-0009-6283-7635;;;;0000-0002-4157-9931;;", "linkedin": ";;;;;;luo-jie-32847334;;", "or_profile": "~Haotong_Qin1;~Xudong_Ma3;~Xingyu_Zheng1;lixiaoyang.x@bytedance.com;~Yang_Zhang21;liushouda@bytedance.com;~Jie_Luo5;~Xianglong_Liu3;~Michele_Magno1", "aff": "ETHZ - ETH Zurich;Beihang University;Beihang University;;;;Beihang University;;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;buaa.edu.cn;buaa.edu.cn;;;;buaa.edu.cn;;ethz.ch", "position": "Postdoc;PhD student;Undergrad student;;;;Associate Professor;;Principal Researcher", "bibtex": "@inproceedings{\nqin2024accurate,\ntitle={Accurate Lo{RA}-Finetuning Quantization of {LLM}s via Information Retention},\nauthor={Haotong Qin and Xudong Ma and Xingyu Zheng and Xiaoyang Li and Yang Zhang and Shouda Liu and Jie Luo and Xianglong Liu and Michele Magno},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jQ92egz5Ym}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1276586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1223603280428785567&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ethz.ch;buaa.edu.cn;buaa.edu.cn;;;;buaa.edu.cn;;ethz.ch", "author_num": 9, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "ETH Zurich;Beihang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;http://www.buaa.edu.cn/", "aff_unique_abbr": "ETHZ;BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Switzerland;China" }, { "title": "The Perception-Robustness Tradeoff in Deterministic Image Restoration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33306", "id": "jQA5iutPzd", "proceeding": "https://proceedings.mlr.press/v235/ohayon24a.html", "pdf": "https://openreview.net/pdf?id=jQA5iutPzd", "openreview": "https://openreview.net/forum?id=jQA5iutPzd", "author_site": "Guy Ohayon, Tomer Michaeli, Michael Elad", 
"tldr": "", "abstract": "We study the behavior of deterministic methods for solving inverse problems in imaging. These methods are commonly designed to achieve two goals: (1) attaining high perceptual quality, and (2) generating reconstructions that are consistent with the measurements. We provide a rigorous proof that the better a predictor satisfies these two requirements, the larger its Lipschitz constant must be, regardless of the nature of the degradation involved. In particular, to approach perfect perceptual quality and perfect consistency, the Lipschitz constant of the model must grow to infinity. This implies that such methods are necessarily more susceptible to adversarial attacks. We demonstrate our theory on single image super-resolution algorithms, addressing both noisy and noiseless settings. We also show how this undesired behavior can be leveraged to explore the posterior distribution, thereby allowing the deterministic model to imitate stochastic methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guy Ohayon;Tomer Michaeli;Michael Elad", "authorids": "~Guy_Ohayon1;~Tomer_Michaeli1;~Michael_Elad1", "gender": "M;M;M", "homepage": ";https://tomer.net.technion.ac.il/;https://elad.cs.technion.ac.il/", "dblp": "287/4281;70/3188.html;e/MichaelElad", "google_scholar": "Gso71ogAAAAJ;n2EbR2cAAAAJ;UpZbV44AAAAJ", "orcid": ";;0000-0001-8131-6928", "linkedin": "ohayonguy/;;michael-elad-5553852a3/", "or_profile": "~Guy_Ohayon1;~Tomer_Michaeli1;~Michael_Elad1", "aff": "Verily;Technion, Technion;Verily", "aff_domain": "verily.com;technion.ac.il;verily.com", "position": "Intern;Associate Professor;Principal Researcher", "bibtex": "@inproceedings{\nohayon2024the,\ntitle={The Perception-Robustness Tradeoff in Deterministic Image Restoration},\nauthor={Guy Ohayon and Tomer Michaeli and Michael Elad},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jQA5iutPzd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5590551, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11167364046837702238&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "verily.com;technion.ac.il;verily.com", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Verily;Technion - Israel Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.verily.com;https://www.technion.ac.il/en/", "aff_unique_abbr": "Verily;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Israel" }, { "title": "Position: On the Societal Impact of Open Foundation Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33305", "id": "jRX6yCxFhx", "proceeding": "https://proceedings.mlr.press/v235/kapoor24a.html", "pdf": "https://openreview.net/pdf?id=jRX6yCxFhx", "openreview": "https://openreview.net/forum?id=jRX6yCxFhx", "author_site": "Sayash Kapoor, Rishi Bommasani, Kevin Klyman, Shayne Longpre, Ashwin Ramaswami, Peter Cihon, Aspen Hopkins, Kevin Bankston, Stella Biderman, Miranda Bogen, Rumman Chowdhury, Alex Engler, Peter Henderson, Yacine Jernite, Seth Lazar, Stefano Maffulli, Alondra Nelson, Joelle Pineau, Aviya Skowron, Dawn Song, Victor Storchan, Daniel Zhang, Daniel Ho, Percy Liang, Arvind Narayanan", "tldr": "", "abstract": "Foundation models are powerful 
technologies: how they are released publicly directly shapes their societal impact. In this position paper, we focus on *open* foundation models, defined here as those with broadly available model weights (e.g., Llama 3, Stable Diffusion XL). We identify five distinctive properties (e.g., greater customizability, poor monitoring) that mediate their benefits and risks. Open foundation models present significant benefits, with some caveats, that span innovation, competition, the distribution of decision-making power, and transparency. To understand their risks of misuse, we design a risk assessment framework for analyzing their *marginal risk*. Across several misuse vectors (e.g., cyberattacks, bioweapons), we find that current research is insufficient to effectively characterize the marginal risk of open foundation models relative to pre-existing technologies. The framework helps explain why the marginal risk is low in some cases, clarifies disagreements about misuse risks by revealing that past work has focused on different subsets of the framework with different assumptions, and articulates a way forward for more constructive debate. Overall, our work helps support a more grounded assessment of the societal impact of open foundation models by outlining what research is needed to empirically validate their theoretical benefits and risks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sayash Kapoor;Rishi Bommasani;Kevin Klyman;Shayne Longpre;Ashwin Ramaswami;Peter Cihon;Aspen K Hopkins;Kevin Bankston;Stella Biderman;Miranda Bogen;Rumman Chowdhury;Alex Engler;Peter Henderson;Yacine Jernite;Seth Lazar;Stefano Maffulli;Alondra Nelson;Joelle Pineau;Aviya Skowron;Dawn Song;Victor Storchan;Daniel Zhang;Daniel E. Ho;Percy Liang;Arvind Narayanan", "authorids": "~Sayash_Kapoor2;~Rishi_Bommasani1;~Kevin_Klyman1;~Shayne_Longpre1;aramaswamis@gmail.com;pcihon@github.com;~Aspen_K_Hopkins1;kbankston@cdt.org;~Stella_Biderman1;~Miranda_Bogen1;~Rumman_Chowdhury1;alexcengler@gmail.com;~Peter_Henderson1;~Yacine_Jernite1;~Seth_Lazar1;stefano@opensource.org;anelson@ias.edu;~Joelle_Pineau1;~Aviya_Skowron1;~Dawn_Song1;~Victor_Storchan1;dzhang105@stanford.edu;~Daniel_E._Ho1;~Percy_Liang1;~Arvind_Narayanan1", "gender": "M;M;M;M;;;;;F;F;;;M;M;M;;;F;Non-Binary;F;;;M;;", "homepage": "https://www.cs.princeton.edu/~sayashk/;https://rishibommasani.github.io/;;https://www.shaynelongpre.com;;;;;http://www.stellabiderman.com;;http://www.rummanchowdhury.com;;http://www.peterhenderson.co/;http://cs.nyu.edu/~jernite/yj/;https://www.sethlazar.org;;;http://www.cs.mcgill.ca/~jpineau;https://www.eleuther.ai/staff;;https://www.linkedin.com/in/storchan/;;https://dho.stanford.edu;https://cs.stanford.edu/~pliang/;https://www.cs.princeton.edu/~arvindn/", "dblp": ";245/8673;;190/7024;;;;;239/5641;;;;h/PeterHenderson2;http://dblp.uni-trier.de/pers/hd/j/Jernite:Yacine;;;;p/JoellePineau;344/3578;s/DXSong;;;240/9334;04/1701;08/3080.html", "google_scholar": ";WMBXw1EAAAAJ;PhN2CjMAAAAJ;ADd_YfkAAAAJ;;;;;bO7H0DAAAAAJ;SyMv3kkAAAAJ;;;dy_JBs0AAAAJ;AK_7EBgAAAAJ;;;;https://scholar.google.ca/citations?user=CEt6_mMAAAAJ;https://scholar.google.com/citations?hl=en;;;;;pouyVyUAAAAJ;0Bi5CMgAAAAJ", "orcid": ";;;;;;;;0000-0001-8228-1042;0009-0006-8874-8583;;;;;0000-0001-5378-6033;;;;;;;;;;", "linkedin": ";;;shayne-redford-longpre/;;;aspen-hopkins-9ab281107;;stellabiderman;mirandabogen/;;;phende/;;;;;;;;;;;;", "or_profile": 
"~Sayash_Kapoor2;~Rishi_Bommasani1;~Kevin_Klyman1;~Shayne_Longpre1;aramaswamis@gmail.com;pcihon@github.com;~Aspen_K_Hopkins1;kbankston@cdt.org;~Stella_Biderman1;~Miranda_Bogen1;~Rumman_Chowdhury1;alexcengler@gmail.com;~Peter_Henderson1;~Yacine_Jernite1;~Seth_Lazar1;stefano@opensource.org;anelson@ias.edu;~Joelle_Pineau1;~Aviya_Skowron1;~Dawn_Song1;~Victor_Storchan1;dzhang105@stanford.edu;~Daniel_E._Ho1;~Percy_Liang1;~Arvind_Narayanan1", "aff": "Princeton University;Stanford University;Stanford University;Massachusetts Institute of Technology;;;Massachusetts Institute of Technology;;Booz Allen Hamilton;;Twitter;;Princeton University;;Australian National University;;;Meta Facebook;EleutherAI;University of California, Berkeley;;;Stanford University;Stanford University;Princeton University", "aff_domain": "princeton.edu;stanford.edu;stanford.edu;mit.edu;;;mit.edu;;boozallen.com;;twitter.com;;princeton.edu;;anu.edu.au;;;fb.com;eleuther.ai;berkeley.edu;;;stanford.edu;stanford.edu;princeton.edu", "position": "PhD student;PhD student;MS student;PhD student;;;PhD student;;Industry researcher;;Director;;Assistant Professor;;Full Professor;;;Researcher Manager;Researcher;Full Professor;;;Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nkapoor2024position,\ntitle={Position: On the Societal Impact of Open Foundation Models},\nauthor={Sayash Kapoor and Rishi Bommasani and Kevin Klyman and Shayne Longpre and Ashwin Ramaswami and Peter Cihon and Aspen K Hopkins and Kevin Bankston and Stella Biderman and Miranda Bogen and Rumman Chowdhury and Alex Engler and Peter Henderson and Yacine Jernite and Seth Lazar and Stefano Maffulli and Alondra Nelson and Joelle Pineau and Aviya Skowron and Dawn Song and Victor Storchan and Daniel Zhang and Daniel E. 
Ho and Percy Liang and Arvind Narayanan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jRX6yCxFhx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 789919, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 25, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8148945845323238428&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "princeton.edu;stanford.edu;stanford.edu;mit.edu;;;mit.edu;;boozallen.com;;twitter.com;;princeton.edu;;anu.edu.au;;;fb.com;eleuther.ai;berkeley.edu;;;stanford.edu;stanford.edu;princeton.edu", "author_num": 25, "aff_unique_index": "0;1;1;2;2;3;4;0;5;6;7;8;1;1;0", "aff_unique_norm": "Princeton University;Stanford University;Massachusetts Institute of Technology;Booz Allen Hamilton;Twitter, Inc.;Australian National University;Meta;EleutherAI;University of California, Berkeley", "aff_unique_dep": ";;;;;;Meta Platforms, Inc.;;", "aff_unique_url": "https://www.princeton.edu;https://www.stanford.edu;https://web.mit.edu;https://www.boozallen.com;https://twitter.com;https://www.anu.edu.au;https://meta.com;https://www.eleuther.ai;https://www.berkeley.edu", "aff_unique_abbr": "Princeton;Stanford;MIT;BAH;Twitter;ANU;Meta;EleutherAI;UC Berkeley", "aff_campus_unique_index": "1;1;2;1;1", "aff_campus_unique": ";Stanford;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0;0;1;0;0;0;0;0;0", "aff_country_unique": "United States;Australia" }, { "title": "No Dimensional Sampling Coresets for Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33304", "id": "jS3CMHtYJD", "proceeding": "https://proceedings.mlr.press/v235/alishahi24a.html", "pdf": "https://openreview.net/pdf?id=jS3CMHtYJD", "openreview": "https://openreview.net/forum?id=jS3CMHtYJD", "author_site": "Meysam Alishahi, Jeff Phillips", "tldr": "", "abstract": "We refine and generalize what is known about coresets for classification problems via the sensitivity sampling framework. Such coresets seek the smallest possible subsets of input data, so one can optimize a loss function on the coreset and ensure approximation guarantees with respect to the original data. Our analysis provides the first no dimensional coresets, so the size does not depend on the dimension. Moreover, our results are general, apply for distributional input and can use iid samples, so provide sample complexity bounds, and work for a variety of loss functions. A key tool we develop is a Rademacher complexity version of the main sensitivity sampling approach, which can be of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Meysam Alishahi;Jeff M. Phillips", "authorids": "~Meysam_Alishahi1;~Jeff_M._Phillips1", "gender": "M;M", "homepage": ";http://www.cs.utah.edu/~jeffp/", "dblp": "13/8050;17/3933", "google_scholar": ";aFDuhV8AAAAJ", "orcid": ";", "linkedin": "meysam-alishahi-a73b996a/;", "or_profile": "~Meysam_Alishahi1;~Jeff_Phillips1", "aff": ", University of Utah;Universit\u00e4t Leipzig", "aff_domain": "cs.utah.edu;uni-leipzig.de", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nalishahi2024no,\ntitle={No Dimensional Sampling Coresets for Classification},\nauthor={Meysam Alishahi and Jeff M. 
Phillips},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jS3CMHtYJD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 658677, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15767915204944827639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cs.utah.edu;uni-leipzig.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Utah;University of Leipzig", "aff_unique_dep": ";", "aff_unique_url": "https://www.utah.edu;https://www.uni-leipzig.de", "aff_unique_abbr": "Utah;Uni Leipzig", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Germany" }, { "title": "Sparse Inducing Points in Deep Gaussian Processes: Enhancing Modeling with Denoising Diffusion Variational Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33303", "id": "jTn4AIOgpM", "proceeding": "https://proceedings.mlr.press/v235/xu24af.html", "pdf": "https://openreview.net/pdf?id=jTn4AIOgpM", "openreview": "https://openreview.net/forum?id=jTn4AIOgpM", "author_site": "JIAN XU, Delu Zeng, John Paisley", "tldr": "", "abstract": "Deep Gaussian processes (DGPs) provide a robust paradigm in Bayesian deep learning. In DGPs, a set of sparse integration locations called inducing points are selected to approximate the posterior distribution of the model. This is done to reduce computational complexity and improve model efficiency. However, inferring the posterior distribution of inducing points is not straightforward. Traditional variational inference techniques to approximate the posterior often lead to significant bias. To address this issue, we propose an alternative named Denoising Diffusion Variational Inference (DDVI) that utilizes a denoising diffusion stochastic differential equation (SDE) for generating posterior samples of inducing variables. We refer to the score matching method in the denoising diffusion model to approximate challenging score functions using a neural network. Furthermore, by combining classical mathematical theory of SDE with the minimization of KL divergence between the approximate and true processes, we propose a novel explicit variational lower bound for the marginal likelihood function of DGP. 
Through extensive experiments on various datasets and comparisons with baseline methods, we empirically demonstrate the effectiveness of the DDVI method in posterior inference of inducing points for DGP models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "JIAN XU;Delu Zeng;John Paisley", "authorids": "~JIAN_XU5;~Delu_Zeng4;~John_Paisley1", "gender": "M;M;M", "homepage": ";http://www.columbia.edu/~jwp2128/;", "dblp": "73/1149-21;97/7035;38/5665", "google_scholar": "https://scholar.google.com.hk/citations?user=DublkSoAAAAJ;r31_fYQAAAAJ;08RCdoIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~JIAN_XU5;~John_Paisley1;~Delu_zeng1", "aff": "South China University of Technology;Columbia University;South China University of Technology", "aff_domain": "scut.edu.cn;columbia.edu;scut.edu.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nxu2024sparse,\ntitle={Sparse Inducing Points in Deep Gaussian Processes: Enhancing Modeling with Denoising Diffusion Variational Inference},\nauthor={JIAN XU and Delu Zeng and John Paisley},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jTn4AIOgpM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 853492, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14999476652788440104&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "scut.edu.cn;columbia.edu;scut.edu.cn", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "South China University of Technology;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://www.scut.edu.cn;https://www.columbia.edu", "aff_unique_abbr": "SCUT;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;United States" }, { "title": "From Vision to Audio and Beyond: A Unified Model for Audio-Visual Representation and Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33302", "id": "jU6iPouOZ6", "proceeding": "https://proceedings.mlr.press/v235/su24b.html", "pdf": "https://openreview.net/pdf?id=jU6iPouOZ6", "openreview": "https://openreview.net/forum?id=jU6iPouOZ6", "author_site": "Kun Su, Xiulong Liu, Eli Shlizerman", "tldr": "", "abstract": "Video encompasses both visual and auditory data, creating a perceptually rich experience where these two modalities complement each other. As such, videos are a valuable type of media for the investigation of the interplay between audio and visual elements. Previous studies of audio-visual modalities primarily focused on either audio-visual representation learning or generative modeling of a modality conditioned on the other, creating a disconnect between these two branches. A unified framework that learns representation and generates modalities has not been developed yet. In this work, we introduce a novel framework called Vision to Audio and Beyond (VAB) to bridge the gap between audio-visual representation learning and vision-to-audio generation. The key approach of VAB is that rather than working with raw video frames and audio data, VAB performs representation learning and generative modeling within latent spaces. In particular, VAB uses a pre-trained audio tokenizer and an image encoder to obtain audio tokens and visual features, respectively. 
It then performs the pre-training task of visual-conditioned masked audio token prediction. This training strategy enables the model to engage in contextual learning and simultaneous video-to-audio generation. After the pre-training phase, VAB employs the iterative-decoding approach to rapidly generate audio tokens conditioned on visual features. Since VAB is a unified model, its backbone can be fine-tuned for various audio-visual downstream tasks. Our experiments showcase the efficiency of VAB in producing high-quality audio from video, and its capability to acquire semantic audio-visual features, leading to competitive results in audio-visual retrieval and classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kun Su;Xiulong Liu;Eli Shlizerman", "authorids": "~Kun_Su1;~Xiulong_Liu1;~Eli_Shlizerman1", "gender": "M;M;", "homepage": "https://kun-su.netlify.app/;;http://faculty.washington.edu/shlizee/", "dblp": "184/8269;;00/9501", "google_scholar": "y52GkywAAAAJ;e5GPhrMAAAAJ;oJnSO50AAAAJ", "orcid": "0009-0004-8112-9419;;0000-0002-3136-4531", "linkedin": ";xiulong-liu-33040a130/;", "or_profile": "~Kun_Su1;~Xiulong_Liu1;~Eli_Shlizerman1", "aff": "University of Washington, Seattle;University of Washington, Seattle;University of Washington", "aff_domain": "uw.edu;uw.edu;u.washington.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nsu2024from,\ntitle={From Vision to Audio and Beyond: A Unified Model for Audio-Visual Representation and Generation},\nauthor={Kun Su and Xiulong Liu and Eli Shlizerman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jU6iPouOZ6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4421399, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15660834274859995734&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "uw.edu;uw.edu;u.washington.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33301", "id": "jVXJdGQ4eD", "proceeding": "https://proceedings.mlr.press/v235/chang24d.html", "pdf": "https://openreview.net/pdf?id=jVXJdGQ4eD", "openreview": "https://openreview.net/forum?id=jVXJdGQ4eD", "author_site": "Di Chang, Yichun Shi, Quankai Gao, Hongyi Xu, Jessica Fu, Guoxian Song, Qing Yan, Yizhe Zhu, Xiao Yang, Mohammad Soleymani", "tldr": "", "abstract": "In this work, we propose MagicPose, a diffusion-based model for 2D human pose and facial expression retargeting. Specifically, given a reference image, we aim to generate a person's new images by controlling the poses and facial expressions while keeping the identity unchanged. To this end, we propose a two-stage training strategy to disentangle human motions and appearance (e.g., facial expressions, skin tone, and dressing), consisting of (1) the pre-training of an appearance-control block and (2) learning appearance-disentangled pose control. 
Our novel design enables robust appearance control over generated human images, including body, facial attributes, and even background. By leveraging the prior knowledge of image diffusion models, MagicPose generalizes well to unseen human identities and complex poses without the need for additional fine-tuning. Moreover, the proposed model is easy to use and can be considered as a plug-in module/extension to Stable Diffusion. The project website is [here](https://boese0601.github.io/magicdance/). The code is available [here](https://github.com/Boese0601/MagicDance).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Di Chang;Yichun Shi;Quankai Gao;Hongyi Xu;Jessica Fu;Guoxian Song;Qing Yan;Yizhe Zhu;Xiao Yang;Mohammad Soleymani", "authorids": "~Di_Chang1;~Yichun_Shi1;~Quankai_Gao1;~Hongyi_Xu1;~Jessica_Fu1;~Guoxian_Song1;~Qing_Yan1;~Yizhe_Zhu2;~Xiao_Yang1;~Mohammad_Soleymani2", "gender": "M;M;M;M;F;M;;M;M;M", "homepage": "https://boese0601.github.io/;https://seasonsh.github.io/;https://github.com/Zerg-Overmind;http://www-scf.usc.edu/~hongyixu/;;https://guoxiansong.github.io/homepage/index.html;;http://yzzhu.net/;;http://people.ict.usc.edu/~soleymani/", "dblp": "122/2664;196/7886;287/5063;;;189/7103;;http://dblp.uni-trier.de/pers/hd/z/Zhu:Yizhe;57/3385-2.html;s/MohammadSoleymani", "google_scholar": "https://scholar.google.com.hk/citations?hl=en;RXZChV0AAAAJ;;gqtTGD4AAAAJ;;https://scholar.google.com.sg/citations?user=EMyFIYgAAAAJ;0TIYjPAAAAAJ;hPXUR0cAAAAJ;_MAKSLkAAAAJ;", "orcid": ";;;;;0000-0002-3664-572X;;;;0000-0002-5873-1434", "linkedin": "di-chang-004784206/;;;;jessica-fu-60a504254/;guoxian-song-101558117/;;yizhe-ethan-zhu-171a06126/;;", "or_profile": "~Di_Chang1;~Yichun_Shi1;~Quankai_Gao1;~Hongyi_Xu1;~Jessica_Fu1;~Guoxian_Song1;~Qing_Yan1;~Yizhe_Zhu2;~Xiao_Yang1;~Mohammad_Soleymani2", "aff": "ByteDance Inc.;ByteDance;University of Southern California;Bytedance;University of Southern California;Bytedance Inc;Bytedance;;Bytedance;University of Southern California", "aff_domain": "bytedance.com;bytedance.com;usc.edu;bytedance.com;usc.edu;bytedance.com;bytedance.com;;bytedance.com;usc.edu", "position": "Intern;Researcher;PhD student;Researcher;Undergrad student;Researcher;Researcher;;Research Scientist;Research associate professor", "bibtex": "@inproceedings{\nchang2024magicpose,\ntitle={MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion},\nauthor={Di Chang and Yichun Shi and Quankai Gao and Hongyi Xu and Jessica Fu and Guoxian Song and Qing Yan and Yizhe Zhu and Xiao Yang and Mohammad Soleymani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jVXJdGQ4eD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5750207, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8883202145908341831&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "bytedance.com;bytedance.com;usc.edu;bytedance.com;usc.edu;bytedance.com;bytedance.com;;bytedance.com;usc.edu", "author_num": 10, "aff_unique_index": "0;0;1;0;1;0;0;0;1", "aff_unique_norm": "ByteDance;University of Southern California", "aff_unique_dep": ";", "aff_unique_url": "https://www.bytedance.com;https://www.usc.edu", "aff_unique_abbr": "ByteDance;USC", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;1;0;1;0;0;0;1", 
"aff_country_unique": "China;United States" }, { "title": "SyCoCa: Symmetrizing Contrastive Captioners with Attentive Masking for Multimodal Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33300", "id": "jWHU4b7Yk6", "proceeding": "https://proceedings.mlr.press/v235/ma24q.html", "pdf": "https://openreview.net/pdf?id=jWHU4b7Yk6", "openreview": "https://openreview.net/forum?id=jWHU4b7Yk6", "author_site": "Ziping Ma, Furong Xu, Jian liu, Ming Yang, Qingpei Guo", "tldr": "", "abstract": "Multimodal alignment between language and vision is the fundamental topic in current vision-language model research. Contrastive Captioners (CoCa), as a representative method, integrates Contrastive Language-Image Pretraining (CLIP) and Image Caption (IC) into a unified framework, resulting in impressive results. CLIP imposes a bidirectional constraints on global representations of entire images and sentences. Although IC conducts an unidirectional image-to-text generation on local representation, it lacks any constraint on local text-to-image reconstruction, which limits the ability to understand images at a fine-grained level when aligned with texts. To achieve multimodal alignment from both global and local perspectives, this paper proposes Symmetrizing Contrastive Captioners (SyCoCa), which introduces bidirectional interactions on images and texts across the global and local representation levels. Specifically, we expand a Text-Guided Masked Image Modeling (TG-MIM) head based on ITC and IC heads. The improved SyCoCa further leverages textual cues to reconstruct contextual images and visual cues to predict textual contents. When implementing bidirectional local interactions, the local contents of images tend to be cluttered or unrelated to their textual descriptions. Thus, we employ an attentive masking strategy to select effective image patches for interaction. 
Extensive experiments on five vision-language tasks, including image-text retrieval, image-captioning, visual question answering, and zero-shot/finetuned image classification, validate the effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziping Ma;Furong Xu;Jian liu;Ming Yang;Qingpei Guo", "authorids": "~Ziping_Ma1;~Furong_Xu1;~Jian_liu8;~Ming_Yang2;~Qingpei_Guo1", "gender": "M;F;M;M;M", "homepage": "https://lemok00.github.io;https://vipl.ict.ac.cn/view_people.php?id=111;;http://users.ece.northwestern.edu/~mya671/;", "dblp": "https://dblp.uni-trier.de/pid/128/3380-2;223/1875;;98/2604-7;164/5991", "google_scholar": "H9ktNksAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;;uBHJx08AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;0000-0003-1691-6817;", "linkedin": ";;https://www.linkedin.cn/incareer/in/%E5%81%A5-%E5%88%98-917ba7138;ming-yang-29ba294/;", "or_profile": "~Ziping_Ma1;~Furong_Xu1;~Jian_liu8;~Ming_Yang2;~Qingpei_Guo1", "aff": "Peking University;Ant Group;AntGroup;Ant Group;Ant Group", "aff_domain": "pku.edu.cn;antgroup.com;antgroup.com;antgroup.com;antgroup.com", "position": "MS student;Researcher;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nma2024sycoca,\ntitle={SyCoCa: Symmetrizing Contrastive Captioners with Attentive Masking for Multimodal Alignment},\nauthor={Ziping Ma and Furong Xu and Jian liu and Ming Yang and Qingpei Guo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jWHU4b7Yk6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 927740, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5386546624492662949&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;antgroup.com;antgroup.com;antgroup.com;antgroup.com", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Peking University;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.antgroup.com", "aff_unique_abbr": "Peking U;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Conditional Language Learning with Context", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33299", "id": "jXn1qIcjyG", "proceeding": "https://proceedings.mlr.press/v235/zhang24ag.html", "pdf": "https://openreview.net/pdf?id=jXn1qIcjyG", "openreview": "https://openreview.net/forum?id=jXn1qIcjyG", "author_site": "Xiao Zhang, Miao Li, Ji Wu", "tldr": "", "abstract": "Language models can learn sophisticated language understanding skills from fitting raw text. They also unselectively learn useless corpus statistics and biases, especially during finetuning on domain-specific corpora. In this paper, we propose a simple modification to causal language modeling called conditional finetuning, which performs language modeling conditioned on a context. We show that a context can \"explain away\" certain corpus statistics and make the model avoid learning them. In this fashion, conditional finetuning achieves selective learning from a corpus, learning knowledge useful for downstream tasks while avoiding learning useless corpus statistics like topic biases. 
This selective learning effect leads to less forgetting and better stability-plasticity tradeoff in domain finetuning, potentially benefitting lifelong learning with language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiao Zhang;Miao Li;Ji Wu", "authorids": "~Xiao_Zhang9;miao-li@tsinghua.edu.cn;~Ji_Wu3", "gender": ";;M", "homepage": ";;http://speech.tsinghua.edu.cn/en/", "dblp": ";;91/4957-2", "google_scholar": "https://scholar.google.com/citations?hl=en;;", "orcid": ";;0000-0001-6170-726X", "linkedin": ";;", "or_profile": "~Xiao_Zhang9;miao-li@tsinghua.edu.cn;~Ji_Wu3", "aff": "Tsinghua University;;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;tsinghua.edu.cn", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nzhang2024conditional,\ntitle={Conditional Language Learning with Context},\nauthor={Xiao Zhang and Miao Li and Ji Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jXn1qIcjyG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1215526, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8893815129707715528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Connecting the Dots: Collaborative Fine-tuning for Black-Box Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33298", "id": "jZEY5SxbL4", "proceeding": "https://proceedings.mlr.press/v235/wang24ao.html", "pdf": "https://openreview.net/pdf?id=jZEY5SxbL4", "openreview": "https://openreview.net/forum?id=jZEY5SxbL4", "author_site": "Zhengbo Wang, Jian Liang, Ran He, Zilei Wang, Tieniu Tan", "tldr": "", "abstract": "With the emergence of pretrained vision-language models (VLMs), considerable efforts have been devoted to fine-tuning them for downstream tasks. Despite the progress made in designing efficient fine-tuning methods, such methods require access to the model's parameters, which can be challenging as model owners often opt to provide their models as a black box to safeguard model ownership. This paper proposes a **C**ollabo**ra**tive **F**ine-**T**uning (**CraFT**) approach for fine-tuning black-box VLMs to downstream tasks, where one only has access to the input prompts and the output predictions of the model. CraFT comprises two modules, a prompt generation module for learning text prompts and a prediction refinement module for enhancing output predictions in residual style. Additionally, we introduce an auxiliary prediction-consistent loss to promote consistent optimization across these modules. These modules are optimized by a novel collaborative training algorithm. Extensive experiments on few-shot classification over 15 datasets demonstrate the superiority of CraFT. The results show that CraFT achieves a decent gain of about 12% with 16-shot datasets and only 8,000 queries. Moreover, CraFT trains faster and uses only about 1/80 of the memory footprint for deployment, while sacrificing only 1.62% compared to the white-box method. 
Our code is publicly available at https://github.com/mrflogs/CraFT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengbo Wang;Jian Liang;Ran He;Zilei Wang;Tieniu Tan", "authorids": "~Zhengbo_Wang1;~Jian_Liang1;~Ran_He1;~Zilei_Wang1;~Tieniu_Tan1", "gender": ";M;M;M;", "homepage": "https://github.com/mrflogs;https://liangjian.xyz;https://rhe-web.github.io/;;", "dblp": "193/0358;19/2208-1;61/6198-1;49/1878;", "google_scholar": ";https://scholar.google.com/citations?hl=en;ayrg9AUAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;", "orcid": ";0000-0003-3890-1894;0000-0002-3807-991X;;", "linkedin": ";;;;", "or_profile": "~Zhengbo_Wang1;~Jian_Liang1;~Ran_He1;~Zilei_Wang1;~Tieniu_Tan1", "aff": "University of Science and Technology of China;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;University of Science and Technology of China;", "aff_domain": "ustc.edu.cn;ia.ac.cn;ia.ac.cn;ustc.edu.cn;", "position": "PhD student;Associate Professor;Full Professor;Associate Professor;", "bibtex": "@inproceedings{\nwang2024connecting,\ntitle={Connecting the Dots: Collaborative Fine-tuning for Black-Box Vision-Language Models},\nauthor={Zhengbo Wang and Jian Liang and Ran He and Zilei Wang and Tieniu Tan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jZEY5SxbL4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 535294, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16227975109379882788&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;ia.ac.cn;ia.ac.cn;ustc.edu.cn;", "author_num": 5, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "USTC;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "FiT: Flexible Vision Transformer for Diffusion Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33297", "id": "jZVen2JguY", "proceeding": "https://proceedings.mlr.press/v235/lu24k.html", "pdf": "https://openreview.net/pdf?id=jZVen2JguY", "openreview": "https://openreview.net/forum?id=jZVen2JguY", "author_site": "Zeyu Lu, ZiDong Wang, Di Huang, CHENGYUE WU, Xihui Liu, Wanli Ouyang, LEI BAI", "tldr": "", "abstract": "Nature is infinitely resolution-free. In the context of this reality, existing diffusion models, such as Diffusion Transformers, often face challenges when processing image resolutions outside of their trained domain. To overcome this limitation, we present the Flexible Vision Transformer (FiT), a transformer architecture specifically designed for generating images with unrestricted resolutions and aspect ratios. Unlike traditional methods that perceive images as static-resolution grids, FiT conceptualizes images as sequences of dynamically-sized tokens. This perspective enables a flexible training strategy that effortlessly adapts to diverse aspect ratios during both training and inference phases, thus promoting resolution generalization and eliminating biases induced by image cropping.
Enhanced by a meticulously adjusted network structure and the integration of training-free extrapolation techniques, FiT exhibits remarkable flexibility in resolution extrapolation generation. Comprehensive experiments demonstrate the exceptional performance of FiT across a broad range of resolutions. Repository available at https://github.com/whlzy/FiT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zeyu Lu;ZiDong Wang;Di Huang;Chengyue Wu;Xihui Liu;Wanli Ouyang;LEI BAI", "authorids": "~Zeyu_Lu1;~ZiDong_Wang2;~Di_Huang6;~Chengyue_Wu1;~Xihui_Liu1;~Wanli_Ouyang1;~LEI_BAI1", "gender": "M;M;;M;F;;M", "homepage": ";https://wzdthu.github.io/;;https://hills-code.github.io;https://xh-liu.github.io/;;http://leibai.site/", "dblp": "285/3138;97/5229-4.html;;;184/3911;;119/1223-1", "google_scholar": "W_sLmX0AAAAJ;3sVypA8AAAAJ;;https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=4YL23GMAAAAJ;;https://scholar.google.com.au/citations?user=sakOO04AAAAJ", "orcid": "0000-0003-0494-911X;0009-0003-8462-6819;;;0000-0003-1831-9952;;0000-0003-3378-7201", "linkedin": ";;;;;;lei-bai-641370153/", "or_profile": "~Zeyu_Lu1;~ZiDong_Wang2;~Di_Huang6;~Chengyue_Wu1;~Xihui_Liu1;~Wanli_Ouyang1;~LEI_BAI1", "aff": "Shanghai Jiaotong University;Tsinghua University;;The University of Hong Kong;University of Hong Kong;;Shanghai AI Laboratory", "aff_domain": "sjtu.edu.cn;mail.tsinghua.edu.cn;;hku.hk;hku.hk;;pjlab.org.cn", "position": "PhD student;Undergrad student;;PhD student;Assistant Professor;;Researcher", "bibtex": "@inproceedings{\nlu2024fit,\ntitle={FiT: Flexible Vision Transformer for Diffusion Model},\nauthor={Zeyu Lu and ZiDong Wang and Di Huang and Chengyue Wu and Xihui Liu and Wanli Ouyang and LEI BAI},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jZVen2JguY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2416992, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2017415440220138029&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "sjtu.edu.cn;mail.tsinghua.edu.cn;;hku.hk;hku.hk;;pjlab.org.cn", "author_num": 7, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "Shanghai Jiao Tong University;Tsinghua University;University of Hong Kong;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.tsinghua.edu.cn;https://www.hku.hk;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "SJTU;THU;HKU;SAIL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Unmasking Vulnerabilities: Cardinality Sketches under Adaptive Inputs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33296", "id": "jaJxpKkBcL", "proceeding": "https://proceedings.mlr.press/v235/ahmadian24a.html", "pdf": "https://openreview.net/pdf?id=jaJxpKkBcL", "openreview": "https://openreview.net/forum?id=jaJxpKkBcL", "author_site": "Sara Ahmadian, Edith Cohen", "tldr": "", "abstract": "Cardinality sketches are popular data structures that enhance the efficiency of working with large data sets. The sketches are randomized representations of sets that are only of logarithmic size but can support set merges and approximate cardinality (i.e., distinct count) queries. 
When queries are not adaptive, that is, they do not depend on preceding query responses, the design provides strong guarantees of correctly answering a number of queries exponential in the sketch size $k$. In this work, we investigate the performance of cardinality sketches in adaptive settings and unveil inherent vulnerabilities. We design an attack against the ``standard'' estimators that constructs an adversarial input by post-processing responses to a set of simple non-adaptive queries of size linear in the sketch size $k$. Empirically, our attack used only $4k$ queries with the widely used HyperLogLog (HLL++) (Flajolet et al., 2007; Heule et al., 2013) sketch. The simple attack technique suggests it can be effective with post-processed natural workloads. Finally and importantly, we demonstrate that the vulnerability is inherent as any estimator applied to known sketch structures can be attacked using a number of queries that is quadratic in $k$, matching a generic upper bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sara Ahmadian;Edith Cohen", "authorids": "~Sara_Ahmadian1;~Edith_Cohen1", "gender": "F;F", "homepage": ";http://www.cohenwang.com/edith/", "dblp": ";40/1039", "google_scholar": "ghjoJhsAAAAJ;O-TV6OgAAAAJ", "orcid": ";0000-0002-3926-8237", "linkedin": ";", "or_profile": "~Sara_Ahmadian1;~Edith_Cohen1", "aff": "Research, Google;Google", "aff_domain": "research.google.com;google.com", "position": "Research scientist;Research Scientist", "bibtex": "@inproceedings{\nahmadian2024unmasking,\ntitle={Unmasking Vulnerabilities: Cardinality Sketches under Adaptive Inputs},\nauthor={Sara Ahmadian and Edith Cohen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jaJxpKkBcL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 951229, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4489574484609700964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "research.google.com;google.com", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Simple Ingredients for Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33295", "id": "japBn31gXC", "proceeding": "https://proceedings.mlr.press/v235/cetin24a.html", "pdf": "https://openreview.net/pdf?id=japBn31gXC", "openreview": "https://openreview.net/forum?id=japBn31gXC", "author_site": "Edoardo Cetin, Andrea Tirinzoni, Matteo Pirotta, Alessandro Lazaric, Yann Ollivier, Ahmed Touati", "tldr": "", "abstract": "Offline reinforcement learning algorithms have proven effective on datasets highly connected to the target downstream task. Yet, by leveraging a novel testbed (MOOD) in which trajectories come from heterogeneous sources, we show that existing methods struggle with diverse data: their performance considerably deteriorates as data collected for related but different tasks is simply added to the offline buffer. In light of this finding, we conduct a large empirical study where we formulate and test several hypotheses to explain this failure.
Surprisingly, we find that targeted scale, more than algorithmic considerations, is the key factor influencing performance. We show that simple methods like AWAC and IQL with increased policy size overcome the paradoxical failure modes from the inclusion of additional data in MOOD, and notably outperform prior state-of-the-art algorithms on the canonical D4RL benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edoardo Cetin;Andrea Tirinzoni;Matteo Pirotta;Alessandro Lazaric;Yann Ollivier;Ahmed Touati", "authorids": "~Edoardo_Cetin1;~Andrea_Tirinzoni2;~Matteo_Pirotta1;~Alessandro_Lazaric2;~Yann_Ollivier2;~Ahmed_Touati1", "gender": ";;;M;M;M", "homepage": "https://aladoro.github.io/;https://andreatirinzoni.github.io/;;;http://www.yann-ollivier.org/rech/;", "dblp": "287/4615;220/5305;137/3249;36/321;63/343;147/5871", "google_scholar": "https://scholar.google.it/citations?hl=en;MmW0yrwAAAAJ;https://scholar.google.ca/citations?user=6qWcDTAAAAAJ;6JZ3R6wAAAAJ;;https://scholar.google.fr/citations?user=D4LT5xAAAAAJ", "orcid": ";;;;;", "linkedin": "edoardo-cetin-916b68195/;;;;;ahmed-touati-4a132a76/", "or_profile": "~Edoardo_Cetin1;~Andrea_Tirinzoni2;~Matteo_Pirotta1;~Alessandro_Lazaric2;~Yann_Ollivier2;~Ahmed_Touati1", "aff": "Sakana AI;Meta, FAIR;Meta;Meta Facebook;Meta Artificial Intelligence Research;Meta Facebook", "aff_domain": "sakana.ai;meta.com;meta.com;fb.com;meta.com;fb.com", "position": "Researcher;Researcher;Research Scientist;Research Scientist;Research scientist;Researcher", "bibtex": "@inproceedings{\ncetin2024simple,\ntitle={Simple Ingredients for Offline Reinforcement Learning},\nauthor={Edoardo Cetin and Andrea Tirinzoni and Matteo Pirotta and Alessandro Lazaric and Yann Ollivier and Ahmed Touati},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=japBn31gXC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3909357, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10955978352045434337&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "sakana.ai;meta.com;meta.com;fb.com;meta.com;fb.com", "author_num": 6, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "Sakana AI;Meta", "aff_unique_dep": ";Meta", "aff_unique_url": ";https://meta.org", "aff_unique_abbr": ";Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1;1;1", "aff_country_unique": ";United States" }, { "title": "Online Variational Sequential Monte Carlo", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33294", "id": "jbPc3pW6sC", "proceeding": "https://proceedings.mlr.press/v235/mastrototaro24a.html", "pdf": "https://openreview.net/pdf?id=jbPc3pW6sC", "openreview": "https://openreview.net/forum?id=jbPc3pW6sC", "author_site": "Alessandro Mastrototaro, Jimmy Olsson", "tldr": "", "abstract": "Being the most classical generative model for serial data, state-space models (SSM) are fundamental in AI and statistical machine learning. In SSM, any form of parameter learning or latent state inference typically involves the computation of complex latent-state posteriors. In this work, we build upon the variational sequential Monte Carlo (VSMC) method, which provides computationally efficient and accurate model parameter estimation and Bayesian latent-state inference by combining particle methods and variational inference. 
While standard VSMC operates in the offline mode, by repeatedly re-processing a given batch of data, we distribute the approximation of the gradient of the VSMC surrogate ELBO in time using stochastic approximation, allowing for online learning in the presence of streams of data. This results in an algorithm, online VSMC, that is capable of performing efficiently, entirely on-the-fly, both parameter estimation and particle proposal adaptation. In addition, we provide rigorous theoretical results describing the algorithm's convergence properties as the number of observations tends to infinity, as well as numerical illustrations of its excellent convergence properties and its usefulness in batch-processing settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alessandro Mastrototaro;Jimmy Olsson", "authorids": "~Alessandro_Mastrototaro1;~Jimmy_Olsson1", "gender": "M;M", "homepage": "https://www.kth.se/profile/alemas;https://www.kth.se/profile/jimmyol", "dblp": ";", "google_scholar": "https://scholar.google.ca/citations?user=SFmDZS8AAAAJ;xBHS7MAAAAAJ", "orcid": "0000-0001-9380-1197;", "linkedin": "alessandro-mastrototaro-41b834147;", "or_profile": "~Alessandro_Mastrototaro1;~Jimmy_Olsson1", "aff": "KTH Royal Institute of Technology;KTH Royal Institute of Technology", "aff_domain": "kth.se;kth.se", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nmastrototaro2024online,\ntitle={Online Variational Sequential Monte Carlo},\nauthor={Alessandro Mastrototaro and Jimmy Olsson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jbPc3pW6sC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9447663, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18159573910716461383&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "kth.se;kth.se", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Sweden" }, { "title": "BBox-Adapter: Lightweight Adapting for Black-Box Large Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33293", "id": "jdRIaUu3xY", "proceeding": "https://proceedings.mlr.press/v235/sun24p.html", "pdf": "https://openreview.net/pdf?id=jdRIaUu3xY", "openreview": "https://openreview.net/forum?id=jdRIaUu3xY", "author_site": "Haotian Sun, Yuchen Zhuang, Wei Wei, Chao Zhang, Bo Dai", "tldr": "", "abstract": "Adapting state-of-the-art Large Language Models (LLMs) like GPT-4 and Gemini for specific tasks is challenging. Due to the opacity of their parameters, embeddings, and even output probabilities, existing fine-tuning adaptation methods are inapplicable. Consequently, adapting these black-box LLMs is only possible through their API services, raising concerns about transparency, privacy, and cost. To address these challenges, we introduce BBox-Adapter, a novel lightweight adapter for black-box LLMs. BBox-Adapter distinguishes target and source domain data by treating target data as positive and source data as negative.
It employs a ranking-based Noise Contrastive Estimation (NCE) loss to promote the likelihood of target domain data while penalizing that of the source domain. Furthermore, it features an online adaptation mechanism, which incorporates real-time positive data sampling from ground-truth, human, or AI feedback, coupled with negative data from previous adaptations. Extensive experiments demonstrate BBox-Adapter's effectiveness and cost efficiency. It improves model performance by up to 6.77% across diverse tasks and domains, while reducing training and inference costs by 31.30x and 1.84x, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haotian Sun;Yuchen Zhuang;Wei Wei;Chao Zhang;Bo Dai", "authorids": "~Haotian_Sun1;~Yuchen_Zhuang1;~Wei_Wei15;~Chao_Zhang15;~Bo_Dai1", "gender": "M;M;;;M", "homepage": "https://haotiansun.tech/;https://night-chen.github.io/;http://chaozhang.org/;https://bo-dai.github.io/;http://www.weiwei.one", "dblp": "12/8162;191/5231.html;94/3019-14;64/2903;", "google_scholar": "lcWkVCQAAAAJ;T-f6XlEAAAAJ;https://scholar.google.com/citations?hl=en;TIKl_foAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-9013-7016;;0000-0003-3009-598X;0009-0002-8070-574X;", "linkedin": "haotian-sun-159597218/;;;;", "or_profile": "~Haotian_Sun1;~Yuchen_Zhuang1;~Chao_Zhang15;~Bo_Dai1;~wei_wei3", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Google Brain;Google", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;google.com;google.com", "position": "PhD student;PhD student;Assistant Professor;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nsun2024bboxadapter,\ntitle={{BB}ox-Adapter: Lightweight Adapting for Black-Box Large Language Models},\nauthor={Haotian Sun and Yuchen Zhuang and Wei Wei and Chao Zhang and Bo Dai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jdRIaUu3xY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 990572, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12500789947567102323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "gatech.edu;gatech.edu;gatech.edu;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;1;1", "aff_unique_norm": "Georgia Institute of Technology;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://www.gatech.edu;https://brain.google.com", "aff_unique_abbr": "Georgia Tech;Google Brain", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Plug-in Performative Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33292", "id": "jh7FDDwDBf", "proceeding": "https://proceedings.mlr.press/v235/lin24ab.html", "pdf": "https://openreview.net/pdf?id=jh7FDDwDBf", "openreview": "https://openreview.net/forum?id=jh7FDDwDBf", "author_site": "Licong Lin, Tijana Zrnic", "tldr": "", "abstract": "When predictions are performative, the choice of which predictor to deploy influences the distribution of future observations. The overarching goal in learning under performativity is to find a predictor that has low performative risk, that is, good performance on its induced distribution. 
One family of solutions for optimizing the performative risk, including bandits and other derivative-free methods, is agnostic to any structure in the performative feedback, leading to exceedingly slow convergence rates. A complementary family of solutions makes use of explicit models for the feedback, such as best-response models in strategic classification, enabling faster rates. However, these rates critically rely on the feedback model being correct. In this work, we study a general protocol for making use of possibly misspecified models in performative prediction, called plug-in performative optimization. We show this solution can be far superior to model-agnostic strategies, as long as the misspecification is not too extreme. Our results support the hypothesis that models, even if misspecified, can indeed help with learning in performative settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Licong Lin;Tijana Zrnic", "authorids": "~Licong_Lin2;~Tijana_Zrnic1", "gender": "M;F", "homepage": "https://statistics.berkeley.edu/people/licong-lin;https://tijana-zrnic.github.io", "dblp": ";188/4437", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Licong_Lin2;~Tijana_Zrnic1", "aff": "University of California, Berkeley;Stanford University", "aff_domain": "berkeley.edu;stanford.edu", "position": "PhD student;Postdoc", "bibtex": "@inproceedings{\nlin2024plugin,\ntitle={Plug-in Performative Optimization},\nauthor={Licong Lin and Tijana Zrnic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jh7FDDwDBf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 992122, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14042802746080374246&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "berkeley.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Berkeley;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu", "aff_unique_abbr": "UC Berkeley;Stanford", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Berkeley;Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Post-hoc Part-Prototype Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33291", "id": "jhWSzTO0Jl", "proceeding": "https://proceedings.mlr.press/v235/tan24g.html", "pdf": "https://openreview.net/pdf?id=jhWSzTO0Jl", "openreview": "https://openreview.net/forum?id=jhWSzTO0Jl", "author_site": "Andong Tan, Fengtao ZHOU, Hao Chen", "tldr": "", "abstract": "Post-hoc explainability methods such as Grad-CAM are popular because they do not influence the performance of a trained model. However, they mainly reveal ''where'' a model looks for a given input, but fail to explain ''what'' the model looks for (e.g., what is important to classify a bird image as a Scott Oriole?). Existing part-prototype networks leverage part-prototypes (e.g., characteristic Scott Oriole's wing and head) to answer both ''where'' and ''what'', but often under-perform their black box counterparts in accuracy. Therefore, a natural question is: can one construct a network that answers both ''where'' and ''what'' in a post-hoc manner to guarantee the model's performance?
To this end, we propose the first post-hoc part-prototype network via decomposing the classification head of a trained model into a set of interpretable part-prototypes. Concretely, we propose an unsupervised prototype discovery and refining strategy to obtain prototypes that can precisely reconstruct the classification head, yet remain interpretable. Besides guaranteeing performance, we show that our network offers more faithful explanations qualitatively and yields even better part-prototypes quantitatively than prior part-prototype networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andong Tan;Fengtao ZHOU;Hao Chen", "authorids": "~Andong_Tan1;~Fengtao_ZHOU1;~Hao_Chen1", "gender": "M;M;M", "homepage": ";;https://cse.hkust.edu.hk/~jhc/", "dblp": "225/7131;281/9853;86/475-11", "google_scholar": "zt49vSoAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=Z_t5DjwAAAAJ", "orcid": ";0000-0001-7039-1156;0000-0002-8400-3780", "linkedin": ";;", "or_profile": "~Andong_Tan1;~Fengtao_ZHOU1;~Hao_Chen1", "aff": "Hong Kong University of Science and Technology;Department of Computer Science and Engineering, Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": "connect.ust.hk;cse.ust.hk;ust.hk", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\ntan2024posthoc,\ntitle={Post-hoc Part-Prototype Networks},\nauthor={Andong Tan and Fengtao ZHOU and Hao Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jhWSzTO0Jl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1601595, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1239136886173267626&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "connect.ust.hk;cse.ust.hk;ust.hk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Seesaw: Compensating for Nonlinear Reduction with Linear Computations for Private Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33290", "id": "jklD0TV5Hw", "proceeding": "https://proceedings.mlr.press/v235/li24cj.html", "pdf": "https://openreview.net/pdf?id=jklD0TV5Hw", "openreview": "https://openreview.net/forum?id=jklD0TV5Hw", "author_site": "Fabing Li, Yuanhao Zhai, Shuangyu Cai, Mingyu Gao", "tldr": "", "abstract": "With increasingly serious data privacy concerns and strict regulations, privacy-preserving machine learning (PPML) has emerged to securely execute machine learning tasks without violating privacy. Unfortunately, the computational cost to securely execute nonlinear computations in PPML remains significant, calling for new model architecture designs with fewer nonlinear operations. We propose Seesaw, a novel neural architecture search method tailored for PPML. Seesaw exploits a previously unexplored opportunity to leverage more linear computations and nonlinear result reuse, in order to compensate for the accuracy loss due to nonlinear reduction.
It incorporates specifically designed pruning and search strategies, not only to efficiently handle the much larger design space of both linear and nonlinear operators, but also to achieve a better balance between the model accuracy and the online/offline execution latencies. Compared to the state-of-the-art design for image classification on ImageNet, Seesaw achieves 1.68$\\times$ lower online latency and 1.55$\\times$ lower total online + offline latency at 71% iso-accuracy, or 3.65% higher accuracy at iso-latency of 190 seconds, while using much simpler and faster search and training methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fabing Li;Yuanhao Zhai;Shuangyu Cai;Mingyu Gao", "authorids": "~Fabing_Li1;~Yuanhao_Zhai1;caisy21@mails.tsinghua.edu.cn;~Mingyu_Gao1", "gender": ";M;;M", "homepage": ";https://www.yhzhai.com;;https://people.iiis.tsinghua.edu.cn/~gaomy/", "dblp": ";22/11135-1;;61/7672-1", "google_scholar": ";https://scholar.google.com/citations?hl=en;;", "orcid": ";0000-0002-3277-3329;;", "linkedin": "fabing-li-ba955b1aa/;yuanhao-zhai-895518161/;;", "or_profile": "~Fabing_Li1;~Yuanhao_Zhai1;caisy21@mails.tsinghua.edu.cn;~Mingyu_Gao1", "aff": ";State University of New York at Buffalo;;Shanghai Qi Zhi Institute", "aff_domain": ";buffalo.edu;;sqz.ac.cn", "position": ";PhD student;;Researcher", "bibtex": "@inproceedings{\nli2024seesaw,\ntitle={Seesaw: Compensating for Nonlinear Reduction with Linear Computations for Private Inference},\nauthor={Fabing Li and Yuanhao Zhai and Shuangyu Cai and Mingyu Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jklD0TV5Hw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1237997, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17561921237701753593&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": ";buffalo.edu;;sqz.ac.cn", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "State University of New York at Buffalo;Shanghai Qi Zhi Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.buffalo.edu;https://www.qz.io", "aff_unique_abbr": "SUNY Buffalo;", "aff_campus_unique_index": "0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "In-Context Decision Transformer: Reinforcement Learning via Hierarchical Chain-of-Thought", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33289", "id": "jmmji1EU3g", "proceeding": "https://proceedings.mlr.press/v235/huang24j.html", "pdf": "https://openreview.net/pdf?id=jmmji1EU3g", "openreview": "https://openreview.net/forum?id=jmmji1EU3g", "author_site": "sili huang, Jifeng Hu, Hechang Chen, Lichao Sun, Bo Yang", "tldr": "", "abstract": "In-context learning is a promising approach for offline reinforcement learning (RL) to handle online tasks, which can be achieved by providing task prompts. Recent works demonstrated that in-context RL could emerge with self-improvement in a trial-and-error manner when treating RL tasks as an across-episodic sequential prediction problem. Despite the self-improvement not requiring gradient updates, current works still suffer from high computational costs when the across-episodic sequence increases with task horizons. 
To this end, we propose an In-context Decision Transformer (IDT) to achieve self-improvement in a high-level trial-and-error manner. Specifically, IDT is inspired by the efficient hierarchical structure of human decision-making and thus reconstructs the sequence to consist of high-level decisions instead of low-level actions that interact with environments. As one high-level decision can guide multi-step low-level actions, IDT naturally avoids excessively long sequences and solves online tasks more efficiently. Experimental results show that IDT achieves state-of-the-art in long-horizon tasks over current in-context RL methods. In particular, the online evaluation time of our IDT is 36$\times$ faster than baselines in the D4RL benchmark and 27$\times$ faster in the Grid World benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sili Huang;Jifeng Hu;Hechang Chen;Lichao Sun;Bo Yang", "authorids": "~Sili_Huang1;~Jifeng_Hu1;~Hechang_Chen2;~Lichao_Sun1;~Bo_Yang6", "gender": "M;;M;M;", "homepage": ";;http://sai.jlu.edu.cn/info/1094/2387.htm;https://lichao-sun.github.io/;http://ccst.jlu.edu.cn/info/1367/19045.htm", "dblp": "26/6752;;145/1142;121/0780-1.html;46/999-2", "google_scholar": "ZMhi8A0AAAAJ;;EezEcbgAAAAJ;WhGUE7AAAAAJ;", "orcid": "0000-0001-5387-7904;;;;0000-0003-1927-8419", "linkedin": ";;;lichao-sun-b273a290/;", "or_profile": "~Sili_Huang1;~Jifeng_Hu1;~Hechang_Chen2;~Lichao_Sun1;~Bo_Yang6", "aff": "Jilin University;;Jilin University;Lehigh University;Jilin University", "aff_domain": "jlu.edu.cn;;jlu.edu.cn;lehigh.edu;jlu.edu.cn", "position": "PhD student;;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024incontext,\ntitle={In-Context Decision Transformer: Reinforcement Learning via Hierarchical Chain-of-Thought},\nauthor={Sili Huang and Jifeng Hu and Hechang Chen and Lichao Sun and Bo Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jmmji1EU3g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 758252, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6818578711001773174&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "jlu.edu.cn;;jlu.edu.cn;lehigh.edu;jlu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Jilin University;Lehigh University", "aff_unique_dep": ";", "aff_unique_url": "http://www.jlu.edu.cn;https://www.lehigh.edu", "aff_unique_abbr": "JLU;Lehigh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "A decoder-only foundation model for time-series forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33288", "id": "jn2iTJas6h", "proceeding": "https://proceedings.mlr.press/v235/das24c.html", "pdf": "https://openreview.net/pdf?id=jn2iTJas6h", "openreview": "https://openreview.net/forum?id=jn2iTJas6h", "author_site": "Abhimanyu Das, Weihao Kong, Rajat Sen, Yichen Zhou", "tldr": "", "abstract": "Motivated by recent advances in large language models for Natural Language Processing (NLP), we design a time-series foundation model for forecasting whose out-of-the-box zero-shot performance on a variety of public datasets comes close to the accuracy of state-of-the-art supervised forecasting models for each individual
dataset. Our model is based on pretraining a decoder-style attention model with input patching, using a large time-series corpus comprising both real-world and synthetic datasets. Experiments on a diverse set of previously unseen forecasting datasets suggest that the model can yield accurate zero-shot forecasts across different domains, forecasting horizons and temporal granularities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhimanyu Das;Weihao Kong;Rajat Sen;Yichen Zhou", "authorids": "~Abhimanyu_Das2;~Weihao_Kong1;~Rajat_Sen1;~Yichen_Zhou3", "gender": "M;;M;", "homepage": "https://sites.google.com/site/abhidas/;https://weihaokong.github.io/;http://rajatsen91.github.io;", "dblp": "83/6359;117/4343;http://dblp.uni-trier.de/pers/hd/s/Sen:Rajat;55/10422", "google_scholar": ";loxOHhoAAAAJ;YzsCLBoAAAAJ;YAJI36UAAAAJ", "orcid": ";;;0000-0002-4925-6184", "linkedin": ";;rajat-sen-a8702417/;yichen-zhou-9424554a/", "or_profile": "~Abhimanyu_Das2;~Weihao_Kong1;~Rajat_Sen1;~Yichen_Zhou3", "aff": "Research, Google;Google;Google;Google", "aff_domain": "research.google.com;google.com;google.com;google.com", "position": "Researcher;Researcher;Research Scientist;Data Scientist", "bibtex": "@inproceedings{\ndas2024a,\ntitle={A decoder-only foundation model for time-series forecasting},\nauthor={Abhimanyu Das and Weihao Kong and Rajat Sen and Yichen Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jn2iTJas6h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3096804, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 253, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15675180951439216460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "research.google.com;google.com;google.com;google.com", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Efficient Precision and Recall Metrics for Assessing Generative Models using Hubness-aware Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33287", "id": "jnps5YwNlU", "proceeding": "https://proceedings.mlr.press/v235/liang24f.html", "pdf": "https://openreview.net/pdf?id=jnps5YwNlU", "openreview": "https://openreview.net/forum?id=jnps5YwNlU", "author_site": "Yuanbang Liang, Jing Wu, Yu-Kun Lai, Yipeng Qin", "tldr": "", "abstract": "Despite impressive results, deep generative models require massive datasets for training, and as dataset size increases, effective evaluation metrics like precision and recall (P&R) become computationally infeasible on commodity hardware. In this paper, we address this challenge by proposing efficient P&R (eP&R) metrics that give almost identical results to the original P&R but with much lower computational costs. Specifically, we identify two redundancies in the original P&R: i) redundancy in ratio computation and ii) redundancy in manifold inside/outside identification.
We find both can be effectively removed via hubness-aware sampling, which extracts representative elements from synthetic/real image samples based on their hubness values, i.e., the number of times a sample becomes a k-nearest neighbor to others in the feature space. Thanks to the insensitivity of hubness-aware sampling to exact k-nearest neighbor (k-NN) results, we further improve the efficiency of our eP&R metrics by using approximate k-NN methods. Extensive experiments show that our eP&R matches the original P&R but is far more efficient in time and space. Our code is available at: https://github.com/Byronliang8/Hubness_Precision_Recall", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuanbang Liang;Jing Wu;Yu-Kun Lai;Yipeng Qin", "authorids": "~Yuanbang_Liang1;~Jing_Wu3;~Yu-Kun_Lai1;~Yipeng_Qin1", "gender": "M;;;M", "homepage": "https://byronliang8.github.io/;https://www.cardiff.ac.uk/people/view/118177-wu-jing;https://profiles.cardiff.ac.uk/staff/qiny16;https://users.cs.cf.ac.uk/Yukun.Lai/", "dblp": "279/9811;;169/5516;60/4932", "google_scholar": "RRjSWYkAAAAJ;https://scholar.google.co.uk/citations?user=ms46emIAAAAJ;ojgWPpgAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0000-8370-6655;;0000-0002-1551-9126;", "linkedin": ";;;", "or_profile": "~Yuanbang_Liang1;~Jing_Wu3;~Yipeng_Qin1;~Yukun_Lai1", "aff": "Cardiff University;Cardiff University;Cardiff University;Cardiff University", "aff_domain": "cardiff.ac.uk;cardiff.ac.uk;cardiff.ac.uk;cardiff.ac.uk", "position": "PhD student;Lecturer;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nliang2024efficient,\ntitle={Efficient Precision and Recall Metrics for Assessing Generative Models using Hubness-aware Sampling},\nauthor={Yuanbang Liang and Jing Wu and Yu-Kun Lai and Yipeng Qin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jnps5YwNlU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1234450, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9654841950521786071&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "cardiff.ac.uk;cardiff.ac.uk;cardiff.ac.uk;cardiff.ac.uk", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Cardiff University", "aff_unique_dep": "", "aff_unique_url": "https://www.cardiff.ac.uk", "aff_unique_abbr": "Cardiff", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Conformalized Survival Distributions: A Generic Post-Process to Increase Calibration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33286", "id": "jr0W36wOBx", "proceeding": "https://proceedings.mlr.press/v235/qi24a.html", "pdf": "https://openreview.net/pdf?id=jr0W36wOBx", "openreview": "https://openreview.net/forum?id=jr0W36wOBx", "author_site": "Shi-ang Qi, Yakun Yu, Russell Greiner", "tldr": "", "abstract": "Discrimination and calibration represent two important properties of survival analysis, with the former assessing the model's ability to accurately rank subjects and the latter evaluating the alignment of predicted outcomes with actual events. 
Given their distinct nature, it is hard for survival models to simultaneously optimize both of them, especially as many previous results have found that improving calibration tends to diminish discrimination performance. This paper introduces a novel approach utilizing *conformal regression* that can improve a model's calibration without degrading discrimination. We provide theoretical guarantees for the above claim, and rigorously validate the efficiency of our approach across 11 real-world datasets, showcasing its practical applicability and robustness in diverse scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shi-ang Qi;Yakun Yu;Russell Greiner", "authorids": "~Shi-ang_Qi1;~Yakun_Yu1;~Russell_Greiner2", "gender": "M;F;M", "homepage": "https://shi-ang.github.io/;;https://webdocs.cs.ualberta.ca/~rgreiner/", "dblp": "229/0946.html;;g/RussellGreiner", "google_scholar": "https://scholar.google.ca/citations?user=EcXcCD4AAAAJ;;https://scholar.google.com.tw/citations?user=Rn7APGIAAAAJ", "orcid": "0000-0002-4319-5501;0000-0003-4571-1570;0000-0001-8327-934X", "linkedin": "shi-ang-qi-236819197/;;", "or_profile": "~Shi-ang_Qi1;~Yakun_Yu1;~Russell_Greiner1", "aff": "University of Alberta;University of Alberta;University of Alberta", "aff_domain": "cs.ualberta.ca;ualberta.ca;ualberta.ca", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nqi2024conformalized,\ntitle={Conformalized Survival Distributions: A Generic Post-Process to Increase Calibration},\nauthor={Shi-ang Qi and Yakun Yu and Russell Greiner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jr0W36wOBx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8225572, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12627257785649579004&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.ualberta.ca;ualberta.ca;ualberta.ca", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Alberta", "aff_unique_dep": "", "aff_unique_url": "https://www.ualberta.ca", "aff_unique_abbr": "UAlberta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "PGODE: Towards High-quality System Dynamics Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33285", "id": "jrE7geZekq", "proceeding": "https://proceedings.mlr.press/v235/luo24b.html", "pdf": "https://openreview.net/pdf?id=jrE7geZekq", "openreview": "https://openreview.net/forum?id=jrE7geZekq", "author_site": "Xiao Luo, Yiyang Gu, Huiyu Jiang, Hang Zhou, Jinsheng Huang, Wei Ju, Zhiping Xiao, Ming Zhang, Yizhou Sun", "tldr": "", "abstract": "This paper studies the problem of modeling multi-agent dynamical systems, where agents could interact mutually to influence their behaviors. Recent research predominantly uses geometric graphs to depict these mutual interactions, which are then captured by powerful graph neural networks (GNNs). However, predicting interacting dynamics in challenging scenarios such as out-of-distribution shift and complicated underlying rules remains unsolved. In this paper, we propose a new approach named Prototypical Graph ODE (PGODE) to address the problem. The core of PGODE is to incorporate prototype decomposition from contextual knowledge into a continuous graph ODE framework.
Specifically, PGODE employs representation disentanglement and system parameters to extract both object-level and system-level contexts from historical trajectories, which allows us to explicitly model their independent influence and thus enhances the generalization capability under system changes. Then, we integrate these disentangled latent representations into a graph ODE model, which determines a combination of various interacting prototypes for enhanced model expressivity. The entire model is optimized using an end-to-end variational inference framework to maximize the likelihood. Extensive experiments in both in-distribution and out-of-distribution settings validate the superiority of PGODE compared to various baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiao Luo;Yiyang Gu;Huiyu Jiang;Hang Zhou;Jinsheng Huang;Wei Ju;Zhiping Xiao;Ming Zhang;Yizhou Sun", "authorids": "~Xiao_Luo3;~Yiyang_Gu1;~Huiyu_Jiang1;~Hang_Zhou13;~Jinsheng_Huang1;~Wei_Ju1;~Zhiping_Xiao1;~Ming_Zhang5;~Yizhou_Sun1", "gender": "M;;M;M;M;;F;F;F", "homepage": "http://luoxiao12.github.io;;;https://hg-zh.github.io;;;https://patriciaxiao.github.io/www/;https://cs.pku.edu.cn/info/1080/1371.htm;http://web.cs.ucla.edu/~yzsun/", "dblp": "50/1585-1;;;;;;176/5397-1.html;73/1844-4;37/3868", "google_scholar": "https://scholar.google.com.hk/citations?;;;hVUydFIAAAAJ;YHbWSOMAAAAJ;;tF8GQawAAAAJ;LbzoQBsAAAAJ;https://scholar.google.com.tw/citations?user=TQgOjK0AAAAJ", "orcid": ";;;0009-0003-9535-6287;;;0000-0002-8583-4789;0000-0002-9809-3430;", "linkedin": "%E9%9C%84-%E7%BD%97-303548214/;;huiyu-jiang/;;;;zpxiao/;;", "or_profile": "~Xiao_Luo3;~Yiyang_Gu1;~Huiyu_Jiang1;~Hang_Zhou13;~Jinsheng_Huang1;~Wei_Ju1;~Zhiping_Xiao1;~Ming_Zhang5;~Yizhou_Sun1", "aff": "University of California, Los Angeles;;University of California, Santa Barbara;University of California, Davis;Peking University;;University of California, Los Angeles;Peking University;University of California, Los Angeles", "aff_domain": "cs.ucla.edu;;ucsb.edu;ucdavis.edu;pku.edu.cn;;cs.ucla.edu;pku.edu.cn;ucla.edu", "position": "Postdoc;;PhD student;Postdoc;PhD student;;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nluo2024pgode,\ntitle={{PGODE}: Towards High-quality System Dynamics Modeling},\nauthor={Xiao Luo and Yiyang Gu and Huiyu Jiang and Hang Zhou and Jinsheng Huang and Wei Ju and Zhiping Xiao and Ming Zhang and Yizhou Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jrE7geZekq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1682352, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1919405539019252987&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "cs.ucla.edu;;ucsb.edu;ucdavis.edu;pku.edu.cn;;cs.ucla.edu;pku.edu.cn;ucla.edu", "author_num": 9, "aff_unique_index": "0;1;2;3;0;3;0", "aff_unique_norm": "University of California, Los Angeles;University of California, Santa Barbara;University of California, Davis;Peking University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ucla.edu;https://www.ucsb.edu;https://www.ucdavis.edu;http://www.pku.edu.cn", "aff_unique_abbr": "UCLA;UCSB;UC Davis;Peking U", "aff_campus_unique_index": "0;1;2;0;0", "aff_campus_unique": "Los Angeles;Santa Barbara;Davis;", "aff_country_unique_index": "0;0;0;1;0;1;0", "aff_country_unique": "United States;China" }, 
{ "title": "FedMBridge: Bridgeable Multimodal Federated Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33284", "id": "jrHUbftLd6", "proceeding": "https://proceedings.mlr.press/v235/chen24ba.html", "pdf": "https://openreview.net/pdf?id=jrHUbftLd6", "openreview": "https://openreview.net/forum?id=jrHUbftLd6", "author_site": "Jiayi Chen, Aidong Zhang", "tldr": "", "abstract": "Multimodal Federated Learning (MFL) addresses the setup of multiple clients with diversified modality types (e.g. image, text, video, and audio) working together to improve their local personal models in a data-privacy manner. Prior MFL works rely on restrictive compositional neural architecture designs to ensure inter-client information sharing via blockwise model aggregation, limiting their applicability in the real-world **Architecture-personalized MFL (AMFL)** scenarios, where clients may have distinguished multimodal interaction strategies and there is no restriction on local architecture design. The key challenge in AMFL is how to automatically and efficiently tackle the two heterogeneity patterns--statistical and architecture heterogeneity--while maximizing the beneficial information sharing among clients. To solve this challenge, we propose **FedMBridge**, which leverages a topology-aware hypernetwork to act as a bridge that can automatically balance and digest the two heterogeneity patterns in a communication-efficient manner. Our experiments on four AMFL simulations demonstrate the efficiency and effectiveness of our proposed approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiayi Chen;Aidong Zhang", "authorids": "~Jiayi_Chen4;~Aidong_Zhang2", "gender": "F;F", "homepage": "https://jia-yi-chen.github.io/;https://engineering.virginia.edu/faculty/aidong-zhang", "dblp": "42/1159;z/AidongZhang.html", "google_scholar": "f3Iz6qoAAAAJ;O8XxkE4AAAAJ", "orcid": "0000-0003-0217-6352;0000-0001-9723-3246", "linkedin": ";", "or_profile": "~Jiayi_Chen4;~Aidong_Zhang2", "aff": "University of Virginia;University of Virginia", "aff_domain": "cs.virginia.edu;virginia.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nchen2024fedmbridge,\ntitle={Fed{MB}ridge: Bridgeable Multimodal Federated Learning},\nauthor={Jiayi Chen and Aidong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jrHUbftLd6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4601795, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1382603707793624049&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "cs.virginia.edu;virginia.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Position: Measure Dataset Diversity, Don't Just Claim It", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33283", "id": "jsKr6RVDDs", "proceeding": "https://proceedings.mlr.press/v235/zhao24a.html", "pdf": "https://openreview.net/pdf?id=jsKr6RVDDs", "openreview": "https://openreview.net/forum?id=jsKr6RVDDs", "author_site": "Dora Zhao, Jerone Andrews, Orestis 
Papakyriakopoulos, Alice Xiang", "tldr": "", "abstract": "Machine learning (ML) datasets, often perceived as neutral, inherently encapsulate abstract and disputed social constructs. Dataset curators frequently employ value-laden terms such as diversity, bias, and quality to characterize datasets. Despite their prevalence, these terms lack clear definitions and validation. Our research explores the implications of this issue by analyzing \"diversity\" across 135 image and text datasets. Drawing from social sciences, we apply principles from measurement theory to identify considerations and offer recommendations for conceptualizing, operationalizing, and evaluating diversity in datasets. Our findings have broader implications for ML research, advocating for a more nuanced and precise approach to handling value-laden properties in dataset construction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dora Zhao;Jerone Andrews;Orestis Papakyriakopoulos;Alice Xiang", "authorids": "~Dora_Zhao1;~Jerone_Andrews1;~Orestis_Papakyriakopoulos1;~Alice_Xiang1", "gender": "F;;;", "homepage": "https://dorazhao99.github.io;;https://www.civicmachines.com;", "dblp": "295/8515;;203/6747;", "google_scholar": "I-OInyYAAAAJ;;9z-fD3sAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Dora_Zhao1;~Jerone_Andrews1;~Orestis_Papakyriakopoulos1;~Alice_Xiang1", "aff": "Stanford University;;Sony AI;", "aff_domain": "stanford.edu;;sony.com;", "position": "PhD student;;Researcher;", "bibtex": "@inproceedings{\nzhao2024position,\ntitle={Position: Measure Dataset Diversity, Don't Just Claim It},\nauthor={Dora Zhao and Jerone Andrews and Orestis Papakyriakopoulos and Alice Xiang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jsKr6RVDDs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1112734, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10713847848098937872&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stanford.edu;;sony.com;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Sony", "aff_unique_dep": ";Sony AI", "aff_unique_url": "https://www.stanford.edu;https://www.sony.com", "aff_unique_abbr": "Stanford;Sony AI", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Japan" }, { "title": "Efficient Algorithms for Sum-Of-Minimum Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33282", "id": "jsmaWEdx9g", "proceeding": "https://proceedings.mlr.press/v235/ding24a.html", "pdf": "https://openreview.net/pdf?id=jsmaWEdx9g", "openreview": "https://openreview.net/forum?id=jsmaWEdx9g", "author_site": "Lisang Ding, Ziang Chen, Xinshang Wang, Wotao Yin", "tldr": "", "abstract": "In this work, we propose a novel optimization model termed ``sum-of-minimum'' optimization. This model seeks to minimize the sum or average of $N$ objective functions over $k$ parameters, where each objective takes the minimum value of a predefined sub-function with respect to the $k$ parameters. This universal framework encompasses numerous clustering applications in machine learning and related fields.
We develop efficient algorithms for solving sum-of-minimum optimization problems, inspired by a randomized initialization algorithm for the classic $k$-means (Arthur & Vassilvitskii, 2007) and Lloyd's algorithm (Lloyd, 1982). We establish a new tight bound for the generalized initialization algorithm and prove a gradient-descent-like convergence rate for generalized Lloyd's algorithm. The efficiency of our algorithms is numerically examined on multiple tasks, including generalized principal component analysis, mixed linear regression, and small-scale neural network training. Our approach compares favorably to previous ones based on simpler-but-less-precise optimization reformulations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lisang Ding;Ziang Chen;Xinshang Wang;Wotao Yin", "authorids": "~Lisang_Ding1;~Ziang_Chen1;~Xinshang_Wang1;~Wotao_Yin1", "gender": "M;M;;M", "homepage": "https://sites.google.com/g.ucla.edu/lsding/home;https://sites.duke.edu/ziangchen/;;http://wotaoyin.com", "dblp": ";;196/7073;76/2265", "google_scholar": ";odvrFvIAAAAJ;;kpQGGFUAAAAJ", "orcid": "0000-0002-4529-9427;0000-0002-8298-5223;;0000-0001-6697-9731", "linkedin": ";;;", "or_profile": "~Lisang_Ding1;~Ziang_Chen1;~Xinshang_Wang1;~Wotao_Yin1", "aff": "University of California, Los Angeles;Massachusetts Institute of Technology;;Alibaba Group US", "aff_domain": "ucla.edu;mit.edu;;alibaba-inc.com", "position": "PhD student;Instructor;;Principal Researcher", "bibtex": "@inproceedings{\nding2024efficient,\ntitle={Efficient Algorithms for Sum-Of-Minimum Optimization},\nauthor={Lisang Ding and Ziang Chen and Xinshang Wang and Wotao Yin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jsmaWEdx9g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 460395, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16517795924845743062&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": "ucla.edu;mit.edu;;alibaba-inc.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Los Angeles;Massachusetts Institute of Technology;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucla.edu;https://web.mit.edu;https://www.alibaba.com", "aff_unique_abbr": "UCLA;MIT;Alibaba", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Minimizing $f$-Divergences by Interpolating Velocity Fields", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33281", "id": "jvVWPtJYbc", "proceeding": "https://proceedings.mlr.press/v235/liu24by.html", "pdf": "https://openreview.net/pdf?id=jvVWPtJYbc", "openreview": "https://openreview.net/forum?id=jvVWPtJYbc", "author_site": "Song Liu, Jiahao Yu, Jack Simons, Mingxuan Yi, Mark Beaumont", "tldr": "", "abstract": "Many machine learning problems can be seen as approximating a *target* distribution using a *particle* distribution by minimizing their statistical discrepancy. Wasserstein Gradient Flow can move particles along a path that minimizes the $f$-divergence between the target and particle distributions. To move particles, we need to calculate the corresponding velocity fields derived from a density ratio function between these two distributions. 
Previous works estimated such density ratio functions and then differentiated the estimated ratios. These approaches may suffer from overfitting, leading to a less accurate estimate of the velocity fields. Inspired by non-parametric curve fitting, we directly estimate these velocity fields using interpolation techniques. We prove that our estimators are consistent under mild conditions. We validate their effectiveness using novel applications on domain adaptation and missing data imputation. The code for reproducing our results can be found at https://github.com/anewgithubname/gradest2.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Song Liu;Jiahao Yu;Jack Simons;Mingxuan Yi;Mark Beaumont", "authorids": "~Song_Liu1;jiahao.yu@bristol.ac.uk;~Jack_Simons1;~Mingxuan_Yi1;~Mark_Beaumont1", "gender": "M;;;M;", "homepage": "http://allmodelsarewrong.net;;http://www.bristol.ac.uk/maths/;https://mingxuan-yi.github.io/;https://www.bristol.ac.uk/people/person/Mark-Beaumont-c51e682d-904b-45b4-bda5-9cf2213d4e9d/", "dblp": "80/1141-2;;;https://dblp.uni-trier.de/pid/259/3016;", "google_scholar": ";;DNI5ygoAAAAJ;l0xKeZcAAAAJ;2K3F0MMAAAAJ", "orcid": ";;;;0000-0002-8773-2743", "linkedin": ";;;;", "or_profile": "~Song_Liu1;jiahao.yu@bristol.ac.uk;~Jack_Simons1;~Mingxuan_Yi1;~Mark_Beaumont1", "aff": "University of Bristol, UK;;University of Bristol;University of Bristol;University of Bristol", "aff_domain": "bristol.ac.uk;;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk", "position": "Lecturer;;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024minimizing,\ntitle={Minimizing \\$f\\$-Divergences by Interpolating Velocity Fields},\nauthor={Song Liu and Jiahao Yu and Jack Simons and Mingxuan Yi and Mark Beaumont},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jvVWPtJYbc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5771650, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5236593453722318441&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "bristol.ac.uk;;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Bristol", "aff_unique_dep": "", "aff_unique_url": "https://www.bristol.ac.uk", "aff_unique_abbr": "Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Data-free Distillation of Diffusion Models with Bootstrapping", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33280", "id": "jw2f9v59g0", "proceeding": "https://proceedings.mlr.press/v235/gu24d.html", "pdf": "https://openreview.net/pdf?id=jw2f9v59g0", "openreview": "https://openreview.net/forum?id=jw2f9v59g0", "author_site": "Jiatao Gu, Chen Wang, Shuangfei Zhai, Yizhe Zhang, Lingjie Liu, Joshua M Susskind", "tldr": "", "abstract": "Diffusion models have demonstrated great potential for generating diverse images. However, their performance often suffers from slow generation due to iterative denoising. Knowledge distillation has been recently proposed as a remedy which can reduce the number of inference steps to one or a few, without significant quality degradation. 
However, existing distillation methods either require significant amounts of offline computation for generating synthetic training data from the teacher model, or need to perform expensive online learning with the help of real data. In this work, we present a novel technique called BOOT that overcomes these limitations with an efficient data-free distillation algorithm. The core idea is to learn a time-conditioned model that predicts the output of a pre-trained diffusion model teacher given any time-step. Such a model can be efficiently trained based on bootstrapping from two consecutive sampled steps. Furthermore, our method can be easily adapted to large-scale text-to-image diffusion models, which are challenging for previous methods because the training sets are often large and difficult to access. We demonstrate the effectiveness of our approach on several benchmark datasets in the DDIM setting, achieving comparable generation quality while being orders of magnitude faster than the diffusion teacher. The text-to-image results show that the proposed approach is able to handle highly complex distributions, shedding light on more efficient generative modeling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiatao Gu;Chen Wang;Shuangfei Zhai;Yizhe Zhang;Lingjie Liu;Joshua M. Susskind", "authorids": "~Jiatao_Gu1;~Chen_Wang13;~Shuangfei_Zhai3;~Yizhe_Zhang2;~Lingjie_Liu1;~Joshua_M._Susskind1", "gender": "M;;M;M;F;M", "homepage": "http://jiataogu.me;https://cwchenwang.github.io;http://cs.binghamton.edu/~szhai2;https://dreasysnail.github.io;https://lingjie0206.github.io/;http://www.apple.com", "dblp": "164/5848.html;82/4206-49;;132/4966-2.html;204/0052;132/7797", "google_scholar": "https://scholar.google.com.sg/citations?user=cB1mFBsAAAAJ;5cY3Ho4AAAAJ;G6vdBYsAAAAJ;WDVMfggAAAAJ;https://scholar.google.de/citations?user=HZPnJ9gAAAAJ;Sv2TGqsAAAAJ", "orcid": ";0000-0002-9315-3780;;;;", "linkedin": "jiatao-gu-204b2672/;;;;;joshua-susskind-8ab2ab5/", "or_profile": "~Jiatao_Gu1;~Chen_Wang13;~Shuangfei_Zhai3;~Yizhe_Zhang2;~Lingjie_Liu1;~Joshua_M._Susskind1", "aff": "Apple;University of Pennsylvania;Apple;Apple;University of Pennsylvania;Apple", "aff_domain": "apple.com;upenn.edu;apple.com;apple.com;upenn.edu;apple.com", "position": "Researcher;PhD student;Research Scientist;Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\ngu2024datafree,\ntitle={Data-free Distillation of Diffusion Models with Bootstrapping},\nauthor={Jiatao Gu and Chen Wang and Shuangfei Zhai and Yizhe Zhang and Lingjie Liu and Joshua M. 
Susskind},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jw2f9v59g0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9924150, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16301103739344654978&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "apple.com;upenn.edu;apple.com;apple.com;upenn.edu;apple.com", "author_num": 6, "aff_unique_index": "0;1;0;0;1;0", "aff_unique_norm": "Apple;University of Pennsylvania", "aff_unique_dep": "Apple Inc.;", "aff_unique_url": "https://www.apple.com;https://www.upenn.edu", "aff_unique_abbr": "Apple;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "RNAFlow: RNA Structure & Sequence Design via Inverse Folding-Based Flow Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33279", "id": "jxvqvZLBuU", "proceeding": "https://proceedings.mlr.press/v235/nori24a.html", "pdf": "https://openreview.net/pdf?id=jxvqvZLBuU", "openreview": "https://openreview.net/forum?id=jxvqvZLBuU", "author_site": "Divya Nori, Wengong Jin", "tldr": "", "abstract": "The growing significance of RNA engineering in diverse biological applications has spurred interest in developing AI methods for structure-based RNA design. While diffusion models have excelled in protein design, adapting them for RNA presents new challenges due to RNA's conformational flexibility and the computational cost of fine-tuning large structure prediction models. To this end, we propose RNAFlow, a flow matching model for protein-conditioned RNA sequence-structure design. Its denoising network integrates an RNA inverse folding model and a pre-trained RosettaFold2NA network for generation of RNA sequences and structures. The integration of inverse folding in the structure denoising process allows us to simplify training by fixing the structure prediction network. We further enhance the inverse folding model by conditioning it on inferred conformational ensembles to model dynamic RNA conformations. 
Evaluation on protein-conditioned RNA structure and sequence generation tasks demonstrates RNAFlow's advantage over existing RNA design methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Divya Nori;Wengong Jin", "authorids": "~Divya_Nori1;~Wengong_Jin1", "gender": "F;", "homepage": "https://divnori.github.io;http://people.csail.mit.edu/wengong", "dblp": ";173/6620", "google_scholar": ";IE5D8_QAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Divya_Nori1;~Wengong_Jin1", "aff": "Massachusetts Institute of Technology;Broad Institute", "aff_domain": "mit.edu;broadinstitute.org", "position": "Undergrad student;Postdoc", "bibtex": "@inproceedings{\nnori2024rnaflow,\ntitle={{RNAF}low: {RNA} Structure \\& Sequence Design via Inverse Folding-Based Flow Matching},\nauthor={Divya Nori and Wengong Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jxvqvZLBuU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4080970, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4541160053433137292&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "mit.edu;broadinstitute.org", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Broad Institute", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.broadinstitute.org", "aff_unique_abbr": "MIT;Broad", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Position: Foundation Agents as the Paradigm Shift for Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33278", "id": "jzHmElqpPe", "proceeding": "https://proceedings.mlr.press/v235/liu24aq.html", "pdf": "https://openreview.net/pdf?id=jzHmElqpPe", "openreview": "https://openreview.net/forum?id=jzHmElqpPe", "author_site": "Xiaoqian Liu, Xingzhou Lou, Jianbin Jiao, Junge Zhang", "tldr": "", "abstract": "Decision making demands intricate interplay between perception, memory, and reasoning to discern optimal policies. Conventional approaches to decision making face challenges related to low sample efficiency and poor generalization. In contrast, foundation models in language and vision have showcased rapid adaptation to diverse new tasks. Therefore, we advocate for the construction of foundation agents as a transformative shift in the learning paradigm of agents. This proposal is underpinned by the formulation of foundation agents with their fundamental characteristics and challenges motivated by the success of large language models (LLMs). Moreover, we specify the roadmap of foundation agents from large interactive data collection or generation, to self-supervised pretraining and adaptation, and knowledge and value alignment with LLMs. 
Lastly, we pinpoint critical research questions derived from the formulation and delineate trends for foundation agents supported by real-world use cases, addressing both technical and theoretical aspects to propel the field towards a more comprehensive and impactful future.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoqian Liu;Xingzhou Lou;Jianbin Jiao;Junge Zhang", "authorids": "~Xiaoqian_Liu1;~Xingzhou_Lou1;~Jianbin_Jiao1;~Junge_Zhang1", "gender": "F;M;M;", "homepage": ";https://github.com/LxzGordon;http://lamp.ucas.ac.cn/;", "dblp": ";328/5451;;", "google_scholar": "6sJ8vmIAAAAJ;https://scholar.google.com/citations?hl=en;;gbStvusAAAAJ", "orcid": ";;;0000-0002-9970-394X", "linkedin": ";;;", "or_profile": "~Xiaoqian_Liu1;~Xingzhou_Lou1;~Jianbin_Jiao1;~Junge_Zhang1", "aff": "University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;Institute of automation, Chinese academy of science", "aff_domain": "mails.ucas.ac.cn;ucas.ac.cn;ucas.ac.cn;ia.ac.cn", "position": "PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nliu2024position,\ntitle={Position: Foundation Agents as the Paradigm Shift for Decision Making},\nauthor={Xiaoqian Liu and Xingzhou Lou and Jianbin Jiao and Junge Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jzHmElqpPe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1441005, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2590006426083883902&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mails.ucas.ac.cn;ucas.ac.cn;ucas.ac.cn;ia.ac.cn", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ia.cas.cn", "aff_unique_abbr": "UCAS;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Remove Cuts in Integer Linear Programming", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33277", "id": "k10805cgak", "proceeding": "https://proceedings.mlr.press/v235/puigdemont24a.html", "pdf": "https://openreview.net/pdf?id=k10805cgak", "openreview": "https://openreview.net/forum?id=k10805cgak", "author_site": "Pol Puigdemont, EFSTRATIOS PANTELEIMON SKOULAKIS, Grigorios Chrysos, Volkan Cevher", "tldr": "", "abstract": "Cutting plane methods are a fundamental approach for solving integer linear programs (ILPs). In each iteration of such methods, additional linear constraints (cuts) are introduced to the constraint set with the aim of excluding the previous fractional optimal solution while not affecting the optimal integer solution. In this work, we explore a novel approach within cutting plane methods: instead of only adding new cuts, we also consider the removal of previous cuts introduced at any of the preceding iterations of the method under a learnable parametric criterion. 
We demonstrate that in fundamental combinatorial optimization settings such cut removal policies can lead to significant improvements over both human-based and machine learning-guided cut addition policies even when implemented with simple models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pol Puigdemont;Stratis Skoulakis;Grigorios Chrysos;Volkan Cevher", "authorids": "~Pol_Puigdemont1;~Stratis_Skoulakis2;~Grigorios_Chrysos1;~Volkan_Cevher1", "gender": "M;M;M;M", "homepage": "https://github.com/puigde;http://www.corelab.ntua.gr/~sskoul/;https://grigorisg9gr.github.io/;http://lions.epfl.ch", "dblp": "381/4545;183/0979.html;75/6117-2;70/5301", "google_scholar": "HvBKfKUAAAAJ;Juo2Tk8AAAAJ;1bU041kAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": "0009-0003-8080-9626;;;", "linkedin": "polpuigdemont/;;;", "or_profile": "~Pol_Puigdemont1;~Stratis_Skoulakis2;~Grigorios_Chrysos1;~Volkan_Cevher1", "aff": "Universidad Polit\u00e9cnica de Cataluna;EPFL - EPF Lausanne;University of Wisconsin - Madison;Amazon Development Center Germany", "aff_domain": "upc.edu;epfl.ch;wisc.edu;amazon.de", "position": "Undergrad student;Postdoc;Assistant Professor;Amazon Scholar", "bibtex": "@inproceedings{\npuigdemont2024learning,\ntitle={Learning to Remove Cuts in Integer Linear Programming},\nauthor={Pol Puigdemont and Stratis Skoulakis and Grigorios Chrysos and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k10805cgak}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1511918, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6183275304642159299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "upc.edu;epfl.ch;wisc.edu;amazon.de", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Universitat Polit\u00e8cnica de Catalunya;EPFL;University of Wisconsin-Madison;Amazon", "aff_unique_dep": ";;;Development Center", "aff_unique_url": "https://www.upc.edu;https://www.epfl.ch;https://www.wisc.edu;https://www.amazon.de", "aff_unique_abbr": "UPC;EPFL;UW-Madison;Amazon", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Lausanne;Madison", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Spain;Switzerland;United States;Germany" }, { "title": "Exploiting Negative Samples: A Catalyst for Cohort Discovery in Healthcare Analytics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33276", "id": "k1J2GbamLi", "proceeding": "https://proceedings.mlr.press/v235/zheng24c.html", "pdf": "https://openreview.net/pdf?id=k1J2GbamLi", "openreview": "https://openreview.net/forum?id=k1J2GbamLi", "author_site": "Kaiping Zheng, Horng-Ruey Chua, Melanie Herschel, H. V Jagadish, Beng Chin Ooi, James Yip", "tldr": "", "abstract": "In healthcare analytics, addressing binary diagnosis or prognosis tasks presents unique challenges due to the inherent asymmetry between positive and negative samples. While positive samples, indicating patients with a disease, are defined based on stringent medical criteria, negative samples are defined in an open-ended manner and remain underexplored in prior research. 
To bridge this gap, we propose an innovative approach to facilitate cohort discovery within negative samples, leveraging a Shapley-based exploration of interrelationships between these samples, which holds promise for uncovering valuable insights concerning the studied disease, and related comorbidity and complications. We quantify each sample\u2019s contribution using data Shapley values, subsequently constructing the Negative Sample Shapley Field to model the distribution of all negative samples. Next, we transform this field through manifold learning, preserving the essential data structure information while imposing an isotropy constraint in data Shapley values. Within this transformed space, we pinpoint cohorts of medical interest via density-based clustering. We empirically evaluate the effectiveness of our approach on the real-world electronic medical records from National University Hospital in Singapore, yielding clinically valuable insights aligned with existing knowledge, and benefiting medical research and clinical decision-making.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaiping Zheng;Horng-Ruey Chua;Melanie Herschel;H. V. Jagadish;Beng Chin Ooi;James Wei Luen Yip", "authorids": "~Kaiping_Zheng1;~Horng-Ruey_Chua1;~Melanie_Herschel1;~H._V._Jagadish1;~Beng_Chin_Ooi1;~James_Wei_Luen_Yip2", "gender": "F;M;;;M;M", "homepage": "https://www.comp.nus.edu.sg/~kaiping/;;;;http://www.comp.nus.edu.sg/~ooibc/;", "dblp": "169/3246;https://dblp.uni-trier.de/pid/262/0046;;;o/BengChinOoi;07/10315", "google_scholar": "https://scholar.google.com.sg/citations?user=V1PdtzQAAAAJ;https://scholar.google.com.sg/citations?user=BtM-NyIAAAAJ;;;https://scholar.google.com.tw/citations?user=9560QjYAAAAJ;", "orcid": "0000-0001-8138-1543;0000-0003-1379-0585;;;0000-0003-4446-1100;0000-0001-5470-4554", "linkedin": ";;;;beng-chin-ooi-34b0634/;", "or_profile": "~Kaiping_Zheng1;~Horng-Ruey_Chua1;~Melanie_Herschel1;~H._V._Jagadish1;~Beng_Chin_Ooi1;~James_Wei_Luen_Yip2", "aff": "National University of Singapore;National University Hospital;;;National University of Singapore;", "aff_domain": "nus.edu.sg;nuhs.edu.sg;;;comp.nus.edu.sg;", "position": "Postdoc;Researcher;;;Full Professor;", "bibtex": "@inproceedings{\nzheng2024exploiting,\ntitle={Exploiting Negative Samples: A Catalyst for Cohort Discovery in Healthcare Analytics},\nauthor={Kaiping Zheng and Horng-Ruey Chua and Melanie Herschel and H. V. 
Jagadish and Beng Chin Ooi and James Wei Luen Yip},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k1J2GbamLi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8898174, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4000752260447724841&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "nus.edu.sg;nuhs.edu.sg;;;comp.nus.edu.sg;", "author_num": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;National University Hospital", "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;", "aff_unique_abbr": "NUS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;Unknown" }, { "title": "Do Language Models Exhibit the Same Cognitive Biases in Problem Solving as Human Learners?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33275", "id": "k1JXxbpIY6", "proceeding": "https://proceedings.mlr.press/v235/opedal24a.html", "pdf": "https://openreview.net/pdf?id=k1JXxbpIY6", "openreview": "https://openreview.net/forum?id=k1JXxbpIY6", "author_site": "Andreas Opedal, Alessandro Stolfo, Haruki Shirakami, Ying Jiao, Ryan Cotterell, Bernhard Sch\u00f6lkopf, Abulhair Saparov, Mrinmaya Sachan", "tldr": "", "abstract": "There is increasing interest in employing large language models (LLMs) as cognitive models. For such purposes, it is central to understand which properties of human cognition are well-modeled by LLMs, and which are not. In this work, we study the biases of LLMs in relation to those known in children when solving arithmetic word problems. Surveying the learning science literature, we posit that the problem-solving process can be split into three distinct steps: text comprehension, solution planning and solution execution. We construct tests for each one in order to understand whether current LLMs display the same cognitive biases as children in these steps. We generate a novel set of word problems for each of these tests, using a neuro-symbolic approach that enables fine-grained control over the problem features. 
We find evidence that LLMs, with and without instruction-tuning, exhibit human-like biases in both the text-comprehension and the solution-planning steps of the solving process, but not in the final step, in which the arithmetic expressions are executed to obtain the answer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andreas Opedal;Alessandro Stolfo;Haruki Shirakami;Ying Jiao;Ryan Cotterell;Bernhard Sch\u00f6lkopf;Abulhair Saparov;Mrinmaya Sachan", "authorids": "~Andreas_Opedal1;~Alessandro_Stolfo1;hshirakami@student.ethz.ch;~Ying_Jiao1;~Ryan_Cotterell1;~Bernhard_Sch\u00f6lkopf1;~Abulhair_Saparov1;~Mrinmaya_Sachan3", "gender": "M;M;;F;;;M;", "homepage": "https://opedal.github.io/;https://alestolfo.github.io;;;;;http://asaparov.org;", "dblp": "292/2838;329/3838;;56/10531;;;117/6287;", "google_scholar": "https://scholar.google.de/citations?hl=en;Fx50TZQAAAAJ;;w0526qgAAAAJ;;;TVNS71sAAAAJ;", "orcid": ";;;0009-0009-2279-7691;;;;", "linkedin": "andreasopedal;alessandrostolfo/;;;;;;", "or_profile": "~Andreas_Opedal1;~Alessandro_Stolfo1;hshirakami@student.ethz.ch;~Ying_Jiao1;~Ryan_Cotterell1;~Bernhard_Sch\u00f6lkopf1;~Abulhair_Saparov1;~Mrinmaya_Sachan3", "aff": "Max Planck Institute for Intelligent Systems, Max-Planck Institute;ETHZ - ETH Zurich;;KU Leuven;;;Purdue University;", "aff_domain": "tuebingen.mpg.de;ethz.ch;;kuleuven.be;;;purdue.edu;", "position": "PhD student;PhD student;;PhD student;;;Assistant Professor;", "bibtex": "@inproceedings{\nopedal2024do,\ntitle={Do Language Models Exhibit the Same Cognitive Biases in Problem Solving as Human Learners?},\nauthor={Andreas Opedal and Alessandro Stolfo and Haruki Shirakami and Ying Jiao and Ryan Cotterell and Bernhard Sch{\\\"o}lkopf and Abulhair Saparov and Mrinmaya Sachan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k1JXxbpIY6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1702788, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17049521752586731704&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "tuebingen.mpg.de;ethz.ch;;kuleuven.be;;;purdue.edu;", "author_num": 8, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;ETH Zurich;Katholieke Universiteit Leuven;Purdue University", "aff_unique_dep": "Intelligent Systems;;;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch;https://www.kuleuven.be;https://www.purdue.edu", "aff_unique_abbr": "MPI-IS;ETHZ;KU Leuven;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3", "aff_country_unique": "Germany;Switzerland;Belgium;United States" }, { "title": "Self-Driven Entropy Aggregation for Byzantine-Robust Heterogeneous Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33274", "id": "k2axqNsVVO", "proceeding": "https://proceedings.mlr.press/v235/huang24u.html", "pdf": "https://openreview.net/pdf?id=k2axqNsVVO", "openreview": "https://openreview.net/forum?id=k2axqNsVVO", "author_site": "Wenke Huang, Zekun Shi, Mang Ye, He Li, Bo Du", "tldr": "", "abstract": "Federated learning presents massive potential for privacy-friendly collaboration. However, the performance of federated learning is deeply affected by byzantine attacks, where malicious clients deliberately upload crafted vicious updates. 
While various robust aggregations have been proposed to defend against such attacks, they are subject to certain assumptions: homogeneous private data and related proxy datasets. To address these limitations, we propose Self-Driven Entropy Aggregation (SDEA), which leverages the random public dataset to conduct Byzantine-robust aggregation in heterogeneous federated learning. We observe that benign clients typically present more confident (sharper) predictions than malicious ones on the public dataset. Thus, we highlight benign clients by introducing a learnable aggregation weight to minimize the instance-prediction entropy of the global model on the random public dataset. Moreover, we reveal that the inherent data heterogeneity in federated learning brings heterogeneous sharpness. Specifically, clients are optimized under distinct distributions and thus present diverse predictive preferences. The learnable aggregation weight blindly allocates high attention to the limited clients with sharper predictions, resulting in a biased global model. To alleviate this problem, we encourage the global model to offer diverse predictions via batch-prediction entropy maximization and conduct clustering to equally divide honest weights to accommodate different tendencies. This enables SDEA to detect Byzantine attackers in heterogeneous federated learning. Empirical results demonstrate its effectiveness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenke Huang;Zekun Shi;Mang Ye;He Li;Bo Du", "authorids": "~Wenke_Huang1;~Zekun_Shi1;~Mang_Ye1;~He_Li4;~Bo_Du3", "gender": "M;M;M;M;M", "homepage": "https://wenkehuang.github.io/;https://github.com/Szkqwer;https://marswhu.github.io/;https://marswhu.github.io/team/index.htm;", "dblp": "330/1664;234/8649.html;156/0610;;70/6443-1.html", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;y6_df_4AAAAJ;j-HxRy0AAAAJ;;Shy1gnMAAAAJ", "orcid": "0000-0003-4819-293X;;0000-0003-3989-7655;0000-0002-8469-8260;", "linkedin": ";;;;", "or_profile": "~Wenke_Huang1;~Zekun_Shi1;~Mang_Ye1;~He_Li4;~Bo_Du1", "aff": "Wuhan University;Xiaomi Corporation;Wuhan University;Wuhan University;Wuhan University", "aff_domain": "whu.edu.cn;xiaomi.com;whu.edu.cn;whu.edu.cn;whu.edu.cn", "position": "PhD student;Researcher;Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nhuang2024selfdriven,\ntitle={Self-Driven Entropy Aggregation for Byzantine-Robust Heterogeneous Federated Learning},\nauthor={Wenke Huang and Zekun Shi and Mang Ye and He Li and Bo Du},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k2axqNsVVO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 892500, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11202605389324153192&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 5, "email": "whu.edu.cn;xiaomi.com;whu.edu.cn;whu.edu.cn;whu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Wuhan University;Xiaomi Corporation", "aff_unique_dep": ";", "aff_unique_url": "http://www.whu.edu.cn/;https://www.xiaomi.com", "aff_unique_abbr": "WHU;Xiaomi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Differentially Private Decentralized Learning with Random Walks", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33273", "id": "k2dVVIWWho", "proceeding": "https://proceedings.mlr.press/v235/cyffers24a.html", "pdf": "https://openreview.net/pdf?id=k2dVVIWWho", "openreview": "https://openreview.net/forum?id=k2dVVIWWho", "author_site": "Edwige Cyffers, Aur\u00e9lien Bellet, Jalaj Upadhyay", "tldr": "", "abstract": "The popularity of federated learning comes from the possibility of better scalability and the ability for participants to keep control of their data, improving data security and sovereignty. Unfortunately, sharing model updates also creates a new privacy attack surface. In this work, we characterize the privacy guarantees of decentralized learning with random walk algorithms, where a model is updated by traveling from one node to another along the edges of a communication graph. Using a recent variant of differential privacy tailored to the study of decentralized algorithms, namely Pairwise Network Differential Privacy, we derive closed-form expressions for the privacy loss between each pair of nodes where the impact of the communication topology is captured by graph theoretic quantities. Our results further reveal that random walk algorithms tends to yield better privacy guarantees than gossip algorithms for nodes close from each other. We supplement our theoretical results with empirical evaluation on synthetic and real-world graphs and datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edwige Cyffers;Aur\u00e9lien Bellet;Jalaj Upadhyay", "authorids": "~Edwige_Cyffers1;~Aur\u00e9lien_Bellet1;~Jalaj_Upadhyay1", "gender": ";;M", "homepage": ";http://researchers.lille.inria.fr/abellet/;https://sites.google.com/view/jalajupadhyay", "dblp": "281/6734;61/8017;https://dblp.uni-trier.de/pers/u/Upadhyay:Jalaj.html", "google_scholar": ";https://scholar.google.fr/citations?user=j8svx3IAAAAJ;vHTMzPQAAAAJ", "orcid": ";0000-0003-3440-1251;", "linkedin": "edwige-cyffers/;;", "or_profile": "~Edwige_Cyffers1;~Aur\u00e9lien_Bellet1;~Jalaj_Kumar_Upadhyay1", "aff": "INRIA;INRIA;Rutgers University", "aff_domain": "inria.fr;inria.fr;rutgers.edu", "position": "PhD student;Tenured researcher;Assistant Professor", "bibtex": "@inproceedings{\ncyffers2024differentially,\ntitle={Differentially Private Decentralized Learning with Random Walks},\nauthor={Edwige Cyffers and Aur{\\'e}lien Bellet and Jalaj Upadhyay},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k2dVVIWWho}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1287196, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15772462033662279909&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "inria.fr;inria.fr;rutgers.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "INRIA;Rutgers University", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.rutgers.edu", "aff_unique_abbr": "INRIA;Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "France;United States" }, { "title": "Directly Denoising Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33272", "id": "k5ncz7TIPX", "proceeding": "https://proceedings.mlr.press/v235/zhang24bl.html", "pdf": "https://openreview.net/pdf?id=k5ncz7TIPX", "openreview": 
"https://openreview.net/forum?id=k5ncz7TIPX", "author_site": "Dan Zhang, Jingjing Wang, Feng Luo", "tldr": "", "abstract": "In this paper, we present Directly Denoising Diffusion Models (DDDMs): a simple and generic approach for generating realistic images with few-step sampling, while multistep sampling is still preserved for better performance. DDDMs require no delicately designed samplers nor distillation on pre-trained distillation models. DDDMs train the diffusion model conditioned on an estimated target that was generated from previous training iterations of its own. To generate images, samples generated from previous timestep are also taken into consideration, guiding the generation process iteratively. We further propose Pseudo-LPIPS, a novel metric loss that is more robust to various values of hyperparameter. Despite its simplicity, the proposed approach can achieve strong performance in benchmark datasets. Our model achieves FID scores of 2.57 and 2.33 on CIFAR-10 in one-step and two-step sampling respectively, surpassing those obtained from GANs and distillation-based models. By extending the sampling to 1000 steps, we further reduce FID score to 1.79, aligning with state-of-the-art methods in the literature. For ImageNet 64x64, our approach stands as a competitive contender against leading models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Zhang;Jingjing Wang;Feng Luo", "authorids": "~Dan_Zhang10;~Jingjing_Wang12;~Feng_Luo1", "gender": "M;;M", "homepage": "https://github.com/zhangdan8962/;https://github.com/JennieHH;https://people.cs.clemson.edu/~luofeng", "dblp": ";;l/FengLuo.html", "google_scholar": ";;joROlFwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dan_Zhang10;~Jingjing_Wang12;~Feng_Luo1", "aff": "Clemson University;Clemson University;Clemson University", "aff_domain": "clemson.edu;clemson.edu;clemson.edu", "position": "PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nzhang2024directly,\ntitle={Directly Denoising Diffusion Models},\nauthor={Dan Zhang and Jingjing Wang and Feng Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k5ncz7TIPX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9342857, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16944341043714072132&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5, "email": "clemson.edu;clemson.edu;clemson.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Clemson University", "aff_unique_dep": "", "aff_unique_url": "https://www.clemson.edu", "aff_unique_abbr": "Clemson", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improving SAM Requires Rethinking its Optimization Formulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33271", "id": "k7G4N1x7f9", "proceeding": "https://proceedings.mlr.press/v235/xie24d.html", "pdf": "https://openreview.net/pdf?id=k7G4N1x7f9", "openreview": "https://openreview.net/forum?id=k7G4N1x7f9", "author_site": "Wanyun Xie, Fabian Latorre, Kimon Antonakopoulos, Thomas Pethick, Volkan Cevher", "tldr": "", "abstract": "This paper rethinks Sharpness-Aware Minimization (SAM), which is originally formulated as a zero-sum game where the weights of a network and a bounded perturbation 
try to minimize/maximize, respectively, the same differentiable loss. To fundamentally improve this design, we argue that SAM should instead be reformulated using the 0-1 loss. As a continuous relaxation, we follow the simple conventional approach where the minimizing (maximizing) player uses an upper bound (lower bound) surrogate to the 0-1 loss. This leads to a novel formulation of SAM as a bilevel optimization problem, dubbed BiSAM. With its newly designed lower-bound surrogate loss, BiSAM constructs stronger perturbations. Through numerical evidence, we show that BiSAM consistently results in improved performance compared to the original SAM and its variants, while enjoying similar computational complexity. Our code is available at https://github.com/LIONS-EPFL/BiSAM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanyun Xie;Fabian Latorre;Kimon Antonakopoulos;Thomas Pethick;Volkan Cevher", "authorids": "~Wanyun_Xie1;~Fabian_Latorre1;~Kimon_Antonakopoulos1;~Thomas_Pethick1;~Volkan_Cevher1", "gender": "F;M;M;M;M", "homepage": ";https://fabianlatorre.com;;https://pethick.dk;http://lions.epfl.ch", "dblp": ";244/9638;https://dblp.org/pers/hd/a/Antonakopoulos:Kimon;305/4521;70/5301", "google_scholar": "S4rh8MoAAAAJ;B46S5NwAAAAJ;;;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;;;", "linkedin": "wanyun-xie-71a287210/;;;;", "or_profile": "~Wanyun_Xie1;~Fabian_Latorre1;~Kimon_Antonakopoulos1;~Thomas_Pethick1;~Volkan_Cevher1", "aff": "EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;Amazon Development Center Germany", "aff_domain": "epfl.ch;epfl.ch;epfl.ch;epfl.ch;amazon.de", "position": "PhD student;PhD student;Postdoc;PhD student;Amazon Scholar", "bibtex": "@inproceedings{\nxie2024improving,\ntitle={Improving {SAM} Requires Rethinking its Optimization Formulation},\nauthor={Wanyun Xie and Fabian Latorre and Kimon Antonakopoulos and Thomas Pethick and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k7G4N1x7f9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 766191, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13462558978953903998&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "epfl.ch;epfl.ch;epfl.ch;epfl.ch;amazon.de", "author_num": 5, "aff_unique_index": "0;1;0;1;2", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne;Amazon", "aff_unique_dep": ";;Development Center", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch;https://www.amazon.de", "aff_unique_abbr": "EPFL;EPFL;Amazon", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "OODRobustBench: a Benchmark and Large-Scale Analysis of Adversarial Robustness under Distribution Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33270", "id": "kAFevjEYsz", "proceeding": "https://proceedings.mlr.press/v235/li24bp.html", "pdf": "https://openreview.net/pdf?id=kAFevjEYsz", "openreview": "https://openreview.net/forum?id=kAFevjEYsz", "author_site": "Lin Li, Yifei Wang, Chawin Sitawarin, Michael Spratling", "tldr": "", "abstract": "Existing works have made great progress in improving adversarial robustness, 
but typically test their method only on data from the same distribution as the training data, i.e. in-distribution (ID) testing. As a result, it is unclear how such robustness generalizes under input distribution shifts, i.e. out-of-distribution (OOD) testing. This omission is concerning as such distribution shifts are unavoidable when methods are deployed in the wild. To address this issue we propose a benchmark named OODRobustBench to comprehensively assess OOD adversarial robustness using 23 dataset-wise shifts (i.e. naturalistic shifts in input distribution) and 6 threat-wise shifts (i.e., unforeseen adversarial threat models). OODRobustBench is used to assess 706 robust models using 60.7K adversarial evaluations. This large-scale analysis shows that: 1) adversarial robustness suffers from a severe OOD generalization issue; 2) ID robustness correlates strongly with OOD robustness in a positive linear way. The latter enables the prediction of OOD robustness from ID robustness. We then predict and verify that existing methods are unlikely to achieve high OOD robustness. Novel methods are therefore required to achieve OOD robustness beyond our prediction. To facilitate the development of these methods, we investigate a wide range of techniques and identify several promising directions. Code and models are available at: https://github.com/OODRobustBench/OODRobustBench.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lin Li;Yifei Wang;Chawin Sitawarin;Michael W. Spratling", "authorids": "~Lin_Li12;~Yifei_Wang1;~Chawin_Sitawarin1;~Michael_W._Spratling1", "gender": "M;M;M;", "homepage": "https://treelli.github.io/;https://yifeiwang77.com;https://chawins.github.io/;", "dblp": ";00/555-1;211/7105;", "google_scholar": "dxP6Y_oAAAAJ;-CLy6YsAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": "0000-0001-6369-2663;;0000-0002-4949-9661;", "linkedin": "lin-li-aa729a14b/;;chawins/;", "or_profile": "~Lin_Li12;~Yifei_Wang1;~Chawin_Sitawarin1;~Michael_W._Spratling1", "aff": "King's College London;Massachusetts Institute of Technology;University of California, Berkeley;", "aff_domain": "kcl.ac.uk;mit.edu;berkeley.edu;", "position": "PhD student;Postdoc;PhD student;", "bibtex": "@inproceedings{\nli2024oodrobustbench,\ntitle={{OODR}obustBench: a Benchmark and Large-Scale Analysis of Adversarial Robustness under Distribution Shift},\nauthor={Lin Li and Yifei Wang and Chawin Sitawarin and Michael W. 
Spratling},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kAFevjEYsz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9609090, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9421361136264596479&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "kcl.ac.uk;mit.edu;berkeley.edu;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "King's College London;Massachusetts Institute of Technology;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kcl.ac.uk;https://web.mit.edu;https://www.berkeley.edu", "aff_unique_abbr": "KCL;MIT;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Autaptic Synaptic Circuit Enhances Spatio-temporal Predictive Learning of Spiking Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33269", "id": "kAIkYOE5pV", "proceeding": "https://proceedings.mlr.press/v235/wang24ci.html", "pdf": "https://openreview.net/pdf?id=kAIkYOE5pV", "openreview": "https://openreview.net/forum?id=kAIkYOE5pV", "author_site": "Lihao Wang, Zhaofei Yu", "tldr": "", "abstract": "Spiking Neural Networks (SNNs) emulate the integrated-fire-leak mechanism found in biological neurons, offering a compelling combination of biological realism and energy efficiency. In recent years, they have gained considerable research interest. However, existing SNNs predominantly rely on the Leaky Integrate-and-Fire (LIF) model and are primarily suited for simple, static tasks. They lack the ability to effectively model long-term temporal dependencies and facilitate spatial information interaction, which is crucial for tackling complex, dynamic spatio-temporal prediction tasks. To tackle these challenges, this paper draws inspiration from the concept of autaptic synapses in biology and proposes a novel Spatio-Temporal Circuit (STC) model. The STC model integrates two learnable adaptive pathways, enhancing the spiking neurons' temporal memory and spatial coordination. We conduct theoretical analysis of the dynamic parameters in the STC model, highlighting their contribution in establishing long-term memory and mitigating the issue of gradient vanishing. Through extensive experiments on multiple spatio-temporal prediction datasets, we demonstrate that our model outperforms other adaptive models. Furthermore, our model is compatible with existing spiking neuron models, thereby augmenting their dynamic representations. 
In essence, our work enriches the specificity and topological complexity of SNNs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lihao Wang;Zhaofei Yu", "authorids": "~Lihao_Wang4;~Zhaofei_Yu1", "gender": ";M", "homepage": ";https://yuzhaofei.github.io", "dblp": ";166/0573", "google_scholar": ";qaUgD50AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Lihao_Wang4;~Zhaofei_Yu1", "aff": ";Peking University", "aff_domain": ";pku.edu.cn", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nwang2024autaptic,\ntitle={Autaptic Synaptic Circuit Enhances Spatio-temporal Predictive Learning of Spiking Neural Networks},\nauthor={Lihao Wang and Zhaofei Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kAIkYOE5pV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 883978, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4382038567341513786&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 7, "email": ";pku.edu.cn", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Listenable Maps for Audio Classifiers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33268", "id": "kAfYYg6PX8", "proceeding": "https://proceedings.mlr.press/v235/paissan24a.html", "pdf": "https://openreview.net/pdf?id=kAfYYg6PX8", "openreview": "https://openreview.net/forum?id=kAfYYg6PX8", "author_site": "Francesco Paissan, Mirco Ravanelli, Cem Subakan", "tldr": "", "abstract": "Despite the impressive performance of deep learning models across diverse tasks, their complexity poses challenges for interpretation. This challenge is particularly evident for audio signals, where conveying interpretations becomes inherently difficult. To address this issue, we introduce Listenable Maps for Audio Classifiers (L-MAC), a posthoc interpretation method that generates faithful and listenable interpretations. L-MAC utilizes a decoder on top of a pretrained classifier to generate binary masks that highlight relevant portions of the input audio. We train the decoder with a loss function that maximizes the confidence of the classifier decision on the masked-in portion of the audio while minimizing the probability of model output for the masked-out portion. Quantitative evaluations on both in-domain and out-of-domain data demonstrate that L-MAC consistently produces more faithful interpretations than several gradient and masking-based methodologies. 
Furthermore, a user study confirms that, on average, users prefer the interpretations generated by the proposed technique.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesco Paissan;Mirco Ravanelli;Cem Subakan", "authorids": "~Francesco_Paissan1;~Mirco_Ravanelli1;~Cem_Subakan1", "gender": "M;;M", "homepage": "https://francescopaissan.it;https://ycemsubakan.github.io/;https://sites.google.com/site/mircoravanelli/", "dblp": "246/9581;275/7062;138/0284", "google_scholar": "QJtF3yQAAAAJ;zXzV-0UAAAAJ;-6Pj3IYAAAAJ", "orcid": "0000-0002-5553-7935;;", "linkedin": ";;mirco-ravanelli-489b692a/", "or_profile": "~Francesco_Paissan1;~Cem_Subakan1;~Mirco_Ravanellu1", "aff": "Fondazione Bruno Kessler;Concordia University;Montreal Institute for Learning Algorithms, University of Montreal, University of Montreal", "aff_domain": "fbk.eu;concordia.ca;mila.umontreal.ca", "position": "Researcher;Affiliate Assistant Professor;Postdoc", "bibtex": "@inproceedings{\npaissan2024listenable,\ntitle={Listenable Maps for Audio Classifiers},\nauthor={Francesco Paissan and Mirco Ravanelli and Cem Subakan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kAfYYg6PX8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2058056, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6696982596289073571&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 11, "email": "fbk.eu;concordia.ca;mila.umontreal.ca", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Fondazione Bruno Kessler;Concordia University;University of Montreal", "aff_unique_dep": ";;Montreal Institute for Learning Algorithms", "aff_unique_url": "https://www.fbk.eu;https://www.concordia.ca;https://www.umontreal.ca", "aff_unique_abbr": "FBK;Concordia;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Italy;Canada" }, { "title": "Acquisition Conditioned Oracle for Nongreedy Active Feature Acquisition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33267", "id": "kGXUL6qGso", "proceeding": "https://proceedings.mlr.press/v235/valancius24a.html", "pdf": "https://openreview.net/pdf?id=kGXUL6qGso", "openreview": "https://openreview.net/forum?id=kGXUL6qGso", "author_site": "Michael Valancius, Maxwell Lennon, Junier Oliva", "tldr": "", "abstract": "We develop novel methodology for active feature acquisition (AFA), the study of sequentially acquiring a dynamic subset of features that minimizes acquisition costs whilst still yielding accurate inference. The AFA framework can be useful in a myriad of domains, including health care applications where the cost of acquiring additional features for a patient (in terms of time, money, risk, etc.) can be weighed against the expected improvement to diagnostic performance. Previous approaches for AFA have employed either: deep learning RL techniques, which have difficulty training policies due to a complicated state and action space; deep learning surrogate generative models, which require modeling complicated multidimensional conditional distributions; or greedy policies, which cannot account for jointly informative feature acquisitions. 
We show that we can bypass many of these challenges with a novel, nonparametric oracle based approach, which we coin the acquisition conditioned oracle (ACO). Extensive experiments show the superiority of the ACO to state-of-the-art AFA methods when acquiring features for both predictions and general decision-making.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Valancius;Maxwell Lennon;Junier Oliva", "authorids": "~Michael_Valancius1;~Maxwell_Lennon1;~Junier_Oliva1", "gender": "M;M;M", "homepage": ";https://maxlennon.info;http://lupalab.com", "dblp": ";264/4751;137/8390", "google_scholar": ";https://scholar.google.com/citations?hl=en;", "orcid": ";;", "linkedin": "michael-valancius/;max-lennon-76a99b185/;", "or_profile": "~Michael_Valancius1;~Maxwell_Lennon1;~Junier_Oliva1", "aff": "University of North Carolina at Chapel Hill;University of North Carolina at Chapel Hill;University of North Carolina, Chapel Hill", "aff_domain": "unc.edu;unc.edu;unc.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nvalancius2024acquisition,\ntitle={Acquisition Conditioned Oracle for Nongreedy Active Feature Acquisition},\nauthor={Michael Valancius and Maxwell Lennon and Junier Oliva},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kGXUL6qGso}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 674939, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NvXYMqCiUM0J:scholar.google.com/&scioq=Acquisition+Conditioned+Oracle+for+Nongreedy+Active+Feature+Acquisition&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": "unc.edu;unc.edu;unc.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of North Carolina", "aff_unique_dep": "", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "RoboDreamer: Learning Compositional World Models for Robot Imagination", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33266", "id": "kHjOmAUfVe", "proceeding": "https://proceedings.mlr.press/v235/zhou24f.html", "pdf": "https://openreview.net/pdf?id=kHjOmAUfVe", "openreview": "https://openreview.net/forum?id=kHjOmAUfVe", "author_site": "Siyuan Zhou, Yilun Du, Jiaben Chen, Yandong li, Dit-Yan Yeung, Chuang Gan", "tldr": "", "abstract": "Text-to-video models have demonstrated substantial potential in robotic decision-making, enabling the imagination of realistic plans of future actions as well as accurate environment simulation. However, one major issue in such models is generalization -- models are limited to synthesizing videos subject to language instructions similar to those seen at training time. This is heavily limiting in decision-making, where we seek a powerful world model to synthesize plans of unseen combinations of objects and actions in order to solve previously unseen tasks in new environments. To resolve this issue, we introduce RoboDreamer, an innovative approach for learning a compositional world model by factorizing the video generation. We leverage the natural compositionality of language to parse instructions into a set of lower-level primitives, which we condition a set of models on to generate videos. 
We illustrate how this factorization naturally enables compositional generalization by allowing us to formulate a new natural language instruction as a combination of previously seen components. We further show how such a factorization enables us to add additional multimodal goals, allowing us to specify a video we wish to generate given both natural language instructions and a goal image. Our approach successfully synthesizes video plans for unseen goals in RT-X, enables successful robot execution in simulation, and substantially outperforms monolithic baseline approaches to video generation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siyuan Zhou;Yilun Du;Jiaben Chen;YANDONG LI;Dit-Yan Yeung;Chuang Gan", "authorids": "~Siyuan_Zhou2;~Yilun_Du1;~Jiaben_Chen1;~YANDONG_LI1;~Dit-Yan_Yeung2;~Chuang_Gan1", "gender": ";;M;M;M;M", "homepage": "https://scholar.google.com/citations?user=WjUmtm0AAAAJ&hl=zh-CN;https://yilundu.github.io;https://jiabenchen.github.io;https://cold-winter.github.io/;https://cse.hkust.edu.hk/faculty/dyyeung/;http://people.csail.mit.edu/ganchuang/", "dblp": ";204/4379;297/3054;;41/5668;139/6993", "google_scholar": "WjUmtm0AAAAJ;;egMKh7MAAAAJ;kRLb6PkAAAAJ;nEsOOx8AAAAJ;PTeSCbIAAAAJ", "orcid": ";;0000-0002-2252-1419;0000-0003-2448-1294;0000-0003-3716-8125;", "linkedin": ";;;;;", "or_profile": "~Siyuan_Zhou2;~Yilun_Du1;~Jiaben_Chen1;~YANDONG_LI1;~Dit-Yan_Yeung2;~Chuang_Gan1", "aff": "Hong Kong University of Science and Technology;Massachusetts Institute of Technology;University of Massachusetts at Amherst;Google;Hong Kong University of Science and Technology;University of Massachusetts at Amherst", "aff_domain": "hkust.edu;mit.edu;umass.edu;google.com;ust.hk;umass.edu", "position": "PhD student;PhD student;PhD student;Software Engineer;Chair Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhou2024robodreamer,\ntitle={RoboDreamer: Learning Compositional World Models for Robot Imagination},\nauthor={Siyuan Zhou and Yilun Du and Jiaben Chen and YANDONG LI and Dit-Yan Yeung and Chuang Gan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kHjOmAUfVe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2698425, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5676890862950279368&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 7, "email": "hkust.edu;mit.edu;umass.edu;google.com;ust.hk;umass.edu", "author_num": 6, "aff_unique_index": "0;1;2;3;0;2", "aff_unique_norm": "Hong Kong University of Science and Technology;Massachusetts Institute of Technology;University of Massachusetts Amherst;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.ust.hk;https://web.mit.edu;https://www.umass.edu;https://www.google.com", "aff_unique_abbr": "HKUST;MIT;UMass Amherst;Google", "aff_campus_unique_index": "0;2;3;0;2", "aff_campus_unique": "Hong Kong SAR;;Amherst;Mountain View", "aff_country_unique_index": "0;1;1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Pluvial Flood Emulation with Hydraulics-informed Message Passing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33265", "id": "kIHIA6Lr0B", "proceeding": "https://proceedings.mlr.press/v235/kazadi24a.html", "pdf": "https://openreview.net/pdf?id=kIHIA6Lr0B", "openreview": "https://openreview.net/forum?id=kIHIA6Lr0B", "author_site": 
"Arnold Kazadi, James Doss-Gollin, Arlei Silva", "tldr": "", "abstract": "Machine Learning (ML) has emerged as a promising alternative to numerical methods for physics-based simulation due to its flexibility and efficiency. Flood modeling is a key case study for ML-based simulation due to its relevance as a tool for supporting preventive and emergency measures to mitigate flood risks. However, the complexity of the topography or domain (ground elevation) and the sparsity of the time-evolving precipitations (external forcing) can be challenging for most existing ML approaches for simulating flooding processes in space and time. Another critical challenge is incorporating physics domain knowledge (hydraulics) into these data-driven models. This paper addresses these challenges by introducing a hydraulics-informed graph neural network for flood simulation. Given a (geographical) region and precipitation data, our model predicts water depths in an auto-regressive fashion. We propose a message-passing framework inspired by the conservation of momentum and mass expressed in the shallow-water equations, which describe the physical process of a flooding event. Empirical results on a dataset covering 9 regions and 7 historical precipitation events demonstrate that our model outperforms the best baseline, and can capture the propagation of water flow more effectively, especially at the very early stage of the flooding event when the amount of water in the domain is scarce. Differently from some of the most recent methods for ML-based simulation, which tend to work well only when the domain is a smooth surface (e.g., flat terrain), we show that our solution achieves accurate results for real ground elevation data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arnold Kazadi;James Doss-Gollin;Arlei Lopes da Silva", "authorids": "~Arnold_Kazadi1;~James_Doss-Gollin1;~Arlei_Lopes_da_Silva1", "gender": ";M;M", "homepage": "https://kanz76.github.io/;https://dossgollin-lab.github.io;https://cs.rice.edu/~al110/index.html", "dblp": ";;19/2546", "google_scholar": "8K_HYF8AAAAJ;6ifLBBsAAAAJ;atGtis4AAAAJ", "orcid": "0000-0002-9690-5212;0000-0002-3428-2224;0000-0003-1792-0076", "linkedin": ";jamesdossgollin;", "or_profile": "~Arnold_Kazadi1;~James_Doss-Gollin1;~Arlei_Lopes_da_Silva1", "aff": "Rice University;Rice University;Rice University", "aff_domain": "rice.edu;rice.edu;rice.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nkazadi2024pluvial,\ntitle={Pluvial Flood Emulation with Hydraulics-informed Message Passing},\nauthor={Arnold Kazadi and James Doss-Gollin and Arlei Lopes da Silva},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kIHIA6Lr0B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7630693, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9216142360401123326&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 4, "email": "rice.edu;rice.edu;rice.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Rice University", "aff_unique_dep": "", "aff_unique_url": "https://www.rice.edu", "aff_unique_abbr": "Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ATraDiff: Accelerating Online Reinforcement Learning with Imaginary 
Trajectories", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33264", "id": "kIh7GJmRfD", "proceeding": "https://proceedings.mlr.press/v235/yang24aa.html", "pdf": "https://openreview.net/pdf?id=kIh7GJmRfD", "openreview": "https://openreview.net/forum?id=kIh7GJmRfD", "author_site": "Qianlan Yang, Yu-Xiong Wang", "tldr": "", "abstract": "Training autonomous agents with sparse rewards is a long-standing problem in online reinforcement learning (RL), due to low data efficiency. Prior work overcomes this challenge by extracting useful knowledge from offline data, often accomplished through the learning of action distribution from offline data and utilizing the learned distribution to facilitate online RL. However, since the offline data are given and fixed, the extracted knowledge is inherently limited, making it difficult to generalize to new tasks. We propose a novel approach that leverages offline data to learn a generative diffusion model, coined as Adaptive Trajectory Diffuser (ATraDiff). This model generates synthetic trajectories, serving as a form of data augmentation and consequently enhancing the performance of online RL methods. The key strength of our diffuser lies in its adaptability, allowing it to effectively handle varying trajectory lengths and mitigate distribution shifts between online and offline data. Because of its simplicity, ATraDiff seamlessly integrates with a wide spectrum of RL methods. Empirical evaluation shows that ATraDiff consistently achieves state-of-the-art performance across a variety of environments, with particularly pronounced improvements in complicated settings. Our code and demo video are available at https://atradiff.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qianlan Yang;Yu-Xiong Wang", "authorids": "~Qianlan_Yang1;~Yu-Xiong_Wang1", "gender": "M;", "homepage": "https://github.com/yanQval;https://yxw.cs.illinois.edu/", "dblp": "294/4952;35/10700", "google_scholar": "iV5nuc4AAAAJ;T_Q-xDkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Qianlan_Yang1;~Yu-Xiong_Wang1", "aff": "Amazon;Department of Computer Science, University of Illinois Urbana-Champaign", "aff_domain": "amazon.com;cs.illinois.edu", "position": "Intern;Assistant Professor", "bibtex": "@inproceedings{\nyang2024atradiff,\ntitle={{AT}raDiff: Accelerating Online Reinforcement Learning with Imaginary Trajectories},\nauthor={Qianlan Yang and Yu-Xiong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kIh7GJmRfD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3050624, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9121381314508401071&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "amazon.com;cs.illinois.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Amazon;University of Illinois Urbana-Champaign", "aff_unique_dep": "Amazon.com, Inc.;Department of Computer Science", "aff_unique_url": "https://www.amazon.com;https://illinois.edu", "aff_unique_abbr": "Amazon;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Latent Structures in Network Games via Data-Dependent Gated-Prior Graph Variational Autoencoders", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33263", "id": "kKWjZoaRLv", "proceeding": "https://proceedings.mlr.press/v235/yu24f.html", "pdf": "https://openreview.net/pdf?id=kKWjZoaRLv", "openreview": "https://openreview.net/forum?id=kKWjZoaRLv", "author_site": "XUE YU, Muchen Li, Yan Leng, Renjie Liao", "tldr": "", "abstract": "In network games, individuals interact strategically within network environments to maximize their utilities. However, obtaining network structures is challenging. In this work, we propose an unsupervised learning model, called data-dependent gated-prior graph variational autoencoder (GPGVAE), that infers the underlying latent interaction type (strategic complement vs. substitute) among individuals and the latent network structure based on their observed actions. Specially, we propose a spectral graph neural network (GNN) based encoder to predict the interaction type and a data-dependent gated prior that models network structures conditioned on the interaction type. We further propose a Transformer based mixture of Bernoulli encoder of network structures and a GNN based decoder of game actions. We systematically study the Monte Carlo gradient estimation methods and effectively train our model in a stage-wise fashion. Extensive experiments across various synthetic and real-world network games demonstrate that our model achieves state-of-the-art performances in inferring network structures and well captures interaction types.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xue Yu;Muchen Li;Yan Leng;Renjie Liao", "authorids": "~Xue_Yu2;~Muchen_Li1;~Yan_Leng1;~Renjie_Liao1", "gender": "M;;M;F", "homepage": ";http://web.mit.edu/yleng/www/;https://lrjconan.github.io/;https://www.researchgate.net/", "dblp": "122/2666;;08/8180;", "google_scholar": "8P1cUOQAAAAJ;WfU3qjQAAAAJ;2wrS35MAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Muchen_Li1;~Yan_Leng1;~Renjie_Liao1;~YU_XUE1", "aff": "University of British Columbia;University of Texas, Austin;Department of Electrical and Computer Engineering, The University of British Columbia;Renmin University of China", "aff_domain": "ubc.ca;utexas.edu;ece.ubc.ca;ruc.edu.cn", "position": "PhD student;Assistant Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nyu2024learning,\ntitle={Learning Latent Structures in Network Games via Data-Dependent Gated-Prior Graph Variational Autoencoders},\nauthor={Xue Yu and Muchen Li and Yan Leng and Renjie Liao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kKWjZoaRLv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6882209, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yFqM4V9Rh5YJ:scholar.google.com/&scioq=Learning+Latent+Structures+in+Network+Games+via+Data-Dependent+Gated-Prior+Graph+Variational+Autoencoders&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "ubc.ca;utexas.edu;ece.ubc.ca;ruc.edu.cn", "author_num": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of British Columbia;University of Texas at Austin;Renmin University of China", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ubc.ca;https://www.utexas.edu;http://www.ruc.edu.cn", "aff_unique_abbr": "UBC;UT Austin;RUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Austin;Vancouver", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": 
"Canada;United States;China" }, { "title": "StrWAEs to Invariant Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33262", "id": "kLZZWvqlEm", "proceeding": "https://proceedings.mlr.press/v235/lee24u.html", "pdf": "https://openreview.net/pdf?id=kLZZWvqlEm", "openreview": "https://openreview.net/forum?id=kLZZWvqlEm", "author_site": "Hyunjong Lee, Yedarm Seong, Sungdong Lee, Joong-Ho (Johann) Won", "tldr": "", "abstract": "Autoencoders have become an indispensable tool for generative modeling and representation learning in high dimensions. Imposing structural constraints such as conditional independence in order to capture invariance of latent variables to nuisance information has been attempted through adding *ad hoc* penalties to the loss function mostly in the variational autoencoder (VAE) context, often based on heuristics. This paper demonstrates that Wasserstein autoencoders (WAEs) are highly flexible in embracing such structural constraints. Well-known extensions of VAEs for this purpose are gracefully handled within the framework of WAEs. In particular, given a conditional independence structure of the generative model (decoder), corresponding encoder structure and penalties are derived from the functional constraints that define the WAE. These structural uses of WAEs, termed StrWAEs (\u201cstairways\u201d), open up a principled way of penalizing autoencoders to impose structural constraints. Utilizing these advantages, we present handful of results on semi-supervised classification, conditional generation, and invariant representation tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyunjong Lee;Yedarm Seong;Sungdong Lee;Joong-Ho Won", "authorids": "~Hyunjong_Lee1;~Yedarm_Seong1;~Sungdong_Lee1;~Joong-Ho_Won1", "gender": "M;M;M;", "homepage": "https://leehyunjong.github.io/;https://mybirth0407.github.io;https://github.com/sdlee087;", "dblp": ";;296/9435;", "google_scholar": "https://scholar.google.com/citations?hl=ko;;;", "orcid": ";;0000-0003-0655-5050;", "linkedin": ";;;", "or_profile": "~Hyunjong_Lee1;~Yedarm_Seong1;~Sungdong_Lee1;~Joong-Ho_Won1", "aff": "Seoul National University;Seoul National University;National University of Singapore;", "aff_domain": "snu.ac.kr;snu.ac.kr;nus.edu.sg;", "position": "PhD student;PhD student;Researcher;", "bibtex": "@inproceedings{\nlee2024strwaes,\ntitle={Str{WAE}s to Invariant Representations},\nauthor={Hyunjong Lee and Yedarm Seong and Sungdong Lee and Joong-Ho Won},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kLZZWvqlEm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3831920, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9739252344442380385&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "snu.ac.kr;snu.ac.kr;nus.edu.sg;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Seoul National University;National University of Singapore", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.nus.edu.sg", "aff_unique_abbr": "SNU;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "South Korea;Singapore" }, { "title": "Outlier-Efficient Hopfield Layers for Large Transformer-Based Models", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33261", "id": "kLiDMGJKx1", "proceeding": "https://proceedings.mlr.press/v235/hu24a.html", "pdf": "https://openreview.net/pdf?id=kLiDMGJKx1", "openreview": "https://openreview.net/forum?id=kLiDMGJKx1", "author_site": "Jerry Yao-Chieh Hu, Pei-Hsuan Chang, Haozheng Luo, Hong-Yu Chen, Weijian Li, Wei-Po Wang, Han Liu", "tldr": "", "abstract": "We introduce an Outlier-Efficient Modern Hopfield Model (termed `OutEffHop`) and use it to address the outlier inefficiency problem of training gigantic transformer-based models. Our main contribution is a novel associative memory model facilitating _outlier-efficient_ associative memory retrievals. Interestingly, this memory model manifests a model-based interpretation of an outlier-efficient attention mechanism (`Softmax_1`): it is an approximation of the memory retrieval process of `OutEffHop`. Methodologically, this allows us to introduce novel outlier-efficient Hopfield layers as powerful alternatives to traditional attention mechanisms, with superior post-quantization performance. Theoretically, the Outlier-Efficient Modern Hopfield Model retains and improves the desirable properties of standard modern Hopfield models, including fixed point convergence and exponential storage capacity. Empirically, we demonstrate the efficacy of the proposed model across large-scale transformer-based and Hopfield-based models (including BERT, OPT, ViT, and STanHop-Net), benchmarking against state-of-the-art methods like `Clipped_Softmax` and `Gated_Attention`. Notably, `OutEffHop` achieves an average reduction of 22+% in average kurtosis and 26+% in the maximum infinity norm of model outputs across four models. Code is available at [GitHub](https://github.com/MAGICS-LAB/OutEffHop); future updates are on [arXiv](https://arxiv.org/abs/2404.03828).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jerry Yao-Chieh Hu;Pei-Hsuan Chang;Haozheng Luo;Hong-Yu Chen;Weijian Li;Wei-Po Wang;Han Liu", "authorids": "~Jerry_Yao-Chieh_Hu1;b09202022@ntu.edu.tw;~Haozheng_Luo2;b0976960890@gmail.com;~Weijian_Li2;b09202009@ntu.edu.tw;~Han_Liu4", "gender": ";;;;M;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;;0009-0003-4158-4380;;", "linkedin": ";;;;weijian-li-b52566153/;;", "or_profile": "~Jerry_Yao-Chieh_Hu1;b09202022@ntu.edu.tw;~Haozheng_Luo2;b0976960890@gmail.com;~Weijian_Li2;b09202009@ntu.edu.tw;~Han_Liu4", "aff": ";;;;Northwestern University;;Northwestern University", "aff_domain": ";;;;northwestern.edu;;u.northwestern.edu", "position": ";;;;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nhu2024outlierefficient,\ntitle={Outlier-Efficient Hopfield Layers for Large Transformer-Based Models},\nauthor={Jerry Yao-Chieh Hu and Pei-Hsuan Chang and Haozheng Luo and Hong-Yu Chen and Weijian Li and Wei-Po Wang and Han Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kLiDMGJKx1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3421505, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15642955421441853315&as_sdt=5,40&sciodt=0,40&hl=en", "gs_version_total": 7, "email": ";;;;northwestern.edu;;u.northwestern.edu", "author_num": 7, "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": 
"https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Self-Supervised Coarsening of Unstructured Grid with Automatic Differentiation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33260", "id": "kMBvZ40Iu9", "proceeding": "https://proceedings.mlr.press/v235/shumilin24a.html", "pdf": "https://openreview.net/pdf?id=kMBvZ40Iu9", "openreview": "https://openreview.net/forum?id=kMBvZ40Iu9", "author_site": "Sergei Shumilin, Alexander Ryabov, Nikolay Yavich, Evgeny Burnaev, Vladimir Vanovskiy", "tldr": "", "abstract": "Due to the high computational load of modern numerical simulation, there is a demand for approaches that would reduce the size of discrete problems while keeping the accuracy reasonable. In this work, we present an original algorithm to coarsen an unstructured grid based on the concepts of differentiable physics. We achieve this by employing $k$-means clustering, autodifferentiation and stochastic minimization algorithms. We demonstrate performance of the designed algorithm on two PDEs: a linear parabolic equation which governs slightly compressible fluid flow in porous media and the wave equation. Our results show that in the considered scenarios, we reduced the number of grid points up to 10 times while preserving the modeled variable dynamics in the points of interest. The proposed approach can be applied to the simulation of an arbitrary system described by evolutionary partial differential equations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sergei Shumilin;Alexander Ryabov;Nikolay Yavich;Evgeny Burnaev;Vladimir Vanovskiy", "authorids": "~Sergei_Shumilin1;~Alexander_Ryabov1;~Nikolay_Yavich1;~Evgeny_Burnaev1;~Vladimir_Vanovskiy1", "gender": "M;M;M;M;M", "homepage": ";;https://msc.skoltech.ru/nikolayyavich;http://faculty.skoltech.ru/people/evgenyburnaev;", "dblp": ";;204/3049;144/7845;", "google_scholar": ";;osFmUM8AAAAJ;https://scholar.google.ru/citations?user=pCRdcOwAAAAJ;ogsuTqcAAAAJ", "orcid": "0000-0002-3953-7054;0000-0001-6953-8317;;0000-0001-8424-0690;", "linkedin": ";;;;vladimir-vanovskiy-83108b34", "or_profile": "~Sergei_Shumilin1;~Alexander_Ryabov1;~Nikolay_Yavich1;~Evgeny_Burnaev1;~Vladimir_Vanovskiy1", "aff": "Skolkovo Institute of Science and Technology;Moscow Institute of Physics and Technology;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology", "aff_domain": "skoltech.ru;phystech.edu;skoltech.ru;skoltech.ru;skoltech.ru", "position": "Researcher;PhD student;Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nshumilin2024selfsupervised,\ntitle={Self-Supervised Coarsening of Unstructured Grid with Automatic Differentiation},\nauthor={Sergei Shumilin and Alexander Ryabov and Nikolay Yavich and Evgeny Burnaev and Vladimir Vanovskiy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kMBvZ40Iu9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5327596, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6679523795153645426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "skoltech.ru;phystech.edu;skoltech.ru;skoltech.ru;skoltech.ru", "author_num": 5, 
"aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Skolkovo Institute of Science and Technology;Moscow Institute of Physics and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.skoltech.ru;https://www.mipt.ru/en", "aff_unique_abbr": "Skoltech;MIPT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Russian Federation" }, { "title": "MusicFlow: Cascaded Flow Matching for Text Guided Music Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33259", "id": "kOczKjmYum", "proceeding": "https://proceedings.mlr.press/v235/prajwal24a.html", "pdf": "https://openreview.net/pdf?id=kOczKjmYum", "openreview": "https://openreview.net/forum?id=kOczKjmYum", "author_site": "Prajwal K R, Bowen Shi, Matthew Le, Apoorv Vyas, Andros Tjandra, Mahi Luthra, Baishan Guo, Huiyu Wang, Triantafyllos Afouras, David Kant, Wei-Ning Hsu", "tldr": "", "abstract": "We introduce MusicFlow, a cascaded text-to-music generation model based on flow matching. Based on self-supervised representations to bridge between text descriptions and music audios, we construct two flow matching networks to model the conditional distribution of semantic and acoustic features. Additionally, we leverage masked prediction as the training objective, enabling the model to generalize to other tasks such as music infilling and continuation in a zero-shot manner. Experiments on MusicCaps reveal that the music generated by MusicFlow exhibits superior quality and text coherence despite being over $2\\sim5$ times smaller and requiring $5$ times fewer iterative steps. Simultaneously, the model can perform other music generation tasks and achieves competitive performance in music infilling and continuation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "K R Prajwal;Bowen Shi;Matthew Le;Apoorv Vyas;Andros Tjandra;Mahi Luthra;Baishan Guo;Huiyu Wang;Triantafyllos Afouras;David Kant;Wei-Ning Hsu", "authorids": "~K_R_Prajwal1;~Bowen_Shi1;~Matthew_Le2;~Apoorv_Vyas1;~Andros_Tjandra1;mahiluthra@meta.com;~Baishan_Guo1;~Huiyu_Wang1;~Triantafyllos_Afouras1;~David_Kant1;~Wei-Ning_Hsu2", "gender": ";M;;;;;M;;M;;", "homepage": ";;;https://apoorv2904.github.io/;https://scholar.google.com/citations?user=Bvox_f8AAAAJ&hl=en;;;http://csrhddlam.github.io/;http://www.robots.ox.ac.uk/~afourast/;https://davidkantportfolio.com/;", "dblp": ";;;162/6169;https://dblp.org/search/pid/api?q=author:Andros_Tjandra:;;;;175/5771;;", "google_scholar": ";xqyoorYAAAAJ;;https://scholar.google.com/citations?hl=en;Bvox_f8AAAAJ;;;SnmuYloAAAAJ;https://scholar.google.co.uk/citations?user=TkBHFfgAAAAJ;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;baishan;;;;", "or_profile": "~K_R_Prajwal1;~Bowen_Shi1;~Matthew_Le2;~Apoorv_Vyas1;~Andros_Tjandra1;mahiluthra@meta.com;~Baishan_Guo1;~Huiyu_Wang1;~Triantafyllos_Afouras1;~David_Kant1;~Wei-Ning_Hsu2", "aff": ";Meta Facebook;;Meta ;Meta Facebook;;Meta AI;Meta Platforms;Meta;Meta Ai;", "aff_domain": ";meta.com;;meta.com;facebook.com;;meta.com;meta.com;meta.com;ai.meta.com;", "position": ";Researcher;;Researcher;Researcher;;Data Scientist;Researcher;Researcher;Researcher;", "bibtex": "@inproceedings{\nprajwal2024musicflow,\ntitle={MusicFlow: Cascaded Flow Matching for Text Guided Music Generation},\nauthor={K R Prajwal and Bowen Shi and Matthew Le and Apoorv Vyas and Andros Tjandra and Mahi Luthra and Baishan Guo and Huiyu Wang and Triantafyllos Afouras and David Kant and Wei-Ning 
Hsu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kOczKjmYum}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 893664, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17342638082401406809&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";meta.com;;meta.com;facebook.com;;meta.com;meta.com;meta.com;ai.meta.com;", "author_num": 11, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta Platforms, Inc.", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Prompt-guided Precise Audio Editing with Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33258", "id": "kQ1dwuheR0", "proceeding": "https://proceedings.mlr.press/v235/xu24p.html", "pdf": "https://openreview.net/pdf?id=kQ1dwuheR0", "openreview": "https://openreview.net/forum?id=kQ1dwuheR0", "author_site": "Manjie Xu, Chenxing Li, Duzhen Zhang, dan su, Wei Liang, Dong Yu", "tldr": "", "abstract": "Audio editing involves the arbitrary manipulation of audio content through precise control. Although text-guided diffusion models have made significant advancements in text-to-audio generation, they still face challenges in finding a flexible and precise way to modify target events within an audio track. We present a novel approach, referred to as **PPAE**, which serves as a general module for diffusion models and enables precise audio editing. The editing is based on the input textual prompt only and is entirely training-free. We exploit the cross-attention maps of diffusion models to facilitate accurate local editing and employ a hierarchical local-global pipeline to ensure a smoother editing process. 
Experimental results highlight the effectiveness of our method in various editing tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manjie Xu;Chenxing Li;Duzhen Zhang;Dan Su;Wei Liang;Dong Yu", "authorids": "~Manjie_Xu1;lichenxing007@gmail.com;~Duzhen_Zhang1;~Dan_Su3;~Wei_Liang1;~Dong_Yu2", "gender": "M;;M;M;F;M", "homepage": "https://mjtsu.github.io;;https://bladedancer957.github.io/;;https://liangwei-bit.github.io/web/;https://sites.google.com/view/dongyu888/", "dblp": "322/5851;;235/0398.html;;;71/4598-1", "google_scholar": "j-WwUGEAAAAJ;;o0jlAfwAAAAJ;yE6WZy4AAAAJ;3p6YfBEAAAAJ;tMY31_gAAAAJ", "orcid": ";;0000-0002-4280-431X;;;0000-0003-0520-6844", "linkedin": ";;;dan-su-4948621a/;;dongyu/", "or_profile": "~Manjie_Xu1;lichenxing007@gmail.com;~Duzhen_Zhang1;~Dan_Su3;~Wei_Liang1;~Dong_Yu2", "aff": "Beijing Institute of Technology;;Institute of Automation, Chinese Academy of Sciences;;Beijing Institute of Technology;Tencent AI Lab", "aff_domain": "bit.edu.cn;;ia.ac.cn;;bit.edu.cn;tencent.com", "position": "MS student;;PhD student;;Full Professor;Distinguished Scientist", "bibtex": "@inproceedings{\nxu2024promptguided,\ntitle={Prompt-guided Precise Audio Editing with Diffusion Models},\nauthor={Manjie Xu and Chenxing Li and Duzhen Zhang and Dan Su and Wei Liang and Dong Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kQ1dwuheR0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8516576, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6275725729451670996&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "bit.edu.cn;;ia.ac.cn;;bit.edu.cn;tencent.com", "author_num": 6, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Beijing Institute of Technology;Chinese Academy of Sciences;Tencent", "aff_unique_dep": ";Institute of Automation;Tencent AI Lab", "aff_unique_url": "http://www.bit.edu.cn/;http://www.ia.cas.cn;https://ai.tencent.com", "aff_unique_abbr": "BIT;CAS;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Generative Flows on Discrete State-Spaces: Enabling Multimodal Flows with Applications to Protein Co-Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33257", "id": "kQwSbv0BR4", "proceeding": "https://proceedings.mlr.press/v235/campbell24a.html", "pdf": "https://openreview.net/pdf?id=kQwSbv0BR4", "openreview": "https://openreview.net/forum?id=kQwSbv0BR4", "author_site": "Andrew Campbell, Jason Yim, Regina Barzilay, Tom Rainforth, Tommi Jaakkola", "tldr": "", "abstract": "Combining discrete and continuous data is an important capability for generative models. We present Discrete Flow Models (DFMs), a new flow-based model of discrete data that provides the missing link in enabling flow-based generative models to be applied to multimodal continuous and discrete data problems. Our key insight is that the discrete equivalent of continuous space flow matching can be realized using Continuous Time Markov Chains. DFMs benefit from a simple derivation that includes discrete diffusion models as a specific instance while allowing improved performance over existing diffusion-based approaches. We utilize our DFMs method to build a multimodal flow-based modeling framework. 
We apply this capability to the task of protein co-design, wherein we learn a model for jointly generating protein structure and sequence. Our approach achieves state-of-the-art co-design performance while allowing the same multimodal model to be used for flexible generation of the sequence or structure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrew Campbell;Jason Yim;Regina Barzilay;Tom Rainforth;Tommi Jaakkola", "authorids": "~Andrew_Campbell4;~Jason_Yim1;~Regina_Barzilay1;~Tom_Rainforth1;~Tommi_S._Jaakkola1", "gender": ";;female;M;", "homepage": ";http://people.csail.mit.edu/jyim/;https://www.regina.csail.mit.edu/;http://www.robots.ox.ac.uk/~twgr;", "dblp": "93/3398;278/7337;b/ReginaBarzilay;166/1198;", "google_scholar": ";8wDe9NAAAAAJ;;https://scholar.google.co.uk/citations?user=ieLRNKMAAAAJ;", "orcid": "0000-0003-2086-0238;0000-0003-0575-7400;;;", "linkedin": ";;;;", "or_profile": "~Andrew_Campbell4;~Jason_Yim1;~Regina_Barzilay1;~Tom_Rainforth1;~Tommi_S._Jaakkola1", "aff": "University of Oxford;Massachusetts Institute of Technology;Massachusetts Institute of Technology;;", "aff_domain": "ox.ac.uk;mit.edu;mit.edu;ox.ac.uk;", "position": "PhD student;PhD student;Professor;Postdoc;", "bibtex": "@inproceedings{\ncampbell2024generative,\ntitle={Generative Flows on Discrete State-Spaces: Enabling Multimodal Flows with Applications to Protein Co-Design},\nauthor={Andrew Campbell and Jason Yim and Regina Barzilay and Tom Rainforth and Tommi Jaakkola},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kQwSbv0BR4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6313688, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15623535740306130363&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 10, "email": "ox.ac.uk;mit.edu;mit.edu;ox.ac.uk;", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Oxford;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://web.mit.edu", "aff_unique_abbr": "Oxford;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Variational Schr\u00f6dinger Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33256", "id": "kRv0WPJd00", "proceeding": "https://proceedings.mlr.press/v235/deng24c.html", "pdf": "https://openreview.net/pdf?id=kRv0WPJd00", "openreview": "https://openreview.net/forum?id=kRv0WPJd00", "author_site": "Wei Deng, Weijian Luo, Yixin Tan, Marin Bilo\u0161, Yu Chen, Yuriy Nevmyvaka, Ricky T. Q. Chen", "tldr": "", "abstract": "Schr\u00f6dinger bridge (SB) has emerged as the go-to method for optimizing transportation plans in diffusion models. However, SB requires estimating the intractable forward score functions, inevitably resulting in the (costly) implicit training loss based on simulated trajectories. To improve the scalability while preserving efficient transportation plans, we leverage variational inference to linearize the forward score functions (variational scores) of SB and restore *simulation-free* properties in training backward scores. 
We propose the variational Schr\u00f6dinger diffusion model (VSDM), where the forward process is a multivariate diffusion and the variational scores are adaptively optimized for efficient transport. Theoretically, we use stochastic approximation to prove the convergence of the variational scores and show the convergence of the adaptively generated samples based on the optimal variational scores. Empirically, we test the algorithm in simulated examples and observe that VSDM is efficient in generations of anisotropic shapes and yields straighter sample trajectories compared to the single-variate diffusion. We also verify the scalability of the algorithm in real-world data and achieve competitive unconditional generation performance in CIFAR10 and conditional generation in time series modeling. Notably, VSDM no longer depends on warm-up initializations required by SB.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Deng;Weijian Luo;Yixin Tan;Marin Bilo\u0161;Yu Chen;Yuriy Nevmyvaka;Ricky T. Q. Chen", "authorids": "~Wei_Deng1;~Weijian_Luo1;~Yixin_Tan1;~Marin_Bilo\u01611;~Yu_Chen15;~Yuriy_Nevmyvaka1;~Ricky_T._Q._Chen1", "gender": "M;;M;;M;;", "homepage": "https://waynedw.github.io/;;;;;;", "dblp": "69/508-2;;;;;92/1859;", "google_scholar": "IYiyxssAAAAJ;;3AGaybIAAAAJ;;;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;;", "linkedin": ";;yixin-tan-0b9b0a199/;;yu-chen-b0249b79/;;", "or_profile": "~Wei_Deng1;~Weijian_Luo1;~Yixin_Tan1;~Marin_Bilo\u01611;~Yu_Chen15;~Yuriy_Nevmyvaka1;~Ricky_T._Q._Chen1", "aff": "Morgan Stanley;;Duke University;;Morgan Stanley;Morgan Stanley;", "aff_domain": "morganstanley.com;;duke.edu;;morganstanley.com;morganstanley.com;", "position": "Researcher;;PhD student;;Researcher;Principal Researcher;", "bibtex": "@inproceedings{\ndeng2024variational,\ntitle={Variational Schr\\\"odinger Diffusion Models},\nauthor={Wei Deng and Weijian Luo and Yixin Tan and Marin Bilo{\\v{s}} and Yu Chen and Yuriy Nevmyvaka and Ricky T. Q. Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kRv0WPJd00}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6244900, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16608866848270130409&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "morganstanley.com;;duke.edu;;morganstanley.com;morganstanley.com;", "author_num": 7, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Morgan Stanley;Duke University", "aff_unique_dep": ";", "aff_unique_url": "https://www.morganstanley.com;https://www.duke.edu", "aff_unique_abbr": "Morgan Stanley;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fewer Truncations Improve Language Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33255", "id": "kRxCDDFNpp", "proceeding": "https://proceedings.mlr.press/v235/ding24f.html", "pdf": "https://openreview.net/pdf?id=kRxCDDFNpp", "openreview": "https://openreview.net/forum?id=kRxCDDFNpp", "author_site": "Hantian Ding, Zijian Wang, Giovanni Paolini, Varun Kumar, Anoop Deoras, Dan Roth, Stefano Soatto", "tldr": "", "abstract": "In large language model training, input documents are typically concatenated together and then split into sequences of equal length to avoid padding tokens. 
Despite its efficiency, the concatenation approach compromises data integrity\u2014it inevitably breaks many documents into incomplete pieces, leading to excessive truncations that hinder the model from learning to compose logically coherent and factually consistent content that is grounded on the complete context. To address the issue, we propose Best-fit Packing, a scalable and efficient method that packs documents into training sequences through length-aware combinatorial optimization. Our method completely eliminates unnecessary truncations while retaining the same training efficiency as concatenation. Empirical results from both text and code pre-training show that our method achieves superior performance (e.g., +4.7% on reading comprehension; +16.8% in context following; and +9.2% on program synthesis), and reduces closed-domain hallucination effectively by up to 58.3%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hantian Ding;Zijian Wang;Giovanni Paolini;Varun Kumar;Anoop Deoras;Dan Roth;Stefano Soatto", "authorids": "~Hantian_Ding1;~Zijian_Wang1;~Giovanni_Paolini1;~Varun_Kumar3;~Anoop_Deoras1;~Dan_Roth3;~Stefano_Soatto3", "gender": "M;;M;M;M;M;", "homepage": ";;http://giovannipaolini.org;https://varunkumar-dev.github.io/;;https://www.cis.upenn.edu/~danroth/;", "dblp": "242/8095;;150/6260;;55/8761;r/DanRoth;", "google_scholar": "nEuMO58AAAAJ;;https://scholar.google.it/citations?user=xGI18C0AAAAJ;d-La2lQAAAAJ;QF_rhCIAAAAJ;E-bpPWgAAAAJ;", "orcid": ";;0000-0002-3964-9101;;;;", "linkedin": ";;g-paolini/;varunin/;anoopdeoras/;dan-roth-8667361/;", "or_profile": "~Hantian_Ding1;~Zijian_Wang1;~Giovanni_Paolini1;~Varun_Kumar3;~Anoop_Deoras1;~Dan_Roth3;~Stefano_Soatto3", "aff": "Amazon;;University of Bologna;Amazon;Amazon;Amazon;", "aff_domain": "amazon.com;;unibo.it;amazon.com;amazon.com;amazon.com;", "position": "Researcher;;Associate Professor;Principal Researcher;Principal Researcher;VP and Distinguished Scientist;", "bibtex": "@inproceedings{\nding2024fewer,\ntitle={Fewer Truncations Improve Language Modeling},\nauthor={Hantian Ding and Zijian Wang and Giovanni Paolini and Varun Kumar and Anoop Deoras and Dan Roth and Stefano Soatto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kRxCDDFNpp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2535191, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17903305561138434584&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 10, "email": "amazon.com;;unibo.it;amazon.com;amazon.com;amazon.com;", "author_num": 7, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Amazon;University of Bologna", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.unibo.it", "aff_unique_abbr": "Amazon;Unibo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Italy" }, { "title": "Accelerating Transformer Pre-training with 2:4 Sparsity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33254", "id": "kTaX87Zn6M", "proceeding": "https://proceedings.mlr.press/v235/hu24r.html", "pdf": "https://openreview.net/pdf?id=kTaX87Zn6M", "openreview": "https://openreview.net/forum?id=kTaX87Zn6M", "author_site": "Yuezhou Hu, Kang Zhao, Weiyu Huang, Jianfei Chen, Jun Zhu", "tldr": "", "abstract": 
"Training large transformers is slow, but recent innovations on GPU architecture give us an advantage. NVIDIA Ampere GPUs can execute a fine-grained 2:4 sparse matrix multiplication twice as fast as its dense equivalent. In the light of this property, we comprehensively investigate the feasibility of accelerating feed-forward networks (FFNs) of transformers in pre-training. First, we define a ``flip rate'' to monitor the stability of a 2:4 training process. Utilizing this metric, we propose three techniques to preserve accuracy: to modify the sparse-refined straight-through estimator by applying the masked decay term on gradients, to determine a feasible decay factor in warm-up stage, and to enhance the model's quality by a dense fine-tuning procedure near the end of pre-training. Besides, we devise two techniques to practically accelerate training: to calculate transposable 2:4 masks by convolution, and to accelerate gated activation functions by reducing GPU L2 cache miss. Experiments show that our 2:4 sparse training algorithm achieves similar convergence to dense training algorithms on several transformer pre-training tasks, while actual acceleration can be observed on different shapes of transformer block apparently. Our toolkit is available at https://github.com/huyz2023/2by4-pretrain.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuezhou Hu;Kang Zhao;Weiyu Huang;Jianfei Chen;Jun Zhu", "authorids": "~Yuezhou_Hu1;~Kang_Zhao5;~Weiyu_Huang2;~Jianfei_Chen1;~Jun_Zhu2", "gender": "M;M;M;M;M", "homepage": "https://yuezhouhu.github.io/;https://www.homepage.url;;http://ml.cs.tsinghua.edu.cn/~jianfei;http://ml.cs.tsinghua.edu.cn/~jun", "dblp": ";;;48/6809-1;50/2644-1", "google_scholar": ";vXXcc7MAAAAJ;9HZRihEAAAAJ;di5RZ1MAAAAJ;axsP38wAAAAJ", "orcid": ";;;;", "linkedin": ";;weiyu-huang-352b0a231/;;", "or_profile": "~Yuezhou_Hu1;~Kang_Zhao5;~Weiyu_Huang2;~Jianfei_Chen1;~Jun_Zhu2", "aff": "Georgia Institute of Technology;Huawei Technologies Ltd.;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "gatech.edu;huawei.com;mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn", "position": "Intern;Researcher;MS student;Associate Professor;Professor", "bibtex": "@inproceedings{\nhu2024accelerating,\ntitle={Accelerating Transformer Pre-training with 2:4 Sparsity},\nauthor={Yuezhou Hu and Kang Zhao and Weiyu Huang and Jianfei Chen and Jun Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kTaX87Zn6M}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6336469, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4843723809338535297&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "gatech.edu;huawei.com;mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Georgia Institute of Technology;Huawei;Tsinghua University", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.gatech.edu;https://www.huawei.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Georgia Tech;Huawei;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;China" }, { "title": "A Generative Approach for Treatment Effect Estimation under Collider Bias: From an Out-of-Distribution Perspective", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33253", "id": "kUj9b2CezT", "proceeding": "https://proceedings.mlr.press/v235/li24al.html", "pdf": "https://openreview.net/pdf?id=kUj9b2CezT", "openreview": "https://openreview.net/forum?id=kUj9b2CezT", "author_site": "Baohong Li, Haoxuan Li, Anpeng Wu, Minqin Zhu, shiyuan Peng, Qingyu Cao, Kun Kuang", "tldr": "", "abstract": "Resulting from non-random sample selection caused by both the treatment and outcome, collider bias poses a unique challenge to treatment effect estimation using observational data whose distribution differs from that of the target population. In this paper, we rethink collider bias from an out-of-distribution (OOD) perspective, considering that the entire data space of the target population consists of two different environments: The observational data selected from the target population belongs to a seen environment labeled with $S=1$ and the missing unselected data belongs to another unseen environment labeled with $S=0$. Based on this OOD formulation, we utilize small-scale representative data from the entire data space with no environmental labels and propose a novel method, i.e., Coupled Counterfactual Generative Adversarial Model (C$^2$GAM), to simultaneously generate the missing $S=0$ samples in observational data and the missing $S$ labels in the small-scale representative data. With the help of C$^2$GAM, collider bias can be addressed by combining the generated $S=0$ samples and the observational data to estimate treatment effects. Extensive experiments on synthetic and real-world data demonstrate that plugging C$^2$GAM into existing treatment effect estimators achieves significant performance improvements.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Baohong Li;Haoxuan Li;Anpeng Wu;Minqin Zhu;shiyuan Peng;Qingyu Cao;Kun Kuang", "authorids": "~Baohong_Li1;~Haoxuan_Li6;~Anpeng_Wu1;~Minqin_Zhu1;~shiyuan_Peng1;~Qingyu_Cao1;~Kun_Kuang1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://haoxuanli-pku.github.io/;https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ&hl=zh-CN&oi=sra;https://scholar.google.com/citations?user=bNFv_sUAAAAJ;https://github.com/cqupeng;https://www.taobao.com;http://kunkuang.github.io", "dblp": "83/3116;145/4965-1.html;267/5637;371/6014.html;;;194/4245", "google_scholar": "M08DvYsAAAAJ;gtDqiucAAAAJ;https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ;bNFv_sUAAAAJ;;;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ", "orcid": "0000-0002-3222-002X;0000-0003-3620-3769;0000-0003-3898-7122;0009-0008-9527-8895;;;0009-0000-7528-8131", "linkedin": ";;;;;;", "or_profile": "~Baohong_Li1;~Haoxuan_Li6;~Anpeng_Wu1;~Minqin_Zhu1;~shiyuan_Peng1;~Qingyu_Cao1;~Kun_Kuang1", "aff": "Zhejiang University;Peking University;Mohamed bin Zayed University of Artificial Intelligence;Zhejiang University;;;Zhejiang University", "aff_domain": "zju.edu.cn;pku.edu.cn;mbzuai.ac.ae;zju.edu.cn;;;zju.edu.cn", "position": "PhD student;PhD student;Researcher;PhD student;;;Associate Professor", "bibtex": "@inproceedings{\nli2024a,\ntitle={A Generative Approach for Treatment Effect Estimation under Collider Bias: From an Out-of-Distribution Perspective},\nauthor={Baohong Li and Haoxuan Li and Anpeng Wu and Minqin Zhu and shiyuan Peng and Qingyu Cao and Kun Kuang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kUj9b2CezT}\n}", "github": "", "project": "", "reviewers": "", 
"pdf_size": 801228, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15531584534610604836&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "zju.edu.cn;pku.edu.cn;mbzuai.ac.ae;zju.edu.cn;;;zju.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Zhejiang University;Peking University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;http://www.pku.edu.cn;https://mbzuai.ac.ae", "aff_unique_abbr": "ZJU;Peking U;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United Arab Emirates" }, { "title": "Slicedit: Zero-Shot Video Editing With Text-to-Image Diffusion Models Using Spatio-Temporal Slices", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33252", "id": "kUm9iuvwIQ", "proceeding": "https://proceedings.mlr.press/v235/cohen24a.html", "pdf": "https://openreview.net/pdf?id=kUm9iuvwIQ", "openreview": "https://openreview.net/forum?id=kUm9iuvwIQ", "author_site": "Nathaniel Cohen, Vladimir Kulikov, Matan Kleiner, Inbar Huberman-Spiegelglas, Tomer Michaeli", "tldr": "", "abstract": "Text-to-image (T2I) diffusion models achieve state-of-the-art results in image synthesis and editing. However, leveraging such pre-trained models for video editing is considered a major challenge. Many existing works attempt to enforce temporal consistency in the edited video through explicit correspondence mechanisms, either in pixel space or between deep features. These methods, however, struggle with strong nonrigid motion. In this paper, we introduce a fundamentally different approach, which is based on the observation that spatiotemporal slices of natural videos exhibit similar characteristics to natural images. Thus, the same T2I diffusion model that is normally used only as a prior on video frames, can also serve as a strong prior for enhancing temporal consistency by applying it on spatiotemporal slices. Based on this observation, we present Slicedit, a method for text-based video editing that utilizes a pre-trained T2I diffusion model to process both spatial and spatiotemporal slices. Our method generates videos that retain the structure and motion of the original video while adhering to the target text. 
Through extensive experiments, we demonstrate Slicedit's ability to edit a wide range of real-world videos, confirming its clear advantages compared to existing baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nathaniel Cohen;Vladimir Kulikov;Matan Kleiner;Inbar Huberman-Spiegelglas;Tomer Michaeli", "authorids": "~Nathaniel_Cohen1;~Vladimir_Kulikov1;~Matan_Kleiner1;~Inbar_Huberman-Spiegelglas1;~Tomer_Michaeli1", "gender": "M;M;M;F;M", "homepage": ";;https://matankleiner.github.io/;https://inbarhub.github.io/www/;https://tomer.net.technion.ac.il/", "dblp": ";264/6431;334/7700;;70/3188.html", "google_scholar": ";b8lCA_MAAAAJ;n3R271gAAAAJ;YG_1cdEAAAAJ;n2EbR2cAAAAJ", "orcid": ";0009-0001-5963-2083;0009-0000-2621-5286;;", "linkedin": "nathaniel-cohen-6032b7235/;vladimir-kulikov/;matan-kleiner/;;", "or_profile": "~Nathaniel_Cohen1;~Vladimir_Kulikov1;~Matan_Kleiner1;~Inbar_Huberman-Spiegelglas1;~Tomer_Michaeli1", "aff": "Ecole Normale Superieure;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion, Technion", "aff_domain": "ens-paris-saclay.fr;campus.technion.ac.il;campus.technion.ac.il;campus.technion.ac.il;technion.ac.il", "position": "MS student;PhD student;MS student;Postdoc;Associate Professor", "bibtex": "@inproceedings{\ncohen2024slicedit,\ntitle={Slicedit: Zero-Shot Video Editing With Text-to-Image Diffusion Models Using Spatio-Temporal Slices},\nauthor={Nathaniel Cohen and Vladimir Kulikov and Matan Kleiner and Inbar Huberman-Spiegelglas and Tomer Michaeli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kUm9iuvwIQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10063076, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16574042426081816701&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ens-paris-saclay.fr;campus.technion.ac.il;campus.technion.ac.il;campus.technion.ac.il;technion.ac.il", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Ecole Normale Superieure;Technion - Israel Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ens.fr;https://www.technion.ac.il/en/", "aff_unique_abbr": "ENS;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "France;Israel" }, { "title": "Towards the Theory of Unsupervised Federated Learning: Non-asymptotic Analysis of Federated EM Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33251", "id": "kVgpa1rfLO", "proceeding": "https://proceedings.mlr.press/v235/tian24e.html", "pdf": "https://openreview.net/pdf?id=kVgpa1rfLO", "openreview": "https://openreview.net/forum?id=kVgpa1rfLO", "author_site": "Ye Tian, Haolei Weng, Yang Feng", "tldr": "", "abstract": "While supervised federated learning approaches have enjoyed significant success, the domain of unsupervised federated learning remains relatively underexplored. Several federated EM algorithms have gained popularity in practice; however, their theoretical foundations are often lacking. 
In this paper, we first introduce a federated gradient EM algorithm (FedGrEM) designed for the unsupervised learning of mixture models, which supplements the existing federated EM algorithms by considering task heterogeneity and potential adversarial attacks. We present a comprehensive finite-sample theory that holds for general mixture models, then apply this general theory to specific statistical models to characterize the explicit estimation error of model parameters and mixture proportions. Our theory elucidates when and how FedGrEM outperforms local single-task learning, with insights extending to existing federated EM algorithms. This bridges the gap between their practical success and theoretical understanding. Our numerical results validate our theory, and demonstrate FedGrEM's superiority over existing unsupervised federated learning benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ye Tian;Haolei Weng;Yang Feng", "authorids": "~Ye_Tian9;wenghaol@msu.edu;~Yang_Feng5", "gender": "M;;M", "homepage": "http://www.columbia.edu/~yt2661/;;http://yangfeng.hosting.nyu.edu/", "dblp": ";;", "google_scholar": "Bb6xOPUAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ye_Tian9;wenghaol@msu.edu;~Yang_Feng5", "aff": "Columbia University;;New York University", "aff_domain": "columbia.edu;;nyu.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\ntian2024towards,\ntitle={Towards the Theory of Unsupervised Federated Learning: Non-asymptotic Analysis of Federated {EM} Algorithms},\nauthor={Ye Tian and Haolei Weng and Yang Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kVgpa1rfLO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1851121, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10700917508465909575&as_sdt=5,30&sciodt=0,30&hl=en", "gs_version_total": 7, "email": "columbia.edu;;nyu.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Columbia University;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.columbia.edu;https://www.nyu.edu", "aff_unique_abbr": "Columbia;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "R2E: Turning any Github Repository into a Programming Agent Environment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33250", "id": "kXHgEYFyf3", "proceeding": "https://proceedings.mlr.press/v235/jain24c.html", "pdf": "https://openreview.net/pdf?id=kXHgEYFyf3", "openreview": "https://openreview.net/forum?id=kXHgEYFyf3", "author_site": "Naman Jain, Manish Shetty Molahalli, Tianjun Zhang, Shangdian Han, Koushik Sen, Ion Stoica", "tldr": "", "abstract": "While Large Language Models\u2019 (LLMs) coding capabilities have advanced rapidly, corresponding evaluation benchmarks on real-world programming setups are yet to catch up. Building a scalable and interactive testbed for evaluating general-purpose AI coding agents for real-world code has been challenging, particularly due to a lack of available high-quality test suites.
In this paper, we present Repository to Environment (R2E), a framework that can turn any GitHub repository into a test environment to evaluate the performance of code-generating systems, both static and interactive. R2E is powered by a synergistic combination of program analysis and LLMs to construct equivalence test harnesses for any GitHub function. We instantiate our framework to build the first large-scale benchmark, R2E-Eval1, for building realistic environments for AI coding assistants. Our results demonstrate that even when SOTA models cannot generate correct solutions with advanced prompting techniques, they can effectively use environment feedback, highlighting the need to move from static functional coding to an interactive programming paradigm. We hope that our framework (and the instantiated benchmark) can motivate research directions by providing web-scale open-ended coding environments. R2E code is available at https://r2e.dev/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Naman Jain;Manish Shetty;Tianjun Zhang;King Han;Koushik Sen;Ion Stoica", "authorids": "~Naman_Jain2;~Manish_Shetty1;~Tianjun_Zhang1;kingh0730@berkeley.edu;~Koushik_Sen2;~Ion_Stoica1", "gender": "M;M;;;M;M", "homepage": "https://naman-ntc.github.io/;https://manishs.org;https://tianjunz.github.io;;https://people.eecs.berkeley.edu/~ksen/;http://people.eecs.berkeley.edu/~istoica/", "dblp": ";270/0520;;;https://dblp.uni-trier.de/pid/04/418.html;s/IonStoica", "google_scholar": "6oqV3v8AAAAJ;Fcu7r3YAAAAJ;UE9jz_MAAAAJ;;Vn3L_ioAAAAJ;vN-is70AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;ionstoica", "or_profile": "~Naman_Jain2;~Manish_Shetty1;~Tianjun_Zhang1;kingh0730@berkeley.edu;~Koushik_Sen2;~Ion_Stoica1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;;UC Berkeley, University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;;cs.berkeley.edu;berkeley.edu", "position": "PhD student;PhD student;PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\njain2024re,\ntitle={R2E: Turning any Github Repository into a Programming Agent Environment},\nauthor={Naman Jain and Manish Shetty and Tianjun Zhang and King Han and Koushik Sen and Ion Stoica},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kXHgEYFyf3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2488293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14721741205522106252&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 5, "email": "berkeley.edu;berkeley.edu;berkeley.edu;;cs.berkeley.edu;berkeley.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Parameter Estimation in DAGs from Incomplete Data via Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33249", "id": "kXde6Qa6Uy", "proceeding": "https://proceedings.mlr.press/v235/vo24a.html", "pdf": "https://openreview.net/pdf?id=kXde6Qa6Uy", "openreview":
"https://openreview.net/forum?id=kXde6Qa6Uy", "author_site": "Vy Vo, Trung Le, Tung-Long Vuong, He Zhao, Edwin V. Bonilla, Dinh Phung", "tldr": "", "abstract": "Estimating the parameters of a probabilistic directed graphical model from incomplete data is a long-standing challenge. This is because, in the presence of latent variables, both the likelihood function and posterior distribution are intractable without assumptions about structural dependencies or model classes. While existing learning methods are fundamentally based on likelihood maximization, here we offer a new view of the parameter learning problem through the lens of optimal transport. This perspective licenses a general framework that operates on any directed graphs without making unrealistic assumptions on the posterior over the latent variables or resorting to variational approximations. We develop a theoretical framework and support it with extensive empirical evidence demonstrating the versatility and robustness of our approach. Across experiments, we show that not only can our method effectively recover the ground-truth parameters but it also performs comparably or better than competing baselines on downstream applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vy Vo;Trung Le;Long Tung Vuong;He Zhao;Edwin V. Bonilla;Dinh Phung", "authorids": "~Vy_Vo2;~Trung_Le2;~Long_Tung_Vuong1;~He_Zhao1;~Edwin_V._Bonilla1;~Dinh_Phung2", "gender": "F;M;M;;;", "homepage": "https://isvy08.github.io/;;;;;", "dblp": "176/4660;;329/6838;;;", "google_scholar": "3CpFpFkAAAAJ;https://scholar.google.com/citations?hl=en;DCC657sAAAAJ;;;", "orcid": ";;;;;", "linkedin": ";;long-vuong-783477131/;;;", "or_profile": "~Vy_Vo2;~Trung_Le2;~Long_Tung_Vuong1;~He_Zhao1;~Edwin_V._Bonilla1;~Dinh_Phung2", "aff": "Monash University;Monash University;Monash University;;;", "aff_domain": "monash.edu;monash.edu;monash.edu;;;", "position": "PhD student;Assistant Professor;PhD student;;;", "bibtex": "@inproceedings{\nvo2024parameter,\ntitle={Parameter Estimation in {DAG}s from Incomplete Data via Optimal Transport},\nauthor={Vy Vo and Trung Le and Long Tung Vuong and He Zhao and Edwin V. 
Bonilla and Dinh Phung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kXde6Qa6Uy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2435195, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12946875354456598711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "monash.edu;monash.edu;monash.edu;;;", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Monash University", "aff_unique_dep": "", "aff_unique_url": "https://www.monash.edu", "aff_unique_abbr": "Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Australia" }, { "title": "Intersecting-Boundary-Sensitive Fingerprinting for Tampering Detection of DNN Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33248", "id": "kZArjKc64o", "proceeding": "https://proceedings.mlr.press/v235/xiaofan24a.html", "pdf": "https://openreview.net/pdf?id=kZArjKc64o", "openreview": "https://openreview.net/forum?id=kZArjKc64o", "author_site": "Xiaofan Bai, Chaoxiang He, Xiaojing Ma, Bin Zhu, Hai Jin", "tldr": "", "abstract": "Cloud-based AI services offer numerous benefits but also introduce vulnerabilities, allowing for tampering with deployed DNN models, ranging from injecting malicious behaviors to reducing computing resources. Fingerprint samples are generated to query models to detect such tampering. In this paper, we present Intersecting-Boundary-Sensitive Fingerprinting (IBSF), a novel method for black-box integrity verification of DNN models using only top-1 labels. Recognizing that tampering with a model alters its decision boundary, IBSF crafts fingerprint samples from normal samples by maximizing the partial Shannon entropy of a selected subset of categories to position the fingerprint samples near decision boundaries where the categories in the subset intersect. These fingerprint samples are almost indistinguishable from their source samples. We theoretically establish and confirm experimentally that these fingerprint samples' expected sensitivity to tampering increases with the cardinality of the subset. Extensive evaluation demonstrates that IBSF surpasses existing state-of-the-art fingerprinting methods, particularly with larger subset cardinality, establishing its state-of-the-art performance in black-box tampering detection using only top-1 labels. 
The IBSF code is available at https://github.com/CGCL-codes/IBSF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bai Xiaofan;Chaoxiang He;Xiaojing Ma;Bin Benjamin Zhu;Hai Jin", "authorids": "~Bai_Xiaofan1;~Chaoxiang_He1;~Xiaojing_Ma1;~Bin_Benjamin_Zhu1;~Hai_Jin1", "gender": "M;M;F;M;M", "homepage": "https://github.com/yutou520131;;;https://www.microsoft.com/en-us/research/people/binzhu/;http://www.linkedin.com/in/jinhust", "dblp": "384/4279.html;306/1330;45/7549;85/5693.html;98/4156", "google_scholar": "RVa58jIAAAAJ;mlqKpCcAAAAJ;https://scholar.google.com/citations?hl=en;zyXRIGgAAAAJ;", "orcid": ";;;0000-0002-3571-7808;0000-0002-3934-7605", "linkedin": ";;;;jinhust", "or_profile": "~Bai_Xiaofan1;~Chaoxiang_He1;~Xiaojing_Ma1;~Bin_Benjamin_Zhu1;~Hai_Jin1", "aff": "Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Microsoft AI Asia;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;hust.edu.cn;hust.edu.cn;microsoft.com;hust.edu.cn", "position": "PhD student;PhD student;Full Professor;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nxiaofan2024intersectingboundarysensitive,\ntitle={Intersecting-Boundary-Sensitive Fingerprinting for Tampering Detection of {DNN} Models},\nauthor={Bai Xiaofan and Chaoxiang He and Xiaojing Ma and Bin Benjamin Zhu and Hai Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kZArjKc64o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2073000, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3QWWTi1U4ioJ:scholar.google.com/&scioq=Intersecting-Boundary-Sensitive+Fingerprinting+for+Tampering+Detection+of+DNN+Models&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "hust.edu.cn;hust.edu.cn;hust.edu.cn;microsoft.com;hust.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft", "aff_unique_dep": ";Microsoft AI", "aff_unique_url": "http://www.hust.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "HUST;MSFT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;Unknown" }, { "title": "More Benefits of Being Distributional: Second-Order Bounds for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33247", "id": "kZBCFQe1Ej", "proceeding": "https://proceedings.mlr.press/v235/wang24ba.html", "pdf": "https://openreview.net/pdf?id=kZBCFQe1Ej", "openreview": "https://openreview.net/forum?id=kZBCFQe1Ej", "author_site": "Kaiwen Wang, Owen Oertell, Alekh Agarwal, Nathan Kallus, Wen Sun", "tldr": "", "abstract": "In this paper, we prove that Distributional Reinforcement Learning (DistRL), which learns the return distribution, can obtain second-order bounds in both online and offline RL in general settings with function approximation. Second-order bounds are instance-dependent bounds that scale with the variance of return, which we prove are tighter than the previously known small-loss bounds of distributional RL. To the best of our knowledge, our results are the first second-order bounds for low-rank MDPs and for offline RL. 
When specializing to contextual bandits (a one-step RL problem), we show that a distributional learning based optimism algorithm achieves a second-order worst-case regret bound, and a second-order gap-dependent bound, simultaneously. We also empirically demonstrate the benefit of DistRL in contextual bandits on real-world datasets. We highlight that our analysis with DistRL is relatively simple, follows the general framework of optimism in the face of uncertainty and does not require weighted regression. Our results suggest that DistRL is a promising framework for obtaining second-order bounds in general RL settings, thus further reinforcing the benefits of DistRL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaiwen Wang;Owen Oertell;Alekh Agarwal;Nathan Kallus;Wen Sun", "authorids": "~Kaiwen_Wang1;~Owen_Oertell1;~Alekh_Agarwal2;~Nathan_Kallus1;~Wen_Sun1", "gender": "M;Not Specified;M;;", "homepage": "https://kaiwenw.github.io/;https://owenoertell.com;https://alekhagarwal.net;http://nathankallus.com/;https://wensun.github.io", "dblp": "220/3822;368/5285;;142/2900;", "google_scholar": "HsMheBUAAAAJ;https://scholar.google.com/citations?hl=en;9nnDvooAAAAJ;K2WfIlsAAAAJ;iOLC30YAAAAJ", "orcid": ";;;0000-0003-1672-0507;", "linkedin": "kaiwenw/;;;;", "or_profile": "~Kaiwen_Wang1;~Owen_Oertell1;~Alekh_Agarwal2;~Nathan_Kallus1;~Wen_Sun1", "aff": "Department of Computer Science, Cornell University;Cornell University;Google;Cornell University;Cornell University", "aff_domain": "cs.cornell.edu;cornell.edu;google.com;cornell.edu;cornell.edu", "position": "PhD student;Undergrad student;Researcher;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024more,\ntitle={More Benefits of Being Distributional: Second-Order Bounds for Reinforcement Learning},\nauthor={Kaiwen Wang and Owen Oertell and Alekh Agarwal and Nathan Kallus and Wen Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kZBCFQe1Ej}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 629173, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17715548806917745634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cs.cornell.edu;cornell.edu;google.com;cornell.edu;cornell.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Cornell University;Google", "aff_unique_dep": "Department of Computer Science;Google", "aff_unique_url": "https://www.cornell.edu;https://www.google.com", "aff_unique_abbr": "Cornell;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Hyperbolic Optimizer as a Dynamical System", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33246", "id": "kZKopcDp2q", "proceeding": "https://proceedings.mlr.press/v235/alvarado24a.html", "pdf": "https://openreview.net/pdf?id=kZKopcDp2q", "openreview": "https://openreview.net/forum?id=kZKopcDp2q", "author_site": "Nico Alvarado, Hans Lobel", "tldr": "", "abstract": "During the last few years, the field of dynamical systems has been developing innovative tools to study the asymptotic behavior of different optimizers in the context of neural networks. In this work, we redefine an extensively studied optimizer, employing classical techniques from hyperbolic geometry.
This new definition is linked to a non-linear differential equation as a continuous limit. Additionally, by utilizing Lyapunov stability concepts, we analyze the asymptotic behavior of its critical points.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nico Alvarado;Hans Lobel", "authorids": "~Nico_Alvarado1;~Hans_Lobel1", "gender": "M;M", "homepage": "https://www.mat.uc.cl/personas/perfil/nfalvarado;", "dblp": "336/8903.html;140/7837", "google_scholar": "https://scholar.google.com/citations?hl=es;oKYALakAAAAJ", "orcid": ";0000-0003-3514-9414", "linkedin": ";", "or_profile": "~Nico_Alvarado1;~Hans_Lobel1", "aff": "Pontificia Universidad Catolica de Chile;Pontificia Universidad Catolica de Chile", "aff_domain": "puc.cl;uc.cl", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nalvarado2024hyperbolic,\ntitle={Hyperbolic Optimizer as a Dynamical System},\nauthor={Nico Alvarado and Hans Lobel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kZKopcDp2q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 370758, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13480746387696481329&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "puc.cl;uc.cl", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Pontificia Universidad Catolica de Chile", "aff_unique_dep": "", "aff_unique_url": "https://www.puc.cl", "aff_unique_abbr": "PUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Chile" }, { "title": "How do Transformers Perform In-Context Autoregressive Learning ?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33245", "id": "kZbTkpnafR", "proceeding": "https://proceedings.mlr.press/v235/sander24a.html", "pdf": "https://openreview.net/pdf?id=kZbTkpnafR", "openreview": "https://openreview.net/forum?id=kZbTkpnafR", "author_site": "Michael Sander, Raja Giryes, Taiji Suzuki, Mathieu Blondel, Gabriel Peyr\u00e9", "tldr": "", "abstract": "Transformers have achieved state-of-the-art performance in language modeling tasks. However, the reasons behind their tremendous success are still unclear. In this paper, towards a better understanding, we train a Transformer model on a simple next token prediction task, where sequences are generated as a first-order autoregressive process $s_{t+1} = W s_t$. We show how a trained Transformer predicts the next token by first learning $W$ in-context, then applying a prediction mapping. We call the resulting procedure *in-context autoregressive learning*. More precisely, focusing on commuting orthogonal matrices $W$, we first show that a trained one-layer linear Transformer implements one step of gradient descent for the minimization of an inner objective function, when considering augmented tokens. When the tokens are not augmented, we characterize the global minima of a one-layer diagonal linear multi-head Transformer. Importantly, we exhibit orthogonality between heads and show that positional encoding captures trigonometric relations in the data. 
On the experimental side, we consider the general case of non-commuting orthogonal matrices and generalize our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Eli Sander;Raja Giryes;Taiji Suzuki;Mathieu Blondel;Gabriel Peyr\u00e9", "authorids": "~Michael_Eli_Sander1;~Raja_Giryes1;~Taiji_Suzuki1;~Mathieu_Blondel1;~Gabriel_Peyr\u00e92", "gender": "M;M;M;;M", "homepage": "https://michaelsdr.github.io/;https://www.giryes.sites.tau.ac.il/;http://ibis.t.u-tokyo.ac.jp/suzuki/;http://www.mblondel.org;http://gpeyre.com/", "dblp": "285/5131;50/7998;08/312;05/8614.html;65/1759", "google_scholar": "COqAqcMAAAAJ;https://scholar.google.co.il/citations?user=9aQUYVQAAAAJ;x8osrBsAAAAJ;C0EKzrUAAAAJ;https://scholar.google.fr/citations?user=KqA1dYcAAAAJ", "orcid": ";0000-0002-2830-0297;;;", "linkedin": ";raja-giryes-0818935/;;;", "or_profile": "~Michael_Eli_Sander1;~Raja_Giryes1;~Taiji_Suzuki1;~Mathieu_Blondel1;~Gabriel_Peyr\u00e92", "aff": "Ecole Normale Sup\u00e9rieure de Paris;Tel Aviv University;The University of Tokyo;Google;CNRS", "aff_domain": "ens.fr;tauex.tau.ac.il;tokyo.ac.jp;google.com;cnrs.fr", "position": "PhD student;Associate Professor;Associate Professor;Research scientist;Researcher", "bibtex": "@inproceedings{\nsander2024how,\ntitle={How do Transformers Perform In-Context Autoregressive Learning ?},\nauthor={Michael Eli Sander and Raja Giryes and Taiji Suzuki and Mathieu Blondel and Gabriel Peyr{\\'e}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kZbTkpnafR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 647246, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8661836356602097421&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "ens.fr;tauex.tau.ac.il;tokyo.ac.jp;google.com;cnrs.fr", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure de Paris;Tel Aviv University;University of Tokyo;Google;Centre National de la Recherche Scientifique", "aff_unique_dep": ";;;Google;", "aff_unique_url": "https://www.ens.fr;https://www.tau.ac.il;https://www.u-tokyo.ac.jp;https://www.google.com;https://www.cnrs.fr", "aff_unique_abbr": "ENS Paris;TAU;UTokyo;Google;CNRS", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Paris;;Mountain View", "aff_country_unique_index": "0;1;2;3;0", "aff_country_unique": "France;Israel;Japan;United States" }, { "title": "BAT: Learning to Reason about Spatial Sounds with Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33244", "id": "kao5hRX9YA", "proceeding": "https://proceedings.mlr.press/v235/zheng24i.html", "pdf": "https://openreview.net/pdf?id=kao5hRX9YA", "openreview": "https://openreview.net/forum?id=kao5hRX9YA", "author_site": "Zhisheng Zheng, Puyuan Peng, Ziyang Ma, Xie Chen, Eunsol Choi, David Harwath", "tldr": "", "abstract": "Spatial sound reasoning is a fundamental human skill, enabling us to navigate and interpret our surroundings based on sound. In this paper we present BAT, which combines the spatial sound perception ability of a binaural acoustic scene analysis model with the natural language reasoning capabilities of a large language model (LLM) to replicate this innate ability. 
To address the lack of existing datasets of in-the-wild spatial sounds, we synthesized a binaural audio dataset using AudioSet and SoundSpaces 2.0. Next, we developed SpatialSoundQA, a spatial sound-based question-answering dataset, offering a range of QA tasks that train BAT in various aspects of spatial sound perception and reasoning. The acoustic front-end encoder of BAT is a novel spatial audio encoder named Spatial Audio Spectrogram Transformer, or Spatial-AST, which by itself achieves strong performance across sound event detection, spatial localization, and distance estimation. By integrating Spatial-AST with the LLaMA-2 7B model, BAT transcends standard Sound Event Localization and Detection (SELD) tasks, enabling the model to reason about the relationships between the sounds in its environment. Our experiments demonstrate BAT's superior performance on both spatial sound perception and reasoning, showcasing the immense potential of LLMs in navigating and interpreting complex spatial audio environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhisheng Zheng;Puyuan Peng;Ziyang Ma;Xie Chen;Eunsol Choi;David Harwath", "authorids": "~Zhisheng_Zheng1;~Puyuan_Peng1;~Ziyang_Ma3;~Xie_Chen2;~Eunsol_Choi1;~David_Harwath1", "gender": "M;M;M;M;;M", "homepage": "https://zhishengzheng.com/;https://jasonppy.github.io/;http://ziyang.tech/;https://chenxie95.github.io/;https://eunsol.github.io/;https://www.cs.utexas.edu/~harwath/index.html", "dblp": ";280/3431;;86/11429-1.html;116/2765;", "google_scholar": "WYwBrzAAAAAJ;https://scholar.google.com/citations?hl=en;4RZnXGMAAAAJ;d6u01FkAAAAJ;6wulN88AAAAJ;C0kDOzcAAAAJ", "orcid": ";;;0000-0001-7423-617X;0000-0003-3607-9104;", "linkedin": "zhisheng-zheng-573ba3269/;;;;;", "or_profile": "~Zhisheng_Zheng1;~Puyuan_Peng1;~Ziyang_Ma3;~Xie_Chen2;~Eunsol_Choi1;~David_Harwath1", "aff": "Shanghai Jiaotong University;University of Texas at Austin;Shanghai Jiaotong University;Shanghai Jiaotong University;University of Texas, Austin;University of Texas, Austin", "aff_domain": "sjtu.edu.cn;utexas.edu;sjtu.edu.cn;sjtu.edu.cn;cs.utexas.edu;utexas.edu", "position": "Undergrad student;PhD student;PhD student;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzheng2024bat,\ntitle={{BAT}: Learning to Reason about Spatial Sounds with Large Language Models},\nauthor={Zhisheng Zheng and Puyuan Peng and Ziyang Ma and Xie Chen and Eunsol Choi and David Harwath},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kao5hRX9YA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4578294, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11580096858290224711&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "sjtu.edu.cn;utexas.edu;sjtu.edu.cn;sjtu.edu.cn;cs.utexas.edu;utexas.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;1;1", "aff_unique_norm": "Shanghai Jiao Tong University;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.utexas.edu", "aff_unique_abbr": "SJTU;UT Austin", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "FedRC: Tackling Diverse Distribution Shifts Challenge in Federated Learning by Robust Clustering", "status": "Poster",
"track": "main", "site": "https://icml.cc/virtual/2024/poster/33243", "id": "kc4dZYJlJG", "proceeding": "https://proceedings.mlr.press/v235/guo24f.html", "pdf": "https://openreview.net/pdf?id=kc4dZYJlJG", "openreview": "https://openreview.net/forum?id=kc4dZYJlJG", "author_site": "Yongxin Guo, Xiaoying Tang, Tao Lin", "tldr": "", "abstract": "Federated Learning (FL) is a machine learning paradigm that safeguards privacy by retaining client data on edge devices. However, optimizing FL in practice can be challenging due to the diverse and heterogeneous nature of the learning system. Though recent research has focused on improving the optimization of FL when distribution shifts occur among clients, ensuring global performance when multiple types of distribution shifts occur simultaneously among clients---such as feature distribution shift, label distribution shift, and concept shift---remain under-explored. In this paper, we identify the learning challenges posed by the simultaneous occurrence of diverse distribution shifts and propose a clustering principle to overcome these challenges. Through our research, we find that existing methods fail to address the clustering principle. Therefore, we propose a novel clustering algorithm framework, dubbed as FedRC, which adheres to our proposed clustering principle by incorporating a bi-level optimization problem and a novel objective function. Extensive experiments demonstrate that FedRC significantly outperforms other SOTA cluster-based FL methods. Our code will be publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongxin Guo;Xiaoying Tang;Tao Lin", "authorids": "~Yongxin_Guo1;~Xiaoying_Tang2;~Tao_Lin1", "gender": "M;F;M", "homepage": "https://gyxxyg.github.io/yongxinguo/;https://sse.cuhk.edu.cn/en/faculty/tangxiaoying;https://lins-lab.github.io/", "dblp": ";134/9714-2;64/4492-4.html", "google_scholar": "5Cl1GZwAAAAJ;https://scholar.google.com/citations?hl=zh-TW;QE9pa_cAAAAJ", "orcid": "0009-0001-8652-0722;0000-0003-3955-1195;0000-0002-3246-6935", "linkedin": ";;", "or_profile": "~Yongxin_Guo1;~Xiaoying_Tang2;~Tao_Lin1", "aff": "Tencent;The Chinese University of Hong Kong, Shenzhen;Westlake University", "aff_domain": "tencent.com;cuhk.edu.cn;westlake.edu", "position": "Intern;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nguo2024fedrc,\ntitle={Fed{RC}: Tackling Diverse Distribution Shifts Challenge in Federated Learning by Robust Clustering},\nauthor={Yongxin Guo and Xiaoying Tang and Tao Lin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kc4dZYJlJG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3520685, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15939733531129246781&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 7, "email": "tencent.com;cuhk.edu.cn;westlake.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Tencent;Chinese University of Hong Kong;Westlake University", "aff_unique_dep": "Tencent Holdings Limited;;", "aff_unique_url": "https://www.tencent.com;https://www.cuhk.edu.cn;https://www.westlake.edu.cn", "aff_unique_abbr": "Tencent;CUHK;WU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Sign Gradient Descent-based Neuronal Dynamics: ANN-to-SNN 
Conversion Beyond ReLU Network", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33242", "id": "kfpe7Dg23G", "proceeding": "https://proceedings.mlr.press/v235/oh24b.html", "pdf": "https://openreview.net/pdf?id=kfpe7Dg23G", "openreview": "https://openreview.net/forum?id=kfpe7Dg23G", "author_site": "Hyunseok Oh, Youngki Lee", "tldr": "", "abstract": "Spiking neural network (SNN) is studied in multidisciplinary domains to (i) enable order-of-magnitudes energy-efficient AI inference, and (ii) computationally simulate neuroscientific mechanisms. The lack of discrete theory obstructs the practical application of SNN by limiting its performance and nonlinearity support. We present a new optimization-theoretic perspective of the discrete dynamics of spiking neuron. We prove that a discrete dynamical system of simple integrate-and-fire models approximates the subgradient method over unconstrained optimization problems. We practically extend our theory to introduce a novel sign gradient descent (signGD)-based neuronal dynamics that can (i) approximate diverse nonlinearities beyond ReLU, and (ii) advance ANN-to-SNN conversion performance in low time-steps. Experiments on large-scale datasets show that our technique achieve (i) state-of-the-art performance in ANN-to-SNN conversion, and (ii) is first to convert new DNN architectures, e.g., ConvNext, MLP-Mixer, and ResMLP. We publicly share our source code at www.github.com/snuhcs/snn_signgd .", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyunseok Oh;Youngki Lee", "authorids": "~Hyunseok_Oh1;~Youngki_Lee2", "gender": "M;M", "homepage": "http://www.aistudy.com/ohs/index.html;https://youngkilee.blogspot.com", "dblp": "195/1687;", "google_scholar": "qnGEZaMAAAAJ;qhKU0oMAAAAJ", "orcid": ";0000-0002-1319-7071", "linkedin": ";", "or_profile": "~Hyunseok_Oh1;~Youngki_Lee2", "aff": "Microsoft;Seoul National University", "aff_domain": "microsoft.com;snu.ac.kr", "position": "Intern;Associate Professor", "bibtex": "@inproceedings{\noh2024sign,\ntitle={Sign Gradient Descent-based Neuronal Dynamics: {ANN}-to-{SNN} Conversion Beyond Re{LU} Network},\nauthor={Hyunseok Oh and Youngki Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kfpe7Dg23G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7370610, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18004111378597171248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "microsoft.com;snu.ac.kr", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Seoul National University", "aff_unique_dep": "Microsoft Corporation;", "aff_unique_url": "https://www.microsoft.com;https://www.snu.ac.kr", "aff_unique_abbr": "Microsoft;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;South Korea" }, { "title": "Differentially Private Domain Adaptation with Theoretical Guarantees", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33241", "id": "kkqIEp2bRa", "proceeding": "https://proceedings.mlr.press/v235/bassily24a.html", "pdf": "https://openreview.net/pdf?id=kkqIEp2bRa", "openreview": "https://openreview.net/forum?id=kkqIEp2bRa", "author_site": "Raef Bassily, Corinna Cortes, Anqi Mao, Mehryar Mohri", "tldr": "", "abstract": "In many 
applications, the labeled data at the learner's disposal is subject to privacy constraints and is relatively limited. To derive a more accurate predictor for the target domain, it is often beneficial to leverage publicly available labeled data from an alternative domain, somewhat close to the target domain. This is the modern problem of supervised domain adaptation from a public source to a private target domain. We present two $(\\epsilon, \\delta)$-differentially private adaptation algorithms for supervised adaptation, for which we make use of a general optimization problem, recently shown to benefit from favorable theoretical learning guarantees. Our first algorithm is designed for regression with linear predictors and shown to solve a convex optimization problem. Our second algorithm is a more general solution for loss functions that may be non-convex but Lipschitz and smooth. While our main objective is a theoretical analysis, we also report the results of several experiments. We first show that the non-private versions of our algorithms match state-of-the-art performance in supervised adaptation and that for larger values of the target sample size or $\\epsilon$, the performance of our private algorithms remains close to that of their non-private counterparts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Raef Bassily;Corinna Cortes;Anqi Mao;Mehryar Mohri", "authorids": "~Raef_Bassily2;~Corinna_Cortes1;~Anqi_Mao1;~Mehryar_Mohri2", "gender": "F;F;M;M", "homepage": "https://research.google/people/author121/;https://anqi-mao.github.io;https://cs.nyu.edu/~mohri/;https://sites.google.com/view/rbassily", "dblp": "77/5783;241/6864;03/5448;88/8656", "google_scholar": "U_IVY50AAAAJ;nkjIZ-oAAAAJ;ktwwLjsAAAAJ;C8qMVQUAAAAJ", "orcid": ";;;", "linkedin": ";;mehryar-mohri-3737b981/;", "or_profile": "~Corinna_Cortes1;~Anqi_Mao1;~Mehryar_Mohri2;~RAEF_BASSILY1", "aff": "Google;Courant Institute of Mathematical Sciences, NYU;Google Research;Google", "aff_domain": "google.com;cims.nyu.edu;google.com;google.com", "position": "Researcher;PhD student;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nbassily2024differentially,\ntitle={Differentially Private Domain Adaptation with Theoretical Guarantees},\nauthor={Raef Bassily and Corinna Cortes and Anqi Mao and Mehryar Mohri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kkqIEp2bRa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 592845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1ey4LCG8QYYJ:scholar.google.com/&scioq=Differentially+Private+Domain+Adaptation+with+Theoretical+Guarantees&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "google.com;cims.nyu.edu;google.com;google.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;New York University", "aff_unique_dep": "Google;Courant Institute of Mathematical Sciences", "aff_unique_url": "https://www.google.com;https://www.courant.nyu.edu", "aff_unique_abbr": "Google;NYU", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Mountain View;New York", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "High-Probability Bound for Non-Smooth Non-Convex Stochastic Optimization with Heavy Tails", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33240", "id": 
"klKk9ETAyU", "proceeding": "https://proceedings.mlr.press/v235/liu24bo.html", "pdf": "https://openreview.net/pdf?id=klKk9ETAyU", "openreview": "https://openreview.net/forum?id=klKk9ETAyU", "author_site": "Langqi Liu, Yibo Wang, Lijun Zhang", "tldr": "", "abstract": "Recently, Cutkosky et al. introduce the online-to-non-convex framework, which utilizes online learning methods to solve non-smooth non-convex optimization problems, and achieves an $\\mathcal{O}(\\epsilon^{-3}\\delta^{-1})$ gradient complexity for finding $(\\delta,\\epsilon)$-stationary points. However, their results rely on the bounded variance assumption of stochastic gradients and only hold in expectation. To address these limitations, we investigate the case that stochastic gradients obey heavy-tailed distributions with finite $\\mathfrak{p}$-th moments for some $\\mathfrak{p}\\in(1,2]$, and propose a novel algorithm which is able to identify a $(\\delta,\\epsilon)$-stationary point with high probability, after consuming $\\tilde{\\mathcal{O}}(\\epsilon^{-\\frac{2\\mathfrak{p}-1}{\\mathfrak{p}-1}}\\delta^{-1})$ stochastic gradients. The key idea is first incorporating the gradient clipping technique into the online-to-non-convex framework to produce a sequence of points, the averaged gradient norms of which is no greater than $\\epsilon$. Then, we propose a validation method to select one $(\\delta,\\epsilon)$-stationary point among the candidates. When gradient distributions have bounded variance, i.e., $\\mathfrak{p}=2$, our result turns into $\\tilde{\\mathcal{O}}(\\epsilon^{-3}\\delta^{-1})$, which improves the existing $\\tilde{\\mathcal{O}}(\\epsilon^{-4}\\delta^{-1})$ high-probability bound. When the objective is smooth, our algorithm can also find an $\\epsilon$-stationary point with $\\tilde{\\mathcal{O}}(\\epsilon^{-\\frac{3\\mathfrak{p}-2}{\\mathfrak{p}-1}})$ gradient queries.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Langqi Liu;Yibo Wang;Lijun Zhang", "authorids": "~Langqi_Liu1;~Yibo_Wang2;~Lijun_Zhang1", "gender": "M;;", "homepage": "http://www.lamda.nju.edu.cn/liulq/;;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Langqi_Liu1;~Yibo_Wang2;~Lijun_Zhang1", "aff": "Nanjing University;;", "aff_domain": "nju.edu.cn;;", "position": "MS student;;", "bibtex": "@inproceedings{\nliu2024highprobability,\ntitle={High-Probability Bound for Non-Smooth Non-Convex Stochastic Optimization with Heavy Tails},\nauthor={Langqi Liu and Yibo Wang and Lijun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=klKk9ETAyU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 450265, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15020828534387098260&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "nju.edu.cn;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "On the Expressive Power of Spectral Invariant Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33239", "id": "kmugaw9Kfq", "proceeding": "https://proceedings.mlr.press/v235/zhang24ck.html", "pdf": 
"https://openreview.net/pdf?id=kmugaw9Kfq", "openreview": "https://openreview.net/forum?id=kmugaw9Kfq", "author_site": "Bohang Zhang, Lingxiao Zhao, Haggai Maron", "tldr": "", "abstract": "Incorporating spectral information to enhance Graph Neural Networks (GNNs) has shown promising results but raises a fundamental challenge due to the inherent ambiguity of eigenvectors. Various architectures have been proposed to address this ambiguity, referred to as spectral invariant architectures. Notable examples include GNNs and Graph Transformers that use spectral distances, spectral projection matrices, or other invariant spectral features. However, the potential expressive power of these spectral invariant architectures remains largely unclear. The goal of this work is to gain a deep theoretical understanding of the expressive power obtainable when using spectral features. We first introduce a novel message-passing framework for designing spectral invariant GNNs, called Eigenspace Projection GNN (EPNN). Our comprehensive analysis shows that EPNN essentially unifies all prior spectral invariant architectures, in that they are either strictly less expressive or equivalent to EPNN. A fine-grained expressiveness hierarchy among different architectures is also established. On the other hand, we present a surprising result that EPNN itself is bounded by a recently proposed class of Subgraph GNNs, implying that all these spectral invariant architectures are strictly less expressive than 3-WL. Finally, we demonstrate that these spectral features offer no additional advantage when combined with more expressive GNNs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bohang Zhang;Lingxiao Zhao;Haggai Maron", "authorids": "~Bohang_Zhang1;~Lingxiao_Zhao1;~Haggai_Maron1", "gender": "M;M;M", "homepage": "https://zbh2047.github.io;http://lingxiaozhao.com/;https://haggaim.github.io/", "dblp": "276/0156.html;;181/6629", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;QKslW6EAAAAJ;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ", "orcid": ";;", "linkedin": "zhangbohang;;", "or_profile": "~Bohang_Zhang1;~Lingxiao_Zhao1;~Haggai_Maron1", "aff": "Peking University;Carnegie Mellon University;NVIDIA", "aff_domain": "pku.edu.cn;andrew.cmu.edu;nvidia.com", "position": "PhD student;PhD student;Research Scientist", "bibtex": "@inproceedings{\nzhang2024on,\ntitle={On the Expressive Power of Spectral Invariant Graph Neural Networks},\nauthor={Bohang Zhang and Lingxiao Zhao and Haggai Maron},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kmugaw9Kfq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1603151, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15823654302948625245&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;andrew.cmu.edu;nvidia.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Peking University;Carnegie Mellon University;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "http://www.pku.edu.cn;https://www.cmu.edu;https://www.nvidia.com", "aff_unique_abbr": "Peking U;CMU;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Monotone, Bi-Lipschitz, and Polyak-\u0141ojasiewicz Networks", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33238", "id": "kn2xp8UOvQ", "proceeding": "https://proceedings.mlr.press/v235/wang24p.html", "pdf": "https://openreview.net/pdf?id=kn2xp8UOvQ", "openreview": "https://openreview.net/forum?id=kn2xp8UOvQ", "author_site": "Ruigang Wang, Krishnamurthy Dvijotham, Ian Manchester", "tldr": "", "abstract": "This paper presents a new *bi-Lipschitz* invertible neural network, the BiLipNet, which has the ability to smoothly control both its *Lipschitzness* (output sensitivity to input perturbations) and *inverse Lipschitzness* (input distinguishability from different outputs). The second main contribution is a new scalar-output network, the PLNet, which is a composition of a BiLipNet and a quadratic potential. We show that PLNet satisfies the Polyak-\u0141ojasiewicz condition and can be applied to learn non-convex surrogate losses with a unique and efficiently-computable global minimum. The central technical element in these networks is a novel invertible residual layer with certified strong monotonicity and Lipschitzness, which we compose with orthogonal layers to build the BiLipNet. The certification of these properties is based on incremental quadratic constraints, resulting in much tighter bounds than can be achieved with spectral normalization. Moreover, we formulate the calculation of the inverse of a BiLipNet -- and hence the minimum of a PLNet -- as a series of three-operator splitting problems, for which fast algorithms can be applied.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruigang Wang;Krishnamurthy Dj Dvijotham;Ian Manchester", "authorids": "~Ruigang_Wang2;~Krishnamurthy_Dj_Dvijotham1;~Ian_Manchester1", "gender": "M;M;M", "homepage": "https://www.sydney.edu.au/engineering/about/our-people/academic-staff/ian-manchester.html;http://dvij.github.io;https://ruigangwang7.github.io", "dblp": ";16/8758;", "google_scholar": ";BUtloecAAAAJ;https://scholar.google.com.au/citations?user=T8gcqxMAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ian_Manchester1;~Krishnamurthy_Dvijotham2;~Ray_Wang1", "aff": "University of Sydney;Google DeepMind;University of Sydney", "aff_domain": "sydney.edu.au;google.com;sydney.edu.au", "position": "Professor;Researcher;Postdoc", "bibtex": "@inproceedings{\nwang2024monotone,\ntitle={Monotone, Bi-Lipschitz, and Polyak-{\\L}ojasiewicz Networks},\nauthor={Ruigang Wang and Krishnamurthy Dj Dvijotham and Ian Manchester},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kn2xp8UOvQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5507092, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16089000596784567114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "sydney.edu.au;google.com;sydney.edu.au", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Sydney;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.sydney.edu.au;https://deepmind.com", "aff_unique_abbr": "USYD;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Australia;United Kingdom" }, { "title": "Bayesian Knowledge Distillation: A Bayesian Perspective of Distillation with Uncertainty Quantification", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33237", "id": "knZ4NYzGUd", "proceeding": "https://proceedings.mlr.press/v235/fang24a.html", "pdf": "https://openreview.net/pdf?id=knZ4NYzGUd", "openreview": "https://openreview.net/forum?id=knZ4NYzGUd", "author_site": "Luyang Fang, Yongkai Chen, Wenxuan Zhong, Ping Ma", "tldr": "", "abstract": "Knowledge distillation (KD) has been widely used for model compression and deployment acceleration. Nonetheless, the statistical insight of the remarkable performance of KD remains elusive, and methods for evaluating the uncertainty of the distilled model/student model are lacking. To address these issues, we establish a close connection between KD and a Bayesian model. In particular, we develop an innovative method named Bayesian Knowledge Distillation (BKD) to provide a transparent interpretation of the working mechanism of KD, and a suite of Bayesian inference tools for the uncertainty quantification of the student model. In BKD, the regularization imposed by the teacher model in KD is formulated as a teacher-informed prior for the student model's parameters. Consequently, we establish the equivalence between minimizing the KD loss and estimating the posterior mode in BKD. Efficient Bayesian inference algorithms are developed based on the stochastic gradient Langevin Monte Carlo and examined with extensive experiments on uncertainty ranking and credible intervals construction for predicted class probabilities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luyang Fang;Yongkai Chen;Wenxuan Zhong;Ping Ma", "authorids": "~Luyang_Fang1;~Yongkai_Chen1;~Wenxuan_Zhong1;~Ping_Ma1", "gender": ";M;;M", "homepage": "https://luyangfang.github.io/;https://yongkaichen99.github.io/;https://zhonglab.uga.edu/;http://malab.uga.edu", "dblp": "359/5937;293/8953;37/168;27/5565-1", "google_scholar": "jriFo4wAAAAJ;;P7oB-A0AAAAJ;xCJ8lboAAAAJ", "orcid": ";;;", "linkedin": "luyang-fang-ba34171a7/;;;", "or_profile": "~Luyang_Fang1;~Yongkai_Chen1;~Wenxuan_Zhong1;~Ping_Ma1", "aff": "University of Georgia;University of Georgia;University of Georgia;University of Georgia", "aff_domain": "uga.edu;uga.edu;uga.edu;uga.edu", "position": "PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfang2024bayesian,\ntitle={Bayesian Knowledge Distillation: A Bayesian Perspective of Distillation with Uncertainty Quantification},\nauthor={Luyang Fang and Yongkai Chen and Wenxuan Zhong and Ping Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=knZ4NYzGUd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8308155, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13781614061027413948&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "uga.edu;uga.edu;uga.edu;uga.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Georgia", "aff_unique_dep": "", "aff_unique_url": "https://www.uga.edu", "aff_unique_abbr": "UGA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "When and How Does In-Distribution Label Help Out-of-Distribution Detection?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33236", "id": "knhbhDLdry", "proceeding": "https://proceedings.mlr.press/v235/du24g.html", "pdf": 
"https://openreview.net/pdf?id=knhbhDLdry", "openreview": "https://openreview.net/forum?id=knhbhDLdry", "author_site": "Xuefeng Du, Yiyou Sun, Sharon Li", "tldr": "", "abstract": "Detecting data points deviating from the training distribution is pivotal for ensuring reliable machine learning. Extensive research has been dedicated to the challenge, spanning classical anomaly detection techniques to contemporary out-of-distribution (OOD) detection approaches. While OOD detection commonly relies on supervised learning from a labeled in-distribution (ID) dataset, anomaly detection may treat the entire ID data as a single class and disregard ID labels. This fundamental distinction raises a significant question that has yet to be rigorously explored: when and how does ID label help OOD detection? This paper bridges this gap by offering a formal understanding to theoretically delineate the impact of ID labels on OOD detection. We employ a graph-theoretic approach, rigorously analyzing the separability of ID data from OOD data in a closed-form manner. Key to our approach is the characterization of data representations through spectral decomposition on the graph. Leveraging these representations, we establish a provable error bound that compares the OOD detection performance with and without ID labels, unveiling conditions for achieving enhanced OOD detection. Lastly, we present empirical results on both simulated and real datasets, validating theoretical guarantees and reinforcing our insights.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuefeng Du;Yiyou Sun;Yixuan Li", "authorids": "~Xuefeng_Du1;~Yiyou_Sun1;~Yixuan_Li1", "gender": "M;M;F", "homepage": "https://d12306.github.io/;https://sunyiyou.github.io/;http://pages.cs.wisc.edu/~sharonli/", "dblp": "34/3557;211/5630;144/6087-1", "google_scholar": "GE_aEh4AAAAJ;IKqlQo4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "xuefeng-du-094723192/;;liyixuan", "or_profile": "~Xuefeng_Du1;~Yiyou_Sun1;~Yixuan_Li1", "aff": "University of Wisconsin, Madison;University of California, Berkeley;Cornell University", "aff_domain": "wisc.edu;berkeley.edu;cornell.edu", "position": "PhD student;Postdoc;Graduate Student", "bibtex": "@inproceedings{\ndu2024when,\ntitle={When and How Does In-Distribution Label Help Out-of-Distribution Detection?},\nauthor={Xuefeng Du and Yiyou Sun and Yixuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=knhbhDLdry}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6453242, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10838208002063014611&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "wisc.edu;berkeley.edu;cornell.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Wisconsin;University of California, Berkeley;Cornell University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.berkeley.edu;https://www.cornell.edu", "aff_unique_abbr": "UW;UC Berkeley;Cornell", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Madison;Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Efficient Policy Evaluation with Offline Data Informed Behavior Policy Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33235", "id": 
"kpDd2HCBka", "proceeding": "https://proceedings.mlr.press/v235/liu24ca.html", "pdf": "https://openreview.net/pdf?id=kpDd2HCBka", "openreview": "https://openreview.net/forum?id=kpDd2HCBka", "author_site": "Shuze Liu, Shangtong Zhang", "tldr": "", "abstract": "Most reinforcement learning practitioners evaluate their policies with online Monte Carlo estimators for either hyperparameter tuning or testing different algorithmic design choices, where the policy is repeatedly executed in the environment to get the average outcome. Such massive interactions with the environment are prohibitive in many scenarios. In this paper, we propose novel methods that improve the data efficiency of online Monte Carlo estimators while maintaining their unbiasedness. We first propose a tailored closed-form behavior policy that provably reduces the variance of an online Monte Carlo estimator. We then design efficient algorithms to learn this closed-form behavior policy from previously collected offline data. Theoretical analysis is provided to characterize how the behavior policy learning error affects the amount of reduced variance. Compared with previous works, our method achieves better empirical performance in a broader set of environments, with fewer requirements for offline data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuze Liu;Shangtong Zhang", "authorids": "~Shuze_Liu1;~Shangtong_Zhang1", "gender": "M;", "homepage": ";https://shangtongzhang.github.io/", "dblp": "246/3285;165/9581", "google_scholar": "OVAJS8cAAAAJ;https://scholar.google.co.uk/citations?user=Pn7fj4IAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Shuze_Liu1;~Shangtong_Zhang1", "aff": "University of Virginia, Charlottesville;University of Virginia, Charlottesville", "aff_domain": "virginia.edu;virginia.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024efficient,\ntitle={Efficient Policy Evaluation with Offline Data Informed Behavior Policy Design},\nauthor={Shuze Liu and Shangtong Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kpDd2HCBka}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3552831, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12989642048790363733&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "virginia.edu;virginia.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Charlottesville", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Feasible Reachable Policy Iteration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33234", "id": "ks8qSwkkuZ", "proceeding": "https://proceedings.mlr.press/v235/qin24d.html", "pdf": "https://openreview.net/pdf?id=ks8qSwkkuZ", "openreview": "https://openreview.net/forum?id=ks8qSwkkuZ", "author_site": "Shentao Qin, Yujie Yang, Yao Mu, Jie Li, Wenjun Zou, Jingliang Duan, Shengbo Li", "tldr": "", "abstract": "The goal-reaching tasks with safety constraints are common control problems in real world, such as intelligent driving and robot manipulation. 
The difficulty of this kind of problem comes from the termination of exploration caused by safety constraints and the sparse rewards caused by goals. Existing safe RL methods avoid unsafe exploration by restricting the search space to a feasible region, which in essence prunes the search space. However, much of the exploration within the feasible region remains ineffective because the goals are ignored. Our approach considers both safety and goals: policy-space pruning is achieved by a feasible reachable function, which describes whether a policy exists that lets the agent safely reach the goals within a finite time horizon. This function naturally satisfies the self-consistency condition and the risky Bellman equation, which can be solved by fixed-point iteration. On this basis, we propose feasible reachable policy iteration (FRPI), which is divided into three steps: policy evaluation, region expansion, and policy improvement. In the region expansion step, information about the agent reaching the goals is used to accelerate the convergence of the feasible region and, simultaneously, to identify a smaller feasible reachable region. Experimental results verify that the proposed FR function both speeds up convergence to better or comparable performance without sacrificing safety and identifies a smaller policy space with higher sample efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shentao Qin;Yujie Yang;Yao Mu;JIE LI;Wenjun Zou;Jingliang Duan;Shengbo Eben Li", "authorids": "~Shentao_Qin1;~Yujie_Yang1;~Yao_Mu1;~JIE_LI14;~Wenjun_Zou1;~Jingliang_Duan1;~Shengbo_Eben_Li2", "gender": "M;M;M;;M;M;M", "homepage": "https://yangyujie-jack.github.io/;https://yaomarkmu.github.io/;https://jieli18.github.io/;https://www.researchgate.net/profile/Wenjun-Zou-6;;http://www.idlab-tsinghua.com/thulab/labweb/dpeople.html?11;https://github.com/JackQin007", "dblp": ";260/0674;17/2703-42;;208/9091;;", "google_scholar": "2T7-s0MAAAAJ;;OILceUIAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;Dxiw1K8AAAAJ;", "orcid": "0000-0001-7222-0019;;0000-0002-3718-5593;;;;", "linkedin": ";;;;;;", "or_profile": "~Yujie_Yang1;~Yao_Mu1;~JIE_LI14;~Wenjun_Zou1;~Jingliang_Duan1;~Shengbo_Eben_Li2;~Jack_Qin1", "aff": "Tsinghua University;The University of Hong Kong;Tsinghua University;Tsinghua University;University of Science and Technology Beijing;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;hku.hk;mails.tsinghua.edu.cn;tsinghua.edu.cn;ustb.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;PhD student;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nqin2024feasible,\ntitle={Feasible Reachable Policy Iteration},\nauthor={Shentao Qin and Yujie Yang and Yao Mu and JIE LI and Wenjun Zou and Jingliang Duan and Shengbo Eben Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ks8qSwkkuZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2699107, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14573752793649707259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;hku.hk;mails.tsinghua.edu.cn;tsinghua.edu.cn;ustb.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn", "author_num": 7, "aff_unique_index":
"0;1;0;0;2;0;0", "aff_unique_norm": "Tsinghua University;University of Hong Kong;University of Science and Technology Beijing", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hku.hk;http://www.ustb.edu.cn", "aff_unique_abbr": "THU;HKU;USTB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Selective Mixup Helps with Distribution Shifts, But Not (Only) because of Mixup", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33233", "id": "ksph9pkEDc", "proceeding": "https://proceedings.mlr.press/v235/teney24a.html", "pdf": "https://openreview.net/pdf?id=ksph9pkEDc", "openreview": "https://openreview.net/forum?id=ksph9pkEDc", "author_site": "Damien Teney, Jindong Wang, Ehsan Abbasnejad", "tldr": "", "abstract": "Mixup is a highly successful technique to improve generalization by augmenting training data with combinations of random pairs. Selective mixup is a family of methods that apply mixup to specific pairs e.g. combining examples across classes or domains. Despite remarkable performance on benchmarks with distribution shifts, these methods are still poorly understood. We find that an overlooked aspect of selective mixup explains some of its success in a completely new light. The non-random selection of pairs affects the training distribution and improves generalization by means completely unrelated to the mixing. For example in binary classification, mixup across classes implicitly resamples the data to uniform class distribution - a classical solution to label shift. We verify empirically that this resampling explains some of the improvements reported in prior work. Theoretically, the effect relies on a ``regression toward the mean'', an accidental property we find in several datasets. Outcomes. We now better understand why selective mixup works. This lets us predict a yet-unknown failure mode and conditions where the method is detrimental. 
We also use the equivalence with resampling to design better variants that combine mixing and resampling effects.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Damien Teney;Jindong Wang;Ehsan Abbasnejad", "authorids": "~Damien_Teney1;~Jindong_Wang1;~Ehsan_Abbasnejad3", "gender": "M;M;M", "homepage": "https://www.damienteney.info;https://ehsanabb.github.io/;https://jd92.wang/", "dblp": "62/10068;30/11191;19/2969-1", "google_scholar": "https://scholar.google.com.au/citations?user=iS_jP_3dpD8J;https://scholar.google.com/citations?hl=en;hBZ_tKsAAAAJ", "orcid": ";;0000-0002-4833-0880", "linkedin": ";;jindong-wang/", "or_profile": "~Damien_Teney1;~Ehsan_M_Abbasnejad1;~Jindong_Wang4", "aff": "Idiap Research Institute;University of Adelaide;Microsoft Research", "aff_domain": "idiap.ch;adelaide.edu.au;microsoft.com", "position": "Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nteney2024selective,\ntitle={Selective Mixup Helps with Distribution Shifts, But Not (Only) because of Mixup},\nauthor={Damien Teney and Jindong Wang and Ehsan Abbasnejad},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ksph9pkEDc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1247725, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10502332873890205900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "idiap.ch;adelaide.edu.au;microsoft.com", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Idiap Research Institute;University of Adelaide;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.idiap.ch;https://www.adelaide.edu.au;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Idiap;Adelaide;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Switzerland;Australia;United States" }, { "title": "Provably Neural Active Learning Succeeds via Prioritizing Perplexing Samples", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33232", "id": "kzz0kn546b", "proceeding": "https://proceedings.mlr.press/v235/bu24a.html", "pdf": "https://openreview.net/pdf?id=kzz0kn546b", "openreview": "https://openreview.net/forum?id=kzz0kn546b", "author_site": "Dake Bu, Wei Huang, Taiji Suzuki, Ji Cheng, Qingfu Zhang, Zhiqiang Xu, Hau-San Wong", "tldr": "", "abstract": "Neural Network-based active learning (NAL) is a cost-effective data selection technique that utilizes neural networks to select and train on a small subset of samples. While existing work successfully develops various effective or theory-justified NAL algorithms, the understanding of the two commonly used query criteria of NAL: uncertainty-based and diversity-based, remains in its infancy. In this work, we try to move one step forward by offering a unified explanation for the success of both query criteria-based NAL from a feature learning view. Specifically, we consider a feature-noise data model comprising easy-to-learn or hard-to-learn features disrupted by noise, and conduct analysis over 2-layer NN-based NALs in the pool-based scenario. We provably show that both uncertainty-based and diversity-based NAL are inherently amenable to one and the same principle, i.e., striving to prioritize samples that contain yet-to-be-learned features. 
We further prove that this shared principle is the key to their success: achieving a small test error within a small labeled set. By contrast, strategy-free passive learning exhibits a large test error due to inadequate learning of yet-to-be-learned features, and requires a significantly larger label complexity to achieve a sufficient reduction in test error. Experimental results validate our findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dake Bu;Wei Huang;Taiji Suzuki;Ji Cheng;Qingfu Zhang;zhiqiang xu;Hau-San Wong", "authorids": "~Dake_Bu1;~Wei_Huang6;~Taiji_Suzuki1;~Ji_Cheng1;~Qingfu_Zhang1;~zhiqiang_xu1;~Hau-San_Wong1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://weihuang05.github.io/;http://ibis.t.u-tokyo.ac.jp/suzuki/;;https://www.cs.cityu.edu.hk/~qzhan7/index.html;https://scholar.google.com/citations?user=0R20iBMAAAAJ&hl=en;", "dblp": "379/6085;81/6685-34;08/312;06/5112-1;98/1240.html;72/51-3.html;69/2987", "google_scholar": "mWrnNqsAAAAJ;RZfDh4MAAAAJ;x8osrBsAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=nhL9PHwAAAAJ;;i9Dh1OkAAAAJ", "orcid": ";0000-0001-5674-7021;;0000-0002-1123-6030;;0000-0002-5693-8933;", "linkedin": ";;;ji-cheng-959378219/;;;", "or_profile": "~Dake_Bu1;~Wei_Huang6;~Taiji_Suzuki1;~Ji_Cheng1;~Qingfu_Zhang1;~zhiqiang_xu1;~Hau-San_Wong1", "aff": "City University of Hong Kong;RIKEN AIP;The University of Tokyo;City University of Hong Kong;City University of Hong Kong;Mohamed bin Zayed University of Artificial Intelligence;City University of Hong Kong", "aff_domain": "cityu.edu.hk;riken.jp;tokyo.ac.jp;my.cityu.edu.hk;cityu.edu.hk;mbzuai.ac.ae;cityu.edu.hk", "position": "PhD student;Research Scientist;Associate Professor;PhD student;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nbu2024provably,\ntitle={Provably Neural Active Learning Succeeds via Prioritizing Perplexing Samples},\nauthor={Dake Bu and Wei Huang and Taiji Suzuki and Ji Cheng and Qingfu Zhang and zhiqiang xu and Hau-San Wong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kzz0kn546b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6090673, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4610738802161895334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cityu.edu.hk;riken.jp;tokyo.ac.jp;my.cityu.edu.hk;cityu.edu.hk;mbzuai.ac.ae;cityu.edu.hk", "author_num": 7, "aff_unique_index": "0;1;2;0;0;3;0", "aff_unique_norm": "City University of Hong Kong;RIKEN;University of Tokyo;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";Advanced Institute for Computational Science;;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.aip.riken.jp;https://www.u-tokyo.ac.jp;https://mbzuai.ac.ae", "aff_unique_abbr": "CityU;RIKEN AIP;UTokyo;MBZUAI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0;0;2;0", "aff_country_unique": "China;Japan;United Arab Emirates" }, { "title": "Semantically-correlated memories in a dense associative model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33231", "id": "l0OGoZPZuC", "proceeding": "https://proceedings.mlr.press/v235/burns24a.html", "pdf": "https://openreview.net/pdf?id=l0OGoZPZuC", "openreview":
"https://openreview.net/forum?id=l0OGoZPZuC", "tldr": "", "abstract": "I introduce a novel associative memory model named *Correlated Dense Associative Memory* (CDAM), which integrates both auto- and hetero-association in a unified framework for continuous-valued memory patterns. Employing an arbitrary graph structure to semantically link memory patterns, CDAM is theoretically and numerically analysed, revealing four distinct dynamical modes: auto-association, narrow hetero-association, wide hetero-association, and neutral quiescence. Drawing inspiration from inhibitory modulation studies, I employ anti-Hebbian learning rules to control the range of hetero-association, extract multi-scale representations of community structures in graphs, and stabilise the recall of temporal sequences. Experimental demonstrations showcase CDAM's efficacy in handling real-world data, replicating a classical neuroscience experiment, performing image retrieval, and simulating arbitrary finite automata.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas F Burns", "authorids": "~Thomas_F_Burns1", "gender": "M", "homepage": "https://tfburns.com/", "dblp": "311/5096", "google_scholar": "xifCmHAAAAAJ", "orcid": "0000-0002-1123-2929", "linkedin": "tfburns/", "or_profile": "~Thomas_F_Burns1", "aff": "Timaeus", "aff_domain": "timaeus.co", "position": "Researcher", "bibtex": "@inproceedings{\nburns2024semanticallycorrelated,\ntitle={Semantically-correlated memories in a dense associative model},\nauthor={Thomas F Burns},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l0OGoZPZuC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2548416, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3386909481838851131&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "timaeus.co", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Timaeus", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "" }, { "title": "Causal Discovery via Conditional Independence Testing with Proxy Variables", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33230", "id": "l1YbS3qkdk", "proceeding": "https://proceedings.mlr.press/v235/liu24bc.html", "pdf": "https://openreview.net/pdf?id=l1YbS3qkdk", "openreview": "https://openreview.net/forum?id=l1YbS3qkdk", "author_site": "Mingzhou Liu, Xinwei Sun, YU QIAO, Yizhou Wang", "tldr": "", "abstract": "Distinguishing causal connections from correlations is important in many scenarios. However, the presence of unobserved variables, such as the latent confounder, can introduce bias in conditional independence testing commonly employed in constraint-based causal discovery for identifying causal relations. To address this issue, existing methods introduced proxy variables to adjust for the bias caused by unobserveness. However, these methods were either limited to categorical variables or relied on strong parametric assumptions for identification. In this paper, we propose a novel hypothesis-testing procedure that can effectively examine the existence of the causal relationship over continuous variables, without any parametric constraint. 
Our procedure is based on discretization, which under completeness conditions, is able to asymptotically establish a linear equation whose coefficient vector is identifiable under the causal null hypothesis. Based on this, we introduce our test statistic and demonstrate its asymptotic level and power. We validate the effectiveness of our procedure using both synthetic and real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingzhou Liu;Xinwei Sun;Yu QIAO;Yizhou Wang", "authorids": "~Mingzhou_Liu1;~Xinwei_Sun1;~Yu_QIAO3;~Yizhou_Wang1", "gender": "M;M;M;M", "homepage": ";https://sunxinwei0625.github.io/sunxw.github.io/;http://www.pami.sjtu.edu.cn/yuqiao;https://cfcs.pku.edu.cn/wangyizhou/", "dblp": "159/6544-1;145/6592-1;q/YuQiao3;71/3387-1", "google_scholar": ";;hO33bVgAAAAJ;831z_VcAAAAJ", "orcid": "0000-0002-0297-0938;;0000-0001-8258-3868;", "linkedin": ";;;", "or_profile": "~Mingzhou_Liu1;~Xinwei_Sun1;~Yu_QIAO3;~Yizhou_Wang1", "aff": "Peking University;Fudan University;Shanghai Jiaotong University;Peking University", "aff_domain": "pku.edu.cn;fudan.edu.cn;sjtu.edu.cn;pku.edu.cn", "position": "PhD student;Assistant Professor;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nliu2024causal,\ntitle={Causal Discovery via Conditional Independence Testing with Proxy Variables},\nauthor={Mingzhou Liu and Xinwei Sun and Yu QIAO and Yizhou Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l1YbS3qkdk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 965409, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11062136667346799202&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;fudan.edu.cn;sjtu.edu.cn;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Peking University;Fudan University;Shanghai Jiao Tong University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.fudan.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "Peking U;Fudan;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Multi-group Learning for Hierarchical Groups", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33229", "id": "l4H7Hv7LhJ", "proceeding": "https://proceedings.mlr.press/v235/deng24a.html", "pdf": "https://openreview.net/pdf?id=l4H7Hv7LhJ", "openreview": "https://openreview.net/forum?id=l4H7Hv7LhJ", "author_site": "Samuel Deng, Daniel Hsu", "tldr": "", "abstract": "The multi-group learning model formalizes the learning scenario in which a single predictor must generalize well on multiple, possibly overlapping subgroups of interest. We extend the study of multi-group learning to the natural case where the groups are hierarchically structured. We design an algorithm for this setting that outputs an interpretable and deterministic decision tree predictor with near-optimal sample complexity. 
We then conduct an empirical evaluation of our algorithm and find that it achieves attractive generalization properties on real datasets with hierarchical group structure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel Deng;Daniel Hsu", "authorids": "~Samuel_Deng1;~Daniel_Hsu1", "gender": "M;M", "homepage": "https://samuel-deng.github.io/;https://www.cs.columbia.edu/~djhsu/", "dblp": "251/9512;h/DanielHsu.html", "google_scholar": ";Bp6tvy0AAAAJ", "orcid": ";0000-0002-3495-7113", "linkedin": ";", "or_profile": "~Samuel_Deng1;~Daniel_Hsu1", "aff": "Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\ndeng2024multigroup,\ntitle={Multi-group Learning for Hierarchical Groups},\nauthor={Samuel Deng and Daniel Hsu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l4H7Hv7LhJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5675015, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7638721445877074936&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "columbia.edu;columbia.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Turnstile $\\ell_p$ leverage score sampling with applications", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33228", "id": "l4ZjeDDnu9", "proceeding": "https://proceedings.mlr.press/v235/munteanu24b.html", "pdf": "https://openreview.net/pdf?id=l4ZjeDDnu9", "openreview": "https://openreview.net/forum?id=l4ZjeDDnu9", "author_site": "Alexander Munteanu, Simon Omlor", "tldr": "", "abstract": "The turnstile data stream model offers the most flexible framework where data can be manipulated dynamically, i.e., rows, columns, and even single entries of an input matrix can be added, deleted, or updated multiple times in a data stream. We develop a novel algorithm for sampling rows $a_i$ of a matrix $A\\in\\mathbb{R}^{n\\times d}$, proportional to their $\\ell_p$ norm, when $A$ is presented in a turnstile data stream. Our algorithm not only returns the set of sampled row indexes, it also returns slightly perturbed rows $\\tilde{a}_i \\approx a_i$, and approximates their sampling probabilities up to $\\varepsilon$ relative error. When combined with preconditioning techniques, our algorithm extends to $\\ell_p$ leverage score sampling over turnstile data streams. With these properties in place, it allows us to simulate subsampling constructions of coresets for important regression problems to operate over turnstile data streams with very little overhead compared to their respective off-line subsampling algorithms. For logistic regression, our framework yields the first algorithm that achieves a $(1+\\varepsilon)$ approximation and works in a turnstile data stream using polynomial sketch/subsample size, improving over $O(1)$ approximations, or $\\exp(1/\\varepsilon)$ sketch size of previous work. 
We compare experimentally to plain oblivious sketching and plain leverage score sampling algorithms for $\\ell_p$ and logistic regression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Munteanu;Simon Omlor", "authorids": "~Alexander_Munteanu1;~Simon_Omlor1", "gender": "M;", "homepage": "https://biometrie.statistik.tu-dortmund.de/lehrstuhl/team/alexander-munteanu/;https://www.statistik.tu-dortmund.de/omlor.html", "dblp": "145/3380;254/2706.html", "google_scholar": "https://scholar.google.de/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Alexander_Munteanu1;~Simon_Omlor1", "aff": "Universit\u00e4t K\u00f6ln;", "aff_domain": "uni-koeln.de;", "position": "Full Professor;", "bibtex": "@inproceedings{\nmunteanu2024turnstile,\ntitle={Turnstile \\${\\textbackslash}ell\\_p\\$ leverage score sampling with applications},\nauthor={Alexander Munteanu and Simon Omlor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l4ZjeDDnu9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 751231, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3300035642735782736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uni-koeln.de;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Cologne", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-koeln.de/", "aff_unique_abbr": "UC", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "TravelPlanner: A Benchmark for Real-World Planning with Language Agents", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33227", "id": "l5XQzNkAOe", "proceeding": "https://proceedings.mlr.press/v235/xie24j.html", "pdf": "https://openreview.net/pdf?id=l5XQzNkAOe", "openreview": "https://openreview.net/forum?id=l5XQzNkAOe", "author_site": "Jian Xie, Kai Zhang, Jiangjie Chen, Tinghui Zhu, Renze Lou, Yuandong Tian, Yanghua Xiao, Yu Su", "tldr": "", "abstract": "Planning has been part of the core pursuit for artificial intelligence since its conception, but earlier AI agents mostly focused on constrained settings because many of the cognitive substrates necessary for human-level planning have been lacking. Recently, language agents powered by large language models (LLMs) have shown interesting capabilities such as tool use and reasoning. Are these language agents capable of planning in more complex settings that are out of the reach of prior AI agents? To advance this investigation, we propose TravelPlanner, a new planning benchmark that focuses on travel planning, a common real-world planning scenario. It provides a rich sandbox environment, various tools for accessing nearly four million data records, and 1,225 meticulously curated planning intents and reference plans. Comprehensive evaluations show that the current language agents are not yet capable of handling such complex planning tasks\u2014even GPT-4 only achieves a success rate of 0.6%. Language agents struggle to stay on task, use the right tools to collect information, or keep track of multiple constraints. However, we note that the mere possibility for language agents to tackle such a complex problem is in itself non-trivial progress. 
TravelPlanner provides a challenging yet meaningful testbed for future language agents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jian Xie;Kai Zhang;Jiangjie Chen;Tinghui Zhu;Renze Lou;Yuandong Tian;Yanghua Xiao;Yu Su", "authorids": "~Jian_Xie3;~Kai_Zhang10;~Jiangjie_Chen1;~Tinghui_Zhu1;~Renze_Lou1;~Yuandong_Tian1;~Yanghua_Xiao1;~Yu_Su2", "gender": "M;M;M;M;M;M;;M", "homepage": ";https://drogozhang.github.io;https://jiangjiechen.github.io;https://darthzhu.github.io/;https://renzelou.github.io/;http://yuandong-tian.com;;http://ysu1989.github.io", "dblp": ";55/957-33;236/6076;352/8655;296/4744;t/YuandongTian;96/999;38/1070-1", "google_scholar": ";sDnAIsgAAAAJ;https://scholar.google.com.hk/citations?user=XarNs8oAAAAJ;;GVTbSPMAAAAJ;0mgEF28AAAAJ;https://scholar.google.com/citations?hl=zh-CN;rIh5OqoAAAAJ", "orcid": "0009-0000-2867-4726;;;;0000-0002-3273-0097;0000-0003-4202-4847;0000-0001-8403-9591;", "linkedin": ";kai-zhang-43774b196/;;;renze-lou-b681b51a0/;yuandongtian;;", "or_profile": "~Jian_Xie3;~Kai_Zhang10;~Jiangjie_Chen1;~Tinghui_Zhu1;~Renze_Lou1;~Yuandong_Tian1;~Yanghua_Xiao1;~Yu_Su2", "aff": "Fudan University;Google DeepMind;Fudan University;Fudan University;SalesForce.com;Meta AI (FAIR);Fudan University;Microsoft", "aff_domain": "fudan.edu.cn;google.com;fudan.edu.cn;m.fudan.edu.cn;salesforce.com;meta.com;fudan.edu.cn;microsoft.com", "position": "MS student;Student Researcher;PhD student;MS student;Intern;Research Scientist;Full Professor;Senior Researcher", "bibtex": "@inproceedings{\nxie2024travelplanner,\ntitle={TravelPlanner: A Benchmark for Real-World Planning with Language Agents},\nauthor={Jian Xie and Kai Zhang and Jiangjie Chen and Tinghui Zhu and Renze Lou and Yuandong Tian and Yanghua Xiao and Yu Su},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l5XQzNkAOe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3206150, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2838235609560560648&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "fudan.edu.cn;google.com;fudan.edu.cn;m.fudan.edu.cn;salesforce.com;meta.com;fudan.edu.cn;microsoft.com", "author_num": 8, "aff_unique_index": "0;1;0;0;2;3;0;4", "aff_unique_norm": "Fudan University;Google;Salesforce;Meta;Microsoft", "aff_unique_dep": ";Google DeepMind;;Facebook AI Research (FAIR);Microsoft Corporation", "aff_unique_url": "https://www.fudan.edu.cn;https://deepmind.com;https://www.salesforce.com;https://ai.facebook.com;https://www.microsoft.com", "aff_unique_abbr": "Fudan;DeepMind;Salesforce;Meta AI;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;2;2;0;2", "aff_country_unique": "China;United Kingdom;United States" }, { "title": "Scalable Multiple Kernel Clustering: Learning Clustering Structure from Expectation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33226", "id": "l5lgbVR6BP", "proceeding": "https://proceedings.mlr.press/v235/liang24g.html", "pdf": "https://openreview.net/pdf?id=l5lgbVR6BP", "openreview": "https://openreview.net/forum?id=l5lgbVR6BP", "author_site": "Weixuan Liang, En Zhu, Shengju Yu, Huiying Xu, Xinzhong Zhu, Xinwang Liu", "tldr": "", "abstract": "In this paper, we derive an upper bound of the difference between a kernel matrix and its expectation under a mild assumption. 
Specifically, we assume that the true distribution of the training data is an unknown isotropic Gaussian distribution. When the kernel function is a Gaussian kernel, and the mean of each cluster is sufficiently separated, we find that the expectation of a kernel matrix can be close to a rank-$k$ matrix, where $k$ is the cluster number. Moreover, we prove that the normalized kernel matrix of the training set deviates (w.r.t. Frobenius norm) from its expectation in the order of $\\widetilde{\\mathcal{O}}(1/\\sqrt{d})$, where $d$ is the dimension of samples. Based on the above theoretical results, we propose a novel multiple kernel clustering framework which attempts to learn the information of the expectation kernel matrices. First, we aim to minimize the distance between each base kernel and a rank-$k$ matrix, which is a proxy of the expectation kernel. Then, we fuse these rank-$k$ matrices into a consensus rank-$k$ matrix to find the clustering structure. Using an anchor-based method, the proposed framework is flexible with the sizes of input kernel matrices and able to handle large-scale datasets. We also provide the approximation guarantee by deriving two non-asymptotic bounds for the consensus kernel and clustering indicator matrices. Finally, we conduct extensive experiments to verify the clustering performance of the proposed method and the correctness of the proposed theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weixuan Liang;En Zhu;Shengju Yu;Huiying Xu;Xinzhong Zhu;Xinwang Liu", "authorids": "~Weixuan_Liang1;~En_Zhu1;~Shengju_Yu1;xhy@zjnu.edu.cn;~Xinzhong_Zhu1;~Xinwang_Liu1", "gender": "M;M;;;;M", "homepage": ";https://www.researchgate.net/profile/En_Zhu;;;;https://xinwangliu.github.io/", "dblp": "274/1152;30/1307;;;;45/6569-2.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;;A56vWC4AAAAJ", "orcid": "0000-0002-1868-5445;;;;;", "linkedin": ";;;;;", "or_profile": "~Weixuan_Liang1;~En_Zhu1;~Shengju_Yu1;xhy@zjnu.edu.cn;~Xinzhong_Zhu1;~Xinwang_Liu1", "aff": "National University of Defense Technology;National University of Defense Technology;;;;National University of Defense Technology", "aff_domain": "nudt.edu.cn;nudt.edu.cn;;;;nudt.edu.cn", "position": "PhD student;Full Professor;;;;Full Professor", "bibtex": "@inproceedings{\nliang2024scalable,\ntitle={Scalable Multiple Kernel Clustering: Learning Clustering Structure from Expectation},\nauthor={Weixuan Liang and En Zhu and Shengju Yu and Huiying Xu and Xinzhong Zhu and Xinwang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l5lgbVR6BP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 615881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8709079639034775415&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "nudt.edu.cn;nudt.edu.cn;;;;nudt.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Defense Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.nudt.edu.cn/", "aff_unique_abbr": "NUDT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "PIPER: Primitive-Informed Preference-based Hierarchical Reinforcement Learning via Hindsight Relabeling", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33225", "id": "l6Hef6FVd0", "proceeding": "https://proceedings.mlr.press/v235/singh24e.html", "pdf": "https://openreview.net/pdf?id=l6Hef6FVd0", "openreview": "https://openreview.net/forum?id=l6Hef6FVd0", "author_site": "Utsav Singh, Wesley A. Suttle, Brian Sadler, Vinay Namboodiri, Amrit Singh Bedi", "tldr": "", "abstract": "In this work, we introduce PIPER: Primitive-Informed Preference-based Hierarchical reinforcement learning via Hindsight Relabeling, a novel approach that leverages preference-based learning to learn a reward model, and subsequently uses this reward model to relabel higher-level replay buffers. Since this reward is unaffected by lower primitive behavior, our relabeling-based approach is able to mitigate non-stationarity, which is common in existing hierarchical approaches, and demonstrates impressive performance across a range of challenging sparse-reward tasks. Since obtaining human feedback is typically impractical, we propose to replace the human-in-the-loop approach with our primitive-in-the-loop approach, which generates feedback using sparse rewards provided by the environment. Moreover, in order to prevent infeasible subgoal prediction and avoid degenerate solutions, we propose primitive-informed regularization that conditions higher-level policies to generate feasible subgoals for lower-level policies. We perform extensive experiments to show that PIPER mitigates non-stationarity in hierarchical reinforcement learning and achieves greater than 50$\\\\%$ success rates in challenging, sparse-reward robotic environments, where most other baselines fail to achieve any significant progress.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Utsav Singh;Wesley A Suttle;Brian M. Sadler;Vinay P. Namboodiri;Amrit Bedi", "authorids": "~Utsav_Singh1;~Wesley_A_Suttle1;~Brian_M._Sadler1;~Vinay_P_Namboodiri1;~Amrit_Bedi1", "gender": "M;M;M;M;", "homepage": "https://www.cse.iitk.ac.in/users/utsavz/;https://oden.utexas.edu/people/directory/Brian-Sadler/;https://vinaypn.github.io;https://sites.google.com/view/amritsinghbedi/home;http://www.wesleysuttle.com", "dblp": "241/9336;26/3347;95/6599;176/2707.html;238/0223", "google_scholar": ";s9eCQn4AAAAJ;https://scholar.google.co.in/citations?user=JyHi9OoAAAAJ;91WLA6QAAAAJ;Tf6oDygAAAAJ", "orcid": ";0000-0002-9564-3812;0000-0001-5262-9722;;", "linkedin": ";brian-sadler-5909102a/;;;", "or_profile": "~Utsav_Singh1;~Brian_M._Sadler1;~Vinay_P_Namboodiri1;~Amrit_Bedi1;~Wesley_Suttle1", "aff": "Indian Institute of Technology, Kanpur;US Army Research Laboratory;University of Bath;University of Maryland, College Park;Army Research Laboratory", "aff_domain": "iitk.ac.in;army.mil;bath.ac.uk;umd.edu;army.mil", "position": "PhD student;Principal Researcher;Associate Professor;Researcher;Postdoc", "bibtex": "@inproceedings{\nsingh2024piper,\ntitle={{PIPER}: Primitive-Informed Preference-based Hierarchical Reinforcement Learning via Hindsight Relabeling},\nauthor={Utsav Singh and Wesley A Suttle and Brian M. Sadler and Vinay P. 
Namboodiri and Amrit Bedi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l6Hef6FVd0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7213247, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15191840823157027785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "iitk.ac.in;army.mil;bath.ac.uk;umd.edu;army.mil", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Indian Institute of Technology Kanpur;US Army Research Laboratory;University of Bath;University of Maryland;Army Research Laboratory", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.iitk.ac.in;https://www.arl.army.mil;https://www.bath.ac.uk;https://www.umd.edu;https://www.arl.army.mil", "aff_unique_abbr": "IIT Kanpur;ARL;Bath;UMD;ARL", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Kanpur;;College Park", "aff_country_unique_index": "0;1;2;1;1", "aff_country_unique": "India;United States;United Kingdom" }, { "title": "Self-Alignment of Large Language Models via Monopolylogue-based Social Scene Simulation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33224", "id": "l7shXGuGBT", "proceeding": "https://proceedings.mlr.press/v235/pang24a.html", "pdf": "https://openreview.net/pdf?id=l7shXGuGBT", "openreview": "https://openreview.net/forum?id=l7shXGuGBT", "author_site": "Xianghe Pang, shuo tang, Rui Ye, Yuxin Xiong, Bolun Zhang, Yanfeng Wang, Siheng Chen", "tldr": "", "abstract": "Aligning large language models (LLMs) with human values is imperative to mitigate potential adverse effects resulting from their misuse. Drawing from the sociological insight that acknowledging all parties' concerns is a key factor in shaping human values, this paper proposes a novel direction to align LLMs by themselves: social scene simulation. To achieve this, we present MATRIX, a novel social scene simulator that emulates realistic scenes around a user's input query, enabling the LLM to take social consequences into account before responding. MATRIX serves as a virtual rehearsal space, akin to a Monopolylogue, where the LLM performs diverse roles related to the query and practices by itself. To inject this alignment, we fine-tune the LLM with MATRIX-simulated data, ensuring adherence to human values without compromising inference speed. We theoretically show that the LLM with MATRIX outperforms existing methods under mild assumptions. Finally, extensive experiments validate that our method outperforms over 10 baselines across 4 benchmarks. As evidenced by 875 user ratings, our tuned 13B-size LLM exceeds GPT-4 in aligning with human values.
See our project page at https://shuotang123.github.io/MATRIX.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xianghe Pang;Shuo Tang;Rui Ye;Yuxin Xiong;Bolun Zhang;Yanfeng Wang;Siheng Chen", "authorids": "~Xianghe_Pang1;~Shuo_Tang2;~Rui_Ye1;~Yuxin_Xiong1;~Bolun_Zhang2;~Yanfeng_Wang1;~Siheng_Chen1", "gender": "M;M;M;F;M;M;M", "homepage": "https://scholar.google.com/citations?user=Z5PhjLsAAAAJ&hl=zh-CN&oi=ao;;http://rui-ye.github.io/;;https://github.com/bolunzhangQwQ;https://cmic.sjtu.edu.cn/wangyanfeng/;https://siheng-chen.github.io/", "dblp": ";;;357/5881.html;;55/5407-1.html;136/4945", "google_scholar": ";;Q4-VTxcAAAAJ;T_v5hCYAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;0000-0002-3196-2347;", "linkedin": ";shuo-tang-6935a4236/;;yuxin-xiong-b325652b5/;;;", "or_profile": "~Xianghe_Pang1;~Shuo_Tang2;~Rui_Ye1;~Yuxin_Xiong1;~Bolun_Zhang2;~Yanfeng_Wang1;~Siheng_Chen2", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;PhD student;PhD student;Undergrad student;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\npang2024selfalignment,\ntitle={Self-Alignment of Large Language Models via Monopolylogue-based Social Scene Simulation},\nauthor={Xianghe Pang and Shuo Tang and Rui Ye and Yuxin Xiong and Bolun Zhang and Yanfeng Wang and Siheng Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l7shXGuGBT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1699972, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4918889969490305730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On the Feasibility of Single-Pass Full-Capacity Learning in Linear Threshold Neurons with Binary Input Vectors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33223", "id": "l7vQQi0I2d", "proceeding": "https://proceedings.mlr.press/v235/liu24x.html", "pdf": "https://openreview.net/pdf?id=l7vQQi0I2d", "openreview": "https://openreview.net/forum?id=l7vQQi0I2d", "author_site": "Ruipeng Liu, Borui He, Naveed Tahir, Garrett Katz", "tldr": "", "abstract": "Known learning rules tend to fall near one of two extremes: single-pass associative learning with low complexity and capacity, and multi-pass iterative learning with high complexity and capacity. In this work we investigate the mathematical feasibility of learning rules that are both single-pass and achieve the theoretical upper bound on capacity. 
We consider a fairly broad family of learning rules we call ``span rules,'' which include known rules such as Hebbian learning, perceptron learning, and backpropagation as special cases. To our knowledge, previous work has not determined whether single-pass, full-capacity span rules exist, even in the most fundamental case of a linear threshold neuron with binary input vectors, which is the focus of this study. We derive a necessary condition for the existence of such learning rules, which takes the form of a linear program, and show that the linear program is infeasible. This establishes an impossibility result that span rules can not be both single-pass and full-capacity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruipeng Liu;Borui He;Naveed Tahir;Garrett Ethan Katz", "authorids": "~Ruipeng_Liu1;~Borui_He1;~Naveed_Tahir1;~Garrett_Ethan_Katz1", "gender": "M;M;;M", "homepage": ";;;https://web.ecs.syr.edu/~gkatz01/", "dblp": ";;;163/3827", "google_scholar": ";dK1xwPIAAAAJ;;bHIqHwEAAAAJ", "orcid": ";;;0000-0002-5036-8394", "linkedin": "ruipeng-liu-72663b149/;;;", "or_profile": "~Ruipeng_Liu1;~Borui_He1;~Naveed_Tahir1;~Garrett_Ethan_Katz1", "aff": "Syracuse University;Syracuse University;;Syracuse University", "aff_domain": "syr.edu;syr.edu;;syr.edu", "position": "PhD student;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nliu2024on,\ntitle={On the Feasibility of Single-Pass Full-Capacity Learning in Linear Threshold Neurons with Binary Input Vectors},\nauthor={Ruipeng Liu and Borui He and Naveed Tahir and Garrett Ethan Katz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l7vQQi0I2d}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 704775, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PWZnYNfaIWgJ:scholar.google.com/&scioq=On+the+Feasibility+of+Single-Pass+Full-Capacity+Learning+in+Linear+Threshold+Neurons+with+Binary+Input+Vectors&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "syr.edu;syr.edu;;syr.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Syracuse University", "aff_unique_dep": "", "aff_unique_url": "https://www.syracuse.edu", "aff_unique_abbr": "Syracuse", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Stochastic Natural Gradient Variational Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33222", "id": "l8GrPpsZfy", "proceeding": "https://proceedings.mlr.press/v235/wu24f.html", "pdf": "https://openreview.net/pdf?id=l8GrPpsZfy", "openreview": "https://openreview.net/forum?id=l8GrPpsZfy", "author_site": "Kaiwen Wu, Jacob Gardner", "tldr": "", "abstract": "Stochastic natural gradient variational inference (NGVI) is a popular posterior inference method with applications in various probabilistic models. Despite its wide usage, little is known about the non-asymptotic convergence rate in the *stochastic* setting. We aim to lessen this gap and provide a better understanding. For conjugate likelihoods, we prove the first $\\mathcal{O}(\\frac{1}{T})$ non-asymptotic convergence rate of stochastic NGVI. The complexity is no worse than stochastic gradient descent (a.k.a. 
black-box variational inference) and the rate likely has better constant dependency that leads to faster convergence in practice. For non-conjugate likelihoods, we show that stochastic NGVI with the canonical parameterization implicitly optimizes a non-convex objective. Thus, a global convergence rate of $\\mathcal{O}(\\frac{1}{T})$ is unlikely without some significant new understanding of optimizing the ELBO using natural gradients.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaiwen Wu;Jacob R. Gardner", "authorids": "~Kaiwen_Wu2;~Jacob_R._Gardner1", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Kaiwen_Wu2;~Jacob_R._Gardner1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwu2024understanding,\ntitle={Understanding Stochastic Natural Gradient Variational Inference},\nauthor={Kaiwen Wu and Jacob R. Gardner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l8GrPpsZfy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1211236, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3073845603162228540&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": ";", "author_num": 2 }, { "title": "Feel-Good Thompson Sampling for Contextual Dueling Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33221", "id": "l9ga3iQuHt", "proceeding": "https://proceedings.mlr.press/v235/li24co.html", "pdf": "https://openreview.net/pdf?id=l9ga3iQuHt", "openreview": "https://openreview.net/forum?id=l9ga3iQuHt", "author_site": "Xuheng Li, Heyang Zhao, Quanquan Gu", "tldr": "", "abstract": "Contextual dueling bandits, where a learner compares two options based on context and receives feedback indicating which was preferred, extends classic dueling bandits by incorporating contextual information for decision-making and preference learning. Several algorithms based on the upper confidence bound (UCB) have been proposed for linear contextual dueling bandits. However, no algorithm based on posterior sampling has been developed in this setting, despite the empirical success observed in traditional contextual bandits. In this paper, we propose a Thompson sampling algorithm, named FGTS.CDB, for linear contextual dueling bandits. At the core of our algorithm is a new Feel-Good exploration term specifically tailored for dueling bandits. This term leverages the independence of the two selected arms, thereby avoiding a cross term in the analysis. We show that our algorithm achieves nearly minimax-optimal regret, i.e., $\\tilde{\\mathcal{O}}(d\\sqrt T)$, where $d$ is the model dimension and $T$ is the time horizon. 
Finally, we evaluate our algorithm on synthetic data and observe that FGTS.CDB outperforms existing algorithms by a large margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuheng Li;Heyang Zhao;Quanquan Gu", "authorids": "~Xuheng_Li1;~Heyang_Zhao1;~Quanquan_Gu1", "gender": "M;M;M", "homepage": "http://www.pku.edu.cn;https://web.cs.ucla.edu/~hyzhao/;http://web.cs.ucla.edu/~qgu/", "dblp": "330/7681;;50/4597", "google_scholar": ";zHQ1ap0AAAAJ;GU9HgNAAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xuheng_Li1;~Heyang_Zhao1;~Quanquan_Gu1", "aff": "ByteDance Inc.;Computer Science Department, University of California, Los Angeles;University of California, Los Angeles", "aff_domain": "bytedance.com;cs.ucla.edu;cs.ucla.edu", "position": "Intern;PhD student;Associate Professor", "bibtex": "@inproceedings{\nli2024feelgood,\ntitle={Feel-Good Thompson Sampling for Contextual Dueling Bandits},\nauthor={Xuheng Li and Heyang Zhao and Quanquan Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=l9ga3iQuHt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1673727, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10704676440946163734&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "bytedance.com;cs.ucla.edu;cs.ucla.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "ByteDance;University of California, Los Angeles", "aff_unique_dep": ";Computer Science Department", "aff_unique_url": "https://www.bytedance.com;https://www.ucla.edu", "aff_unique_abbr": "ByteDance;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Coarse-To-Fine Tensor Trains for Compact Visual Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33220", "id": "lGZUvfP2ZF", "proceeding": "https://proceedings.mlr.press/v235/loeschcke24a.html", "pdf": "https://openreview.net/pdf?id=lGZUvfP2ZF", "openreview": "https://openreview.net/forum?id=lGZUvfP2ZF", "author_site": "Sebastian Loeschcke, Dan Wang, Christian Leth-Espensen, Serge Belongie, Michael Kastoryano, Sagie Benaim", "tldr": "", "abstract": "The ability to learn compact, high-quality, and easy-to-optimize representations for visual data is paramount to many applications such as novel view synthesis and 3D reconstruction. Recent work has shown substantial success in using tensor networks to design such compact and high-quality representations. However, the ability to optimize tensor-based representations, and in particular, the highly compact tensor train representation, is still lacking. This has prevented practitioners from deploying the full potential of tensor networks for visual data. To this end, we propose 'Prolongation Upsampling Tensor Train (PuTT)', a novel method for learning tensor train representations in a coarse-to-fine manner. Our method involves the prolonging or `upsampling' of a learned tensor train representation, creating a sequence of 'coarse-to-fine' tensor trains that are incrementally refined. We evaluate our representation along three axes: (1). compression, (2). denoising capability, and (3). image completion capability. 
To assess these axes, we consider the tasks of image fitting, 3D fitting, and novel view synthesis, where our method shows an improved performance compared to state-of-the-art tensor-based methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sebastian Bugge Loeschcke;Dan Wang;Christian Munklinde Leth-Espensen;Serge Belongie;Michael Kastoryano;Sagie Benaim", "authorids": "~Sebastian_Bugge_Loeschcke1;~Dan_Wang3;~Christian_Munklinde_Leth-Espensen1;~Serge_Belongie1;~Michael_Kastoryano1;~Sagie_Benaim1", "gender": "M;F;M;M;M;M", "homepage": "https://sebulo.github.io/;;;https://di.ku.dk/english/staff/?pure=en%2Fpersons%2Fserge-belongie(0ce65383-3761-4b17-948a-83b461e371e2)%2Fpublications.html;https://mkastoryano.com/;https://sagiebenaim.github.io/", "dblp": "267/7543;;;http://dblp.uni-trier.de/pers/hd/b/Belongie:Serge_J=;;129/1316", "google_scholar": "_aM-ud8AAAAJ;tHbMyNoAAAAJ;;ORr4XJYAAAAJ;https://scholar.google.dk/citations?user=2roUPxkAAAAJ;-zSM2I8AAAAJ", "orcid": ";;;0000-0002-0388-5217;;0000-0003-0002-3467", "linkedin": "sebastian-loeschcke/;;christian-leth-espensen-01bb86208;sergebelongie;;sagie-benaim-aab47474/", "or_profile": "~Sebastian_Bugge_Loeschcke1;~Dan_Wang3;~Christian_Munklinde_Leth-Espensen1;~Serge_Belongie1;~Michael_Kastoryano1;~Sagie_Benaim1", "aff": "University of Copenhagen;University of Copenhagen;;University of Copenhagen;IT University of Copenhagen;Hebrew University of Jerusalem", "aff_domain": "diku.dk;di.ku;;ku.dk;itu.dk;huji.ac.il", "position": "PhD student;Postdoc;;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nloeschcke2024coarsetofine,\ntitle={Coarse-To-Fine Tensor Trains for Compact Visual Representations},\nauthor={Sebastian Bugge Loeschcke and Dan Wang and Christian Munklinde Leth-Espensen and Serge Belongie and Michael Kastoryano and Sagie Benaim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lGZUvfP2ZF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8679386, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12572951063962363028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "diku.dk;di.ku;;ku.dk;itu.dk;huji.ac.il", "author_num": 6, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Copenhagen;IT University of Copenhagen;Hebrew University of Jerusalem", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ku.dk;https://itu.dk;https://www.huji.ac.il", "aff_unique_abbr": "UCPH;ITU;HUJI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Jerusalem", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "Denmark;Israel" }, { "title": "Which Frequencies do CNNs Need? 
Emergent Bottleneck Structure in Feature Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33219", "id": "lGvIV4Bgsz", "proceeding": "https://proceedings.mlr.press/v235/wen24d.html", "pdf": "https://openreview.net/pdf?id=lGvIV4Bgsz", "openreview": "https://openreview.net/forum?id=lGvIV4Bgsz", "author_site": "Yuxiao Wen, Arthur Jacot", "tldr": "", "abstract": "We describe the emergence of a Convolution Bottleneck (CBN) structure in CNNs, where the network uses its first few layers to transform the input representation into a representation that is supported only along a few frequencies and channels, before using the last few layers to map back to the outputs. We define the CBN rank, which describes the number and type of frequencies that are kept inside the bottleneck, and partially prove that the parameter norm required to represent a function $f$ scales as depth times the CBN rank $f$. We also show that the parameter norm depends at next order on the regularity of $f$. We show that any network with almost optimal parameter norm will exhibit a CBN structure in both the weights and - under the assumption that the network is stable under large learning rate - the activations, which motivates the common practice of down-sampling; and we verify that the CBN results still hold with down-sampling. Finally we use the CBN structure to interpret the functions learned by CNNs on a number of tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxiao Wen;Arthur Jacot", "authorids": "~Yuxiao_Wen1;~Arthur_Jacot1", "gender": "M;M", "homepage": ";", "dblp": "298/1362;222/2747", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.ch/citations?user=G6OhFawAAAAJ", "orcid": ";", "linkedin": "yuxiao-wen-4b3162161/;", "or_profile": "~Yuxiao_Wen1;~Arthur_Jacot1", "aff": "New York University;NYU, New York University", "aff_domain": "nyu.edu;cims.nyu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwen2024which,\ntitle={Which Frequencies do {CNN}s Need? 
Emergent Bottleneck Structure in Feature Learning},\nauthor={Yuxiao Wen and Arthur Jacot},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lGvIV4Bgsz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 835620, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14662024149480738926&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "nyu.edu;cims.nyu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "HelmFluid: Learning Helmholtz Dynamics for Interpretable Fluid Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33218", "id": "lHJFfDFbm6", "proceeding": "https://proceedings.mlr.press/v235/xing24c.html", "pdf": "https://openreview.net/pdf?id=lHJFfDFbm6", "openreview": "https://openreview.net/forum?id=lHJFfDFbm6", "author_site": "Lanxiang Xing, Haixu Wu, yuezhou ma, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Fluid prediction is a long-standing challenge due to the intrinsic high-dimensional non-linear dynamics. Previous methods usually utilize the non-linear modeling capability of deep models to directly estimate velocity fields for future prediction. However, skipping over inherent physical properties and directly learning superficial velocity fields prevents the model from generating precise or physics-reliable results. In this paper, we propose HelmFluid, an accurate and interpretable predictor for fluids. Inspired by the Helmholtz theorem, we design a HelmDynamics block to learn Helmholtz dynamics, which decomposes fluid dynamics into more solvable curl-free and divergence-free parts, physically corresponding to potential and stream functions of fluid. By embedding the HelmDynamics block into a Multiscale Multihead Integral Architecture, HelmFluid can integrate learned Helmholtz dynamics along the temporal dimension at multiple spatial scales to predict future fluid states. Compared with previous velocity estimating methods, HelmFluid is faithfully derived from the Helmholtz theorem and unravels complex fluid dynamics with physically interpretable evidence. 
Experimentally, HelmFluid achieves consistent state-of-the-art performance on both numerically simulated and real-world observed benchmarks, even for scenarios with complex boundaries.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lanxiang Xing;Haixu Wu;Yuezhou Ma;Jianmin Wang;Mingsheng Long", "authorids": "~Lanxiang_Xing2;~Haixu_Wu1;~Yuezhou_Ma1;~Jianmin_Wang1;~Mingsheng_Long5", "gender": "M;M;M;M;M", "homepage": "https://github.com/BluesCrossing;;https://github.com/mayz20;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;http://ise.thss.tsinghua.edu.cn/~mlong", "dblp": ";286/8115;359/0553;06/3456-1.html;74/9023", "google_scholar": ";oLL_x0wAAAAJ;;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;_MjXpXkAAAAJ", "orcid": "0000-0001-5928-3242;;;0000-0001-6841-7943;0000-0002-5412-9120", "linkedin": ";;;;", "or_profile": "~Lanxiang_Xing2;~Haixu_Wu1;~Yuezhou_Ma1;~Jianmin_Wang1;~Mingsheng_Long2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;PhD student;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nxing2024helmfluid,\ntitle={HelmFluid: Learning Helmholtz Dynamics for Interpretable Fluid Prediction},\nauthor={Lanxiang Xing and Haixu Wu and Yuezhou Ma and Jianmin Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lHJFfDFbm6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10236604, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13999775206158131553&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Robust Stable Spiking Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33217", "id": "lIYtJtpJR0", "proceeding": "https://proceedings.mlr.press/v235/ding24e.html", "pdf": "https://openreview.net/pdf?id=lIYtJtpJR0", "openreview": "https://openreview.net/forum?id=lIYtJtpJR0", "author_site": "Ding Jianhao, Zhiyu Pan, Yujia Liu, Zhaofei Yu, Tiejun Huang", "tldr": "", "abstract": "Spiking neural networks (SNNs) are gaining popularity in deep learning due to their low energy budget on neuromorphic hardware. However, they still lack sufficient robustness to guard safety-critical applications such as autonomous driving. Many studies have been conducted to defend SNNs from the threat of adversarial attacks. This paper aims to uncover the robustness of SNNs through the lens of the stability of nonlinear systems. We are inspired by the fact that searching for parameters altering the leaky integrate-and-fire dynamics can enhance their robustness. Thus, we dive into the dynamics of membrane potential perturbation and simplify the formulation of the dynamics. 
We show that membrane potential perturbation dynamics can reliably convey the intensity of the perturbation. Our theoretical analyses imply that the simplified perturbation dynamics satisfy input-output stability. Thus, we propose a training framework that modifies SNN neurons to reduce the mean square of the membrane potential perturbation, aiming to enhance the robustness of SNNs. Finally, we experimentally verify the effectiveness of the framework in the setting of Gaussian noise training and adversarial training on the image classification task. Please refer to https://github.com/DingJianhao/stable-snn for our code implementation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianhao Ding;Zhiyu Pan;Yujia Liu;Zhaofei Yu;Tiejun Huang", "authorids": "~Jianhao_Ding1;~Zhiyu_Pan3;~Yujia_Liu1;~Zhaofei_Yu1;~Tiejun_Huang1", "gender": "M;M;F;M;M", "homepage": "https://dingjianhao.github.io/;https://gitee.com/zhiyu02;;https://yuzhaofei.github.io;https://idm.pku.edu.cn/~tjhuang/", "dblp": "128/2534;;42/10221.html;166/0573;h/TiejunHuang", "google_scholar": "4rDfCSsAAAAJ;;iDyKEuwAAAAJ;qaUgD50AAAAJ;https://scholar.google.com.tw/citations?user=knvEK4AAAAAJ", "orcid": ";;0000-0001-7356-3937;;0000-0002-4234-6099", "linkedin": ";;;;", "or_profile": "~Jianhao_Ding1;~Zhiyu_Pan3;~Yujia_Liu1;~Zhaofei_Yu1;~Tiejun_Huang1", "aff": "Institute of Automation, Chinese Academy of Sciences;Peking University;Peking University;Peking University;Peking University", "aff_domain": "ia.ac.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Intern;Intern;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nding2024robust,\ntitle={Robust Stable Spiking Neural Networks},\nauthor={Jianhao Ding and Zhiyu Pan and Yujia Liu and Zhaofei Yu and Tiejun Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lIYtJtpJR0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1444284, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13323900389971377837&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ia.ac.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Chinese Academy of Sciences;Peking University", "aff_unique_dep": "Institute of Automation;", "aff_unique_url": "http://www.ia.cas.cn;http://www.pku.edu.cn", "aff_unique_abbr": "CAS;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Inferring the Long-Term Causal Effects of Long-Term Treatments from Short-Term Experiments", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33216", "id": "lQ2o7JteMO", "proceeding": "https://proceedings.mlr.press/v235/tran24b.html", "pdf": "https://openreview.net/pdf?id=lQ2o7JteMO", "openreview": "https://openreview.net/forum?id=lQ2o7JteMO", "author_site": "Allen Tran, Aurelien Bibaut, Nathan Kallus", "tldr": "", "abstract": "We study inference on the long-term causal effect of a continual exposure to a novel intervention, which we term a long-term treatment, based on an experiment involving only short-term observations. Key examples include the long-term health effects of regularly-taken medicine or of environmental hazards and the long-term effects on users of changes to an online platform. 
This stands in contrast to short-term treatments or \"shocks,\" whose long-term effect can reasonably be mediated by short-term observations, enabling the use of surrogate methods. Long-term treatments by definition have direct effects on long-term outcomes via continual exposure, so surrogacy conditions cannot reasonably hold. We connect the problem with offline reinforcement learning, leveraging doubly-robust estimators to estimate long-term causal effects for long-term treatments and construct confidence intervals.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Allen Tran;Aurelien Bibaut;Nathan Kallus", "authorids": "~Allen_Tran1;~Aurelien_Bibaut1;~Nathan_Kallus1", "gender": ";;", "homepage": "http://allentran.github.io/;https://www.stat.berkeley.edu/~aurelien.bibaut/;http://nathankallus.com/", "dblp": ";https://dblp.uni-trier.de/pers/hd/b/Bibaut:Aur=eacute=lien;142/2900", "google_scholar": ";;K2WfIlsAAAAJ", "orcid": ";;0000-0003-1672-0507", "linkedin": ";;", "or_profile": "~Allen_Tran1;~Aurelien_Bibaut1;~Nathan_Kallus1", "aff": "NetFlix;;Cornell University", "aff_domain": "netflix.com;;cornell.edu", "position": "Principal Researcher;;Associate Professor", "bibtex": "@inproceedings{\ntran2024inferring,\ntitle={Inferring the Long-Term Causal Effects of Long-Term Treatments from Short-Term Experiments},\nauthor={Allen Tran and Aurelien Bibaut and Nathan Kallus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lQ2o7JteMO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 383601, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1193457748281130234&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "netflix.com;;cornell.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Netflix;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.netflix.com;https://www.cornell.edu", "aff_unique_abbr": "Netflix;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "GaussianPro: 3D Gaussian Splatting with Progressive Propagation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33215", "id": "lQ3SEBH1gF", "proceeding": "https://proceedings.mlr.press/v235/cheng24f.html", "pdf": "https://openreview.net/pdf?id=lQ3SEBH1gF", "openreview": "https://openreview.net/forum?id=lQ3SEBH1gF", "author_site": "Kai Cheng, Xiaoxiao Long, Kaizhi Yang, Yao Yao, Wei Yin, Yuexin Ma, Wenping Wang, Xuejin Chen", "tldr": "", "abstract": "3D Gaussian Splatting (3DGS) has recently revolutionized the field of neural rendering with its high fidelity and efficiency. However, 3DGS heavily depends on the initialized point cloud produced by Structure-from-Motion (SfM) techniques. When tackling large-scale scenes that unavoidably contain texture-less surfaces, SfM techniques fail to produce enough points in these surfaces and cannot provide good initialization for 3DGS. As a result, 3DGS suffers from difficult optimization and low-quality renderings. In this paper, inspired by classic multi-view stereo (MVS) techniques, we propose GaussianPro, a novel method that applies a progressive propagation strategy to guide the densification of the 3D Gaussians. 
Compared to the simple split and clone strategies used in 3DGS, our method leverages the priors of the existing reconstructed geometries of the scene and utilizes patch matching to produce new Gaussians with accurate positions and orientations. Experiments on both large-scale and small-scale scenes validate the effectiveness of our method. Our method significantly surpasses 3DGS on the Waymo dataset, exhibiting an improvement of 1.15dB in terms of PSNR. Codes and data are available at https://github.com/kcheng1021/GaussianPro.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Cheng;Xiaoxiao Long;Kaizhi Yang;Yao Yao;Wei Yin;Yuexin Ma;Wenping Wang;Xuejin Chen", "authorids": "~Kai_Cheng1;~Xiaoxiao_Long2;~Kaizhi_Yang1;~Yao_Yao1;~Wei_Yin2;~Yuexin_Ma2;~Wenping_Wang1;~Xuejin_Chen1", "gender": "M;;M;M;M;F;M;", "homepage": "https://cklibra.github.io/;;https://silenkzyoung.github.io/KaizhiYang/;https://yoyo000.github.io/;https://yvanyin.net/;http://yuexinma.me/aboutme.html;https://engineering.tamu.edu/cse/profiles/Wang-Wenping.html;", "dblp": "23/2177;;;07/4410-8;67/4051-6;209/5925;;", "google_scholar": "LeDSFrAAAAAJ;;;MGxaDVEAAAAJ;ZIf_rtcAAAAJ;;28shvv0AAAAJ;", "orcid": ";;;;;;0000-0002-2284-3952;", "linkedin": ";;;;;;;", "or_profile": "~Kai_Cheng1;~Xiaoxiao_Long2;~Kaizhi_Yang1;~Yao_Yao1;~Wei_Yin2;~Yuexin_Ma2;~Wenping_Wang1;~Xuejin_Chen1", "aff": "University of Science and Technology of China;;University of Science and Technology of China;Nanjing University; Shenzhen DJI Sciences and Technologies Ltd.;ShanghaiTech University;Texas A&M University - College Station;", "aff_domain": "ustc.edu.cn;;ustc.edu.cn;nju.edu.cn;dji.com;shanghaitech.edu.cn;tamu.edu;", "position": "PhD student;;PhD student;Associate Professor;Researcher;Assistant Professor;Full Professor;", "bibtex": "@inproceedings{\ncheng2024gaussianpro,\ntitle={GaussianPro: 3D Gaussian Splatting with Progressive Propagation},\nauthor={Kai Cheng and Xiaoxiao Long and Kaizhi Yang and Yao Yao and Wei Yin and Yuexin Ma and Wenping Wang and Xuejin Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lQ3SEBH1gF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8757404, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4404315513587124942&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ustc.edu.cn;;ustc.edu.cn;nju.edu.cn;dji.com;shanghaitech.edu.cn;tamu.edu;", "author_num": 8, "aff_unique_index": "0;0;1;2;3;4", "aff_unique_norm": "University of Science and Technology of China;Nanjing University;DJI Sciences and Technologies;ShanghaiTech University;Texas A&M University", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nju.edu.cn;https://www.dji.com;https://www.shanghaitech.edu.cn;https://www.tamu.edu", "aff_unique_abbr": "USTC;Nanjing U;DJI;ShanghaiTech;TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Counterfactual Reasoning for Multi-Label Image Classification via Patching-Based Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33214", "id": "lQIN9ZyMLz", "proceeding": "https://proceedings.mlr.press/v235/xie24i.html", "pdf": "https://openreview.net/pdf?id=lQIN9ZyMLz", "openreview": 
"https://openreview.net/forum?id=lQIN9ZyMLz", "author_site": "Ming-Kun Xie, Jia-Hao Xiao, Pei Peng, Gang Niu, Masashi Sugiyama, Sheng-Jun Huang", "tldr": "", "abstract": "The key to multi-label image classification (MLC) is to improve model performance by leveraging label correlations. Unfortunately, it has been shown that overemphasizing co-occurrence relationships can cause the overfitting issue of the model, ultimately leading to performance degradation. In this paper, we provide a causal inference framework to show that the correlative features caused by the target object and its co-occurring objects can be regarded as a mediator, which has both positive and negative impacts on model predictions. On the positive side, the mediator enhances the recognition performance of the model by capturing co-occurrence relationships; on the negative side, it has the harmful causal effect that causes the model to make an incorrect prediction for the target object, even when only co-occurring objects are present in an image. To address this problem, we propose a counterfactual reasoning method to measure the total direct effect, achieved by enhancing the direct effect caused only by the target object. Due to the unknown location of the target object, we propose patching-based training and inference to accomplish this goal, which divides an image into multiple patches and identifies the pivot patch that contains the target object. Experimental results on multiple benchmark datasets with diverse configurations validate that the proposed method can achieve state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ming-Kun Xie;Jia-Hao Xiao;Pei Peng;Gang Niu;Masashi Sugiyama;Sheng-Jun Huang", "authorids": "~Ming-Kun_Xie1;~Jia-Hao_Xiao1;~Pei_Peng1;~Gang_Niu1;~Masashi_Sugiyama1;~Sheng-Jun_Huang1", "gender": "M;M;M;M;;M", "homepage": "http://www.xiemk.pro/;https://cs.nuaa.edu.cn/1965/list.htm;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/;http://parnec.nuaa.edu.cn/huangsj;", "dblp": "215/4362;207/7514-5;26/3367-1;35/1228;01/3367.html;238/4029", "google_scholar": "https://scholar.google.co.jp/citations?hl=zh-CN;;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.tw/citations?view_op=list_works", "orcid": ";0009-0006-2658-0594;;0000-0001-6658-6743;0000-0002-7673-5367;", "linkedin": ";;;;;", "or_profile": "~Ming-Kun_Xie1;~Pei_Peng1;~Gang_Niu1;~Masashi_Sugiyama1;~Sheng-Jun_Huang1;~Jiahao_Xiao2", "aff": "Nanjing University of Aeronautics and Astronautics;Nanjing University of Aeronautics and Astronautics;Southeast University;The University of Tokyo;Nanjing University of Aeronautics and Astronautics;Nanjing University of Aeronautics and Astronautics", "aff_domain": "nuaa.edu.cn;nuaa.edu;seu.edu.cn;u-tokyo.ac.jp;nuaa.edu.cn;nuaa.edu.cn", "position": "PhD student;PhD student;Adjunct Full Professor;Full Professor;Full Professor;MS student", "bibtex": "@inproceedings{\nxie2024counterfactual,\ntitle={Counterfactual Reasoning for Multi-Label Image Classification via Patching-Based Training},\nauthor={Ming-Kun Xie and Jia-Hao Xiao and Pei Peng and Gang Niu and Masashi Sugiyama and Sheng-Jun Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lQIN9ZyMLz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 923143, "rating": "", 
"rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17454650979201419448&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "nuaa.edu.cn;nuaa.edu;seu.edu.cn;u-tokyo.ac.jp;nuaa.edu.cn;nuaa.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics;Southeast University;University of Tokyo", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nuaa.edu.cn;https://www.seu.edu.cn/;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "NUAA;SEU;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Japan" }, { "title": "Unsupervised Concept Discovery Mitigates Spurious Correlations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33213", "id": "lQzmDFlsHX", "proceeding": "https://proceedings.mlr.press/v235/arefin24a.html", "pdf": "https://openreview.net/pdf?id=lQzmDFlsHX", "openreview": "https://openreview.net/forum?id=lQzmDFlsHX", "author_site": "Md Rifat Arefin, Yan Zhang, Aristide Baratin, Francesco Locatello, Irina Rish, Dianbo Liu, Kenji Kawaguchi", "tldr": "", "abstract": "Models prone to spurious correlations in training data often produce brittle predictions and introduce unintended biases. Addressing this challenge typically involves methods relying on prior knowledge and group annotation to remove spurious correlations, which may not be readily available in many applications. In this paper, we establish a novel connection between unsupervised object-centric learning and mitigation of spurious correlations. Instead of directly inferring subgroups with varying correlations with labels, our approach focuses on discovering concepts: discrete ideas that are shared across input samples. Leveraging existing object-centric representation learning, we introduce CoBalT: a concept balancing technique that effectively mitigates spurious correlations without requiring human labeling of subgroups. Evaluation across the benchmark datasets for sub-population shifts demonstrate superior or competitive performance compared state-of-the-art baselines, without the need for group annotation. 
Code is available at https://github.com/rarefin/CoBalT", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Md Rifat Arefin;Yan Zhang;Aristide Baratin;Francesco Locatello;Irina Rish;Dianbo Liu;Kenji Kawaguchi", "authorids": "~Md_Rifat_Arefin1;~Yan_Zhang1;~Aristide_Baratin1;~Francesco_Locatello1;~Irina_Rish1;~Dianbo_Liu2;~Kenji_Kawaguchi1", "gender": ";M;;M;F;;", "homepage": ";https://www.cyanogenoid.com;;https://twitter.com/FrancescoLocat8;http://irina-rish.com;;https://ml.comp.nus.edu.sg/#members", "dblp": ";04/3348-67;;195/6074;;;", "google_scholar": ";https://scholar.google.co.uk/citations?user=XtCqbfEAAAAJ;;;Avse5gIAAAAJ;;aLl3rYoAAAAJ", "orcid": ";0000-0003-3470-3663;;;;;", "linkedin": ";;;;irina-rish-8b2162;;", "or_profile": "~Md_Rifat_Arefin1;~Yan_Zhang1;~Aristide_Baratin1;~Francesco_Locatello1;~Irina_Rish1;~Dianbo_Liu2;~Kenji_Kawaguchi1", "aff": ";Mila - Quebec Artificial Intelligence Institute;;Institute of Science and Technology;University of Montreal;;National University of Singapore", "aff_domain": ";mila.quebec;;ist.ac.at;mila.quebec;;nus.edu", "position": ";Industrial Partner;;Assistant Professor;Professor;;Presidential Young Professor", "bibtex": "@inproceedings{\narefin2024unsupervised,\ntitle={Unsupervised Concept Discovery Mitigates Spurious Correlations},\nauthor={Md Rifat Arefin and Yan Zhang and Aristide Baratin and Francesco Locatello and Irina Rish and Dianbo Liu and Kenji Kawaguchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lQzmDFlsHX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9681592, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1645610072277806501&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";mila.quebec;;ist.ac.at;mila.quebec;;nus.edu", "author_num": 7, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Quebec Artificial Intelligence Institute;Institute of Science and Technology;University of Montreal;National University of Singapore", "aff_unique_dep": "Artificial Intelligence;;;", "aff_unique_url": "https://mila.quebec;;https://www.umontreal.ca;https://www.nus.edu.sg", "aff_unique_abbr": "Mila;;UM;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2", "aff_country_unique": "Canada;;Singapore" }, { "title": "Predictive Linear Online Tracking for Unknown Targets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33212", "id": "lT3W4AkyM7", "proceeding": "https://proceedings.mlr.press/v235/tsiamis24a.html", "pdf": "https://openreview.net/pdf?id=lT3W4AkyM7", "openreview": "https://openreview.net/forum?id=lT3W4AkyM7", "author_site": "Anastasios Tsiamis, Aren Karapetyan, Yueshan Li, Efe C. Balta, John Lygeros", "tldr": "", "abstract": "In this paper, we study the problem of online tracking in linear control systems, where the objective is to follow a moving target. Unlike classical tracking control, the target is unknown, non-stationary, and its state is revealed sequentially, thus fitting the framework of online non-stochastic control. We consider the case of quadratic costs and propose a new algorithm, called predictive linear online tracking (PLOT). The algorithm uses recursive least squares with exponential forgetting to learn a time-varying dynamic model of the target. 
The learned model is used in the optimal policy under the framework of receding horizon control. We show that the dynamic regret of PLOT scales with $\\mathcal{O}(\\sqrt{TV_T})$, where $V_T$ is the total variation of the target dynamics and $T$ is the time horizon. Unlike prior work, our theoretical results hold for non-stationary targets. We implement our online control algorithm on a real quadrotor, thus showcasing one of the first successful applications of online control methods on real hardware.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anastasios Tsiamis;Aren Karapetyan;Yueshan Li;Efe C. Balta;John Lygeros", "authorids": "~Anastasios_Tsiamis1;~Aren_Karapetyan1;yuesli@student.ethz.ch;~Efe_C._Balta1;~John_Lygeros1", "gender": ";M;;;M", "homepage": "https://n.ethz.ch/~atsiamis/;http://people.ee.ethz.ch/~akarapetyan/;;https://www.ebalta.me;https://control.ee.ethz.ch/people/profile.john-lygeros.html", "dblp": ";;;204/0958;51/2754", "google_scholar": "CtAqV5cAAAAJ;FHpNIrEAAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-7935-7541;;;0000-0001-8596-8739;0000-0002-6159-1962", "linkedin": ";;;;john-lygeros-662b73233/", "or_profile": "~Anastasios_Tsiamis1;~Aren_Karapetyan1;yuesli@student.ethz.ch;~Efe_C._Balta1;~John_Lygeros1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;;ETHZ - ETH Zurich;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;;ethz.ch;ethz.ch", "position": "Postdoc;PhD student;;Guest Senior Scientist;Full Professor", "bibtex": "@inproceedings{\ntsiamis2024predictive,\ntitle={Predictive Linear Online Tracking for Unknown Targets},\nauthor={Anastasios Tsiamis and Aren Karapetyan and Yueshan Li and Efe C. Balta and John Lygeros},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lT3W4AkyM7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5755405, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8077772393647170503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ethz.ch;ethz.ch;;ethz.ch;ethz.ch", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Generalization to New Sequential Decision Making Tasks with In-Context Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33211", "id": "lVQ4FUZ6dp", "proceeding": "https://proceedings.mlr.press/v235/raparthy24a.html", "pdf": "https://openreview.net/pdf?id=lVQ4FUZ6dp", "openreview": "https://openreview.net/forum?id=lVQ4FUZ6dp", "author_site": "Sharath Chandra Raparthy, Eric Hambro, Robert Kirk, Mikael Henaff, Roberta Raileanu", "tldr": "", "abstract": "Training autonomous agents that can learn new tasks from only a handful of demonstrations is a long-standing problem in machine learning. Recently, transformers have been shown to learn new language or vision tasks without any weight updates from only a few examples, also referred to as in-context learning. 
However, the sequential decision making setting poses additional challenges, as it has a lower tolerance for errors: the environment's stochasticity or the agent's actions can lead to unseen, and sometimes unrecoverable, states. In this paper, we use an illustrative example to show that naively applying transformers to sequential decision making problems does not enable in-context learning of new tasks. We then demonstrate how training on sequences of trajectories with certain distributional properties leads to in-context learning of new sequential decision making tasks. We investigate different design choices and find that larger model and dataset sizes, as well as more task diversity, environment stochasticity, and trajectory burstiness, all result in better in-context learning of new out-of-distribution tasks. By training on large diverse offline datasets, our model is able to learn new MiniHack and Procgen tasks without any weight updates from just a handful of demonstrations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sharath Chandra Raparthy;Eric Hambro;Robert Kirk;Mikael Henaff;Roberta Raileanu", "authorids": "~Sharath_Chandra_Raparthy3;~Eric_Hambro1;~Robert_Kirk1;~Mikael_Henaff1;~Roberta_Raileanu2", "gender": "M;M;M;M;F", "homepage": "https://sharathraparthy.github.io/;https://erichambro.com/;https://robertkirk.github.io;http://www.mikaelhenaff.com;https://rraileanu.github.io/", "dblp": "302/4190;290/1986;01/9684;86/10571;215/5579", "google_scholar": "https://scholar.google.ca/citations?user=S1R0_UMAAAAJ;ehquBPIAAAAJ;https://scholar.google.co.uk/citations?user=PL5KWdYAAAAJ;bX__wkYAAAAJ;9hVXpJ0AAAAJ", "orcid": ";;;;", "linkedin": ";eric-hambro;;;roberta-raileanu-44b25660/", "or_profile": "~Sharath_Chandra_Raparthy3;~Eric_Hambro1;~Robert_Kirk1;~Mikael_Henaff1;~Roberta_Raileanu1", "aff": "Meta Facebook;Anthropic;University College London;Meta;Meta Facebook", "aff_domain": "fb.com;anthropic.com;ucl.ac.uk;meta.com;fb.com", "position": "Researcher;Researcher;PhD student;Researcher;Researcher", "bibtex": "@inproceedings{\nraparthy2024generalization,\ntitle={Generalization to New Sequential Decision Making Tasks with In-Context Learning},\nauthor={Sharath Chandra Raparthy and Eric Hambro and Robert Kirk and Mikael Henaff and Roberta Raileanu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lVQ4FUZ6dp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1747996, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4151609489039840724&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "fb.com;anthropic.com;ucl.ac.uk;meta.com;fb.com", "author_num": 5, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Meta;Anthropic;University College London", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.anthropic.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Meta;Anthropic;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Revisiting Inexact Fixed-Point Iterations for Min-Max Problems: Stochasticity and Structured Nonconvexity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33210", "id": "lWy2lCTyJa", "proceeding": "https://proceedings.mlr.press/v235/alacaoglu24a.html", "pdf": 
"https://openreview.net/pdf?id=lWy2lCTyJa", "openreview": "https://openreview.net/forum?id=lWy2lCTyJa", "author_site": "Ahmet Alacaoglu, Donghwan Kim, Stephen Wright", "tldr": "", "abstract": "We focus on constrained, $L$-smooth, potentially stochastic and nonconvex-nonconcave min-max problems either satisfying $\\rho$-cohypomonotonicity or admitting a solution to the $\\rho$-weakly Minty Variational Inequality (MVI), where larger values of the parameter $\\rho>0$ correspond to a greater degree of nonconvexity. These problem classes include examples in two player reinforcement learning, interaction dominant min-max problems, and certain synthetic test problems on which classical min-max algorithms fail. It has been conjectured that first-order methods can tolerate a value of $\\rho$ no larger than $\\frac{1}{L}$, but existing results in the literature have stagnated at the tighter requirement $\\rho < \\frac{1}{2L}$. With a simple argument, we obtain optimal or best-known complexity guarantees with cohypomonotonicity or weak MVI conditions for $\\rho < \\frac{1}{L}$. First main insight for the improvements in the convergence analyses is to harness the recently proposed *conic nonexpansiveness* property of operators. Second, we provide a refined analysis for inexact Halpern iteration that relaxes the required inexactness level to improve some state-of-the-art complexity results even for constrained stochastic convex-concave min-max problems. Third, we analyze a stochastic inexact Krasnosel'skii-Mann iteration with a multilevel Monte Carlo estimator when the assumptions only hold with respect to a solution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ahmet Alacaoglu;Donghwan Kim;Stephen Wright", "authorids": "~Ahmet_Alacaoglu2;~Donghwan_Kim2;~Stephen_Wright1", "gender": ";M;M", "homepage": "https://ahmetalacaoglu.github.io;http://mathsci.kaist.ac.kr/~donghwankim/;https://wrightstephen.github.io/sw_proj/", "dblp": "209/4889;05/1032;75/2677", "google_scholar": "-yRi8D4AAAAJ;https://scholar.google.com/citations?hl=en;VFQRIOwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ahmet_Alacaoglu2;~Donghwan_Kim2;~Stephen_Wright1", "aff": "University of Wisconsin-Madison;Korea Advanced Institute of Science & Technology;University of Wisconsin, Madison", "aff_domain": "wisc.edu;kaist.ac.kr;wisc.edu", "position": "Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nalacaoglu2024revisiting,\ntitle={Revisiting Inexact Fixed-Point Iterations for Min-Max Problems: Stochasticity and Structured Nonconvexity},\nauthor={Ahmet Alacaoglu and Donghwan Kim and Stephen Wright},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lWy2lCTyJa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 590297, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2084563396826317022&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "wisc.edu;kaist.ac.kr;wisc.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Wisconsin-Madison;Korea Advanced Institute of Science and Technology;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.kaist.ac.kr;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;KAIST;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": 
"0;1;0", "aff_country_unique": "United States;South Korea" }, { "title": "Meta-Reinforcement Learning Robust to Distributional Shift Via Performing Lifelong In-Context Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33209", "id": "laIOUtstMs", "proceeding": "https://proceedings.mlr.press/v235/xu24o.html", "pdf": "https://openreview.net/pdf?id=laIOUtstMs", "openreview": "https://openreview.net/forum?id=laIOUtstMs", "author_site": "TengYe Xu, Zihao Li, Qinyuan Ren", "tldr": "", "abstract": "A key challenge in Meta-Reinforcement Learning (meta-RL) is the task distribution shift, since the generalization ability of most current meta-RL methods is limited to tasks sampled from the training distribution. In this paper, we propose Posterior Sampling Bayesian Lifelong In-Context Reinforcement Learning (PSBL), which is robust to task distribution shift. PSBL meta-trains a variant of transformer to directly perform amortized inference about the Predictive Posterior Distribution (PPD) of the optimal policy. Once trained, the network can infer the PPD online with frozen parameters. The agent then samples actions from the approximate PPD to perform online exploration, which progressively reduces uncertainty and enhances performance in the interaction with the environment. This property is known as in-context learning. Experimental results demonstrate that PSBL significantly outperforms standard Meta RL methods both in tasks with sparse rewards and dense rewards when the test task distribution is strictly shifted from the training distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tengye Xu;Zihao Li;Qinyuan Ren", "authorids": "~Tengye_Xu1;~Zihao_Li7;~Qinyuan_Ren1", "gender": "M;M;F", "homepage": "https://github.com/Jeong-zju;https://person.zju.edu.cn/0008668;https://github.com/xu-ye", "dblp": "175/8858.html;43/3186.html;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zihao_Li7;~Qinyuan_Ren1;~Tanya-xu1", "aff": "Zhejiang University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;zju.edu.cn", "position": "MS student;Full Professor;MS student", "bibtex": "@inproceedings{\nxu2024metareinforcement,\ntitle={Meta-Reinforcement Learning Robust to Distributional Shift Via Performing Lifelong In-Context Learning},\nauthor={Tengye Xu and Zihao Li and Qinyuan Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=laIOUtstMs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 816302, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4172077202347992528&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "zju.edu.cn;zju.edu.cn;zju.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Zhejiang University", "aff_unique_dep": "", "aff_unique_url": "https://www.zju.edu.cn", "aff_unique_abbr": "ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "DMTG: One-Shot Differentiable Multi-Task Grouping", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33208", "id": "lcX5GbDIi8", "proceeding": "https://proceedings.mlr.press/v235/gao24h.html", "pdf": "https://openreview.net/pdf?id=lcX5GbDIi8", "openreview": "https://openreview.net/forum?id=lcX5GbDIi8", 
"author_site": "Yuan Gao, Shuguo Jiang, Moran Li, Jin-Gang Yu, Gui-Song Xia", "tldr": "", "abstract": "We aim to address Multi-Task Learning (MTL) with a large number of tasks by Multi-Task Grouping (MTG). Given $N$ tasks, we propose to **simultaneously identify the best task groups from $2^N$ candidates and train the model weights simultaneously in one-shot**, with **the high-order task-affinity fully exploited**. This is distinct from the pioneering methods which sequentially identify the groups and train the model weights, where the group identification often relies on heuristics. As a result, our method not only improves the training efficiency, but also mitigates the objective bias introduced by the sequential procedures that potentially leads to a suboptimal solution. Specifically, **we formulate MTG as a fully differentiable pruning problem on an adaptive network architecture determined by an unknown Categorical distribution**. To categorize $N$ tasks into $K$ groups (represented by $K$ encoder branches), we initially set up $KN$ task heads, where each branch connects to all $N$ task heads to exploit the high-order task-affinity. Then, we gradually prune the $KN$ heads down to $N$ by learning a relaxed differentiable Categorical distribution, ensuring that each task is exclusively and uniquely categorized into only one branch. Extensive experiments on CelebA and Taskonomy datasets with detailed ablations show the promising performance and efficiency of our method. The codes are available at https://github.com/ethanygao/DMTG.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuan Gao;Shuguo Jiang;Moran Li;Jin-Gang Yu;Gui-Song Xia", "authorids": "~Yuan_Gao4;~Shuguo_Jiang1;~Moran_Li1;~Jin-Gang_Yu1;~Gui-Song_Xia3", "gender": ";M;F;M;", "homepage": ";;;https://yanzhao.scut.edu.cn/open/ExpertInfo.aspx?zjbh=goi4tp1KbAaC80CVL590Bg==;", "dblp": ";;281/7337;https://dblp.uni-trier.de/pers/hd/y/Yu:Jin=Gang;", "google_scholar": ";https://scholar.google.com/citations?view_op=list_works;8pvT01UAAAAJ;uSPkrNcAAAAJ;", "orcid": ";;0000-0002-8149-1117;0000-0003-2148-2726;", "linkedin": ";;;;", "or_profile": "~Yuan_Gao4;~Shuguo_Jiang1;~Moran_Li1;~Jin-Gang_Yu1;~Gui-Song_Xia3", "aff": ";Wuhan University;Tencent Youtu Lab;South China University of Technology;", "aff_domain": ";whu.edu.cn;tencent.com;scut.edu.cn;", "position": ";PhD student;Researcher;Associate Professor;", "bibtex": "@inproceedings{\ngao2024dmtg,\ntitle={{DMTG}: One-Shot Differentiable Multi-Task Grouping},\nauthor={Yuan Gao and Shuguo Jiang and Moran Li and Jin-Gang Yu and Gui-Song Xia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lcX5GbDIi8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1620629, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16363240089206962536&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": ";whu.edu.cn;tencent.com;scut.edu.cn;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Wuhan University;Tencent;South China University of Technology", "aff_unique_dep": ";Youtu Lab;", "aff_unique_url": "http://www.whu.edu.cn/;https://www.tencent.com;https://www.scut.edu.cn", "aff_unique_abbr": "WHU;Tencent;SCUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On the Complexity of 
Finite-Sum Smooth Optimization under the Polyak\u2013\u0141ojasiewicz Condition", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33207", "id": "leJGQCron2", "proceeding": "https://proceedings.mlr.press/v235/bai24c.html", "pdf": "https://openreview.net/pdf?id=leJGQCron2", "openreview": "https://openreview.net/forum?id=leJGQCron2", "author_site": "Yunyan Bai, Yuxing Liu, Luo Luo", "tldr": "", "abstract": "This paper considers the optimization problem of the form $\\min_{{\\bf x}\\in{\\mathbb R}^d} f({\\bf x})\\triangleq \\frac{1}{n}\\sum_{i=1}^n f_i({\\bf x})$, where $f(\\cdot)$ satisfies the Polyak\u2013\u0141ojasiewicz (PL) condition with parameter $\\mu$ and $\\{f_i(\\cdot)\\}_{i=1}^n$ is $L$-mean-squared smooth. We show that any gradient method requires at least $\\Omega(n+\\kappa\\sqrt{n}\\log(1/\\epsilon))$ incremental first-order oracle (IFO) calls to find an $\\epsilon$-suboptimal solution, where $\\kappa\\triangleq L/\\mu$ is the condition number of the problem. This result nearly matches upper bounds of IFO complexity for best-known first-order methods. We also study the problem of minimizing the PL function in the distributed setting such that the individuals $f_1(\\cdot),\\dots,f_n(\\cdot)$ are located on a connected network of $n$ agents. We provide lower bounds of $\\Omega(\\kappa/\\sqrt{\\gamma}\\log(1/\\epsilon))$, $\\Omega((\\kappa+\\tau\\kappa/\\sqrt{\\gamma})\\log(1/\\epsilon))$ and $\\Omega\\big(n+\\kappa\\sqrt{n}\\log(1/\\epsilon)\\big)$ for communication rounds, time cost and local first-order oracle calls respectively, where $\\gamma\\in(0,1]$ is the spectral gap of the mixing matrix associated with the network and $\\tau>0$ is the time cost per communication round. Furthermore, we propose a decentralized first-order method that nearly matches the above lower bounds in expectation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunyan Bai;Yuxing Liu;Luo Luo", "authorids": "~Yunyan_Bai1;~Yuxing_Liu1;~Luo_Luo1", "gender": "F;M;M", "homepage": "https://weibo.com/u/7364596711;https://infinity-stars.github.io/;https://luoluo-sds.github.io/", "dblp": ";11/8650;https://dblp.org/pers/hd/l/Luo:Luo", "google_scholar": ";ENZKdAUAAAAJ;NggI9EsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yunyan_Bai1;~Yuxing_Liu1;~Luo_Luo1", "aff": "Fudan University;Fudan University;Fudan University", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "MS student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nbai2024on,\ntitle={On the Complexity of Finite-Sum Smooth Optimization under the Polyak{\\textendash}{\\L}ojasiewicz Condition},\nauthor={Yunyan Bai and Yuxing Liu and Luo Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=leJGQCron2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 669133, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13234795417139906272&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Mean-field 
Chaos Diffusion Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33206", "id": "lgcFX4VFrM", "proceeding": "https://proceedings.mlr.press/v235/park24f.html", "pdf": "https://openreview.net/pdf?id=lgcFX4VFrM", "openreview": "https://openreview.net/forum?id=lgcFX4VFrM", "author_site": "Sungwoo Park, Dongjun Kim, Ahmed Alaa", "tldr": "", "abstract": "In this paper, we introduce a new class of score-based generative models (SGMs) designed to handle high-cardinality data distributions by leveraging concepts from mean-field theory. We present mean-field chaos diffusion models (MF-CDMs), which address the curse of dimensionality inherent in high-cardinality data by utilizing the propagation of chaos property of interacting particles. By treating high-cardinality data as a large stochastic system of interacting particles, we develop a novel score-matching method for infinite-dimensional chaotic particle systems and propose an approximation scheme that employs a subdivision strategy for efficient training. Our theoretical and empirical results demonstrate the scalability and effectiveness of MF-CDMs for managing large high-cardinality data structures, such as 3D point clouds.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sungwoo Park;Dongjun Kim;Ahmed Alaa", "authorids": "~Sungwoo_Park3;~Dongjun_Kim1;~Ahmed_Alaa1", "gender": "M;M;M", "homepage": "https://sites.google.com/view/dongjun-kim?pli=1;https://alaalab.berkeley.edu/;", "dblp": "03/4394;140/7324;92/6585", "google_scholar": "https://scholar.google.com/citations?hl=ko;https://scholar.google.com.eg/citations?user=_pv1sEcAAAAJ;B1xpjO8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Dongjun_Kim1;~Ahmed_Alaa1;~Sung_Woo_Park2", "aff": "Sony AI;University of California, Berkeley;University of California, Berkeley", "aff_domain": "sony.com;berkeley.edu;berkeley.edu", "position": "Intern;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\npark2024meanfield,\ntitle={Mean-field Chaos Diffusion Models},\nauthor={Sungwoo Park and Dongjun Kim and Ahmed Alaa},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lgcFX4VFrM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2277776, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Q5bnr-yNuMQJ:scholar.google.com/&scioq=Mean-field+Chaos+Diffusion+Models&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "sony.com;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Sony;University of California, Berkeley", "aff_unique_dep": "Sony AI;", "aff_unique_url": "https://www.sony.com;https://www.berkeley.edu", "aff_unique_abbr": "Sony AI;UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Japan;United States" }, { "title": "Spike Distance Function as a Learning Objective for Spike Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33205", "id": "limyQ1Kk0k", "proceeding": "https://proceedings.mlr.press/v235/doran24a.html", "pdf": "https://openreview.net/pdf?id=limyQ1Kk0k", "openreview": "https://openreview.net/forum?id=limyQ1Kk0k", "author_site": "Kevin Doran, Marvin Seifert, Carola Yovanovich, Tom Baden", "tldr": "", "abstract": "Approaches to predicting neuronal spike 
responses commonly use a Poisson learning objective. This objective quantizes responses into spike counts within a fixed summation interval, typically on the order of 10 to 100 milliseconds in duration; however, neuronal responses are often time-accurate down to a few milliseconds, and Poisson models struggle to precisely model them at these timescales. We propose the concept of a spike distance function that maps points in time to the temporal distance to the nearest spike. We show that neural networks can be trained to approximate spike distance functions, and we present an efficient algorithm for inferring spike trains from the outputs of these models. Using recordings of chicken and frog retinal ganglion cells responding to visual stimuli, we compare the performance of our approach to that of Poisson models trained with various summation intervals. We show that our approach outperforms the use of Poisson models at spike train inference.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kevin Doran;Marvin Seifert;Carola A. M. Yovanovich;Tom Baden", "authorids": "~Kevin_Doran1;m.seifert@sussex.ac.uk;c.a.m.yovanovich@sussex.ac.uk;t.baden@sussex.ac.uk", "gender": "M;;;", "homepage": "https://blog.kdoran.com;;;", "dblp": "384/4274;;;", "google_scholar": ";;;", "orcid": "0009-0009-3748-6941;;;", "linkedin": ";;;", "or_profile": "~Kevin_Doran1;m.seifert@sussex.ac.uk;c.a.m.yovanovich@sussex.ac.uk;t.baden@sussex.ac.uk", "aff": "University of Sussex;;;", "aff_domain": "sussex.ac.uk;;;", "position": "PhD student;;;", "bibtex": "@inproceedings{\ndoran2024spike,\ntitle={Spike Distance Function as a Learning Objective for Spike Prediction},\nauthor={Kevin Doran and Marvin Seifert and Carola A. M. Yovanovich and Tom Baden},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=limyQ1Kk0k}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2767261, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vF-qq31JhUoJ:scholar.google.com/&scioq=Spike+Distance+Function+as+a+Learning+Objective+for+Spike+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "sussex.ac.uk;;;", "author_num": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Sussex", "aff_unique_dep": "", "aff_unique_url": "https://www.sussex.ac.uk", "aff_unique_abbr": "Sussex", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Detecting Influence Structures in Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33204", "id": "lm04PyXoEl", "proceeding": "https://proceedings.mlr.press/v235/pieroth24a.html", "pdf": "https://openreview.net/pdf?id=lm04PyXoEl", "openreview": "https://openreview.net/forum?id=lm04PyXoEl", "author_site": "Fabian Raoul Pieroth, Katherine Fitch, Lenz Belzner", "tldr": "", "abstract": "We consider the problem of quantifying the amount of influence one agent can exert on another in the setting of multi-agent reinforcement learning (MARL). As a step towards a unified approach to express agents' interdependencies, we introduce the total and state influence measurement functions. Both of these are valid for all common MARL systems, such as the discounted reward setting. 
Additionally, we propose novel quantities, called the total impact measurement (TIM) and state impact measurement (SIM), that characterize one agent's influence on another by the maximum impact it can have on the other agents' expected returns and represent instances of impact measurement functions in the average reward setting. Furthermore, we provide approximation algorithms for TIM and SIM that simultaneously learn approximations of agents' expected returns, together with error bounds, stability analyses under changes of the policies, and convergence guarantees. The approximation algorithm relies only on observing other agents' actions and is otherwise fully decentralized. Through empirical studies, we validate our approach's effectiveness in identifying intricate influence structures in complex interactions. Our work appears to be the first study of determining influence structures in the multi-agent average reward setting with convergence guarantees.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fabian Raoul Pieroth;Katherine Fitch;Lenz Belzner", "authorids": "~Fabian_Raoul_Pieroth1;~Katherine_Fitch1;~Lenz_Belzner1", "gender": "M;;", "homepage": "https://www.cs.cit.tum.de/en/dss/members/fabian-pieroth/;;", "dblp": "334/1024.html;;136/1485", "google_scholar": ";https://scholar.google.de/citations?user=ncwIaWgAAAAJ;", "orcid": "0000-0002-5712-1706;;", "linkedin": ";;", "or_profile": "~Fabian_Raoul_Pieroth1;~Katherine_Fitch1;~Lenz_Belzner1", "aff": "Technische Universit\u00e4t M\u00fcnchen;LARALAB;Technische Hochschule Ingolstadt", "aff_domain": "tum.de;laralab.de;thi.de", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\npieroth2024detecting,\ntitle={Detecting Influence Structures in Multi-Agent Reinforcement Learning},\nauthor={Fabian Raoul Pieroth and Katherine Fitch and Lenz Belzner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lm04PyXoEl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5010469, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4adKMcO4tj4J:scholar.google.com/&scioq=Detecting+Influence+Structures+in+Multi-Agent+Reinforcement+Learning&hl=en&as_sdt=0,48", "gs_version_total": 5, "email": "tum.de;laralab.de;thi.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;LARALAB;Technische Hochschule Ingolstadt", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;;https://www.thi.de", "aff_unique_abbr": "TUM;;THI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany;" }, { "title": "Learning Modality Knowledge Alignment for Cross-Modality Transfer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33203", "id": "lmiurzioja", "proceeding": "https://proceedings.mlr.press/v235/ma24d.html", "pdf": "https://openreview.net/pdf?id=lmiurzioja", "openreview": "https://openreview.net/forum?id=lmiurzioja", "author_site": "Wenxuan Ma, Shuang Li, Lincan Cai, Jingxuan Kang", "tldr": "", "abstract": "Cross-modality transfer aims to leverage large pretrained models to complete tasks that may not belong to the modality of pretraining data. 
Existing works have achieved some success in extending classical finetuning to cross-modal scenarios, yet we still lack a clear understanding of how the modality gap influences transfer. In this work, we conduct a series of experiments focusing on the source representation quality during transfer, revealing a connection between a larger modality gap and less knowledge reuse, i.e., ineffective transfer. We then formalize the gap as the knowledge misalignment between modalities using the conditional distribution $P(Y|X)$. To address this problem, we present **Mo**dality k**N**owledge **A**lignment (MoNA), a meta-learning approach that learns target data transformation to reduce the modality knowledge discrepancy ahead of the transfer. Experiments show that the approach significantly improves upon cross-modal finetuning methods, and most importantly leads to better reuse of source modality knowledge.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenxuan Ma;Shuang Li;Lincan Cai;Jingxuan Kang", "authorids": "~Wenxuan_Ma2;~Shuang_Li6;~Lincan_Cai1;~Jingxuan_Kang1", "gender": "M;M;;", "homepage": ";https://shuangli.xyz;https://github.com/cailincan0129;", "dblp": "289/0784-1;43/6294-8;;", "google_scholar": "u7aJOt8AAAAJ;VXCiAc4AAAAJ;wH-dNbAAAAAJ;", "orcid": "0000-0001-5402-6028;0000-0001-6807-9905;;", "linkedin": ";;;", "or_profile": "~Wenxuan_Ma2;~Shuang_Li6;~Lincan_Cai1;~Jingxuan_Kang1", "aff": "Beijing Institute of Technology;Beijing Institute of Technology;Beijing Institute of Technology;", "aff_domain": "bit.edu.cn;bit.edu.cn;bit.edu.cn;", "position": "MS student;Associate Professor;MS student;", "bibtex": "@inproceedings{\nma2024learning,\ntitle={Learning Modality Knowledge Alignment for Cross-Modality Transfer},\nauthor={Wenxuan Ma and Shuang Li and Lincan Cai and Jingxuan Kang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lmiurzioja}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1364860, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5675078376681110002&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "bit.edu.cn;bit.edu.cn;bit.edu.cn;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Beijing Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.bit.edu.cn/", "aff_unique_abbr": "BIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Density-Softmax: Efficient Test-time Model for Uncertainty Estimation and Robustness under Distribution Shifts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33202", "id": "lon750Kf7n", "proceeding": "https://proceedings.mlr.press/v235/bui24a.html", "pdf": "https://openreview.net/pdf?id=lon750Kf7n", "openreview": "https://openreview.net/forum?id=lon750Kf7n", "author_site": "Ha Manh Bui, Anqi Liu", "tldr": "", "abstract": "Sampling-based methods, e.g., Deep Ensembles and Bayesian Neural Nets, have become promising approaches to improve the quality of uncertainty estimation and robust generalization. However, they suffer from a large model size and high latency at test time, which limits the scalability needed for low-resource devices and real-time applications.
To resolve these computational issues, we propose Density-Softmax, a sampling-free deterministic framework via combining a density function built on a Lipschitz-constrained feature extractor with the softmax layer. Theoretically, we show that our model is the solution of minimax uncertainty risk and is distance-aware on feature space, thus reducing the over-confidence of the standard softmax under distribution shifts. Empirically, our method enjoys competitive results with state-of-the-art techniques in terms of uncertainty and robustness, while having a lower number of model parameters and a lower latency at test time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ha Manh Bui;Anqi Liu", "authorids": "~Ha_Manh_Bui1;~Anqi_Liu2", "gender": ";F", "homepage": ";https://anqiliu-ai.github.io/", "dblp": ";", "google_scholar": ";Q8yp6zQAAAAJ", "orcid": ";0000-0002-0468-5698", "linkedin": ";", "or_profile": "~Ha_Manh_Bui1;~Anqi_Liu2", "aff": ";University of Illinois, Chicago", "aff_domain": ";uic.edu", "position": ";PhD student", "bibtex": "@inproceedings{\nbui2024densitysoftmax,\ntitle={Density-Softmax: Efficient Test-time Model for Uncertainty Estimation and Robustness under Distribution Shifts},\nauthor={Ha Manh Bui and Anqi Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lon750Kf7n}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4421408, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=717653009592370250&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";uic.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Illinois at Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uic.edu", "aff_unique_abbr": "UIC", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "TERD: A Unified Framework for Safeguarding Diffusion Models Against Backdoors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33201", "id": "lpHjmPvxW1", "proceeding": "https://proceedings.mlr.press/v235/mo24a.html", "pdf": "https://openreview.net/pdf?id=lpHjmPvxW1", "openreview": "https://openreview.net/forum?id=lpHjmPvxW1", "author_site": "Yichuan Mo, Hui Huang, Mingjie Li, Ang Li, Yisen Wang", "tldr": "", "abstract": "Diffusion models have achieved notable success in image generation, but they remain highly vulnerable to backdoor attacks, which compromise their integrity by producing specific undesirable outputs when presented with a pre-defined trigger. In this paper, we investigate how to protect diffusion models from this dangerous threat. Specifically, we propose **TERD**, a backdoor defense framework that builds unified modeling for current attacks, which enables us to derive an accessible reversed loss. A trigger reversion strategy is further employed: an initial approximation of the trigger through noise sampled from a prior distribution, followed by refinement through differential multi-step samplers. Additionally, with the reversed trigger, we propose backdoor detection from the noise space, introducing the first backdoor input detection approach for diffusion models and a novel model detection algorithm that calculates the KL divergence between reversed and benign distributions. 
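The model-detection test just described in the TERD entry can be sketched as a divergence check between noise distributions. A minimal sketch, assuming diagonal-Gaussian fits, stand-in samples, and an illustrative detection threshold:

```python
import numpy as np

def gaussian_kl(mu0, var0, mu1, var1):
    """KL( N(mu0, var0) || N(mu1, var1) ) for diagonal covariances."""
    return 0.5 * np.sum(np.log(var1 / var0) + (var0 + (mu0 - mu1) ** 2) / var1 - 1.0)

# Fit Gaussians to the reversed noise (from trigger reversion) and to the
# benign prior, then flag the model if the divergence is large.
reversed_noise = np.random.randn(1000, 16) + 0.8   # stand-in for reversed samples
benign_noise = np.random.randn(1000, 16)           # benign prior samples
kl = gaussian_kl(reversed_noise.mean(0), reversed_noise.var(0),
                 benign_noise.mean(0), benign_noise.var(0))
is_backdoored = kl > 1.0                           # illustrative threshold
```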
Extensive evaluations demonstrate that TERD secures a 100% True Positive Rate (TPR) and True Negative Rate (TNR) across datasets of varying resolutions. TERD also demonstrates nice adaptability to other Stochastic Differential Equation (SDE)-based models. Our code is available at https://github.com/PKU-ML/TERD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yichuan Mo;Hui Huang;Mingjie Li;Ang Li;Yisen Wang", "authorids": "~Yichuan_Mo1;~Hui_Huang6;~Mingjie_Li1;~Ang_Li20;~Yisen_Wang1", "gender": "M;F;M;M;M", "homepage": "https://www.linkedin.com/in/%E6%98%93%E5%B7%9D-%E8%8E%AB-446841212/;;https://mingjieli0111.github.io/;https://github.com/Charles20021201;https://yisenwang.github.io/", "dblp": "321/6790;;;;172/1346-1", "google_scholar": "xvSYG1gAAAAJ;;;;uMWPDboAAAAJ", "orcid": ";;0000-0002-1588-2654;;", "linkedin": ";\u835f-\u9ec4-0b33a52b0/;;;", "or_profile": "~Yichuan_Mo1;~Hui_Huang6;~Mingjie_Li1;~Ang_Li20;~Yisen_Wang1", "aff": "Peking University;Peking University;CISPA Helmholtz Center for Information Security;Peking University;Peking University", "aff_domain": "stu.pku.edu.cn;stu.pku.edu.cn;cispa.de;pku.edu.cn;pku.edu.cn", "position": "PhD student;Undergrad student;Postdoc;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nmo2024terd,\ntitle={{TERD}: A Unified Framework for Safeguarding Diffusion Models Against Backdoors},\nauthor={Yichuan Mo and Hui Huang and Mingjie Li and Ang Li and Yisen Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lpHjmPvxW1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5903805, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14889553547558109813&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "stu.pku.edu.cn;stu.pku.edu.cn;cispa.de;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Peking University;CISPA Helmholtz Center for Information Security", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.cispa.de/", "aff_unique_abbr": "Peking U;CISPA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;Germany" }, { "title": "SMaRt: Improving GANs with Score Matching Regularity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33200", "id": "lqeVCc9zYq", "proceeding": "https://proceedings.mlr.press/v235/xia24d.html", "pdf": "https://openreview.net/pdf?id=lqeVCc9zYq", "openreview": "https://openreview.net/forum?id=lqeVCc9zYq", "author_site": "Mengfei Xia, Yujun Shen, Ceyuan Yang, Ran Yi, Wenping Wang, Yong-Jin Liu", "tldr": "", "abstract": "Generative adversarial networks (GANs) usually struggle in learning from highly diverse data, whose underlying manifold is complex. In this work, we revisit the mathematical foundations of GANs, and theoretically reveal that the native adversarial loss for GAN training is insufficient to fix the problem of $\\textit{subsets with positive Lebesgue measure of the generated data manifold lying out of the real data manifold}$. Instead, we find that score matching serves as a promising solution to this issue thanks to its capability of persistently pushing the generated data points towards the real data manifold. 
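The claim just made in the SMaRt entry, that a score function persistently pushes generated points toward the real data manifold, can be illustrated in a few lines, echoing the ground-truth-score toy example the abstract mentions. The Gaussian target, step size, and iteration count below are arbitrary choices, and in SMaRt the score term enters the generator loss rather than moving samples directly.

```python
import numpy as np

# Ground-truth score of a 1-D Gaussian "real" distribution N(mu, sigma^2):
# grad_x log p(x) = -(x - mu) / sigma^2.
mu, sigma = 2.0, 0.5
score = lambda x: -(x - mu) / sigma**2

fake = np.random.randn(1000) * 3.0 - 4.0   # off-manifold "generated" samples
for _ in range(100):
    fake = fake + 0.01 * score(fake)       # ascend log-density of the real data
print(fake.mean(), fake.std())             # drifts toward mu, spread shrinks
```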
We thereby propose to improve the optimization of GANs with score matching regularity (SMaRt). Regarding empirical evidence, we first design a toy example to show that training GANs with the aid of a ground-truth score function can help reproduce the real data distribution more accurately, and then confirm that our approach can consistently boost the synthesis performance of various state-of-the-art GANs on real-world datasets with pre-trained diffusion models acting as the approximate score function. For instance, when training Aurora on the ImageNet $64\\times64$ dataset, we manage to improve FID from 8.87 to 7.11, on par with the performance of the one-step consistency model. Code is available at https://github.com/thuxmf/SMaRt.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengfei Xia;Yujun Shen;Ceyuan Yang;Ran Yi;Wenping Wang;Yong-jin Liu", "authorids": "~Mengfei_Xia1;~Yujun_Shen1;~Ceyuan_Yang2;~Ran_Yi1;~Wenping_Wang1;~Yong-jin_Liu1", "gender": "M;;M;F;M;M", "homepage": "https://thuxmf.github.io/;;https://ceyuan.me/;https://yiranran.github.io/;https://engineering.tamu.edu/cse/profiles/Wang-Wenping.html;https://cg.cs.tsinghua.edu.cn/people/~Yongjin/Yongjin.htm", "dblp": "301/3569;;218/2676;136/5469;;27/2098", "google_scholar": "jmOlxQ0AAAAJ;;Rfj4jWoAAAAJ;https://scholar.google.com.hk/citations?user=y68DLo4AAAAJ;28shvv0AAAAJ;https://scholar.google.com.tw/citations?user=GNDtwWQAAAAJ", "orcid": ";;;0000-0003-1858-3358;0000-0002-2284-3952;0000-0001-5774-1916", "linkedin": ";;;;;", "or_profile": "~Mengfei_Xia1;~Yujun_Shen1;~Ceyuan_Yang2;~Ran_Yi1;~Wenping_Wang1;~Yong-jin_Liu1", "aff": "Tsinghua University;;ByteDance Inc.;Shanghai Jiaotong University;Texas A&M University - College Station;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;bytedance.com;sjtu.edu.cn;tamu.edu;tsinghua.edu.cn", "position": "PhD student;;Researcher;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nxia2024smart,\ntitle={{SM}aRt: Improving {GAN}s with Score Matching Regularity},\nauthor={Mengfei Xia and Yujun Shen and Ceyuan Yang and Ran Yi and Wenping Wang and Yong-jin Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lqeVCc9zYq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5586566, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3445825351360006235&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;;bytedance.com;sjtu.edu.cn;tamu.edu;tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Tsinghua University;ByteDance;Shanghai Jiao Tong University;Texas A&M University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.bytedance.com;https://www.sjtu.edu.cn;https://www.tamu.edu", "aff_unique_abbr": "THU;ByteDance;SJTU;TAMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Federated Combinatorial Multi-Agent Multi-Armed Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33199", "id": "lrFwPeDdEQ", "proceeding": "https://proceedings.mlr.press/v235/fourati24b.html", "pdf": "https://openreview.net/pdf?id=lrFwPeDdEQ", "openreview": "https://openreview.net/forum?id=lrFwPeDdEQ", "author_site": "Fares
Fourati, Mohamed-Slim Alouini, Vaneet Aggarwal", "tldr": "", "abstract": "This paper introduces a federated learning framework tailored for online combinatorial optimization with bandit feedback. In this setting, agents select subsets of arms, observe noisy rewards for these subsets without accessing individual arm information, and can cooperate and share information at specific intervals. Our framework transforms any offline resilient single-agent $(\\alpha-\\epsilon)$-approximation algorithm\u2014having a complexity of $\\tilde{\\mathcal{O}}\\left(\\frac{\\psi}{\\epsilon^\\beta}\\right)$, where the logarithm is omitted, for some function $\\psi$ and constant $\\beta$\u2014into an online multi-agent algorithm with $m$ communicating agents and an $\\alpha$-regret of no more than $\\tilde{\\mathcal{O}}\\left(m^{-\\frac{1}{3+\\beta}} \\psi^\\frac{1}{3+\\beta} T^\\frac{2+\\beta}{3+\\beta}\\right)$. Our approach not only eliminates the $\\epsilon$ approximation error but also ensures sublinear growth with respect to the time horizon $T$ and demonstrates a linear speedup with an increasing number of communicating agents. Additionally, the algorithm is notably communication-efficient, requiring only a sublinear number of communication rounds, quantified as $\\tilde{\\mathcal{O}}\\left(\\psi T^\\frac{\\beta}{\\beta+1}\\right)$. Furthermore, the framework has been successfully applied to online stochastic submodular maximization using various offline algorithms, yielding the first results for both single-agent and multi-agent settings and recovering specialized single-agent theoretical guarantees. We empirically validate our approach on a stochastic data summarization problem, illustrating the effectiveness of the proposed framework, even in single-agent scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fares Fourati;Mohamed-Slim Alouini;Vaneet Aggarwal", "authorids": "~Fares_Fourati1;~Mohamed-Slim_Alouini1;~Vaneet_Aggarwal1", "gender": "M;M;M", "homepage": "https://fouratifares.github.io/website/;https://cemse.kaust.edu.sa/ctl/people/person/mohamed-slim-alouini;", "dblp": "275/3371;64/6304;91/6560", "google_scholar": "FAmOUOIAAAAJ;;", "orcid": "0000-0002-6913-7035;;", "linkedin": "fares-fourati-96641914a/?originalSubdomain=tn;;", "or_profile": "~Fares_Fourati1;~Mohamed-Slim_Alouini1;~Vaneet_Aggarwal1", "aff": "King Abdullah University of Science and Technology;King Abdullah University of Science and Technology;Purdue University", "aff_domain": "kaust.edu.sa;kaust.edu.sa;purdue.edu", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfourati2024federated,\ntitle={Federated Combinatorial Multi-Agent Multi-Armed Bandits},\nauthor={Fares Fourati and Mohamed-Slim Alouini and Vaneet Aggarwal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lrFwPeDdEQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 845044, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3772182266327760618&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "kaust.edu.sa;kaust.edu.sa;purdue.edu", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "King Abdullah University of Science and Technology;Purdue University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaust.edu.sa;https://www.purdue.edu", "aff_unique_abbr": "KAUST;Purdue",
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Saudi Arabia;United States" }, { "title": "E$^2$GAN: Efficient Training of Efficient GANs for Image-to-Image Translation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33198", "id": "lrPrkWXqzd", "proceeding": "https://proceedings.mlr.press/v235/gong24g.html", "pdf": "https://openreview.net/pdf?id=lrPrkWXqzd", "openreview": "https://openreview.net/forum?id=lrPrkWXqzd", "author_site": "Yifan Gong, Zheng Zhan, Qing Jin, Yanyu Li, Yerlan Idelbayev, Xian Liu, Andrey Zharkov, Kfir Aberman, Sergey Tulyakov, Yanzhi Wang, Jian Ren", "tldr": "", "abstract": "One highly promising direction for enabling flexible real-time on-device image editing is utilizing data distillation by leveraging large-scale text-to-image diffusion models to generate paired datasets used for training generative adversarial networks (GANs). This approach notably alleviates the stringent requirements typically imposed by high-end commercial GPUs for performing image editing with diffusion models. However, unlike text-to-image diffusion models, each distilled GAN is specialized for a specific image editing task, necessitating costly training efforts to obtain models for various concepts. In this work, we introduce and address a novel research direction: can the process of distilling GANs from diffusion models be made significantly more efficient? To achieve this goal, we propose a series of innovative techniques. First, we construct a base GAN model with generalized features, adaptable to different concepts through fine-tuning, eliminating the need for training from scratch. Second, we identify crucial layers within the base GAN model and employ Low-Rank Adaptation (LoRA) with a simple yet effective rank search process, rather than fine-tuning the entire base model. Third, we investigate the minimal amount of data necessary for fine-tuning, further reducing the overall training time. 
Extensive experiments show that we can efficiently empower GANs with the ability to perform real-time high-quality image editing on mobile devices with remarkably reduced training and storage costs for each concept.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Gong;Zheng Zhan;Qing Jin;Yanyu Li;Yerlan Idelbayev;Xian Liu;Andrey Zharkov;Kfir Aberman;Sergey Tulyakov;Yanzhi Wang;Jian Ren", "authorids": "~Yifan_Gong2;~Zheng_Zhan3;~Qing_Jin1;~Yanyu_Li1;~Yerlan_Idelbayev1;~Xian_Liu1;~Andrey_Zharkov1;~Kfir_Aberman1;~Sergey_Tulyakov1;~Yanzhi_Wang2;~Jian_Ren2", "gender": "F;;;;M;M;;M;M;M;M", "homepage": "https://yifanfanfanfan.github.io/;;;;http://graduatestudent.ucmerced.edu/yidelbayev/;https://alvinliu0.github.io/;;https://kfiraberman.github.io/;http://www.stulyakov.com/;https://web.northeastern.edu/yanzhiwang/;https://alanspike.github.io/", "dblp": "49/3073-4.html;156/4008-1.html;37/11144;194/5818;203/8094;;;;40/6115;;59/2180-5", "google_scholar": "U_gevVgAAAAJ;hwTuEX0AAAAJ;X9iggBcAAAAJ;https://scholar.google.com/citations?hl=en;nAaroNMAAAAJ;https://scholar.google.com/citations?hl=en-us;;https://scholar.google.co.il/citations?user=jdbZDakAAAAJ;mgzXR0sAAAAJ;a7akgIEAAAAJ;https://scholar.google.co.jp/citations?user=vDALiU4AAAAJ", "orcid": "0000-0002-3912-097X;;0000-0001-8795-9297;;;0000-0001-9817-7418;;;;;", "linkedin": "yifan-gong-3059b8132/;;;;;xian-liu-9840b52a3/;;;sergeytulyakov/;;", "or_profile": "~Yifan_Gong2;~Zheng_Zhan3;~Qing_Jin1;~Yanyu_Li1;~Yerlan_Idelbayev1;~Xian_Liu1;~Andrey_Zharkov1;~Kfir_Aberman1;~Sergey_Tulyakov1;~Yanzhi_Wang2;~Jian_Ren2", "aff": "Northeastern University;Northeastern University;Snap Inc.;Northeastern University;Snap Inc.;The Chinese University of Hong Kong;;Snap Inc.;Snap Inc.;Northeastern University;Snap Inc.", "aff_domain": "neu.edu;neu.edu;snap.com;northeastern.edu;snapchat.com;cuhk.edu.hk;;snap.com;snapchat.com;neu.edu;snapchat.com", "position": "PhD student;PhD student;Researcher;PhD student;Researcher;PhD student;;Researcher;Director of Research;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\ngong2024egan,\ntitle={E\\${\\textasciicircum}2\\${GAN}: Efficient Training of Efficient {GAN}s for Image-to-Image Translation},\nauthor={Yifan Gong and Zheng Zhan and Qing Jin and Yanyu Li and Yerlan Idelbayev and Xian Liu and Andrey Zharkov and Kfir Aberman and Sergey Tulyakov and Yanzhi Wang and Jian Ren},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lrPrkWXqzd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9496367, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18204567626855748828&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 6, "email": "neu.edu;neu.edu;snap.com;northeastern.edu;snapchat.com;cuhk.edu.hk;;snap.com;snapchat.com;neu.edu;snapchat.com", "author_num": 11, "aff_unique_index": "0;0;1;0;1;2;1;1;0;1", "aff_unique_norm": "Northeastern University;Snap Inc.;Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northeastern.edu;https://www.snapinc.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "NEU;Snap;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "DistiLLM: Towards Streamlined Distillation for Large Language Models", "status": 
"Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33197", "id": "lsHZNNoC7r", "proceeding": "https://proceedings.mlr.press/v235/ko24c.html", "pdf": "https://openreview.net/pdf?id=lsHZNNoC7r", "openreview": "https://openreview.net/forum?id=lsHZNNoC7r", "author_site": "Jongwoo Ko, Sungnyun Kim, Tianyi Chen, Se-Young Yun", "tldr": "", "abstract": "Knowledge distillation (KD) is widely used for compressing a teacher model to a smaller student model, reducing its inference cost and memory footprint while preserving model capabilities. However, current KD methods for auto-regressive sequence models (e.g., large language models) suffer from missing a standardized objective function. Moreover, the recent use of student-generated outputs to address training-inference mismatches has significantly escalated computational costs. To tackle these issues, we introduce DistiLLM, a more effective and efficient KD framework for auto-regressive language models. DistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence loss, where we unveil and leverage its theoretical properties, and (2) an adaptive off-policy approach designed to enhance the efficiency in utilizing student-generated outputs. Extensive experiments, including instruction-following tasks, demonstrate the effectiveness of DistiLLM in building high-performing student models while achieving up to 4.3$\\times$ speedup compared to recent KD methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jongwoo Ko;Sungnyun Kim;Tianyi Chen;Se-Young Yun", "authorids": "~Jongwoo_Ko1;~Sungnyun_Kim1;~Tianyi_Chen3;~Se-Young_Yun1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/jongwooko;https://bit.ly/sungnyunkim;;https://fbsqkd.github.io", "dblp": "286/1503;276/5441;;23/8862", "google_scholar": "l2jkwHwAAAAJ;DsWny60AAAAJ;2BahjdkAAAAJ;X_IAjb8AAAAJ", "orcid": ";0000-0002-3251-1812;;", "linkedin": "jongwoo-ko-8b93051b4/;sungnyun-kim-38a029242/;tianyi-chen-b65502b3/;seyoung-yun-395130ab/", "or_profile": "~Jongwoo_Ko1;~Sungnyun_Kim1;~Tianyi_Chen3;~Se-Young_Yun1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Microsoft;KAIST", "aff_domain": "kaist.ac.kr;kaist.ac.kr;microsoft.com;kaist.ac.kr", "position": "PhD student;PhD student;Senior Researcher;Assistant Professor", "bibtex": "@inproceedings{\nko2024distillm,\ntitle={Disti{LLM}: Towards Streamlined Distillation for Large Language Models},\nauthor={Jongwoo Ko and Sungnyun Kim and Tianyi Chen and Se-Young Yun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lsHZNNoC7r}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1621250, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13464185332156371483&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;kaist.ac.kr;microsoft.com;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.kaist.ac.kr;https://www.microsoft.com", "aff_unique_abbr": "KAIST;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "MVMoE: Multi-Task Vehicle Routing 
Solver with Mixture-of-Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33196", "id": "lsQnneYa8p", "proceeding": "https://proceedings.mlr.press/v235/zhou24c.html", "pdf": "https://openreview.net/pdf?id=lsQnneYa8p", "openreview": "https://openreview.net/forum?id=lsQnneYa8p", "author_site": "Jianan Zhou, Zhiguang Cao, Yaoxin Wu, Wen Song, Yining Ma, Jie Zhang, Xu Chi", "tldr": "", "abstract": "Learning to solve vehicle routing problems (VRPs) has garnered much attention. However, most neural solvers are only structured and trained independently on a specific problem, making them less generic and practical. In this paper, we aim to develop a unified neural solver that can cope with a range of VRP variants simultaneously. Specifically, we propose a multi-task vehicle routing solver with mixture-of-experts (MVMoE), which greatly enhances the model capacity without a proportional increase in computation. We further develop a hierarchical gating mechanism for the MVMoE, delivering a good trade-off between empirical performance and computational complexity. Experimentally, our method significantly promotes zero-shot generalization performance on 10 unseen VRP variants, and showcases decent results on the few-shot setting and real-world benchmark instances. We further conduct extensive studies on the effect of MoE configurations in solving VRPs, and observe the superiority of hierarchical gating when facing out-of-distribution data. The source code is available at: https://github.com/RoyalSkye/Routing-MVMoE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianan Zhou;Zhiguang Cao;Yaoxin Wu;Wen Song;Yining Ma;Jie Zhang;Xu Chi", "authorids": "~Jianan_Zhou1;~Zhiguang_Cao1;~Yaoxin_Wu2;~Wen_Song1;~Yining_Ma1;~Jie_Zhang9;~Xu_Chi1", "gender": "M;M;M;M;M;M;M", "homepage": "https://royalskye.github.io/;https://zhiguangcaosg.github.io/;https://songwenas12.github.io/;https://yining043.github.io/;https://personal.ntu.edu.sg/zhangj/;;https://research.tue.nl/en/persons/yaoxin-wu", "dblp": "296/2326-2;178/8621;50/5489;160/6245-1;84/6889-2;;192/4964", "google_scholar": "9T58m-EAAAAJ;https://scholar.google.com.sg/citations?user=2R-cOkYAAAAJ;s8Nz-xoAAAAJ;4_VyBTsAAAAJ;IFV_RdMAAAAJ;https://scholar.google.com/citations?hl=en;0qRnmK8AAAAJ", "orcid": "0000-0002-4896-148X;0000-0002-4499-759X;0000-0001-7624-1861;0000-0002-6639-8547;;0000-0001-5480-3974;0000-0002-3625-6599", "linkedin": ";;;yiningma/;;;", "or_profile": "~Jianan_Zhou1;~Zhiguang_Cao1;~Wen_Song1;~Yining_Ma1;~Jie_Zhang9;~Xu_Chi1;~YAOXIN_WU1", "aff": "Nanyang Technological University;Singapore Management University;Shandong University;Nanyang Technological University;Nanyang Technological University;Singapore Institute of Manufacturing Technology, A*STAR;Eindhoven University of Technology", "aff_domain": "ntu.edu.sg;smu.edu.sg;sdu.edu.cn;ntu.edu.sg;ntu.edu.sg;simtech.a-star.edu.sg;tue.nl", "position": "PhD student;Assistant Professor;Associate Professor;Research Fellow;Full Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhou2024mvmoe,\ntitle={{MVM}oE: Multi-Task Vehicle Routing Solver with Mixture-of-Experts},\nauthor={Jianan Zhou and Zhiguang Cao and Yaoxin Wu and Wen Song and Yining Ma and Jie Zhang and Xu Chi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lsQnneYa8p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 983757, "rating": "", "rating_avg": 0, "replies_avg": 0, 
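Returning to the DistiLLM entry above, its skew Kullback-Leibler divergence admits a very short sketch; the token distributions and the skew coefficient alpha below are illustrative assumptions.

```python
import numpy as np

def skew_kl(p, q, alpha=0.1):
    """Skew KL divergence: KL(p || alpha*p + (1-alpha)*q).

    Mixing a little of p into q keeps the divergence finite even
    where the student q underweights tokens the teacher p supports."""
    m = alpha * p + (1 - alpha) * q
    return np.sum(p * (np.log(p) - np.log(m)))

p = np.array([0.7, 0.2, 0.1])   # teacher token distribution
q = np.array([0.1, 0.1, 0.8])   # student token distribution
print(skew_kl(p, q))
```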
"authors#_avg": 7, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=825289983616497384&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 11, "email": "ntu.edu.sg;smu.edu.sg;sdu.edu.cn;ntu.edu.sg;ntu.edu.sg;simtech.a-star.edu.sg;tue.nl", "author_num": 7, "aff_unique_index": "0;1;2;0;0;3;4", "aff_unique_norm": "Nanyang Technological University;Singapore Management University;Shandong University;Singapore Institute of Manufacturing Technology;Eindhoven University of Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.smu.edu.sg;http://www.sdu.edu.cn;https://www.simtech.a-star.edu.sg;https://www.tue.nl", "aff_unique_abbr": "NTU;SMU;SDU;SIMTech;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;2", "aff_country_unique": "Singapore;China;Netherlands" }, { "title": "CauDiTS: Causal Disentangled Domain Adaptation of Multivariate Time Series", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33195", "id": "lsavZkUjFZ", "proceeding": "https://proceedings.mlr.press/v235/lu24i.html", "pdf": "https://openreview.net/pdf?id=lsavZkUjFZ", "openreview": "https://openreview.net/forum?id=lsavZkUjFZ", "author_site": "Junxin Lu, Shiliang Sun", "tldr": "", "abstract": "Unsupervised domain adaptation of multivariate time series aims to train a model to adapt its classification ability from a labeled source domain to an unlabeled target domain, where there are differences in the distribution between domains. Existing methods extract domain-invariant features directly via a shared feature extractor, neglecting the exploration of the underlying causal patterns, which undermines their reliability, especially in complex multivariate dynamic systems. To address this problem, we propose CauDiTS, an innovative framework for unsupervised domain adaptation of multivariate time series. CauDiTS adopts an adaptive rationale disentangler to disentangle domain-common causal rationales and domain-specific correlations from variable interrelationships. The stability of causal rationales across domains is vital for filtering domainspecific perturbations and facilitating the extraction of domain-invariant representations. Moreover, we promote the cross-domain consistency of intra-class causal rationales employing the learning strategies of causal prototype consistency and domain-intervention causality invariance. 
CauDiTS is evaluated on four benchmark datasets, demonstrating its effectiveness and outperforming state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "junxin lu;Shiliang Sun", "authorids": "~junxin_lu1;~Shiliang_Sun1", "gender": "M;", "homepage": ";", "dblp": ";", "google_scholar": "https://scholar.google.com.hk/citations?user=5d1BqiEAAAAJ;", "orcid": "0000-0003-3653-0858;", "linkedin": ";", "or_profile": "~junxin_lu1;~Shiliang_Sun1", "aff": "East China Normal University;", "aff_domain": "ecnu.edu.cn;", "position": "PhD student;", "bibtex": "@inproceedings{\nlu2024caudits,\ntitle={CauDi{TS}: Causal Disentangled Domain Adaptation of Multivariate Time Series},\nauthor={junxin lu and Shiliang Sun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lsavZkUjFZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2839002, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17110164944651850098&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ecnu.edu.cn;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "East China Normal University", "aff_unique_dep": "", "aff_unique_url": "http://www.ecnu.edu.cn", "aff_unique_abbr": "ECNU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Convergence and Trade-Offs in Riemannian Gradient Descent and Riemannian Proximal Point", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33194", "id": "ltb2XaIr9p", "proceeding": "https://proceedings.mlr.press/v235/marti-nez-rubio24a.html", "pdf": "https://openreview.net/pdf?id=ltb2XaIr9p", "openreview": "https://openreview.net/forum?id=ltb2XaIr9p", "author_site": "David Mart\u00ednez-Rubio, Christophe Roux, Sebastian Pokutta", "tldr": "", "abstract": "In this work, we analyze two of the most fundamental algorithms in geodesically convex optimization: Riemannian gradient descent and (possibly inexact) Riemannian proximal point. We quantify their rates of convergence and produce different variants with several trade-offs. Crucially, we show the iterates naturally stay in a ball around an optimizer, of radius depending on the initial distance and, in some cases, on the curvature. Previous works simply assumed bounded iterates, resulting in rates that were not fully quantified. We also provide an implementable inexact proximal point algorithm and prove several new useful properties of Riemannian proximal methods: they work when positive curvature is present, the proximal operator does not move points away from any optimizer, and we quantify the smoothness of its induced Moreau envelope. 
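To ground the Riemannian gradient descent analysis above, here is a minimal sketch on a positively curved manifold, the unit sphere, where the leading eigenvector of a matrix is found by minimizing f(x) = -x^T A x; the step size, iteration budget, and test matrix are arbitrary choices.

```python
import numpy as np

def sphere_exp(x, v):
    """Exponential map on the unit sphere at x, applied to tangent vector v."""
    n = np.linalg.norm(v)
    if n < 1e-12:
        return x
    return np.cos(n) * x + np.sin(n) * (v / n)

def rgd_leading_eigvec(A, x0, eta=0.1, steps=200):
    """Riemannian gradient descent for f(x) = -x^T A x on the sphere."""
    x = x0 / np.linalg.norm(x0)
    for _ in range(steps):
        egrad = -2.0 * A @ x
        rgrad = egrad - (x @ egrad) * x      # project onto the tangent space at x
        x = sphere_exp(x, -eta * rgrad)      # geodesic step; iterate stays on the sphere
    return x

A = np.diag([3.0, 2.0, 1.0])
x = rgd_leading_eigvec(A, np.array([1.0, 1.0, 1.0]))  # converges to +/- e_1
```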
Further, we explore beyond our theory with empirical tests.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Mart\u00ednez-Rubio;Christophe Roux;Sebastian Pokutta", "authorids": "~David_Mart\u00ednez-Rubio2;~Christophe_Roux1;~Sebastian_Pokutta1", "gender": ";;M", "homepage": ";;http://www.pokutta.com", "dblp": ";;75/7718", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~David_Mart\u00ednez-Rubio2;~Christophe_Roux1;~Sebastian_Pokutta1", "aff": ";;TU Berlin", "aff_domain": ";;tu-berlin.de", "position": ";;Full Professor", "bibtex": "@inproceedings{\nmart{\\'\\i}nez-rubio2024convergence,\ntitle={Convergence and Trade-Offs in Riemannian Gradient Descent and Riemannian Proximal Point},\nauthor={David Mart{\\'\\i}nez-Rubio and Christophe Roux and Sebastian Pokutta},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ltb2XaIr9p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 764993, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18175123072675303275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";;tu-berlin.de", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Technische Universit\u00e4t Berlin", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-berlin.de", "aff_unique_abbr": "TU Berlin", "aff_campus_unique_index": "0", "aff_campus_unique": "Berlin", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Jetfire: Efficient and Accurate Transformer Pretraining with INT8 Data Flow and Per-Block Quantization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33193", "id": "ltzTHGFF5i", "proceeding": "https://proceedings.mlr.press/v235/xi24b.html", "pdf": "https://openreview.net/pdf?id=ltzTHGFF5i", "openreview": "https://openreview.net/forum?id=ltzTHGFF5i", "author_site": "Haocheng Xi, Yuxiang Chen, Kang Zhao, KAI JUN TEH, Jianfei Chen, Jun Zhu", "tldr": "", "abstract": "Pretraining transformers is generally time-consuming. Fully quantized training (FQT) is a promising approach to speed up pretraining. However, most FQT methods adopt a quantize-compute-dequantize procedure, which often leads to suboptimal speedup and significant performance degradation when used in transformers due to the high memory access overheads and low-precision computations. In this work, we propose Jetfire, an efficient and accurate INT8 training method specific to transformers. Our method features an INT8 data flow to optimize memory access and a per-block quantization method to maintain the accuracy of pretrained transformers. Extensive experiments demonstrate that our INT8 FQT method achieves comparable accuracy to the FP16 training baseline and outperforms existing INT8 training methods for transformers.
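The per-block quantization ingredient of the Jetfire entry above can be sketched as follows; the 32x32 block size and the symmetric INT8 scheme are illustrative assumptions, and the paper's INT8 data flow and kernels are considerably more involved.

```python
import numpy as np

def quantize_per_block(x, block=32):
    """Symmetric per-block INT8 quantization of a 2-D tensor.

    Each (block x block) tile gets its own scale, which localizes
    outliers instead of letting one extreme value set a tensor-wide scale."""
    h, w = x.shape
    assert h % block == 0 and w % block == 0
    q = np.empty((h, w), dtype=np.int8)
    scales = np.empty((h // block, w // block), dtype=np.float32)
    for i in range(0, h, block):
        for j in range(0, w, block):
            tile = x[i:i + block, j:j + block]
            s = np.abs(tile).max() / 127.0 + 1e-12
            scales[i // block, j // block] = s
            q[i:i + block, j:j + block] = np.round(tile / s).astype(np.int8)
    return q, scales   # dequantize a tile via q_tile * its scale

q, s = quantize_per_block(np.random.randn(64, 64).astype(np.float32))
```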
Moreover, for a standard transformer block, our method offers an end-to-end training speedup of 1.42x and a 1.49x memory reduction compared to the FP16 baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haocheng Xi;Yuxiang Chen;Kang Zhao;KAI JUN TEH;Jianfei Chen;Jun Zhu", "authorids": "~Haocheng_Xi1;~Yuxiang_Chen2;~Kang_Zhao5;~KAI_JUN_TEH1;~Jianfei_Chen1;~Jun_Zhu2", "gender": "M;M;M;M;M;M", "homepage": "https://haochengxi.github.io/;https://github.com/cyx0406;https://www.homepage.url;https://github.com/kaijun924;http://ml.cs.tsinghua.edu.cn/~jianfei;http://ml.cs.tsinghua.edu.cn/~jun", "dblp": "349/7931;;;;48/6809-1;50/2644-1", "google_scholar": "klZ2MMcAAAAJ;;vXXcc7MAAAAJ;;di5RZ1MAAAAJ;axsP38wAAAAJ", "orcid": ";;;;;", "linkedin": "haocheng-xi-412511323/;;;;;", "or_profile": "~Haocheng_Xi1;~Yuxiang_Chen2;~Kang_Zhao5;~KAI_JUN_TEH1;~Jianfei_Chen1;~Jun_Zhu2", "aff": "Tsinghua University;Tsinghua University;Huawei Technologies Ltd.;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mails.tsinghua.edu.cn;huawei.com;mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn", "position": "Undergrad student;Undergrad student;Researcher;Undergrad student;Associate Professor;Professor", "bibtex": "@inproceedings{\nxi2024jetfire,\ntitle={Jetfire: Efficient and Accurate Transformer Pretraining with {INT}8 Data Flow and Per-Block Quantization},\nauthor={Haocheng Xi and Yuxiang Chen and Kang Zhao and KAI JUN TEH and Jianfei Chen and Jun Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ltzTHGFF5i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 926904, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3014379044058880277&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;mails.tsinghua.edu.cn;huawei.com;mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Tsinghua University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com", "aff_unique_abbr": "THU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Two Stones Hit One Bird: Bilevel Positional Encoding for Better Length Extrapolation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33192", "id": "luqH1eL4PN", "proceeding": "https://proceedings.mlr.press/v235/he24c.html", "pdf": "https://openreview.net/pdf?id=luqH1eL4PN", "openreview": "https://openreview.net/forum?id=luqH1eL4PN", "author_site": "Zhenyu He, Guhao Feng, Shengjie Luo, Kai Yang, Liwei Wang, Jingjing Xu, Zhi Zhang, Hongxia Yang, Di He", "tldr": "", "abstract": "In this work, we leverage the intrinsic segmentation of language sequences and design a new positional encoding method called Bilevel Positional Encoding (BiPE). For each position, our BiPE blends an intra-segment encoding and an inter-segment encoding. The intra-segment encoding identifies the locations within a segment and helps the model capture the semantic information therein via absolute positional encoding. 
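A tiny sketch of the bilevel position assignment described in the BiPE entry above; treating a separator token as the segment boundary is a simplifying assumption.

```python
def bilevel_positions(tokens, sep="."):
    """Assign each token an intra-segment and an inter-segment index.

    Intra-segment ids would feed an absolute encoding; inter-segment
    ids a relative one."""
    intra, inter, seg, pos = [], [], 0, 0
    for t in tokens:
        intra.append(pos)
        inter.append(seg)
        if t == sep:
            seg, pos = seg + 1, 0   # new segment after the separator
        else:
            pos += 1
    return intra, inter

print(bilevel_positions(["a", "b", ".", "c", "d"]))
# ([0, 1, 2, 0, 1], [0, 0, 0, 1, 1])
```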
The inter-segment encoding specifies the segment index, models the relationships between segments, and aims to improve extrapolation capabilities via relative positional encoding. Theoretical analysis shows this disentanglement of positional information makes learning more effective. The empirical results also show that our BiPE has superior length extrapolation capabilities across a wide range of tasks in diverse text modalities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenyu He;Guhao Feng;Shengjie Luo;Kai Yang;Liwei Wang;Jingjing Xu;Zhi Zhang;Hongxia Yang;Di He", "authorids": "~Zhenyu_He3;~Guhao_Feng1;~Shengjie_Luo1;yangkai@stu.pku.edu.cn;~Liwei_Wang1;~Jingjing_Xu1;~Zhi_Zhang4;~Hongxia_Yang2;~Di_He1", "gender": "M;M;M;;M;F;M;F;M", "homepage": "https://zhenyuhe00.github.io/;;https://lsj2408.github.io;;http://www.liweiwang-pku.com/;;https://zhreshold.github.io;https://www4.comp.polyu.edu.hk/~hongxyang/;https://dihe-pku.github.io/", "dblp": "355/4626;;274/2110;;;25/624;;;74/184", "google_scholar": "https://scholar.google.co.jp/citations?user=bKwkUO4AAAAJ;wmDqYvUAAAAJ;ImWO7WYAAAAJ;;VZHxoh8AAAAJ;;nZr0oXQAAAAJ;iJlC5mMAAAAJ;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ", "orcid": ";;;;;;0000-0003-0249-1678;;", "linkedin": ";;shengjie-luo-ba6137193/;;;;;;", "or_profile": "~Zhenyu_He3;~Guhao_Feng1;~Shengjie_Luo1;yangkai@stu.pku.edu.cn;~Liwei_Wang1;~Jingjing_Xu1;~Zhi_Zhang4;~Hongxia_Yang2;~Di_He1", "aff": "Peking University;Peking University;Microsoft;;Peking University;;ByteDance Inc.;ByteDance Inc.;Microsoft", "aff_domain": "pku.edu.cn;pku.edu.cn;microsoft.com;;pku.edu.cn;;bytedance.com;bytedance.com;microsoft.com", "position": "PhD student;Undergrad student;Intern;;Full Professor;;Researcher;Principal Researcher;Senior Researcher", "bibtex": "@inproceedings{\nhe2024two,\ntitle={Two Stones Hit One Bird: Bilevel Positional Encoding for Better Length Extrapolation},\nauthor={Zhenyu He and Guhao Feng and Shengjie Luo and Kai Yang and Liwei Wang and Jingjing Xu and Zhi Zhang and Hongxia Yang and Di He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=luqH1eL4PN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1794050, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14538707582392691765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;pku.edu.cn;microsoft.com;;pku.edu.cn;;bytedance.com;bytedance.com;microsoft.com", "author_num": 9, "aff_unique_index": "0;0;1;0;2;2;1", "aff_unique_norm": "Peking University;Microsoft;ByteDance", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com;https://www.bytedance.com", "aff_unique_abbr": "Peking U;Microsoft;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Degeneration-free Policy Optimization: RL Fine-Tuning for Language Models without Degeneration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33191", "id": "lwTshcWlmB", "proceeding": "https://proceedings.mlr.press/v235/jang24b.html", "pdf": "https://openreview.net/pdf?id=lwTshcWlmB", "openreview": "https://openreview.net/forum?id=lwTshcWlmB", "author_site": "Youngsoo Jang, Geon-Hyeong Kim, Byoungjip Kim, Yu Jin Kim, Honglak Lee, 
Moontae Lee", "tldr": "", "abstract": "As the pre-training objectives (e.g., next token prediction) of language models (LMs) are inherently not aligned with task scores, optimizing LMs to achieve higher downstream task scores is essential. One of the promising approaches is to fine-tune LMs through reinforcement learning (RL). However, conventional RL methods based on PPO and a penalty of KL divergence are vulnerable to text degeneration where LMs do not generate natural texts anymore after RL fine-tuning. To address this problem, we provide Degeneration-free Policy Optimization (DfPO) that can fine-tune LMs to generate texts that achieve improved downstream task scores, while preserving the ability to generate natural texts. To achieve this, we introduce KL-masking which masks out the actions that potentially cause deviation from the reference policy when its likelihood is increased or decreased. Then, we devise truncated advantage functions for separately performing likelihood maximization and minimization to improve the task performance. In the experiments, we provide the results of DfPO and baseline algorithms on various generative NLP tasks including text continuation, text detoxification, and commonsense generation. Our experiments demonstrate that DfPO successfully improves the downstream task scores while preserving the ability to generate natural texts, without requiring additional hyperparameter search.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Youngsoo Jang;Geon-Hyeong Kim;Byoungjip Kim;Yu Jin Kim;Honglak Lee;Moontae Lee", "authorids": "~Youngsoo_Jang2;~Geon-Hyeong_Kim2;~Byoungjip_Kim1;~Yu_Jin_Kim1;~Honglak_Lee2;~Moontae_Lee1", "gender": ";M;;;;", "homepage": "http://www.ysjang.me;https://sites.google.com/view/ghkim;;;;https://moontae.people.uic.edu", "dblp": "195/0471;231/7707;;;;132/1761", "google_scholar": "6EoBBggAAAAJ;https://scholar.google.co.kr/citations?user=IJL0uXoAAAAJ;;;;BMvYy9cAAAAJ", "orcid": ";;;;;0000-0001-5542-3463", "linkedin": ";;;;;moontae-lee-975248123/", "or_profile": "~Youngsoo_Jang2;~Geon-Hyeong_Kim2;~Byoungjip_Kim1;~Yu_Jin_Kim1;~Honglak_Lee2;~Moontae_Lee1", "aff": "LG AI Research;LG AI Research;;LG AI Research;;University of Illinois, Chicago", "aff_domain": "lgresearch.ai;lgresearch.ai;;lgresearch.ai;;uic.edu", "position": "Researcher;Researcher;;Researcher;;Assistant Professor", "bibtex": "@inproceedings{\njang2024degenerationfree,\ntitle={Degeneration-free Policy Optimization: {RL} Fine-Tuning for Language Models without Degeneration},\nauthor={Youngsoo Jang and Geon-Hyeong Kim and Byoungjip Kim and Yu Jin Kim and Honglak Lee and Moontae Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lwTshcWlmB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1402730, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YMGjZGqQDxMJ:scholar.google.com/&scioq=Degeneration-free+Policy+Optimization:+RL+Fine-Tuning+for+Language+Models+without+Degeneration&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "lgresearch.ai;lgresearch.ai;;lgresearch.ai;;uic.edu", "author_num": 6, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "LG;University of Illinois at Chicago", "aff_unique_dep": "LG AI Research;", "aff_unique_url": "https://www.lgaires.com;https://www.uic.edu", "aff_unique_abbr": "LG AI;UIC", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Adaptive Conformal Inference by Betting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33190", "id": "lwWV4Zl3h1", "proceeding": "https://proceedings.mlr.press/v235/podkopaev24a.html", "pdf": "https://openreview.net/pdf?id=lwWV4Zl3h1", "openreview": "https://openreview.net/forum?id=lwWV4Zl3h1", "author_site": "Aleksandr Podkopaev, Darren Xu, Kuang-chih Lee", "tldr": "", "abstract": "Conformal prediction is a valuable tool for quantifying predictive uncertainty of machine learning models. However, its applicability relies on the assumption of data exchangeability, a condition which is often not met in real-world scenarios. In this paper, we consider the problem of adaptive conformal inference without any assumptions about the data generating process. Existing approaches for adaptive conformal inference are based on optimizing the pinball loss using variants of online gradient descent. A notable shortcoming of such approaches is in their explicit dependence on and sensitivity to the choice of the learning rates. In this paper, we propose a different approach for adaptive conformal inference that leverages parameter-free online convex optimization techniques. We prove that our method controls long-term miscoverage frequency at a nominal level and demonstrate its convincing empirical performance without any need of performing cumbersome parameter tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aleksandr Podkopaev;Dong Xu;Kuang-chih Lee", "authorids": "~Aleksandr_Podkopaev1;~Dong_Xu7;~Kuang-chih_Lee1", "gender": "M;;M", "homepage": "https://sashapodkopaev.com;;", "dblp": "268/0747;;l/KuangchihLee", "google_scholar": "58-8sF8AAAAJ;u7l6dOwAAAAJ;", "orcid": ";;", "linkedin": "sasha-podkopaev/;;", "or_profile": "~Aleksandr_Podkopaev1;~Dong_Xu7;~Kuang-chih_Lee1", "aff": ";Walmart Labs;", "aff_domain": ";walmartlabs.com;", "position": ";Researcher;", "bibtex": "@inproceedings{\npodkopaev2024adaptive,\ntitle={Adaptive Conformal Inference by Betting},\nauthor={Aleksandr Podkopaev and Dong Xu and Kuang-chih Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lwWV4Zl3h1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1903779, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12534179494248720039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";walmartlabs.com;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Walmart", "aff_unique_dep": "Walmart Labs", "aff_unique_url": "https://www.walmart.com", "aff_unique_abbr": "Walmart Labs", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Smooth Tchebycheff Scalarization for Multi-Objective Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33189", "id": "m4dO5L6eCp", "proceeding": "https://proceedings.mlr.press/v235/lin24y.html", "pdf": "https://openreview.net/pdf?id=m4dO5L6eCp", "openreview": "https://openreview.net/forum?id=m4dO5L6eCp", "author_site": "Xi Lin, Xiaoyuan Zhang, Zhiyuan Yang, Fei Liu, Zhenkun Wang, Qingfu Zhang", "tldr": "", "abstract": "Multi-objective optimization problems can be found in many real-world applications, where the objectives 
often conflict with each other and cannot be optimized by a single solution. In the past few decades, numerous methods have been proposed to find Pareto solutions that represent optimal trade-offs among the objectives for a given problem. However, these existing methods could have high computational complexity or may not have good theoretical properties for solving a general differentiable multi-objective optimization problem. In this work, by leveraging the smooth optimization technique, we propose a lightweight and efficient smooth Tchebycheff scalarization approach for gradient-based multi-objective optimization. It has good theoretical properties for finding all Pareto solutions with valid trade-off preferences, while enjoying significantly lower computational complexity compared to other methods. Experimental results on various real-world application problems fully demonstrate the effectiveness of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xi Lin;Xiaoyuan Zhang;Zhiyuan Yang;Fei Liu;Zhenkun Wang;Qingfu Zhang", "authorids": "~Xi_Lin2;~Xiaoyuan_Zhang2;~Zhiyuan_Yang2;~Fei_Liu14;~Zhenkun_Wang1;~Qingfu_Zhang1", "gender": "M;M;;;M;M", "homepage": "https://xi-l.github.io/;;;;https://faculty.sustech.edu.cn/wangzk3/en/;https://www.cs.cityu.edu.hk/~qzhan7/index.html", "dblp": "43/489-1;;;;96/9114;98/1240.html", "google_scholar": "QB_MUboAAAAJ;KQj18L8AAAAJ;;;https://scholar.google.com.sg/citations?user=r9ezy2gAAAAJ;https://scholar.google.co.uk/citations?user=nhL9PHwAAAAJ", "orcid": ";0000-0002-3852-645X;;;0000-0003-1152-6780;", "linkedin": ";;;;;", "or_profile": "~Xi_Lin2;~Xiaoyuan_Zhang2;~Zhiyuan_Yang2;~Fei_Liu14;~Zhenkun_Wang1;~Qingfu_Zhang1", "aff": "City University of Hong Kong;City University of Hong Kong;;;Southern University of Science and Technology;City University of Hong Kong", "aff_domain": "cityu.edu.hk;cityu.edu.hk;;;sustech.edu.cn;cityu.edu.hk", "position": "Postdoc;PhD student;;;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nlin2024smooth,\ntitle={Smooth Tchebycheff Scalarization for Multi-Objective Optimization},\nauthor={Xi Lin and Xiaoyuan Zhang and Zhiyuan Yang and Fei Liu and Zhenkun Wang and Qingfu Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=m4dO5L6eCp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5828379, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6430639641275269027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cityu.edu.hk;cityu.edu.hk;;;sustech.edu.cn;cityu.edu.hk", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "City University of Hong Kong;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cityu.edu.hk;https://www.sustech.edu.cn", "aff_unique_abbr": "CityU;SUSTech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "When Representations Align: Universality in Representation Learning Dynamics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33188", "id": "m5nB7ucXHT", "proceeding": "https://proceedings.mlr.press/v235/van-rossem24a.html", "pdf": "https://openreview.net/pdf?id=m5nB7ucXHT", "openreview": "https://openreview.net/forum?id=m5nB7ucXHT", "author_site": "Loek
van Rossem, Andrew Saxe", "tldr": "", "abstract": "Deep neural networks come in many sizes and architectures. The choice of architecture, in conjunction with the dataset and learning algorithm, is commonly understood to affect the learned neural representations. Yet, recent results have shown that different architectures learn representations with striking qualitative similarities. Here we derive an effective theory of representation learning under the assumption that the encoding map from input to hidden representation and the decoding map from representation to output are arbitrary smooth functions. This theory schematizes representation learning dynamics in the regime of complex, large architectures, where hidden representations are not strongly constrained by the parametrization. We show through experiments that the effective theory describes aspects of representation learning dynamics across a range of deep networks with different activation functions and architectures, and exhibits phenomena similar to the \u201crich\u201d and \u201clazy\u201d regime. While many network behaviors depend quantitatively on architecture, our findings point to certain behaviors that are widely conserved once models are sufficiently flexible.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Loek van Rossem;Andrew M Saxe", "authorids": "~Loek_van_Rossem1;~Andrew_M_Saxe1", "gender": "M;M", "homepage": ";https://www.saxelab.org", "dblp": ";39/6894", "google_scholar": ";h0Al1fcAAAAJ", "orcid": ";0000-0002-9831-8812", "linkedin": "loek-van-rossem-95a5b4165/;", "or_profile": "~Loek_van_Rossem1;~Andrew_M_Saxe1", "aff": "University College London, University of London;University College London, University of London", "aff_domain": "ucl.ac.uk;ucl.ac.uk", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nrossem2024when,\ntitle={When Representations Align: Universality in Representation Learning Dynamics},\nauthor={Loek van Rossem and Andrew M Saxe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=m5nB7ucXHT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5293068, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5342643745317283820&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "ucl.ac.uk;ucl.ac.uk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Layer-Aware Analysis of Catastrophic Overfitting: Revealing the Pseudo-Robust Shortcut Dependency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33187", "id": "m8lCi7rG4u", "proceeding": "https://proceedings.mlr.press/v235/lin24v.html", "pdf": "https://openreview.net/pdf?id=m8lCi7rG4u", "openreview": "https://openreview.net/forum?id=m8lCi7rG4u", "author_site": "Runqi Lin, Chaojian Yu, Bo Han, Hang Su, Tongliang Liu", "tldr": "", "abstract": "Catastrophic overfitting (CO) presents a significant challenge in single-step adversarial training (AT), manifesting as highly distorted deep neural networks (DNNs) that are vulnerable to multi-step adversarial attacks. 
However, the underlying factors that lead to the distortion of decision boundaries remain unclear. In this work, we delve into the specific changes within different DNN layers and discover that during CO, the earlier layers are more susceptible, experiencing earlier and greater distortion, while the later layers show relative insensitivity. Our analysis further reveals that this increased sensitivity in the earlier layers stems from the formation of $\\textit{pseudo-robust shortcuts}$, which alone can impeccably defend against single-step adversarial attacks but bypass genuine-robust learning, resulting in distorted decision boundaries. Eliminating these shortcuts can partially restore robustness in DNNs from the CO state, thereby verifying that dependence on them triggers the occurrence of CO. This understanding motivates us to implement adaptive weight perturbations across different layers to hinder the generation of $\\textit{pseudo-robust shortcuts}$, consequently mitigating CO. Extensive experiments demonstrate that our proposed method, $\\textbf{L}$ayer-$\\textbf{A}$ware Adversarial Weight $\\textbf{P}$erturbation (LAP), can effectively prevent CO and further enhance robustness.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Runqi Lin;Chaojian Yu;Bo Han;Hang Su;Tongliang Liu", "authorids": "~Runqi_Lin1;~Chaojian_Yu1;~Bo_Han1;~Hang_Su3;~Tongliang_Liu1", "gender": "M;M;M;M;M", "homepage": "https://runqilin.github.io;;https://tongliang-liu.github.io/;https://bhanml.github.io/;", "dblp": "359/1108;223/9872;150/6667;241/0472-3;26/5371-6", "google_scholar": "Zg7PKbcAAAAJ;b3ltuG8AAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ;dxN1_X0AAAAJ", "orcid": "0009-0000-6607-7754;;;;", "linkedin": ";;;;", "or_profile": "~Runqi_Lin1;~Chaojian_Yu1;~Tongliang_Liu1;~bo_han2;~Hang_Su2", "aff": "University of Sydney;The University of Sydney;Mohamed bin Zayed University of Artificial Intelligence;MBZUAI;Tsinghua University", "aff_domain": "usyd.edu.au;uni.sydney.edu.au;mbzuai.ac.ae;mbzuai.ac.ae;tsinghua.edu.cn", "position": "PhD student;PhD student;Affiliated Associate Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\nlin2024layeraware,\ntitle={Layer-Aware Analysis of Catastrophic Overfitting: Revealing the Pseudo-Robust Shortcut Dependency},\nauthor={Runqi Lin and Chaojian Yu and Bo Han and Hang Su and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=m8lCi7rG4u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9297884, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7152825765943395102&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "usyd.edu.au;uni.sydney.edu.au;mbzuai.ac.ae;mbzuai.ac.ae;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "University of Sydney;Mohamed bin Zayed University of Artificial Intelligence;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sydney.edu.au;https://mbzuai.ac.ae;https://www.tsinghua.edu.cn", "aff_unique_abbr": "USYD;MBZUAI;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;2", "aff_country_unique": "Australia;United Arab Emirates;China" }, { "title": "Smooth Min-Max Monotonic Networks", "status": "Poster", "track": "main", "site":
"https://icml.cc/virtual/2024/poster/33186", "id": "m8t1yzfBsJ", "proceeding": "https://proceedings.mlr.press/v235/igel24a.html", "pdf": "https://openreview.net/pdf?id=m8t1yzfBsJ", "openreview": "https://openreview.net/forum?id=m8t1yzfBsJ", "tldr": "", "abstract": "Monotonicity constraints are powerful regularizers in statistical modelling. They can support fairness in computer-aided decision making and increase plausibility in data-driven scientific models. The seminal min-max (MM) neural network architecture ensures monotonicity, but often gets stuck in undesired local optima during training because of partial derivatives being zero when computing extrema. We propose a simple modification of the MM network using strictly-increasing smooth minimum and maximum functions that alleviates this problem. The resulting smooth min-max (SMM) network module inherits the asymptotic approximation properties from the MM architecture. It can be used within larger deep learning systems trained end-to-end. The SMM module is conceptually simple and computationally less demanding than state-of-the-art neural networks for monotonic modelling. Our experiments show that this does not come with a loss in generalization performance compared to alternative neural and non-neural approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christian Igel", "authorids": "~Christian_Igel1", "gender": "M", "homepage": "https://christian-igel.github.io/", "dblp": "38/6146", "google_scholar": "https://scholar.google.dk/citations?user=d-jF4zIAAAAJ", "orcid": "0000-0003-2868-0856", "linkedin": "christianigel/", "or_profile": "~Christian_Igel1", "aff": "University of Copenhagen", "aff_domain": "ku.dk", "position": "Full Professor", "bibtex": "@inproceedings{\nigel2024smooth,\ntitle={Smooth Min-Max Monotonic Networks},\nauthor={Christian Igel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=m8t1yzfBsJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 595057, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VAfUke2I2TUJ:scholar.google.com/&scioq=Smooth+Min-Max+Monotonic+Networks&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "ku.dk", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_country_unique_index": "0", "aff_country_unique": "Denmark" }, { "title": "Reinformer: Max-Return Sequence Modeling for Offline RL", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33185", "id": "mBc8Pestd5", "proceeding": "https://proceedings.mlr.press/v235/zhuang24b.html", "pdf": "https://openreview.net/pdf?id=mBc8Pestd5", "openreview": "https://openreview.net/forum?id=mBc8Pestd5", "author_site": "Zifeng Zhuang, Dengyun Peng, Jinxin Liu, Ziqi Zhang, Donglin Wang", "tldr": "", "abstract": "As a data-driven paradigm, offline reinforcement learning (RL) has been formulated as sequence modeling that conditions on the hindsight information including returns, goal or future trajectory. Although promising, this supervised paradigm overlooks the core objective of RL that maximizes the return. This overlook directly leads to the lack of trajectory stitching capability that affects the sequence model learning from sub-optimal data. 
In this work, we introduce the concept of max-return sequence modeling, which integrates the goal of maximizing returns into existing sequence models. We propose **Rein*for***ced Trans***for*mer** (**Rein*for*mer**), indicating that the sequence model is reinforced by the RL objective. **Rein*for*mer** additionally incorporates the objective of maximizing returns in the training phase, aiming to predict the maximum future return within the distribution. During inference, this in-distribution maximum return will guide the selection of optimal actions. Empirically, **Rein*for*mer** is competitive with classical RL methods on the D4RL benchmark and outperforms state-of-the-art sequence models, particularly in trajectory stitching ability. Code is publicly available at https://github.com/Dragon-Zhuang/Reinformer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zifeng Zhuang;Dengyun Peng;Jinxin Liu;Ziqi Zhang;Donglin Wang", "authorids": "~Zifeng_Zhuang1;~Dengyun_Peng1;~Jinxin_Liu1;~Ziqi_Zhang7;~Donglin_Wang1", "gender": "M;M;;;M", "homepage": ";https://github.com/sfasfaffa;;;https://milab.westlake.edu.cn/", "dblp": "276/5034;368/1092.html;;;", "google_scholar": ";;;;https://scholar.google.ca/citations?user=-fo6wdwAAAAJ", "orcid": ";;;;0000-0002-8188-3735", "linkedin": ";;;;", "or_profile": "~Zifeng_Zhuang1;~Dengyun_Peng1;~Jinxin_Liu1;~Ziqi_Zhang7;~Donglin_Wang1", "aff": "Zhejiang University;Westlake University;;;Westlake University", "aff_domain": "zju.edu.cn;westlake.edu.cn;;;westlake.edu.cn", "position": "PhD student;Intern;;;Associate Professor", "bibtex": "@inproceedings{\nzhuang2024reinformer,\ntitle={Reinformer: Max-Return Sequence Modeling for Offline {RL}},\nauthor={Zifeng Zhuang and Dengyun Peng and Jinxin Liu and Ziqi Zhang and Donglin Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mBc8Pestd5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1321445, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2010500215562384427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;westlake.edu.cn;;;westlake.edu.cn", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Zhejiang University;Westlake University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.westlake.edu.cn", "aff_unique_abbr": "ZJU;WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Zero-Shot Unsupervised and Text-Based Audio Editing Using DDPM Inversion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33184", "id": "mCzyRdDak5", "proceeding": "https://proceedings.mlr.press/v235/manor24a.html", "pdf": "https://openreview.net/pdf?id=mCzyRdDak5", "openreview": "https://openreview.net/forum?id=mCzyRdDak5", "author_site": "Hila Manor, Tomer Michaeli", "tldr": "", "abstract": "Editing signals using large pre-trained models, in a zero-shot manner, has recently seen rapid advancements in the image domain. However, this wave has yet to reach the audio domain. In this paper, we explore two zero-shot editing techniques for audio signals, which use DDPM inversion with pre-trained diffusion models. The first, which we coin *ZEro-shot Text-based Audio (ZETA)* editing, is adopted from the image domain.
The second, named *ZEro-shot UnSupervized (ZEUS)* editing, is a novel approach for discovering semantically meaningful editing directions without supervision. When applied to music signals, this method exposes a range of musically interesting modifications, from controlling the participation of specific instruments to improvisations on the melody. Samples and code can be found on our [examples page](https://hilamanor.github.io/AudioEditing/).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hila Manor;Tomer Michaeli", "authorids": "~Hila_Manor1;~Tomer_Michaeli1", "gender": "F;M", "homepage": "https://hilamanor.github.io/;https://tomer.net.technion.ac.il/", "dblp": "357/5326.html;70/3188.html", "google_scholar": "Pz32vm4AAAAJ;n2EbR2cAAAAJ", "orcid": "0009-0007-6851-148X;", "linkedin": "hilamanor/;", "or_profile": "~Hila_Manor1;~Tomer_Michaeli1", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion, Technion", "aff_domain": "campus.technion.ac.il;technion.ac.il", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nmanor2024zeroshot,\ntitle={Zero-Shot Unsupervised and Text-Based Audio Editing Using {DDPM} Inversion},\nauthor={Hila Manor and Tomer Michaeli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mCzyRdDak5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7847392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11752872304492719448&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "campus.technion.ac.il;technion.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "A Multimodal Automated Interpretability Agent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33183", "id": "mDw42ZanmE", "proceeding": "https://proceedings.mlr.press/v235/shaham24a.html", "pdf": "https://openreview.net/pdf?id=mDw42ZanmE", "openreview": "https://openreview.net/forum?id=mDw42ZanmE", "author_site": "Tamar Rott Shaham, Sarah Schwettmann, Franklin Wang, Achyuta Rajaram, Evan Hernandez, Jacob Andreas, Antonio Torralba", "tldr": "", "abstract": "This paper describes MAIA, a Multimodal Automated Interpretability Agent. MAIA is a system that uses neural models to automate neural model understanding tasks like feature interpretation and failure mode discovery. It equips a pre-trained vision-language model with a set of tools that support iterative experimentation on subcomponents of other models to explain their behavior. These include tools commonly used by human interpretability researchers: for synthesizing and editing inputs, computing maximally activating exemplars from real-world datasets, and summarizing and describing experimental results. Interpretability experiments proposed by MAIA compose these tools to describe and explain system behavior. We evaluate applications of MAIA to computer vision models. We first characterize MAIA\u2019s ability to describe (neuron-level) features in learned representations of images. 
Across several trained models and a novel dataset of synthetic vision neurons with paired ground-truth descriptions, MAIA produces descriptions comparable to those generated by expert human experimenters. We then show that MAIA can aid in two additional interpretability tasks: reducing sensitivity to spurious features, and automatically identifying inputs likely to be mis-classified.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tamar Rott Shaham;Sarah Schwettmann;Franklin Wang;Achyuta Rajaram;Evan Hernandez;Jacob Andreas;Antonio Torralba", "authorids": "~Tamar_Rott_Shaham1;~Sarah_Schwettmann2;~Franklin_Wang2;~Achyuta_Rajaram1;~Evan_Hernandez1;~Jacob_Andreas1;~Antonio_Torralba1", "gender": "F;F;M;;M;M;M", "homepage": "https://tamarott.github.io/;;;https://flybamboo.org;https://evandez.com;http://web.mit.edu/jda/www;http://web.mit.edu/torralba/www//", "dblp": "185/7904;;;;;97/8154;t/AntonioBTorralba", "google_scholar": "https://scholar.google.co.il/citations?user=YRJ-ePMAAAAJ;;bLoeVJYAAAAJ;;;dnZ8udEAAAAJ;https://scholar.google.com.tw/citations?user=8cxDHS4AAAAJ", "orcid": ";0000-0001-6385-1396;;;0000-0002-8876-1781;;", "linkedin": ";;frankxwang/;;evandez/;;", "or_profile": "~Tamar_Rott_Shaham1;~Sarah_Schwettmann2;~Franklin_Wang2;~Achyuta_Rajaram1;~Evan_Hernandez1;~Jacob_Andreas1;~Antonio_Torralba1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Microsoft;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;csail.mit.edu;mit.edu;mit.edu;microsoft.com;mit.edu", "position": "Postdoc;Postdoc;Undergrad student;Intern;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nshaham2024a,\ntitle={A Multimodal Automated Interpretability Agent},\nauthor={Tamar Rott Shaham and Sarah Schwettmann and Franklin Wang and Achyuta Rajaram and Evan Hernandez and Jacob Andreas and Antonio Torralba},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mDw42ZanmE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9710525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1169787175816466698&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "mit.edu;mit.edu;csail.mit.edu;mit.edu;mit.edu;microsoft.com;mit.edu", "author_num": 7, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com", "aff_unique_abbr": "MIT;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Hypothesis Transfer Learning of Functional Linear Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33182", "id": "mGsF8Q0fGZ", "proceeding": "https://proceedings.mlr.press/v235/lin24p.html", "pdf": "https://openreview.net/pdf?id=mGsF8Q0fGZ", "openreview": "https://openreview.net/forum?id=mGsF8Q0fGZ", "author_site": "Haotian Lin, Matthew Reimherr", "tldr": "", "abstract": "We study the transfer learning (TL) for the functional linear regression (FLR) under the Reproducing Kernel Hilbert Space (RKHS) framework, observing the TL techniques in 
existing high-dimensional linear regression to be incompatible with the truncation-based FLR methods, as functional data are intrinsically infinite-dimensional and generated by smooth underlying processes. We measure the similarity across tasks using RKHS distance, allowing the type of information being transferred to be tied to the properties of the imposed RKHS. Building on the hypothesis offset transfer learning paradigm, two algorithms are proposed: one conducts the transfer when positive sources are known, while the other leverages aggregation techniques to achieve robust transfer without prior information about the sources. We establish asymptotic lower bounds for this learning problem and show that the proposed algorithms enjoy a matching upper bound. These analyses provide statistical insights into factors that contribute to the dynamics of the transfer. We also extend the results to functional generalized linear models. The effectiveness of the proposed algorithms is demonstrated via extensive synthetic data as well as real-world data applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haotian Lin;Matthew Reimherr", "authorids": "~Haotian_Lin1;~Matthew_Reimherr1", "gender": "M;", "homepage": "https://haotianlin.github.io/;https://www.personal.psu.edu/~mlr36", "dblp": "177/6974-2;187/4282", "google_scholar": "DtHTtSwAAAAJ;UZcbx9gAAAAJ", "orcid": ";0000-0002-7149-0591", "linkedin": ";", "or_profile": "~Haotian_Lin1;~Matthew_Reimherr1", "aff": "Pennsylvania State University;Amazon", "aff_domain": "psu.edu;amazon.com", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\nlin2024on,\ntitle={On Hypothesis Transfer Learning of Functional Linear Models},\nauthor={Haotian Lin and Matthew Reimherr},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mGsF8Q0fGZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1078991, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1310880925888288359&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "psu.edu;amazon.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.psu.edu;https://www.amazon.com", "aff_unique_abbr": "PSU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Rethinking Optimization and Architecture for Tiny Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33181", "id": "mHIEOZtDDF", "proceeding": "https://proceedings.mlr.press/v235/tang24c.html", "pdf": "https://openreview.net/pdf?id=mHIEOZtDDF", "openreview": "https://openreview.net/forum?id=mHIEOZtDDF", "author_site": "Yehui Tang, Kai Han, Fangcheng Liu, Yunsheng Ni, Yuchuan Tian, Zheyuan Bai, Yi-Qi Hu, Sichao Liu, Shang-Ling Jui, Yunhe Wang", "tldr": "", "abstract": "The power of large language models (LLMs) has been demonstrated through numerous data and computing resources. However, the application of language models on mobile devices faces huge challenges in computation and memory costs; that is, tiny language models with high performance are urgently required.
Because the training process is highly complex, many details of optimizing language models are seldom studied carefully. In this study, based on a tiny language model with 1B parameters, we carefully design a series of empirical studies to analyze the effect of each component. Three perspectives are mainly discussed, i.e., neural architecture, parameter initialization, and optimization strategy. Several design formulas are empirically shown to be especially effective for tiny language models, including tokenizer compression, architecture tweaking, parameter inheritance, and multiple-round training. Then we train PanGu-$\\pi$-1B Pro and PanGu-$\\pi$-1.5B Pro on 1.6T multilingual corpora, following the established formulas. Experimental results demonstrate that the improved optimization and architecture yield a notable average improvement of 8.87 on benchmark evaluation sets for PanGu-$\\pi$-1B Pro. Besides, PanGu-$\\pi$-1.5B Pro surpasses a range of SOTA models with larger model sizes, validating its superior performance. The code is available at https://github.com/YuchuanTian/RethinkTinyLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yehui Tang;Kai Han;Fangcheng Liu;Yunsheng Ni;Yuchuan Tian;Zheyuan Bai;Yi-Qi Hu;Sichao Liu;SHANGLING JUI;Yunhe Wang", "authorids": "~Yehui_Tang1;~Kai_Han2;~Fangcheng_Liu1;~Yunsheng_Ni1;~Yuchuan_Tian1;~Zheyuan_Bai2;~Yi-Qi_Hu1;owen.liusichao@huawei.com;~SHANGLING_JUI1;~Yunhe_Wang1", "gender": "M;M;M;Not Specified;M;M;M;;M;M", "homepage": ";https://iamhankai.github.io;https://scholar.google.com/;https://niyunsheng.github.io/;;;http://www.lamda.nju.edu.cn/huyq/;;;https://www.wangyunhe.site/", "dblp": "244/9659;51/4757-2;;;193/6675;;178/8662;;;63/8217-1", "google_scholar": "TkSZQ6gAAAAJ;vThoBVcAAAAJ;https://scholar.google.com/;CSpaTt8AAAAJ;;;https://scholar.google.com/citations?hl=en;;;https://scholar.google.com.sg/citations?user=isizOkYAAAAJ", "orcid": ";0000-0002-9761-2702;;;;;;;0000-0002-1047-4264;0000-0002-0142-509X", "linkedin": ";;;;;zheyuanbai/;;;;", "or_profile": "~Yehui_Tang1;~Kai_Han2;~Fangcheng_Liu1;~Yunsheng_Ni1;~Yuchuan_Tian1;~Zheyuan_Bai2;~Yi-Qi_Hu1;owen.liusichao@huawei.com;~SHANGLING_JUI1;~Yunhe_Wang1", "aff": "Huawei Technologies Ltd.;Huawei Noah's Ark Lab;;;Peking University;Huawei Technologies Ltd.;;;Huawei Technologies Ltd.;Huawei Noah's Ark Lab", "aff_domain": "huawei.com;huawei.com;;;pku.edu.cn;huawei.com;;;huawei.com;huawei.com", "position": "Researcher;Principal Researcher;;;PhD student;Researcher;;;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\ntang2024rethinking,\ntitle={Rethinking Optimization and Architecture for Tiny Language Models},\nauthor={Yehui Tang and Kai Han and Fangcheng Liu and Yunsheng Ni and Yuchuan Tian and Zheyuan Bai and Yi-Qi Hu and Sichao Liu and SHANGLING JUI and Yunhe Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mHIEOZtDDF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 649225, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13004982430555434021&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "huawei.com;huawei.com;;;pku.edu.cn;huawei.com;;;huawei.com;huawei.com", "author_num": 10, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Huawei;Peking University", "aff_unique_dep": "Huawei Technologies;",
"aff_unique_url": "https://www.huawei.com;http://www.pku.edu.cn", "aff_unique_abbr": "Huawei;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Challenges in Training PINNs: A Loss Landscape Perspective", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33180", "id": "mJGiFr8jLa", "proceeding": "https://proceedings.mlr.press/v235/rathore24a.html", "pdf": "https://openreview.net/pdf?id=mJGiFr8jLa", "openreview": "https://openreview.net/forum?id=mJGiFr8jLa", "author_site": "Pratik Rathore, Weimu Lei, Zachary Frangella, Lu Lu, Madeleine Udell", "tldr": "", "abstract": "This paper explores challenges in training Physics-Informed Neural Networks (PINNs), emphasizing the role of the loss landscape in the training process. We examine difficulties in minimizing the PINN loss function, particularly due to ill-conditioning caused by differential operators in the residual term. We compare gradient-based optimizers Adam, L-BFGS, and their combination Adam+L-BFGS, showing the superiority of Adam+L-BFGS, and introduce a novel second-order optimizer, NysNewton-CG (NNCG), which significantly improves PINN performance. Theoretically, our work elucidates the connection between ill-conditioned differential operators and ill-conditioning in the PINN loss and shows the benefits of combining first- and second-order optimization methods. Our work presents valuable insights and more powerful optimization strategies for training PINNs, which could improve the utility of PINNs for solving difficult partial differential equations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pratik Rathore;Weimu Lei;Zachary Frangella;Lu Lu;Madeleine Udell", "authorids": "~Pratik_Rathore1;~Weimu_Lei1;~Zachary_Frangella1;~Lu_Lu1;~Madeleine_Udell1", "gender": "M;;;M;F", "homepage": "https://pratikrathore8.github.io;https://profiles.stanford.edu/weimu-lei;https://github.com/zjf4;https://lu.seas.upenn.edu;https://people.orie.cornell.edu/mru8", "dblp": "334/1745;367/6961;298/0473;01/2086-10;153/2166", "google_scholar": "9WOwy2sAAAAJ;;;wD_wsWUAAAAJ;tZ9pEDMAAAAJ", "orcid": ";;;0000-0002-5476-5768;0000-0002-3985-915X", "linkedin": "pratikrathore/;;;;", "or_profile": "~Pratik_Rathore1;~Weimu_Lei1;~Zachary_Frangella1;~Lu_Lu1;~Madeleine_Udell1", "aff": "Stanford University;Stanford University;Stanford University;Yale University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;yale.edu;stanford.edu", "position": "PhD student;MS student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nrathore2024challenges,\ntitle={Challenges in Training {PINN}s: A Loss Landscape Perspective},\nauthor={Pratik Rathore and Weimu Lei and Zachary Frangella and Lu Lu and Madeleine Udell},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mJGiFr8jLa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1580711, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13498038755973970618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "stanford.edu;stanford.edu;stanford.edu;yale.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Stanford University;Yale University", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.stanford.edu;https://www.yale.edu", "aff_unique_abbr": "Stanford;Yale", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "What Improves the Generalization of Graph Transformers? A Theoretical Dive into the Self-attention and Positional Encoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33179", "id": "mJhXlsZzzE", "proceeding": "https://proceedings.mlr.press/v235/li24bo.html", "pdf": "https://openreview.net/pdf?id=mJhXlsZzzE", "openreview": "https://openreview.net/forum?id=mJhXlsZzzE", "author_site": "Hongkang Li, Meng Wang, Tengfei Ma, Sijia Liu, Zaixi Zhang, Pin-Yu Chen", "tldr": "", "abstract": "Graph Transformers, which incorporate self-attention and positional encoding, have recently emerged as a powerful architecture for various graph learning tasks. Despite their impressive performance, the complex non-convex interactions across layers and the recursive graph structure have made it challenging to establish a theoretical foundation for learning and generalization. This study introduces the first theoretical investigation of a shallow Graph Transformer for semi-supervised node classification, comprising a self-attention layer with relative positional encoding and a two-layer perception. Focusing on a graph data model with discriminative nodes that determine node labels and non-discriminative nodes that are class-irrelevant, we characterize the sample complexity required to achieve a desirable generalization error by training with stochastic gradient descent (SGD). This paper provides the quantitative characterization of the sample complexity and number of iterations for convergence dependent on the fraction of discriminative nodes, the dominant patterns, and the initial model errors. Furthermore, we demonstrate that self-attention and positional encoding enhance generalization by making the attention map sparse and promoting the core neighborhood during training, which explains the superior feature representation of Graph Transformers. 
Our theoretical results are supported by empirical experiments on synthetic and real-world benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongkang Li;Meng Wang;Tengfei Ma;Sijia Liu;ZAIXI ZHANG;Pin-Yu Chen", "authorids": "~Hongkang_Li1;~Meng_Wang4;~Tengfei_Ma1;~Sijia_Liu1;~ZAIXI_ZHANG2;~Pin-Yu_Chen1", "gender": ";F;M;M;M;M", "homepage": "https://lohek330.github.io/lihongkang.github.io/;https://www.ecse.rpi.edu/~wang/index.html;https://sites.google.com/site/matf0123/;https://lsjxjtu.github.io/;http://home.ustc.edu.cn/~zaixi/;http://www.pinyuchen.com", "dblp": "318/8643;93/6765-3;94/9023-1;128/6972-1;267/9295.html;39/8969", "google_scholar": "https://scholar.google.com.hk/citations?user=DVlDPjMAAAAJ;;9OvNakkAAAAJ;C7dO_UgAAAAJ;https://scholar.google.com/citations?hl=zh-CN;jxwlCUUAAAAJ", "orcid": ";;0000-0002-1086-529X;;;0000-0003-1039-8369", "linkedin": "hongkang-li-b7a341173/;;;;;pin-yu-chen-940062a2", "or_profile": "~Hongkang_Li1;~Meng_Wang4;~Tengfei_Ma1;~Sijia_Liu1;~ZAIXI_ZHANG2;~Pin-Yu_Chen1", "aff": "Rensselaer Polytechnic Institute;Rensselaer Polytechnic Institute;State University of New York at Stony Brook;Michigan State University;University of Science and Technology of China;International Business Machines", "aff_domain": "rpi.edu;rpi.edu;stonybrook.edu;msu.edu;ustc.edu.cn;ibm.com", "position": "PhD student;Associate Professor;Assistant Professor;Assistant Professor;PhD student;Principal Researcher", "bibtex": "@inproceedings{\nli2024what,\ntitle={What Improves the Generalization of Graph Transformers? A Theoretical Dive into the Self-attention and Positional Encoding},\nauthor={Hongkang Li and Meng Wang and Tengfei Ma and Sijia Liu and ZAIXI ZHANG and Pin-Yu Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mJhXlsZzzE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2505015, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5298125028387051408&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "rpi.edu;rpi.edu;stonybrook.edu;msu.edu;ustc.edu.cn;ibm.com", "author_num": 6, "aff_unique_index": "0;0;1;2;3;4", "aff_unique_norm": "Rensselaer Polytechnic Institute;State University of New York at Stony Brook;Michigan State University;University of Science and Technology of China;International Business Machines Corporation", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.rpi.edu;https://www.stonybrook.edu;https://www.msu.edu;http://www.ustc.edu.cn;https://www.ibm.com", "aff_unique_abbr": "RPI;SUNY Stony Brook;MSU;USTC;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stony Brook", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Studying K-FAC Heuristics by Viewing Adam through a Second-Order Lens", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33178", "id": "mK6FB9xQ7v", "proceeding": "https://proceedings.mlr.press/v235/clarke24a.html", "pdf": "https://openreview.net/pdf?id=mK6FB9xQ7v", "openreview": "https://openreview.net/forum?id=mK6FB9xQ7v", "author_site": "Ross Clarke, Jose Miguel Hernandez-Lobato", "tldr": "", "abstract": "Research into optimisation for deep learning is characterised by a tension between the computational efficiency of first-order, gradient-based methods (such as SGD and Adam) and the theoretical efficiency of 
second-order, curvature-based methods (such as quasi-Newton methods and K-FAC). Noting that second-order methods often only function effectively with the addition of stabilising heuristics (such as Levenberg-Marquardt damping), we ask how much these (as opposed to the second-order curvature model) contribute to second-order algorithms' performance. We thus study _AdamQLR_: an optimiser combining damping and learning rate selection techniques from K-FAC (Martens & Grosse, 2015) with the update directions proposed by Adam, inspired by considering Adam through a second-order lens. We evaluate AdamQLR on a range of regression and classification tasks at various scales and hyperparameter tuning methodologies, concluding K-FAC's adaptive heuristics are of variable standalone general effectiveness, and finding an _untuned_ AdamQLR setting can achieve comparable performance vs runtime to _tuned_ benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ross M Clarke;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "~Ross_M_Clarke1;~Jos\u00e9_Miguel_Hern\u00e1ndez-Lobato1", "gender": "M;", "homepage": ";http://jmhl.org", "dblp": "304/7918;40/6058", "google_scholar": "1joGBpgAAAAJ;BEBccCQAAAAJ", "orcid": "0000-0001-9884-046X;0000-0001-7610-949X", "linkedin": ";", "or_profile": "~Ross_M_Clarke1;~Jose_Miguel_Hernandez_Lobato1", "aff": "University of Cambridge;University of Cambridge", "aff_domain": "cam.ac.uk;cam.ac.uk", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nclarke2024studying,\ntitle={Studying K-{FAC} Heuristics by Viewing Adam through a Second-Order Lens},\nauthor={Ross M Clarke and Jos{\\'e} Miguel Hern{\\'a}ndez-Lobato},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mK6FB9xQ7v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9140020, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0xp7v7clXE8J:scholar.google.com/&scioq=Studying+K-FAC+Heuristics+by+Viewing+Adam+through+a+Second-Order+Lens&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "cam.ac.uk;cam.ac.uk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Rethinking Data Shapley for Data Selection Tasks: Misleads and Merits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33177", "id": "mKYBMf1hHG", "proceeding": "https://proceedings.mlr.press/v235/wang24cg.html", "pdf": "https://openreview.net/pdf?id=mKYBMf1hHG", "openreview": "https://openreview.net/forum?id=mKYBMf1hHG", "author_site": "Jiachen Wang, Tianji Yang, James Zou, Yongchan Kwon, Ruoxi Jia", "tldr": "", "abstract": "Data Shapley provides a principled approach to data valuation and plays a crucial role in data-centric machine learning (ML) research. Data selection is considered a standard application of Data Shapley. However, its data selection performance has shown to be inconsistent across settings in the literature. This study aims to deepen our understanding of this phenomenon. 
We introduce a hypothesis testing framework and show that Data Shapley's performance can be no better than random selection without specific constraints on utility functions. We identify a class of utility functions, monotonically transformed modular functions, within which Data Shapley optimally selects data. Based on this insight, we propose a heuristic for predicting Data Shapley\u2019s effectiveness in data selection tasks. Our experiments corroborate these findings, adding new insights into when Data Shapley may or may not succeed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiachen T. Wang;Tianji Yang;James Zou;Yongchan Kwon;Ruoxi Jia", "authorids": "~Jiachen_T._Wang1;~Tianji_Yang1;~James_Zou1;~Yongchan_Kwon1;~Ruoxi_Jia1", "gender": ";Not Specified;;;", "homepage": ";https://y-jiji.github.io/blog/;;;https://ruoxijia.info/", "dblp": ";;;;147/5355-1", "google_scholar": ";;23ZXZvEAAAAJ;;JCrug-YAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Jiachen_T._Wang1;~Tianji_Yang1;~James_Zou1;~Yongchan_Kwon1;~Ruoxi_Jia1", "aff": ";East China Normal University;Stanford University;;Virginia Tech", "aff_domain": ";ecnu.edu.cn;stanford.edu;;vt.edu", "position": ";Undergrad student;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nwang2024rethinking,\ntitle={Rethinking Data Shapley for Data Selection Tasks: Misleads and Merits},\nauthor={Jiachen T. Wang and Tianji Yang and James Zou and Yongchan Kwon and Ruoxi Jia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mKYBMf1hHG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6875793, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5477685327645014805&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": ";ecnu.edu.cn;stanford.edu;;vt.edu", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "East China Normal University;Stanford University;Virginia Tech", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.stanford.edu;https://www.vt.edu", "aff_unique_abbr": "ECNU;Stanford;VT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Overcoming Data and Model heterogeneities in Decentralized Federated Learning via Synthetic Anchors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33176", "id": "mNzkumTSVL", "proceeding": "https://proceedings.mlr.press/v235/huang24v.html", "pdf": "https://openreview.net/pdf?id=mNzkumTSVL", "openreview": "https://openreview.net/forum?id=mNzkumTSVL", "author_site": "Chun-Yin Huang, Kartik Srinivas, Xin Zhang, Xiaoxiao Li", "tldr": "", "abstract": "Conventional Federated Learning (FL) involves collaborative training of a global model while maintaining user data privacy. One of its branches, decentralized FL, is a serverless network that allows clients to own and optimize different local models separately, which results in saving management and communication resources. Despite the promising advancements in decentralized FL, it may reduce model generalizability due to lacking a global model. 
In this scenario, managing data and model heterogeneity among clients becomes a crucial problem, which poses a unique challenge that must be overcome: *How can every client's local model learn generalizable representations in a decentralized manner?* To address this challenge, we propose a novel **De**centralized FL technique by introducing **S**ynthetic **A**nchors, dubbed DeSA. Based on the theory of domain adaptation and Knowledge Distillation (KD), we theoretically and empirically show that synthesizing global anchors based on the raw data distribution facilitates mutual knowledge transfer. We further design two effective regularization terms for local training: *1) REG loss* that regularizes the distribution of the client's latent embedding with the anchors, and *2) KD loss* that enables clients to learn from others. Through extensive experiments on diverse client data distributions, we showcase the effectiveness of DeSA in enhancing both inter- and intra-domain accuracy of each client. The implementation of DeSA can be found at: https://github.com/ubc-tea/DESA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chun-Yin Huang;Kartik Srinivas;Xin Zhang;Xiaoxiao Li", "authorids": "~Chun-Yin_Huang1;~Kartik_Srinivas1;~Xin_Zhang16;~Xiaoxiao_Li1", "gender": "M;M;M;Unspecified", "homepage": ";https://kartiksrinivas007.github.io/;https://xinzhang-nac.github.io/;https://xxlya.github.io/", "dblp": ";;76/1584-54.html;71/8042", "google_scholar": "moi11dgAAAAJ;https://scholar.google.co.in/citations?user=TxnwVpgAAAAJ;9u5Pa0gAAAAJ;sdENOQ4AAAAJ", "orcid": ";;0000-0002-0784-2038;", "linkedin": ";;;", "or_profile": "~Chun-Yin_Huang1;~Kartik_Srinivas1;~Xin_Zhang16;~Xiaoxiao_Li1", "aff": "University of British Columbia;Indian Institute of Technology, Hyderabad;Meta Facebook;University of British Columbia", "aff_domain": "ubc.ca;cse.iith.ac.in;fb.com;ece.ubc.ca", "position": "PhD student;Undergrad student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024overcoming,\ntitle={Overcoming Data and Model heterogeneities in Decentralized Federated Learning via Synthetic Anchors},\nauthor={Chun-Yin Huang and Kartik Srinivas and Xin Zhang and Xiaoxiao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mNzkumTSVL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4105011, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16641722536636542082&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ubc.ca;cse.iith.ac.in;fb.com;ece.ubc.ca", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of British Columbia;Indian Institute of Technology Hyderabad;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.ubc.ca;https://www.iith.ac.in;https://meta.com", "aff_unique_abbr": "UBC;IIT Hyderabad;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hyderabad", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Canada;India;United States" }, { "title": "PruNeRF: Segment-Centric Dataset Pruning via 3D Spatial Consistency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33175", "id": "mU7FfQT6VE", "proceeding": "https://proceedings.mlr.press/v235/jung24b.html", "pdf": "https://openreview.net/pdf?id=mU7FfQT6VE", "openreview": "https://openreview.net/forum?id=mU7FfQT6VE",
"author_site": "Yeonsung Jung, Heecheol Yun, Joonhyung Park, Jin-Hwa Kim, Eunho Yang", "tldr": "", "abstract": "Neural Radiance Fields (NeRF) have shown remarkable performance in learning 3D scenes. However, NeRF exhibits vulnerability when confronted with distractors in the training images -- unexpected objects are present only within specific views, such as moving entities like pedestrians or birds. Excluding distractors during dataset construction is a straightforward solution, but without prior knowledge of their types and quantities, it becomes prohibitively expensive. In this paper, we propose PruNeRF, a segment-centric dataset pruning framework via 3D spatial consistency, that effectively identifies and prunes the distractors. We first examine existing metrics for measuring pixel-wise distraction and introduce Influence Functions for more accurate measurements. Then, we assess 3D spatial consistency using a depth-based reprojection technique to obtain 3D-aware distraction. Furthermore, we incorporate segmentation for pixel-to-segment refinement, enabling more precise identification. Our experiments on benchmark datasets demonstrate that PruNeRF consistently outperforms state-of-the-art methods in robustness against distractors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yeonsung Jung;Heecheol Yun;Joonhyung Park;Jin-Hwa Kim;Eunho Yang", "authorids": "~Yeonsung_Jung1;~Heecheol_Yun1;~Joonhyung_Park1;~Jin-Hwa_Kim1;~Eunho_Yang1", "gender": ";M;M;Unspecified;M", "homepage": "https://yeonsungjung.github.io/;;;http://wityworks.com;https://sites.google.com/site/hleehome2/", "dblp": "264/2809;379/5127;306/1374;48/258;96/2621", "google_scholar": "https://scholar.google.com/citations?hl=ko;;https://scholar.google.com/citations?hl=ko;https://scholar.google.co.kr/citations?user=3f2wPekAAAAJ;", "orcid": ";;;0000-0002-0423-0415;", "linkedin": "yeonsung-jung-a50015213/;yoon6503-352b76229/;joonhyung-park-495527145/;;", "or_profile": "~Yeonsung_Jung1;~Heecheol_Yun1;~Joonhyung_Park1;~Jin-Hwa_Kim1;~Eunho_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;NAVER;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;navercorp.com;kaist.ac.kr", "position": "PhD student;PhD student;PhD student;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\njung2024prunerf,\ntitle={PruNe{RF}: Segment-Centric Dataset Pruning via 3D Spatial Consistency},\nauthor={Yeonsung Jung and Heecheol Yun and Joonhyung Park and Jin-Hwa Kim and Eunho Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mU7FfQT6VE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9842390, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fhgWaxBOph4J:scholar.google.com/&scioq=PruNeRF:+Segment-Centric+Dataset+Pruning+via+3D+Spatial+Consistency&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;navercorp.com;kaist.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;NAVER Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.naver.com", "aff_unique_abbr": "KAIST;NAVER", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "WebLINX: Real-World Website Navigation with Multi-Turn Dialogue", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33174", "id": "mUSPhG4uDW", "proceeding": "https://proceedings.mlr.press/v235/lu24e.html", "pdf": "https://openreview.net/pdf?id=mUSPhG4uDW", "openreview": "https://openreview.net/forum?id=mUSPhG4uDW", "author_site": "Xing Han L\u00f9, Zden\u011bk Kasner, Siva Reddy", "tldr": "", "abstract": "We propose the problem of conversational web navigation, where a digital agent controls a web browser and follows user instructions to solve real-world tasks in a multi-turn dialogue fashion. To support this problem, we introduce WEBLINX - a large-scale benchmark of 100K interactions across 2300 expert demonstrations of conversational web navigation. Our benchmark covers a broad range of patterns on over 150 real-world websites and can be used to train and evaluate agents in diverse scenarios. Due to the magnitude of information present, Large Language Models (LLMs) cannot process entire web pages in real-time. To solve this bottleneck, we design a retrieval-inspired model that efficiently prunes HTML pages by ranking relevant elements. We use the selected elements, along with screenshots and action history, to assess a variety of models for their ability to replicate human behavior when navigating the web. Our experiments span from small text-only to proprietary multimodal LLMs. We find that smaller finetuned decoders surpass the best zero-shot LLMs (including GPT-4V), but also larger finetuned multimodal models which were explicitly pretrained on screenshots. However, all finetuned models struggle to generalize to unseen websites. 
Our findings highlight the need for large multimodal models that can generalize to novel settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xing Han Lu;Zden\u011bk Kasner;Siva Reddy", "authorids": "~Xing_Han_Lu2;~Zden\u011bk_Kasner1;~Siva_Reddy1", "gender": "M;M;M", "homepage": "https://xinghanlu.com/;https://kasnerz.github.io;http://sivareddy.in", "dblp": "223/2756;262/3799;64/8153", "google_scholar": "ekoOv3YAAAAJ;https://scholar.google.cz/citations?user=6NnuRB8AAAAJ;", "orcid": "0000-0001-9027-8425;0000-0002-5753-5538;", "linkedin": "xing-han-lu/;zdenek-kasner;", "or_profile": "~Xing_Han_Lu2;~Zden\u011bk_Kasner1;~Siva_Reddy1", "aff": "Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Charles University, Prague;Mila, McGill University", "aff_domain": "mila.umontreal.ca;cuni.cz;mila.quebec", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlu2024weblinx,\ntitle={Web{LINX}: Real-World Website Navigation with Multi-Turn Dialogue},\nauthor={Xing Han Lu and Zden{\\v{e}}k Kasner and Siva Reddy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mUSPhG4uDW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5908748, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4577090507004329194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mila.umontreal.ca;cuni.cz;mila.quebec", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;Charles University;McGill University", "aff_unique_dep": "Montreal Institute for Learning Algorithms;;Mila", "aff_unique_url": "https://www.mila.quebec;https://www.cuni.cz;https://www.mcgill.ca", "aff_unique_abbr": "MILA;Charles University;McGill", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Montreal;Prague", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;Czech Republic" }, { "title": "Privacy-Preserving Instructions for Aligning Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33173", "id": "mUT1biz09t", "proceeding": "https://proceedings.mlr.press/v235/yu24e.html", "pdf": "https://openreview.net/pdf?id=mUT1biz09t", "openreview": "https://openreview.net/forum?id=mUT1biz09t", "author_site": "Da Yu, Peter Kairouz, Sewoong Oh, Zheng Xu", "tldr": "", "abstract": "Service providers of large language model (LLM) applications collect user instructions in the wild and use them in further aligning LLMs with users' intentions. These instructions, which potentially contain sensitive information, are annotated by human workers in the process. This poses a new privacy risk not addressed by the typical private optimization. To this end, we propose using synthetic instructions to replace real instructions in data annotation and model fine-tuning. Formal differential privacy is guaranteed by generating those synthetic instructions using privately fine-tuned generators. Crucial in achieving the desired utility is our novel filtering algorithm that matches the distribution of the synthetic instructions to that of the real ones. 
In both supervised fine-tuning and reinforcement learning from human feedback, our extensive experiments demonstrate the high utility of the final set of synthetic instructions by showing comparable results to real instructions. In supervised fine-tuning, models trained with private synthetic instructions outperform leading open-source models such as Vicuna.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Da Yu;Peter Kairouz;Sewoong Oh;Zheng Xu", "authorids": "~Da_Yu1;~Peter_Kairouz1;~Sewoong_Oh3;~Zheng_Xu2", "gender": "M;M;;M", "homepage": ";https://kairouzp.github.io/;https://sites.google.com/site/xuzhustc/;https://homes.cs.washington.edu/~sewoong/", "dblp": "48/8545;129/1254;83/2535-2;80/4366", "google_scholar": "FcRGdiwAAAAJ;m8NUgw0AAAAJ;TfWlMTYAAAAJ;55TAOdgAAAAJ", "orcid": ";;0009-0003-6747-3953;", "linkedin": ";kayrouzp;zheng-xu-0a125236/;", "or_profile": "~Da_Yu1;~Peter_Kairouz1;~Zheng_Xu2;~Sewoong_Oh1", "aff": "SUN YAT-SEN UNIVERSITY;Google;Google;University of Washington", "aff_domain": "sysu.edu.cn;google.com;google.com;uw.edu", "position": "PhD student;Research Scientist;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nyu2024privacypreserving,\ntitle={Privacy-Preserving Instructions for Aligning Large Language Models},\nauthor={Da Yu and Peter Kairouz and Sewoong Oh and Zheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mUT1biz09t}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1332443, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16003202111382260723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "sysu.edu.cn;google.com;google.com;uw.edu", "author_num": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Sun Yat-sen University;Google;University of Washington", "aff_unique_dep": ";Google;", "aff_unique_url": "http://www.sysu.edu.cn;https://www.google.com;https://www.washington.edu", "aff_unique_abbr": "SYSU;Google;UW", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Adaptive Accompaniment with ReaLchords", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33172", "id": "mUVydzrkgz", "proceeding": "https://proceedings.mlr.press/v235/wu24c.html", "pdf": "https://openreview.net/pdf?id=mUVydzrkgz", "openreview": "https://openreview.net/forum?id=mUVydzrkgz", "author_site": "Yusong Wu, Tim Cooijmans, Kyle Kastner, Adam Roberts, Ian Simon, Alexander Scarlatos, Chris Donahue, Cassie Tarakajian, Shayegan Omidshafiei, Aaron Courville, Pablo Samuel Castro, Natasha Jaques, Cheng-Zhi Anna Huang", "tldr": "", "abstract": "Jamming requires coordination, anticipation, and collaborative creativity between musicians. Current generative models of music produce expressive output but are not able to generate in an online manner, meaning simultaneously with other musicians (human or otherwise). We propose ReaLchords, an online generative model for improvising chord accompaniment to user melody. We start with an online model pretrained by maximum likelihood, and use reinforcement learning to finetune the model for online use. 
The finetuning objective leverages two components: a novel reward model that provides feedback on both harmonic and temporal coherency between melody and chord, and a divergence term that implements a novel type of distillation from a teacher model that can see the future melody. Through quantitative experiments and listening tests, we demonstrate that the resulting model adapts well to unfamiliar input and produces fitting accompaniment. ReaLchords opens the door to live jamming, as well as simultaneous co-creation in other modalities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yusong Wu;Tim Cooijmans;Kyle Kastner;Adam Roberts;Ian Simon;Alexander Scarlatos;Chris Donahue;Cassie Tarakajian;Shayegan Omidshafiei;Aaron Courville;Pablo Samuel Castro;Natasha Jaques;Cheng-Zhi Anna Huang", "authorids": "~Yusong_Wu1;~Tim_Cooijmans1;~Kyle_Kastner1;~Adam_Roberts1;~Ian_Simon1;~Alexander_Scarlatos1;~Chris_Donahue1;~Cassie_Tarakajian1;~Shayegan_Omidshafiei1;~Aaron_Courville3;~Pablo_Samuel_Castro1;~Natasha_Jaques1;~Cheng-Zhi_Anna_Huang1", "gender": "M;M;Unspecified;M;M;M;M;;;;M;F;F", "homepage": "http://lukewys.github.io/;;;;http://iansimon.org/;https://people.cs.umass.edu/~ajscarlatos/;https://chrisdonahue.com;;;;https://psc-g.github.io/;https://natashajaques.ai/;", "dblp": "255/5686;153/5756;http://dblp.uni-trier.de/pers/hd/k/Kastner:Kyle;95/6569;33/2787;275/8415;34/4405;;153/7735;56/1688;05/5455;145/7732;59/9006", "google_scholar": ";https://scholar.google.ca/citations?user=Ec6vKzwAAAAJ;https://scholar.google.ca/citations?user=0XtGoMUAAAAJ;U5UpKq8AAAAJ;pKqwl3wAAAAJ;https://scholar.google.com/citations?hl=en;MgzHAPQAAAAJ;;nm5wMNUAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;https://scholar.google.ca/citations?user=jn5r6TsAAAAJ;8iCb2TwAAAAJ;NRz_EVgAAAAJ", "orcid": ";;;;;;0009-0007-6825-6327;;;;;;", "linkedin": ";;;;;alex-scarlatos-399455113/;;;;;pablo-samuel-castro-2113641b/;natashajaques;", "or_profile": "~Yusong_Wu1;~Tim_Cooijmans1;~Kyle_Kastner1;~Adam_Roberts1;~Ian_Simon1;~Alexander_Scarlatos1;~Chris_Donahue1;~Cassie_Tarakajian1;~Shayegan_Omidshafiei1;~Aaron_Courville3;~Pablo_Samuel_Castro1;~Natasha_Jaques1;~Cheng-Zhi_Anna_Huang1", "aff": "Universit\u00e9 de Montr\u00e9al, Mila;University of Montreal;Google;Google;Google;Department of Computer Science, University of Massachusetts at Amherst;Google;;Google Research;Universit\u00e9 de Montr\u00e9al;Google;Google;Google", "aff_domain": "umontreal.ca;umontreal.ca;google.com;google.com;google.com;cs.umass.edu;google.com;;google.com; ;google.com;google.com;google.com", "position": "PhD student;PhD student;Researcher;Software Engineer;Software Engineer;PhD student;Researcher;;Research Scientist;Assistant Professor;Researcher;Senior Research Scientist;Researcher", "bibtex": "@inproceedings{\nwu2024adaptive,\ntitle={Adaptive Accompaniment with ReaLchords},\nauthor={Yusong Wu and Tim Cooijmans and Kyle Kastner and Adam Roberts and Ian Simon and Alexander Scarlatos and Chris Donahue and Cassie Tarakajian and Shayegan Omidshafiei and Aaron Courville and Pablo Samuel Castro and Natasha Jaques and Cheng-Zhi Anna Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mUVydzrkgz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1577929, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13743962641768274509&as_sdt=2005&sciodt=0,5&hl=en",
"gs_version_total": 5, "email": "umontreal.ca;umontreal.ca;google.com;google.com;google.com;cs.umass.edu;google.com;;google.com; ;google.com;google.com;google.com", "author_num": 13, "aff_unique_index": "0;1;2;2;2;3;2;2;0;2;2;2", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;University of Montreal;Google;University of Massachusetts Amherst", "aff_unique_dep": "Mila;;Google;Department of Computer Science", "aff_unique_url": "https://www.umontreal.ca;https://wwwumontreal.ca;https://www.google.com;https://www.umass.edu", "aff_unique_abbr": "UdeM;UM;Google;UMass Amherst", "aff_campus_unique_index": "1;1;1;2;1;1;1;1;1", "aff_campus_unique": ";Mountain View;Amherst", "aff_country_unique_index": "0;0;1;1;1;1;1;1;0;1;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Spider: A Unified Framework for Context-dependent Concept Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33171", "id": "mWV8NeU79e", "proceeding": "https://proceedings.mlr.press/v235/zhao24j.html", "pdf": "https://openreview.net/pdf?id=mWV8NeU79e", "openreview": "https://openreview.net/forum?id=mWV8NeU79e", "author_site": "Xiaoqi Zhao, Youwei Pang, Wei Ji, Baicheng Sheng, Jiaming Zuo, Lihe Zhang, Huchuan Lu", "tldr": "", "abstract": "Different from the context-independent (CI) concepts such as human, car, and airplane, context-dependent (CD) concepts require higher visual understanding ability, such as camouflaged object and medical lesion. Despite the rapid advance of many CD understanding tasks in respective branches, the isolated evolution leads to their limited cross-domain generalisation and repetitive technique innovation. Since there is a strong coupling relationship between foreground and background context in CD tasks, existing methods require to train separate models in their focused domains. This restricts their real-world CD concept understanding towards artificial general intelligence (AGI). We propose a unified model with a single set of parameters, Spider, which only needs to be trained once. With the help of the proposed concept filter driven by the image-mask group prompt, Spider is able to understand and distinguish diverse strong context-dependent concepts to accurately capture the Prompter's intention. Without bells and whistles, Spider significantly outperforms the state-of-the-art specialized models in 8 different context-dependent segmentation tasks, including 4 natural scenes (salient, camouflaged, and transparent objects and shadow) and 4 medical lesions (COVID-19, polyp, breast, and skin lesion with color colonoscopy, CT, ultrasound, and dermoscopy modalities). Besides, Spider shows obvious advantages in continuous learning. It can easily complete the training of new tasks by fine-tuning parameters less than 1% and bring a tolerable performance degradation of less than 5% for all old tasks. 
The source code will be publicly available at https://github.com/Xiaoqi-Zhao-DLUT/Spider-UniCDSeg.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoqi Zhao;Youwei Pang;Wei Ji;Baicheng Sheng;Jiaming Zuo;Lihe Zhang;Huchuan Lu", "authorids": "~Xiaoqi_Zhao1;~Youwei_Pang1;~Wei_Ji2;~Baicheng_Sheng1;~Jiaming_Zuo2;~Lihe_Zhang1;~Huchuan_Lu1", "gender": "M;M;;M;M;;M", "homepage": "https://Xiaoqi-Zhao-DLUT.github.io;https://lartpang.github.io;;https://github.com/shengbaicheng;https://jiaming-zuo.github.io/;;http://ice.dlut.edu.cn/lu/publications.html", "dblp": ";270/2129;;;;46/10700;64/6896", "google_scholar": "0EKcLI4AAAAJ;jdo9_goAAAAJ;;;;XGPdQbIAAAAJ;D3nE0agAAAAJ", "orcid": ";0000-0002-3950-0956;;;;;", "linkedin": ";;;;;;", "or_profile": "~Xiaoqi_Zhao1;~Youwei_Pang1;~Wei_Ji2;~Baicheng_Sheng1;~Jiaming_Zuo2;~Lihe_Zhang1;~Huchuan_Lu1", "aff": "Dalian University of Technology;Dalian University of Technology;;;;Dalian University of Technology;Dalian University of Technology", "aff_domain": "dlut.edu.cn;dlut.edu.cn;;;;dlut.edu.cn;dlut.edu.cn", "position": "PhD student;PhD student;;;;Full Professor;Professor", "bibtex": "@inproceedings{\nzhao2024spider,\ntitle={Spider: A Unified Framework for Context-dependent Concept Segmentation},\nauthor={Xiaoqi Zhao and Youwei Pang and Wei Ji and Baicheng Sheng and Jiaming Zuo and Lihe Zhang and Huchuan Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mWV8NeU79e}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9593876, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=626093312212410775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "dlut.edu.cn;dlut.edu.cn;;;;dlut.edu.cn;dlut.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Dalian University of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.dlut.edu.cn/", "aff_unique_abbr": "DUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Beyond the ROC Curve: Classification Trees Using Cost-Optimal Curves, with Application to Imbalanced Datasets", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33170", "id": "mXLcbRBA8v", "proceeding": "https://proceedings.mlr.press/v235/gabidolla24a.html", "pdf": "https://openreview.net/pdf?id=mXLcbRBA8v", "openreview": "https://openreview.net/forum?id=mXLcbRBA8v", "author_site": "Magzhan Gabidolla, Arman Zharmagambetov, Miguel Carreira-Perpinan", "tldr": "", "abstract": "Important applications such as fraud or spam detection or churn prediction involve binary classification problems where the datasets are imbalanced and the cost of false positives greatly differs from the cost of false negatives. We focus on classification trees, in particular oblique trees, which subsume both the traditional axis-aligned trees and logistic regression, but are more accurate than both while providing interpretable models. Rather than using ROC curves, we advocate a loss based on minimizing the false negatives subject to a maximum false positive rate, which we prove to be equivalent to minimizing a weighted 0/1 loss. This yields a curve of classifiers that provably dominates the ROC curve, but is hard to optimize due to the 0/1 loss. 
We give the first algorithm that can iteratively update the tree parameters globally so that the weighted 0/1 loss decreases monotonically. Experiments on various datasets with class imbalance or class costs show this indeed dominates ROC-based classifiers and significantly improves over previous approaches that learn trees based on weighted purity criteria or over- or undersampling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Magzhan Gabidolla;Arman Zharmagambetov;Miguel \u00c1. Carreira-Perpi\u00f1\u00e1n", "authorids": "~Magzhan_Gabidolla1;~Arman_Zharmagambetov1;~Miguel_\u00c1._Carreira-Perpi\u00f1\u00e1n2", "gender": ";M;", "homepage": ";https://arman-z.github.io/;", "dblp": ";252/5004;", "google_scholar": ";D6QocXMAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Magzhan_Gabidolla1;~Arman_Zharmagambetov1;~Miguel_\u00c1._Carreira-Perpi\u00f1\u00e1n2", "aff": ";Meta AI (FAIR);", "aff_domain": ";meta.com;", "position": ";Postdoc;", "bibtex": "@inproceedings{\ngabidolla2024beyond,\ntitle={Beyond the {ROC} Curve: Classification Trees Using Cost-Optimal Curves, with Application to Imbalanced Datasets},\nauthor={Magzhan Gabidolla and Arman Zharmagambetov and Miguel {\\'A}. Carreira-Perpi{\\~n}{\\'a}n},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mXLcbRBA8v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 786501, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=649175842782122698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "email": ";meta.com;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Meta", "aff_unique_dep": "Facebook AI Research (FAIR)", "aff_unique_url": "https://ai.facebook.com", "aff_unique_abbr": "Meta AI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Reinforcement Learning from Reachability Specifications: PAC Guarantees with Expected Conditional Distance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33169", "id": "mXUDDL4r1Q", "proceeding": "https://proceedings.mlr.press/v235/svoboda24a.html", "pdf": "https://openreview.net/pdf?id=mXUDDL4r1Q", "openreview": "https://openreview.net/forum?id=mXUDDL4r1Q", "author_site": "Jakub Svoboda, Suguman Bansal, Krishnendu Chatterjee", "tldr": "", "abstract": "Reinforcement Learning (RL) from temporal logical specifications is a fundamental problem in sequential decision making. One of the most basic and core such specifications is the reachability specification, which requires that a target set eventually be visited. Despite strong empirical results for RL from such specifications, the theoretical guarantees are bleak, including the impossibility of a Probably Approximately Correct (PAC) guarantee for reachability specifications. Given the impossibility result, in this work we consider the problem of RL from reachability specifications along with the information of expected conditional distance (ECD). We present (a) lower bound results which establish the necessity of ECD information for PAC guarantees and (b) an algorithm that establishes PAC guarantees given the ECD information.
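
To make the equivalence claimed in the "Beyond the ROC Curve" abstract above concrete: sweeping the weight on false negatives in a weighted 0/1 loss traces out a curve of classifiers, each trading false negatives against false positives. A small self-contained sketch, with threshold classifiers standing in for the paper's oblique trees (all numbers synthetic):

```python
import numpy as np

def weighted_01_loss(y_true, y_pred, lam):
    """lam * (# false negatives) + (# false positives)."""
    fn = np.sum((y_true == 1) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    return lam * fn + fp

rng = np.random.default_rng(0)
scores = rng.random(1000)                              # classifier scores in [0, 1]
y_true = (scores + 0.3 * rng.standard_normal(1000) > 0.7).astype(int)

# Each weight lam yields a different optimal threshold: larger lam tolerates
# more false positives in exchange for fewer false negatives.
for lam in (0.5, 1.0, 2.0, 5.0):
    thr, loss = min(
        ((t, weighted_01_loss(y_true, (scores >= t).astype(int), lam))
         for t in np.linspace(0.0, 1.0, 101)),
        key=lambda pair: pair[1],
    )
    print(f"lam={lam}: threshold={thr:.2f}, weighted 0/1 loss={loss}")
```
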
To the best of our knowledge, this is the first algorithm for RL from reachability specifications that learns policies without making any assumptions about the underlying environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jakub Svoboda;Suguman Bansal;Krishnendu Chatterjee", "authorids": "~Jakub_Svoboda1;~Suguman_Bansal1;~Krishnendu_Chatterjee1", "gender": ";F;M", "homepage": "https://pub.ist.ac.at/~jsvoboda/;https://suguman.github.io/;http://pub.ist.ac.at/~kchatterjee/", "dblp": ";217/4777.html;92/5602", "google_scholar": "GgwnWfEAAAAJ;https://scholar.google.fi/citations?user=bd9Rk1MAAAAJ;https://scholar.google.com.tw/citations?user=1kaW8bwAAAAJ", "orcid": "0000-0002-1419-3267;0000-0002-0405-073X;", "linkedin": ";;", "or_profile": "~Jakub_Svoboda1;~Suguman_Bansal1;~Krishnendu_Chatterjee1", "aff": "Institute of Science and Technology;Georgia Institute of Technology;Institute of Science and Technology Austria", "aff_domain": "ista.at;gatech.edu;ist.ac.at", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nsvoboda2024reinforcement,\ntitle={Reinforcement Learning from Reachability Specifications: {PAC} Guarantees with Expected Conditional Distance},\nauthor={Jakub Svoboda and Suguman Bansal and Krishnendu Chatterjee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mXUDDL4r1Q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 349085, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8AEgNmcEUMwJ:scholar.google.com/&scioq=Reinforcement+Learning+from+Reachability+Specifications:+PAC+Guarantees+with+Expected+Conditional+Distance&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "ista.at;gatech.edu;ist.ac.at", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Institute of Science and Technology;Georgia Institute of Technology;Institute of Science and Technology Austria", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.gatech.edu;https://www.ist.ac.at", "aff_unique_abbr": ";Georgia Tech;IST Austria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2", "aff_country_unique": ";United States;Austria" }, { "title": "Low-Rank Similarity Mining for Multimodal Dataset Distillation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33168", "id": "mY93trX2Qz", "proceeding": "https://proceedings.mlr.press/v235/xu24q.html", "pdf": "https://openreview.net/pdf?id=mY93trX2Qz", "openreview": "https://openreview.net/forum?id=mY93trX2Qz", "author_site": "Yue Xu, Zhilin Lin, Yusong Qiu, Cewu Lu, Yong-Lu Li", "tldr": "", "abstract": "Though dataset distillation has witnessed rapid development in recent years, the distillation of multimodal data, e.g., image-text pairs, poses unique and under-explored challenges. Unlike unimodal data, image-text contrastive learning (ITC) data lack inherent categorization and should instead place greater emphasis on modality correspondence. In this work, we propose **Lo**w-**R**ank **S**imilarity Mining (**LoRS**) for multimodal dataset distillation, that concurrently distills a ground truth similarity matrix with image-text pairs, and leverages low-rank factorization for efficiency and scalability.
The proposed approach brings significant improvements to existing algorithms, marking a notable contribution to the field of visual-language dataset distillation. We advocate adopting LoRS as a foundational synthetic data setup for image-text dataset distillation. Our code is available at https://github.com/silicx/LoRS_Distill.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Xu;Zhilin Lin;Yusong Qiu;Cewu Lu;Yong-Lu Li", "authorids": "~Yue_Xu4;~Zhilin_Lin1;~Yusong_Qiu1;~Cewu_Lu3;~Yong-Lu_Li1", "gender": "M;M;M;M;M", "homepage": "https://silicx.github.io;https://github.com/PlasmidLin;https://github.com/Saramandaaa;https://www.mvig.org/;https://dirtyharrylyl.github.io/", "dblp": ";;;;198/9345", "google_scholar": "N03Uc1oAAAAJ;;;https://scholar.google.com.tw/citations?user=QZVQEWAAAAAJ;https://scholar.google.com.hk/citations?user=UExAaVgAAAAJ", "orcid": "0000-0001-7489-7269;;0009-0008-4170-2283;;0000-0003-0478-0692", "linkedin": ";;;;%E6%B0%B8%E9%9C%B2-%E6%9D%8E-991b99139/", "or_profile": "~Yue_Xu4;~Zhilin_Lin1;~Yusong_Qiu1;~Cewu_Lu3;~Yong-Lu_Li1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu;sjtu.edu;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "PhD student;Undergrad student;MS student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024lowrank,\ntitle={Low-Rank Similarity Mining for Multimodal Dataset Distillation},\nauthor={Yue Xu and Zhilin Lin and Yusong Qiu and Cewu Lu and Yong-Lu Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mY93trX2Qz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1283838, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7005623664660652888&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "sjtu.edu;sjtu.edu;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "HumanTOMATO: Text-aligned Whole-body Motion Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33167", "id": "maVIKlGqr7", "proceeding": "https://proceedings.mlr.press/v235/lu24b.html", "pdf": "https://openreview.net/pdf?id=maVIKlGqr7", "openreview": "https://openreview.net/forum?id=maVIKlGqr7", "author_site": "Shunlin Lu, Ling-Hao Chen, Ailing Zeng, Jing Lin, Ruimao Zhang, Lei Zhang, Heung-Yeung Shum", "tldr": "", "abstract": "This work targets a novel text-driven **whole-body** motion generation task, which takes a given textual description as input and aims at generating high-quality, diverse, and coherent facial expressions, hand gestures, and body motions simultaneously. Previous works on text-driven motion generation tasks mainly have two limitations: they ignore the key role of fine-grained hand and face controlling in vivid whole-body motion generation, and lack a good alignment between text and motion.
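
A sketch of the core LoRS idea as stated in its abstract: replace the identity targets of image-text contrastive learning with a learnable ground-truth similarity matrix kept in low-rank form. The `S = I + U @ V^T` parameterization, the temperature, and the soft cross-entropy below are illustrative assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

N, D, R = 32, 64, 4                        # pairs, embedding dim, assumed low rank

# Learnable similarity target stored as low-rank factors: S = I + U @ V^T.
U = torch.randn(N, R, requires_grad=True)
V = torch.randn(N, R, requires_grad=True)

img = F.normalize(torch.randn(N, D), dim=-1)   # synthetic image embeddings
txt = F.normalize(torch.randn(N, D), dim=-1)   # synthetic text embeddings
logits = img @ txt.T / 0.07                    # ITC logits with temperature

# Row-normalized similarity target replaces the hard identity labels of
# vanilla contrastive training; U and V are distilled jointly with the data.
target = (torch.eye(N) + U @ V.T).softmax(dim=-1)
loss = -(target * logits.log_softmax(dim=-1)).sum(dim=-1).mean()
loss.backward()
```

Storing `U` and `V` instead of the full `N x N` matrix is what makes the similarity target cheap to learn and scale, which is the efficiency argument the abstract makes.
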
To address such limitations, we propose a Text-aligned whOle-body Motion generATiOn framework, named HumanTOMATO, which is, to our knowledge, the first attempt at applicable holistic motion generation in this research area. To tackle this challenging task, our solution includes two key designs: (1) a Holistic Hierarchical VQ-VAE (aka H${}^{2}$VQ) and a Hierarchical-GPT for fine-grained body and hand motion reconstruction and generation with two structured codebooks; and (2) a pre-trained text-motion-alignment model to help the generated motion align with the input textual description explicitly. Comprehensive experiments verify that our model has significant advantages in both the quality of generated motions and their alignment with text.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shunlin Lu;Ling-Hao Chen;Ailing Zeng;Jing Lin;Ruimao Zhang;Lei Zhang;Heung-Yeung Shum", "authorids": "~Shunlin_Lu1;~Ling-Hao_Chen1;~Ailing_Zeng1;~Jing_Lin3;~Ruimao_Zhang1;~Lei_Zhang23;~Heung-Yeung_Shum1", "gender": "M;;F;M;M;M;M", "homepage": "https://shunlinlu.github.io/;https://lhchen.top;https://ailingzeng.site/;https://jinglin7.github.io/;http://zhangruimao.site/#;https://www.microsoft.com/en-us/research/people/hshum/;https://www.leizhang.org/", "dblp": "333/0021;339/7448.html;226/4720;;54/10697;;z/LeiZhang", "google_scholar": ";mxvMDpMAAAAJ;Tn7fzS8AAAAJ;SvaU2GMAAAAJ;ZJwZdtgAAAAJ;;fIlGZToAAAAJ", "orcid": ";;;;;;", "linkedin": "shunlin-lu-401aa61a6/;ling-hao-chen-79b87a224/;%E7%88%B1%E7%8E%B2-%E6%9B%BE-65504112a/;;;;", "or_profile": "~Shunlin_Lu1;~Ling-Hao_Chen1;~Ailing_Zeng1;~Jing_Lin3;~Ruimao_Zhang1;~Heung-Yeung_Shum1;~Lei_Zhang1", "aff": "The Chinese University of HongKong, ShenZhen;International Digital Economy Academy;International Digital Economy Academy;Tsinghua University;The Chinese University of Hong Kong (Shenzhen);;International Digital Economy Academy", "aff_domain": "cuhk.edu.cn;idea.edu.cn;idea.edu.cn;tsinghua.edu.cn;cuhk.edu.cn;;idea.edu.cn", "position": "PhD student;Research Intern;Researcher;MS student;Assistant Professor;;Chief Scientist", "bibtex": "@inproceedings{\nlu2024humantomato,\ntitle={Human{TOMATO}: Text-aligned Whole-body Motion Generation},\nauthor={Shunlin Lu and Ling-Hao Chen and Ailing Zeng and Jing Lin and Ruimao Zhang and Lei Zhang and Heung-Yeung Shum},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=maVIKlGqr7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7835466, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5446754581777076719&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cuhk.edu.cn;idea.edu.cn;idea.edu.cn;tsinghua.edu.cn;cuhk.edu.cn;;idea.edu.cn", "author_num": 7, "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "Chinese University of Hong Kong;International Digital Economy Academy;Tsinghua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.cn;;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CUHK;;THU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China;" }, { "title": "Distributionally Robust Data Valuation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33166", "id": "mbBehLOAqR", "proceeding": "https://proceedings.mlr.press/v235/lin24t.html", "pdf":
"https://openreview.net/pdf?id=mbBehLOAqR", "openreview": "https://openreview.net/forum?id=mbBehLOAqR", "author_site": "Xiaoqiang Lin, Xinyi Xu, Zhaoxuan Wu, See-Kiong Ng, Bryan Kian Hsiang Low", "tldr": "", "abstract": "Data valuation quantifies the contribution of each data point to the performance of a machine learning model. Existing works typically define the value of data by its improvement of the validation performance of the trained model. However, this approach can be impractical to apply in collaborative machine learning and data marketplace since it is difficult for the parties/buyers to agree on a common validation dataset or determine the exact validation distribution *a priori*. To address this, we propose a *distributionally robust data valuation* approach to perform data valuation without known/fixed validation distributions. Our approach defines the value of data by its improvement of the distributionally robust generalization error (DRGE), thus providing a worst-case performance guarantee *without* a known/fixed validation distribution. However, since computing DRGE directly is infeasible, we propose using *model deviation* as a proxy for the marginal improvement of DRGE (for kernel regression and neural networks) to compute data values. Furthermore, we identify a notion of uniqueness where low uniqueness characterizes low-value data. We empirically demonstrate that our approach outperforms existing data valuation approaches in data selection and data removal tasks on real-world datasets (e.g., housing price prediction, diabetes hospitalization prediction).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoqiang Lin;Xinyi Xu;Zhaoxuan Wu;See-Kiong Ng;Bryan Kian Hsiang Low", "authorids": "~Xiaoqiang_Lin1;~Xinyi_Xu4;~Zhaoxuan_Wu1;~See-Kiong_Ng1;~Bryan_Kian_Hsiang_Low1", "gender": "M;M;M;M;M", "homepage": "https://xqlin98.github.io/;https://xinyi-xu.com;https://zhaoxuanwu.github.io/;https://www.comp.nus.edu.sg/~ngsk/;http://www.comp.nus.edu.sg/~lowkh", "dblp": "269/4573;;298/5083;00/5480;97/4877", "google_scholar": "nqKwA60AAAAJ;2762GgsAAAAJ;Th_mPm8AAAAJ;https://scholar.google.com.tw/citations?user=_wsommYAAAAJ;https://scholar.google.com.tw/citations?user=2P-Q09UAAAAJ", "orcid": ";0000-0002-8744-0695;0009-0002-5659-6387;0000-0001-6565-7511;", "linkedin": ";xinyi-xu-a93222133/;zhaoxuanwu/;seekiong/?originalSubdomain=sg;", "or_profile": "~Xiaoqiang_Lin1;~Xinyi_Xu4;~Zhaoxuan_Wu1;~See-Kiong_Ng1;~Bryan_Kian_Hsiang_Low1", "aff": "National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu.sg;u.nus.edu;nus.edu.sg;nus.edu.sg", "position": "PhD student;PhD student;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nlin2024distributionally,\ntitle={Distributionally Robust Data Valuation},\nauthor={Xiaoqiang Lin and Xinyi Xu and Zhaoxuan Wu and See-Kiong Ng and Bryan Kian Hsiang Low},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mbBehLOAqR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4353357, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11731448885211738219&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 4, "email": "u.nus.edu;nus.edu.sg;u.nus.edu;nus.edu.sg;nus.edu.sg", "author_num": 5, 
"aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "A2Q+: Improving Accumulator-Aware Weight Quantization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33165", "id": "mbx2pLK5Eq", "proceeding": "https://proceedings.mlr.press/v235/colbert24a.html", "pdf": "https://openreview.net/pdf?id=mbx2pLK5Eq", "openreview": "https://openreview.net/forum?id=mbx2pLK5Eq", "author_site": "Ian Colbert, Alessandro Pappalardo, Jakoba Petri-Koenig, Yaman Umuroglu", "tldr": "", "abstract": "Quantization techniques commonly reduce the inference costs of neural networks by restricting the precision of weights and activations. Recent studies show that also reducing the precision of the accumulator can further improve hardware efficiency at the risk of numerical overflow, which introduces arithmetic errors that can degrade model accuracy. To avoid numerical overflow while maintaining accuracy, recent work proposed accumulator-aware quantization (A2Q)\u2014a quantization-aware training method that constrains model weights during training to safely use a target accumulator bit width during inference. Although this shows promise, we demonstrate that A2Q relies on an overly restrictive constraint and a sub-optimal weight initialization strategy that each introduce superfluous quantization error. To address these shortcomings, we introduce: (1) an improved bound that alleviates accumulator constraints without compromising overflow avoidance; and (2) a new strategy for initializing quantized weights from pre-trained floating-point checkpoints. We combine these contributions with weight normalization to introduce A2Q+. 
We identify and characterize the various tradeoffs that arise as a consequence of accumulator constraints and support our analysis with experiments that show A2Q+ significantly improves these trade-offs when compared to prior methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ian Colbert;Alessandro Pappalardo;Jakoba Petri-Koenig;Yaman Umuroglu", "authorids": "~Ian_Colbert1;~Alessandro_Pappalardo1;~Jakoba_Petri-Koenig1;~Yaman_Umuroglu1", "gender": ";;;M", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";Wr0fGKoAAAAJ;H5w5KtMAAAAJ;LBWVHx0AAAAJ", "orcid": ";;;0000-0002-3700-5935", "linkedin": ";;jakoba-petri-koenig/;yamanumuroglu/", "or_profile": "~Ian_Colbert1;~Alessandro_Pappalardo1;~Jakoba_Petri-Koenig1;~Yaman_Umuroglu1", "aff": ";AMD AEAI Research Labs;Advanced Micro Devices;Advanced Micro Devices, Inc.", "aff_domain": ";amd.com;amd.com;amd.com", "position": ";Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\ncolbert2024aq,\ntitle={A2Q+: Improving Accumulator-Aware Weight Quantization},\nauthor={Ian Colbert and Alessandro Pappalardo and Jakoba Petri-Koenig and Yaman Umuroglu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mbx2pLK5Eq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1028141, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5028650241001953433&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";amd.com;amd.com;amd.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "AMD Research;Advanced Micro Devices, Inc.;Advanced Micro Devices", "aff_unique_dep": "AEAI Research Labs;;", "aff_unique_url": "https://www.amd.com;https://www.amd.com;https://www.amd.com", "aff_unique_abbr": "AMD;AMD;AMD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Tensor Networks are a Valuable Asset for Green AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33164", "id": "mcg6jppkwb", "proceeding": "https://proceedings.mlr.press/v235/memmel24a.html", "pdf": "https://openreview.net/pdf?id=mcg6jppkwb", "openreview": "https://openreview.net/forum?id=mcg6jppkwb", "author_site": "Eva Memmel, Clara Menzen, Jetze Schuurmans, Frederiek Wesel, Kim Batselier", "tldr": "", "abstract": "For the first time, this position paper introduces a fundamental link between tensor networks (TNs) and Green AI, highlighting their synergistic potential to enhance both the inclusivity and sustainability of AI research. We argue that TNs are valuable for Green AI due to their strong mathematical backbone and inherent logarithmic compression potential. We undertake a comprehensive review of the ongoing discussions on Green AI, emphasizing the importance of sustainability and inclusivity in AI research to demonstrate the significance of establishing the link between Green AI and TNs. To support our position, we first provide a comprehensive overview of efficiency metrics proposed in Green AI literature and then evaluate examples of TNs in the fields of kernel machines and deep learning using the proposed efficiency metrics. This position paper aims to incentivize meaningful, constructive discussions by bridging fundamental principles of Green AI and TNs. 
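For intuition about the accumulator constraint discussed in the A2Q+ abstract above, a back-of-the-envelope version of the idea: with signed N-bit inputs, a dot product cannot overflow a signed P-bit accumulator if ||w||_1 * 2^(N-1) <= 2^(P-1) - 1. The sketch below encodes that simple bound and a naive l1 rescaling; it is a simplification for illustration, not the improved A2Q+ bound or initialization from the paper.

```python
import numpy as np

def l1_budget(acc_bits, input_bits):
    """Largest safe ||w||_1: worst case |sum_i w_i x_i| <= ||w||_1 * 2^(N-1)
    must fit inside a signed P-bit accumulator."""
    return (2 ** (acc_bits - 1) - 1) / 2 ** (input_bits - 1)

def project_l1(w, budget):
    """Naive projection: rescale weights whose l1 norm exceeds the budget."""
    norm = np.abs(w).sum()
    return w if norm <= budget else w * (budget / norm)

w = np.random.default_rng(2).standard_normal(256)
w_safe = project_l1(w, l1_budget(acc_bits=16, input_bits=8))
print(np.abs(w_safe).sum(), "<=", l1_budget(acc_bits=16, input_bits=8))
```

The tension the abstract describes is visible here: a tighter accumulator budget forces weights toward zero, which is the "superfluous quantization error" that a less restrictive bound can avoid.
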
We advocate for researchers to seriously evaluate the integration of TNs into their research projects, and in alignment with the link established in this paper, we support prior calls encouraging researchers to treat Green AI principles as a research priority.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eva Memmel;Clara Menzen;Jetze Schuurmans;Frederiek Wesel;kim batselier", "authorids": "~Eva_Memmel1;~Clara_Menzen1;~Jetze_Schuurmans1;~Frederiek_Wesel1;~kim_batselier1", "gender": "F;F;M;Not Specified;", "homepage": ";;https://github.com/JSchuurmans;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": "eva-m-b34a85123/;clara-menzen-9038b560/;jetzeschuurmans/;https://linkedin.com/in/frederiek-wesel;", "or_profile": "~Eva_Memmel1;~Clara_Menzen1;~Jetze_Schuurmans1;~Frederiek_Wesel1;~kim_batselier1", "aff": "Delft University of Technology;;Xebia Data;Delft University of Technology;", "aff_domain": "tudelft.nl;;xebia.com;tudelft.nl;", "position": "PhD student;;Researcher;PhD student;", "bibtex": "@inproceedings{\nmemmel2024position,\ntitle={Position: Tensor Networks are a Valuable Asset for Green {AI}},\nauthor={Eva Memmel and Clara Menzen and Jetze Schuurmans and Frederiek Wesel and kim batselier},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mcg6jppkwb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1039285, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11927801705622401103&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "tudelft.nl;;xebia.com;tudelft.nl;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Delft University of Technology;Xebia", "aff_unique_dep": ";Data", "aff_unique_url": "https://www.tudelft.nl;https://xebia.com/", "aff_unique_abbr": "TU Delft;Xebia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Unsupervised Domain Adaptation for Anatomical Structure Detection in Ultrasound Images", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33163", "id": "meItvvCO7X", "proceeding": "https://proceedings.mlr.press/v235/pu24b.html", "pdf": "https://openreview.net/pdf?id=meItvvCO7X", "openreview": "https://openreview.net/forum?id=meItvvCO7X", "author_site": "Bin Pu, Xingguo Lv, Jiewen Yang, He Guannan, Xingbo Dong, Yiqun Lin, Li Shengli, Ying Tan, Liu Fei, Ming Chen, Zhe Jin, Kenli Li, Xiaomeng Li", "tldr": "", "abstract": "Models trained on ultrasound images from one institution typically experience a decline in effectiveness when transferred directly to other institutions. Moreover, unlike natural images, dense and overlapped structures exist in fetal ultrasound images, making the detection of structures more challenging. Thus, to tackle this problem, we propose a new Unsupervised Domain Adaptation (UDA) method named ToMo-UDA for fetal structure detection, which consists of the Topology Knowledge Transfer (TKT) and the Morphology Knowledge Transfer (MKT) module. The TKT leverages prior knowledge of fetal medical anatomy as topological information, reconstructing and aligning anatomy features across source and target domains. Then, the MKT formulates a more consistent and independent morphological representation for each substructure of an organ.
To evaluate the proposed ToMo-UDA for ultrasound fetal anatomical structure detection, we introduce **FUSH$^2$**, a new **F**etal **U**ltra**S**ound benchmark that comprises **H**eart and **H**ead images collected from **Two** health centers, with 16 annotated regions. Our experiments show that utilizing topological and morphological anatomy information in ToMo-UDA can greatly improve organ structure detection. This expands the potential for structure detection tasks in medical image analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bin Pu;Xingguo Lv;Jiewen Yang;He Guannan;Xingbo Dong;Yiqun Lin;Li Shengli;Tan Ying;Liu Fei;Ming Chen;Zhe Jin;Kenli Li;Xiaomeng Li", "authorids": "~Bin_Pu2;~Xingguo_Lv1;~Jiewen_Yang1;~He_Guannan2;~Xingbo_Dong1;~Yiqun_Lin1;~Li_Shengli1;~Tan_Ying1;~Liu_Fei1;~Ming_Chen16;~Zhe_Jin4;~Kenli_Li1;~Xiaomeng_Li1", "gender": "M;M;M;F;M;M;;F;;F;M;F;M", "homepage": "https://scholar.google.com/citations?user=JzXzqGgAAAAJ&hl=zh-CN;https://github.com/Yore0;https://gitlab.com/Jiewen_Yang;https://scholar.google.com/citations?user=h1myuTQAAAAJ&hl=zh-CN;https://xingbod.github.io/;;;https://github.com/tanying8013;;https://hospital.51daifu.com/hos1421/doctor/dt_138029.shtml;http://csee.hnu.edu.cn/people/likenli;https://xmengli.github.io/;", "dblp": ";;302/4089.html;;237/0125;26/2888;;;;;l/KenliLi.html;02/9850-1;73/1936", "google_scholar": "JzXzqGgAAAAJ;https://scholar.google.com/citations?hl=zh-CN;Y0MYdh8AAAAJ;;Jc4KdcIAAAAJ;dnG10ZwAAAAJ;;;;;https://scholar.google.com/citations?view_op=list_works;uVTzPpoAAAAJ;N-szqToAAAAJ", "orcid": "0009-0007-8771-6501;;;;0000-0001-9782-6068;0000-0002-7697-0842;0000-0003-0570-4165;0009-0004-6039-4088;0009-0004-3133-4414;;0000-0002-2635-7716;;0000-0003-4501-7992", "linkedin": ";;;;;;;;;;;;", "or_profile": "~Bin_Pu2;~Xingguo_Lv1;~Jiewen_Yang1;~He_Guannan2;~Xingbo_Dong1;~Yiqun_Lin1;~Li_Shengli1;~Tan_Ying1;~Liu_Fei1;~Ming_Chen16;~Kenli_Li1;~Xiaomeng_Li1;~ZHE_JIN2", "aff": "Hong Kong University of Science and Technology;Anhui University;Hong Kong University of Science and Technology;Sichuan University;Anhui University;Hong Kong University of Science and Technology;Shenzhen Maternity and Child Healthcare Hospital;Shenzhen Maternity and Child Healthcare Hospital;Anhui University;;Hunan University;Hong Kong University of Science and Technology;Anhui University", "aff_domain": "hkust.edu;ahu.edu.cn;ust.hk;stu.scu.edu;ahu.edu.cn;ust.hk;smu.edu;smu.edu;ahu.edu.cn;;hnu.edu.cn;ust.hk;ahu.edu.cn", "position": "Postdoc;MS student;PhD student;Full Professor;Lecturer;PhD student;Full Professor;Researcher;MS student;;Full Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\npu2024unsupervised,\ntitle={Unsupervised Domain Adaptation for Anatomical Structure Detection in Ultrasound Images},\nauthor={Bin Pu and Xingguo Lv and Jiewen Yang and He Guannan and Xingbo Dong and Yiqun Lin and Li Shengli and Tan Ying and Liu Fei and Ming Chen and Zhe Jin and Kenli Li and Xiaomeng Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=meItvvCO7X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4728111, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8904961670315574364&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email":
"hkust.edu;ahu.edu.cn;ust.hk;stu.scu.edu;ahu.edu.cn;ust.hk;smu.edu;smu.edu;ahu.edu.cn;;hnu.edu.cn;ust.hk;ahu.edu.cn", "author_num": 13, "aff_unique_index": "0;1;0;2;1;0;3;3;1;4;0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Anhui University;Sichuan University;Shenzhen Maternity and Child Healthcare Hospital;Hunan University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ust.hk;http://www.ahu.edu.cn/;https://www.scu.edu.cn;;http://www.hunu.edu.cn/", "aff_unique_abbr": "HKUST;AHU;SCU;;HNU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On Online Experimentation without Device Identifiers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33162", "id": "merZTLSdC9", "proceeding": "https://proceedings.mlr.press/v235/shankar24a.html", "pdf": "https://openreview.net/pdf?id=merZTLSdC9", "openreview": "https://openreview.net/forum?id=merZTLSdC9", "author_site": "Shiv Shankar, Ritwik Sinha, Madalina Fiterau", "tldr": "", "abstract": "Measuring human feedback via randomized experimentation is a cornerstone of data-driven decision-making. The methodology used to estimate user preferences from their online behaviours is critically dependent on user identifiers. However, in today's digital landscape, consumers frequently interact with content across multiple devices, which are often recorded with different identifiers for the same consumer. The inability to match different device identities across consumers poses significant challenges for accurately estimating human preferences and other causal effects. Moreover, without strong assumptions about the device-user graph, the causal effects might not be identifiable. In this paper, we propose HIFIVE, a variational method to solve the problem of estimating global average treatment effects (GATE) from a fragmented view of exposures and outcomes. 
Experiments show that our estimator is superior to standard estimators, with a lower bias and greater robustness to network uncertainty.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiv Shankar;Ritwik Sinha;Madalina Fiterau", "authorids": "~Shiv_Shankar2;~Ritwik_Sinha1;~Madalina_Fiterau3", "gender": ";M;F", "homepage": ";https://research.adobe.com/person/ritwik-sinha/;https://www.cs.umass.edu/~mfiterau", "dblp": "203/9123;127/3163;05/8090", "google_scholar": ";https://scholar.google.co.in/citations?user=4SDTMIQAAAAJ;NTHsaUQAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Shiv_Shankar2;~Ritwik_Sinha1;~Madalina_Fiterau1", "aff": "IIT Bombay;Adobe Systems;Department of Computer Science, University of Massachusetts, Amherst", "aff_domain": "iitb.ac.in;adobe.com;cs.umass.edu", "position": "Researcher;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nshankar2024on,\ntitle={On Online Experimentation without Device Identifiers},\nauthor={Shiv Shankar and Ritwik Sinha and Madalina Fiterau},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=merZTLSdC9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 636581, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16774293500685921276&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "iitb.ac.in;adobe.com;cs.umass.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Indian Institute of Technology Bombay;Adobe;University of Massachusetts Amherst", "aff_unique_dep": ";Adobe Systems Incorporated;Department of Computer Science", "aff_unique_url": "https://www.iitb.ac.in;https://www.adobe.com;https://www.umass.edu", "aff_unique_abbr": "IITB;Adobe;UMass Amherst", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Mumbai;;Amherst", "aff_country_unique_index": "0;1;1", "aff_country_unique": "India;United States" }, { "title": "Privacy Attacks in Decentralized Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33161", "id": "mggc3oYHy4", "proceeding": "https://proceedings.mlr.press/v235/mrini24a.html", "pdf": "https://openreview.net/pdf?id=mggc3oYHy4", "openreview": "https://openreview.net/forum?id=mggc3oYHy4", "author_site": "Abdellah El Mrini, Edwige Cyffers, Aur\u00e9lien Bellet", "tldr": "", "abstract": "Decentralized Gradient Descent (D-GD) allows a set of users to perform collaborative learning without sharing their data by iteratively averaging local model updates with their neighbors in a network graph. The absence of direct communication between non-neighbor nodes might lead to the belief that users cannot infer precise information about the data of others. In this work, we demonstrate the opposite, by proposing the first attack against D-GD that enables a user (or set of users) to reconstruct the private data of other users outside their immediate neighborhood. Our approach is based on a reconstruction attack against the gossip averaging protocol, which we then extend to handle the additional challenges raised by D-GD. We validate the effectiveness of our attack on real graphs and datasets, showing that the number of users compromised by a single or a handful of attackers is often surprisingly large. 
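
The reconstruction principle behind the gossip-averaging attack in the "Privacy Attacks in Decentralized Learning" abstract can be sketched directly: with gossip iterates x_{t+1} = W x_t and a mixing matrix W known to the attacker, the coordinates observed over several rounds form a linear system in the private initial values, solvable by least squares. The graph size, observed nodes, and round count below are arbitrary illustrative choices, and the full attack against D-GD in the paper handles much more than this toy case.

```python
import numpy as np

rng = np.random.default_rng(4)
n, T = 6, 8
A = rng.random((n, n))
W = A / A.sum(axis=1, keepdims=True)       # public mixing matrix (known to attacker)

x0 = rng.standard_normal(n)                # private initial values of the n users
observed = [0, 1]                          # coordinates the attacker can see

# Each observation x_t[i] = (W^t x0)[i] is one linear equation in x0.
rows, rhs = [], []
x, Wt = x0.copy(), np.eye(n)
for _ in range(T):
    for i in observed:
        rows.append(Wt[i])
        rhs.append(x[i])
    x, Wt = W @ x, W @ Wt                  # one gossip round

x0_hat, *_ = np.linalg.lstsq(np.array(rows), np.array(rhs), rcond=None)
print("reconstruction error:", np.linalg.norm(x0_hat - x0))
```
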
We empirically investigate some of the factors that affect the performance of the attack, namely the graph topology, the number of attackers, and their position in the graph.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abdellah El Mrini;Edwige Cyffers;Aur\u00e9lien Bellet", "authorids": "abdellah.elmrini@epfl.ch;~Edwige_Cyffers1;~Aur\u00e9lien_Bellet1", "gender": ";;", "homepage": ";;http://researchers.lille.inria.fr/abellet/", "dblp": ";281/6734;61/8017", "google_scholar": ";;https://scholar.google.fr/citations?user=j8svx3IAAAAJ", "orcid": ";;0000-0003-3440-1251", "linkedin": ";edwige-cyffers/;", "or_profile": "abdellah.elmrini@epfl.ch;~Edwige_Cyffers1;~Aur\u00e9lien_Bellet1", "aff": ";INRIA;INRIA", "aff_domain": ";inria.fr;inria.fr", "position": ";PhD student;Tenured researcher", "bibtex": "@inproceedings{\nmrini2024privacy,\ntitle={Privacy Attacks in Decentralized Learning},\nauthor={Abdellah El Mrini and Edwige Cyffers and Aur{\\'e}lien Bellet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mggc3oYHy4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2193545, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10553279107423901499&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": ";inria.fr;inria.fr", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "LoRAP: Transformer Sub-Layers Deserve Differentiated Structured Compression for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33160", "id": "mhI5nc5QwX", "proceeding": "https://proceedings.mlr.press/v235/li24bi.html", "pdf": "https://openreview.net/pdf?id=mhI5nc5QwX", "openreview": "https://openreview.net/forum?id=mhI5nc5QwX", "author_site": "guangyan li, Yongqiang Tang, Wensheng Zhang", "tldr": "", "abstract": "Large language models (LLMs) show excellent performance in difficult tasks, but they often require massive memory and computational resources. How to reduce the parameter scale of LLMs has become a research hotspot. In this study, we make the important observation that the multi-head self-attention (MHA) sub-layer of Transformer exhibits noticeable low-rank structure, while the feed-forward network (FFN) sub-layer does not. In this regard, we design a novel structured compression method LoRAP, which organically combines **Lo**w-**R**ank matrix approximation **A**nd structured **P**runing. For the MHA sub-layer, we propose an input activation weighted singular value decomposition method and allocate different parameter amounts for each weight matrix based on the differences in low-rank properties of matrices. For the FFN sub-layer, we propose a gradient-free structured channel pruning method and retain the least important 1% of parameters, which actually play a vital role in model performance. Extensive evaluations on zero-shot perplexity and zero-shot task classification indicate that our proposal is superior to previous structured compression rivals under multiple compression ratios.
Our code will be released soon.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guangyan Li;Yongqiang Tang;Wensheng Zhang", "authorids": "~Guangyan_Li1;~Yongqiang_Tang1;~Wensheng_Zhang5", "gender": ";;M", "homepage": ";;https://people.ucas.ac.cn/~wenshengzhang", "dblp": ";;94/6627-2.html/", "google_scholar": ";;", "orcid": ";;0000-0003-0752-941X", "linkedin": ";;", "or_profile": "~Guangyan_Li1;~Yongqiang_Tang1;~Wensheng_Zhang5", "aff": ";;Guangzhou University", "aff_domain": ";;gzhu.edu.cn", "position": ";;Full Professor", "bibtex": "@inproceedings{\nli2024lorap,\ntitle={Lo{RAP}: Transformer Sub-Layers Deserve Differentiated Structured Compression for Large Language Models},\nauthor={Guangyan Li and Yongqiang Tang and Wensheng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mhI5nc5QwX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3788929, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8314188560284949706&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 7, "email": ";;gzhu.edu.cn", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Guangzhou University", "aff_unique_dep": "", "aff_unique_url": "http://www.gzhu.edu.cn", "aff_unique_abbr": "GU", "aff_country_unique_index": "0", "aff_country_unique": "China" }, { "title": "Bounded and Uniform Energy-based Out-of-distribution Detection for Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33159", "id": "mjh7AOWozN", "proceeding": "https://proceedings.mlr.press/v235/yang24n.html", "pdf": "https://openreview.net/pdf?id=mjh7AOWozN", "openreview": "https://openreview.net/forum?id=mjh7AOWozN", "author_site": "Shenzhi Yang, Bin Liang, An Liu, Lin Gui, Xingkai Yao, Xiaofang Zhang", "tldr": "", "abstract": "Given the critical role of graphs in real-world applications and their high-security requirements, improving the ability of graph neural networks (GNNs) to detect out-of-distribution (OOD) data is an urgent research problem. The recent work GNNSAFE proposes a framework based on the aggregation of negative energy scores that significantly improves the ability of GNNs to detect node-level OOD data. However, our study finds that score aggregation among nodes is susceptible to extreme values due to the unboundedness of the negative energy scores and logit shifts, which severely limits the accuracy of GNNs in detecting node-level OOD data. In this paper, we propose NODESAFE: reducing the generation of extreme scores of nodes by adding two optimization terms that make the negative energy scores bounded and mitigate the logit shift. Experimental results show that our approach dramatically improves the ability of GNNs to detect OOD data at the node level, e.g., in detecting OOD data induced by Structure Manipulation, the metric of FPR95 (lower is better) in scenarios without (with) OOD data exposure is reduced from the current SOTA by 28.4% (22.7%).
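
A generic sketch of what an "input activation weighted singular value decomposition", as named in the LoRAP abstract above, can look like: scale weight columns by typical input activation magnitude, truncate the SVD, and fold the scaling back into the factors. The specific weighting and per-matrix rank allocation of LoRAP are in the paper; this variant is only an assumption-laden illustration.

```python
import numpy as np

def activation_weighted_svd(W, acts, rank):
    """Rank-`rank` factors for W (out x in), weighting the decomposition by
    the typical magnitude of each input channel so that channels carrying
    large activations are reconstructed more faithfully."""
    s = np.sqrt((acts ** 2).mean(axis=0)) + 1e-8      # per-input-channel scale
    U, S, Vt = np.linalg.svd(W * s, full_matrices=False)
    A = U[:, :rank] * S[:rank]                        # (out, rank)
    B = Vt[:rank] / s                                 # (rank, in), scaling undone
    return A, B

rng = np.random.default_rng(5)
W = rng.standard_normal((64, 64))
acts = rng.standard_normal((512, 64)) * np.linspace(0.1, 2.0, 64)
A, B = activation_weighted_svd(W, acts, rank=16)
err = np.linalg.norm((W - A @ B) * np.sqrt((acts ** 2).mean(axis=0)))
print("activation-weighted reconstruction error:", err)
```
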
The code is available via https://github.com/ShenzhiYang2000/NODESAFE-Bounded-and-Uniform-Energy-based-Out-of-distribution-Detection-for-Graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shenzhi Yang;Bin Liang;An Liu;Lin Gui;Xingkai Yao;Xiaofang Zhang", "authorids": "~Shenzhi_Yang1;~Bin_Liang6;~An_Liu1;~Lin_Gui3;~Xingkai_Yao1;~Xiaofang_Zhang1", "gender": "M;M;Not Specified;M;M;F", "homepage": "https://github.com/Treasure-tea;https://binliang-nlp.github.io/;http://web.suda.edu.cn/anliu/;;https://github.com/noah-yxk;", "dblp": "379/6162;71/6053-4;52/94-2;34/8605-3;;08/1043", "google_scholar": ";djpQeLEAAAAJ;89cdhB4AAAAJ;https://scholar.google.com.ph/citations?user=1b3Eyx4AAAAJ;;https://scholar.google.co.uk/citations?hl=en", "orcid": "0000-0001-9879-8812;0000-0001-7234-1347;0000-0002-6368-576X;;0009-0002-0998-2728;0000-0002-8667-0456", "linkedin": ";;;;;", "or_profile": "~Shenzhi_Yang1;~Bin_Liang6;~An_Liu1;~Lin_Gui3;~Xingkai_Yao1;~Xiaofang_Zhang1", "aff": "Suzhou University;The Chinese University of Hong Kong;Soochow University;King's College London, University of London;Suzhou University;Soochow University", "aff_domain": "suda.edu.cn;cuhk.edu.hk;suda.edu.cn;kcl.ac.uk;suda.edu.cn;suda.edu.cn", "position": "MS student;Postdoc;Full Professor;Lecturer;MS student;Full Professor", "bibtex": "@inproceedings{\nyang2024bounded,\ntitle={Bounded and Uniform Energy-based Out-of-distribution Detection for Graphs},\nauthor={Shenzhi Yang and Bin Liang and An Liu and Lin Gui and Xingkai Yao and Xiaofang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mjh7AOWozN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3593656, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9492481073878575506&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": "suda.edu.cn;cuhk.edu.hk;suda.edu.cn;kcl.ac.uk;suda.edu.cn;suda.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;0;2", "aff_unique_norm": "Suzhou University;Chinese University of Hong Kong;Soochow University;King's College London", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.suda.edu.cn;https://www.cuhk.edu.hk;https://www.soochow.edu.cn;https://www.kcl.ac.uk", "aff_unique_abbr": "Suda;CUHK;Soochow U;KCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Caduceus: Bi-Directional Equivariant Long-Range DNA Sequence Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33158", "id": "mk3A5IUdn8", "proceeding": "https://proceedings.mlr.press/v235/schiff24a.html", "pdf": "https://openreview.net/pdf?id=mk3A5IUdn8", "openreview": "https://openreview.net/forum?id=mk3A5IUdn8", "author_site": "Yair Schiff, Chia Hsiang Kao, Aaron Gokaslan, Tri Dao, Albert Gu, Volodymyr Kuleshov", "tldr": "", "abstract": "Large-scale sequence modeling has sparked rapid advances that now extend into biology and genomics. However, modeling genomic sequences introduces challenges such as the need to model long-range token interactions, the effects of upstream and downstream regions of the genome, and the reverse complementarity (RC) of DNA. 
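
For reference, the two ingredients the NODESAFE abstract builds on: an energy score computed from node logits and a GNNSAFE-style aggregation of scores over graph neighbors. The sketch below shows only this baseline; NODESAFE's bounding and logit-shift terms are not implemented here, and `alpha`/`steps` are arbitrary illustrative choices.

```python
import numpy as np

def energy_score(logits):
    """Per-node energy E(x) = -logsumexp(logits), computed stably.
    Confident (in-distribution-like) nodes get lower energy."""
    m = logits.max(axis=1)
    return -(m + np.log(np.exp(logits - m[:, None]).sum(axis=1)))

def propagate(scores, adj, alpha=0.5, steps=2):
    """GNNSAFE-style aggregation: mix each node's score with the mean score
    of its neighbors. A single unbounded extreme score can dominate this
    average, which is the failure mode NODESAFE's terms target."""
    deg = adj.sum(axis=1).clip(min=1.0)
    for _ in range(steps):
        scores = alpha * scores + (1 - alpha) * (adj @ scores) / deg
    return scores

rng = np.random.default_rng(6)
logits = 3.0 * rng.standard_normal((5, 3))            # toy node logits
adj = np.array([[0, 1, 1, 0, 0], [1, 0, 1, 0, 0], [1, 1, 0, 1, 0],
                [0, 0, 1, 0, 1], [0, 0, 0, 1, 0]], float)
print(propagate(energy_score(logits), adj))
```
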
Here, we propose an architecture motivated by these challenges that builds off the long-range Mamba block, and extends it to a BiMamba component that supports bi-directionality, and to a MambaDNA block that additionally supports RC equivariance. We use MambaDNA as the basis of Caduceus, the first family of RC equivariant bi-directional long-range DNA language models, and we introduce pre-training and fine-tuning strategies that yield Caduceus DNA foundation models. Caduceus outperforms previous long-range models on downstream benchmarks; on a challenging long-range variant effect prediction task, Caduceus exceeds the performance of 10x larger models that do not leverage bi-directionality or equivariance. Code to reproduce our experiments is available here: https://github.com/kuleshov-group/caduceus.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yair Schiff;Chia Hsiang Kao;Aaron Gokaslan;Tri Dao;Albert Gu;Volodymyr Kuleshov", "authorids": "~Yair_Schiff1;~Chia_Hsiang_Kao1;~Aaron_Gokaslan1;~Tri_Dao1;~Albert_Gu1;~Volodymyr_Kuleshov1", "gender": "M;M;M;;M;", "homepage": "https://github.com/yair-schiff;https://iandrover.github.io;https://skylion007.github.io/;https://tridao.me/;;https://www.cs.cornell.edu/~kuleshov/", "dblp": ";241/3791;220/6816;206/7018;130/0612;81/8612", "google_scholar": "GhFrOdQAAAAJ;https://scholar.google.com.tw/citations?user=W_i9B0sAAAAJ;Mt2wyL4AAAAJ;NQRw0bQAAAAJ;DVCHv1kAAAAJ;RY_t8XAAAAAJ", "orcid": ";;0000-0002-3575-2961;;0000-0002-4946-6042;", "linkedin": "yair-schiff;;aarongokaslan/;;;", "or_profile": "~Yair_Schiff1;~Chia_Hsiang_Kao1;~Aaron_Gokaslan1;~Tri_Dao1;~Albert_Gu1;~Volodymyr_Kuleshov1", "aff": "Department of Computer Science, Cornell University;Cornell University;Cornell University;Princeton University;Carnegie Mellon University;Cornell University", "aff_domain": "cs.cornell.edu;cornell.edu;cornell.edu;princeton.edu;cmu.edu;cornell.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nschiff2024caduceus,\ntitle={Caduceus: Bi-Directional Equivariant Long-Range {DNA} Sequence Modeling},\nauthor={Yair Schiff and Chia Hsiang Kao and Aaron Gokaslan and Tri Dao and Albert Gu and Volodymyr Kuleshov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mk3A5IUdn8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 765451, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6041389491834971482&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cs.cornell.edu;cornell.edu;cornell.edu;princeton.edu;cmu.edu;cornell.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Cornell University;Princeton University;Carnegie Mellon University", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.cornell.edu;https://www.princeton.edu;https://www.cmu.edu", "aff_unique_abbr": "Cornell;Princeton;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "GliDe with a CaPE: A Low-Hassle Method to Accelerate Speculative Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33157", "id": "mk8oRhox2l", "proceeding": "https://proceedings.mlr.press/v235/du24c.html", "pdf": 
"https://openreview.net/pdf?id=mk8oRhox2l", "openreview": "https://openreview.net/forum?id=mk8oRhox2l", "author_site": "Cunxiao Du, Jing Jiang, Xu Yuanchen, Jiawei Wu, Sicheng Yu, Yongqi Li, Shenggui Li, Kai Xu, Liqiang Nie, Zhaopeng Tu, Yang You", "tldr": "", "abstract": "Speculative decoding is a relatively new decoding framework that leverages small and efficient draft models to reduce the latency of LLMs. In this study, we introduce GliDe and CaPE, two low-hassle modifications to vanilla speculative decoding to further improve the decoding speed of a frozen LLM. Specifically, GliDe is a modified draft model architecture that reuses the cached keys and values from the target LLM, while CaPE is a proposal expansion method that uses the draft model's confidence scores to help select additional candidate tokens for verification. Extensive experiments on different benchmarks demonstrate that our proposed GliDe draft model significantly reduces the expected decoding latency. Additional evaluation using walltime reveals that GliDe can accelerate Vicuna models up to 2.17x and further extend the improvement to 2.61x with CaPE. We will release our code, data, and the trained draft models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cunxiao Du;Jing Jiang;Xu Yuanchen;Jiawei Wu;Sicheng Yu;Yongqi Li;Shenggui Li;Kai Xu;Liqiang Nie;Zhaopeng Tu;Yang You", "authorids": "~Cunxiao_Du3;~Jing_Jiang1;~Xu_Yuanchen1;~Jiawei_Wu8;~Sicheng_Yu2;~Yongqi_Li1;~Shenggui_Li1;~Kai_Xu1;~Liqiang_Nie2;~Zhaopeng_Tu1;~Yang_You1", "gender": ";F;M;;;;M;M;M;M;M", "homepage": ";http://www.mysmu.edu/faculty/jingjiang/;https://github.com/chengeharrison;;;;;;https://liqiangnie.github.io/index.html;http://www.zptu.net;https://www.comp.nus.edu.sg/~youy/", "dblp": ";68/1974-1;;;;;;x/KaiXu4;92/8277;71/9281;33/8167-1.html", "google_scholar": ";https://scholar.google.com.sg/citations?user=hVTK2YwAAAAJ;;;;;;;yywVMhUAAAAJ;IvE2zRgAAAAJ;jF4dPZwAAAAJ", "orcid": ";0000-0002-3035-0074;;;;;;;0000-0003-1476-0273;;", "linkedin": ";;;;;;https://sg.linkedin.com/in/shenggui-li-0b6850152;;;tuzhaopeng;yang-you-0b92914b/", "or_profile": "~Cunxiao_Du3;~Jing_Jiang1;~Xu_Yuanchen1;~Jiawei_Wu8;~Sicheng_Yu2;~Yongqi_Li1;~Shenggui_Li1;~Kai_Xu1;~Liqiang_Nie2;~Zhaopeng_Tu1;~Yang_You1", "aff": ";Singapore Management University;National University of Singapore;;;;National University of Singapore;;Shandong University;Tencent AI Lab;National University of Singapore", "aff_domain": ";smu.edu.sg;u.nus.edu;;;;nus.edu.sg;;sdu.edu.cn;tencent.com;nus.edu.sg", "position": ";Full Professor;MS student;;;;Research Assistant;;Full Professor;Principal Researcher;Professor", "bibtex": "@inproceedings{\ndu2024glide,\ntitle={GliDe with a Ca{PE}: A Low-Hassle Method to Accelerate Speculative Decoding},\nauthor={Cunxiao Du and Jing Jiang and Xu Yuanchen and Jiawei Wu and Sicheng Yu and Yongqi Li and Shenggui Li and Kai Xu and Liqiang Nie and Zhaopeng Tu and Yang You},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mk8oRhox2l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2133287, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3922278010828478392&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": ";smu.edu.sg;u.nus.edu;;;;nus.edu.sg;;sdu.edu.cn;tencent.com;nus.edu.sg", "author_num": 11, "aff_unique_index": "0;1;1;2;3;1", "aff_unique_norm": 
"Singapore Management University;National University of Singapore;Shandong University;Tencent", "aff_unique_dep": ";;;Tencent AI Lab", "aff_unique_url": "https://www.smu.edu.sg;https://www.nus.edu.sg;http://www.sdu.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "SMU;NUS;SDU;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Double Stochasticity Gazes Faster: Snap-Shot Decentralized Stochastic Gradient Tracking Methods", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33156", "id": "mkbSXxovP5", "proceeding": "https://proceedings.mlr.press/v235/di24a.html", "pdf": "https://openreview.net/pdf?id=mkbSXxovP5", "openreview": "https://openreview.net/forum?id=mkbSXxovP5", "author_site": "Hao Di, Haishan Ye, Xiangyu Chang, Guang Dai, Ivor Tsang", "tldr": "", "abstract": "In decentralized optimization, $m$ agents form a network and only communicate with their neighbors, which gives advantages in data ownership, privacy, and scalability. At the same time, decentralized stochastic gradient descent ($\\texttt{SGD}$) methods, as popular decentralized algorithms for training large-scale machine learning models, have shown their superiority over centralized counterparts. Distributed stochastic gradient tracking $\\texttt{DSGT}$ has been recognized as the popular and state-of-the-art decentralized $\\texttt{SGD}$ method due to its proper theoretical guarantees. However, the theoretical analysis of $\\texttt{DSGT}$ shows that its iteration complexity is $\\tilde{\\mathcal{O}} \\left(\\frac{\\bar{\\sigma}^2}{m\\mu \\varepsilon} + \\frac{\\sqrt{L}\\bar{\\sigma}}{\\mu(1 - \\lambda_2(W))^{1/2} C_W \\sqrt{\\varepsilon} }\\right)$, where the doubly stochastic matrix $W$ represents the network topology and $ C_W $ is a parameter that depends on $W$. Thus, it indicates that the convergence property of $\\texttt{DSGT}$ is heavily affected by the topology of the communication network. To overcome the weakness of $\\texttt{DSGT}$, we resort to the snap-shot gradient tracking skill and propose two novel algorithms, snap-shot $\\texttt{DSGT}$ ($\\texttt{SS-DSGT}$) and accelerated snap-shot $\\texttt{DSGT}$ ($\\texttt{ASS-DSGT}$). We further justify that $\\texttt{SS-DSGT}$ exhibits a lower iteration complexity compared to $\\texttt{DSGT}$ in the general communication network topology. Additionally, $\\texttt{ASS-DSGT}$ matches $\\texttt{DSGT}$'s iteration complexity $\\mathcal{O}\\left( \\frac{\\bar{\\sigma}^2}{m\\mu \\varepsilon} + \\frac{\\sqrt{L}\\bar{\\sigma}}{\\mu (1 - \\lambda_2(W))^{1/2}\\sqrt{\\varepsilon}} \\right)$ under the same conditions as $\\texttt{DSGT}$. 
Numerical experiments validate $\\texttt{SS-DSGT}$'s superior performance in the general communication network topology and exhibit better practical performance of $\\texttt{ASS-DSGT}$ on the specified $W$ compared to $\\texttt{DSGT}$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Di;Haishan Ye;Xiangyu Chang;Guang Dai;Ivor Tsang", "authorids": "~Hao_Di2;~Haishan_Ye2;~Xiangyu_Chang1;~Guang_Dai1;~Ivor_Tsang1", "gender": "M;M;M;M;M", "homepage": "https://conscien.top/;;;;https://www.a-star.edu.sg/cfar/about-cfar/management/prof-ivor-tsang", "dblp": "95/8967;162/0002.html;90/9705;;35/5873", "google_scholar": ";;;;rJMOlVsAAAAJ", "orcid": "0009-0004-1846-9787;;;0000-0002-3529-9087;", "linkedin": ";;;;", "or_profile": "~Hao_Di2;~Haishan_Ye2;~Xiangyu_Chang1;~Guang_Dai1;~Ivor_W_Tsang1", "aff": "Xi'an Jiaotong University;Xi'an Jiaotong University;Xi'an Jiaotong University;SGIT AI;A*STAR", "aff_domain": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;sgcc.com.cn;cfar.a-star.edu.sg", "position": "PhD student;Associate Professor;Associate Professor;Principal Researcher;Principal Researcher", "bibtex": "@inproceedings{\ndi2024double,\ntitle={Double Stochasticity Gazes Faster: Snap-Shot Decentralized Stochastic Gradient Tracking Methods},\nauthor={Hao Di and Haishan Ye and Xiangyu Chang and Guang Dai and Ivor Tsang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mkbSXxovP5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 656890, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2630837004419526702&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 5, "email": "xjtu.edu.cn;xjtu.edu.cn;xjtu.edu.cn;sgcc.com.cn;cfar.a-star.edu.sg", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Xi'an Jiao Tong University;SGIT AI;Agency for Science, Technology and Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xjtu.edu.cn;;https://www.a-star.edu.sg", "aff_unique_abbr": "XJTU;;A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2", "aff_country_unique": "China;;Singapore" }, { "title": "Repoformer: Selective Retrieval for Repository-Level Code Completion", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33155", "id": "moyG54Okrj", "proceeding": "https://proceedings.mlr.press/v235/wu24a.html", "pdf": "https://openreview.net/pdf?id=moyG54Okrj", "openreview": "https://openreview.net/forum?id=moyG54Okrj", "author_site": "Di Wu, Wasi Ahmad, Dejiao Zhang, Murali Krishna Ramanathan, Xiaofei Ma", "tldr": "", "abstract": "Recent advances in retrieval-augmented generation (RAG) have initiated a new era in repository-level code completion. However, the invariable use of retrieval in existing methods exposes issues in both efficiency and robustness, with a large proportion of the retrieved contexts proving unhelpful or harmful to code language models (code LMs). In this paper, we propose a selective RAG framework to avoid retrieval when unnecessary. To power this framework, we design a self-supervised learning approach to enable a code LM to accurately self-evaluate whether retrieval can improve its output quality and robustly leverage the potentially noisy retrieved contexts.
Using this LM as both the selective RAG policy and the generation model, our framework achieves state-of-the-art repository-level code completion performance on diverse benchmarks including RepoEval, CrossCodeEval, and CrossCodeLongEval, a new long-form code completion benchmark. Meanwhile, our analyses show that selective retrieval brings as much as a 70% inference speedup in the online serving setting without harming performance. We further demonstrate that our framework is able to accommodate different generation models, retrievers, and programming languages. These advancements position our framework as an important step towards more accurate and efficient repository-level code completion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Di Wu;Wasi Uddin Ahmad;Dejiao Zhang;Murali Krishna Ramanathan;Xiaofei Ma", "authorids": "~Di_Wu14;~Wasi_Uddin_Ahmad1;~Dejiao_Zhang1;~Murali_Krishna_Ramanathan1;~Xiaofei_Ma1", "gender": "Not Specified;M;F;M;M", "homepage": "https://xiaowu0162.github.io/;http://wasiahmad.github.io/;https://dejiao2018.github.io/;;https://www.amazon.science/author/xiaofei-ma", "dblp": "52/328-54.html;183/0576;131/6876;75/541.html;", "google_scholar": "vu1pDZgAAAAJ;YCHJZOMAAAAJ;klYBD5MAAAAJ;;Pc2SfvMAAAAJ", "orcid": ";;;;", "linkedin": ";ahmadwasi/;;;xiaofei-ma-b3627928", "or_profile": "~Di_Wu14;~Wasi_Uddin_Ahmad1;~Dejiao_Zhang1;~Murali_Krishna_Ramanathan1;~Xiaofei_Ma1", "aff": "University of California, Los Angeles;Amazon;Amazon AWS;Amazon;Amazon Web Services", "aff_domain": "cs.ucla.edu;amazon.com;amazon.com;amazon.com;amazon.com", "position": "PhD student;Applied Scientist;Applied Scientist;Principal Researcher;Applied Science Manager", "bibtex": "@inproceedings{\nwu2024repoformer,\ntitle={Repoformer: Selective Retrieval for Repository-Level Code Completion},\nauthor={Di Wu and Wasi Uddin Ahmad and Dejiao Zhang and Murali Krishna Ramanathan and Xiaofei Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=moyG54Okrj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 943924, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10868774457029051558&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cs.ucla.edu;amazon.com;amazon.com;amazon.com;amazon.com", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "University of California, Los Angeles;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.ucla.edu;https://www.amazon.com", "aff_unique_abbr": "UCLA;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Mean-field Analysis on Two-layer Neural Networks from a Kernel Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33154", "id": "mphq2jMFLZ", "proceeding": "https://proceedings.mlr.press/v235/takakura24a.html", "pdf": "https://openreview.net/pdf?id=mphq2jMFLZ", "openreview": "https://openreview.net/forum?id=mphq2jMFLZ", "author_site": "Shokichi Takakura, Taiji Suzuki", "tldr": "", "abstract": "In this paper, we study the feature learning ability of two-layer neural networks in the mean-field regime through the lens of kernel methods.
To focus on the dynamics of the kernel induced by the first layer, we utilize a two-timescale limit, where the second layer moves much faster than the first layer. In this limit, the learning problem is reduced to the minimization problem over the intrinsic kernel. Then, we show the global convergence of the mean-field Langevin dynamics and derive the time and particle discretization errors. We also demonstrate that two-layer neural networks can learn a union of multiple reproducing kernel Hilbert spaces more efficiently than any kernel method, and that neural networks acquire a data-dependent kernel which aligns with the target function. In addition, we develop a label noise procedure, which converges to the global optimum, and show that the degrees of freedom appears as an implicit regularization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shokichi Takakura;Taiji Suzuki", "authorids": "~Shokichi_Takakura1;~Taiji_Suzuki1", "gender": "M;M", "homepage": "https://github.com/masayoshi64;http://ibis.t.u-tokyo.ac.jp/suzuki/", "dblp": ";08/312", "google_scholar": ";x8osrBsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Shokichi_Takakura1;~Taiji_Suzuki1", "aff": "The University of Tokyo;The University of Tokyo", "aff_domain": "g.ecc.u-tokyo.ac.jp;tokyo.ac.jp", "position": "MS student;Associate Professor", "bibtex": "@inproceedings{\ntakakura2024meanfield,\ntitle={Mean-field Analysis on Two-layer Neural Networks from a Kernel Perspective},\nauthor={Shokichi Takakura and Taiji Suzuki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mphq2jMFLZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 502048, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9946776099422462274&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "g.ecc.u-tokyo.ac.jp;tokyo.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Fine-Grained Causal Dynamics Learning with Quantization for Improving Robustness in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33153", "id": "mrd4e8ZJjm", "proceeding": "https://proceedings.mlr.press/v235/hwang24b.html", "pdf": "https://openreview.net/pdf?id=mrd4e8ZJjm", "openreview": "https://openreview.net/forum?id=mrd4e8ZJjm", "author_site": "Inwoo Hwang, Yunhyeok Kwak, Suhyung Choi, Byoung-Tak Zhang, Sanghack Lee", "tldr": "", "abstract": "Causal dynamics learning has recently emerged as a promising approach to enhancing robustness in reinforcement learning (RL). Typically, the goal is to build a dynamics model that makes predictions based on the causal relationships among the entities. Despite the fact that causal connections often manifest only under certain contexts, existing approaches overlook such fine-grained relationships and lack a detailed understanding of the dynamics. In this work, we propose a novel dynamics model that infers fine-grained causal structures and employs them for prediction, leading to improved robustness in RL.
The key idea is to jointly learn the dynamics model with a discrete latent variable that quantizes the state-action space into subgroups. This leads to recognizing meaningful context that displays sparse dependencies, where causal structures are learned for each subgroup throughout the training. Experimental results demonstrate the robustness of our method to unseen states and locally spurious correlations in downstream tasks where fine-grained causal reasoning is crucial. We further illustrate the effectiveness of our subgroup-based approach with quantization in discovering fine-grained causal relationships compared to prior methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Inwoo Hwang;Yunhyeok Kwak;Suhyung Choi;Byoung-Tak Zhang;Sanghack Lee", "authorids": "~Inwoo_Hwang1;~Yunhyeok_Kwak1;~Suhyung_Choi2;~Byoung-Tak_Zhang1;~Sanghack_Lee1", "gender": ";M;M;M;M", "homepage": "https://iwhwang.github.io;https://yun-kwak.github.io;https://www.github.com/conscious-choi;https://bi.snu.ac.kr/~btzhang/;http://www.sanghacklee.me", "dblp": "317/0732;332/4729;376/2355;09/5682;20/1133", "google_scholar": "MuG6Le8AAAAJ;rhyhnRYAAAAJ;jPcpT2MAAAAJ;sYTUOu8AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0009-0001-5491-3492;;;0000-0001-7137-6126", "linkedin": ";;;;sanghack-lee-65b52a28/", "or_profile": "~Inwoo_Hwang1;~Yunhyeok_Kwak1;~Suhyung_Choi2;~Byoung-Tak_Zhang1;~Sanghack_Lee1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;MS student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nhwang2024finegrained,\ntitle={Fine-Grained Causal Dynamics Learning with Quantization for Improving Robustness in Reinforcement Learning},\nauthor={Inwoo Hwang and Yunhyeok Kwak and Suhyung Choi and Byoung-Tak Zhang and Sanghack Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mrd4e8ZJjm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2637741, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8426167059727066293&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Major-Minor Mean Field Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33152", "id": "mslTE1qgLa", "proceeding": "https://proceedings.mlr.press/v235/cui24a.html", "pdf": "https://openreview.net/pdf?id=mslTE1qgLa", "openreview": "https://openreview.net/forum?id=mslTE1qgLa", "author_site": "Kai Cui, Christian Fabian, Anam Tahir, Heinz Koeppl", "tldr": "", "abstract": "Multi-agent reinforcement learning (MARL) remains difficult to scale to many agents. Recent MARL using Mean Field Control (MFC) provides a tractable and rigorous approach to otherwise difficult cooperative MARL. 
However, the strict MFC assumption of many independent, weakly-interacting agents is too inflexible in practice. We generalize MFC to instead simultaneously model many similar and few complex agents \u2013 as Major-Minor Mean Field Control (M3FC). Theoretically, we give approximation results for finite agent control, and verify the sufficiency of stationary policies for optimality together with a dynamic programming principle. Algorithmically, we propose Major-Minor Mean Field MARL (M3FMARL) for finite agent systems instead of the limiting system. The algorithm is shown to approximate the policy gradient of the underlying M3FC MDP. Finally, we demonstrate its capabilities experimentally in various scenarios. We observe a strong performance in comparison to state-of-the-art policy gradient MARL methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Cui;Christian Fabian;Anam Tahir;Heinz Koeppl", "authorids": "~Kai_Cui3;~Christian_Fabian1;~Anam_Tahir1;~Heinz_Koeppl1", "gender": ";M;F;M", "homepage": ";https://www.bcs.tu-darmstadt.de/team_sos/fabianchristian.en.jsp;https://www.bcs.tu-darmstadt.de/team_sos/tahiranam_sos.en.jsp;", "dblp": ";85/10135-1;;41/6084", "google_scholar": ";https://scholar.google.de/citations?user=hYtlGkMAAAAJ;5ihEd7YAAAAJ;https://scholar.google.de/citations?user=WaPW80kAAAAJ", "orcid": ";0000-0003-4239-3861;0000-0002-5585-0948;", "linkedin": ";https://de.linkedin.com/in/-christian-fabian;;", "or_profile": "~Kai_Cui3;~Christian_Fabian1;~Anam_Tahir1;~Heinz_Koeppl1", "aff": ";Technische Universit\u00e4t Darmstadt;Technical University Darmstadt;TU Darmstadt", "aff_domain": ";tu-darmstadt.de;tudarmstadt.de;tu-darmstadt.de", "position": ";PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\ncui2024majorminor,\ntitle={Major-Minor Mean Field Multi-Agent Reinforcement Learning},\nauthor={Kai Cui and Christian Fabian and Anam Tahir and Heinz Koeppl},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mslTE1qgLa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1859818, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=136805878130073797&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": ";tu-darmstadt.de;tudarmstadt.de;tu-darmstadt.de", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt;Technical University of Darmstadt", "aff_unique_dep": ";", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD;TUD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Gambling-Based Confidence Sequences for Bounded Random Vectors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33151", "id": "mu7Er7f9NQ", "proceeding": "https://proceedings.mlr.press/v235/ryu24a.html", "pdf": "https://openreview.net/pdf?id=mu7Er7f9NQ", "openreview": "https://openreview.net/forum?id=mu7Er7f9NQ", "author_site": "Jongha (Jon) Ryu, Gregory Wornell", "tldr": "", "abstract": "A confidence sequence (CS) is a sequence of confidence sets that contains a target parameter of an underlying stochastic process at any time step with high probability. 
This paper proposes a new approach to constructing CSs for means of bounded multivariate stochastic processes using a general gambling framework, extending the recently established coin toss framework for bounded random processes. The proposed gambling framework provides a general recipe for constructing CSs for categorical and probability-vector-valued observations, as well as for general bounded multidimensional observations through a simple reduction. This paper specifically explores the use of the mixture portfolio, akin to Cover's universal portfolio, in the proposed framework and investigates the properties of the resulting CSs. Simulations demonstrate the tightness of these confidence sequences compared to existing methods. When applied to the sampling-without-replacement setting for finite categorical data, it is shown that the resulting CS based on a universal gambling strategy is provably tighter than that of the posterior-prior ratio martingale proposed by Waudby-Smith and Ramdas.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jongha Jon Ryu;Gregory W. Wornell", "authorids": "~Jongha_Jon_Ryu1;~Gregory_W._Wornell1", "gender": "M;M", "homepage": "https://jongharyu.github.io;https://web.mit.edu/gww/www/", "dblp": "340/4088;94/5969", "google_scholar": "5ZYeWgcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0001-9166-4758", "linkedin": ";", "or_profile": "~Jongha_Jon_Ryu1;~Gregory_Wornell1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nryu2024gamblingbased,\ntitle={Gambling-Based Confidence Sequences for Bounded Random Vectors},\nauthor={Jongha Jon Ryu and Gregory W. Wornell},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mu7Er7f9NQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 718717, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16409951478447522770&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Soft Prompt Recovers Compressed LLMs, Transferably", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33150", "id": "muBJPCIqZT", "proceeding": "https://proceedings.mlr.press/v235/xu24s.html", "pdf": "https://openreview.net/pdf?id=muBJPCIqZT", "openreview": "https://openreview.net/forum?id=muBJPCIqZT", "author_site": "Zhaozhuo Xu, Zirui Liu, Beidi Chen, Shaochen (Henry) Zhong, Yuxin Tang, Jue Wang, Kaixiong Zhou, Xia Hu, Anshumali Shrivastava", "tldr": "", "abstract": "Model compression is one of the most popular approaches to improve the accessibility of Large Language Models (LLMs) by reducing their memory footprint. However, gaining such efficiency benefits often demands extensive engineering effort and intricate designs to mitigate the accompanying performance decline.
In this work, we leverage *(Soft) Prompt Tuning* in its most vanilla form and discover that such conventionally learned soft prompts can recover the performance of compressed LLMs. More surprisingly, we observe this recovery effect to be transferable among different tasks and models (subject to natural tokenizer and dimensionality limitations), resulting in further overhead reduction while subverting the common belief that learned soft prompts are task-specific. Our work is fully orthogonal to and compatible with model compression frameworks such as pruning and quantization, where we enable an up to $8\\times$ compressed LLM (with a joint 4-bit quantization and 50% weight pruning compression) to match its uncompressed counterpart on popular benchmarks. We note that we are the first to reveal that vanilla Parameter-Efficient Fine-Tuning (PEFT) techniques can be utilized in a compression recovery context, opening a new line of opportunities for advancing model accessibility while freeing fellow researchers from previously necessary engineering burdens and constraints. The code is available at https://github.com/zirui-ray-liu/compress-then-prompt.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhaozhuo Xu;Zirui Liu;Beidi Chen;Shaochen Zhong;Yuxin Tang;Jue WANG;Kaixiong Zhou;Xia Hu;Anshumali Shrivastava", "authorids": "~Zhaozhuo_Xu1;~Zirui_Liu1;~Beidi_Chen1;~Shaochen_Zhong1;~Yuxin_Tang2;~Jue_WANG1;~Kaixiong_Zhou1;~Xia_Hu4;~Anshumali_Shrivastava1", "gender": "M;M;F;M;;M;M;;M", "homepage": "https://ottovonxu.github.io/;https://zirui-ray-liu.github.io/;https://www.andrew.cmu.edu/user/beidic/;https://openreview.net/profile?id=~Shaochen_Zhong1;;https://juewang.me/about/;https://kaixiong-zhou.github.io/;;https://www.cs.rice.edu/~as143/", "dblp": "195/4352;196/8629-1.html;192/1339;326/7286.html;;69/393-19;178/7315;;63/9828", "google_scholar": "7tDlVAsAAAAJ;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com/citations?hl=en;;PykI8xcAAAAJ;zMspIjIAAAAJ;;https://scholar.google.com.tw/citations?user=SGT23RAAAAAJ", "orcid": ";;;;;;0000-0001-5226-8736;;", "linkedin": ";;;shaochen-henry-zhong-96a941249/;;;;;", "or_profile": "~Zhaozhuo_Xu1;~Zirui_Liu1;~Beidi_Chen1;~Shaochen_Zhong1;~Yuxin_Tang2;~Jue_WANG1;~Kaixiong_Zhou1;~Xia_Hu4;~Anshumali_Shrivastava1", "aff": "Stevens Institute of Technology;Rice University;Meta Facebook;Rice University;;Together AI;Massachusetts Institute of Technology;;ThirdAI Corp.", "aff_domain": "stevens.edu;rice.edu;fb.com;rice.edu;;together.ai;mit.edu;;thirdai.com", "position": "Assistant Professor;PhD student;Researcher;PhD student;;Researcher;Postdoc;;CEO", "bibtex": "@inproceedings{\nxu2024soft,\ntitle={Soft Prompt Recovers Compressed {LLM}s, Transferably},\nauthor={Zhaozhuo Xu and Zirui Liu and Beidi Chen and Shaochen Zhong and Yuxin Tang and Jue WANG and Kaixiong Zhou and Xia Hu and Anshumali Shrivastava},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=muBJPCIqZT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 776265, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15341159390038407207&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "stevens.edu;rice.edu;fb.com;rice.edu;;together.ai;mit.edu;;thirdai.com", "author_num": 9, "aff_unique_index": "0;1;2;1;3;4;5", "aff_unique_norm": "Stevens Institute of
Technology;Rice University;Meta;Together AI;Massachusetts Institute of Technology;ThirdAI Corp.", "aff_unique_dep": ";;Meta Platforms, Inc.;;;", "aff_unique_url": "https://www.stevens.edu;https://www.rice.edu;https://meta.com;https://www.together.ai;https://web.mit.edu;", "aff_unique_abbr": "SIT;Rice;Meta;Together AI;MIT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Surrogates for Offline Black-Box Optimization via Gradient Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33149", "id": "mv9beA1wDF", "proceeding": "https://proceedings.mlr.press/v235/hoang24a.html", "pdf": "https://openreview.net/pdf?id=mv9beA1wDF", "openreview": "https://openreview.net/forum?id=mv9beA1wDF", "author_site": "Minh Hoang, Azza Fadhel, Aryan Deshwal, Jana Doppa, Nghia Hoang", "tldr": "", "abstract": "The offline design optimization problem arises in numerous science and engineering applications including material and chemical design, where expensive online experimentation necessitates the use of *in silico* surrogate functions to predict and maximize the target objective over candidate designs. Although these surrogates can be learned from offline data, their predictions are often inaccurate outside the offline data regime. This challenge raises a fundamental question about the impact of an imperfect surrogate model on the performance gap between its optima and the true optima, and to what extent the performance loss can be mitigated. Although prior work developed methods to improve the robustness of surrogate models and their associated optimization processes, a provably quantifiable relationship between an imperfect surrogate and the corresponding performance gap, as well as whether prior methods directly address it, remains elusive. To shed light on this important question, we present a theoretical framework to understand offline black-box optimization, by explicitly bounding the optimization quality based on how well the surrogate matches the latent gradient field that underlies the offline data.
Inspired by our theoretical analysis, we propose a principled black-box gradient matching algorithm to create effective surrogate models for offline optimization, improving over prior approaches on various real-world benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Minh Hoang;Azza Fadhel;Aryan Deshwal;Jana Doppa;Trong Nghia Hoang", "authorids": "~Minh_Hoang1;~Azza_Fadhel1;~Aryan_Deshwal1;~Jana_Doppa1;~Trong_Nghia_Hoang1", "gender": "M;F;M;;", "homepage": ";;https://aryandeshwal.github.io/;;", "dblp": ";;246/3012.html;;", "google_scholar": "56Mb6DY0_NUC;;wNEYBrAAAAAJ;;", "orcid": ";;;;", "linkedin": ";azza-fadhel-594a21246/;aryan-deshwal-a27835120/;;", "or_profile": "~Minh_Hoang1;~Azza_Fadhel1;~Aryan_Deshwal1;~Jana_Doppa1;~Trong_Nghia_Hoang1", "aff": "Princeton University;Washington State University at Pullman;University of Minnesota - Twin Cities;;", "aff_domain": "princeton.edu;wsu.edu;umn.edu;;", "position": "Postdoc;PhD student;Assistant Professor;;", "bibtex": "@inproceedings{\nhoang2024learning,\ntitle={Learning Surrogates for Offline Black-Box Optimization via Gradient Matching},\nauthor={Minh Hoang and Azza Fadhel and Aryan Deshwal and Jana Doppa and Trong Nghia Hoang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mv9beA1wDF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1239499, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13515851983987321970&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "princeton.edu;wsu.edu;umn.edu;;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Princeton University;Washington State University;University of Minnesota", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://wsu.edu;https://www.minnesota.edu", "aff_unique_abbr": "Princeton;WSU;UMN", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Pullman;Twin Cities", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Is Epistemic Uncertainty Faithfully Represented by Evidential Deep Learning Methods?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33148", "id": "mxjB0LIgpT", "proceeding": "https://proceedings.mlr.press/v235/juergens24a.html", "pdf": "https://openreview.net/pdf?id=mxjB0LIgpT", "openreview": "https://openreview.net/forum?id=mxjB0LIgpT", "author_site": "Mira Juergens, Nis Meinert, Viktor Bengs, Eyke H\u00fcllermeier, Willem Waegeman", "tldr": "", "abstract": "Trustworthy ML systems should return not only accurate predictions, but also a reliable representation of their uncertainty. Bayesian methods are commonly used to quantify both aleatoric and epistemic uncertainty, but alternative approaches, such as evidential deep learning methods, have become popular in recent years. The latter group of methods in essence extends empirical risk minimization (ERM) for predicting second-order probability distributions over outcomes, from which measures of epistemic (and aleatoric) uncertainty can be extracted. This paper presents novel theoretical insights into evidential deep learning, highlighting the difficulties in optimizing second-order loss functions and interpreting the resulting epistemic uncertainty measures.
With a systematic setup that covers a wide range of approaches for classification, regression and counts, it provides novel insights into issues of identifiability and convergence in second-order loss minimization, and the relative (rather than absolute) nature of epistemic uncertainty measures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mira Juergens;Nis Meinert;Viktor Bengs;Eyke H\u00fcllermeier;Willem Waegeman", "authorids": "~Mira_Juergens1;~Nis_Meinert1;~Viktor_Bengs1;~Eyke_H\u00fcllermeier1;~Willem_Waegeman1", "gender": "F;M;M;M;M", "homepage": ";;https://www.kiml.ifi.lmu.de/;https://cs.uni-paderborn.de/index.php?id=60202;http://bioml.ugent.be", "dblp": ";;244/9484;h/EykeHullermeier;02/2445", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;J1eEtpwAAAAJ;https://scholar.google.de/citations?user=usVJeNN3xFAC;https://scholar.google.be/citations?user=jdjZppMAAAAJ", "orcid": ";0000-0002-4712-9579;0000-0001-6988-6186;0000-0002-9944-4108;", "linkedin": ";dr-nis-meinert/;;;", "or_profile": "~Mira_Juergens1;~Nis_Meinert1;~Viktor_Bengs1;~Eyke_H\u00fcllermeier1;~Willem_Waegeman1", "aff": "Universiteit Gent;German Aerospace Center (DLR);Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ghent University", "aff_domain": "ugent.be;dlr.de;lmu.de;lmu.de;ugent.be", "position": "PhD student;Postdoc;Postdoc;Full Professor;Associate Professor", "bibtex": "@inproceedings{\njuergens2024is,\ntitle={Is Epistemic Uncertainty Faithfully Represented by Evidential Deep Learning Methods?},\nauthor={Mira Juergens and Nis Meinert and Viktor Bengs and Eyke H{\\\"u}llermeier and Willem Waegeman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mxjB0LIgpT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8939575, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12608831016798697692&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ugent.be;dlr.de;lmu.de;lmu.de;ugent.be", "author_num": 5, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "University of Ghent;German Aerospace Center;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ghent University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ugent.be/en;https://www.dlr.de;https://www.lmu.de;https://www.ugent.be/en", "aff_unique_abbr": "UGent;DLR;LMU;UGent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Belgium;Germany" }, { "title": "BeigeMaps: Behavioral Eigenmaps for Reinforcement Learning from Images", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33147", "id": "myCgfQZzbc", "proceeding": "https://proceedings.mlr.press/v235/adhikary24a.html", "pdf": "https://openreview.net/pdf?id=myCgfQZzbc", "openreview": "https://openreview.net/forum?id=myCgfQZzbc", "author_site": "Sandesh Adhikary, Anqi Li, Byron Boots", "tldr": "", "abstract": "Training reinforcement learning (RL) agents directly from high-dimensional image observations continues to be a challenging problem. A recent line of work on behavioral distances proposes to learn representations that encode behavioral similarities quantified by the bisimulation metric.
By learning an isometric mapping to a lower dimensional space that preserves this metric, such methods attempt to learn representations that group together functionally similar states. However, such an isometric mapping may not exist, making the learning objective ill-defined. We propose an alternative objective that allows distortions in long-range distances, while preserving *local* metric structure -- inducing representations that highlight natural clusters in the state space. This leads to new representations, which we term Behavioral Eigenmaps (BeigeMaps), corresponding to the eigenfunctions of similarity kernels induced by behavioral distances. We empirically demonstrate that when added as a drop-in modification, BeigeMaps improve the policy performance of prior behavioral distance based RL algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sandesh Adhikary;Anqi Li;Byron Boots", "authorids": "~Sandesh_Adhikary1;~Anqi_Li1;~Byron_Boots1", "gender": "M;;", "homepage": "https://sandeshadhikary.github.io/;https://anqili.github.io;", "dblp": ";;", "google_scholar": "8VUrBI4AAAAJ;HG08FCMAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sandesh_Adhikary1;~Anqi_Li1;~Byron_Boots1", "aff": "Department of Computer Science, University of Washington;NVIDIA;", "aff_domain": "cs.washington.edu;nvidia.com;", "position": "PhD student;Researcher;", "bibtex": "@inproceedings{\nadhikary2024beigemaps,\ntitle={BeigeMaps: Behavioral Eigenmaps for Reinforcement Learning from Images},\nauthor={Sandesh Adhikary and Anqi Li and Byron Boots},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=myCgfQZzbc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3070485, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9svV2NxT0U8J:scholar.google.com/&scioq=BeigeMaps:+Behavioral+Eigenmaps+for+Reinforcement+Learning+from+Images&hl=en&as_sdt=0,10", "gs_version_total": 5, "email": "cs.washington.edu;nvidia.com;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Washington;NVIDIA", "aff_unique_dep": "Department of Computer Science;NVIDIA Corporation", "aff_unique_url": "https://www.washington.edu;https://www.nvidia.com", "aff_unique_abbr": "UW;NVIDIA", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Regret Minimization in Offline Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33146", "id": "mz55Ox0Igz", "proceeding": "https://proceedings.mlr.press/v235/petrik24a.html", "pdf": "https://openreview.net/pdf?id=mz55Ox0Igz", "openreview": "https://openreview.net/forum?id=mz55Ox0Igz", "author_site": "Marek Petrik, Guy Tennenholtz, Mohammad Ghavamzadeh", "tldr": "", "abstract": "We study how to make decisions that minimize Bayesian regret in offline linear bandits. Prior work suggests that one must take actions with maximum lower confidence bound (LCB) on their reward. We argue that reliance on LCB is inherently flawed in this setting and propose a new algorithm that directly minimizes upper bounds on the Bayesian regret using efficient conic optimization solvers. Our bounds build heavily on new connections to monetary risk measures.
Proving a matching lower bound, we show that our upper bounds are tight, and by minimizing them we are guaranteed to outperform the LCB approach. Our numerical results on synthetic domains confirm that our approach is superior to maximizing LCB.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marek Petrik;Guy Tennenholtz;Mohammad Ghavamzadeh", "authorids": "~Marek_Petrik2;~Guy_Tennenholtz4;~Mohammad_Ghavamzadeh2", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Marek_Petrik2;~Guy_Tennenholtz4;~Mohammad_Ghavamzadeh2", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npetrik2024bayesian,\ntitle={Bayesian Regret Minimization in Offline Bandits},\nauthor={Marek Petrik and Guy Tennenholtz and Mohammad Ghavamzadeh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mz55Ox0Igz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 616902, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Jocz1UdtD6YJ:scholar.google.com/&scioq=Bayesian+Regret+Minimization+in+Offline+Bandits&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": ";;", "author_num": 3 }, { "title": "Rejuvenating image-GPT as Strong Visual Representation Learners", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33145", "id": "mzGtunvpJH", "proceeding": "https://proceedings.mlr.press/v235/ren24d.html", "pdf": "https://openreview.net/pdf?id=mzGtunvpJH", "openreview": "https://openreview.net/forum?id=mzGtunvpJH", "author_site": "Sucheng Ren, Zeyu Wang, Hongru Zhu, Junfei Xiao, Alan Yuille, Cihang Xie", "tldr": "", "abstract": "This paper enhances image-GPT (iGPT), one of the pioneering works that introduce autoregressive pretraining to predict the next pixels for visual representation learning. Two simple yet essential changes are made. First, we shift the prediction target from raw pixels to semantic tokens, enabling a higher-level understanding of visual content. Second, we supplement the autoregressive modeling by instructing the model to predict not only the next tokens but also the visible tokens. This pipeline is particularly effective when semantic tokens are encoded by discriminatively trained models, such as CLIP. We introduce this novel approach as D-iGPT. Extensive experiments showcase that D-iGPT excels as a strong learner of visual representations: A notable achievement is its compelling performance on the ImageNet-1K dataset --- by training on publicly available datasets, D-iGPT unprecedentedly achieves **90.0%** top-1 accuracy with a vanilla ViT-H. Additionally, D-iGPT shows strong generalization on downstream tasks.
Code is available at https://github.com/OliverRensu/D-iGPT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sucheng Ren;Zeyu Wang;Hongru Zhu;Junfei Xiao;Alan Yuille;Cihang Xie", "authorids": "~Sucheng_Ren1;~Zeyu_Wang2;~Hongru_Zhu1;~Junfei_Xiao1;~Alan_Yuille1;~Cihang_Xie3", "gender": "M;;M;M;M;", "homepage": "https://oliverren.netlify.com/;;;;;", "dblp": "270/9042;;80/2869;246/7952;y/AlanLYuille;", "google_scholar": "Hbf-SoAAAAAJ;;G8NZJLIAAAAJ;rv-aTqkAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Sucheng_Ren1;~Zeyu_Wang2;~Hongru_Zhu1;~Junfei_Xiao1;~Alan_Yuille1;~Cihang_Xie3", "aff": "Johns Hopkins University;;Johns Hopkins University;Google;Johns Hopkins University;", "aff_domain": "jh.edu;;jhu.edu;google.com;johnshopkins.edu;", "position": "PhD student;;PhD student;Researcher;Full Professor;", "bibtex": "@inproceedings{\nren2024rejuvenating,\ntitle={Rejuvenating image-{GPT} as Strong Visual Representation Learners},\nauthor={Sucheng Ren and Zeyu Wang and Hongru Zhu and Junfei Xiao and Alan Yuille and Cihang Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=mzGtunvpJH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 550766, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18435393445057734417&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "jh.edu;;jhu.edu;google.com;johnshopkins.edu;", "author_num": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Johns Hopkins University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.jhu.edu;https://www.google.com", "aff_unique_abbr": "JHU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Multiply-Robust Causal Change Attribution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33144", "id": "n2eppIzHlL", "proceeding": "https://proceedings.mlr.press/v235/quintas-martinez24a.html", "pdf": "https://openreview.net/pdf?id=n2eppIzHlL", "openreview": "https://openreview.net/forum?id=n2eppIzHlL", "author_site": "V\u00edctor Quintas-Mart\u00ednez, Mohammad Bahadori, Eduardo Santiago, Jeff Mu, David Heckerman", "tldr": "", "abstract": "Comparing two samples of data, we observe a change in the distribution of an outcome variable. In the presence of multiple explanatory variables, how much of the change can be explained by each possible cause? We develop a new estimation strategy that, given a causal model, combines regression and re-weighting methods to quantify the contribution of each causal mechanism. Our proposed methodology is multiply robust, meaning that it still recovers the target parameter under partial misspecification. We prove that our estimator is consistent and asymptotically normal. Moreover, it can be incorporated into existing frameworks for causal attribution, such as Shapley values, which will inherit the consistency and large-sample distribution properties. Our method demonstrates excellent performance in Monte Carlo simulations, and we show its usefulness in an empirical application. 
Our method is implemented as part of the Python library ``DoWhy`` (Sharma & Kiciman, 2020; Bl\u00f6baum et al., 2022).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Victor Quintas-Martinez;Mohammad Taha Bahadori;Eduardo Santiago;Jeff Mu;David Heckerman", "authorids": "~Victor_Quintas-Martinez1;~Mohammad_Taha_Bahadori1;santiedu@amazon.com;jefmu@amazon.com;heckerma@amazon.com", "gender": "M;M;;;", "homepage": "https://economics.mit.edu/faculty;http://faculty.washington.edu/bahadori/;;;", "dblp": ";28/10813.html;;;", "google_scholar": ";tlZvhyoAAAAJ;;;", "orcid": ";;;;", "linkedin": ";tahabahadori/;;;", "or_profile": "~Victor_Quintas-Martinez1;~Mohammad_Taha_Bahadori1;santiedu@amazon.com;jefmu@amazon.com;heckerma@amazon.com", "aff": "Massachusetts Institute of Technology;Amazon;;;", "aff_domain": "mit.edu;amazon.com;;;", "position": "PhD student;Scientist;;;", "bibtex": "@inproceedings{\nquintas-martinez2024multiplyrobust,\ntitle={Multiply-Robust Causal Change Attribution},\nauthor={Victor Quintas-Martinez and Mohammad Taha Bahadori and Eduardo Santiago and Jeff Mu and David Heckerman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=n2eppIzHlL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 487261, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13132897213899137949&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;amazon.com;;;", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://web.mit.edu;https://www.amazon.com", "aff_unique_abbr": "MIT;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning the Uncertainty Sets of Linear Control Systems via Set Membership: A Non-asymptotic Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33143", "id": "n2kq2EOHFE", "proceeding": "https://proceedings.mlr.press/v235/li24ci.html", "pdf": "https://openreview.net/pdf?id=n2kq2EOHFE", "openreview": "https://openreview.net/forum?id=n2kq2EOHFE", "author_site": "Yingying Li, Jing Yu, Lauren Conger, Taylan Kargin, Adam Wierman", "tldr": "", "abstract": "This paper studies uncertainty set estimation for unknown linear systems. Uncertainty sets are crucial for the quality of robust control since they directly influence the conservativeness of the control design. Departing from the confidence region analysis of least squares estimation, this paper focuses on set membership estimation (SME). Though good numerical performances have attracted applications of SME in the control literature, the non-asymptotic convergence rate of SME for linear systems remains an open question. This paper provides the first convergence rate bounds for SME and discusses variations of SME under relaxed assumptions. 
We also provide numerical results demonstrating SME's practical promise.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yingying Li;Jing Yu;Lauren Conger;Taylan Kargin;Adam Wierman", "authorids": "~Yingying_Li3;~Jing_Yu1;~Lauren_Conger1;~Taylan_Kargin1;~Adam_Wierman1", "gender": "F;F;M;M;", "homepage": "https://yingying.li;https://www.jingyu.io/;https://tkargin.github.io;https://adamwierman.com/;https://leconger.github.io/", "dblp": "63/5869;42/6466-10;322/5556;56/4447;", "google_scholar": ";akiDVE8AAAAJ;5VpXWyIAAAAJ;4OvOdSgAAAAJ;Iv6uAdMAAAAJ", "orcid": ";;0000-0001-6744-654X;0000-0002-5923-0199;", "linkedin": ";jing-yu-32347979/;taylan-kargin/;adam-wierman-a529474/;", "or_profile": "~Yingying_Li3;~Jing_Yu1;~Taylan_Kargin1;~Adam_Wierman1;~Lauren_E_Conger1", "aff": "University of Illinois, Urbana Champaign;California Institute of Technology;California Institute of Technology;California Institute of Technology;California Institute of Technology", "aff_domain": "illinois.edu;caltech.edu;caltech.edu;caltech.edu;caltech.edu", "position": "Assistant Professor;PhD student;PhD student;Professor;PhD student", "bibtex": "@inproceedings{\nli2024learning,\ntitle={Learning the Uncertainty Sets of Linear Control Systems via Set Membership: A Non-asymptotic Analysis},\nauthor={Yingying Li and Jing Yu and Lauren Conger and Taylan Kargin and Adam Wierman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=n2kq2EOHFE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 906134, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7938149545316369216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "illinois.edu;caltech.edu;caltech.edu;caltech.edu;caltech.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.caltech.edu", "aff_unique_abbr": "UIUC;Caltech", "aff_campus_unique_index": "0;1;1;1;1", "aff_campus_unique": "Urbana-Champaign;Pasadena", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Solving Hierarchical Information-Sharing Dec-POMDPs: An Extensive-Form Game Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33142", "id": "n3smZl8itR", "proceeding": "https://proceedings.mlr.press/v235/peralez24a.html", "pdf": "https://openreview.net/pdf?id=n3smZl8itR", "openreview": "https://openreview.net/forum?id=n3smZl8itR", "author_site": "Johan Peralez, Aur\u00e9lien Delage, Olivier Buffet, Jilles Dibangoye", "tldr": "", "abstract": "A recent theory shows that a multi-player decentralized partially observable Markov decision process can be transformed into an equivalent single-player game, enabling the application of Bellman's principle of optimality to solve the single-player game by breaking it down into single-stage subgames. However, this approach entangles the decision variables of all players at each single-stage subgame, resulting in backups with a double-exponential complexity. This paper demonstrates how to disentangle these decision variables while maintaining optimality under hierarchical information sharing, a prominent management style in our society. 
To achieve this, we apply the principle of optimality to solve any single-stage subgame by breaking it down further into smaller subgames, enabling us to make decisions for one player at a time. Our approach reveals that extensive-form games always exist with solutions to a single-stage subgame, significantly reducing time complexity. Our experimental results show that the algorithms leveraging these findings can scale up to much larger multi-player games without compromising optimality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johan Peralez;Aur\u00e9lien Delage;Olivier Buffet;Jilles Steeve Dibangoye", "authorids": "~Johan_Peralez1;~Aur\u00e9lien_Delage1;~Olivier_Buffet1;~Jilles_Steeve_Dibangoye1", "gender": ";Not Specified;M;M", "homepage": "https://scholar.google.com/citations?user=Lc6LJ7MAAAAJ&hl=fr;;https://members.loria.fr/olivier.buffet/;http://dibangoye.fr", "dblp": ";268/6635;35/5418;52/7118", "google_scholar": "Lc6LJ7MAAAAJ;;https://scholar.google.fr/citations?user=3XG9JKEAAAAJ;https://scholar.google.fr/citations?user=iQ3v57QAAAAJ", "orcid": ";;0000-0002-5072-5857;", "linkedin": ";aur%C3%A9lien-delage-49b091193/;;", "or_profile": "~Johan_Peralez1;~Aur\u00e9lien_Delage1;~Olivier_Buffet1;~Jilles_Steeve_Dibangoye1", "aff": "Institut National des Sciences Appliqu\u00e9es de Lyon;INSA de Lyon;INRIA;INSA de Lyon", "aff_domain": "insa-lyon.fr;insa-lyon.fr;inria.fr;insa-lyon.fr", "position": "Postdoc;ph;Researcher;Associate Professor", "bibtex": "@inproceedings{\nperalez2024solving,\ntitle={Solving Hierarchical Information-Sharing Dec-{POMDP}s: An Extensive-Form Game Approach},\nauthor={Johan Peralez and Aur{\\'e}lien Delage and Olivier Buffet and Jilles Steeve Dibangoye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=n3smZl8itR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 892329, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6326979332135114917&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "insa-lyon.fr;insa-lyon.fr;inria.fr;insa-lyon.fr", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Institut National des Sciences Appliqu\u00e9es;INSA de Lyon;INRIA", "aff_unique_dep": ";;", "aff_unique_url": "https://www.insa-lyon.fr;https://www.insa-lyon.fr;https://www.inria.fr", "aff_unique_abbr": "INSA Lyon;INSA Lyon;INRIA", "aff_campus_unique_index": "0", "aff_campus_unique": "Lyon;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Parameterized Physics-informed Neural Networks for Parameterized PDEs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33141", "id": "n3yYrtt9U7", "proceeding": "https://proceedings.mlr.press/v235/cho24b.html", "pdf": "https://openreview.net/pdf?id=n3yYrtt9U7", "openreview": "https://openreview.net/forum?id=n3yYrtt9U7", "author_site": "Woojin Cho, Minju Jo, Haksoo Lim, Kookjin Lee, Dongeun Lee, Sanghyun Hong, Noseong Park", "tldr": "", "abstract": "Complex physical systems are often described by partial differential equations (PDEs) that depend on parameters such as the Reynolds number in fluid mechanics. In applications such as design optimization or uncertainty quantification, solutions of those PDEs need to be evaluated at numerous points in the parameter space.
While physics-informed neural networks (PINNs) have emerged as a new strong competitor as a surrogate, their usage in this scenario remains underexplored due to the inherent need for repetitive and time-consuming training. In this paper, we address this problem by proposing a novel extension, parameterized physics-informed neural networks (P$^2$INNs). P$^2$INNs enable modeling the solutions of parameterized PDEs via explicitly encoding a latent representation of PDE parameters. With the extensive empirical evaluation, we demonstrate that P$^2$INNs outperform the baselines both in accuracy and parameter efficiency on benchmark 1D and 2D parameterized PDEs and are also effective in overcoming the known \u201cfailure modes\u201d.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Woojin Cho;Minju Jo;Haksoo Lim;Kookjin Lee;Dongeun Lee;Sanghyun Hong;Noseong Park", "authorids": "~Woojin_Cho1;~Minju_Jo1;~Haksoo_Lim1;~Kookjin_Lee1;~Dongeun_Lee1;~Sanghyun_Hong1;~Noseong_Park1", "gender": "M;F;M;M;M;M;", "homepage": "https://woojin-cho.github.io/;;;https://scholar.google.com/citations?hl=en&user=KL89hVQAAAAJ&view_op=list_works;;http://www.sanghyun-hong.com;", "dblp": ";236/8419.html;;122/5103;62/688;135/8991;", "google_scholar": "cqIj5tQAAAAJ;xLVtvn8AAAAJ;ZJvEyqwAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;", "orcid": ";0000-0002-7908-5005;;;;;", "linkedin": "woojin-cho-02b905264/;;;;;;", "or_profile": "~Woojin_Cho1;~Minju_Jo1;~Haksoo_Lim1;~Kookjin_Lee1;~Dongeun_Lee1;~Sanghyun_Hong1;~Noseong_Park1", "aff": "Yonsei University;LG CNS;Yonsei University;Arizona State University;East Texas A&M University;Oregon State University;", "aff_domain": "yonsei.ac.kr;lgcns.com;yonsei.ac.kr;asu.edu;tamuc.edu;oregonstate.edu;", "position": "MS student;Researcher;MS student;Assistant Professor;Associate Professor;Assistant Professor;", "bibtex": "@inproceedings{\ncho2024parameterized,\ntitle={Parameterized Physics-informed Neural Networks for Parameterized {PDE}s},\nauthor={Woojin Cho and Minju Jo and Haksoo Lim and Kookjin Lee and Dongeun Lee and Sanghyun Hong and Noseong Park},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=n3yYrtt9U7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4984901, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11937657037465535758&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "yonsei.ac.kr;lgcns.com;yonsei.ac.kr;asu.edu;tamuc.edu;oregonstate.edu;", "author_num": 7, "aff_unique_index": "0;1;0;2;3;4", "aff_unique_norm": "Yonsei University;LG;Arizona State University;East Texas A&M University;Oregon State University", "aff_unique_dep": ";LG CNS;;;", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.lgcns.com;https://www.asu.edu;https://www.etam.edu;https://oregonstate.edu", "aff_unique_abbr": "Yonsei;LG CNS;ASU;ETAMU;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Decoding-time Realignment of Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33140", "id": "n8g6WMxt09", "proceeding": "https://proceedings.mlr.press/v235/liu24r.html", "pdf": "https://openreview.net/pdf?id=n8g6WMxt09", "openreview": 
"https://openreview.net/forum?id=n8g6WMxt09", "author_site": "Tianlin Liu, Shangmin Guo, Leonardo Martins Bianco, Daniele Calandriello, Quentin Berthet, Felipe Llinares-Lopez, Jessica Hoffmann, Lucas Dixon, Michal Valko, Mathieu Blondel", "tldr": "", "abstract": "Aligning language models with human preferences is crucial for reducing errors and biases in these models. Alignment techniques, such as reinforcement learning from human feedback (RLHF), are typically cast as optimizing a tradeoff between human preference rewards and a proximity regularization term that encourages staying close to the unaligned model. Selecting an appropriate level of regularization is critical: insufficient regularization can lead to reduced model capabilities due to reward hacking, whereas excessive regularization hinders alignment. Traditional methods for finding the optimal regularization level require retraining multiple models with varying regularization strengths. This process, however, is resource-intensive, especially for large models. To address this challenge, we propose decoding-time realignment (DeRa), a simple method to explore and evaluate different regularization strengths in aligned models without retraining. DeRa enables control over the degree of alignment, allowing users to smoothly transition between unaligned and aligned models. It also enhances the efficiency of hyperparameter tuning by enabling the identification of effective regularization strengths using a validation dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianlin Liu;Shangmin Guo;Leonardo Bianco;Daniele Calandriello;Quentin Berthet;Felipe Llinares-L\u00f3pez;Jessica Hoffmann;Lucas Dixon;Michal Valko;Mathieu Blondel", "authorids": "~Tianlin_Liu2;~Shangmin_Guo1;leonardo.martins-bianco@universite-paris-saclay.fr;~Daniele_Calandriello1;~Quentin_Berthet2;~Felipe_Llinares-L\u00f3pez1;~Jessica_Hoffmann1;~Lucas_Dixon1;~Michal_Valko1;~Mathieu_Blondel1", "gender": "M;M;;M;M;;F;Not Specified;M;", "homepage": "http://www.tianlinliu.com;;;;http://q-berthet.github.io/;;https://www.cs.utexas.edu/~hoffmann/;https://research.google/people/lucas-dixon/;https://misovalko.github.io/research.html;http://www.mblondel.org", "dblp": "20/7667;183/0949;;129/1542;129/1262;;209/9871;39/6853;03/5455;05/8614.html", "google_scholar": ";cpOrbSoAAAAJ;;;bHwGZjcAAAAJ;;XQxn9dMAAAAJ;nDs3-TMAAAAJ;jrazNCQAAAAJ;C0EKzrUAAAAJ", "orcid": ";0000-0003-1716-0994;;;;;;0000-0003-1094-1675;;", "linkedin": ";;;;;;;lucas-dixon-94070354/;michalvalko/;", "or_profile": "~Tianlin_Liu2;~Shangmin_Guo1;leonardo.martins-bianco@universite-paris-saclay.fr;~Daniele_Calandriello1;~Quentin_Berthet2;~Felipe_Llinares-L\u00f3pez1;~Jessica_Hoffmann1;~Lucas_Dixon1;~Michal_Valko1;~Mathieu_Blondel1", "aff": "University of Basel;University of Edinburgh;;Google DeepMind;Google DeepMind;;Google;Research, Google;Meta;Google", "aff_domain": "unibas.ch;ed.ac.uk;;deepmind.com;google.com;;google.com;research.google.com;meta.com;google.com", "position": "PhD student;PhD student;;Researcher;Researcher;;Researcher;Researcher;Principal Researcher;Research scientist", "bibtex": "@inproceedings{\nliu2024decodingtime,\ntitle={Decoding-time Realignment of Language Models},\nauthor={Tianlin Liu and Shangmin Guo and Leonardo Bianco and Daniele Calandriello and Quentin Berthet and Felipe Llinares-L{\\'o}pez and Jessica Hoffmann and Lucas Dixon and Michal Valko and Mathieu Blondel},\nbooktitle={Forty-first International Conference on Machine 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=n8g6WMxt09}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2252276, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12412715148830183049&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "unibas.ch;ed.ac.uk;;deepmind.com;google.com;;google.com;research.google.com;meta.com;google.com", "author_num": 10, "aff_unique_index": "0;1;2;2;2;2;3;2", "aff_unique_norm": "University of Basel;University of Edinburgh;Google;Meta", "aff_unique_dep": ";;Google DeepMind;Meta Platforms, Inc.", "aff_unique_url": "https://www.unibas.ch;https://www.ed.ac.uk;https://deepmind.com;https://meta.com", "aff_unique_abbr": "UniBas;Edinburgh;DeepMind;Meta", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;2;2;2;2", "aff_country_unique": "Switzerland;United Kingdom;United States" }, { "title": "Scaling Down Deep Learning with MNIST-1D", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33139", "id": "n9pru4bJU9", "proceeding": "https://proceedings.mlr.press/v235/greydanus24a.html", "pdf": "https://openreview.net/pdf?id=n9pru4bJU9", "openreview": "https://openreview.net/forum?id=n9pru4bJU9", "author_site": "Sam Greydanus, Dmitry Kobak", "tldr": "", "abstract": "Although deep learning models have taken on commercial and political relevance, key aspects of their training and operation remain poorly understood. This has sparked interest in science of deep learning projects, many of which require large amounts of time, money, and electricity. But how much of this research really needs to occur at scale? In this paper, we introduce MNIST-1D: a minimalist, procedurally generated, low-memory, and low-compute alternative to classic deep learning benchmarks. Although the dimensionality of MNIST-1D is only 40 and its default training set size only 4000, MNIST-1D can be used to study inductive biases of different deep architectures, find lottery tickets, observe deep double descent, metalearn an activation function, and demonstrate guillotine regularization in self-supervised learning. 
All these experiments can be conducted on a GPU or often even on a CPU within minutes, allowing for fast prototyping, educational use cases, and cutting-edge research on a low budget.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel James Greydanus;Dmitry Kobak", "authorids": "~Samuel_James_Greydanus1;~Dmitry_Kobak2", "gender": "M;", "homepage": "https://greydanus.github.io/about.html;https://dkobak.github.io/", "dblp": "205/2640;236/5191", "google_scholar": "SECnlpMAAAAJ;BUQbD5kAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Samuel_James_Greydanus1;~Dmitry_Kobak2", "aff": ";Eberhard-Karls-Universit\u00e4t T\u00fcbingen", "aff_domain": ";uni-tuebingen.de", "position": ";Researcher", "bibtex": "@inproceedings{\ngreydanus2024scaling,\ntitle={Scaling Down Deep Learning with {MNIST}-1D},\nauthor={Samuel James Greydanus and Dmitry Kobak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=n9pru4bJU9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1587682, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16090150116901873654&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 6, "email": ";uni-tuebingen.de", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen", "aff_campus_unique_index": "0", "aff_campus_unique": "T\u00fcbingen", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Position: Enforced Amnesia as a Way to Mitigate the Potential Risk of Silent Suffering in the Conscious AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33138", "id": "nACGn4US1R", "proceeding": "https://proceedings.mlr.press/v235/tkachenko24a.html", "pdf": "https://openreview.net/pdf?id=nACGn4US1R", "openreview": "https://openreview.net/forum?id=nACGn4US1R", "tldr": "", "abstract": "Science fiction has explored the possibility of a conscious self-aware mind being locked in silent suffering for prolonged periods of time. Unfortunately, we still do not have a reliable test for the presence of consciousness in information processing systems. Even in the case of humans, our confidence in the presence of consciousness in specific individuals is based mainly on their self-reports and our own subjective experiences and the expectation that other beings like us should share them. Considering our limited understanding of consciousness and some academic theories suggesting consciousness may be an emergent correlate of any complex-enough information processing, it is not impossible that an artificial intelligence (AI) system, such as a large language model (LLM), may be undergoing some, perhaps rudimentary, conscious experience. Given the tedious tasks often assigned to AI, such conscious experience may be highly unpleasant. Such unobserved suffering of a conscious being would be viewed as morally wrong by at least some ethicists - even if it has no practical effects on human users of AI. This paper proposes a method to mitigate the risk of an AI suffering in silence without needing to confirm if the AI is actually conscious.
Our core postulate is that in all known real-world information processing systems, for a past experience to affect an agent in the present, that experience has to be mediated by the agent's memory. Therefore, preventing access to the memory store, or regularly resetting it, could reduce the suffering due to past memories and interrupt the maintenance of a continuous suffering-prone self-identity in these hypothetically conscious AI systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yegor Tkachenko", "authorids": "~Yegor_Tkachenko1", "gender": "M", "homepage": "https://yegortkachenko.com/", "dblp": "", "google_scholar": "https://scholar.google.com/citations?hl=en", "orcid": "", "linkedin": "", "or_profile": "~Yegor_Tkachenko1", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\ntkachenko2024position,\ntitle={Position: Enforced Amnesia as a Way to Mitigate the Potential Risk of Silent Suffering in the Conscious {AI}},\nauthor={Yegor Tkachenko},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nACGn4US1R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 209859, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15677161960943558749&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "", "author_num": 1 }, { "title": "A fast algorithm to simulate nonlinear resistive networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33137", "id": "nAbfF37H6t", "proceeding": "https://proceedings.mlr.press/v235/scellier24a.html", "pdf": "https://openreview.net/pdf?id=nAbfF37H6t", "openreview": "https://openreview.net/forum?id=nAbfF37H6t", "tldr": "", "abstract": "Analog electrical networks have long been investigated as energy-efficient computing platforms for machine learning, leveraging analog physics during inference. More recently, resistor networks have sparked particular interest due to their ability to learn using local rules (such as equilibrium propagation), enabling potentially important energy efficiency gains for training as well. Despite their potential advantage, the simulation of these resistor networks has been a significant bottleneck to assess their scalability, with current methods either being limited to linear networks or relying on realistic, yet slow circuit simulators like SPICE. Assuming ideal circuit elements, we introduce a novel approach for the simulation of nonlinear resistive networks, which we frame as a quadratic programming problem with linear inequality constraints, and which we solve using a fast, exact coordinate descent algorithm. Our simulation methodology significantly outperforms existing SPICE-based simulations, enabling the training of networks up to 327 times larger at speeds 160 times faster, resulting in a 50,000-fold improvement in the ratio of network size to epoch duration.
Our approach can foster more rapid progress in the simulations of nonlinear analog electrical networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin Scellier", "authorids": "~Benjamin_Scellier1", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nscellier2024a,\ntitle={A fast algorithm to simulate nonlinear resistive networks},\nauthor={Benjamin Scellier},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nAbfF37H6t}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 529392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10572705409386736410&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "", "author_num": 1 }, { "title": "Verifying message-passing neural networks via topology-based bounds tightening", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33136", "id": "nAoiUlz4Bf", "proceeding": "https://proceedings.mlr.press/v235/hojny24a.html", "pdf": "https://openreview.net/pdf?id=nAoiUlz4Bf", "openreview": "https://openreview.net/forum?id=nAoiUlz4Bf", "author_site": "Christopher Hojny, Shiqiang Zhang, Juan Campos, Ruth Misener", "tldr": "", "abstract": "Since graph neural networks (GNNs) are often vulnerable to attack, we need to know when we can trust them. We develop a computationally effective approach towards providing robust certificates for message-passing neural networks (MPNNs) using a Rectified Linear Unit (ReLU) activation function. Because our work builds on mixed-integer optimization, it encodes a wide variety of subproblems, for example it admits (i) both adding and removing edges, (ii) both global and local budgets, and (iii) both topological perturbations and feature modifications. Our key technology, topology-based bounds tightening, uses graph structure to tighten bounds. We also experiment with aggressive bounds tightening to dynamically change the optimization constraints by tightening variable bounds. To demonstrate the effectiveness of these strategies, we implement an extension to the open-source branch-and-cut solver SCIP. 
We test on both node and graph classification problems and consider topological attacks that both add and remove edges.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christopher Hojny;Shiqiang Zhang;Juan S Campos;Ruth Misener", "authorids": "~Christopher_Hojny1;~Shiqiang_Zhang1;~Juan_S_Campos1;~Ruth_Misener1", "gender": ";M;;F", "homepage": ";;;https://wp.doc.ic.ac.uk/rmisener/", "dblp": ";;219/7664;04/8800", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=LCg1qsUAAAAJ;AQxtWHoAAAAJ", "orcid": ";;;0000-0001-5612-5417", "linkedin": ";;juan-campos-3b6100133/;ruth-misener/", "or_profile": "~Christopher_Hojny1;~Shiqiang_Zhang1;~Juan_S_Campos1;~Ruth_Misener1", "aff": ";Imperial College London, Imperial College London;;Imperial College London", "aff_domain": ";imperial.ac.uk;;imperial.ac.uk", "position": ";PhD student;;Full Professor", "bibtex": "@inproceedings{\nhojny2024verifying,\ntitle={Verifying message-passing neural networks via topology-based bounds tightening},\nauthor={Christopher Hojny and Shiqiang Zhang and Juan S Campos and Ruth Misener},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nAoiUlz4Bf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1220569, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5828972103330528828&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": ";imperial.ac.uk;;imperial.ac.uk", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Combining Experimental and Historical Data for Policy Evaluation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33135", "id": "nB6ERIud2y", "proceeding": "https://proceedings.mlr.press/v235/li24bh.html", "pdf": "https://openreview.net/pdf?id=nB6ERIud2y", "openreview": "https://openreview.net/forum?id=nB6ERIud2y", "author_site": "Ting Li, Chengchun Shi, Qianglin Wen, Yang Sui, Yongli Qin, Chunbo Lai, Hongtu Zhu", "tldr": "", "abstract": "This paper studies policy evaluation with multiple data sources, especially in scenarios that involve one experimental dataset with two arms, complemented by a historical dataset generated under a single control arm. We propose novel data integration methods that linearly integrate base policy value estimators constructed based on the experimental and historical data, with weights optimized to minimize the mean square error (MSE) of the resulting combined estimator. We further apply the pessimistic principle to obtain more robust estimators, and extend these developments to sequential decision making. Theoretically, we establish non-asymptotic error bounds for the MSEs of our proposed estimators, and derive their oracle, efficiency and robustness properties across a broad spectrum of reward shift scenarios. 
Numerical experiments and real-data-based analyses from a ridesharing company demonstrate the superior performance of the proposed estimators.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ting Li;Chengchun Shi;Qianglin Wen;Yang Sui;Yongli Qin;Chunbo Lai;Hongtu Zhu", "authorids": "~Ting_Li7;~Chengchun_Shi1;~Qianglin_Wen1;~Yang_Sui2;dsqinyongli@didiglobal.com;laichunbo@didiglobal.com;~Hongtu_Zhu3", "gender": ";M;M;M;;;", "homepage": ";https://callmespring.github.io/;https://github.com/QianglinSIMON;https://github.com/suiyangsoo;;;", "dblp": ";;;;;;", "google_scholar": ";dDGy3N0AAAAJ;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Ting_Li7;~Chengchun_Shi1;~Qianglin_Wen1;~Yang_Sui2;dsqinyongli@didiglobal.com;laichunbo@didiglobal.com;~Hongtu_Zhu3", "aff": ";London School of Economics;Yunnan University;Shanghai University of Finance and Economics;;;", "aff_domain": ";lse.ac.uk;ynu.edu.cn;shufe.edu.cn;;;", "position": ";Associate Professor;PhD student;PhD student;;;", "bibtex": "@inproceedings{\nli2024combining,\ntitle={Combining Experimental and Historical Data for Policy Evaluation},\nauthor={Ting Li and Chengchun Shi and Qianglin Wen and Yang Sui and Yongli Qin and Chunbo Lai and Hongtu Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nB6ERIud2y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4419358, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11770887568812724529&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": ";lse.ac.uk;ynu.edu.cn;shufe.edu.cn;;;", "author_num": 7, "aff_unique_index": "0;1;2", "aff_unique_norm": "London School of Economics;Yunnan University;Shanghai University of Finance and Economics", "aff_unique_dep": ";;", "aff_unique_url": "https://www.lse.ac.uk;http://www.ynu.edu.cn;http://www.sufe.edu.cn", "aff_unique_abbr": "LSE;YNU;SUFE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;China" }, { "title": "Align Your Steps: Optimizing Sampling Schedules in Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33134", "id": "nBGBzV4It3", "proceeding": "https://proceedings.mlr.press/v235/sabour24a.html", "pdf": "https://openreview.net/pdf?id=nBGBzV4It3", "openreview": "https://openreview.net/forum?id=nBGBzV4It3", "author_site": "Amirmojtaba Sabour, Sanja Fidler, Karsten Kreis", "tldr": "", "abstract": "Diffusion models (DMs) have established themselves as the state-of-the-art generative modeling approach in the visual domain and beyond. A crucial drawback of DMs is their slow sampling speed, relying on many sequential function evaluations through large neural networks. Sampling from DMs can be seen as solving a differential equation through a discretized set of noise levels known as the sampling schedule. While past works primarily focused on deriving efficient solvers, little attention has been given to finding optimal sampling schedules, and the entire literature relies on hand-crafted heuristics. In this work, for the first time, we propose a general and principled approach to optimizing the sampling schedules of DMs for high-quality outputs, called Align Your Steps. 
We leverage methods from stochastic calculus and find optimal schedules specific to different solvers, trained DMs and datasets. We evaluate our novel approach on several image, video as well as 2D toy data synthesis benchmarks, using a variety of different samplers, and observe that our optimized schedules outperform previous hand-crafted schedules in almost all experiments. Our method demonstrates the untapped potential of sampling schedule optimization, especially in the few-step synthesis regime.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amirmojtaba Sabour;Sanja Fidler;Karsten Kreis", "authorids": "~Amirmojtaba_Sabour1;~Sanja_Fidler1;~Karsten_Kreis1", "gender": "M;F;", "homepage": ";http://www.cs.toronto.edu/~fidler/;https://karstenkreis.github.io/", "dblp": "251/8739;08/6607;238/6834", "google_scholar": "pUEBuscAAAAJ;CUlqK5EAAAAJ;https://scholar.google.de/citations?user=rFd-DiAAAAAJ", "orcid": ";;", "linkedin": ";sanja-fidler-2846a1a?trk=hp-identity-name;karstenkreis", "or_profile": "~Amirmojtaba_Sabour1;~Sanja_Fidler1;~Karsten_Kreis1", "aff": "Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;NVIDIA", "aff_domain": "cs.toronto.edu;cs.toronto.edu;nvidia.com", "position": "PhD student;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nsabour2024align,\ntitle={Align Your Steps: Optimizing Sampling Schedules in Diffusion Models},\nauthor={Amirmojtaba Sabour and Sanja Fidler and Karsten Kreis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nBGBzV4It3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9928789, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=979106667075899413&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.toronto.edu;cs.toronto.edu;nvidia.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Toronto;NVIDIA", "aff_unique_dep": "Department of Computer Science;NVIDIA Corporation", "aff_unique_url": "https://www.utoronto.ca;https://www.nvidia.com", "aff_unique_abbr": "U of T;NVIDIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "title": "Equivariant Deep Weight Space Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33133", "id": "nBPnmk6EeO", "proceeding": "https://proceedings.mlr.press/v235/navon24a.html", "pdf": "https://openreview.net/pdf?id=nBPnmk6EeO", "openreview": "https://openreview.net/forum?id=nBPnmk6EeO", "author_site": "Aviv Navon, Aviv Shamsian, Ethan Fetaya, Gal Chechik, Nadav Dym, Haggai Maron", "tldr": "", "abstract": "Permutation symmetries of deep networks make basic operations like model merging and similarity estimation challenging. In many cases, aligning the weights of the networks, i.e., finding optimal permutations between their weights, is necessary. Unfortunately, weight alignment is an NP-hard problem. Prior research has mainly focused on solving relaxed versions of the alignment problem, leading to either time-consuming methods or sub-optimal solutions. To accelerate the alignment process and improve its quality, we propose a novel framework aimed at learning to solve the weight alignment problem, which we name Deep-Align. 
To that end, we first prove that weight alignment adheres to two fundamental symmetries and then propose a deep architecture that respects these symmetries. Notably, our framework does not require any labeled data. We provide a theoretical analysis of our approach and evaluate Deep-Align on several types of network architectures and learning setups. Our experimental results indicate that a feed-forward pass with Deep-Align produces better or equivalent alignments compared to those produced by current optimization algorithms. Additionally, our alignments can be used as an effective initialization for other methods, leading to improved solutions with a significant speedup in convergence.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aviv Navon;Aviv Shamsian;Ethan Fetaya;Gal Chechik;Nadav Dym;Haggai Maron", "authorids": "~Aviv_Navon1;~Aviv_Shamsian1;~Ethan_Fetaya1;~Gal_Chechik1;~Nadav_Dym1;~Haggai_Maron1", "gender": "M;M;M;;M;M", "homepage": "https://avivnavon.github.io/;;http://www.cs.toronto.edu/~ethanf/;https://chechiklab.biu.ac.il/~gal/;https://haggaim.github.io/;https://nadavdym.github.io./", "dblp": "269/9785;261/9492;01/10046;c/GalChechik;181/6629;167/1176", "google_scholar": "https://scholar.google.co.il/citations?user=N-sME4wAAAAJ;;zLuqh-0AAAAJ;Wk2gAZUAAAAJ;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ;https://scholar.google.co.il/citations?user=qOyXmMYAAAAJ", "orcid": ";;0000-0003-3125-1665;0000-0001-9164-5303;;", "linkedin": ";aviv-shamsian/;;;;", "or_profile": "~Aviv_Navon1;~Aviv_Shamsian1;~Ethan_Fetaya1;~Gal_Chechik1;~Haggai_Maron1;~Nadav_E_Dym1", "aff": "Bar Ilan University, Israel;Bar-Ilan University;Bar Ilan University;NVIDIA;NVIDIA;Technion - Israel Institute of Technology, Technion", "aff_domain": "biu.ac.il;biu.ac.il;biu.ac.il;nvidia.com;nvidia.com;technion.ac.il", "position": "PhD student;PhD student;Assistant Professor;Principal Researcher;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nnavon2024equivariant,\ntitle={Equivariant Deep Weight Space Alignment},\nauthor={Aviv Navon and Aviv Shamsian and Ethan Fetaya and Gal Chechik and Nadav Dym and Haggai Maron},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nBPnmk6EeO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6119983, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5434543739837023110&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "biu.ac.il;biu.ac.il;biu.ac.il;nvidia.com;nvidia.com;technion.ac.il", "author_num": 6, "aff_unique_index": "0;0;0;1;1;2", "aff_unique_norm": "Bar-Ilan University;NVIDIA;Technion - Israel Institute of Technology", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.biu.ac.il;https://www.nvidia.com;https://www.technion.ac.il", "aff_unique_abbr": "BIU;NVIDIA;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Learning Coverage Paths in Unknown Environments with Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33132", "id": "nCZYRBK1J4", "proceeding": "https://proceedings.mlr.press/v235/jonnarth24a.html", "pdf": "https://openreview.net/pdf?id=nCZYRBK1J4", "openreview": "https://openreview.net/forum?id=nCZYRBK1J4",
"author_site": "Arvi Jonnarth, Jie Zhao, Michael Felsberg", "tldr": "", "abstract": "Coverage path planning (CPP) is the problem of finding a path that covers the entire free space of a confined area, with applications ranging from robotic lawn mowing to search-and-rescue. When the environment is unknown, the path needs to be planned online while mapping the environment, which cannot be addressed by offline planning methods that do not allow for a flexible path space. We investigate how suitable reinforcement learning is for this challenging problem, and analyze the involved components required to efficiently learn coverage paths, such as action space, input feature representation, neural network architecture, and reward function. We propose a computationally feasible egocentric map representation based on frontiers, and a novel reward term based on total variation to promote complete coverage. Through extensive experiments, we show that our approach surpasses the performance of both previous RL-based approaches and highly specialized methods across multiple CPP variations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arvi Jonnarth;Jie Zhao;Michael Felsberg", "authorids": "~Arvi_Jonnarth1;~Jie_Zhao3;~Michael_Felsberg2", "gender": "M;F;", "homepage": "https://liu.se/en/employee/arvjo80;https://github.com/zj5559?tab=repositories;https://liu.se/en/employee/micfe03", "dblp": "317/0723;23/3168-14;00/78", "google_scholar": "dVvOUGYAAAAJ;Oi42Tc8AAAAJ;https://scholar.google.se/citations?hl=en", "orcid": "0000-0002-3434-2522;;0000-0002-6096-3648", "linkedin": ";;https://linkedin.com/in/michael-felsberg-668a202", "or_profile": "~Arvi_Jonnarth1;~Jie_Zhao3;~Michael_Felsberg2", "aff": "Husqvarna;Dalian University of Technology;Link\u00f6ping University", "aff_domain": "husqvarnagroup.com;dlut.edu.cn;liu.se", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\njonnarth2024learning,\ntitle={Learning Coverage Paths in Unknown Environments with Deep Reinforcement Learning},\nauthor={Arvi Jonnarth and Jie Zhao and Michael Felsberg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nCZYRBK1J4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3898565, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10959240509460447941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "husqvarnagroup.com;dlut.edu.cn;liu.se", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Husqvarna Group;Dalian University of Technology;Link\u00f6ping University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.husqvarnagroup.com/;http://www.dlut.edu.cn/;https://www.liu.se", "aff_unique_abbr": "Husqvarna;DUT;LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Sweden;China" }, { "title": "Fourier Controller Networks for Real-Time Decision-Making in Embodied Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33131", "id": "nDps3Q8j2l", "proceeding": "https://proceedings.mlr.press/v235/tan24c.html", "pdf": "https://openreview.net/pdf?id=nDps3Q8j2l", "openreview": "https://openreview.net/forum?id=nDps3Q8j2l", "author_site": "Hengkai Tan, LIU SONGMING, Kai Ma, Chengyang Ying, Xingxing Zhang, Hang Su, Jun Zhu", "tldr": "", "abstract": "Transformer has 
shown promise in reinforcement learning to model time-varying features for obtaining generalized low-level robot policies on diverse robotics datasets in embodied learning. However, it still suffers from the issues of low data efficiency and high inference latency. In this paper, we propose to investigate the task from a new perspective of the frequency domain. We first observe that the energy density in the frequency domain of a robot's trajectory is mainly concentrated in the low-frequency part. Then, we present the Fourier Controller Network (FCNet), a new network that uses Short-Time Fourier Transform (STFT) to extract and encode time-varying features through frequency domain interpolation. In order to do real-time decision-making, we further adopt FFT and Sliding DFT methods in the model architecture to achieve parallel training and efficient recurrent inference. Extensive results in both simulated (e.g., D4RL) and real-world environments (e.g., robot locomotion) demonstrate FCNet's substantial efficiency and effectiveness over existing methods such as Transformer, e.g., FCNet outperforms Transformer on multi-environmental robotics datasets of all types of sizes (from 1.9M to 120M). The project page and code can be found at https://thkkk.github.io/fcnet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hengkai Tan;Songming Liu;Kai Ma;Chengyang Ying;Xingxing Zhang;Hang Su;Jun Zhu", "authorids": "~Hengkai_Tan1;~Songming_Liu1;~Kai_Ma6;~Chengyang_Ying1;~Xingxing_Zhang3;~Hang_Su3;~Jun_Zhu2", "gender": "M;M;M;M;F;M;M", "homepage": "https://github.com/thkkk;;https://github.com/mad0g4;https://yingchengyang.github.io/;https://indussky8.github.io/;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": "378/5382;285/4585;;296/2065;;50/2644-1;26/5371-6", "google_scholar": "ot-bfRUAAAAJ;6urFg8kAAAAJ;;vM6KE18AAAAJ;https://scholar.google.com.hk/citations?user=RKjiLyAAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": ";;;;0000-0002-2909-1589;;", "linkedin": "hengkai-tan-a31b88272/;%E6%9D%BE%E9%93%AD-%E5%88%98-7b8339254/;;%E9%93%96%E9%98%B3-%E5%BA%94-9b682a203/;;;", "or_profile": "~Hengkai_Tan1;~Songming_Liu1;~Kai_Ma6;~Chengyang_Ying1;~Xingxing_Zhang3;~Jun_Zhu2;~Hang_Su2", "aff": "the Department of Computer Science, Tsinghua University;Tsinghua University;, Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "cs.tsinghua.edu.cn;mails.tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;MS student;PhD student;Researcher;Professor;Associate Professor", "bibtex": "@inproceedings{\ntan2024fourier,\ntitle={Fourier Controller Networks for Real-Time Decision-Making in Embodied Learning},\nauthor={Hengkai Tan and Songming Liu and Kai Ma and Chengyang Ying and Xingxing Zhang and Hang Su and Jun Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nDps3Q8j2l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2571770, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12512413700404574127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cs.tsinghua.edu.cn;mails.tsinghua.edu.cn;cs.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm":
"Tsinghua University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Adaptive Online Experimental Design for Causal Discovery", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33130", "id": "nJzf3TVnOn", "proceeding": "https://proceedings.mlr.press/v235/elahi24a.html", "pdf": "https://openreview.net/pdf?id=nJzf3TVnOn", "openreview": "https://openreview.net/forum?id=nJzf3TVnOn", "author_site": "Muhammad Qasim Elahi, Lai Wei, Murat Kocaoglu, Mahsa Ghasemi", "tldr": "", "abstract": "Causal discovery aims to uncover cause-and-effect relationships encoded in causal graphs by leveraging observational, interventional data, or their combination. The majority of existing causal discovery methods are developed assuming infinite interventional data. We focus on interventional data efficiency and formalize causal discovery from the perspective of online learning, inspired by pure exploration in bandit problems. A graph separating system, consisting of interventions that cut every edge of the graph at least once, is sufficient for learning causal graphs when infinite interventional data is available, even in the worst case. We propose a track-and-stop causal discovery algorithm that adaptively selects interventions from the graph separating system via allocation matching and learns the causal graph based on sampling history. Given any desired confidence value, the algorithm determines a termination condition and runs until it is met. We analyze the algorithm to establish a problem-dependent upper bound on the expected number of required interventional samples. Our proposed algorithm outperforms existing methods in simulations across various randomly generated causal graphs. 
It achieves higher accuracy, measured by the structural Hamming distance (SHD) between the learned causal graph and the ground truth, with significantly fewer samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Muhammad Qasim Elahi;Lai Wei;Murat Kocaoglu;Mahsa Ghasemi", "authorids": "~Muhammad_Qasim_Elahi1;~Lai_Wei5;~Murat_Kocaoglu1;~Mahsa_Ghasemi1", "gender": "M;M;M;F", "homepage": "https://www.linkedin.com/in/qasim-elahi-b59948133/;;https://www.muratkocaoglu.com;https://mahsaghasemi.github.io/", "dblp": ";36/4168-2;74/11343;206/6477", "google_scholar": "M7C8dFAAAAAJ;45PJl9AAAAAJ;7N7bzdwAAAAJ;7KqsRJ8AAAAJ", "orcid": ";;;", "linkedin": ";;mkocaoglu/;", "or_profile": "~Muhammad_Qasim_Elahi1;~Lai_Wei5;~Murat_Kocaoglu1;~Mahsa_Ghasemi1", "aff": "Purdue University;University of Michigan - Ann Arbor;Purdue University;Purdue University", "aff_domain": "purdue.edu;umich.edu;purdue.edu;purdue.edu", "position": "PhD student;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nelahi2024adaptive,\ntitle={Adaptive Online Experimental Design for Causal Discovery},\nauthor={Muhammad Qasim Elahi and Lai Wei and Murat Kocaoglu and Mahsa Ghasemi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nJzf3TVnOn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4108904, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5448118956998255393&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "purdue.edu;umich.edu;purdue.edu;purdue.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Purdue University;University of Michigan", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.umich.edu", "aff_unique_abbr": "Purdue;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Merging Multi-Task Models via Weight-Ensembling Mixture of Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33129", "id": "nLRKnO74RB", "proceeding": "https://proceedings.mlr.press/v235/tang24e.html", "pdf": "https://openreview.net/pdf?id=nLRKnO74RB", "openreview": "https://openreview.net/forum?id=nLRKnO74RB", "author_site": "Anke Tang, Li Shen, Yong Luo, Nan Yin, Lefei Zhang, Dacheng Tao", "tldr": "", "abstract": "Merging various task-specific Transformer-based vision models trained on different tasks into a single unified model allows all the tasks to be executed concurrently. Previous methods, exemplified by task arithmetic, have been proven to be both effective and scalable. Existing methods have primarily focused on seeking a static optimal solution within the original model parameter space. A notable challenge is mitigating the interference between parameters of different models, which can substantially deteriorate performance. In this paper, we propose to merge most of the parameters while upscaling the MLP of the Transformer layers to a weight-ensembling mixture of experts (MoE) module, which can dynamically integrate shared and task-specific knowledge based on the input, thereby providing a more flexible solution that can adapt to the specific needs of each instance.
Our key insight is that by identifying and separating shared knowledge and task-specific knowledge, and then dynamically integrating them, we can mitigate the parameter interference problem to a great extent. We conduct conventional multi-task model merging experiments and evaluate the generalization and robustness of our method. The results demonstrate the effectiveness of our method and provide a comprehensive understanding of it. The code is available at https://github.com/tanganke/weight-ensembling_MoE", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anke Tang;Li Shen;Yong Luo;Nan Yin;Lefei Zhang;Dacheng Tao", "authorids": "~Anke_Tang1;~Li_Shen1;~Yong_Luo2;~Nan_Yin4;~Lefei_Zhang1;~Dacheng_Tao1", "gender": "M;M;M;M;M;", "homepage": ";https://sites.google.com/site/mathshenli/home;;;;", "dblp": "348/4694;91/3680-8;57/5272-2.html;135/8983;28/10770;", "google_scholar": "KA7cyvUAAAAJ;yVhgENIAAAAJ;zb1oVGIAAAAJ;https://scholar.google.com.hk/citations?user=NoOK0pIAAAAJ;BLKHwNwAAAAJ;", "orcid": "0000-0002-0576-8153;;;;;", "linkedin": ";;;yin-nan-b32943173;;", "or_profile": "~Anke_Tang1;~Li_Shen1;~Yong_Luo2;~Nan_Yin4;~Lefei_Zhang1;~Dacheng_Tao1", "aff": "JD.com;JD Explore Academy;Wuhan University;Mohamed bin Zayed University of Artificial Intelligence;Wuhan University;", "aff_domain": "jd.com;jd.com;whu.edu.cn;mbzuai.ac.ae;whu.edu.cn;", "position": "Intern;Researcher;Professor;Postdoc;Full Professor;", "bibtex": "@inproceedings{\ntang2024merging,\ntitle={Merging Multi-Task Models via Weight-Ensembling Mixture of Experts},\nauthor={Anke Tang and Li Shen and Yong Luo and Nan Yin and Lefei Zhang and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nLRKnO74RB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2805018, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1325857199730013195&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "jd.com;jd.com;whu.edu.cn;mbzuai.ac.ae;whu.edu.cn;", "author_num": 6, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "JD.com;JD;Wuhan University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";JD Explore Academy;;", "aff_unique_url": "https://www.jd.com;;http://www.whu.edu.cn/;https://mbzuai.ac.ae", "aff_unique_abbr": "JD;;WHU;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2;0", "aff_country_unique": "China;;United Arab Emirates" }, { "title": "Completing Visual Objects via Bridging Generation and Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33128", "id": "nLgtHHBgl3", "proceeding": "https://proceedings.mlr.press/v235/li24j.html", "pdf": "https://openreview.net/pdf?id=nLgtHHBgl3", "openreview": "https://openreview.net/forum?id=nLgtHHBgl3", "author_site": "Xiang Li, Yinpeng Chen, Chung-Ching Lin, Hao Chen, Kai Hu, Rita Singh, Bhiksha Raj, Lijuan Wang, Zicheng Liu", "tldr": "", "abstract": "This paper presents a novel approach to object completion, with the primary goal of reconstructing a complete object from its partially visible components. Our method, named MaskComp, delineates the completion process through iterative stages of generation and segmentation.
In each iteration, the object mask is provided as an additional condition to boost image generation, and, in return, the generated images can lead to a more accurate mask by fusing the segmentation of images. We demonstrate that the combination of one generation and one segmentation stage effectively functions as a mask denoiser. Through alternation between the generation and segmentation stages, the partial object mask is progressively refined, providing precise shape guidance and yielding superior object completion results. Our experiments demonstrate the superiority of MaskComp over existing approaches, e.g., ControlNet and Stable Diffusion, establishing it as an effective solution for object completion.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiang Li;Yinpeng Chen;Chung-Ching Lin;Hao Chen;Kai Hu;Rita Singh;Bhiksha Raj;Lijuan Wang;Zicheng Liu", "authorids": "~Xiang_Li35;~Yinpeng_Chen1;~Chung-Ching_Lin2;~Hao_Chen15;~Kai_Hu2;~Rita_Singh1;~Bhiksha_Raj1;~Lijuan_Wang1;~Zicheng_Liu1", "gender": ";M;;M;M;F;M;F;M", "homepage": ";https://scholar.google.com/citations?user=V_VpLksAAAAJ&hl=en;;https://hhhhhhao.github.io/;https://github.com/hukkai;http://mlsp.cs.cmu.edu/people/rsingh/index.html;https://www.cs.cmu.edu/directory/bhikshar/;https://www.microsoft.com/en-us/research/people/lijuanw/;https://sites.google.com/view/zichengliu/home?pli=1", "dblp": ";45/6977;;;;;60/3996;51/2527.html;l/ZichengLiu", "google_scholar": ";;;tktqkhwAAAAJ;;;;cDcWXuIAAAAJ;bkALdvsAAAAJ", "orcid": ";;;;;;;;0000-0001-5894-7828", "linkedin": ";;;haochen97/;;;;;", "or_profile": "~Xiang_Li35;~Yinpeng_Chen1;~Chung-Ching_Lin2;~Hao_Chen15;~Kai_Hu2;~Rita_Singh1;~Bhiksha_Raj1;~Lijuan_Wang1;~Zicheng_Liu1", "aff": ";Google DeepMind;;Carnegie Mellon University;Carnegie Mellon University;School of Computer Science, Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Microsoft;Microsoft", "aff_domain": ";google.com;;andrew.cmu.edu;cmu.edu;cs.cmu.edu;mbzuai.ac.ae;microsoft.com;microsoft.com", "position": ";Research Scientist;;PhD student;PhD student;Research Professor;Full Professor;Principal Researcher;partner research manager", "bibtex": "@inproceedings{\nli2024completing,\ntitle={Completing Visual Objects via Bridging Generation and Segmentation},\nauthor={Xiang Li and Yinpeng Chen and Chung-Ching Lin and Hao Chen and Kai Hu and Rita Singh and Bhiksha Raj and Lijuan Wang and Zicheng Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nLgtHHBgl3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7800545, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6486437434814496423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";google.com;;andrew.cmu.edu;cmu.edu;cs.cmu.edu;mbzuai.ac.ae;microsoft.com;microsoft.com", "author_num": 9, "aff_unique_index": "0;1;1;1;2;3;3", "aff_unique_norm": "Google;Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Microsoft", "aff_unique_dep": "Google DeepMind;;;Microsoft Corporation", "aff_unique_url": "https://deepmind.com;https://www.cmu.edu;https://mbzuai.ac.ae;https://www.microsoft.com", "aff_unique_abbr": "DeepMind;CMU;MBZUAI;Microsoft", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;1;1;2;1;1", "aff_country_unique": "United Kingdom;United 
States;United Arab Emirates" }, { "title": "StableSSM: Alleviating the Curse of Memory in State-space Models through Stable Reparameterization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33127", "id": "nMN5hNZMQK", "proceeding": "https://proceedings.mlr.press/v235/wang24ag.html", "pdf": "https://openreview.net/pdf?id=nMN5hNZMQK", "openreview": "https://openreview.net/forum?id=nMN5hNZMQK", "author_site": "Shida Wang, Qianxiao Li", "tldr": "", "abstract": "In this paper, we investigate the long-term memory learning capabilities of state-space models (SSMs) from the perspective of parameterization. We prove that state-space models without any reparameterization exhibit a memory limitation similar to that of traditional RNNs: the target relationships that can be stably approximated by state-space models must have an exponentially decaying memory. Our analysis identifies this ``curse of memory'' as a result of the recurrent weights converging to a stability boundary, suggesting that a reparameterization technique can be effective. To this end, we introduce a class of reparameterization techniques for SSMs that effectively lift their memory limitations. Besides improving approximation capabilities, we further illustrate that a principled choice of reparameterization scheme can also enhance optimization stability. We validate our findings using synthetic datasets, language models, and image classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shida Wang;Qianxiao Li", "authorids": "~Shida_Wang1;~Qianxiao_Li1", "gender": "M;M", "homepage": "https://radarfudan.github.io;https://blog.nus.edu.sg/qianxiaoli/", "dblp": "245/6187;172/0930.html", "google_scholar": "vA2YMfgAAAAJ;https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";0000-0002-3903-3737", "linkedin": ";", "or_profile": "~Shida_Wang1;~Qianxiao_Li1", "aff": "National University of Singapore;National University of Singapore", "aff_domain": "nus.edu.sg;nus.edu.sg", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024stablessm,\ntitle={Stable{SSM}: Alleviating the Curse of Memory in State-space Models through Stable Reparameterization},\nauthor={Shida Wang and Qianxiao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nMN5hNZMQK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 780019, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14189617943554510295&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "nus.edu.sg;nus.edu.sg", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "SHINE: Shielding Backdoors in Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33126", "id": "nMWxLnSBGW", "proceeding": "https://proceedings.mlr.press/v235/yuan24c.html", "pdf": "https://openreview.net/pdf?id=nMWxLnSBGW", "openreview": "https://openreview.net/forum?id=nMWxLnSBGW", "author_site": "Zhuowen Yuan, Wenbo Guo, Jinyuan Jia, Bo Li, Dawn Song", "tldr": "", "abstract": "Recent studies have discovered
that a deep reinforcement learning (DRL) policy is vulnerable to backdoor attacks. Existing defenses against backdoor attacks either do not consider RL's unique mechanism or make unrealistic assumptions, resulting in limited defense efficacy, practicability, and generalizability. We propose SHINE, a backdoor shielding method specific for DRL. SHINE designs novel policy explanation techniques to identify the backdoor triggers and a policy retraining algorithm to eliminate the impact of the triggers on backdoored agents. We theoretically justify that SHINE guarantees to improve a backdoored agent's performance in a poisoned environment while ensuring its performance difference in the clean environment before and after shielding is bounded. We further conduct extensive experiments that evaluate SHINE against three mainstream DRL backdoor attacks in various benchmark RL environments. Our results show that SHINE significantly outperforms existing defenses in mitigating these backdoor attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuowen Yuan;Wenbo Guo;Jinyuan Jia;Bo Li;Dawn Song", "authorids": "~Zhuowen_Yuan1;~Wenbo_Guo1;~Jinyuan_Jia2;~Bo_Li19;~Dawn_Song1", "gender": "M;M;;F;F", "homepage": ";https://henrygwb.github.io/;https://jinyuan-jia.github.io/;http://boli.cs.illinois.edu/;", "dblp": "304/3576;144/1238-2.html;24/5124-1.html;50/3402-26;s/DXSong", "google_scholar": "F-r0bYQAAAAJ;KyPheRMAAAAJ;iyg4ytkAAAAJ;K8vJkTcAAAAJ;", "orcid": ";;0000-0002-9785-7769;;", "linkedin": ";;;;", "or_profile": "~Zhuowen_Yuan1;~Wenbo_Guo1;~Jinyuan_Jia2;~Bo_Li19;~Dawn_Song1", "aff": "University of Illinois Urbana-Champaign;University of California, Santa Barbara;Pennsylvania State University;University of Illinois, Urbana Champaign;University of California, Berkeley", "aff_domain": "illinois.edu;ucsb.edu;psu.edu;illinois.edu;berkeley.edu", "position": "PhD student;Assistant Professor;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nyuan2024shine,\ntitle={{SHINE}: Shielding Backdoors in Deep Reinforcement Learning},\nauthor={Zhuowen Yuan and Wenbo Guo and Jinyuan Jia and Bo Li and Dawn Song},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nMWxLnSBGW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 921960, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6033307709105802344&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "illinois.edu;ucsb.edu;psu.edu;illinois.edu;berkeley.edu", "author_num": 5, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of California, Santa Barbara;Pennsylvania State University;University of California, Berkeley", "aff_unique_dep": ";;;", "aff_unique_url": "https://illinois.edu;https://www.ucsb.edu;https://www.psu.edu;https://www.berkeley.edu", "aff_unique_abbr": "UIUC;UCSB;PSU;UC Berkeley", "aff_campus_unique_index": "0;1;0;3", "aff_campus_unique": "Urbana-Champaign;Santa Barbara;;Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Unsupervised Representation Learning of Brain Activity via Bridging Voxel Activity and Functional Connectivity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33125", "id": "nOjZfpLyh1", "proceeding": 
"https://proceedings.mlr.press/v235/behrouz24a.html", "pdf": "https://openreview.net/pdf?id=nOjZfpLyh1", "openreview": "https://openreview.net/forum?id=nOjZfpLyh1", "author_site": "Ali Behrouz, Parsa Delavari, Farnoosh Hashemi", "tldr": "", "abstract": "Effective brain representation learning is a key step toward the understanding of cognitive processes and diagnosis of neurological diseases/disorders. Existing studies have focused on either (1) voxel-level activity, where only a single weight relating the voxel activity to the task (i.e., aggregation of voxel activity over a time window) is considered, missing their temporal dynamics, or (2) functional connectivity of the brain in the level of region of interests, missing voxel-level activities. We bridge this gap and design BrainMixer, an unsupervised learning framework that effectively utilizes both functional connectivity and associated time series of voxels to learn voxel-level representation in an unsupervised manner. BrainMixer employs two simple yet effective MLP-based encoders to simultaneously learn the dynamics of voxel-level signals and their functional correlations. To encode voxel activity, BrainMixer fuses information across both time and voxel dimensions via a dynamic attention mechanism. To learn the structure of the functional connectivity, BrainMixer presents a temporal graph patching and encodes each patch by combining its nodes' features via a new adaptive temporal pooling. Our experiments show that BrainMixer attains outstanding performance and outperforms 14 baselines in different downstream tasks and setups.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ali Behrouz;Parsa Delavari;Farnoosh Hashemi", "authorids": "~Ali_Behrouz1;~Parsa_Delavari1;~Farnoosh_Hashemi1", "gender": "M;M;F", "homepage": "https://Abehrouz.github.io;;https://farnooshha.github.io/", "dblp": "220/4163;;318/9574", "google_scholar": "UbwVuqIAAAAJ;clef-H4AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "ali-behrouz-506aa2127;;farnoosh-hashemi-a48328123/", "or_profile": "~Ali_Behrouz1;~Parsa_Delavari1;~Farnoosh_Hashemi1", "aff": "Cornell University;University of British Columbia;Cornell University", "aff_domain": "cornell.edu;ubc.ca;cornell.edu", "position": "PhD student;PhD student;PhD student", "bibtex": "@inproceedings{\nbehrouz2024unsupervised,\ntitle={Unsupervised Representation Learning of Brain Activity via Bridging Voxel Activity and Functional Connectivity},\nauthor={Ali Behrouz and Parsa Delavari and Farnoosh Hashemi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nOjZfpLyh1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5286979, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6364363259821404592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cornell.edu;ubc.ca;cornell.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Cornell University;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.ubc.ca", "aff_unique_abbr": "Cornell;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Viewing Transformers Through the Lens of Long Convolutions Layers", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33124", "id": "nOyj26YdIQ", "proceeding": "https://proceedings.mlr.press/v235/zimerman24b.html", "pdf": "https://openreview.net/pdf?id=nOyj26YdIQ", "openreview": "https://openreview.net/forum?id=nOyj26YdIQ", "author_site": "Itamar Zimerman, Lior Wolf", "tldr": "", "abstract": "Despite their dominance in modern DL and, especially, NLP domains, transformer architectures exhibit sub-optimal performance on long-range tasks compared to recent layers that are specifically designed for this purpose. In this work, drawing inspiration from key attributes of longrange layers, such as state-space layers, linear RNN layers, and global convolution layers, we demonstrate that minimal modifications to the transformer architecture can significantly enhance performance on the Long Range Arena (LRA) benchmark, thus narrowing the gap with these specialized layers. We identify that two key principles for long-range tasks are (i) incorporating an inductive bias towards smoothness, and (ii) locality. As we show, integrating these ideas into the attention mechanism improves results with a negligible amount of additional computation and without any additional trainable parameters. Our theory and experiments also shed light on the reasons for the inferior performance of transformers on long-range tasks and identify critical properties that are essential for successfully capturing long-range dependencies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Itamar Zimerman;Lior Wolf", "authorids": "~Itamar_Zimerman1;~Lior_Wolf1", "gender": "M;M", "homepage": ";http://www.cs.tau.ac.il/~wolf", "dblp": "294/8621;83/4103", "google_scholar": "01s_DpwAAAAJ;UbFrXTsAAAAJ", "orcid": "0000-0001-8321-0609;0000-0001-5578-8892", "linkedin": ";", "or_profile": "~Itamar_Zimerman1;~Lior_Wolf1", "aff": "International Business Machines;Tel Aviv University", "aff_domain": "ibm.com;tau.ac.il", "position": "Researcher;Full Professor", "bibtex": "@inproceedings{\nzimerman2024viewing,\ntitle={Viewing Transformers Through the Lens of Long Convolutions Layers},\nauthor={Itamar Zimerman and Lior Wolf},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nOyj26YdIQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10048348, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9023295933100738496&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3, "email": "ibm.com;tau.ac.il", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "International Business Machines Corporation;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com;https://www.tau.ac.il", "aff_unique_abbr": "IBM;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Israel" }, { "title": "Thermometer: Towards Universal Calibration for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33123", "id": "nP7Q1PnuLK", "proceeding": "https://proceedings.mlr.press/v235/shen24c.html", "pdf": "https://openreview.net/pdf?id=nP7Q1PnuLK", "openreview": "https://openreview.net/forum?id=nP7Q1PnuLK", "author_site": "Maohao Shen, Subhro Das, Kristjan Greenewald, Prasanna Sattigeri, Gregory Wornell, Soumya Ghosh", "tldr": "", "abstract": "We consider the issue of calibration in large 
language models (LLMs). Recent studies have found that common interventions such as instruction tuning often result in poorly calibrated LLMs. Although calibration is well-explored in traditional applications, calibrating LLMs is uniquely challenging. These challenges stem as much from the severe computational requirements of LLMs as from their versatility, which allows them to be applied to diverse tasks. Addressing these challenges, we propose THERMOMETER, a calibration approach tailored to LLMs. THERMOMETER learns an auxiliary model, given data from multiple tasks, for calibrating an LLM. It is computationally efficient, preserves the accuracy of the LLM, and produces better-calibrated responses for new tasks. Extensive empirical evaluations across various benchmarks demonstrate the effectiveness of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maohao Shen;Subhro Das;Kristjan Greenewald;Prasanna Sattigeri;Gregory W. Wornell;Soumya Ghosh", "authorids": "~Maohao_Shen1;~Subhro_Das1;~Kristjan_Greenewald1;~Prasanna_Sattigeri1;~Gregory_W._Wornell1;~Soumya_Ghosh1", "gender": "M;;;;;M", "homepage": "https://maohaos2.github.io/Maohao/;;https://researcher.watson.ibm.com/researcher/view.php?person=ibm-Kristjan.H.Greenewald;;;http://soumyaghosh.com", "dblp": "272/5397;;146/0563;00/7428;;58/5138", "google_scholar": ";;L3zNUG4AAAAJ;m-s38ikAAAAJ;;GEYQenQAAAAJ", "orcid": ";;;0000-0003-4435-0486;;", "linkedin": ";;;prasannasattigeri/;;", "or_profile": "~Maohao_Shen1;~Subhro_Das1;~Kristjan_Greenewald1;~Prasanna_Sattigeri1;~Gregory_W._Wornell1;~Soumya_Ghosh1", "aff": "Massachusetts Institute of Technology;;MIT-IBM Watson AI Lab, IBM Research;IBM Research;;International Business Machines", "aff_domain": "mit.edu;;ibm.com;ibm.com;;ibm.com", "position": "PhD student;;Research Scientist;Researcher;;Research Scientist", "bibtex": "@inproceedings{\nshen2024thermometer,\ntitle={Thermometer: Towards Universal Calibration for Large Language Models},\nauthor={Maohao Shen and Subhro Das and Kristjan Greenewald and Prasanna Sattigeri and Gregory W. 
Wornell and Soumya Ghosh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nP7Q1PnuLK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2512114, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5389569742830284476&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "mit.edu;;ibm.com;ibm.com;;ibm.com", "author_num": 6, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;IBM;International Business Machines Corporation", "aff_unique_dep": ";AI Lab;", "aff_unique_url": "https://web.mit.edu;https://www.ibmwatsonai.org/;https://www.ibm.com", "aff_unique_abbr": "MIT;MIT-IBM AI Lab;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Enhancing Value Function Estimation through First-Order State-Action Dynamics in Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33122", "id": "nSGnx8lNJ6", "proceeding": "https://proceedings.mlr.press/v235/lien24a.html", "pdf": "https://openreview.net/pdf?id=nSGnx8lNJ6", "openreview": "https://openreview.net/forum?id=nSGnx8lNJ6", "author_site": "Yun-Hsuan Lien, Ping-Chun Hsieh, Tzu-Mao Li, Yu-Shuen Wang", "tldr": "", "abstract": "In offline reinforcement learning (RL), updating the value function with the discrete-time Bellman Equation often encounters challenges due to the limited scope of available data. This limitation stems from the Bellman Equation, which cannot accurately predict the value of unvisited states. To address this issue, we introduce an innovative solution that bridges continuous- and discrete-time RL methods, capitalizing on their advantages. Our method uses a discrete-time RL algorithm to derive the value function from a dataset while ensuring that the function's first derivative aligns with the local characteristics of states and actions, as defined by the Hamilton-Jacobi-Bellman equation in continuous RL. We provide practical algorithms for both deterministic policy gradient methods and stochastic policy gradient methods. 
Experiments on the D4RL dataset show that incorporating the first-order information significantly improves policy performance for offline RL problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yun-Hsuan Lien;Ping-Chun Hsieh;Tzu-Mao Li;Yu-Shuen Wang", "authorids": "~Yun-Hsuan_Lien1;~Ping-Chun_Hsieh1;~Tzu-Mao_Li1;~Yu-Shuen_Wang1", "gender": "F;M;Not Specified;M", "homepage": ";https://pinghsieh.github.io/;https://cseweb.ucsd.edu/~tzli/;https://people.cs.nycu.edu.tw/~yushuen/", "dblp": ";163/7352;122/4798;08/742", "google_scholar": ";ix38JgoAAAAJ;Y7MCOdYAAAAJ;AKeIOxIAAAAJ", "orcid": ";;;0000-0003-2550-2990", "linkedin": ";;;", "or_profile": "~Yun-Hsuan_Lien1;~Ping-Chun_Hsieh1;~Tzu-Mao_Li1;~Yu-Shuen_Wang1", "aff": "National Yang Ming Chiao Tung University;National Yang Ming Chiao Tung University;University of California, San Diego;National Yang Ming Chiao Tung University", "aff_domain": "nycu.edu.tw;nycu.edu.tw;ucsd.edu;cs.nycu.edu.tw", "position": "PhD student;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nlien2024enhancing,\ntitle={Enhancing Value Function Estimation through First-Order State-Action Dynamics in Offline Reinforcement Learning},\nauthor={Yun-Hsuan Lien and Ping-Chun Hsieh and Tzu-Mao Li and Yu-Shuen Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nSGnx8lNJ6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1135650, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3722914467393372163&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "nycu.edu.tw;nycu.edu.tw;ucsd.edu;cs.nycu.edu.tw", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "National Yang Ming Chiao Tung University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.nycu.edu.tw;https://www.ucsd.edu", "aff_unique_abbr": "NYCU;UCSD", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Taiwan;San Diego", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Predictive Coding beyond Correlations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33121", "id": "nTgzmXvuEA", "proceeding": "https://proceedings.mlr.press/v235/salvatori24a.html", "pdf": "https://openreview.net/pdf?id=nTgzmXvuEA", "openreview": "https://openreview.net/forum?id=nTgzmXvuEA", "author_site": "Tommaso Salvatori, Luca Pinchetti, Amine M'Charrak, Beren Millidge, Thomas Lukasiewicz", "tldr": "", "abstract": "Biologically plausible learning algorithms offer a promising alternative to traditional deep learning techniques, especially in overcoming the limitations of backpropagation in fast and low-energy neuromorphic implementations. To this end, there has been extensive research into understanding their capabilities. In this work, we show how one such algorithm, called predictive coding, is able to perform causal inference tasks. First, we show how a simple change in the inference process of predictive coding enables the computation of interventions without the need to mutilate or redefine a causal graph. Then, we explore applications in cases where the graph is unknown and has to be inferred from observational data. 
Empirically, we show how such findings can be used to improve the performance of predictive coding in image classification tasks, and conclude that such models are naturally able to perform causal inference tasks using a biologically plausible kind of message passing.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tommaso Salvatori;Luca Pinchetti;Amine M'Charrak;Beren Millidge;Thomas Lukasiewicz", "authorids": "~Tommaso_Salvatori1;~Luca_Pinchetti1;~Amine_M'Charrak1;~Beren_Millidge1;~Thomas_Lukasiewicz2", "gender": "M;M;;M;", "homepage": "https://www.cs.ox.ac.uk/people/tommaso.salvatori/;;;http://beren.io/;https://www.cs.ox.ac.uk/people/thomas.lukasiewicz/", "dblp": "270/2016;;330/3059;244/9967;l/ThomasLukasiewicz", "google_scholar": "https://scholar.google.com/citations?hl=en;;VLGACv8AAAAJ;3GGkFTkAAAAJ;arjucpEAAAAJ", "orcid": ";;;;", "linkedin": ";luca-pinchetti-414230222/;mcharrak/;beren-millidge-377065142/;", "or_profile": "~Tommaso_Salvatori1;~Luca_Pinchetti1;~Amine_M'Charrak1;~Beren_Millidge1;~Thomas_Lukasiewicz2", "aff": "VERSES;Department of Computer Science, University of Oxford;University of Oxford;;Department of Computer Science, University of Oxford", "aff_domain": "verses.ai;cs.ox.ac.uk;cs.ox.ac.uk;;cs.ox.ac.uk", "position": "Researcher;PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nsalvatori2024predictive,\ntitle={Predictive Coding beyond Correlations},\nauthor={Tommaso Salvatori and Luca Pinchetti and Amine M'Charrak and Beren Millidge and Thomas Lukasiewicz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nTgzmXvuEA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4695079, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17705514576379309609&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "verses.ai;cs.ox.ac.uk;cs.ox.ac.uk;;cs.ox.ac.uk", "author_num": 5, "aff_unique_index": "1;1;1", "aff_unique_norm": ";University of Oxford", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": ";https://www.ox.ac.uk", "aff_unique_abbr": ";Oxford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "1;1;1", "aff_country_unique": ";United Kingdom" }, { "title": "STEER: Assessing the Economic Rationality of Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33120", "id": "nU1mtFDtMX", "proceeding": "https://proceedings.mlr.press/v235/raman24b.html", "pdf": "https://openreview.net/pdf?id=nU1mtFDtMX", "openreview": "https://openreview.net/forum?id=nU1mtFDtMX", "author_site": "Narun Raman, Taylor Lundy, Samuel Joseph Amouyal, Yoav Levine, Kevin Leyton-Brown, Moshe Tennenholtz", "tldr": "", "abstract": "There is increasing interest in using LLMs as decision-making \"agents\". Doing so includes many degrees of freedom: which model should be used; how should it be prompted; should it be asked to introspect, conduct chain-of-thought reasoning, etc? Settling these questions---and more broadly, determining whether an LLM agent is reliable enough to be trusted---requires a methodology for assessing such an agent's economic rationality. In this paper, we provide one. 
We begin by surveying the economic literature on rational decision making, taxonomizing a large set of fine-grained \"elements\" that an agent should exhibit, along with dependencies between them. We then propose a benchmark distribution that quantitatively scores an LLM's performance on these elements and, combined with a user-provided rubric, produces a \"rationality report card\". Finally, we describe the results of a large-scale empirical experiment with 14 different LLMs, characterizing both the current state of the art and the impact of different model sizes on models' ability to exhibit rational behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Narun Krishnamurthi Raman;Taylor Lundy;Samuel Joseph Amouyal;Yoav Levine;Kevin Leyton-Brown;Moshe Tennenholtz", "authorids": "~Narun_Krishnamurthi_Raman1;~Taylor_Lundy1;~Samuel_Joseph_Amouyal1;~Yoav_Levine1;~Kevin_Leyton-Brown1;~Moshe_Tennenholtz1", "gender": "M;M;M;M;Not Specified;", "homepage": "https://narunraman.com;https://cs.ubc.ca/~tlundy;https://samsam3232.github.io/;;http://cs.ubc.ca/~kevinlb;http://moshet.net.technion.ac.il", "dblp": ";243/2600;321/1032;199/1895;81/1149;", "google_scholar": "SEWbKagAAAAJ;;https://scholar.google.com/citations?hl=en;;_4dnp0IAAAAJ;", "orcid": ";;;;0000-0002-7644-5327;", "linkedin": ";taylor-lundy-8b915418b/;;;kevinleytonbrown/;", "or_profile": "~Narun_Krishnamurthi_Raman1;~Taylor_Lundy1;~Samuel_Joseph_Amouyal1;~Yoav_Levine1;~Kevin_Leyton-Brown1;~Moshe_Tennenholtz1", "aff": "University of British Columbia;University of British Columbia;School of Computer Science, Tel Aviv University;;University of British Columbia;Technion - Israel Institute of Technology, Technion", "aff_domain": "ubc.ca;ubc.ca;cs.tau.ac.il;;ubc.ca;technion.ac.il", "position": "PhD student;PhD student;PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nraman2024steer,\ntitle={{STEER}: Assessing the Economic Rationality of Large Language Models},\nauthor={Narun Krishnamurthi Raman and Taylor Lundy and Samuel Joseph Amouyal and Yoav Levine and Kevin Leyton-Brown and Moshe Tennenholtz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nU1mtFDtMX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 750059, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16679935592718605374&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ubc.ca;ubc.ca;cs.tau.ac.il;;ubc.ca;technion.ac.il", "author_num": 6, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of British Columbia;Tel Aviv University;Technion - Israel Institute of Technology", "aff_unique_dep": ";School of Computer Science;", "aff_unique_url": "https://www.ubc.ca;https://www.tau.ac.il;https://www.technion.ac.il", "aff_unique_abbr": "UBC;TAU;Technion", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tel Aviv", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "Canada;Israel" }, { "title": "Double-Step Alternating Extragradient with Increasing Timescale Separation for Finding Local Minimax Points: Provable Improvements", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33119", "id": "nUVForc3VP", "proceeding": "https://proceedings.mlr.press/v235/kim24m.html", "pdf": "https://openreview.net/pdf?id=nUVForc3VP", "openreview": "https://openreview.net/forum?id=nUVForc3VP", 
"author_site": "Kyuwon Kim, Donghwan Kim", "tldr": "", "abstract": "In nonconvex-nonconcave minimax optimization, two-timescale gradient methods have shown their potential to find local minimax (optimal) points, provided that the timescale separation between the min and the max player is sufficiently large. However, existing two-timescale variants of gradient descent ascent and extragradient methods face two shortcomings, especially when we search for non-strict local minimax points that are prevalent in modern overparameterized setting. In specific, (1) these methods can be unstable at some non-strict local minimax points even with sufficiently large timescale separation, and even (2) computing a proper amount of timescale separation is infeasible in practice. To remedy these two issues, we propose to incorporate two simple but provably effective schemes, double-step alternating update and increasing timescale separation, into the two-timescale extragradient method, respectively. Under mild conditions, we show that the proposed methods converge to non-strict local minimax points that all existing two-timescale methods fail to converge.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyuwon Kim;Donghwan Kim", "authorids": "~Kyuwon_Kim1;~Donghwan_Kim2", "gender": "M;M", "homepage": "https://kaist-kyuwonkim.github.io/;http://mathsci.kaist.ac.kr/~donghwankim/", "dblp": "119/9250;05/1032", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": "0009-0002-6967-9907;", "linkedin": ";", "or_profile": "~Kyuwon_Kim1;~Donghwan_Kim2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.edu;kaist.ac.kr", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkim2024doublestep,\ntitle={Double-Step Alternating Extragradient with Increasing Timescale Separation for Finding Local Minimax Points: Provable Improvements},\nauthor={Kyuwon Kim and Donghwan Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nUVForc3VP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 641167, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1302060135590329684&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "kaist.edu;kaist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "HAMLET: Graph Transformer Neural Operator for Partial Differential Equations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33118", "id": "nYX7I6PsL7", "proceeding": "https://proceedings.mlr.press/v235/bryutkin24a.html", "pdf": "https://openreview.net/pdf?id=nYX7I6PsL7", "openreview": "https://openreview.net/forum?id=nYX7I6PsL7", "author_site": "Andrey Bryutkin, Jiahao Huang, Zhongying Deng, Guang Yang, Carola-Bibiane Sch\u00f6nlieb, Angelica I Aviles-Rivero", "tldr": "", "abstract": "We present a novel graph transformer framework, HAMLET, designed to address the challenges in solving partial differential equations (PDEs) using neural networks. 
The framework uses graph transformers with modular input encoders to directly incorporate differential equation information into the solution process. This modularity enhances parameter correspondence control, making HAMLET adaptable to PDEs of arbitrary geometries and varied input formats. Notably, HAMLET scales effectively with increasing data complexity and noise, showcasing its robustness. HAMLET is not just tailored to a single type of physical simulation, but can be applied across various domains. Moreover, it boosts model resilience and performance, especially in scenarios with limited data. We demonstrate, through extensive experiments, that our framework is capable of outperforming current techniques for PDEs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrey Bryutkin;Jiahao Huang;Zhongying Deng;Guang Yang;Carola-Bibiane Sch\u00f6nlieb;Angelica I Aviles-Rivero", "authorids": "~Andrey_Bryutkin1;~Jiahao_Huang1;~Zhongying_Deng1;~Guang_Yang1;~Carola-Bibiane_Sch\u00f6nlieb1;~Angelica_I_Aviles-Rivero1", "gender": "M;;M;;F;F", "homepage": "https://andreybryutkin.netlify.app/about;;;https://www.yanglab.fyi/;http://www.damtp.cam.ac.uk/research/cia/;https://angelicaiaviles.wordpress.com/", "dblp": ";;241/0938;25/5712-6;07/8184;138/9507", "google_scholar": ";;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.co.uk/citations?user=ZfzEFpsAAAAJ;nPeOXjwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": "andrey-bryutkin-03462b194/;;;guang-yang-ab1173135/;;", "or_profile": "~Andrey_Bryutkin1;~Jiahao_Huang1;~Zhongying_Deng1;~Guang_Yang1;~Carola-Bibiane_Sch\u00f6nlieb1;~Angelica_I_Aviles-Rivero1", "aff": "Massachusetts Institute of Technology;;University of Cambridge;King's College London, University of London;University of Cambridge;University of Cambridge", "aff_domain": "mit.edu;;cam.ac.uk;kcl.ac.uk;cam.ac.uk;cam.ac.uk", "position": "PhD student;;Postdoc;Honorary Senior Lecturer;Full Professor;Senior Research Associate", "bibtex": "@inproceedings{\nbryutkin2024hamlet,\ntitle={{HAMLET}: Graph Transformer Neural Operator for Partial Differential Equations},\nauthor={Andrey Bryutkin and Jiahao Huang and Zhongying Deng and Guang Yang and Carola-Bibiane Sch{\\\"o}nlieb and Angelica I Aviles-Rivero},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nYX7I6PsL7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2609114, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13763692487967760060&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mit.edu;;cam.ac.uk;kcl.ac.uk;cam.ac.uk;cam.ac.uk", "author_num": 6, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;University of Cambridge;King's College London", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.cam.ac.uk;https://www.kcl.ac.uk", "aff_unique_abbr": "MIT;Cambridge;KCL", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "video-SALMONN: Speech-Enhanced Audio-Visual Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33117", "id": "nYsh5GFIqX", "proceeding": "https://proceedings.mlr.press/v235/sun24l.html", "pdf": 
"https://openreview.net/pdf?id=nYsh5GFIqX", "openreview": "https://openreview.net/forum?id=nYsh5GFIqX", "author_site": "Guangzhi Sun, Wenyi Yu, Changli Tang, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, Zejun MA, Yuxuan Wang, Chao Zhang", "tldr": "", "abstract": "Speech understanding as an element of the more generic video understanding using audio-visual large language models (av-LLMs) is a crucial yet understudied aspect. This paper proposes video-SALMONN, a single end-to-end av-LLM for video processing, which can understand not only visual frame sequences, audio events and music, but speech as well. To obtain fine-grained temporal information required by speech understanding, while keeping efficient for other video elements, this paper proposes a novel multi-resolution causal Q-Former (MRC Q-Former) structure to connect pre-trained audio-visual encoders and the backbone large language model. Moreover, dedicated training approaches including the diversity loss and the unpaired audio-visual mixed training scheme are proposed to avoid frames or modality dominance. On the introduced audio-visual evaluation benchmark, video-SALMONN achieves more than 25% absolute accuracy improvements on the video-QA task and over 30% absolute accuracy improvements on audio-visual QA tasks with human speech. In addition, video-SALMONN demonstrates remarkable video comprehension and reasoning abilities on tasks that are unprecedented by other av-LLMs. Our training code and model checkpoints are available at https://github.com/bytedance/SALMONN/", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guangzhi Sun;Wenyi Yu;Changli Tang;Xianzhao Chen;Tian Tan;Wei Li;Lu Lu;Zejun MA;Yuxuan Wang;Chao Zhang", "authorids": "~Guangzhi_Sun1;~Wenyi_Yu2;~Changli_Tang1;~Xianzhao_Chen1;~Tian_Tan5;~Wei_Li78;~Lu_Lu6;~Zejun_MA1;~Yuxuan_Wang1;~Chao_Zhang20", "gender": "M;M;M;;M;M;M;M;M;M", "homepage": "http://mi.eng.cam.ac.uk/\u223cgs534/;https://github.com/Yu-Doit;;http://chenxianzhao.bytedance.com;;;;;;http://mi.eng.cam.ac.uk/~cz277/", "dblp": "236/4543;;331/8719;;;;;;;94/3019-31.html", "google_scholar": "PzPAzf8AAAAJ;CGqr-V8AAAAJ;RzIjbf0AAAAJ;;ukL_E5AAAAAJ;q8ZrKVIAAAAJ;IQaR2KoAAAAJ;https://scholar.google.com/citations?hl=zh-CN;3RaOfJkAAAAJ;https://scholar.google.co.uk/citations?view_op=list_works", "orcid": ";;0000-0002-2009-3078;;;;;;;", "linkedin": "brian-sun-59746b12b/;;;;;;;zejun-ma-58614365/;;", "or_profile": "~Guangzhi_Sun1;~Wenyi_Yu2;~Changli_Tang1;~Xianzhao_Chen1;~Tian_Tan5;~Wei_Li78;~Lu_Lu6;~Zejun_MA1;~Yuxuan_Wang1;~Chao_Zhang20", "aff": "University of Cambridge;Tsinghua University;Tsinghua University;;;Bytedance;;ByteDance Inc.;ByteDance;University College London", "aff_domain": "cam.ac.uk;tsinghua.edu.cn;tsinghua.edu.cn;;;bytedance.com;;bytedance.com;bytedance.com;ucl.ac.uk", "position": "Junior Research Fellow;PhD student;Undergrad student;;;Researcher;;Principal Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nsun2024videosalmonn,\ntitle={video-{SALMONN}: Speech-Enhanced Audio-Visual Large Language Models},\nauthor={Guangzhi Sun and Wenyi Yu and Changli Tang and Xianzhao Chen and Tian Tan and Wei Li and Lu Lu and Zejun MA and Yuxuan Wang and Chao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nYsh5GFIqX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9508653, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 29, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4515344905201949029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cam.ac.uk;tsinghua.edu.cn;tsinghua.edu.cn;;;bytedance.com;;bytedance.com;bytedance.com;ucl.ac.uk", "author_num": 10, "aff_unique_index": "0;1;1;2;2;2;3", "aff_unique_norm": "University of Cambridge;Tsinghua University;ByteDance;University College London", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cam.ac.uk;https://www.tsinghua.edu.cn;https://www.bytedance.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Cambridge;THU;Bytedance;UCL", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;1;1;1;1;0", "aff_country_unique": "United Kingdom;China" }, { "title": "A Dynamical Model of Neural Scaling Laws", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33116", "id": "nbOY1OmtRc", "proceeding": "https://proceedings.mlr.press/v235/bordelon24a.html", "pdf": "https://openreview.net/pdf?id=nbOY1OmtRc", "openreview": "https://openreview.net/forum?id=nbOY1OmtRc", "author_site": "Blake Bordelon, Alexander Atanasov, Cengiz Pehlevan", "tldr": "", "abstract": "On a variety of tasks, the performance of neural networks predictably improves with training time, dataset size and model size across many orders of magnitude. This phenomenon is known as a neural scaling law. Of fundamental importance is the compute-optimal scaling law, which reports the performance as a function of units of compute when choosing model sizes optimally. We analyze a random feature model trained with gradient descent as a solvable model of network training and generalization. This reproduces many observations about neural scaling laws. First, our model makes a prediction about why the scaling of performance with training time and with model size have different power law exponents. Consequently, the theory predicts an asymmetric compute-optimal scaling rule where the number of training steps are increased faster than model parameters, consistent with recent empirical observations. Second, it has been observed that early in training, networks converge to their infinite-width dynamics at a rate $1/\\text{width}$ but at late time exhibit a rate $\\text{width}^{-c}$, where $c$ depends on the structure of the architecture and task. We show that our model exhibits this behavior. 
Lastly, our theory shows how the gap between training and test loss can gradually build up over time due to repeated reuse of data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Blake Bordelon;Alexander Atanasov;Cengiz Pehlevan", "authorids": "~Blake_Bordelon1;~Alexander_Atanasov1;~Cengiz_Pehlevan2", "gender": "M;M;", "homepage": "https://blakebordelon.github.io/;http://abatanasov.com/;https://pehlevan.seas.harvard.edu/", "dblp": "228/6993;305/3785.html;145/3480", "google_scholar": "yeQ8_pgAAAAJ;abMQRYIAAAAJ;veDLTPEAAAAJ", "orcid": "0000-0003-0455-9445;0000-0002-3338-0324;0000-0001-9767-6063", "linkedin": ";alexatanasov/;", "or_profile": "~Blake_Bordelon1;~Alexander_Atanasov1;~Cengiz_Pehlevan2", "aff": "Harvard University;Harvard University;School of Engineering and Applied Sciences, Harvard University", "aff_domain": "harvard.edu;harvard.edu;seas.harvard.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbordelon2024a,\ntitle={A Dynamical Model of Neural Scaling Laws},\nauthor={Blake Bordelon and Alexander Atanasov and Cengiz Pehlevan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nbOY1OmtRc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1934482, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3614692862228956880&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "harvard.edu;harvard.edu;seas.harvard.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Conformalized Adaptive Forecasting of Heterogeneous Trajectories", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33115", "id": "nbpwNmXTTw", "proceeding": "https://proceedings.mlr.press/v235/zhou24l.html", "pdf": "https://openreview.net/pdf?id=nbpwNmXTTw", "openreview": "https://openreview.net/forum?id=nbpwNmXTTw", "author_site": "Yanfei Zhou, Lars Lindemann, Matteo Sesia", "tldr": "", "abstract": "This paper presents a new conformal method for generating *simultaneous* forecasting bands guaranteed to cover the *entire path* of a new random trajectory with sufficiently high probability. Prompted by the need for dependable uncertainty estimates in motion planning applications where the behavior of diverse objects may be more or less unpredictable, we blend different techniques from online conformal prediction of single and multiple time series, as well as ideas for addressing heteroscedasticity in regression. 
This solution is both principled, providing precise finite-sample guarantees, and effective, often leading to more informative predictions than prior methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanfei Zhou;Lars Lindemann;Matteo Sesia", "authorids": "~Yanfei_Zhou1;~Lars_Lindemann1;~Matteo_Sesia1", "gender": "F;M;", "homepage": ";https://sites.google.com/view/larslindemann/main-page;https://msesia.github.io/", "dblp": ";;280/1260", "google_scholar": "YJ5oVF4AAAAJ;AkVKyzkAAAAJ;qFtP1MQAAAAJ", "orcid": ";;0000-0001-9046-907X", "linkedin": "yanfei-zhou-25547a11a/;;matteo-sesia", "or_profile": "~Yanfei_Zhou1;~Lars_Lindemann1;~Matteo_Sesia1", "aff": "University of Southern California;University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu;usc.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhou2024conformalized,\ntitle={Conformalized Adaptive Forecasting of Heterogeneous Trajectories},\nauthor={Yanfei Zhou and Lars Lindemann and Matteo Sesia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nbpwNmXTTw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 773675, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2862020093647806775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "usc.edu;usc.edu;usc.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Considerations for Differentially Private Learning with Large-Scale Public Pretraining", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33114", "id": "ncjhi4qAPV", "proceeding": "https://proceedings.mlr.press/v235/tramer24a.html", "pdf": "https://openreview.net/pdf?id=ncjhi4qAPV", "openreview": "https://openreview.net/forum?id=ncjhi4qAPV", "author_site": "Florian Tramer, Gautam Kamath, Nicholas Carlini", "tldr": "", "abstract": "The performance of differentially private machine learning can be boosted significantly by leveraging the transfer learning capabilities of non-private models pretrained on large *public* datasets. We critically review this approach. We primarily question whether the use of large Web-scraped datasets *should* be viewed as differential-privacy-preserving. We further scrutinize whether existing machine learning benchmarks are appropriate for measuring the ability of pretrained models to generalize to sensitive domains. 
Finally, we observe that reliance on large pretrained models may come at the cost of *other* forms of privacy, requiring data to be outsourced to a more compute-powerful third party.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Florian Tram\u00e8r;Gautam Kamath;Nicholas Carlini", "authorids": "~Florian_Tram\u00e8r1;~Gautam_Kamath1;~Nicholas_Carlini1", "gender": "M;;M", "homepage": "http://www.gautamkamath.com/;http://nicholas.carlini.com;http://floriantramer.com", "dblp": "73/11140;145/1806;158/7224", "google_scholar": "MK6zHkYAAAAJ;;https://scholar.google.ch/citations?user=ijH0-a8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Gautam_Kamath1;~Nicholas_Carlini1;~Florian_Tramer1", "aff": "University of Waterloo;Google;ETHZ - ETH Zurich", "aff_domain": "uwaterloo.ca;google.com;ethz.ch", "position": "Assistant Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\ntram{\\`e}r2024position,\ntitle={Position: Considerations for Differentially Private Learning with Large-Scale Public Pretraining},\nauthor={Florian Tram{\\`e}r and Gautam Kamath and Nicholas Carlini},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ncjhi4qAPV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 282389, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14296547589243527527&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uwaterloo.ca;google.com;ethz.ch", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Waterloo;Google;ETH Zurich", "aff_unique_dep": ";Google;", "aff_unique_url": "https://uwaterloo.ca;https://www.google.com;https://www.ethz.ch", "aff_unique_abbr": "UW;Google;ETHZ", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;United States;Switzerland" }, { "title": "Graph-based Time Series Clustering for End-to-End Hierarchical Forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33113", "id": "nd47Za5jk5", "proceeding": "https://proceedings.mlr.press/v235/cini24a.html", "pdf": "https://openreview.net/pdf?id=nd47Za5jk5", "openreview": "https://openreview.net/forum?id=nd47Za5jk5", "author_site": "Andrea Cini, Danilo Mandic, Cesare Alippi", "tldr": "", "abstract": "Relationships among time series can be exploited as inductive biases in learning effective forecasting models. In hierarchical time series, relationships among subsets of sequences induce hard constraints (hierarchical inductive biases) on the predicted values. In this paper, we propose a graph-based methodology to unify relational and hierarchical inductive biases in the context of deep learning for time series forecasting. In particular, we model both types of relationships as dependencies in a pyramidal graph structure, with each pyramidal layer corresponding to a level of the hierarchy. By exploiting modern - trainable - graph pooling operators we show that the hierarchical structure, if not available as a prior, can be learned directly from data, thus obtaining cluster assignments aligned with the forecasting objective. A differentiable reconciliation stage is incorporated into the processing architecture, allowing hierarchical constraints to act both as an architectural bias and as a regularization element for predictions. 
Simulation results on representative datasets show that the proposed method compares favorably against the state of the art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andrea Cini;Danilo Mandic;Cesare Alippi", "authorids": "~Andrea_Cini1;~Danilo_Mandic1;~Cesare_Alippi1", "gender": "M;;M", "homepage": "https://andreacini.github.io/;http://www.commsp.ee.ic.ac.uk/~mandic;https://alippi.faculty.polimi.it/", "dblp": "249/8223;;84/6337", "google_scholar": "bQI2UIUAAAAJ;https://scholar.google.co.uk/citations?user=hcxWZkcAAAAJ;SCZObbIAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Andrea_Cini1;~Danilo_Mandic1;~Cesare_Alippi1", "aff": "Universita della Svizzera Italiana;Imperial College London;Politecnico di Milano", "aff_domain": "usi.ch;imperial.ac.uk;polimi.it", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\ncini2024graphbased,\ntitle={Graph-based Time Series Clustering for End-to-End Hierarchical Forecasting},\nauthor={Andrea Cini and Danilo Mandic and Cesare Alippi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nd47Za5jk5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 944789, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15252588894831532114&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "usi.ch;imperial.ac.uk;polimi.it", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universita della Svizzera Italiana;Imperial College London;Politecnico di Milano", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usi.ch;https://www.imperial.ac.uk;https://www.polimi.it", "aff_unique_abbr": "USI;ICL;Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Switzerland;United Kingdom;Italy" }, { "title": "Bringing Motion Taxonomies to Continuous Domains via GPLVM on Hyperbolic manifolds", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33112", "id": "ndVXXmxSC5", "proceeding": "https://proceedings.mlr.press/v235/jaquier24a.html", "pdf": "https://openreview.net/pdf?id=ndVXXmxSC5", "openreview": "https://openreview.net/forum?id=ndVXXmxSC5", "author_site": "No\u00e9mie Jaquier, Leonel Rozo, Miguel Gonz\u00e1lez-Duque, Slava Borovitskiy, Tamim Asfour", "tldr": "", "abstract": "Human motion taxonomies serve as high-level hierarchical abstractions that classify how humans move and interact with their environment. They have proven useful to analyse grasps, manipulation skills, and whole-body support poses. Despite substantial efforts devoted to designing their hierarchy and underlying categories, their use remains limited. This may be attributed to the lack of computational models that fill the gap between the discrete hierarchical structure of the taxonomy and the high-dimensional heterogeneous data associated with its categories. To overcome this problem, we propose to model taxonomy data via hyperbolic embeddings that capture the associated hierarchical structure. We achieve this by formulating a novel Gaussian process hyperbolic latent variable model that incorporates the taxonomy structure through graph-based priors on the latent space and distance-preserving back constraints. 
We validate our model on three different human motion taxonomies to learn hyperbolic embeddings that faithfully preserve the original graph structure. We show that our model properly encodes unseen data from existing or new taxonomy categories, and outperforms its Euclidean and VAE-based counterparts. Finally, through proof-of-concept experiments, we show that our model may be used to generate realistic trajectories between the learned embeddings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "No\u00e9mie Jaquier;Leonel Rozo;Miguel Gonz\u00e1lez-Duque;Viacheslav Borovitskiy;Tamim Asfour", "authorids": "~No\u00e9mie_Jaquier1;~Leonel_Rozo1;~Miguel_Gonz\u00e1lez-Duque3;~Viacheslav_Borovitskiy1;~Tamim_Asfour1", "gender": "M;M;M;M;F", "homepage": "https://vab.im/;http://www.humanoids.kit.edu/;https://leonelrozo.weebly.com/;https://www.miguelgondu.com/;https://njaquier.ch/", "dblp": "259/3201;34/6686.html;10/9515;244/9609.html;", "google_scholar": "https://scholar.google.ru/citations?user=1KqNyNMAAAAJ;https://scholar.google.de/citations?user=65bIT4oAAAAJ;https://scholar.google.it/citations?user=vLWgi-YAAAAJ;eje0FAYAAAAJ;j3rJXU4AAAAJ", "orcid": ";;0000-0001-5970-9135;;", "linkedin": ";;leonelrozo/;;", "or_profile": "~Viacheslav_Borovitskiy1;~Tamim_Asfour1;~Leonel_Dario_Rozo1;~Miguel_Gonz\u00e1lez_Duque1;~Noemie_Jaquier1", "aff": "ETHZ - ETH Zurich;Karlsruhe Institute of Technology;Robert Bosch GmbH, Bosch;Copenhagen University;Karlsruhe Institute of Technology", "aff_domain": "ethz.ch;kit.edu;de.bosch.com;ku.dk;kit.edu", "position": "Postdoc;Full Professor;Principal Researcher;Researcher;Postdoc", "bibtex": "@inproceedings{\njaquier2024bringing,\ntitle={Bringing Motion Taxonomies to Continuous Domains via {GPLVM} on Hyperbolic manifolds},\nauthor={No{\\'e}mie Jaquier and Leonel Rozo and Miguel Gonz{\\'a}lez-Duque and Viacheslav Borovitskiy and Tamim Asfour},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ndVXXmxSC5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5934591, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8005460436707093323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ethz.ch;kit.edu;de.bosch.com;ku.dk;kit.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "ETH Zurich;Karlsruhe Institute of Technology;Robert Bosch GmbH;University of Copenhagen", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ethz.ch;https://www.kit.edu;https://www.bosch.com;https://www.ku.dk", "aff_unique_abbr": "ETHZ;KIT;Bosch;UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;1", "aff_country_unique": "Switzerland;Germany;Denmark" }, { "title": "RLVF: Learning from Verbal Feedback without Overgeneralization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33110", "id": "ngcZhfXCBW", "proceeding": "https://proceedings.mlr.press/v235/stephan24a.html", "pdf": "https://openreview.net/pdf?id=ngcZhfXCBW", "openreview": "https://openreview.net/forum?id=ngcZhfXCBW", "author_site": "Moritz Stephan, Alexander Khazatsky, Eric Mitchell, Annie Chen, Sheryl Hsu, Archit Sharma, Chelsea Finn", "tldr": "", "abstract": "The diversity of contexts in which large language models (LLMs) are deployed requires the ability to modify or customize default model behaviors to incorporate 
nuanced requirements and preferences. A convenient interface to specify such model adjustments is high-level verbal feedback, such as \u201cDon\u2019t use emojis when drafting emails to my boss.\u201d However, while writing high-level feedback is far simpler than collecting annotations for reinforcement learning from human feedback (RLHF), we find that simply prompting a model with such feedback leads to $\\textbf{overgeneralization}$\u2013applying feedback in contexts where it is not relevant. We propose a new method Contextualized Critiques with Constrained Preference Optimization (C3PO) to learn from high-level verbal feedback while reducing overgeneralization compared to current work. C3PO uses a piece of high-level feedback to generate a small synthetic preference dataset to specify when and how the feedback should (and should not) be applied. It then fine-tunes the model in accordance with the synthetic preference data while minimizing the divergence from the original model for prompts where the feedback does not apply. Our experimental results indicate that our approach effectively applies verbal feedback to relevant scenarios while preserving existing behaviors for other contexts more than current methods. For both human- and GPT-4-generated high-level feedback, C3PO effectively adheres to the given feedback comparably to in-context baselines while reducing overgeneralization by 30%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Moritz Pascal Stephan;Alexander Khazatsky;Eric Mitchell;Annie S Chen;Sheryl Hsu;Archit Sharma;Chelsea Finn", "authorids": "~Moritz_Pascal_Stephan1;~Alexander_Khazatsky1;~Eric_Mitchell1;~Annie_S_Chen1;~Sheryl_Hsu1;~Archit_Sharma1;~Chelsea_Finn1", "gender": "M;M;M;F;F;M;F", "homepage": ";https://www.linkedin.com/in/alexander-khazatsky-b98841149/;https://ericmitchell.ai;https://anniesch.github.io/;https://www.linkedin.com/in/sheryl-hsu-83b84a183/;;https://ai.stanford.edu/~cbfinn/", "dblp": ";;238/0419;277/1527.html;;220/3163.html;131/1783", "google_scholar": ";;q77J4fgAAAAJ;;;_0IIzxgAAAAJ;vfPE6hgAAAAJ", "orcid": ";;0000-0002-7487-1744;;;;", "linkedin": "moritz-stephan/;;;annie-s-chen/;;;", "or_profile": "~Moritz_Pascal_Stephan1;~Alexander_Khazatsky1;~Eric_Mitchell1;~Annie_S_Chen1;~Sheryl_Hsu1;~Archit_Sharma1;~Chelsea_Finn1", "aff": "Stanford University;University of California, Berkeley;Stanford University;Stanford University;Stanford University;Stanford University;Google", "aff_domain": "stanford.edu;berkeley.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu;google.com", "position": "Undergrad student;Undergraduate Researcher;PhD student;PhD student;Undergrad student;Graduate Student;Research Scientist", "bibtex": "@inproceedings{\nstephan2024rlvf,\ntitle={{RLVF}: Learning from Verbal Feedback without Overgeneralization},\nauthor={Moritz Pascal Stephan and Alexander Khazatsky and Eric Mitchell and Annie S Chen and Sheryl Hsu and Archit Sharma and Chelsea Finn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ngcZhfXCBW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2259084, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15785229105104914043&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "stanford.edu;berkeley.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu;google.com", "author_num": 7, "aff_unique_index": 
"0;1;0;0;0;0;2", "aff_unique_norm": "Stanford University;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "Stanford;UC Berkeley;Google", "aff_campus_unique_index": "0;1;0;0;0;0;2", "aff_campus_unique": "Stanford;Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Momentum Particle Maximum Likelihood", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33109", "id": "ngjmcfowtc", "proceeding": "https://proceedings.mlr.press/v235/lim24b.html", "pdf": "https://openreview.net/pdf?id=ngjmcfowtc", "openreview": "https://openreview.net/forum?id=ngjmcfowtc", "author_site": "Jen Ning Lim, Juan Kuntz, Samuel Power, Adam M. Johansen", "tldr": "", "abstract": "Maximum likelihood estimation (MLE) of latent variable models is often recast as the minimization of a free energy functional over an extended space of parameters and probability distributions. This perspective was recently combined with insights from optimal transport to obtain novel particle-based algorithms for fitting latent variable models to data. Drawing inspiration from prior works which interpret `momentum-enriched' optimization algorithms as discretizations of ordinary differential equations, we propose an analogous dynamical-systems-inspired approach to minimizing the free energy functional. The result is a dynamical system that blends elements of Nesterov's Accelerated Gradient method, the underdamped Langevin diffusion, and particle methods. Under suitable assumptions, we prove that the continuous-time system minimizes the functional. By discretizing the system, we obtain a practical algorithm for MLE in latent variable models. 
The algorithm outperforms existing particle methods in numerical experiments and compares favourably with other MLE algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jen Ning Lim;Juan Kuntz;Samuel Power;Adam Michael Johansen", "authorids": "~Jen_Ning_Lim1;~Juan_Kuntz1;~Samuel_Power1;~Adam_Michael_Johansen1", "gender": ";M;M;M", "homepage": ";https://juankuntz.github.io/;https://sites.google.com/view/sp-monte-carlo/;https://go.warwick.ac.uk/amjohansen", "dblp": "250/9539;193/9597;;43/3875", "google_scholar": "Uryp_N8AAAAJ;https://scholar.google.co.uk/citations?user=ndrKbVoAAAAJ;ePQTKrEAAAAJ;https://scholar.google.co.uk/citations?user=KOaq7EEAAAAJ", "orcid": ";0000-0002-5855-6074;0000-0001-8644-8014;0000-0002-3531-7628", "linkedin": ";;samuel-power-6308b02b/;adam-johansen-6b71154/", "or_profile": "~Jen_Ning_Lim1;~Juan_Kuntz1;~Samuel_Power1;~Adam_Michael_Johansen1", "aff": "The University of Warwick;Polygeist;University of Bristol;University of Warwick", "aff_domain": "warwick.ac.uk;polygei.st;bristol.ac.uk;warwick.ac.uk", "position": "PhD student;Researcher;Postdoc;Full Professor", "bibtex": "@inproceedings{\nlim2024momentum,\ntitle={Momentum Particle Maximum Likelihood},\nauthor={Jen Ning Lim and Juan Kuntz and Samuel Power and Adam Michael Johansen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ngjmcfowtc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2877393, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13223908394080045354&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "warwick.ac.uk;polygei.st;bristol.ac.uk;warwick.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Warwick;Polygeist;University of Bristol", "aff_unique_dep": ";;", "aff_unique_url": "https://warwick.ac.uk;;https://www.bristol.ac.uk", "aff_unique_abbr": "Warwick;;Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom;" }, { "title": "Efficient Exploration in Average-Reward Constrained Reinforcement Learning: Achieving Near-Optimal Regret With Posterior Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33108", "id": "njpTpkvUbO", "proceeding": "https://proceedings.mlr.press/v235/provodin24a.html", "pdf": "https://openreview.net/pdf?id=njpTpkvUbO", "openreview": "https://openreview.net/forum?id=njpTpkvUbO", "author_site": "Danil Provodin, Maurits Kaptein, Mykola Pechenizkiy", "tldr": "", "abstract": "We present a new algorithm based on posterior sampling for learning in Constrained Markov Decision Processes (CMDPs) in the infinite-horizon undiscounted setting. The algorithm achieves near-optimal regret bounds while being empirically advantageous compared to existing algorithms. Our main theoretical result is a Bayesian regret bound for each cost component of $\tilde{O} (DS\sqrt{AT})$ for any communicating CMDP with $S$ states, $A$ actions, and diameter $D$. This regret bound matches the lower bound in order of time horizon $T$ and is the best-known regret bound for communicating CMDPs achieved by a computationally tractable algorithm. 
Empirical results show that our posterior sampling algorithm outperforms the existing algorithms for constrained reinforcement learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Danil Provodin;Maurits Clemens Kaptein;Mykola Pechenizkiy", "authorids": "~Danil_Provodin1;~Maurits_Clemens_Kaptein1;~Mykola_Pechenizkiy1", "gender": "M;M;M", "homepage": ";https://www.mauritskaptein.com;http://www.win.tue.nl/~mpechen/", "dblp": ";;37/4649", "google_scholar": "eh0cVQIAAAAJ;EjWTDpoAAAAJ;https://scholar.google.com.tw/citations?user=F0uFT_kAAAAJ", "orcid": "0009-0000-8986-6510;;0000-0003-4955-0743", "linkedin": "danil-provodin/;;mpechen/", "or_profile": "~Danil_Provodin1;~Maurits_Clemens_Kaptein1;~Mykola_Pechenizkiy1", "aff": "Eindhoven University of Technology;Tilburg University;Eindhoven University of Technology", "aff_domain": "tue.nl;tilburguniversity.nl;tue.nl", "position": "PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nprovodin2024efficient,\ntitle={Efficient Exploration in Average-Reward Constrained Reinforcement Learning: Achieving Near-Optimal Regret With Posterior Sampling},\nauthor={Danil Provodin and Maurits Clemens Kaptein and Mykola Pechenizkiy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=njpTpkvUbO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1005862, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nFeD14Jg7pEJ:scholar.google.com/&scioq=Efficient+Exploration+in+Average-Reward+Constrained+Reinforcement+Learning:+Achieving+Near-Optimal+Regret+With+Posterior+Sampling&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "tue.nl;tilburguniversity.nl;tue.nl", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Eindhoven University of Technology;Tilburg University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tue.nl;https://www.tilburguniversity.edu/", "aff_unique_abbr": "TU/e;Tilburg U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33107", "id": "njwv9BsGHF", "proceeding": "https://proceedings.mlr.press/v235/zhou24r.html", "pdf": "https://openreview.net/pdf?id=njwv9BsGHF", "openreview": "https://openreview.net/forum?id=njwv9BsGHF", "author_site": "Andy Zhou, Kai Yan, Michal Shlapentokh-Rothman, Haohan Wang, Yu-Xiong Wang", "tldr": "", "abstract": "While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate Monte Carlo Tree Search into LATS to enable LMs as agents, along with LM-powered value functions and self-reflections for proficient exploration and enhanced decision-making. A key feature of our approach is the incorporation of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that surpasses the constraints of existing techniques. 
Our experimental evaluation across diverse domains, including programming, interactive question-answering (QA), web navigation, and math, validates the effectiveness and generality of LATS in decision-making while maintaining competitive or improved reasoning performance. Notably, LATS achieves state-of-the-art pass@1 accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates gradient-free performance (average score of 75.9) comparable to gradient-based fine-tuning for web navigation on WebShop with GPT-3.5. Code can be found at https://github.com/lapisrocks/LanguageAgentTreeSearch", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andy Zhou;Kai Yan;Michal Shlapentokh-Rothman;Haohan Wang;Yu-Xiong Wang", "authorids": "~Andy_Zhou2;~Kai_Yan1;~Michal_Shlapentokh-Rothman1;~Haohan_Wang1;~Yu-Xiong_Wang1", "gender": "M;M;M;;F", "homepage": "https://www.andyzhou.ai;https://kaiyan289.github.io/;http://cs.cmu.edu/~haohanw;https://yxw.cs.illinois.edu/;https://michalmsr.web.illinois.edu/", "dblp": ";;132/4066;35/10700;269/4751", "google_scholar": "https://scholar.google.com/citations?hl=en;KElKfgQAAAAJ;nZxJGeUAAAAJ;T_Q-xDkAAAAJ;x9szIWsAAAAJ", "orcid": ";;;;", "linkedin": "andy-zhou-679376206/;%E5%BC%80-%E9%A2%9C-18b7931b1/;haohanwang/;;michal-shlapentokh-rothman/", "or_profile": "~Andy_Zhou2;~Kai_Yan1;~Haohan_Wang1;~Yu-Xiong_Wang1;~Michal_M_Shlapentokh-Rothman1", "aff": "Department of Computer Science;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;Department of Computer Science, University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign", "aff_domain": "cs.illinois.edu;cs.illinois.edu;illinois.edu;cs.illinois.edu;illinois.edu", "position": "Undergrad student;PhD student;Assistant Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nzhou2024language,\ntitle={Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models},\nauthor={Andy Zhou and Kai Yan and Michal Shlapentokh-Rothman and Haohan Wang and Yu-Xiong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=njwv9BsGHF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 925423, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 191, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5570756468692384789&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": "cs.illinois.edu;cs.illinois.edu;illinois.edu;cs.illinois.edu;illinois.edu", "author_num": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Unknown Institution;University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": ";https://illinois.edu", "aff_unique_abbr": ";UIUC", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "1;1;1;1", "aff_country_unique": ";United States" }, { "title": "LLM Maybe LongLM: SelfExtend LLM Context Window Without Tuning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33106", "id": "nkOMLBIiI7", "proceeding": "https://proceedings.mlr.press/v235/jin24b.html", "pdf": "https://openreview.net/pdf?id=nkOMLBIiI7", "openreview": "https://openreview.net/forum?id=nkOMLBIiI7", "author_site": "Hongye Jin, Xiaotian Han, Jingfeng Yang, Zhimeng Jiang, Zirui Liu, Chia-Yuan Chang, Huiyuan Chen, Xia Hu", "tldr": "", "abstract": "It is well known that LLMs 
cannot generalize well to long contexts whose lengths are larger than the training sequence length. This poses challenges when employing LLMs for processing long input sequences during inference. In this work, we argue that LLMs themselves have inherent capabilities to handle long contexts without fine-tuning. To achieve this goal, we propose SelfExtend to extend the context window of LLMs by constructing bi-level attention information: the grouped attention and the neighbor attention. The grouped attention captures the dependencies among tokens that are far apart, while neighbor attention captures dependencies among adjacent tokens within a specified range. The two-level attentions are computed based on the original model's self-attention mechanism during inference. With minor code modification, our SelfExtend can effortlessly extend existing LLMs' context window without any fine-tuning. We conduct comprehensive experiments on multiple benchmarks and the results show that our SelfExtend can effectively extend existing LLMs' context window length.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongye Jin;Xiaotian Han;Jingfeng Yang;Zhimeng Jiang;Zirui Liu;Chia-Yuan Chang;Huiyuan Chen;Xia Hu", "authorids": "~Hongye_Jin1;~Xiaotian_Han1;~Jingfeng_Yang2;~Zhimeng_Jiang1;~Zirui_Liu1;~Chia-Yuan_Chang3;~Huiyuan_Chen1;~Xia_Hu4", "gender": "M;M;M;M;M;Not Specified;M;M", "homepage": "https://github.com/Mooler0410;https://ahxt.github.io/;https://jingfengyang.github.io/;http://www.zhimengjiang.com/;https://zirui-ray-liu.github.io/;https://z76316.github.io/;;https://cs.rice.edu/~xh37/index.html", "dblp": "268/7929;;;217/3235;196/8629-1.html;03/1382-2.html;204/5464;256/9406.html", "google_scholar": ";Uromx98AAAAJ;hysBvrwAAAAJ;5Es3Yk4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;EO595aMAAAAJ;j3y4dJwAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";;;0000-0001-6933-3952;;0009-0001-1889-612X;0000-0002-6360-558X;", "linkedin": ";;jingfeng-yang-797864172/;;;chia-yuan-chang/;;", "or_profile": "~Hongye_Jin1;~Xiaotian_Han1;~Jingfeng_Yang2;~Zhimeng_Jiang1;~Zirui_Liu1;~Chia-Yuan_Chang3;~Huiyuan_Chen1;~Xia_Hu2", "aff": "Texas A&M;Texas A&M University;Amazon;VISA Research;Rice University;Texas A&M University - College Station;Amazon;Rice University", "aff_domain": "tamu.edu;tamu.edu;amazon.com;visa.com;rice.edu;tamu.edu;amazon.com;rice.edu", "position": "PhD student;PhD student;Researcher;Researcher;PhD student;PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\njin2024llm,\ntitle={{LLM} Maybe Long{LM}: SelfExtend {LLM} Context Window Without Tuning},\nauthor={Hongye Jin and Xiaotian Han and Jingfeng Yang and Zhimeng Jiang and Zirui Liu and Chia-Yuan Chang and Huiyuan Chen and Xia Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nkOMLBIiI7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2203388, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6384208667411400297&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "tamu.edu;tamu.edu;amazon.com;visa.com;rice.edu;tamu.edu;amazon.com;rice.edu", "author_num": 8, "aff_unique_index": "0;0;1;2;3;0;1;3", "aff_unique_norm": "Texas A&M University;Amazon;VISA;Rice University", "aff_unique_dep": ";Amazon.com, Inc.;Research;", "aff_unique_url":
"https://www.tamu.edu;https://www.amazon.com;https://www.visa.com/;https://www.rice.edu", "aff_unique_abbr": "TAMU;Amazon;VISA;Rice", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Station", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "EVEREST: Efficient Masked Video Autoencoder by Removing Redundant Spatiotemporal Tokens", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33105", "id": "nn5OPHom8t", "proceeding": "https://proceedings.mlr.press/v235/hwang24d.html", "pdf": "https://openreview.net/pdf?id=nn5OPHom8t", "openreview": "https://openreview.net/forum?id=nn5OPHom8t", "author_site": "Sunil Hwang, Jaehong Yoon, Youngwan Lee, Sung Ju Hwang", "tldr": "", "abstract": "Masked Video Autoencoder (MVA) approaches have demonstrated their potential by significantly outperforming previous video representation learning methods. However, they waste an excessive amount of computations and memory in predicting uninformative tokens/frames due to random masking strategies. (e.g., over 16 nodes with 128 NVIDIA A100 GPUs). To resolve this issue, we exploit the unequal information density among the patches in videos and propose EVEREST, a surprisingly efficient MVA approach for video representation learning that finds tokens containing rich motion features and discards uninformative ones during both pre-training and fine-tuning. We further present an information-intensive frame selection strategy that allows the model to focus on informative and causal frames with minimal redundancy. Our method significantly reduces the computation and memory requirements of MVA, enabling the pre-training and fine-tuning on a single machine with 8 GPUs while achieving comparable performance to computation- and memory-heavy baselines on multiple benchmarks and the uncurated Ego4D dataset. 
We hope that our work contributes to reducing the barrier to further research on video understanding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sunil Hwang;Jaehong Yoon;Youngwan Lee;Sung Ju Hwang", "authorids": "~Sunil_Hwang1;~Jaehong_Yoon1;~Youngwan_Lee1;~Sung_Ju_Hwang1", "gender": ";M;M;", "homepage": "https://github.com/sunilhoho;https://jaehong31.github.io/;https://youngwanlee.github.io/;", "dblp": "225/3993;203/4449;184/5625;", "google_scholar": ";-5comoUAAAAJ;EqemKYsAAAAJ;", "orcid": ";;0000-0001-8644-155X;", "linkedin": ";jaehongyoon/;youngwanlee/;", "or_profile": "~Sunil_Hwang1;~Jaehong_Yoon1;~Youngwan_Lee1;~Sung_Ju_Hwang1", "aff": "Korea Military Academy;University of North Carolina at Chapel Hill;Electronics and Telecommunication Research Institute;", "aff_domain": "kma.ac.kr;unc.edu;etri.re.kr;", "position": "Instructor;Postdoc;Researcher;", "bibtex": "@inproceedings{\nhwang2024everest,\ntitle={{EVEREST}: Efficient Masked Video Autoencoder by Removing Redundant Spatiotemporal Tokens},\nauthor={Sunil Hwang and Jaehong Yoon and Youngwan Lee and Sung Ju Hwang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nn5OPHom8t}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3998440, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12311685144230178080&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "kma.ac.kr;unc.edu;etri.re.kr;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Korea Military Academy;University of North Carolina;Electronics and Telecommunication Research Institute", "aff_unique_dep": ";;", "aff_unique_url": "http://www.kma.go.kr;https://www.unc.edu;http://www.etri.re.kr", "aff_unique_abbr": "KMA;UNC;ETRI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "Can a Few Decide for Many? The Metric Distortion of Sortition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33104", "id": "nsjfoziR5j", "proceeding": "https://proceedings.mlr.press/v235/caragiannis24a.html", "pdf": "https://openreview.net/pdf?id=nsjfoziR5j", "openreview": "https://openreview.net/forum?id=nsjfoziR5j", "author_site": "Ioannis Caragiannis, Evi Micha, Jannik Peters", "tldr": "", "abstract": "Recent works have studied the design of algorithms for selecting representative sortition panels. However, the most central question remains unaddressed: Do these panels reflect the entire population's opinion? We present a positive answer by adopting the concept of metric distortion from computational social choice, which aims to quantify how much a panel's decision aligns with the ideal decision of the population when preferences and agents lie on a metric space. We show that uniform selection needs only logarithmically many agents in terms of the number of alternatives to achieve almost optimal distortion. 
We also show that Fair Greedy Capture, a selection algorithm introduced recently by Ebadian and Micha (2024), matches uniform selection's guarantees of almost optimal distortion and also achieves constant ex-post distortion, ensuring a ``best of both worlds'' performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ioannis Caragiannis;Evi Micha;Jannik Peters", "authorids": "~Ioannis_Caragiannis1;~Evi_Micha1;~Jannik_Peters1", "gender": "M;F;M", "homepage": "https://cs.au.dk/~iannis/;https://evi-micha.github.io;https://sites.google.com/view/jannikpeters", "dblp": "c/IoannisCaragiannis;204/3011;187/9581-1", "google_scholar": "https://scholar.google.gr/citations?hl=en;;https://scholar.google.de/citations?user=fcfJZkcAAAAJ", "orcid": "0000-0002-4918-7131;;", "linkedin": "ioannis-caragiannis-293979104/;;", "or_profile": "~Ioannis_Caragiannis1;~Evi_Micha1;~Jannik_Peters1", "aff": "Aarhus University;University of Southern California;Technische Universit\u00e4t Berlin", "aff_domain": "au.dk;usc.edu;tu-berlin.de", "position": "Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\ncaragiannis2024can,\ntitle={Can a Few Decide for Many? The Metric Distortion of Sortition},\nauthor={Ioannis Caragiannis and Evi Micha and Jannik Peters},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nsjfoziR5j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 679488, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8598007654605907197&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "au.dk;usc.edu;tu-berlin.de", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Aarhus University;University of Southern California;Technische Universit\u00e4t Berlin", "aff_unique_dep": ";;", "aff_unique_url": "https://au.dk;https://www.usc.edu;https://www.tu-berlin.de", "aff_unique_abbr": "AU;USC;TU Berlin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Denmark;United States;Germany" }, { "title": "$H$-Consistency Guarantees for Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33103", "id": "nvHlHfjJPe", "proceeding": "https://proceedings.mlr.press/v235/mao24c.html", "pdf": "https://openreview.net/pdf?id=nvHlHfjJPe", "openreview": "https://openreview.net/forum?id=nvHlHfjJPe", "author_site": "Anqi Mao, Mehryar Mohri, Yutao Zhong", "tldr": "", "abstract": "We present a detailed study of $H$-consistency bounds for regression. We first present new theorems that generalize the tools previously given to establish $H$-consistency bounds. This generalization proves essential for analyzing $H$-consistency bounds specific to regression. Next, we prove a series of novel $H$-consistency bounds for surrogate loss functions of the squared loss, under the assumption of a symmetric distribution and a bounded hypothesis set. This includes positive results for the Huber loss, all $\\ell_p$ losses, $p \\geq 1$, the squared $\\epsilon$-insensitive loss, as well as a negative result for the $\\epsilon$-insensitive loss used in Support Vector Regression (SVR). We further leverage our analysis of $H$-consistency for regression and derive principled surrogate losses for adversarial regression (Section 5). 
This readily establishes novel algorithms for adversarial regression, for which we report favorable experimental results in Section 6.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anqi Mao;Mehryar Mohri;Yutao Zhong", "authorids": "~Anqi_Mao1;~Mehryar_Mohri2;~Yutao_Zhong1", "gender": "F;M;", "homepage": "https://anqi-mao.github.io;https://cs.nyu.edu/~mohri/;", "dblp": "241/6864;03/5448;51/3178-2", "google_scholar": "nkjIZ-oAAAAJ;ktwwLjsAAAAJ;", "orcid": ";;", "linkedin": ";mehryar-mohri-3737b981/;", "or_profile": "~Anqi_Mao1;~Mehryar_Mohri2;~Yutao_Zhong1", "aff": "Courant Institute of Mathematical Sciences, NYU;Google Research;Google", "aff_domain": "cims.nyu.edu;google.com;google.com", "position": "PhD student;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nmao2024hconsistency,\ntitle={\\$H\\$-Consistency Guarantees for Regression},\nauthor={Anqi Mao and Mehryar Mohri and Yutao Zhong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nvHlHfjJPe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 481892, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13806304699553808194&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cims.nyu.edu;google.com;google.com", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "New York University;Google", "aff_unique_dep": "Courant Institute of Mathematical Sciences;Google Research", "aff_unique_url": "https://www.courant.nyu.edu;https://research.google", "aff_unique_abbr": "NYU;Google Research", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "New York;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robustness of Deep Learning for Accelerated MRI: Benefits of Diverse Training Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33102", "id": "nvfZgdHtHc", "proceeding": "https://proceedings.mlr.press/v235/lin24h.html", "pdf": "https://openreview.net/pdf?id=nvfZgdHtHc", "openreview": "https://openreview.net/forum?id=nvfZgdHtHc", "author_site": "Kang Lin, Reinhard Heckel", "tldr": "", "abstract": "Deep learning based methods for image reconstruction are state-of-the-art for a variety of imaging tasks. However, neural networks often perform worse if the training data differs significantly from the data they are applied to. For example, a model trained for accelerated magnetic resonance imaging (MRI) on one scanner performs worse on another scanner. In this work, we investigate the impact of the training data on a model's performance and robustness for accelerated MRI. We find that models trained on the combination of various data distributions, such as those obtained from different MRI scanners and anatomies, exhibit robustness equal or superior to models trained on the best single distribution for a specific target distribution. Thus training on such diverse data tends to improve robustness. Furthermore, training on such a diverse dataset does not compromise in-distribution performance, i.e., a model trained on diverse data yields in-distribution performance at least as good as models trained on the more narrow individual distributions. 
Our results suggest that training a model for imaging on a variety of distributions tends to yield a more effective and robust model than maintaining separate models for individual distributions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kang Lin;Reinhard Heckel", "authorids": "~Kang_Lin1;~Reinhard_Heckel1", "gender": "M;M", "homepage": ";", "dblp": ";81/9668", "google_scholar": "https://scholar.google.com/citations?hl=en;ZWV0I7cAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Kang_Lin1;~Reinhard_Heckel1", "aff": "Technical University Munich;Rice University", "aff_domain": "tum.de;rice.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlin2024robustness,\ntitle={Robustness of Deep Learning for Accelerated {MRI}: Benefits of Diverse Training Data},\nauthor={Kang Lin and Reinhard Heckel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nvfZgdHtHc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8964162, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17534568045588487906&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tum.de;rice.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Technical University of Munich;Rice University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.rice.edu", "aff_unique_abbr": "TUM;Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;United States" }, { "title": "BRAIn: Bayesian Reward-conditioned Amortized Inference for natural language generation from feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33101", "id": "nxzXTLByXO", "proceeding": "https://proceedings.mlr.press/v235/pandey24a.html", "pdf": "https://openreview.net/pdf?id=nxzXTLByXO", "openreview": "https://openreview.net/forum?id=nxzXTLByXO", "author_site": "Gaurav Pandey, Yatin Nandwani, Tahira Naseem, Mayank Mishra, Guangxuan Xu, Dinesh Raghu, Sachindra Joshi, Asim Munawar, Ram\u00f3n Astudillo", "tldr": "", "abstract": "Distribution matching methods for language model alignment such as Generation with Distributional Control (GDC) and Distributional Policy Gradient (DPG) have not received the same level of attention in reinforcement learning from human feedback (RLHF) as contrastive methods such as Sequence Likelihood Calibration (SLiC), Direct Preference Optimization (DPO) and its variants. We identify high variance of the gradient estimate as the primary reason for the lack of success of these methods and propose a self-normalized baseline to reduce the variance. We further generalize the target distribution in DPG, GDC and DPO by using Bayes' rule to define the reward-conditioned posterior. 
The resulting approach, referred to as BRAIn (Bayesian Reward-conditioned Amortized Inference), acts as a bridge between distribution matching methods and DPO, and significantly outperforms prior art on summarization and Anthropic HH tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gaurav Pandey;Yatin Nandwani;Tahira Naseem;Mayank Mishra;Guangxuan Xu;Dinesh Raghu;Sachindra Joshi;Asim Munawar;Ram\u00f3n Fernandez Astudillo", "authorids": "~Gaurav_Pandey2;~Yatin_Nandwani1;~Tahira_Naseem1;~Mayank_Mishra1;~Guangxuan_Xu1;~Dinesh_Raghu1;~Sachindra_Joshi1;~Asim_Munawar2;~Ram\u00f3n_Fernandez_Astudillo1", "gender": "M;M;F;M;M;;M;M;M", "homepage": "https://researcher.watson.ibm.com/researcher/view.php?person=in-gpandey1;http://www.cse.iitd.ac.in/~yatin;;https://mayank31398.github.io/;;https://dineshraghu.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=in-jsachind;;https://ramon-astudillo.github.io/", "dblp": "23/3937-1;255/7046;44/642;;278/8544.html;72/11205;96/2418;;", "google_scholar": "MjYpRw8AAAAJ;https://scholar.google.com/citations?hl=en;IoVlb40AAAAJ;YsbtW6cAAAAJ;ohsEWqsAAAAJ;https://scholar.google.co.in/citations?user=kphcPUkAAAAJ;https://scholar.google.co.in/citations?user=aRo6uNEAAAAJ;;zJ4uM00AAAAJ", "orcid": ";;;;;;;;", "linkedin": "gaurav-pandey-11321120/;yatin-nandwani-0804ba9/;tahira-naseem-12066b46/;mayank31398;;;;asimmunawar/;", "or_profile": "~Gaurav_Pandey2;~Yatin_Nandwani1;~Tahira_Naseem1;~Mayank_Mishra1;~Guangxuan_Xu1;~Dinesh_Raghu1;~Sachindra_Joshi1;~Asim_Munawar2;~Ramon_Fernandez_Astudillo1", "aff": "International Business Machines;Indian Institute of Technology Delhi;IBM, International Business Machines;International Business Machines;IBM Research;IBM Research - New Delhi;;International Business Machines;International Business Machines", "aff_domain": "ibm.com;iitd.ac.in;us.ibm.com;ibm.com;ibm.com;in.ibm.com;;ibm.com;ibm.com", "position": "Research Scientist;PhD student;Researcher;Researcher;Researcher;Researcher;;Researcher;Researcher", "bibtex": "@inproceedings{\npandey2024brain,\ntitle={{BRAI}n: Bayesian Reward-conditioned Amortized Inference for natural language generation from feedback},\nauthor={Gaurav Pandey and Yatin Nandwani and Tahira Naseem and Mayank Mishra and Guangxuan Xu and Dinesh Raghu and Sachindra Joshi and Asim Munawar and Ram{\\'o}n Fernandez Astudillo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nxzXTLByXO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 549845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10403208532069103779&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "ibm.com;iitd.ac.in;us.ibm.com;ibm.com;ibm.com;in.ibm.com;;ibm.com;ibm.com", "author_num": 9, "aff_unique_index": "0;1;2;0;3;3;0;0", "aff_unique_norm": "International Business Machines Corporation;Indian Institute of Technology Delhi;International Business Machines;IBM", "aff_unique_dep": ";;;IBM Research", "aff_unique_url": "https://www.ibm.com;https://www.iitd.ac.in;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "IBM;IIT Delhi;IBM;IBM", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Delhi;New Delhi", "aff_country_unique_index": "0;1;0;0;0;1;0;0", "aff_country_unique": "United States;India" }, { "title": "Using Left and Right Brains Together: Towards Vision and Language Planning",
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33100", "id": "o1gS6MNAw8", "proceeding": "https://proceedings.mlr.press/v235/cen24a.html", "pdf": "https://openreview.net/pdf?id=o1gS6MNAw8", "openreview": "https://openreview.net/forum?id=o1gS6MNAw8", "author_site": "Jun CEN, Chenfei Wu, Xiao Liu, Shengming Yin, Yixuan Pei, Jinglong Yang, Qifeng Chen, Nan Duan, Jianguo Zhang", "tldr": "", "abstract": "Large Language Models (LLMs) and Large Multi-modality Models (LMMs) have demonstrated remarkable decision masking capabilities on a variety of tasks. However, they inherently operate planning within the language space, lacking the vision and spatial imagination ability. In contrast, humans utilize both left and right hemispheres of the brain for language and visual planning during the thinking process. Therefore, we introduce a novel vision-language planning framework in this work to perform concurrent visual and language planning for tasks with inputs of any form. Our framework incorporates visual planning to capture intricate environmental details, while language planning enhances the logical coherence of the overall system. We evaluate the effectiveness of our framework across vision-language tasks, vision-only tasks, and language-only tasks. The results demonstrate the superior performance of our approach, indicating that the integration of visual and language planning yields better contextually aware task execution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun CEN;Chenfei Wu;Xiao Liu;Shengming Yin;Yixuan Pei;Jinglong Yang;Qifeng Chen;Nan Duan;Jianguo Zhang", "authorids": "~Jun_CEN1;~Chenfei_Wu2;~Xiao_Liu14;~Shengming_Yin1;~Yixuan_Pei2;~Jinglong_Yang1;~Qifeng_Chen1;~Nan_Duan1;~Jianguo_Zhang2", "gender": "M;M;M;M;;M;M;M;M", "homepage": "https://cen-jun.com;;https://xiaoliunlc.github.io/;https://shengming-yin.github.io/;http://;;http://cqf.io/;https://nanduan.github.io/;https://scholar.google.com/citations?hl=en&user=ypSmZtIAAAAJ&view_op=list_works", "dblp": "280/3156;;82/1364-29;340/8237;332/1559;;117/4819;;90/6415-1", "google_scholar": "7SKAhBwAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.sg/citations?user=cn1k7gYAAAAJ;rzaiNqIAAAAJ;;;lLMX9hcAAAAJ;Qaa6OxIAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-7578-7667;;0000-0002-8893-366X;;;0000-0002-5616-1861;;;", "linkedin": ";;xiao-liu-71357b72/;shengming-yin-098490259/;;;;;", "or_profile": "~Jun_CEN1;~Chenfei_Wu2;~Xiao_Liu14;~Shengming_Yin1;~Yixuan_Pei2;~Jinglong_Yang1;~Qifeng_Chen1;~Nan_Duan1;~Jianguo_Zhang2", "aff": "Hong Kong University of Science and Technology;Microsoft;Microsoft Research Asia;University of Science and Technology of China;Xi'an Jiaotong University;City University of Hong Kong;Hong Kong University of Science and Technology;Microsoft Research Asia;Southern University for Science and Technology", "aff_domain": "ust.hk;microsoft.com;microsoft.com;ustc.edu.cn;xjtu.edu.cn;my.cityu.edu.hk;hkust.edu;microsoft.com;sustech.edu", "position": "PhD student;Researcher;Researcher;MS student;MS student;PhD student;Assistant Professor;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\ncen2024using,\ntitle={Using Left and Right Brains Together: Towards Vision and Language Planning},\nauthor={Jun CEN and Chenfei Wu and Xiao Liu and Shengming Yin and Yixuan Pei and Jinglong Yang and Qifeng Chen and Nan Duan and Jianguo Zhang},\nbooktitle={Forty-first International Conference on Machine 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o1gS6MNAw8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5870005, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18018700165470635086&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ust.hk;microsoft.com;microsoft.com;ustc.edu.cn;xjtu.edu.cn;my.cityu.edu.hk;hkust.edu;microsoft.com;sustech.edu", "author_num": 9, "aff_unique_index": "0;1;1;2;3;4;0;1;5", "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft;University of Science and Technology of China;Xi'an Jiao Tong University;City University of Hong Kong;Southern University for Science and Technology", "aff_unique_dep": ";Microsoft Corporation;;;;", "aff_unique_url": "https://www.ust.hk;https://www.microsoft.com;http://www.ustc.edu.cn;https://www.xjtu.edu.cn;https://www.cityu.edu.hk;https://www.sustech.edu.cn", "aff_unique_abbr": "HKUST;Microsoft;USTC;XJTU;CityU;SUSTech", "aff_campus_unique_index": "0;2;0;0;2", "aff_campus_unique": "Hong Kong SAR;;Asia", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Interpreting and Improving Diffusion Models from an Optimization Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33099", "id": "o2ND9v0CeK", "proceeding": "https://proceedings.mlr.press/v235/permenter24a.html", "pdf": "https://openreview.net/pdf?id=o2ND9v0CeK", "openreview": "https://openreview.net/forum?id=o2ND9v0CeK", "author_site": "Frank Permenter, Chenyang Yuan", "tldr": "", "abstract": "Denoising is intuitively related to projection. Indeed, under the manifold hypothesis, adding random noise is approximately equivalent to orthogonal perturbation. Hence, learning to denoise is approximately learning to project. In this paper, we use this observation to interpret denoising diffusion models as approximate gradient descent applied to the Euclidean distance function. We then provide a straightforward convergence analysis of the DDIM sampler under simple assumptions on the projection error of the denoiser. Finally, we propose a new gradient-estimation sampler, generalizing DDIM using insights from our theoretical results. 
In as few as 5-10 function evaluations, our sampler achieves state-of-the-art FID scores on pretrained CIFAR-10 and CelebA models and can generate high quality samples on latent diffusion models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Frank Permenter;Chenyang Yuan", "authorids": "~Frank_Permenter1;~Chenyang_Yuan1", "gender": ";", "homepage": "https://www.mit.edu/~fperment;", "dblp": "90/9943;", "google_scholar": "BQ_S4vMAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Frank_Permenter1;~Chenyang_Yuan1", "aff": "Toyota Research Institute;", "aff_domain": "tri.global;", "position": "Principal Researcher;", "bibtex": "@inproceedings{\npermenter2024interpreting,\ntitle={Interpreting and Improving Diffusion Models from an Optimization Perspective},\nauthor={Frank Permenter and Chenyang Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o2ND9v0CeK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7395445, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7774997093383291268&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "tri.global;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Toyota Research Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.tri.global", "aff_unique_abbr": "TRI", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "ReLU Network with Width $d+\\mathcal{O}(1)$ Can Achieve Optimal Approximation Rate", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33098", "id": "o4HF3N6CZR", "proceeding": "https://proceedings.mlr.press/v235/liu24g.html", "pdf": "https://openreview.net/pdf?id=o4HF3N6CZR", "openreview": "https://openreview.net/forum?id=o4HF3N6CZR", "author_site": "Chenghao LIU, Minghua Chen", "tldr": "", "abstract": "The prevalent employment of narrow neural networks, characterized by their minimal parameter count per layer, has led to a surge in research exploring their potential as universal function approximators. A notable result in this field states that networks with just a width of $d+1$ can approximate any continuous function for input dimension $d$ arbitrarily well. However, the optimal approximation rate for these narrowest networks, i.e., the optimal relation between the count of tunable parameters and the approximation error, remained unclear. In this paper, we address this gap by proving that ReLU networks with width $d+1$ can achieve the optimal approximation rate for continuous functions over the domain $[0,1]^d$ under $L^p$ norm for $p\\in[1,\\infty)$. We further show that for the uniform norm, a width of $d+11$ is sufficient. We also extend the results to narrow feed-forward networks with various activations, confirming their capability to approximate at the optimal rate. 
This work adds to the understanding of universal approximation of narrow networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenghao Liu;Minghua Chen", "authorids": "~Chenghao_Liu5;~Minghua_Chen1", "gender": "M;M", "homepage": ";https://www.mhchen.com", "dblp": ";12/4395-1.html", "google_scholar": ";https://scholar.google.com.hk/citations?user=WzEQ9QwAAAAJ", "orcid": "0009-0001-5374-2767;0000-0003-4763-0037", "linkedin": ";", "or_profile": "~Chenghao_Liu5;~Minghua_Chen1", "aff": "City University of Hong Kong;City University of Hong Kong", "aff_domain": "cityu.edu.hk;cityu.edu.hk", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024relu,\ntitle={Re{LU} Network with Width \\$d+{\\textbackslash}mathcal\\{O\\}(1)\\$ Can Achieve Optimal Approximation Rate},\nauthor={Chenghao Liu and Minghua Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o4HF3N6CZR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 615113, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "cityu.edu.hk;cityu.edu.hk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "City University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cityu.edu.hk", "aff_unique_abbr": "CityU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "PairNet: Training with Observed Pairs to Estimate Individual Treatment Effect", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33097", "id": "o5SVr80Rgg", "proceeding": "https://proceedings.mlr.press/v235/nagalapatti24a.html", "pdf": "https://openreview.net/pdf?id=o5SVr80Rgg", "openreview": "https://openreview.net/forum?id=o5SVr80Rgg", "author_site": "Lokesh Nagalapatti, Pranava Singhal, Avishek Ghosh, Sunita Sarawagi", "tldr": "", "abstract": "Given a dataset of individuals each described by a covariate vector, a treatment, and an observed outcome on the treatment, the goal of the individual treatment effect (ITE) estimation task is to predict outcome changes resulting from a change in treatment. A fundamental challenge is that in the observational data, a covariate\u2019s outcome is observed only under one treatment, whereas we need to infer the difference in outcomes under two different treatments. Several existing approaches address this issue through training with inferred pseudo-outcomes, but their success relies on the quality of these pseudo-outcomes. We propose PairNet, a novel ITE estimation training strategy that minimizes losses over pairs of examples based on their factual observed outcomes. Theoretical analysis for binary treatments reveals that PairNet is a consistent estimator of ITE risk, and achieves smaller generalization error than baseline models. Empirical comparison with thirteen existing methods across eight benchmarks, covering both discrete and continuous treatments, shows that PairNet achieves significantly lower ITE error compared to the baselines. 
Also, it is model-agnostic and easy to implement.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lokesh Nagalapatti;Pranava Singhal;Avishek Ghosh;Sunita Sarawagi", "authorids": "~Lokesh_Nagalapatti1;200070057@iitb.ac.in;~Avishek_Ghosh2;~Sunita_Sarawagi1", "gender": ";;M;F", "homepage": "https://nlokesh.netlify.app/;;https://sites.google.com/view/avishekghosh;https://www.cse.iitb.ac.in/~sunita/", "dblp": "259/2681.html;;98/275;s/SunitaSarawagi", "google_scholar": "BkkZbo0AAAAJ;;8y0Dg5cAAAAJ;https://scholar.google.com.tw/citations?user=Hg4HmTAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Lokesh_Nagalapatti1;200070057@iitb.ac.in;~Avishek_Ghosh2;~Sunita_Sarawagi1", "aff": "Indian Institute of Technology, Bombay;;Indian Institute of Technology, Bombay;IIT Bombay", "aff_domain": "iitb.ac.in;;iitb.ac.in;iitb.ac.in", "position": "PhD student;;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nnagalapatti2024pairnet,\ntitle={PairNet: Training with Observed Pairs to Estimate Individual Treatment Effect},\nauthor={Lokesh Nagalapatti and Pranava Singhal and Avishek Ghosh and Sunita Sarawagi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o5SVr80Rgg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 558275, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cLwfysPsZz0J:scholar.google.com/&scioq=PairNet:+Training+with+Observed+Pairs+to+Estimate+Individual+Treatment+Effect&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "iitb.ac.in;;iitb.ac.in;iitb.ac.in", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Indian Institute of Technology Bombay", "aff_unique_dep": "", "aff_unique_url": "https://www.iitb.ac.in", "aff_unique_abbr": "IIT Bombay", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Bombay;Mumbai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "India" }, { "title": "How Spurious Features are Memorized: Precise Analysis for Random and NTK Features", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33096", "id": "o6N1Bqay0k", "proceeding": "https://proceedings.mlr.press/v235/bombari24a.html", "pdf": "https://openreview.net/pdf?id=o6N1Bqay0k", "openreview": "https://openreview.net/forum?id=o6N1Bqay0k", "author_site": "Simone Bombari, Marco Mondelli", "tldr": "", "abstract": "Deep learning models are known to overfit and memorize spurious features in the training dataset. While numerous empirical studies have aimed at understanding this phenomenon, a rigorous theoretical framework to quantify it is still missing. In this paper, we consider spurious features that are uncorrelated with the learning task, and we provide a precise characterization of how they are memorized via two separate terms: _(i)_ the _stability_ of the model with respect to individual training samples, and _(ii)_ the _feature alignment_ between the spurious pattern and the full sample. While the first term is well established in learning theory and it is connected to the generalization error in classical work, the second one is, to the best of our knowledge, novel. Our key technical result gives a precise characterization of the feature alignment for the two prototypical settings of random features (RF) and neural tangent kernel (NTK) regression. 
We prove that the memorization of spurious features weakens as the generalization capability increases and, through the analysis of the feature alignment, we unveil the role of the model and of its activation function. Numerical experiments show the predictive power of our theory on standard datasets (MNIST, CIFAR-10).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simone Bombari;Marco Mondelli", "authorids": "~Simone_Bombari1;~Marco_Mondelli1", "gender": "Not Specified;M", "homepage": "https://simone-bombari.github.io/;http://marcomondelli.com", "dblp": "317/4969;120/7089", "google_scholar": ";BHdSb5AAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Simone_Bombari1;~Marco_Mondelli1", "aff": "Institute of Science and Technology;Institute of Science and Technology", "aff_domain": "ist.ac.at;ist.ac.at", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbombari2024how,\ntitle={How Spurious Features are Memorized: Precise Analysis for Random and {NTK} Features},\nauthor={Simone Bombari and Marco Mondelli},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o6N1Bqay0k}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 762359, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1423781605980440538&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "ist.ac.at;ist.ac.at", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "title": "Can Looped Transformers Learn to Implement Multi-step Gradient Descent for In-context Learning?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33095", "id": "o8AaRKbP9K", "proceeding": "https://proceedings.mlr.press/v235/gatmiry24b.html", "pdf": "https://openreview.net/pdf?id=o8AaRKbP9K", "openreview": "https://openreview.net/forum?id=o8AaRKbP9K", "author_site": "Khashayar Gatmiry, Nikunj Saunshi, Sashank J. Reddi, Stefanie Jegelka, Sanjiv Kumar", "tldr": "", "abstract": "The ability of Transformers to do reasoning and few-shot learning, without any fine-tuning, is widely conjectured to stem from their capacity to implicitly simulate multi-step algorithms -- such as gradient descent -- with their weights in a single forward pass. Recently, there has been progress in understanding this complex phenomenon from an expressivity point of view, by demonstrating that Transformers can express such multi-step algorithms. However, our knowledge about the more fundamental aspect of its learnability, beyond single-layer models, is very limited. In particular, *can training Transformers enable convergence to algorithmic solutions*? In this work we resolve this for in-context linear regression with linear looped Transformers -- a multi-layer model with weight sharing that is conjectured to have an inductive bias to learn fixed-point iterative algorithms. More specifically, for this setting we show that the global minimizer of the population training loss implements multi-step preconditioned gradient descent, with a preconditioner that adapts to the data distribution. 
Furthermore, we show fast convergence of gradient flow on the regression loss, despite the non-convexity of the landscape, by proving a novel gradient dominance condition. To our knowledge, this is the first theoretical analysis of multi-layer Transformers in this setting. We further validate our theoretical findings through synthetic experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Khashayar Gatmiry;Nikunj Saunshi;Sashank J. Reddi;Stefanie Jegelka;Sanjiv Kumar", "authorids": "~Khashayar_Gatmiry1;~Nikunj_Saunshi1;~Sashank_J._Reddi1;~Stefanie_Jegelka3;~Sanjiv_Kumar1", "gender": "M;;M;F;", "homepage": "http://ce.sharif.edu/~kgatmiry/;https://www.nikunjsaunshi.com/;;http://people.csail.mit.edu/stefje/;http://www.sanjivk.com/", "dblp": ";199/2236;50/10452;38/7003;", "google_scholar": ";F24vXggAAAAJ;70lgwYwAAAAJ;gTWUZlsAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Khashayar_Gatmiry1;~Nikunj_Saunshi1;~Sashank_J._Reddi1;~Stefanie_Jegelka3;~Sanjiv_Kumar1", "aff": "Massachusetts Institute of Technology;Google;Google;Massachusetts Institute of Technology;Google", "aff_domain": "mit.edu;google.com;google.com;mit.edu;google.com", "position": "PhD student;Researcher;Research Scientist;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\ngatmiry2024can,\ntitle={Can Looped Transformers Learn to Implement Multi-step Gradient Descent for In-context Learning?},\nauthor={Khashayar Gatmiry and Nikunj Saunshi and Sashank J. Reddi and Stefanie Jegelka and Sanjiv Kumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o8AaRKbP9K}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 733557, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1263925447848753902&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": "mit.edu;google.com;google.com;mit.edu;google.com", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://web.mit.edu;https://www.google.com", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Latent Space Hierarchical EBM Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33094", "id": "o9uOuIwhZK", "proceeding": "https://proceedings.mlr.press/v235/cui24b.html", "pdf": "https://openreview.net/pdf?id=o9uOuIwhZK", "openreview": "https://openreview.net/forum?id=o9uOuIwhZK", "author_site": "Jiali Cui, Tian Han", "tldr": "", "abstract": "This work studies the learning problem of the energy-based prior model and the multi-layer generator model. The multi-layer generator model, which contains multiple layers of latent variables organized in a top-down hierarchical structure, typically assumes the Gaussian prior model. Such a prior model can be limited in modelling expressivity, which results in a gap between the generator posterior and the prior model, known as the prior hole problem. Recent works have explored learning the energy-based (EBM) prior model as a second-stage, complementary model to bridge the gap. 
However, the EBM defined on a multi-layer latent space can be highly multi-modal, which makes sampling from such a marginal EBM prior challenging in practice, resulting in an ineffectively learned EBM. To tackle the challenge, we propose to leverage the diffusion probabilistic scheme to mitigate the burden of EBM sampling and thus facilitate EBM learning. Our extensive experiments demonstrate the superior performance of our diffusion-learned EBM prior on various challenging tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiali Cui;Tian Han", "authorids": "~Jiali_Cui1;~Tian_Han1", "gender": "M;M", "homepage": "https://jcui1224.github.io/;https://hthth0801.github.io/", "dblp": "17/2469;65/4065-1", "google_scholar": "dDBTlNAAAAAJ;Qtvu5t4AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jiali_Cui1;~Tian_Han1", "aff": "Stevens Institute of Technology;Stevens Institute of Technology", "aff_domain": "stevens.edu;stevens.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ncui2024learning,\ntitle={Learning Latent Space Hierarchical {EBM} Diffusion Models},\nauthor={Jiali Cui and Tian Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=o9uOuIwhZK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9745194, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2340474177373414039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stevens.edu;stevens.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stevens Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.stevens.edu", "aff_unique_abbr": "SIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "VideoPrism: A Foundational Visual Encoder for Video Understanding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33093", "id": "oBP8vXFJNQ", "proceeding": "https://proceedings.mlr.press/v235/zhao24f.html", "pdf": "https://openreview.net/pdf?id=oBP8vXFJNQ", "openreview": "https://openreview.net/forum?id=oBP8vXFJNQ", "author_site": "Long Zhao, Nitesh Bharadwaj Gundavarapu, Liangzhe Yuan, Hao Zhou, Shen Yan, Jennifer J. Sun, Luke Friedman, Rui Qian, Tobias Weyand, Yue Zhao, Rachel Hornung, Florian Schroff, Ming-Hsuan Yang, David Ross, Huisheng Wang, Hartwig Adam, Mikhail Sirotenko, Ting Liu, Boqing Gong", "tldr": "", "abstract": "We introduce VideoPrism, a general-purpose video encoder that tackles diverse video understanding tasks with a single frozen model. We pretrain VideoPrism on a heterogeneous corpus containing 36M high-quality video-caption pairs and 582M video clips with noisy parallel text (e.g., ASR transcripts). The pretraining approach improves upon masked autoencoding by global-local distillation of semantic video embeddings and a token shuffling scheme, enabling VideoPrism to focus primarily on the video modality while leveraging the invaluable text associated with videos. 
We extensively test VideoPrism on four broad groups of video understanding tasks, from web video question answering to CV for science, achieving state-of-the-art performance on 31 out of 33 video understanding benchmarks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Long Zhao;Nitesh Bharadwaj Gundavarapu;Liangzhe Yuan;Hao Zhou;Shen Yan;Jennifer J. Sun;Luke Friedman;Rui Qian;Tobias Weyand;Yue Zhao;Rachel Hornung;Florian Schroff;Ming-Hsuan Yang;David A Ross;Huisheng Wang;Hartwig Adam;Mikhail Sirotenko;Ting Liu;Boqing Gong", "authorids": "~Long_Zhao2;~Nitesh_Bharadwaj_Gundavarapu1;~Liangzhe_Yuan2;~Hao_Zhou3;~Shen_Yan2;~Jennifer_J._Sun1;~Luke_Friedman1;~Rui_Qian1;~Tobias_Weyand3;~Yue_Zhao4;~Rachel_Hornung1;~Florian_Schroff1;~Ming-Hsuan_Yang1;~David_A_Ross1;~Huisheng_Wang1;~Hartwig_Adam1;~Mikhail_Sirotenko1;~Ting_Liu4;~Boqing_Gong1", "gender": "M;Not Specified;M;;M;;M;;;M;;M;M;;;He/him;M;;M", "homepage": "http://garyzhao.github.io/;;https://yuanliangzhe.github.io;;https://shenyann.github.io/;;https://www.linkedin.com/in/lufriedman;https://rui1996.github.io/;http://tobw.net;https://zhaoyue-zephyrus.github.io/;;;https://faculty.ucmerced.edu/mhyang/;;;https://research.google/people/author37870/;https://www.linkedin.com/in/mihail-sirotenko-33187913/;http://tliu.org;http://boqinggong.info", "dblp": "31/5383-3;247/1182.html;215/4356;;https://dblp.uni-trier.de/pers/hd/y/Yan:Shen;;;132/8026;71/6931;48/76-6;;52/5594;79/3711.html;;;75/948;263/7266;52/5150-5;29/7457", "google_scholar": "YTyBTmgAAAAJ;v19p_0oAAAAJ;1H9CkZgAAAAJ;;-shYRd8AAAAJ;;;https://scholar.google.com.sg/citations?user=HrzHNbAAAAAJ;US56Kw8AAAAJ;https://scholar.google.com.hk/citations?user=6_U35tAAAAAJ;;eWbZJlMAAAAJ;p9-ohHsAAAAJ;;4evU9_YAAAAJ;fWd88tEAAAAJ;IpGXRaAAAAAJ;4wSfAIQAAAAJ;lv9ZeVUAAAAJ", "orcid": "0000-0001-8921-8564;;;;;;;;;0000-0003-2753-5921;;;0000-0003-4848-2304;;;0000-0003-1258-4341;;;", "linkedin": "garyzhao9012/;;;;shawnyanyuv/;;;;;;;florianschroff;minghsuanyang/;;;hartwig-adam-1873392/;;;boqing-gong-46aa5821/", "or_profile": "~Long_Zhao2;~Nitesh_Bharadwaj_Gundavarapu1;~Liangzhe_Yuan2;~Hao_Zhou3;~Shen_Yan2;~Jennifer_J._Sun1;~Luke_Friedman1;~Rui_Qian1;~Tobias_Weyand3;~Yue_Zhao4;~Rachel_Hornung1;~Florian_Schroff1;~Ming-Hsuan_Yang1;~David_A_Ross1;~Huisheng_Wang1;~Hartwig_Adam1;~Mikhail_Sirotenko1;~Ting_Liu4;~Boqing_Gong1", "aff": "Google DeepMind;Google;Google DeepMind;;Google Research;;Google;Google;Google;University of Texas, Austin;;Google;University of California at Merced;;Google;Google Research;Google DeepMind;Google DeepMind;Google", "aff_domain": "google.com;google.com;google.com;;google.com;;google.com;google.com;google.com;utexas.edu;;google.com;umcerced.edu;;google.com;google.com;google.com;google.com;google.com", "position": "Research scientist;Researcher;Researcher;;Research Scientist;;Researcher;Researcher;Software Engineer;PhD student;;SWE;Professor;;Researcher;Principal Researcher;TLM;Researcher;Research Scientist", "bibtex": "@inproceedings{\nzhao2024videoprism,\ntitle={VideoPrism: A Foundational Visual Encoder for Video Understanding},\nauthor={Long Zhao and Nitesh Bharadwaj Gundavarapu and Liangzhe Yuan and Hao Zhou and Shen Yan and Jennifer J. 
Sun and Luke Friedman and Rui Qian and Tobias Weyand and Yue Zhao and Rachel Hornung and Florian Schroff and Ming-Hsuan Yang and David A Ross and Huisheng Wang and Hartwig Adam and Mikhail Sirotenko and Ting Liu and Boqing Gong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oBP8vXFJNQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5014099, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 19, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3013415257576669547&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": "google.com;google.com;google.com;;google.com;;google.com;google.com;google.com;utexas.edu;;google.com;umcerced.edu;;google.com;google.com;google.com;google.com;google.com", "author_num": 19, "aff_unique_index": "0;0;0;0;0;0;0;1;0;2;0;0;0;0;0", "aff_unique_norm": "Google;University of Texas at Austin;University of California, Merced", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.utexas.edu;https://www.ucmerced.edu", "aff_unique_abbr": "DeepMind;UT Austin;UC Merced", "aff_campus_unique_index": "1;1;1;1;1;2;1;3;1;1;1", "aff_campus_unique": ";Mountain View;Austin;Merced", "aff_country_unique_index": "0;1;0;1;1;1;1;1;1;1;1;1;0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "SPADE: Sparsity-Guided Debugging for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33092", "id": "oBYv73nOoA", "proceeding": "https://proceedings.mlr.press/v235/soltani-moakhar24a.html", "pdf": "https://openreview.net/pdf?id=oBYv73nOoA", "openreview": "https://openreview.net/forum?id=oBYv73nOoA", "author_site": "Arshia Soltani Moakhar, Eugenia Iofinova, Elias Frantar, Dan Alistarh", "tldr": "", "abstract": "It is known that sparsity can improve interpretability for deep neural networks. However, existing methods in the area either require networks that are pre-trained with sparsity constraints, or impose sparsity after the fact, altering the network's general behavior. In this paper, we demonstrate, for the first time, that sparsity can instead be incorporated into the interpretation process itself, as a sample-specific preprocessing step. Unlike previous work, this approach, which we call SPADE, does not place constraints on the trained model and does not affect its behavior during inference on the sample. Given a trained model and a target sample, SPADE uses sample-targeted pruning to provide a \"trace\" of the network's execution on the sample, reducing the network to the most important connections prior to computing an interpretation. We demonstrate that preprocessing with SPADE significantly increases the accuracy of image saliency maps across several interpretability methods. Additionally, SPADE improves the usefulness of neuron visualizations, aiding humans in reasoning about network behavior. 
Our code is available at https://github.com/IST-DASLab/SPADE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arshia Soltani Moakhar;Eugenia Iofinova;Elias Frantar;Dan Alistarh", "authorids": "~Arshia_Soltani_Moakhar1;~Eugenia_Iofinova1;~Elias_Frantar1;~Dan_Alistarh7", "gender": "F;M;M;M", "homepage": ";;https://ckodser.github.io;http://people.csail.mit.edu/alistarh/", "dblp": "295/9539;259/2210;330/2487;36/3251.html", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;hjdlwz8AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=75q-6ZQAAAAJ", "orcid": "0000-0002-7778-3221;;;", "linkedin": ";elias-frantar-5b43181a4;arshia-soltani-891016143/;", "or_profile": "~Eugenia_Iofinova1;~Elias_Frantar1;~Arshia_Soltani_Moakar1;~Dan_Alistarh1", "aff": "Microsoft Research;Institute of Science and Technology Austria;;Institute of Science and Technology", "aff_domain": "research.microsoft.com;ist.ac.at;;ist.ac.at", "position": "Intern;PhD student;;Full Professor", "bibtex": "@inproceedings{\nmoakhar2024spade,\ntitle={{SPADE}: Sparsity-Guided Debugging for Deep Neural Networks},\nauthor={Arshia Soltani Moakhar and Eugenia Iofinova and Elias Frantar and Dan Alistarh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oBYv73nOoA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3801625, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3750709433693806887&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "research.microsoft.com;ist.ac.at;;ist.ac.at", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;Institute of Science and Technology Austria;Institute of Science and Technology", "aff_unique_dep": "Microsoft Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ist.ac.at;", "aff_unique_abbr": "MSR;IST Austria;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Austria;" }, { "title": "KISA: A Unified Keyframe Identifier and Skill Annotator for Long-Horizon Robotics Demonstrations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33091", "id": "oCI9gHocws", "proceeding": "https://proceedings.mlr.press/v235/kou24b.html", "pdf": "https://openreview.net/pdf?id=oCI9gHocws", "openreview": "https://openreview.net/forum?id=oCI9gHocws", "author_site": "Longxin Kou, Fei Ni, Yan Zheng, Jinyi Liu, Yifu Yuan, Zibin Dong, Jianye Hao", "tldr": "", "abstract": "Robotic manipulation tasks often span over long horizons and encapsulate multiple subtasks with different skills. Learning policies directly from long-horizon demonstrations is challenging without intermediate keyframe guidance and corresponding skill annotations. Existing approaches for keyframe identification often struggle to offer reliable decomposition due to low accuracy and fail to provide semantic relevance between keyframes and skills. To this end, we propose a unified **K**eyframe **I**dentifier and **S**kill **A**nnotator (**KISA**) that utilizes pretrained visual-language representations for precise and interpretable decomposition of unlabeled demonstrations. 
Specifically, we develop a simple yet effective temporal enhancement module that enriches frame-level representations with expanded receptive fields to capture semantic dynamics at the video level. We further propose coarse contrastive learning and fine-grained monotonic encouragement to enhance the alignment between visual representations from keyframes and language representations from skills. The experimental results across three benchmarks demonstrate that KISA outperforms competitive baselines in terms of accuracy and interpretability of keyframe identification. Moreover, KISA exhibits robust generalization capabilities and the flexibility to incorporate various pretrained representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Longxin Kou;Fei Ni;YAN ZHENG;Jinyi Liu;Yifu Yuan;Zibin Dong;Jianye HAO", "authorids": "~Longxin_Kou1;~Fei_Ni1;~YAN_ZHENG1;~Jinyi_Liu1;~Yifu_Yuan1;~Zibin_Dong1;~Jianye_HAO1", "gender": "M;M;;M;M;M;M", "homepage": "https://fei-ni.github.io/;https://yanzzzzz.github.io;;https://yifu-yuan.github.io/;https://github.com/GrandpaDZB;http://www.icdai.org/jianye.html;https://longxinkou.github.io/index.html", "dblp": "11/579-1;10/2381-2;192/6688-2;261/3688;358/5885;21/7664.html;", "google_scholar": "https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=tJuhd1kAAAAJ;kaQS7NAAAAAJ;83JhosMAAAAJ;JQ6881QAAAAJ;;", "orcid": "0009-0007-5623-2782;;;0009-0009-2194-942X;0000-0002-2986-6022;0000-0002-0422-8235;", "linkedin": ";;\u91d1\u6bc5-\u5218-5b7447118;;;;", "or_profile": "~Fei_Ni1;~YAN_ZHENG1;~Jinyi_Liu1;~Yifu_Yuan1;~Zibin_Dong1;~Jianye_HAO1;~LongxinKou1", "aff": "Tianjin University;Tianjin University, China;Tianjin University;Tianjin University;Tianjin University;Tianjin University;Tianjin University", "aff_domain": "tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn", "position": "PhD student;Associate Professor;PhD student;PhD student;MS student;Associate Professor;MS student", "bibtex": "@inproceedings{\nkou2024kisa,\ntitle={{KISA}: A Unified Keyframe Identifier and Skill Annotator for Long-Horizon Robotics Demonstrations},\nauthor={Longxin Kou and Fei Ni and YAN ZHENG and Jinyi Liu and Yifu Yuan and Zibin Dong and Jianye HAO},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oCI9gHocws}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9224401, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4346799782295152070&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 5, "email": "tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn;tju.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Image Restoration Through Generalized Ornstein-Uhlenbeck Bridge", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33090", "id": "oDUJmNCV8D", "proceeding": "https://proceedings.mlr.press/v235/yue24d.html", "pdf": "https://openreview.net/pdf?id=oDUJmNCV8D", "openreview": "https://openreview.net/forum?id=oDUJmNCV8D", "author_site": "Yue Conghan, Zhengwei Peng, Junlong Ma, Shiyan 
Du, Pengxu Wei, Dongyu Zhang", "tldr": "", "abstract": "Diffusion models exhibit powerful generative capabilities enabling noise mapping to data via reverse stochastic differential equations. However, in image restoration, the focus is on the mapping relationship from low-quality to high-quality images. Regarding this issue, we introduce the Generalized Ornstein-Uhlenbeck Bridge (GOUB) model. By leveraging the natural mean-reverting property of the generalized OU process and further eliminating the variance of its steady-state distribution through Doob's *h*\u2013transform, we achieve diffusion mappings from point to point, enabling the recovery of high-quality images from low-quality ones. Moreover, we unravel the fundamental mathematical essence shared by various bridge models, all of which are special instances of GOUB, and empirically demonstrate the optimality of our proposed models. Additionally, we present the corresponding Mean-ODE model adept at capturing both pixel-level details and structural perceptions. Experimental outcomes showcase the state-of-the-art performance achieved by both models across diverse tasks, including inpainting, deraining, and super-resolution. Code is available at https://github.com/Hammour-steak/GOUB.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Conghan Yue;Zhengwei Peng;Junlong Ma;Shiyan Du;Pengxu Wei;Dongyu Zhang", "authorids": "~Conghan_Yue2;pengzhw23@mail2.sysu.edu.cn;majlong3@mail2.sysu.edu.cn;dushy5@mail2.sysu.edu.cn;~Pengxu_Wei1;~Dongyu_Zhang1", "gender": "M;;;;;M", "homepage": "https://www.zhihu.com/people/bai-e-ji-wan-qi;;;;;http://sdcs.sysu.edu.cn/content/2500", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Conghan_Yue2;pengzhw23@mail2.sysu.edu.cn;majlong3@mail2.sysu.edu.cn;dushy5@mail2.sysu.edu.cn;~Pengxu_Wei1;~Dongyu_Zhang1", "aff": "SUN YAT-SEN UNIVERSITY;;;;;SUN YAT-SEN UNIVERSITY", "aff_domain": "sysu.edu.cn;;;;;sysu.edu.cn", "position": "MS student;;;;;Associate Professor", "bibtex": "@inproceedings{\nyue2024image,\ntitle={Image Restoration Through Generalized Ornstein-Uhlenbeck Bridge},\nauthor={Conghan Yue and Zhengwei Peng and Junlong Ma and Shiyan Du and Pengxu Wei and Dongyu Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oDUJmNCV8D}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3585072, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7587158893640916703&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "sysu.edu.cn;;;;;sysu.edu.cn", "author_num": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Sun Yat-sen University", "aff_unique_dep": "", "aff_unique_url": "http://www.sysu.edu.cn", "aff_unique_abbr": "SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Position: Amazing Things Come From Having Many Good Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33089", "id": "oFDFGd9Age", "proceeding": "https://proceedings.mlr.press/v235/rudin24a.html", "pdf": "https://openreview.net/pdf?id=oFDFGd9Age", "openreview": "https://openreview.net/forum?id=oFDFGd9Age", "author_site": "Cynthia Rudin, Chudi Zhong, Lesia Semenova, Margo Seltzer, Ron Parr, Jiachang Liu, Srikar Katta, Jon Donnelly, Harry Chen, 
Zachery Boner", "tldr": "", "abstract": "The *Rashomon Effect*, coined by Leo Breiman, describes the phenomenon that there exist many equally good predictive models for the same dataset. This phenomenon happens for many real datasets and when it does, it sparks both magic and consternation, but mostly magic. In light of the Rashomon Effect, this perspective piece proposes reshaping the way we think about machine learning, particularly for tabular data problems in the nondeterministic (noisy) setting. We address how the Rashomon Effect impacts (1) the existence of simple-yet-accurate models, (2) flexibility to address user preferences, such as fairness and monotonicity, without losing performance, (3) uncertainty in predictions, fairness, and explanations, (4) reliable variable importance, (5) algorithm choice, specifically, providing advanced knowledge of which algorithms might be suitable for a given problem, and (6) public policy. We also discuss a theory of when the Rashomon Effect occurs and why. Our goal is to illustrate how the Rashomon Effect can have a massive impact on the use of machine learning for complex problems in society.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cynthia Rudin;Chudi Zhong;Lesia Semenova;Margo Seltzer;Ronald Parr;Jiachang Liu;Srikar Katta;Jon Donnelly;Harry Chen;Zachery Boner", "authorids": "~Cynthia_Rudin1;~Chudi_Zhong1;~Lesia_Semenova1;~Margo_Seltzer1;~Ronald_Parr1;~Jiachang_Liu1;~Srikar_Katta1;~Jon_Donnelly1;~Harry_Chen2;~Zachery_Boner1", "gender": ";F;;;Not Specified;M;M;M;M;M", "homepage": ";https://chudizhong.github.io/;;;https://users.cs.duke.edu/~parr/;https://jiachangliu.github.io/;https://kattasa.github.io/;;;https://zackboner.faculty.bio/", "dblp": ";267/5474;;;26/4670;194/1565-1;357/5662;307/5438;;382/7881", "google_scholar": ";DXKNTLIAAAAJ;;;https://scholar.google.com.tw/citations?user=b-GJ3QIAAAAJ;W_Zyr0AAAAAJ;M5xmXXQAAAAJ;https://scholar.google.com/citations?hl=en;;GtdCQo8AAAAJ", "orcid": ";;;;;;;0000-0002-3971-1075;;0000-0003-0089-8917", "linkedin": ";;;;;;;;harry-chen/;zachery-boner", "or_profile": "~Cynthia_Rudin1;~Chudi_Zhong1;~Lesia_Semenova1;~Margo_Seltzer1;~Ronald_Parr1;~Jiachang_Liu1;~Srikar_Katta1;~Jon_Donnelly1;~Harry_Chen2;~Zachery_Boner1", "aff": ";Duke University;;;Duke University;Duke University;Duke University;Duke University;Duke University;Department of Computer Science, Duke University", "aff_domain": ";duke.edu;;;duke.edu;duke.edu;duke.edu;duke.edu;duke.edu;cs.duke.edu", "position": ";PhD student;;;Full Professor;PhD student;PhD student;PhD student;Undergrad student;PhD student", "bibtex": "@inproceedings{\nrudin2024position,\ntitle={Position: Amazing Things Come From Having Many Good Models},\nauthor={Cynthia Rudin and Chudi Zhong and Lesia Semenova and Margo Seltzer and Ronald Parr and Jiachang Liu and Srikar Katta and Jon Donnelly and Harry Chen and Zachery Boner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oFDFGd9Age}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1885911, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5315089339899152381&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";duke.edu;;;duke.edu;duke.edu;duke.edu;duke.edu;duke.edu;cs.duke.edu", "author_num": 10, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "", 
"aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Preference Optimization for Molecule Synthesis with Conditional Residual Energy-based Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33088", "id": "oLfq1KKneW", "proceeding": "https://proceedings.mlr.press/v235/liu24n.html", "pdf": "https://openreview.net/pdf?id=oLfq1KKneW", "openreview": "https://openreview.net/forum?id=oLfq1KKneW", "author_site": "Songtao Liu, Hanjun Dai, Yue Zhao, Peng Liu", "tldr": "", "abstract": "Molecule synthesis through machine learning is one of the fundamental problems in drug discovery. Current data-driven strategies employ one-step retrosynthesis models and search algorithms to predict synthetic routes in a top-bottom manner. Despite their effective performance, these strategies face limitations in the molecule synthetic route generation due to a greedy selection of the next molecule set without any lookahead. Furthermore, existing strategies cannot control the generation of synthetic routes based on possible criteria such as material costs, yields, and step count. In this work, we propose a general and principled framework via conditional residual energy-based models (EBMs), that focus on the quality of the entire synthetic route based on the specific criteria. By incorporating an additional energy-based function into our probabilistic model, our proposed algorithm can enhance the quality of the most probable synthetic routes (with higher probabilities) generated by various strategies in a plug-and-play fashion. Extensive experiments demonstrate that our framework can consistently boost performance across various strategies and outperforms previous state-of-the-art top-1 accuracy by a margin of 2.5%. 
Code is available at https://github.com/SongtaoLiu0823/CREBM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songtao Liu;Hanjun Dai;Yue Zhao;Peng Liu", "authorids": "~Songtao_Liu2;~Hanjun_Dai1;~Yue_Zhao13;~Peng_Liu3", "gender": "M;M;M;M", "homepage": "https://songtaoliu0823.github.io/;https://hanjun-dai.github.io;https://viterbi-web.usc.edu/~yzhao010/;https://s2.ist.psu.edu/pliu", "dblp": ";144/7311;48/76-16;21/6121-5", "google_scholar": "https://scholar.google.com.tw/citations?hl=zh-CN;obpl7GQAAAAJ;https://scholar.google.ca/citations?user=zoGDYsoAAAAJ;", "orcid": ";;0000-0003-3401-4921;", "linkedin": ";hanjun-dai;yzhao062/;", "or_profile": "~Songtao_Liu2;~Hanjun_Dai1;~Yue_Zhao13;~Peng_Liu3", "aff": "Peking University;Google Research;University of Southern California;Pennsylvania State University", "aff_domain": "pku.edu.cn;google.com;usc.edu;psu.edu", "position": "Intern;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nliu2024preference,\ntitle={Preference Optimization for Molecule Synthesis with Conditional Residual Energy-based Models},\nauthor={Songtao Liu and Hanjun Dai and Yue Zhao and Peng Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oLfq1KKneW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 517800, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6660547649813674854&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;google.com;usc.edu;psu.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Peking University;Google;University of Southern California;Pennsylvania State University", "aff_unique_dep": ";Google Research;;", "aff_unique_url": "http://www.pku.edu.cn;https://research.google;https://www.usc.edu;https://www.psu.edu", "aff_unique_abbr": "Peking U;Google Research;USC;PSU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Los Angeles", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "How to Leverage Diverse Demonstrations in Offline Imitation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33087", "id": "oOlooUu2Sb", "proceeding": "https://proceedings.mlr.press/v235/yue24c.html", "pdf": "https://openreview.net/pdf?id=oOlooUu2Sb", "openreview": "https://openreview.net/forum?id=oOlooUu2Sb", "author_site": "Sheng Yue, Jiani Liu, Xingyuan Hua, Ju Ren, Sen Lin, Junshan Zhang, Yaoxue Zhang", "tldr": "", "abstract": "Offline Imitation Learning (IL) with imperfect demonstrations has garnered increasing attention owing to the scarcity of expert data in many real-world domains. A fundamental problem in this scenario is *how to extract positive behaviors from noisy data*. In general, current approaches to the problem select data based on state-action similarity to given expert demonstrations, neglecting precious information in (potentially abundant) *diverse* state-actions that deviate from expert ones. In this paper, we introduce a simple yet effective data selection method that identifies positive behaviors based on their *resultant states* - a more informative criterion enabling explicit utilization of dynamics information and effective extraction of both expert and beneficial diverse behaviors. 
Further, we devise a lightweight behavior cloning algorithm capable of leveraging the expert and selected data correctly. In the experiments, we evaluate our method on a suite of complex and high-dimensional offline IL benchmarks, including continuous-control and vision-based tasks. The results demonstrate that our method achieves state-of-the-art performance, outperforming existing methods on **20/21** benchmarks, typically by **2-5x**, while maintaining a comparable runtime to Behavior Cloning (BC).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sheng Yue;Jiani Liu;Xingyuan Hua;Ju Ren;Sen Lin;Junshan Zhang;Yaoxue Zhang", "authorids": "~Sheng_Yue1;~Jiani_Liu2;xingyuanhua@bit.edu.cn;~Ju_Ren1;~Sen_Lin1;~Junshan_Zhang1;~Yaoxue_Zhang3", "gender": "M;F;;;;M;M", "homepage": "https://shaunyue.github.io;;;;https://slin70.github.io/;https://faculty.engineering.ucdavis.edu/jzhang/;", "dblp": "236/3241;;;;70/9499-1.html;59/1232.html;99/4094", "google_scholar": "n0Gjw_oAAAAJ;https://scholar.google.com.hk/citations?user=cF-pqUIAAAAJ;;;94-TbUsAAAAJ;UtAdFs8AAAAJ;q_76wvMAAAAJ", "orcid": "0009-0001-3416-8181;;;;;;", "linkedin": ";;;;;;", "or_profile": "~Sheng_Yue1;~Jiani_Liu2;xingyuanhua@bit.edu.cn;~Ju_Ren1;~Sen_Lin1;~Junshan_Zhang1;~Yaoxue_Zhang3", "aff": "Tsinghua University;Tsinghua University;;;University of Houston;University of California, Davis;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;;;uh.edu;ucdavis.edu;tsinghua.edu.cn", "position": "Postdoc;PhD student;;;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyue2024how,\ntitle={How to Leverage Diverse Demonstrations in Offline Imitation Learning},\nauthor={Sheng Yue and Jiani Liu and Xingyuan Hua and Ju Ren and Sen Lin and Junshan Zhang and Yaoxue Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oOlooUu2Sb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6316017, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=980325963472575795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;tsinghua.edu.cn;;;uh.edu;ucdavis.edu;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Tsinghua University;University of Houston;University of California, Davis", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.uh.edu;https://www.ucdavis.edu", "aff_unique_abbr": "THU;UH;UC Davis", "aff_campus_unique_index": "1", "aff_campus_unique": ";Davis", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "DRCT: Diffusion Reconstruction Contrastive Training towards Universal Detection of Diffusion Generated Images", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33086", "id": "oRLwyayrh1", "proceeding": "https://proceedings.mlr.press/v235/chen24ay.html", "pdf": "https://openreview.net/pdf?id=oRLwyayrh1", "openreview": "https://openreview.net/forum?id=oRLwyayrh1", "author_site": "Baoying Chen, Jishen Zeng, Jianquan Yang, Rui Yang", "tldr": "", "abstract": "Diffusion models have made significant strides in visual content generation but also raised increasing demands on generated image detection. 
Existing detection methods have achieved considerable progress, but they usually suffer a significant decline in accuracy when detecting images generated by an unseen diffusion model. In this paper, we seek to address the generalizability of generated image detectors from the perspective of hard sample classification. The basic idea is that if a classifier can distinguish generated images that closely resemble real ones, then it can also effectively detect less similar samples, potentially even those produced by a different diffusion model. Based on this idea, we propose Diffusion Reconstruction Contrastive Learning (DRCT), a universal framework to enhance the generalizability of the existing detectors. DRCT generates hard samples by high-quality diffusion reconstruction and adopts contrastive training to guide the learning of diffusion artifacts. In addition, we have built a million-scale dataset, DRCT-2M, including 16 types of diffusion models, for evaluating the generalizability of detection methods. Extensive experimental results show that detectors enhanced with DRCT achieve over a 10% accuracy improvement in cross-set tests. The code, models, and dataset will soon be available at https://github.com/beibuwandeluori/DRCT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Baoying Chen;Jishen Zeng;Jianquan Yang;Rui Yang", "authorids": "~Baoying_Chen1;~Jishen_Zeng1;~Jianquan_Yang1;~Rui_Yang18", "gender": "M;M;M;M", "homepage": ";https://scholar.google.com/citations?user=Dp60mEkAAAAJ&hl=zh-CN&oi=ao;http://ruiyang19.github.io;https://github.com/beibuwandeluori", "dblp": "190/7583;93/8522;;", "google_scholar": "https://scholar.google.com/citations?hl=en;Dp60mEkAAAAJ;;", "orcid": "0000-0002-4894-9966;0000-0003-2613-8975;;", "linkedin": ";;;", "or_profile": "~Jishen_Zeng1;~Jianquan_Yang1;~Rui_Yang18;~BaoyingChen1", "aff": "Alibaba Group;SUN YAT-SEN UNIVERSITY;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;sysu.edu.cn;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Assistant Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nchen2024drct,\ntitle={{DRCT}: Diffusion Reconstruction Contrastive Training towards Universal Detection of Diffusion Generated Images},\nauthor={Baoying Chen and Jishen Zeng and Jianquan Yang and Rui Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oRLwyayrh1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2989977, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16896322279816232001&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "alibaba-inc.com;sysu.edu.cn;alibaba-inc.com;alibaba-inc.com", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Alibaba Group;Sun Yat-sen University", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;http://www.sysu.edu.cn", "aff_unique_abbr": "Alibaba;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "On the sample complexity of conditional independence testing with Von Mises estimator with application to causal discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33085", "id": "oSOZ31ISBV", "proceeding": "https://proceedings.mlr.press/v235/jamshidi24a.html", "pdf": 
"https://openreview.net/pdf?id=oSOZ31ISBV", "openreview": "https://openreview.net/forum?id=oSOZ31ISBV", "author_site": "Fateme Jamshidi, Luca Ganassali, Negar Kiyavash", "tldr": "", "abstract": "Motivated by conditional independence testing, an essential step in constraint-based causal discovery algorithms, we study the nonparametric Von Mises estimator for the entropy of multivariate distributions built on a kernel density estimator. We establish an exponential concentration inequality for this estimator. We design a test for conditional independence (CI) based on our estimator, called VM-CI, which achieves optimal parametric rates under smoothness assumptions. Leveraging the exponential concentration, we prove a tight upper bound for the overall error of VM-CI. This, in turn, allows us to characterize the sample complexity of any constraint-based causal discovery algorithm that uses VM-CI for CI tests. To the best of our knowledge, this is the first sample complexity guarantee for causal discovery for non-linear models and non-Gaussian continuous variables. Furthermore, we empirically show that VM-CI outperforms other popular CI tests in terms of either time, sample complexity, or both. This enhancement significantly improves the performance in structure learning as well.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fateme Jamshidi;Luca Ganassali;Negar Kiyavash", "authorids": "~Fateme_Jamshidi1;luca.ganassali@universite-paris-saclay.fr;~Negar_Kiyavash1", "gender": "F;;F", "homepage": ";;https://people.epfl.ch/negar.kiyavash?lang=en", "dblp": "304/8469;;85/4976", "google_scholar": ";;7tBDvOwAAAAJ", "orcid": ";;0000-0002-8545-7709", "linkedin": "fateme-jamshidi/;;", "or_profile": "~Fateme_Jamshidi1;luca.ganassali@universite-paris-saclay.fr;~Negar_Kiyavash1", "aff": "Swiss Federal Institute of Technology Lausanne;;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;;epfl.ch", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\njamshidi2024on,\ntitle={On the sample complexity of conditional independence testing with Von Mises estimator with application to causal discovery},\nauthor={Fateme Jamshidi and Luca Ganassali and Negar Kiyavash},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oSOZ31ISBV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 579216, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2024065389893407205&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "epfl.ch;;epfl.ch", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Learning to Explore in POMDPs with Informational Rewards", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33084", "id": "oTD3WoQyFR", "proceeding": "https://proceedings.mlr.press/v235/xie24a.html", "pdf": "https://openreview.net/pdf?id=oTD3WoQyFR", "openreview": "https://openreview.net/forum?id=oTD3WoQyFR", "author_site": "Annie Xie, Logan M. 
Bhamidipaty, Evan Liu, Joey Hong, Sergey Levine, Chelsea Finn", "tldr": "", "abstract": "Standard exploration methods typically rely on random coverage of the state space or coverage-promoting exploration bonuses. However, in partially observed settings, the biggest exploration challenge is often posed by the need to discover information-gathering strategies---e.g., an agent that has to navigate to a location in traffic might learn to first check traffic conditions and then choose a route. In this work, we design a POMDP agent that gathers information about the hidden state, using ideas from the meta-exploration literature. Our approach provides an exploration bonus that rewards the agent for gathering information about the state that is relevant for completing the task. While this requires the agent to know what this information is during training, it can be obtained in several ways: in the most general case, off-policy algorithms can leverage knowledge about the entire trajectory to determine such information in hindsight, but the user can also provide prior knowledge (e.g., privileged information) to help inform the training process. Through experiments in several partially-observed environments, we find that our approach is competitive with prior methods when minimal exploration is needed, but substantially outperforms them when more complex strategies are required. Our algorithm also shows the ability to learn without any privileged information, by reasoning about the entire trajectory in hindsight and effectively using any information it reveals about the hidden state.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Annie Xie;Logan Mondal Bhamidipaty;Evan Zheran Liu;Joey Hong;Sergey Levine;Chelsea Finn", "authorids": "~Annie_Xie1;~Logan_Mondal_Bhamidipaty1;~Evan_Zheran_Liu1;~Joey_Hong2;~Sergey_Levine1;~Chelsea_Finn1", "gender": ";M;M;M;M;F", "homepage": "https://cs.stanford.edu/~anniexie/;https://logan-mondal-bhamidipaty.com/;https://ezliu.github.io;;https://people.eecs.berkeley.edu/~svlevine/;https://ai.stanford.edu/~cbfinn/", "dblp": "215/3608;399/2851;199/1870;188/6056.html;80/7594;131/1783", "google_scholar": ";wJ_Eo0sAAAAJ;qjDVoqQAAAAJ;SiBVfPUAAAAJ;8R35rCwAAAAJ;vfPE6hgAAAAJ", "orcid": ";0009-0001-3978-9462;;;;", "linkedin": ";logan-bhamidipaty/;;;;", "or_profile": "~Annie_Xie1;~Logan_Mondal_Bhamidipaty1;~Evan_Zheran_Liu1;~Joey_Hong2;~Sergey_Levine1;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;Generally Intelligent;University of California, Berkeley;Google;Google", "aff_domain": "stanford.edu;cs.stanford.edu;generallyintelligent.com;berkeley.edu;google.com;google.com", "position": "PhD student;MS student;Researcher;PhD student;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nxie2024learning,\ntitle={Learning to Explore in {POMDP}s with Informational Rewards},\nauthor={Annie Xie and Logan Mondal Bhamidipaty and Evan Zheran Liu and Joey Hong and Sergey Levine and Chelsea Finn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oTD3WoQyFR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1458161, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3884820664220414200&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "stanford.edu;cs.stanford.edu;generallyintelligent.com;berkeley.edu;google.com;google.com", "author_num": 
6, "aff_unique_index": "0;0;1;2;3;3", "aff_unique_norm": "Stanford University;Generally Intelligent;University of California, Berkeley;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.stanford.edu;;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "Stanford;;UC Berkeley;Google", "aff_campus_unique_index": "0;0;2;3;3", "aff_campus_unique": "Stanford;;Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Efficient Stochastic Approximation of Minimax Excess Risk Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33083", "id": "oTYuORAMaP", "proceeding": "https://proceedings.mlr.press/v235/zhang24d.html", "pdf": "https://openreview.net/pdf?id=oTYuORAMaP", "openreview": "https://openreview.net/forum?id=oTYuORAMaP", "author_site": "Lijun Zhang, Haomin Bai, Wei-Wei Tu, Ping Yang, Yao Hu", "tldr": "", "abstract": "While traditional distributionally robust optimization (DRO) aims to minimize the maximal risk over a set of distributions, Agarwal & Zhang (2022) recently proposed a variant that replaces risk with *excess risk*. Compared to DRO, the new formulation\u2014minimax excess risk optimization (MERO) has the advantage of suppressing the effect of heterogeneous noise in different distributions. However, the choice of excess risk leads to a very challenging minimax optimization problem, and currently there exists only an inefficient algorithm for empirical MERO. In this paper, we develop efficient stochastic approximation approaches which directly target MERO. Specifically, we leverage techniques from stochastic convex optimization to estimate the minimal risk of every distribution, and solve MERO as a stochastic convex-concave optimization (SCCO) problem with biased gradients. The presence of bias makes existing theoretical guarantees of SCCO inapplicable, and fortunately, we demonstrate that the bias, caused by the estimation error of the minimal risk, is under-control. Thus, MERO can still be optimized with a nearly optimal convergence rate. 
Moreover, we investigate a practical scenario where the quantity of samples drawn from each distribution may differ, and propose a stochastic approach that delivers *distribution-dependent* convergence rates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lijun Zhang;Haomin Bai;Wei-Wei Tu;Ping Yang;Yao Hu", "authorids": "~Lijun_Zhang1;baihm@lamda.nju.edu.cn;~Wei-Wei_Tu1;jiadi@xiaohongshu.com;~Yao_Hu1", "gender": ";;M;;M", "homepage": ";;;;", "dblp": ";;229/4363;;", "google_scholar": ";;NrSit7IAAAAJ;;LIu7k7wAAAAJ", "orcid": ";;0000-0002-2407-0252;;0009-0006-1274-7111", "linkedin": ";;wei-wei-tu/;;", "or_profile": "~Lijun_Zhang1;baihm@lamda.nju.edu.cn;~Wei-Wei_Tu1;jiadi@xiaohongshu.com;~Yao_Hu1", "aff": ";;4Paradigm Inc.;;Zhejiang University of Technology", "aff_domain": ";;4paradigm.com;;zjut.edu.cn", "position": ";;Vice President;;Researcher", "bibtex": "@inproceedings{\nzhang2024efficient,\ntitle={Efficient Stochastic Approximation of Minimax Excess Risk Optimization},\nauthor={Lijun Zhang and Haomin Bai and Wei-Wei Tu and Ping Yang and Yao Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oTYuORAMaP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1194881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16091228132489375390&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 7, "email": ";;4paradigm.com;;zjut.edu.cn", "author_num": 5, "aff_unique_index": "0;1", "aff_unique_norm": "4Paradigm;Zhejiang University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.4paradigm.com/;https://www.zjut.edu.cn", "aff_unique_abbr": "4Paradigm;ZJUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Human-like Category Learning by Injecting Ecological Priors from Large Language Models into Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33082", "id": "oTmQmaNkGn", "proceeding": "https://proceedings.mlr.press/v235/jagadish24a.html", "pdf": "https://openreview.net/pdf?id=oTmQmaNkGn", "openreview": "https://openreview.net/forum?id=oTmQmaNkGn", "author_site": "Akshay Kumar Jagadish, Julian Coda-Forno, Mirko Thalmann, Eric Schulz, Marcel Binz", "tldr": "", "abstract": "Ecological rationality refers to the notion that humans are rational agents adapted to their environment. However, testing this theory remains challenging for two reasons: the difficulty of defining what tasks are ecologically valid and of building rational models for these tasks. In this work, we demonstrate that large language models can generate cognitive tasks, specifically category learning tasks, that match the statistics of real-world tasks, thereby addressing the first challenge. We tackle the second challenge by deriving rational agents adapted to these tasks using the framework of meta-learning, leading to a class of models called *ecologically rational meta-learned inference* (ERMI). ERMI quantitatively explains human data better than seven other cognitive models in two different experiments. 
It additionally matches human behavior on a qualitative level: (1) it finds difficult the same tasks that humans find difficult, (2) it becomes more reliant on an exemplar-based strategy for assigning categories as learning progresses, and (3) it generalizes to unseen stimuli in a human-like way. Furthermore, we show that ERMI's ecologically valid priors allow it to achieve state-of-the-art performance on the OpenML-CC18 classification benchmark.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Akshay Kumar Jagadish;Julian Coda-Forno;Mirko Thalmann;Eric Schulz;Marcel Binz", "authorids": "~Akshay_Kumar_Jagadish1;~Julian_Coda-Forno1;~Mirko_Thalmann1;~Eric_Schulz1;~Marcel_Binz1", "gender": "M;M;;M;M", "homepage": "http://akshaykjagadish.com/;;;https://cpilab.org;", "dblp": "384/4213.html;;;124/0016;212/5102", "google_scholar": "B42Mr-sAAAAJ;beVJGycAAAAJ;;;https://scholar.google.de/citations?user=Lvm9Q8QAAAAJ", "orcid": "0000-0002-7897-9752;;;;", "linkedin": "akshaykjagadish/;;mirko-thalmann-phd-9261a7136/;;", "or_profile": "~Akshay_Kumar_Jagadish1;~Julian_Coda-Forno1;~Mirko_Thalmann1;~Eric_Schulz1;~Marcel_Binz1", "aff": "Max Planck Institute for Biological Cybernetics;Max Planck Institute for Biological Cybernetics, Max-Planck Institute;;Max Planck Institute for Biological Cybernetics;Helmholtz Zentrum M\u00fcnchen", "aff_domain": "tue.mpg.de;tuebingen.mpg.de;;tuebingen.mpg.de;helmholtz-munich.de", "position": "PhD student;PhD student;;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\njagadish2024humanlike,\ntitle={Human-like Category Learning by Injecting Ecological Priors from Large Language Models into Neural Networks},\nauthor={Akshay Kumar Jagadish and Julian Coda-Forno and Mirko Thalmann and Eric Schulz and Marcel Binz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oTmQmaNkGn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 973052, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13053870901935259592&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tue.mpg.de;tuebingen.mpg.de;;tuebingen.mpg.de;helmholtz-munich.de", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Max Planck Institute for Biological Cybernetics;Helmholtz Zentrum M\u00fcnchen", "aff_unique_dep": "Biological Cybernetics;", "aff_unique_url": "https://www.biocybernetics.mpg.de;https://www.helmholtz-muenchen.de", "aff_unique_abbr": "MPIBC;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Foundations of Testing for Finite-Sample Causal Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33081", "id": "oUmXcewb83", "proceeding": "https://proceedings.mlr.press/v235/yan24g.html", "pdf": "https://openreview.net/pdf?id=oUmXcewb83", "openreview": "https://openreview.net/forum?id=oUmXcewb83", "author_site": "Tom Yan, Ziyu Xu, Zachary Lipton", "tldr": "", "abstract": "Discovery of causal relationships is a fundamental goal of science and vital for sound decision making. As such, there has been considerable interest in causal discovery methods with provable guarantees. 
Existing works have thus far largely focused on discovery under hard intervention and infinite samples, in which intervening on a node readily reveals the orientation of every edge incident to the node. This setup, however, overlooks the stochasticity inherent in real-world, finite-sample settings. Our work takes a step towards studying finite-sample causal discovery, wherein multiple interventions on a node are now needed for edge orientation. In this work, we study the canonical setup in the theoretical causal discovery literature, where one assumes causal sufficiency and access to the graph skeleton. Our key observation is that discovery may be viewed as structured multiple testing, and we develop a novel testing framework to this end. Crucially, our framework allows for anytime valid testing as multiple tests are needed to conclude an edge orientation. It also allows for flexible combination of structured test-statistics (enabling one to use Meek rules to propagate edge orientation) as well as robust testing. Through empirical simulations, we confirm the usefulness of our framework. In closing, using this testing framework, we show how one may efficiently verify graph structure by drawing a connection to multi-constraint bandits and designing a novel algorithm to this end.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tom Yan;Ziyu Xu;Zachary Chase Lipton", "authorids": "~Tom_Yan1;~Ziyu_Xu2;~Zachary_Chase_Lipton1", "gender": ";;Unspecified", "homepage": ";https://neilzxu.me;http://zacklipton.com", "dblp": "213/7323;;", "google_scholar": ";;MN9Kfg8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tom_Yan1;~Ziyu_Xu2;~Zachary_Chase_Lipton1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyan2024foundations,\ntitle={Foundations of Testing for Finite-Sample Causal Discovery},\nauthor={Tom Yan and Ziyu Xu and Zachary Chase Lipton},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oUmXcewb83}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1879498, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QZW0SFVjonUJ:scholar.google.com/&scioq=Foundations+of+Testing+for+Finite-Sample+Causal+Discovery&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "cmu.edu;cmu.edu;cmu.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "BWS: Best Window Selection Based on Sample Scores for Data Pruning across Broad Ranges", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33080", "id": "oWYzIodyC4", "proceeding": "https://proceedings.mlr.press/v235/choi24c.html", "pdf": "https://openreview.net/pdf?id=oWYzIodyC4", "openreview": "https://openreview.net/forum?id=oWYzIodyC4", "author_site": "Hoyong Choi, Nohyun Ki, Hye Won Chung", "tldr": "", "abstract": "Data subset selection aims to find a smaller yet informative subset of a large dataset that can approximate the full-dataset training, addressing challenges 
associated with training neural networks on large-scale datasets. However, existing methods tend to specialize in either high or low selection ratio regimes, lacking a universal approach that consistently achieves competitive performance across a broad range of selection ratios. We introduce a universal and efficient data subset selection method, Best Window Selection (BWS), by proposing a method to choose the best window subset from samples ordered based on their difficulty scores. This approach offers flexibility by allowing the choice of window intervals that span from easy to difficult samples. Furthermore, we provide an efficient mechanism for selecting the best window subset by evaluating its quality using kernel ridge regression. Our experimental results demonstrate the superior performance of BWS compared to other baselines across a broad range of selection ratios over datasets, including CIFAR-10/100 and ImageNet, and the scenarios involving training from random initialization or fine-tuning of pre-trained models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hoyong Choi;Nohyun Ki;Hye Won Chung", "authorids": "~Hoyong_Choi1;~Nohyun_Ki1;~Hye_Won_Chung2", "gender": "M;F;M", "homepage": ";https://iids.kaist.ac.kr/;https://iids.kaist.ac.kr/people", "dblp": ";https://dblp.uni-trier.de/pers/hd/c/Chung:Hye_Won;", "google_scholar": ";;", "orcid": ";;", "linkedin": "%ED%98%B8%EC%9A%A9-%EC%B5%9C-67b9a919b/;;", "or_profile": "~Hoyong_Choi1;~Hye_Won_Chung2;~Ki_Nohyun1", "aff": "KAIST;Korea Advanced Institute of Science & Technology;KAIST, Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;ee.kaist.ac.kr", "position": "PhD student;Associate Professor;PhD student", "bibtex": "@inproceedings{\nchoi2024bws,\ntitle={{BWS}: Best Window Selection Based on Sample Scores for Data Pruning across Broad Ranges},\nauthor={Hoyong Choi and Nohyun Ki and Hye Won Chung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oWYzIodyC4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2265630, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18175229063289832276&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;kaist.ac.kr;ee.kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "A Linear Time and Space Local Point Cloud Geometry Encoder via Vectorized Kernel Mixture (VecKM)", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33079", "id": "oYltxxam2t", "proceeding": "https://proceedings.mlr.press/v235/yuan24b.html", "pdf": "https://openreview.net/pdf?id=oYltxxam2t", "openreview": "https://openreview.net/forum?id=oYltxxam2t", "author_site": "Dehao Yuan, Cornelia Fermuller, Tahseen Rabbani, Furong Huang, Yiannis Aloimonos", "tldr": "", "abstract": "We propose VecKM, a local point cloud geometry encoder that is descriptive and efficient to compute. VecKM leverages a unique approach by vectorizing a kernel mixture to represent the local point cloud. 
This representation's descriptiveness is supported by two theorems that validate its ability to reconstruct and preserve the similarity of the local shape. Unlike existing encoders that downsample the local point cloud, VecKM constructs the local geometry encoding using all neighboring points, producing a more descriptive encoding. Moreover, VecKM is efficient to compute and scalable to large point cloud inputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and reduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$ is the size of the point cloud, $K$ is the neighborhood size, $d$ is the encoding dimension, and $p$ is a marginal factor. The efficiency is due to VecKM's unique factorizable property that eliminates the need for explicitly grouping points into neighbors. In the normal estimation task, VecKM demonstrates not only 100x faster inference speed but also the highest accuracy and strongest robustness. In classification and segmentation tasks, integrating VecKM as a preprocessing module achieves consistently better performance than the PointNet, PointNet++, and point transformer baselines, and runs consistently faster by up to 10 times.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dehao Yuan;Cornelia Fermuller;Tahseen Rabbani;Furong Huang;Yiannis Aloimonos", "authorids": "~Dehao_Yuan1;~Cornelia_Fermuller3;~Tahseen_Rabbani1;~Furong_Huang1;~Yiannis_Aloimonos1", "gender": "M;F;M;F;M", "homepage": "https://www.cs.umd.edu/~dhyuan/;http://users.umiacs.umd.edu/users/fer/;https://www.cs.umd.edu/people/trabbani;https://furong-huang.com;http://www.prg.cs.umd.edu", "dblp": "321/3498;f/CorneliaFermuller;280/2362;72/8513;a/YiannisAloimonos", "google_scholar": "B3zkxloAAAAJ;0gEOJSEAAAAJ;;13yyuCcAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-1957-6360;0000-0003-2044-2386;;;", "linkedin": "dehao-yuan-875ba115b/;cornelia-fermuller-594b855/;;;yiannis-aloimonos-6374865/", "or_profile": "~Dehao_Yuan1;~Cornelia_Fermuller3;~Tahseen_Rabbani1;~Furong_Huang1;~Yiannis_Aloimonos1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umd.edu;cs.umd.edu;umd.edu", "position": "PhD student;Research Scientist;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nyuan2024a,\ntitle={A Linear Time and Space Local Point Cloud Geometry Encoder via Vectorized Kernel Mixture (Vec{KM})},\nauthor={Dehao Yuan and Cornelia Fermuller and Tahseen Rabbani and Furong Huang and Yiannis Aloimonos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oYltxxam2t}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6937255, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2039137315850151867&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "umd.edu;umd.edu;umd.edu;cs.umd.edu;umd.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Relative Value of Prediction in 
Algorithmic Decision Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33078", "id": "oaACFfNbXl", "proceeding": "https://proceedings.mlr.press/v235/perdomo24a.html", "pdf": "https://openreview.net/pdf?id=oaACFfNbXl", "openreview": "https://openreview.net/forum?id=oaACFfNbXl", "author_site": "Juan Perdomo", "tldr": "", "abstract": "Algorithmic predictions are increasingly used to inform the allocations of goods and interventions in the public sphere. In these domains, predictions serve as a means to an end. They provide stakeholders with insights into the likelihood of future events as a means to improve decision making quality and enhance social welfare. However, if maximizing welfare is the ultimate goal, prediction is only a small piece of the puzzle. There are various other policy levers a social planner might pursue in order to improve bottom-line outcomes, such as expanding access to available goods, or increasing the effect sizes of interventions. Given this broad range of design decisions, a basic question to ask is: What is the relative value of prediction in algorithmic decision making? How do the improvements in welfare arising from better predictions compare to those of other policy levers? The goal of our work is to initiate the formal study of these questions. Our main results are theoretical in nature. We identify simple, sharp conditions determining the relative value of prediction vis-\u00e0-vis expanding access, within several statistical models that are popular amongst quantitative social scientists. Furthermore, we illustrate how these theoretical insights can guide the design of algorithmic decision making systems in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Juan Carlos Perdomo", "authorids": "~Juan_Carlos_Perdomo1", "gender": "M", "homepage": "https://jcperdomo.org/", "dblp": "242/7773.html", "google_scholar": "TeBmXz4AAAAJ", "orcid": "", "linkedin": "", "or_profile": "~Juan_Carlos_Perdomo1", "aff": "Harvard University", "aff_domain": "harvard.edu", "position": "Postdoc", "bibtex": "@inproceedings{\nperdomo2024the,\ntitle={The Relative Value of Prediction in Algorithmic Decision Making},\nauthor={Juan Carlos Perdomo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oaACFfNbXl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3931913, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14782637256683473324&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 8, "email": "harvard.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Uniformly Stable Algorithms for Adversarial Training and Beyond", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33077", "id": "odCl49tWA6", "proceeding": "https://proceedings.mlr.press/v235/xiao24e.html", "pdf": "https://openreview.net/pdf?id=odCl49tWA6", "openreview": "https://openreview.net/forum?id=odCl49tWA6", "author_site": "Jiancong Xiao, Jiawei Zhang, Zhi-Quan Luo, Asuman Ozdaglar", "tldr": "", "abstract": "In adversarial machine learning, neural networks suffer from a significant issue known as robust overfitting, 
where the robust test accuracy decreases over epochs (Rice et al., 2020). Recent research (Xing et al., 2021; Xiao et al., 2022) has focused on studying the uniform stability of adversarial training. Their investigations revealed that SGD-based adversarial training fails to exhibit uniform stability, and the derived stability bounds align with the observed phenomenon of robust overfitting in experiments. This finding motivates us to develop uniformly stable algorithms specifically tailored for adversarial training. To this end, we introduce Moreau envelope-$\\mathcal{A}$ (ME-$\\mathcal{A}$), a variant of the Moreau Envelope-type algorithm. We employ a Moreau envelope function to reframe the original problem as a min-min problem, separating the non-strong convexity and non-smoothness of the adversarial loss. Then, this approach alternates between solving the inner and outer minimization problems to achieve uniform stability without incurring additional computational overhead. In practical scenarios, we demonstrate the efficacy of ME-$\\mathcal{A}$ in mitigating the issue of robust overfitting. Beyond its application in adversarial training, this represents a fundamental result in uniform stability analysis, as ME-$\\mathcal{A}$ is the first algorithm to exhibit uniform stability for weakly-convex, non-smooth problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiancong Xiao;Jiawei Zhang;Zhi-Quan Luo;Asuman E. Ozdaglar", "authorids": "~Jiancong_Xiao1;~Jiawei_Zhang6;~Zhi-Quan_Luo1;~Asuman_E._Ozdaglar1", "gender": "M;M;M;F", "homepage": "https://jiancongxiao.github.io;https://www.cuhk.edu.cn/;;https://asu.mit.edu/", "dblp": "330/4306;;;35/2875", "google_scholar": "_vGY3joAAAAJ;;dW3gcXoAAAAJ;https://scholar.google.com.tw/citations?user=nWnBSOsAAAAJ", "orcid": ";0000-0002-9420-384X;;", "linkedin": ";;;", "or_profile": "~Jiancong_Xiao1;~Jiawei_Zhang6;~Zhi-Quan_Luo1;~Asuman_E._Ozdaglar1", "aff": "University of Pennsylvania;Massachusetts Institute of Technology;The Chinese University of Hong Kong, Shenzhen;Massachusetts Institute of Technology", "aff_domain": "upenn.edu;mit.edu;cuhk.edu.cn;mit.edu", "position": "Postdoc;Postdoc;Full Professor;PhD student", "bibtex": "@inproceedings{\nxiao2024uniformly,\ntitle={Uniformly Stable Algorithms for Adversarial Training and Beyond},\nauthor={Jiancong Xiao and Jiawei Zhang and Zhi-Quan Luo and Asuman E. 
Ozdaglar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=odCl49tWA6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 487189, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2677523054234976607&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "upenn.edu;mit.edu;cuhk.edu.cn;mit.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Pennsylvania;Massachusetts Institute of Technology;Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.upenn.edu;https://web.mit.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "UPenn;MIT;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Generative Active Learning for Long-tailed Instance Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33076", "id": "ofXRBPtol3", "proceeding": "https://proceedings.mlr.press/v235/zhu24b.html", "pdf": "https://openreview.net/pdf?id=ofXRBPtol3", "openreview": "https://openreview.net/forum?id=ofXRBPtol3", "author_site": "Muzhi Zhu, Chengxiang Fan, Hao Chen, Yang Liu, Weian Mao, Xiaogang Xu, Chunhua Shen", "tldr": "", "abstract": "Recently, large-scale language-image generative models have gained widespread attention and many works have utilized generated data from these models to further enhance the performance of perception tasks. However, not all generated data can positively impact downstream models, and these methods do not thoroughly explore how to better select and utilize generated data. On the other hand, there is still a lack of research oriented towards active learning on generated data. In this paper, we explore how to perform active learning specifically for generated data in the long-tailed instance segmentation task. Subsequently, we propose BSGAL, a new algorithm that estimates the contribution of the current batch-generated data based on gradient cache. BSGAL is meticulously designed to cater for unlimited generated data and complex downstream segmentation tasks. 
BSGAL outperforms the baseline approach and effectively improves the performance of long-tailed segmentation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Muzhi Zhu;Chengxiang Fan;Hao Chen;Yang Liu;Weian Mao;Xiaogang Xu;Chunhua Shen", "authorids": "~Muzhi_Zhu1;~Chengxiang_Fan1;~Hao_Chen17;~Yang_Liu83;~Weian_Mao2;~Xiaogang_Xu2;~Chunhua_Shen2", "gender": "M;M;;M;M;M;", "homepage": "https://z-mu-z.github.io/;https://leaf1170124460.github.io/;;https://github.com/yangliu96;;https://xiaogang00.github.io;", "dblp": "157/1679;353/0691;;51/3710-98.html;289/1631;118/2268-2;", "google_scholar": "https://scholar.google.com.hk/citations?user=064gBH4AAAAJ;I2aAMsAAAAAJ;;9JcQ2hwAAAAJ;Qu-QXTsAAAAJ;https://scholar.google.com.hk/citations?user=R65xDQwAAAAJ;", "orcid": ";0009-0000-2555-4112;;0009-0003-8540-9154;;0000-0002-7928-7336;", "linkedin": ";;;;;;", "or_profile": "~Muzhi_Zhu1;~Chengxiang_Fan1;~Hao_Chen17;~Yang_Liu83;~Weian_Mao2;~Xiaogang_Xu2;~Chunhua_Shen2", "aff": "Zhejiang University;Zhejiang University;;Zhejiang University;University of Adelaide;Zhejiang University;", "aff_domain": "zju.edu.cn;zju.edu.cn;;zju.edu.cn;adelaide.edu.au;zju.edu.cn;", "position": "PhD student;MS student;;PhD student;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nzhu2024generative,\ntitle={Generative Active Learning for Long-tailed Instance Segmentation},\nauthor={Muzhi Zhu and Chengxiang Fan and Hao Chen and Yang Liu and Weian Mao and Xiaogang Xu and Chunhua Shen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ofXRBPtol3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3029957, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3270971446566625675&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;zju.edu.cn;;zju.edu.cn;adelaide.edu.au;zju.edu.cn;", "author_num": 7, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Zhejiang University;University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.adelaide.edu.au", "aff_unique_abbr": "ZJU;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;Australia" }, { "title": "Generating In-Distribution Proxy Graphs for Explaining Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33075", "id": "ohG9bVMs5j", "proceeding": "https://proceedings.mlr.press/v235/chen24bd.html", "pdf": "https://openreview.net/pdf?id=ohG9bVMs5j", "openreview": "https://openreview.net/forum?id=ohG9bVMs5j", "author_site": "Zhuomin Chen, Jiaxing Zhang, Jingchao Ni, Xiaoting Li, Yuchen Bian, Md Mezbahul Islam, Ananda Mondal, Hua Wei, Dongsheng Luo", "tldr": "", "abstract": "Graph Neural Networks (GNNs) have become a building block in graph data processing, with wide applications in critical domains. The growing need to deploy GNNs in high-stakes applications necessitates explainability for users in the decision-making processes. A popular paradigm for the explainability of GNNs is to identify explainable subgraphs by comparing their labels with those of the original graphs.
This task is challenging due to the substantial distributional shift from the original graphs in the training set to the set of explainable subgraphs, which prevents accurate prediction of labels with the subgraphs. To address it, in this paper, we propose a novel method that generates proxy graphs for explainable subgraphs that are in the distribution of training data. We introduce a parametric method that employs graph generators to produce proxy graphs. A new training objective based on information theory is designed to ensure that proxy graphs not only adhere to the distribution of training data but also preserve explanatory factors. Such generated proxy graphs can be reliably used to approximate the predictions of the labels of explainable subgraphs. Empirical evaluations across various datasets demonstrate our method achieves more accurate explanations for GNNs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuomin Chen;Jiaxing Zhang;Jingchao Ni;Xiaoting Li;Yuchen Bian;Md Mezbahul Islam;Ananda Mondal;Hua Wei;Dongsheng Luo", "authorids": "~Zhuomin_Chen1;~Jiaxing_Zhang4;~Jingchao_Ni1;~Xiaoting_Li3;~Yuchen_Bian1;~Md_Mezbahul_Islam1;~Ananda_Mondal1;~Hua_Wei1;~Dongsheng_Luo1", "gender": ";M;M;F;M;;M;M;M", "homepage": ";https://tabzhangjx.github.io/;;https://xiaoting.me/;https://yuchenbian.github.io/;;https://www.cis.fiu.edu/faculty-staff/mondal-ananda/;https://www.public.asu.edu/~hwei27/;https://users.cs.fiu.edu/~dluo/", "dblp": ";131/6330-2;151/3208;;187/4068;;;01/6961-1;", "google_scholar": ";86f1WUAAAAAJ;rH9MTZMAAAAJ;wlf7M-cAAAAJ;gU0icBEAAAAJ;;;F1CEAKwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";0009-0007-8031-661X;;;0000-0002-0685-3771;;;0000-0002-3735-1635;0000-0003-4192-0826", "linkedin": ";jiaxing-zhang-45593b156/;jingchao-ni-930a3871/;;yuchenbian/;;;;", "or_profile": "~Zhuomin_Chen1;~Jiaxing_Zhang4;~Jingchao_Ni1;~Xiaoting_Li3;~Yuchen_Bian1;~Md_Mezbahul_Islam1;~Ananda_Mondal1;~Hua_Wei1;~Dongsheng_Luo1", "aff": ";New Jersey Institute of Technology;Amazon;VISA;Amazon;;Florida International University;Arizona State University;Florida International University", "aff_domain": ";njit.edu;amazon.com;visa.com;amazon.com;;fiu.edu;asu.edu;fiu.edu", "position": ";PhD student;Applied Scientist;Researcher;Researcher;;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024generating,\ntitle={Generating In-Distribution Proxy Graphs for Explaining Graph Neural Networks},\nauthor={Zhuomin Chen and Jiaxing Zhang and Jingchao Ni and Xiaoting Li and Yuchen Bian and Md Mezbahul Islam and Ananda Mondal and Hua Wei and Dongsheng Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ohG9bVMs5j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7162697, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12463625008539949149&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";njit.edu;amazon.com;visa.com;amazon.com;;fiu.edu;asu.edu;fiu.edu", "author_num": 9, "aff_unique_index": "0;1;2;1;3;4;3", "aff_unique_norm": "New Jersey Institute of Technology;Amazon;VISA;Florida International University;Arizona State University", "aff_unique_dep": ";Amazon.com, Inc.;;;", "aff_unique_url": "https://www.njit.edu;https://www.amazon.com;https://www.visa.com;https://www.fiu.edu;https://www.asu.edu", "aff_unique_abbr": 
"NJIT;Amazon;VISA;FIU;ASU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Optimal bounds for $\\ell_p$ sensitivity sampling via $\\ell_2$ augmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33074", "id": "ohH3sbUue2", "proceeding": "https://proceedings.mlr.press/v235/munteanu24a.html", "pdf": "https://openreview.net/pdf?id=ohH3sbUue2", "openreview": "https://openreview.net/forum?id=ohH3sbUue2", "author_site": "Alexander Munteanu, Simon Omlor", "tldr": "", "abstract": "Data subsampling is one of the most natural methods to approximate a massively large data set by a small representative proxy. In particular, sensitivity sampling received a lot of attention, which samples points proportional to an individual importance measure called sensitivity. This framework reduces in very general settings the size of data to roughly the VC dimension $d$ times the total sensitivity $\\mathfrak S$ while providing strong $(1\\pm\\varepsilon)$ guarantees on the quality of approximation. The recent work of Woodruff & Yasuda (2023c) improved substantially over the general $\\tilde O(\\varepsilon^{-2}\\mathfrak Sd)$ bound for the important problem of $\\ell_p$ subspace embeddings to $\\tilde O(\\varepsilon^{-2}\\mathfrak S^{2/p})$ for $p\\in[1,2]$. Their result was subsumed by an earlier $\\tilde O(\\varepsilon^{-2}\\mathfrak Sd^{1-p/2})$ bound which was implicitly given in the work of Chen & Derezinski (2021). We show that their result is tight when sampling according to plain $\\ell_p$ sensitivities. We observe that by augmenting the $\\ell_p$ sensitivities by $\\ell_2$ sensitivities, we obtain better bounds improving over the aforementioned results to optimal linear $\\tilde O(\\varepsilon^{-2}(\\mathfrak S+d)) = \\tilde O(\\varepsilon^{-2}d)$ sampling complexity for all $p \\in [1,2]$. In particular, this resolves an open question of Woodruff & Yasuda (2023c) in the affirmative for $p \\in [1,2]$ and brings sensitivity subsampling into the regime that was previously only known to be possible using Lewis weights (Cohen & Peng, 2015). As an application of our main result, we also obtain an $\\tilde O(\\varepsilon^{-2}\\mu d)$ sensitivity sampling bound for logistic regression, where $\\mu$ is a natural complexity measure for this problem. This improves over the previous $\\tilde O(\\varepsilon^{-2}\\mu^2 d)$ bound of Mai et al. 
(2021) which was based on Lewis weights subsampling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Munteanu;Simon Omlor", "authorids": "~Alexander_Munteanu1;~Simon_Omlor1", "gender": "M;", "homepage": "https://biometrie.statistik.tu-dortmund.de/lehrstuhl/team/alexander-munteanu/;https://www.statistik.tu-dortmund.de/omlor.html", "dblp": "145/3380;254/2706.html", "google_scholar": "https://scholar.google.de/citations?hl=en;", "orcid": ";", "linkedin": ";", "or_profile": "~Alexander_Munteanu1;~Simon_Omlor1", "aff": "Universit\u00e4t K\u00f6ln;", "aff_domain": "uni-koeln.de;", "position": "Full Professor;", "bibtex": "@inproceedings{\nmunteanu2024optimal,\ntitle={Optimal bounds for \\${\\textbackslash}ell\\_p\\$ sensitivity sampling via \\${\\textbackslash}ell\\_2\\$ augmentation},\nauthor={Alexander Munteanu and Simon Omlor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ohH3sbUue2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 517924, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "uni-koeln.de;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Cologne", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-koeln.de/", "aff_unique_abbr": "UC", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Off-policy Evaluation Beyond Overlap: Sharp Partial Identification Under Smoothness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33073", "id": "oiY7yhyi6W", "proceeding": "https://proceedings.mlr.press/v235/khan24b.html", "pdf": "https://openreview.net/pdf?id=oiY7yhyi6W", "openreview": "https://openreview.net/forum?id=oiY7yhyi6W", "author_site": "Samir Khan, Martin Saveski, Johan Ugander", "tldr": "", "abstract": "Off-policy evaluation, and the complementary problem of policy learning, use historical data collected under a logging policy to estimate and/or optimize the value of a target policy. Methods for these tasks typically assume overlap between the target and logging policy, enabling solutions based on importance weighting and/or imputation. Absent such an overlap assumption, existing work either relies on a well-specified model or optimizes needlessly conservative bounds. In this work, we develop methods for no-overlap policy evaluation without a well-specified model, relying instead on non-parametric assumptions on the expected outcome, with a particular focus on Lipschitz smoothness. Under such assumptions we are able to provide sharp bounds on the off-policy value, along with optimal estimators of those bounds. For Lipschitz smoothness, we construct a pair of linear programs that upper and lower bound the contribution of the no-overlap region to the off-policy value. We show that these programs have a concise closed form solution, and that their solutions converge under the Lipschitz assumption to the sharp partial identification bounds at a minimax optimal rate, up to log factors. 
We demonstrate the effectiveness of our methods on two semi-synthetic examples and obtain informative and valid bounds that are tighter than those possible without smoothness assumptions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samir Khan;Martin Saveski;Johan Ugander", "authorids": "~Samir_Khan1;~Martin_Saveski1;~Johan_Ugander1", "gender": "M;M;M", "homepage": "https://web.stanford.edu/~samirk/;http://martinsaveski.com/;http://stanford.edu/~jugander/", "dblp": ";138/9642;13/10542.html", "google_scholar": ";M3D870YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0000-0001-5655-4086", "linkedin": "samir-khan-9536a9175/;;", "or_profile": "~Samir_Khan1;~Martin_Saveski1;~Johan_Ugander1", "aff": "Stanford University;University of Washington;Yale University", "aff_domain": "stanford.edu;uw.edu;yale.edu", "position": "PhD student;Assistant Professor;Visiting Associate Professor", "bibtex": "@inproceedings{\nkhan2024offpolicy,\ntitle={Off-policy Evaluation Beyond Overlap: Sharp Partial Identification Under Smoothness},\nauthor={Samir Khan and Martin Saveski and Johan Ugander},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oiY7yhyi6W}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 526605, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12628131004177159728&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "stanford.edu;uw.edu;yale.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Stanford University;University of Washington;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.washington.edu;https://www.yale.edu", "aff_unique_abbr": "Stanford;UW;Yale", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Q-value Regularized Transformer for Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33072", "id": "ojtddicekd", "proceeding": "https://proceedings.mlr.press/v235/hu24c.html", "pdf": "https://openreview.net/pdf?id=ojtddicekd", "openreview": "https://openreview.net/forum?id=ojtddicekd", "author_site": "Shengchao Hu, Ziqing Fan, Chaoqin Huang, Li Shen, Ya Zhang, Yanfeng Wang, Dacheng Tao", "tldr": "", "abstract": "Recent advancements in offline reinforcement learning (RL) have underscored the capabilities of Conditional Sequence Modeling (CSM), a paradigm that learns the action distribution based on the history trajectory and target returns for each state. However, these methods often struggle with stitching together optimal trajectories from sub-optimal ones due to the inconsistency between the sampled returns within individual trajectories and the optimal returns across multiple trajectories. Fortunately, Dynamic Programming (DP) methods offer a solution by leveraging a value function to approximate optimal future returns for each state, though these techniques are prone to unstable learning behaviors, particularly in long-horizon and sparse-reward scenarios. Building upon these insights, we propose the Q-value regularized Transformer (QT), which combines the trajectory modeling ability of the Transformer with the predictability of optimal future returns from DP methods.
QT learns an action-value function and integrates a term maximizing action-values into the training loss of CSM, which aims to seek optimal actions that align closely with the behavior policy. Empirical evaluations on D4RL benchmark datasets demonstrate the superiority of QT over traditional DP and CSM methods, highlighting the potential of QT to enhance the state-of-the-art in offline RL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengchao Hu;Ziqing Fan;Chaoqin Huang;Li Shen;Ya Zhang;Yanfeng Wang;Dacheng Tao", "authorids": "~Shengchao_Hu1;~Ziqing_Fan1;~Chaoqin_Huang1;~Li_Shen1;~Ya_Zhang1;~Yanfeng_Wang1;~Dacheng_Tao1", "gender": ";;M;M;F;M;", "homepage": ";;https://chaoqinhuang.github.io/;https://sites.google.com/site/mathshenli/home;https://annzhanglion.github.io/;https://cmic.sjtu.edu.cn/wangyanfeng/;", "dblp": ";;215/5540;91/3680-8;85/3714-2;55/5407-1.html;", "google_scholar": ";;BAZSE7wAAAAJ;yVhgENIAAAAJ;pbjw9sMAAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";;;;0000-0002-5390-9053;0000-0002-3196-2347;", "linkedin": ";;;;;;", "or_profile": "~Shengchao_Hu1;~Ziqing_Fan1;~Chaoqin_Huang1;~Li_Shen1;~Ya_Zhang1;~Yanfeng_Wang1;~Dacheng_Tao1", "aff": ";;Shanghai Jiaotong University;JD Explore Academy;Shanghai Jiaotong University;Shanghai Jiaotong University;", "aff_domain": ";;sjtu.edu.cn;jd.com;sjtu.edu.cn;sjtu.edu.cn;", "position": ";;PhD student;Researcher;Professor;Full Professor;", "bibtex": "@inproceedings{\nhu2024qvalue,\ntitle={Q-value Regularized Transformer for Offline Reinforcement Learning},\nauthor={Shengchao Hu and Ziqing Fan and Chaoqin Huang and Li Shen and Ya Zhang and Yanfeng Wang and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ojtddicekd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 470926, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6240625810451690750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";;sjtu.edu.cn;jd.com;sjtu.edu.cn;sjtu.edu.cn;", "author_num": 7, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;JD", "aff_unique_dep": ";JD Explore Academy", "aff_unique_url": "https://www.sjtu.edu.cn;", "aff_unique_abbr": "SJTU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China;" }, { "title": "Mitigating Catastrophic Forgetting in Online Continual Learning by Modeling Previous Task Interrelations via Pareto Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33071", "id": "olbTrkWo1D", "proceeding": "https://proceedings.mlr.press/v235/wu24ab.html", "pdf": "https://openreview.net/pdf?id=olbTrkWo1D", "openreview": "https://openreview.net/forum?id=olbTrkWo1D", "author_site": "Yichen WU, Hong Wang, Peilin Zhao, Yefeng Zheng, Ying WEI, Long-Kai Huang", "tldr": "", "abstract": "Catastrophic forgetting remains a core challenge in continual learning (CL), where the models struggle to retain previous knowledge when learning new tasks. 
While existing replay-based CL methods have been proposed to tackle this challenge by utilizing a memory buffer to store data from previous tasks, they generally overlook the interdependence between previously learned tasks and fail to encapsulate the optimally integrated knowledge in previous tasks, leading to sub-optimal performance on the previous tasks. To address this issue, we first reformulate replay-based CL methods as a unified hierarchical gradient aggregation framework. We then incorporate Pareto optimization to capture the interrelationship among previously learned tasks and design a Pareto-Optimized CL algorithm (POCL), which effectively enhances the overall performance of past tasks while ensuring the performance of the current task. Comprehensive empirical results demonstrate that the proposed POCL outperforms current state-of-the-art CL methods across multiple datasets and different settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yichen Wu;Hong Wang;Peilin Zhao;Yefeng Zheng;Ying Wei;Long-Kai Huang", "authorids": "~Yichen_Wu2;~Hong_Wang5;~Peilin_Zhao2;~Yefeng_Zheng3;~Ying_Wei1;~Long-Kai_Huang1", "gender": "M;F;;F;;M", "homepage": "https://wuyichen-97.github.io/;https://hongwang01.github.io/;;https://wei-ying.net/;https://sites.google.com/site/longkaihugo/home;https://en.westlake.edu.cn/faculty/yefeng-zheng.html", "dblp": ";83/5522-21;84/8411;14/4899-1;133/2006;44/6510", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;I5RH0CwAAAAJ;https://scholar.google.com.hk/citations?user=HPeX_YcAAAAJ;5UpFdKsAAAAJ;CaP64WUAAAAJ;vAIECxgAAAAJ", "orcid": "0000-0003-2859-3285;;0000-0001-8543-3953;;0000-0001-5263-1443;0000-0003-2195-2847", "linkedin": ";;;;;yefeng-zheng-bb45641/?originalSubdomain=cn", "or_profile": "~Yichen_Wu2;~Hong_Wang5;~Peilin_Zhao2;~Ying_Wei1;~Long-Kai_Huang1;~Yefeng_Zheng2", "aff": "City University of Hong Kong;Tencent ;Tencent;Nanyang Technological University;Tencent;Tencent Jarvis Lab", "aff_domain": "cityu.edu.hk;tencent.com;tencent.com;ntu.edu.sg;tencent.com;tencent.com", "position": "PhD student;Senior Researcher;Researcher;Assistant Professor;Researcher;Director", "bibtex": "@inproceedings{\nwu2024mitigating,\ntitle={Mitigating Catastrophic Forgetting in Online Continual Learning by Modeling Previous Task Interrelations via Pareto Optimization},\nauthor={Yichen Wu and Hong Wang and Peilin Zhao and Yefeng Zheng and Ying Wei and Long-Kai Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=olbTrkWo1D}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 852475, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7249503852165454836&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "cityu.edu.hk;tencent.com;tencent.com;ntu.edu.sg;tencent.com;tencent.com", "author_num": 6, "aff_unique_index": "0;1;1;2;1;1", "aff_unique_norm": "City University of Hong Kong;Tencent;Nanyang Technological University", "aff_unique_dep": ";Tencent Holdings Limited;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.tencent.com;https://www.ntu.edu.sg", "aff_unique_abbr": "CityU;Tencent;NTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "A Theory of Fault-Tolerant Learning", "status": "Spotlight", "track": "main",
"site": "https://icml.cc/virtual/2024/poster/33070", "id": "ooh8tkXKyR", "proceeding": "https://proceedings.mlr.press/v235/wu24z.html", "pdf": "https://openreview.net/pdf?id=ooh8tkXKyR", "openreview": "https://openreview.net/forum?id=ooh8tkXKyR", "author_site": "Changlong Wu, Yifan Wang, Ananth Grama", "tldr": "", "abstract": "Developing machine learning models that account for potential faults encountered in real-world environments presents a fundamental challenge for mission-critical applications. In this paper, we introduce a novel theoretical framework grounded in learning theory for dealing with faults. In particular, we propose a framework called *fault-tolerant PAC learning*, aimed at identifying the most fault-tolerant models from a given hypothesis class (such as neural networks). We show that if faults occur randomly, fault-tolerant learning is equivalent to regular PAC learning. However, for *adversarial* faults, we show that the sample complexity of fault-tolerant PAC learning can grow linearly w.r.t. the number of perturbing functions induced by the faults, even for a hypothesis class with VC-dimension 1. We then provide a matching upper bound by restricting the number of perturbing functions. Finally, we show that the linear dependency on the number of perturbing functions can be substantially improved for *deletion faults* in neural networks. Our work provides a powerful formal framework and avenues for a number of future investigations on the precise characterization of fault-tolerant learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Changlong Wu;Yifan Wang;Ananth Grama", "authorids": "~Changlong_Wu1;~Yifan_Wang14;~Ananth_Grama1", "gender": "M;F;M", "homepage": "https://changlongwu1993.github.io/;https://cacayaya.github.io/;https://www.cs.purdue.edu/homes/ayg/", "dblp": "204/4267;;", "google_scholar": "-T9eX0kAAAAJ;hqL5jWYAAAAJ;https://scholar.google.com.tw/citations?user=bpsZlEQAAAAJ", "orcid": ";;", "linkedin": ";yifan-wang-66521524b/;", "or_profile": "~Changlong_Wu1;~Yifan_Wang14;~Ananth_Grama1", "aff": "Purdue University;Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu;purdue.edu", "position": "Postdoc;PhD student;Full Professor", "bibtex": "@inproceedings{\nwu2024a,\ntitle={A Theory of Fault-Tolerant Learning},\nauthor={Changlong Wu and Yifan Wang and Ananth Grama},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ooh8tkXKyR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 428145, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6etsbjAKWn4J:scholar.google.com/&scioq=A+Theory+of+Fault-Tolerant+Learning&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "purdue.edu;purdue.edu;purdue.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Explain to Question not to Justify", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33069", "id": "ooikIHLHCs", "proceeding": "https://proceedings.mlr.press/v235/biecek24a.html", "pdf": "https://openreview.net/pdf?id=ooikIHLHCs", "openreview": 
"https://openreview.net/forum?id=ooikIHLHCs", "author_site": "Przemyslaw Biecek, Wojciech Samek", "tldr": "", "abstract": "Explainable Artificial Intelligence (XAI) is a young but very promising field of research. Unfortunately, the progress in this field is currently slowed down by divergent and incompatible goals. We separate various threads tangled within the area of XAI into two complementary cultures of human/value-oriented explanations (BLUE XAI) and model/validation-oriented explanations (RED XAI). This position paper argues that the area of RED XAI is currently under-explored, i.e., more methods for explainability are desperately needed to question models (e.g., extract knowledge from well-performing models as well as spotting and fixing bugs in faulty models), and the area of RED XAI hides great opportunities and potential for important research necessary to ensure the safety of AI systems. We conclude this paper by presenting promising challenges in this area.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Przemyslaw Biecek;Wojciech Samek", "authorids": "~Przemyslaw_Biecek2;~Wojciech_Samek1", "gender": "M;Not Specified", "homepage": "http://iphome.hhi.de/samek/;http://biecek.pl/", "dblp": "79/9736;68/2414", "google_scholar": "7aQwO08AAAAJ;https://scholar.google.pl/citations?user=Af0O75cAAAAJ", "orcid": ";0000-0001-8423-1823", "linkedin": ";pbiecek/", "or_profile": "~Wojciech_Samek1;~Przemyslaw_Biecek1", "aff": "Fraunhofer HHI;University of Warsaw", "aff_domain": "hhi.fraunhofer.de;uw.edu.pl", "position": "Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nbiecek2024position,\ntitle={Position: Explain to Question not to Justify},\nauthor={Przemyslaw Biecek and Wojciech Samek},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ooikIHLHCs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 600814, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2594797309159984290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "hhi.fraunhofer.de;uw.edu.pl", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Fraunhofer Heinrich Hertz Institute;University of Warsaw", "aff_unique_dep": ";", "aff_unique_url": "https://www.hhi.fraunhofer.de/;https://www.uw.edu.pl", "aff_unique_abbr": "HHI;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Poland" }, { "title": "Theoretical Analysis of Learned Database Operations under Distribution Shift through Distribution Learnability", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33068", "id": "oowQ8LPA12", "proceeding": "https://proceedings.mlr.press/v235/zeighami24a.html", "pdf": "https://openreview.net/pdf?id=oowQ8LPA12", "openreview": "https://openreview.net/forum?id=oowQ8LPA12", "author_site": "Sepanta Zeighami, Cyrus Shahabi", "tldr": "", "abstract": "Use of machine learning to perform database operations, such as indexing, cardinality estimation, and sorting, is shown to provide substantial performance benefits. However, when datasets change and data distribution shifts, empirical results also show performance degradation for learned models, possibly to worse than non-learned alternatives. 
This, together with a lack of theoretical understanding of learned methods, undermines their practical applicability, since there are no guarantees on how well the models will perform after deployment. In this paper, we present the first known theoretical characterization of the performance of learned models in dynamic datasets, for the aforementioned operations. Our results show novel theoretical characteristics achievable by learned models and provide bounds on the performance of the models that characterize their advantages over non-learned methods, showing why and when learned models can outperform the alternatives. Our analysis develops the *distribution learnability* framework and novel theoretical tools which build the foundation for the analysis of learned database operations in the future.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sepanta Zeighami;Cyrus Shahabi", "authorids": "~Sepanta_Zeighami2;~Cyrus_Shahabi1", "gender": "M;M", "homepage": "https://szeighami.github.io/;https://infolab.usc.edu/Shahabi/", "dblp": ";s/CyrusShahabi", "google_scholar": ";jEdhxGMAAAAJ", "orcid": ";0000-0001-9118-0681", "linkedin": ";cyrus-shahabi-9791256b/", "or_profile": "~Sepanta_Zeighami2;~Cyrus_Shahabi1", "aff": "University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nzeighami2024theoretical,\ntitle={Theoretical Analysis of Learned Database Operations under Distribution Shift through Distribution Learnability},\nauthor={Sepanta Zeighami and Cyrus Shahabi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oowQ8LPA12}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 632694, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10777659045357101474&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "usc.edu;usc.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A New Robust Partial p-Wasserstein-Based Metric for Comparing Distributions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33067", "id": "opieUcKjPa", "proceeding": "https://proceedings.mlr.press/v235/raghvendra24a.html", "pdf": "https://openreview.net/pdf?id=opieUcKjPa", "openreview": "https://openreview.net/forum?id=opieUcKjPa", "author_site": "Sharath Raghvendra, Pouyan Shirzadian, Kaiyi Zhang", "tldr": "", "abstract": "The $2$-Wasserstein distance is sensitive to minor geometric differences between distributions, making it a very powerful dissimilarity metric. However, due to this sensitivity, a small outlier mass can also cause a significant increase in the $2$-Wasserstein distance between two similar distributions. Similarly, sampling discrepancy can cause the empirical $2$-Wasserstein distance on $n$ samples in $\\mathbb{R}^2$ to converge to the true distance at a rate of $n^{-1/4}$, which is significantly slower than the rate of $n^{-1/2}$ for $1$-Wasserstein distance.
We introduce a new family of distances parameterized by $k \\ge 0$, called $k$-RPW that is based on computing the partial $2$-Wasserstein distance. We show that (1) $k$-RPW satisfies the metric properties, (2) $k$-RPW is robust to small outlier mass while retaining the sensitivity of $2$-Wasserstein distance to minor geometric differences, and (3) when $k$ is a constant, $k$-RPW distance between empirical distributions on $n$ samples in $\\mathbb{R}^2$ converges to the true distance at a rate of $n^{-1/3}$, which is faster than the convergence rate of $n^{-1/4}$ for the $2$-Wasserstein distance. Using the partial $p$-Wasserstein distance, we extend our distance to any $p \\in [1,\\infty]$. By setting parameters $k$ or $p$ appropriately, we can reduce our distance to the total variation, $p$-Wasserstein, and the L\u00e9vy-Prokhorov distances. Experiments show that our distance function achieves higher accuracy in comparison to the $1$-Wasserstein, $2$-Wasserstein, and TV distances for image retrieval tasks on noisy real-world data sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sharath Raghvendra;Pouyan Shirzadian;Kaiyi Zhang", "authorids": "~Sharath_Raghvendra1;~Pouyan_Shirzadian1;~Kaiyi_Zhang2", "gender": "M;M;M", "homepage": "http://people.cs.vt.edu/~sharathr/;https://sites.google.com/vt.edu/pshirzadian/home;https://kaiyiz.github.io/", "dblp": "149/2582;322/7785;254/0055-4", "google_scholar": "https://scholar.google.com.tw/citations?user=kOfRa7MAAAAJ;https://scholar.google.com/citations?hl=en;n-Hg5SwAAAAJ", "orcid": ";0000-0001-8315-2357;", "linkedin": ";;", "or_profile": "~Sharath_Raghvendra1;~Pouyan_Shirzadian1;~Kaiyi_Zhang2", "aff": "North Carolina State University;Virginia Polytechnic Institute and State University;Virginia Polytechnic Institute and State University", "aff_domain": "csc.ncsu.edu;vt.edu;vt.edu", "position": "Associate Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nraghvendra2024a,\ntitle={A New Robust Partial p-Wasserstein-Based Metric for Comparing Distributions},\nauthor={Sharath Raghvendra and Pouyan Shirzadian and Kaiyi Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=opieUcKjPa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1413630, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3100624133097574107&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "csc.ncsu.edu;vt.edu;vt.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "North Carolina State University;Virginia Tech", "aff_unique_dep": ";", "aff_unique_url": "https://www.ncsu.edu;https://www.vt.edu", "aff_unique_abbr": "NCSU;VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Algorithm and Hardness for Dynamic Attention Maintenance in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33066", "id": "opkluZm9gX", "proceeding": "https://proceedings.mlr.press/v235/van-den-brand24a.html", "pdf": "https://openreview.net/pdf?id=opkluZm9gX", "openreview": "https://openreview.net/forum?id=opkluZm9gX", "author_site": "Jan van den Brand, Zhao Song, Tianyi Zhou", "tldr": "", "abstract": "The attention scheme is one of the key components over all the LLMs, such as BERT, GPT-1, 
Transformers, GPT-2, 3, 3.5 and 4. Inspired by the previous theoretical study of the static version of the attention multiplication problem [Zandieh, Han, Daliri, and Karbasi ICML 2023, Alman and Song NeurIPS 2023], we formally define a dynamic version of the attention matrix multiplication problem. In each iteration we update one entry in the key matrix $K \\in \\mathbb{R}^{n \\times d}$ or the value matrix $V \\in \\mathbb{R}^{n \\times d}$. In the query stage, we receive $(i,j) \\in [n] \\times [d]$ as input, and want to answer $(D^{-1} A V)_{i,j}$, where $A:=\\exp(QK^\\top) \\in \\mathbb{R}^{n \\times n}$ is a square matrix, $D := \\mathrm{diag}(A {\\bf 1}_n) \\in \\mathbb{R}^{n \\times n}$ is a diagonal matrix, and ${\\bf 1}_n$ denotes a length-$n$ vector whose entries are all ones. We provide two results: an algorithm and a conditional lower bound. Inspired by the lazy update idea from [Demetrescu and Italiano FOCS 2000, Sankowski FOCS 2004, Cohen, Lee and Song STOC 2019, Brand SODA 2020], we provide a data structure that uses $O(n^{\\omega(1,1,\\tau)-\\tau})$ amortized update time and $O(n^{1+\\tau})$ worst-case query time, where $n^{\\omega(1,1,\\tau)}$ denotes $\\mathrm{T}_{\\mathrm{mat}}(n,n,n^{\\tau})$ (the time to multiply an $n \\times n$ matrix by an $n \\times n^{\\tau}$ matrix) with matrix multiplication exponent $\\omega$, and $\\tau$ denotes a constant in $(0,1]$. We also show that unless the hinted matrix vector multiplication conjecture [Brand, Nanongkai and Saranurak FOCS 2019] is false, there is no algorithm that can achieve both $O(n^{\\omega(1,1,\\tau)-\\tau-\\Omega(1)})$ amortized update time and $O(n^{1+\\tau-\\Omega(1)})$ worst-case query time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan van den Brand;Zhao Song;Tianyi Zhou", "authorids": "~Jan_van_den_Brand1;~Zhao_Song3;~Tianyi_Zhou4", "gender": ";M;", "homepage": "https://jvdbrand.com;https://www.youtube.com/@zhaosong2031;", "dblp": "196/3762;76/4051-2;", "google_scholar": "56fOepEAAAAJ;yDZct7UAAAAJ;", "orcid": "0000-0001-8611-6896;;", "linkedin": ";;", "or_profile": "~Jan_van_den_Brand1;~Zhao_Song3;~Tianyi_Zhou4", "aff": "Georgia Institute of Technology;Adobe;", "aff_domain": "gatech.edu;adobe.com;", "position": "Assistant Professor;Researcher;", "bibtex": "@inproceedings{\nbrand2024algorithm,\ntitle={Algorithm and Hardness for Dynamic Attention Maintenance in Large Language Models},\nauthor={Jan van den Brand and Zhao Song and Tianyi Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=opkluZm9gX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 441891, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7195906318516545252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "gatech.edu;adobe.com;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Georgia Institute of Technology;Adobe", "aff_unique_dep": ";Adobe Inc.", "aff_unique_url": "https://www.gatech.edu;https://www.adobe.com", "aff_unique_abbr": "Georgia Tech;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "InterpreTabNet: Distilling Predictive Signals from Tabular Data by Salient Feature Interpretation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33065", "id": "or8BQ4ohGb", "proceeding": "https://proceedings.mlr.press/v235/si24a.html", "pdf":
"https://openreview.net/pdf?id=or8BQ4ohGb", "openreview": "https://openreview.net/forum?id=or8BQ4ohGb", "author_site": "Jacob Si, Wendy Yusi Cheng, Michael Cooper, Rahul G. Krishnan", "tldr": "", "abstract": "Tabular data are omnipresent in various sectors of industries. Neural networks for tabular data such as TabNet have been proposed to make predictions while leveraging the attention mechanism for interpretability. However, the inferred attention masks are often dense, making it challenging to come up with rationales about the predictive signal. To remedy this, we propose InterpreTabNet, a variant of the TabNet model that models the attention mechanism as a latent variable sampled from a Gumbel-Softmax distribution. This enables us to regularize the model to learn distinct concepts in the attention masks via a KL Divergence regularizer. It prevents overlapping feature selection by promoting sparsity which maximizes the model's efficacy and improves interpretability to determine the important features when predicting the outcome. To assist in the interpretation of feature interdependencies from our model, we employ a large language model (GPT-4) and use prompt engineering to map from the learned feature mask onto natural language text describing the learned signal. Through comprehensive experiments on real-world datasets, we demonstrate that InterpreTabNet outperforms previous methods for interpreting tabular data while attaining competitive accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jacob Yoke Hong Si;Wendy Yusi Cheng;Michael Cooper;Rahul Krishnan", "authorids": "~Jacob_Yoke_Hong_Si1;~Wendy_Yusi_Cheng1;~Michael_Cooper2;~Rahul_G_Krishnan1", "gender": "M;F;;M", "homepage": "https://jacobyhsi.github.io/;;https://michaeljohncooper.com;http://www.cs.toronto.edu/~rahulgk/index.html", "dblp": ";;;172/0880", "google_scholar": "-SJHaFcAAAAJ;;https://scholar.google.ca/citations?user=hfNx8qUAAAAJ;ilJgXHkAAAAJ", "orcid": ";;;", "linkedin": ";wendy-yusi-cheng-6153b9204/;;rahulgk/", "or_profile": "~Jacob_Yoke_Hong_Si1;~Wendy_Yusi_Cheng1;~Michael_Cooper2;~Rahul_G_Krishnan1", "aff": "Imperial College London;University of Toronto;University of Toronto;Department of Computer Science, University of Toronto", "aff_domain": "imperial.ac.uk;utoronto.ca;toronto.edu;cs.toronto.edu", "position": "PhD student;Undergrad student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsi2024interpretabnet,\ntitle={InterpreTabNet: Distilling Predictive Signals from Tabular Data by Salient Feature Interpretation},\nauthor={Jacob Yoke Hong Si and Wendy Yusi Cheng and Michael Cooper and Rahul Krishnan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=or8BQ4ohGb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2589401, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12085568612784452838&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "imperial.ac.uk;utoronto.ca;toronto.edu;cs.toronto.edu", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Imperial College London;University of Toronto", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://www.utoronto.ca", "aff_unique_abbr": "ICL;U of T", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United 
Kingdom;Canada" }, { "title": "On the Origins of Linear Representations in Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33064", "id": "otuTw4Mghk", "proceeding": "https://proceedings.mlr.press/v235/jiang24d.html", "pdf": "https://openreview.net/pdf?id=otuTw4Mghk", "openreview": "https://openreview.net/forum?id=otuTw4Mghk", "author_site": "Yibo Jiang, Goutham Rajendran, Pradeep Ravikumar, Bryon Aragam, Victor Veitch", "tldr": "", "abstract": "An array of recent works have argued that high-level semantic concepts are encoded \"linearly\" in the representation space of large language models. In this work, we study the origins of such linear representations. To that end, we introduce a latent variable model to abstract and formalize the concept dynamics of the next token prediction. We use this formalism to prove that linearity arises as a consequence of the loss function and the implicit bias of gradient descent. The theory is further substantiated empirically via experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yibo Jiang;Goutham Rajendran;Pradeep Kumar Ravikumar;Bryon Aragam;Victor Veitch", "authorids": "~Yibo_Jiang2;~Goutham_Rajendran1;~Pradeep_Kumar_Ravikumar1;~Bryon_Aragam1;~Victor_Veitch1", "gender": "M;M;M;;", "homepage": ";https://gouthamrdn.github.io/;http://www.cs.cmu.edu/~pradeepr/;http://bryonaragam.com/;http://victorveitch.com", "dblp": "54/2193;274/1323;94/3594;140/7564;167/5650", "google_scholar": "hvQo2gQAAAAJ;YVrGTe8AAAAJ;https://scholar.google.com.tw/citations?user=Q4DTPw4AAAAJ;u-W3_9QAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yibo_Jiang2;~Goutham_Rajendran1;~Pradeep_Kumar_Ravikumar1;~Bryon_Aragam1;~Victor_Veitch1", "aff": "University of Chicago;Carnegie Mellon University;Carnegie Mellon University;Booth School of Business;Google", "aff_domain": "uchicago.edu;cmu.edu;cmu.edu;chicagobooth.edu;google.com", "position": "PhD student;Postdoc;Full Professor;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\njiang2024on,\ntitle={On the Origins of Linear Representations in Large Language Models},\nauthor={Yibo Jiang and Goutham Rajendran and Pradeep Kumar Ravikumar and Bryon Aragam and Victor Veitch},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=otuTw4Mghk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 980346, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17396367709506786352&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uchicago.edu;cmu.edu;cmu.edu;chicagobooth.edu;google.com", "author_num": 5, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "University of Chicago;Carnegie Mellon University;University of Chicago Booth School of Business;Google", "aff_unique_dep": ";;Booth School of Business;Google", "aff_unique_url": "https://www.uchicago.edu;https://www.cmu.edu;https://www.chicagobooth.edu;https://www.google.com", "aff_unique_abbr": "UChicago;CMU;Booth;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bridging Model Heterogeneity in Federated Learning via Uncertainty-based Asymmetrical Reciprocity Learning", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33063", "id": "p0MGN0LSnx", "proceeding": "https://proceedings.mlr.press/v235/wang24cs.html", "pdf": "https://openreview.net/pdf?id=p0MGN0LSnx", "openreview": "https://openreview.net/forum?id=p0MGN0LSnx", "author_site": "Jiaqi Wang, Chenxu Zhao, Lingjuan Lyu, Quanzeng You, Mengdi Huai, Fenglong Ma", "tldr": "", "abstract": "This paper presents FedType, a simple yet pioneering framework designed to fill research gaps in heterogeneous model aggregation within federated learning (FL). FedType introduces small identical proxy models for clients, serving as agents for information exchange, ensuring model security, and achieving efficient communication simultaneously. To transfer knowledge between large private and small proxy models on clients, we propose a novel uncertainty-based asymmetrical reciprocity learning method, eliminating the need for any public data. Comprehensive experiments conducted on benchmark datasets demonstrate the efficacy and generalization ability of FedType across diverse settings. Our approach redefines federated learning paradigms by bridging model heterogeneity, eliminating reliance on public data, prioritizing client privacy, and reducing communication costs (The codes are available in the supplementation materials).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaqi Wang;Chenxu Zhao;Lingjuan Lyu;Quanzeng You;Mengdi Huai;Fenglong Ma", "authorids": "~Jiaqi_Wang4;~Chenxu_Zhao4;~Lingjuan_Lyu1;~Quanzeng_You3;~Mengdi_Huai1;~Fenglong_Ma1", "gender": ";;F;M;F;M", "homepage": ";;https://sites.google.com/view/lingjuan-lyu;https://qzyou.github.io/;https://mdhuai.github.io/;https://fenglong-ma.github.io/", "dblp": ";;178/9876;33/9972.html;150/8482;85/10856", "google_scholar": ";;;c5KJsIgAAAAJ;40ZYTzEAAAAJ;DLJIxNMAAAAJ", "orcid": ";;;0000-0003-3608-0607;0000-0001-6368-5973;0000-0002-4999-0303", "linkedin": ";;;quanzeng-you-5b98a55a/;;fenglong-ma-69805832/", "or_profile": "~Jiaqi_Wang4;~Chenxu_Zhao4;~Lingjuan_Lyu1;~Quanzeng_You3;~Mengdi_Huai1;~Fenglong_Ma1", "aff": ";;Sony;ByteDance;Iowa State University;Pennsylvania State University", "aff_domain": ";;sony.com;bytedance.com;iastate.edu;psu.edu", "position": ";;scientist;Researcher;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024bridging,\ntitle={Bridging Model Heterogeneity in Federated Learning via Uncertainty-based Asymmetrical Reciprocity Learning},\nauthor={Jiaqi Wang and Chenxu Zhao and Lingjuan Lyu and Quanzeng You and Mengdi Huai and Fenglong Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p0MGN0LSnx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1923620, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15050220170056731939&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": ";;sony.com;bytedance.com;iastate.edu;psu.edu", "author_num": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Sony Corporation;ByteDance;Iowa State University;Pennsylvania State University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sony.com;https://www.bytedance.com;https://www.iastate.edu;https://www.psu.edu", "aff_unique_abbr": "Sony;ByteDance;ISU;PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "Japan;China;United States" }, { "title": "MEMORYLLM: 
Towards Self-Updatable Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33062", "id": "p0lKWzdikQ", "proceeding": "https://proceedings.mlr.press/v235/wang24s.html", "pdf": "https://openreview.net/pdf?id=p0lKWzdikQ", "openreview": "https://openreview.net/forum?id=p0lKWzdikQ", "author_site": "Yu Wang, Yifan Gao, Xiusi Chen, Haoming Jiang, Shiyang Li, Jingfeng Yang, Qingyu Yin, Zheng Li, Xian Li, Bing Yin, Jingbo Shang, Julian McAuley", "tldr": "", "abstract": "Existing Large Language Models (LLMs) usually remain static after deployment, which might make it hard to inject new knowledge into the model. We aim to build models containing a considerable portion of self-updatable parameters, enabling the model to integrate new knowledge effectively and efficiently. To this end, we introduce MEMORYLLM, a model that comprises a transformer and a fixed-size memory pool within the latent space of the transformer. MEMORYLLM can self-update with text knowledge and memorize the knowledge injected earlier. Our evaluations demonstrate the ability of MEMORYLLM to effectively incorporate new knowledge, as evidenced by its performance on model editing benchmarks. Meanwhile, the model exhibits long-term information retention capacity, which is validated through our custom-designed evaluations and long-context benchmarks. MEMORYLLM also shows operational integrity without any sign of performance degradation even after nearly a million memory updates. Our code and model are open-sourced at https://github.com/wangyu-ustc/MemoryLLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Wang;Yifan Gao;Xiusi Chen;Haoming Jiang;Shiyang Li;Jingfeng Yang;Qingyu Yin;Zheng Li;Xian Li;Bing Yin;Jingbo Shang;Julian McAuley", "authorids": "~Yu_Wang24;~Yifan_Gao1;~Xiusi_Chen1;~Haoming_Jiang1;~Shiyang_Li1;~Jingfeng_Yang2;~Qingyu_Yin2;~Zheng_Li9;~Xian_Li3;~Bing_Yin1;~Jingbo_Shang2;~Julian_McAuley1", "gender": "M;Not Specified;M;M;M;M;F;M;M;M;M;M", "homepage": "https://wangyu-ustc.github.io/;http://yifan-gao.github.io;https://xiusic.github.io/;https://hmjianggatech.github.io;https://jingfengyang.github.io/;;https://scholar.google.com/citations?user=6-Xx0IoAAAAJ&hl=en;;https://shangjingbo1226.github.io/;http://cseweb.ucsd.edu/~jmcauley/;;https://hsqmlzno1.github.io/", "dblp": ";79/3190-1;210/1049;230/3684;;179/2542;;;151/3145.html;29/3483;;10/1143-18", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=erdMFJwAAAAJ;JqGAil4AAAAJ;XaFhuG8AAAAJ;hysBvrwAAAAJ;P-mBKNYAAAAJ;6-Xx0IoAAAAJ;qSOxydEAAAAJ;0SkFI4MAAAAJ;icbo4M0AAAAJ;4zli0KkAAAAJ;https://scholar.google.com.hk/citations?user=P6fwn4AAAAAJ", "orcid": ";;0000-0002-9713-8000;;;;;0000-0002-5890-0031;;0000-0003-0955-7588;;", "linkedin": ";yi-fan-gao/;xiusi-chen-53180583/;;jingfeng-yang-797864172/;;xianl/;bingyin;;;;", "or_profile": "~Yu_Wang24;~Yifan_Gao1;~Xiusi_Chen1;~Haoming_Jiang1;~Jingfeng_Yang2;~Qingyu_Yin2;~Xian_Li3;~Bing_Yin1;~Jingbo_Shang2;~Julian_McAuley1;~SHIYANG_LI2;~zheng_li4", "aff": "University of California, San Diego;Amazon;University of California, Los Angeles;Amazon;Amazon;Amazon;Amazon;Amazon;University of California, San Diego;University of California, San Diego, University of California, San Diego;Amazon;Amazon", "aff_domain": "ucsd.edu;amazon.com;ucla.edu;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;ucsd.edu;eng.ucsd.edu;amazon.com;amazon.com", "position": "PhD student;Researcher;PhD student;Principal 
Researcher;Researcher;Researcher;Applied Scientist;Senior Science Manager;Assistant Professor;Full Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nwang2024memoryllm,\ntitle={{MEMORYLLM}: Towards Self-Updatable Large Language Models},\nauthor={Yu Wang and Yifan Gao and Xiusi Chen and Haoming Jiang and Shiyang Li and Jingfeng Yang and Qingyu Yin and Zheng Li and Xian Li and Bing Yin and Jingbo Shang and Julian McAuley},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p0lKWzdikQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1418983, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16903164166476613544&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ucsd.edu;amazon.com;ucla.edu;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;ucsd.edu;eng.ucsd.edu;amazon.com;amazon.com", "author_num": 12, "aff_unique_index": "0;1;2;1;1;1;1;1;0;0;1;1", "aff_unique_norm": "University of California, San Diego;Amazon;University of California, Los Angeles", "aff_unique_dep": ";Amazon.com, Inc.;", "aff_unique_url": "https://www.ucsd.edu;https://www.amazon.com;https://www.ucla.edu", "aff_unique_abbr": "UCSD;Amazon;UCLA", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "San Diego;;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Nesting Particle Filters for Experimental Design in Dynamical Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33061", "id": "p1kDNFs62o", "proceeding": "https://proceedings.mlr.press/v235/iqbal24a.html", "pdf": "https://openreview.net/pdf?id=p1kDNFs62o", "openreview": "https://openreview.net/forum?id=p1kDNFs62o", "author_site": "Sahel Iqbal, Adrien Corenflos, Simo S\u00e4rkk\u00e4, Hany Abdulsamad", "tldr": "", "abstract": "In this paper, we propose a novel approach to Bayesian experimental design for non-exchangeable data that formulates it as risk-sensitive policy optimization. We develop the Inside-Out SMC$^2$ algorithm, a nested sequential Monte Carlo technique to infer optimal designs, and embed it into a particle Markov chain Monte Carlo framework to perform gradient-based policy amortization. Our approach is distinct from other amortized experimental design techniques, as it does not rely on contrastive estimators. 
Numerical validation on a set of dynamical systems showcases the efficacy of our method in comparison to other state-of-the-art strategies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sahel Iqbal;Adrien Corenflos;Simo S\u00e4rkk\u00e4;Hany Abdulsamad", "authorids": "~Sahel_Iqbal1;~Adrien_Corenflos1;~Simo_S\u00e4rkk\u00e41;~Hany_Abdulsamad1", "gender": "M;M;M;", "homepage": "https://sahel13.github.io/;https://adriencorenflos.github.io/;https://users.aalto.fi/~ssarkka/;", "dblp": ";284/8438;38/4897;173/6249", "google_scholar": ";https://scholar.google.co.uk/citations?user=sJJ7FKgAAAAJ;;https://scholar.google.de/citations?hl=de", "orcid": "0000-0001-5464-3632;0000-0002-8374-4659;;", "linkedin": ";;;", "or_profile": "~Sahel_Iqbal1;~Adrien_Corenflos1;~Simo_S\u00e4rkk\u00e41;~Hany_Abdulsamad1", "aff": "Aalto University;Aalto University;Aalto University;Aalto University", "aff_domain": "aalto.fi;aalto.fi;aalto.fi;aalto.fi", "position": "PhD student;PhD student;Full Professor;Postdoc", "bibtex": "@inproceedings{\niqbal2024nesting,\ntitle={Nesting Particle Filters for Experimental Design in Dynamical Systems},\nauthor={Sahel Iqbal and Adrien Corenflos and Simo S{\\\"a}rkk{\\\"a} and Hany Abdulsamad},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p1kDNFs62o}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 438875, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=690867299274140802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "email": "aalto.fi;aalto.fi;aalto.fi;aalto.fi", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Aalto University", "aff_unique_dep": "", "aff_unique_url": "https://www.aalto.fi", "aff_unique_abbr": "Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Finland" }, { "title": "PRISE: LLM-Style Sequence Compression for Learning Temporal Action Abstractions in Control", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33060", "id": "p225Od0aYt", "proceeding": "https://proceedings.mlr.press/v235/zheng24b.html", "pdf": "https://openreview.net/pdf?id=p225Od0aYt", "openreview": "https://openreview.net/forum?id=p225Od0aYt", "author_site": "Ruijie Zheng, Ching-An Cheng, Hal Daum\u00e9, Furong Huang, Andrey Kolobov", "tldr": "", "abstract": "Temporal action abstractions, along with belief state representations, are a powerful knowledge sharing mechanism for sequential decision making. In this work, we propose a novel view that treats inducing temporal action abstractions as a sequence compression problem. To do so, we bring a subtle but critical component of LLM training pipelines -- input tokenization via byte pair encoding (BPE) -- to bear on the seemingly distant task of learning skills of variable time span in continuous control domains. We introduce an approach called Primitive Sequence Encoding (PRISE) that combines continuous action quantization with BPE to learn powerful action abstractions. 
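To make the tokenization step concrete, here is a minimal sketch of the BPE merge loop applied to sequences of already-quantized action codes; the toy demonstrations and merge count are assumptions for illustration, not the PRISE implementation.

```python
# A minimal sketch of the BPE step that PRISE applies to quantized actions:
# the most frequent adjacent code pair is merged repeatedly, yielding
# variable-length "skill" tokens. Simplified from LLM tokenizer pipelines.
from collections import Counter

def bpe_merges(corpus, num_merges):
    """corpus: list of action-code sequences (lists of hashable tokens)."""
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for seq in corpus:
            pairs.update(zip(seq, seq[1:]))
        if not pairs:
            break
        (a, b), _ = pairs.most_common(1)[0]
        merges.append((a, b))
        new_corpus = []
        for seq in corpus:
            out, i = [], 0
            while i < len(seq):
                if i + 1 < len(seq) and (seq[i], seq[i + 1]) == (a, b):
                    out.append((a, b))        # merged pair becomes a new token
                    i += 2
                else:
                    out.append(seq[i])
                    i += 1
            new_corpus.append(out)
        corpus = new_corpus
    return merges, corpus

demos = [[0, 1, 0, 1, 2], [0, 1, 2, 0, 1]]    # toy quantized demonstrations
print(bpe_merges(demos, 2))
```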
We empirically show that high-level skills discovered by PRISE from a multitask set of robotic manipulation demonstrations significantly boost the learning performance of behavior cloning on downstream tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruijie Zheng;Ching-An Cheng;Hal Daum\u00e9 III;Furong Huang;Andrey Kolobov", "authorids": "~Ruijie_Zheng1;~Ching-An_Cheng1;~Hal_Daum\u00e9_III1;~Furong_Huang1;~Andrey_Kolobov1", "gender": ";M;M;F;M", "homepage": "http://www.ruijiezheng.com;http://www.chinganc.com;http://hal3.name;https://furong-huang.com;https://www.microsoft.com/en-us/research/people/akolobov/", "dblp": "294/8474;123/6369;77/2856.html;72/8513;95/3462", "google_scholar": ";bMZFLZ_V4goC;PbEw81gAAAAJ;13yyuCcAAAAJ;xEWgxBsAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ruijie_Zheng1;~Ching-An_Cheng1;~Hal_Daum\u00e9_III1;~Furong_Huang1;~Andrey_Kolobov1", "aff": "University of Maryland, College Park;Microsoft Research;Microsoft;University of Maryland;Microsoft", "aff_domain": "cs.umd.edu;microsoft.com;microsoft.com;cs.umd.edu;microsoft.com", "position": "PhD student;Principal Researcher;Senior Principal Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nzheng2024prise,\ntitle={{PRISE}: {LLM}-Style Sequence Compression for Learning Temporal Action Abstractions in Control},\nauthor={Ruijie Zheng and Ching-An Cheng and Hal Daum{\\'e} III and Furong Huang and Andrey Kolobov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p225Od0aYt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4671295, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11550802334298192842&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.umd.edu;microsoft.com;microsoft.com;cs.umd.edu;microsoft.com", "author_num": 5, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "University of Maryland;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.umd.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UMD;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Prospective Side Information for Latent MDPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33059", "id": "p5FIjG9fbs", "proceeding": "https://proceedings.mlr.press/v235/kwon24a.html", "pdf": "https://openreview.net/pdf?id=p5FIjG9fbs", "openreview": "https://openreview.net/forum?id=p5FIjG9fbs", "author_site": "Jeongyeol Kwon, Yonathan Efroni, Shie Mannor, Constantine Caramanis", "tldr": "", "abstract": "In many interactive decision-making problems, there is contextual side information that remains fixed within the course of an interaction. This problem has been studied quite extensively under the assumption that the context is fully observed, as well as in the opposing limit when the context is unobserved, a special type of POMDP also referred to as a Latent MDP (LMDP). In this work, we consider a class of decision problems that interpolates between these settings, namely between the case where the context is fully observed and the case where it is unobserved. We refer to this class of decision problems as *LMDPs with prospective side information*. 
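As a concrete, entirely hypothetical illustration of this interaction protocol (all quantities below are assumptions, not from the paper):

```python
# Toy illustration of prospective side information: at the start of each
# episode a latent context m is drawn, and the agent observes a weakly
# revealing signal o whose distribution only slightly depends on m.
import numpy as np

rng = np.random.default_rng(1)
M = 2                                    # number of latent contexts
alpha = 0.1                              # how revealing the side signal is
emission = np.full((M, M), 1.0 / M) + alpha * (np.eye(M) - 1.0 / M)

def run_episode(policy, horizon=5):
    m = rng.integers(M)                  # latent context, fixed for episode
    o = rng.choice(M, p=emission[m])     # prospective side information
    total = 0.0
    for _ in range(horizon):
        a = policy(o)                    # agent conditions on o, never on m
        total += float(a == m)           # context-dependent reward
    return total

print(run_episode(lambda o: o))          # trust the (noisy) hint
```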
In such an environment, an agent receives additional, weakly revealing information on the latent context at the beginning of each episode. We show that, surprisingly, this problem is not captured by contemporary POMDP settings and is not solved by RL algorithms designed for partially observed environments. We then establish that any sample-efficient algorithm must suffer at least $\\Omega(K^{2/3})$-regret, as opposed to standard $\\Omega(\\sqrt{K})$ lower bounds. We design an algorithm with a matching upper bound that depends only polynomially on the problem parameters. This establishes an exponential improvement in sample complexity relative to the existing LMDP lower bound of prior work, in which prospective information is not given.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeongyeol Kwon;Yonathan Efroni;Shie Mannor;Constantine Caramanis", "authorids": "~Jeongyeol_Kwon1;~Yonathan_Efroni2;~Shie_Mannor2;~Constantine_Caramanis1", "gender": "M;M;M;M", "homepage": "https://kwonchungli.github.io/;https://sites.google.com/view/yonathan-efroni/;https://shie.net.technion.ac.il;http://users.ece.utexas.edu/~cmcaram/constantine_caramanis/Home.html", "dblp": "https://dblp.uni-trier.de/pid/228/9224;215/3475;20/1669;96/5760", "google_scholar": "cnyMCYMAAAAJ;pfTInEgAAAAJ;https://scholar.google.com.tw/citations?user=q1HlbIUAAAAJ;47YTUrEAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jeongyeol_Kwon1;~Yonathan_Efroni2;~Shie_Mannor2;~Constantine_Caramanis1", "aff": "University of Wisconsin - Madison;Meta;Technion - Israel Institute of Technology, Technion;University of Texas, Austin", "aff_domain": "wisc.edu;meta.com;technion.il;utexas.edu", "position": "Postdoc;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nkwon2024prospective,\ntitle={Prospective Side Information for Latent {MDP}s},\nauthor={Jeongyeol Kwon and Yonathan Efroni and Shie Mannor and Constantine Caramanis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p5FIjG9fbs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 563081, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9148788914383547133&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "wisc.edu;meta.com;technion.il;utexas.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Wisconsin-Madison;Meta;Technion - Israel Institute of Technology;University of Texas at Austin", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.wisc.edu;https://meta.com;https://www.technion.ac.il;https://www.utexas.edu", "aff_unique_abbr": "UW-Madison;Meta;Technion;UT Austin", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Madison;;Austin", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Israel" }, { "title": "Inexact Newton-type Methods for Optimisation with Nonnegativity Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33058", "id": "p7gpooFIr3", "proceeding": "https://proceedings.mlr.press/v235/smee24a.html", "pdf": "https://openreview.net/pdf?id=p7gpooFIr3", "openreview": "https://openreview.net/forum?id=p7gpooFIr3", "author_site": "Oscar Smee, Fred Roosta", "tldr": "", "abstract": "We consider solving large scale nonconvex optimisation problems with nonnegativity constraints. 
Such problems arise frequently in machine learning, such as nonnegative least-squares, nonnegative matrix factorisation, as well as problems with sparsity-inducing regularisation. In such settings, first-order methods, despite their simplicity, can be prohibitively slow on ill-conditioned problems or become trapped near saddle regions, while most second-order alternatives involve non-trivially challenging subproblems. The two-metric projection framework, initially proposed by Bertsekas (1982), alleviates these issues and achieves the best of both worlds by combining projected gradient steps at the boundary of the feasible region with Newton steps in the interior in such a way that feasibility can be maintained by simple projection onto the nonnegative orthant. We develop extensions of the two-metric projection framework, which by inexactly solving the subproblems as well as employing non-positive curvature directions, are suitable for large scale and nonconvex settings. We obtain state-of-the-art convergence rates for various classes of non-convex problems and demonstrate competitive practical performance on a variety of problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Oscar Smee;Fred Roosta", "authorids": "~Oscar_Smee1;~Fred_Roosta1", "gender": "M;M", "homepage": ";https://people.smp.uq.edu.au/FredRoosta/", "dblp": "384/4179;133/8630", "google_scholar": "https://scholar.google.com.au/citations?user=MH7UbTAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0007-2419-2042;", "linkedin": ";", "or_profile": "~Oscar_Smee1;~Fred_Roosta1", "aff": "University of Queensland;University of Queensland", "aff_domain": "uq.edu.au;uq.edu.au", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nsmee2024inexact,\ntitle={Inexact Newton-type Methods for Optimisation with Nonnegativity Constraints},\nauthor={Oscar Smee and Fred Roosta},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p7gpooFIr3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 993685, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11894667699871865062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uq.edu.au;uq.edu.au", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Queensland", "aff_unique_dep": "", "aff_unique_url": "https://www.uq.edu.au", "aff_unique_abbr": "UQ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "Generalizing Orthogonalization for Models with Non-Linearities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33057", "id": "p9SMltcfsu", "proceeding": "https://proceedings.mlr.press/v235/rugamer24a.html", "pdf": "https://openreview.net/pdf?id=p9SMltcfsu", "openreview": "https://openreview.net/forum?id=p9SMltcfsu", "author_site": "David R\u00fcgamer, Chris Kolb, Tobias Weber, Lucas Kook, Thomas Nagler", "tldr": "", "abstract": "The complexity of black-box algorithms can lead to various challenges, including the introduction of biases. These biases present immediate risks in the algorithms\u2019 application. It was, for instance, shown that neural networks can deduce racial information solely from a patient's X-ray scan, a task beyond the capability of medical experts. 
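As an aside on the two-metric projection framework described in the Smee & Roosta abstract above, a minimal sketch of one exact (not inexact) step on a toy nonnegative least-squares problem; the active-set threshold and stepsize are illustrative assumptions.

```python
# One two-metric projection step, sketched: coordinates near the boundary
# that the gradient pushes toward zero take a projected gradient step (first
# metric: identity); the remaining "free" coordinates take a Newton step
# (second metric: Hessian); projection onto the orthant restores feasibility.
import numpy as np

def two_metric_step(x, grad, hess, eta=1.0, eps=1e-6):
    g, H = grad(x), hess(x)
    active = (x <= eps) & (g > 0)             # would move against the bound
    x_new = x.copy()
    x_new[active] -= eta * g[active]          # gradient step on active set
    free = ~active
    if free.any():
        Hf = H[np.ix_(free, free)]
        x_new[free] -= np.linalg.solve(Hf, g[free])   # Newton step on free set
    return np.maximum(x_new, 0.0)             # cheap projection keeps feasibility

# toy nonnegative least squares: min_x 0.5*||Ax - b||^2  s.t. x >= 0
rng = np.random.default_rng(2)
A, b = rng.normal(size=(20, 5)), rng.normal(size=20)
grad = lambda x: A.T @ (A @ x - b)
hess = lambda x: A.T @ A
x = np.ones(5)
for _ in range(20):
    x = two_metric_step(x, grad, hess)
print(x)
```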
If this fact is not known to the medical expert, automatic decision-making based on this algorithm could lead to prescribing a treatment (purely) based on racial information. While current methodologies allow for the \"orthogonalization\" or \"normalization\" of neural networks with respect to such information, existing approaches are grounded in linear models. Our paper advances the discourse by introducing corrections for non-linearities such as ReLU activations. Our approach also encompasses scalar and tensor-valued predictions, facilitating its integration into neural network architectures. Through extensive experiments, we validate our method's effectiveness in safeguarding sensitive data in generalized linear models, normalizing convolutional neural networks for metadata, and rectifying pre-existing embeddings for undesired attributes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David R\u00fcgamer;Chris Kolb;Tobias Weber;Lucas Kook;Thomas Nagler", "authorids": "~David_R\u00fcgamer1;~Chris_Kolb1;~Tobias_Weber1;~Lucas_Kook1;~Thomas_Nagler1", "gender": "M;;M;;M", "homepage": "https://davidruegamer.github.io/;;https://www.slds.stat.uni-muenchen.de/people/weber/;;http://www.tnagler.com", "dblp": "220/5560;;;;22/8947", "google_scholar": "https://scholar.google.de/citations?user=_DYguksAAAAJ;;;;tZR1rZoAAAAJ", "orcid": ";;;;0000-0003-1855-0046", "linkedin": ";;https://de.linkedin.com/in/tobias-weber-ba22b7153;;", "or_profile": "~David_R\u00fcgamer1;~Chris_Kolb1;~Tobias_Weber1;~Lucas_Kook1;~Thomas_Nagler1", "aff": "LMU Munich;;Department for Statistics;;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", "aff_domain": "lmu.de;;stat.uni-muenchen.de;;lmu.de", "position": "Associate Professor;;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nr{\\\"u}gamer2024generalizing,\ntitle={Generalizing Orthogonalization for Models with Non-Linearities},\nauthor={David R{\\\"u}gamer and Chris Kolb and Tobias Weber and Lucas Kook and Thomas Nagler},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p9SMltcfsu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 800400, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11531309864949628823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "lmu.de;;stat.uni-muenchen.de;;lmu.de", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Ludwig Maximilian University of Munich;Statistics Department;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", "aff_unique_dep": ";Department for Statistics;", "aff_unique_url": "https://www.lmu.de;;https://www.lmu.de", "aff_unique_abbr": "LMU;;LMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Munich;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany;" }, { "title": "RL-CFR: Improving Action Abstraction for Imperfect Information Extensive-Form Games with Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33056", "id": "pA2Q5Wfspp", "proceeding": "https://proceedings.mlr.press/v235/li24t.html", "pdf": "https://openreview.net/pdf?id=pA2Q5Wfspp", "openreview": "https://openreview.net/forum?id=pA2Q5Wfspp", "author_site": "Boning Li, Zhixuan Fang, Longbo Huang", "tldr": "", "abstract": "Effective action abstraction is crucial in tackling challenges associated with large action spaces in Imperfect 
Information Extensive-Form Games (IIEFGs). However, due to the vast state space and computational complexity in IIEFGs, existing methods often rely on fixed abstractions, resulting in sub-optimal performance. In response, we introduce RL-CFR, a novel reinforcement learning (RL) approach for dynamic action abstraction. RL-CFR builds upon our innovative Markov Decision Process (MDP) formulation, with states corresponding to public information and actions represented as feature vectors indicating specific action abstractions. The reward is defined as the expected payoff difference between the selected and default action abstractions. RL-CFR constructs a game tree with RL-guided action abstractions and utilizes counterfactual regret minimization (CFR) for strategy derivation. Impressively, it can be trained from scratch, achieving higher expected payoff without increased CFR solving time. In experiments on Heads-up No-limit Texas Hold'em, RL-CFR outperforms ReBeL's replication and Slumbot, demonstrating significant win-rate margins of $64\\pm 11$ and $84\\pm 17$ mbb/hand, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boning Li;Zhixuan Fang;Longbo Huang", "authorids": "~Boning_Li3;~Zhixuan_Fang1;~Longbo_Huang2", "gender": "M;M;M", "homepage": "https://lbn187.github.io/;https://people.iiis.tsinghua.edu.cn/~fang/;http://people.iiis.tsinghua.edu.cn/~huang/", "dblp": ";179/2243;79/7077", "google_scholar": ";0N4s3CAAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Boning_Li3;~Zhixuan_Fang1;~Longbo_Huang2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nli2024rlcfr,\ntitle={{RL}-{CFR}: Improving Action Abstraction for Imperfect Information Extensive-Form Games with Reinforcement Learning},\nauthor={Boning Li and Zhixuan Fang and Longbo Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pA2Q5Wfspp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1019177, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8049893838864524045&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Theory of Consistency Diffusion Models: Distribution Estimation Meets Fast Sampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33055", "id": "pAPykbqUHf", "proceeding": "https://proceedings.mlr.press/v235/dou24a.html", "pdf": "https://openreview.net/pdf?id=pAPykbqUHf", "openreview": "https://openreview.net/forum?id=pAPykbqUHf", "author_site": "Zehao Dou, Minshuo Chen, Mengdi Wang, Zhuoran Yang", "tldr": "", "abstract": "Diffusion models have revolutionized various application domains, including computer vision and audio generation. Despite the state-of-the-art performance, diffusion models are known for their slow sample generation due to the extensive number of steps involved. 
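Returning briefly to the RL-CFR entry above: a minimal sketch of regret matching, the per-information-set update underlying CFR (RL-CFR proposes the action abstraction; CFR still derives the strategy with updates of this form). The fixed toy utilities are an assumption for illustration, not the RL-CFR training loop.

```python
# Regret matching: play each action in proportion to its positive cumulative
# regret; the time-averaged strategy converges in self-play settings.
import numpy as np

class RegretMatcher:
    def __init__(self, n_actions):
        self.regret = np.zeros(n_actions)
        self.strategy_sum = np.zeros(n_actions)

    def strategy(self):
        pos = np.maximum(self.regret, 0.0)
        s = pos / pos.sum() if pos.sum() > 0 else np.full(len(pos), 1.0 / len(pos))
        self.strategy_sum += s
        return s

    def update(self, action_utils):
        # regret of not having played each action vs. current strategy value
        s = self.strategy()
        self.regret += action_utils - action_utils @ s

    def average_strategy(self):
        return self.strategy_sum / self.strategy_sum.sum()

rm = RegretMatcher(3)
for _ in range(1000):
    rm.update(np.array([1.0, 0.0, -1.0]))     # toy fixed utilities
print(rm.average_strategy())                  # mass concentrates on action 0
```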
In response, consistency models have been developed to merge multiple steps in the sampling process, thereby significantly boosting the speed of sample generation without compromising quality. This paper contributes towards the first statistical theory for consistency models, formulating their training as a distribution discrepancy minimization problem. Our analysis yields statistical estimation rates based on the Wasserstein distance for consistency models, matching those of vanilla diffusion models. Additionally, our results encompass the training of consistency models through both distillation and isolation methods, demystifying their underlying advantage.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zehao Dou;Minshuo Chen;Mengdi Wang;Zhuoran Yang", "authorids": "~Zehao_Dou2;~Minshuo_Chen1;~Mengdi_Wang1;~Zhuoran_Yang1", "gender": "M;M;F;M", "homepage": "https://zehaodou-official.github.io;https://minshuochen.github.io;http://mwang.princeton.edu;https://zhuoranyang.github.io/", "dblp": "224/5549.html;217/1509;;", "google_scholar": "CypbdCkAAAAJ;qU9WvTgAAAAJ;;", "orcid": ";;;", "linkedin": "zehao-dou-870b4b133/;;;", "or_profile": "~Zehao_Dou2;~Minshuo_Chen1;~Mengdi_Wang1;~Zhuoran_Yang1", "aff": "Yale University;Princeton University;Princeton University;Yale University", "aff_domain": "yale.edu;princeton.edu;princeton.edu;yale.edu", "position": "PhD student;Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\ndou2024theory,\ntitle={Theory of Consistency Diffusion Models: Distribution Estimation Meets Fast Sampling},\nauthor={Zehao Dou and Minshuo Chen and Mengdi Wang and Zhuoran Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pAPykbqUHf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1038583, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3682879077516723485&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "yale.edu;princeton.edu;princeton.edu;yale.edu", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Yale University;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.princeton.edu", "aff_unique_abbr": "Yale;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Combinatorial Multivariant Multi-Armed Bandits with Applications to Episodic Reinforcement Learning and Beyond", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33054", "id": "pAdI75JG3G", "proceeding": "https://proceedings.mlr.press/v235/liu24bp.html", "pdf": "https://openreview.net/pdf?id=pAdI75JG3G", "openreview": "https://openreview.net/forum?id=pAdI75JG3G", "author_site": "Xutong Liu, Siwei Wang, Jinhang Zuo, Han Zhong, Xuchuang Wang, Zhiyong Wang, Shuai Li, Mohammad Hajiesmaili, John C.S. Lui, Wei Chen", "tldr": "", "abstract": "We introduce a novel framework of combinatorial multi-armed bandits (CMAB) with multivariant and probabilistically triggering arms (CMAB-MT), where the outcome of each arm is a $d$-dimensional multivariant random variable and the feedback follows a general arm triggering process. 
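For orientation on the baseline that CMAB-MT generalizes, a minimal sketch of classic CUCB for scalar CMAB with Bernoulli base arms and a top-$K$ oracle; the arm means and exploration constant are illustrative assumptions, and this is not the paper's CUCB-MT.

```python
# Classic CUCB sketch: keep a UCB index per base arm and let a combinatorial
# oracle (here: top-K by index) pick the super-arm each round.
import numpy as np

rng = np.random.default_rng(8)
mu = np.array([0.2, 0.5, 0.7, 0.4])          # unknown base-arm means
K = 2                                        # super-arm = K base arms
counts, sums = np.zeros(4), np.zeros(4)

for t in range(1, 2001):
    ucb = np.where(
        counts > 0,
        sums / np.maximum(counts, 1)
        + np.sqrt(1.5 * np.log(t) / np.maximum(counts, 1)),
        np.inf)                              # force initial exploration
    super_arm = np.argsort(ucb)[-K:]         # oracle: top-K by UCB index
    for i in super_arm:
        reward = float(rng.random() < mu[i])     # Bernoulli feedback
        counts[i] += 1
        sums[i] += reward
print(sums / counts)                          # estimates approach mu
```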
Compared with existing CMAB works, CMAB-MT not only enhances the modeling power but also allows improved results by leveraging distinct statistical properties for multivariant random variables. For CMAB-MT, we propose a general 1-norm multivariant and triggering probability-modulated smoothness condition, and an optimistic CUCB-MT algorithm built upon this condition. Our framework can include many important problems as applications, such as episodic reinforcement learning (RL) and probabilistic maximum coverage for goods distribution, all of which meet the above smoothness condition and achieve matching or improved regret bounds compared to existing works. Through our new framework, we build the first connection between the episodic RL and CMAB literature, by offering a new angle to solve the episodic RL through the lens of CMAB, which may encourage more interactions between these two important directions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xutong Liu;Siwei Wang;Jinhang Zuo;Han Zhong;Xuchuang Wang;Zhiyong Wang;Shuai Li;Mohammad Hajiesmaili;John C.S. Lui;Wei Chen", "authorids": "~Xutong_Liu1;~Siwei_Wang2;~Jinhang_Zuo1;~Han_Zhong1;~Xuchuang_Wang1;~Zhiyong_Wang9;~Shuai_Li3;~Mohammad_Hajiesmaili1;~John_C.S._Lui2;~Wei_Chen10", "gender": "M;M;M;;M;M;F;M;M;M", "homepage": "https://xutongliu.me/;https://www.microsoft.com/en-us/research/people/siweiwang/publications/;https://jhzuo.github.io;https://hanzhong-ml.github.io/;https://xuchuangw.com;https://zhiyongwangwzy.github.io/;http://shuaili8.github.io;https://groups.cs.umass.edu/hajiesmaili/;http://www.cse.cuhk.edu.hk/~cslui/Index.html;https://www.microsoft.com/en-us/research/people/weic/", "dblp": "70/3372-2;51/8279-2;179/8179;137/8096.html;319/5123;;57/2281-10;49/7911;l/JohnCSLui;c/WeiChen13", "google_scholar": "KNfY6BIAAAAJ;;W3YHD10AAAAJ;Bk5q_pAAAAAJ;QJ66dEcAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.hk/citations?user=kMZgQxcAAAAJ;XCGuYKIAAAAJ;https://scholar.google.com.tw/citations?user=7LVjQ7MAAAAJ;hlEPkxAAAAAJ", "orcid": "0000-0002-8628-5873;;0000-0002-9557-3551;;;;;;0000-0001-7466-0384;", "linkedin": ";;;;;zhiyong-wang-a44aaa1a3/;;;;", "or_profile": "~Xutong_Liu1;~Siwei_Wang2;~Jinhang_Zuo1;~Han_Zhong1;~Xuchuang_Wang1;~Zhiyong_Wang9;~Shuai_Li3;~Mohammad_Hajiesmaili1;~John_C.S._Lui2;~Wei_Chen10", "aff": "The Chinese University of Hong Kong;Microsoft;California Institute of Technology;Peking University;University of Massachusetts at Amherst;Department of Computer Science and Engineering, The Chinese University of Hong Kong;John Hopcroft Center, Shanghai Jiao Tong University;College of Information and Computer Science, University of Massachusetts, Amherst;The Chinese University of Hong Kong;Microsoft Research", "aff_domain": "cuhk.edu.hk;microsoft.com;caltech.edu;stu.pku.edu.cn;cs.umass.edu;cse.cuhk.edu.hk;sjtu.edu.cn;cics.umass.edu;cse.cuhk.edu.hk;microsoft.com", "position": "Postdoc;Researcher;Postdoc;PhD student;Postdoc;PhD student;Assistant Professor;Assistant Professor;Full Professor;Pricipal Researcher", "bibtex": "@inproceedings{\nliu2024combinatorial,\ntitle={Combinatorial Multivariant Multi-Armed Bandits with Applications to Episodic Reinforcement Learning and Beyond},\nauthor={Xutong Liu and Siwei Wang and Jinhang Zuo and Han Zhong and Xuchuang Wang and Zhiyong Wang and Shuai Li and Mohammad Hajiesmaili and John C.S. 
Lui and Wei Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pAdI75JG3G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 555258, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7502749176727257884&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "cuhk.edu.hk;microsoft.com;caltech.edu;stu.pku.edu.cn;cs.umass.edu;cse.cuhk.edu.hk;sjtu.edu.cn;cics.umass.edu;cse.cuhk.edu.hk;microsoft.com", "author_num": 10, "aff_unique_index": "0;1;2;3;4;0;5;4;0;1", "aff_unique_norm": "Chinese University of Hong Kong;Microsoft;California Institute of Technology;Peking University;University of Massachusetts Amherst;Shanghai Jiao Tong University", "aff_unique_dep": ";Microsoft Corporation;;;;John Hopcroft Center", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.microsoft.com;https://www.caltech.edu;http://www.pku.edu.cn;https://www.umass.edu;https://www.sjtu.edu.cn", "aff_unique_abbr": "CUHK;Microsoft;Caltech;Peking U;UMass Amherst;SJTU", "aff_campus_unique_index": "0;2;3;0;4;3;0", "aff_campus_unique": "Hong Kong SAR;;Pasadena;Amherst;Shanghai", "aff_country_unique_index": "0;1;1;0;1;0;0;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Stochastic Weakly Convex Optimization beyond Lipschitz Continuity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33053", "id": "pAyX8q1IIn", "proceeding": "https://proceedings.mlr.press/v235/gao24d.html", "pdf": "https://openreview.net/pdf?id=pAyX8q1IIn", "openreview": "https://openreview.net/forum?id=pAyX8q1IIn", "author_site": "Wenzhi Gao, Qi Deng", "tldr": "", "abstract": "This paper considers stochastic weakly convex optimization without the standard Lipschitz continuity assumption. Based on new adaptive regularization (stepsize) strategies, we show that a wide class of stochastic algorithms, including the stochastic subgradient method, preserve the $\\mathcal{O} ( 1 / \\sqrt{K})$ convergence rate with constant failure rate. Our analyses rest on rather weak assumptions: the Lipschitz parameter can be either bounded by a general growth function of $\\\\|x\\\\|$ or locally estimated through independent random samples. 
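A toy sketch in the spirit of the stepsize strategies just described (the paper's exact policies differ): stochastic subgradient descent whose stepsize shrinks with a growth function of $\\|x\\|$, so no global Lipschitz constant is needed; the robust-regression objective is chosen purely for illustration.

```python
# Stochastic subgradient descent with a growth-adapted stepsize on the
# weakly convex objective f(x) = mean_i |a_i^T x - b_i| (illustrative only).
import numpy as np

rng = np.random.default_rng(3)
A, b = rng.normal(size=(1000, 10)), rng.normal(size=1000)
K = 2000
x = np.zeros(10)
for k in range(K):
    i = rng.integers(len(b))
    g = np.sign(A[i] @ x - b[i]) * A[i]       # subgradient of |a_i^T x - b_i|
    eta = 1.0 / (np.sqrt(K) * (1.0 + np.linalg.norm(x)))  # growth-adapted step
    x -= eta * g
print(np.mean(np.abs(A @ x - b)))
```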
Numerical experiments demonstrate the efficiency and robustness of our proposed stepsize policies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenzhi Gao;Qi Deng", "authorids": "~Wenzhi_Gao1;~Qi_Deng1", "gender": "M;M", "homepage": "https://github.com/Gwzwpxz;http://sime.shufe.edu.cn/teacher/show/225", "dblp": ";", "google_scholar": "4lDkX_YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": ";", "or_profile": "~Wenzhi_Gao1;~Qi_Deng1", "aff": "Stanford University;Shanghai University of Finance and Economics", "aff_domain": "stanford.edu;sufe.edu.cn", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\ngao2024stochastic,\ntitle={Stochastic Weakly Convex Optimization beyond Lipschitz Continuity},\nauthor={Wenzhi Gao and Qi Deng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pAyX8q1IIn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 458901, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13680323978484752604&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stanford.edu;sufe.edu.cn", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Shanghai University of Finance and Economics", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;http://www.sufe.edu.cn", "aff_unique_abbr": "Stanford;SUFE", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "QORA: Zero-Shot Transfer via Interpretable Object-Relational Model Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33052", "id": "pAzDdYzEva", "proceeding": "https://proceedings.mlr.press/v235/stella24a.html", "pdf": "https://openreview.net/pdf?id=pAzDdYzEva", "openreview": "https://openreview.net/forum?id=pAzDdYzEva", "author_site": "Gabriel Stella, Dmitri Loguinov", "tldr": "", "abstract": "Although neural networks have demonstrated significant success in various reinforcement-learning tasks, even the highest-performing deep models often fail to generalize. As an alternative, object-oriented approaches offer a promising path towards better efficiency and generalization; however, they typically address narrow problem classes and require extensive domain knowledge. To overcome these limitations, we introduce *QORA*, an algorithm that constructs models expressive enough to solve a variety of domains, including those with stochastic transition functions, directly from a domain-agnostic object-based state representation. We also provide a novel benchmark suite to evaluate learners' generalization capabilities. In our test domains, QORA achieves 100% predictive accuracy using almost four orders of magnitude fewer observations than a neural-network baseline, demonstrates zero-shot transfer to modified environments, and adapts rapidly when applied to tasks involving previously unseen object interactions. 
Finally, we give examples of QORA's learned rules, showing them to be easily interpretable.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gabriel Stella;Dmitri Loguinov", "authorids": "~Gabriel_Stella1;~Dmitri_Loguinov1", "gender": "Not Specified;M", "homepage": "https://gabrielrstella.com/;http://irl.cs.tamu.edu/people/dmitri/", "dblp": ";", "google_scholar": ";JSidYKQAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Gabriel_Stella1;~Dmitri_Loguinov1", "aff": "Texas A&M University - College Station;Texas A&M University - College Station", "aff_domain": "tamu.edu;tamu.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nstella2024qora,\ntitle={{QORA}: Zero-Shot Transfer via Interpretable Object-Relational Model Learning},\nauthor={Gabriel Stella and Dmitri Loguinov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pAzDdYzEva}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 843122, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:obRtvGMg5a8J:scholar.google.com/&scioq=QORA:+Zero-Shot+Transfer+via+Interpretable+Object-Relational+Model+Learning&hl=en&as_sdt=0,44", "gs_version_total": 16, "email": "tamu.edu;tamu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Station", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "RAUCA: A Novel Physical Adversarial Attack on Vehicle Detectors via Robust and Accurate Camouflage Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33051", "id": "pBTLGM9uWx", "proceeding": "https://proceedings.mlr.press/v235/zhou24n.html", "pdf": "https://openreview.net/pdf?id=pBTLGM9uWx", "openreview": "https://openreview.net/forum?id=pBTLGM9uWx", "author_site": "Jiawei Zhou, Linye Lyu, Daojing He, YU LI", "tldr": "", "abstract": "Adversarial camouflage is a widely used physical attack against vehicle detectors for its superiority in multi-view attack performance. One promising approach involves using differentiable neural renderers to facilitate adversarial camouflage optimization through gradient back-propagation. However, existing methods often struggle to capture environmental characteristics during the rendering process or produce adversarial textures that can precisely map to the target vehicle, resulting in suboptimal attack performance. Moreover, these approaches neglect diverse weather conditions, reducing the efficacy of generated camouflage across varying weather scenarios. To tackle these challenges, we propose a robust and accurate camouflage generation method, namely RAUCA. The core of RAUCA is a novel neural rendering component, Neural Renderer Plus (NRP), which can accurately project vehicle textures and render images with environmental characteristics such as lighting and weather. In addition, we integrate a multi-weather dataset for camouflage generation, leveraging the NRP to enhance the attack robustness. 
Experimental results on six popular object detectors show that RAUCA consistently outperforms existing methods in both simulation and real-world settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiawei Zhou;Linye Lyu;Daojing He;YU LI", "authorids": "~Jiawei_Zhou4;~Linye_Lyu1;~Daojing_He1;~YU_LI10", "gender": ";M;M;Not Specified", "homepage": ";;http://faculty.hitsz.edu.cn/hedaojing;http://liyu.one", "dblp": ";371/4134;60/7270;34/2997-7", "google_scholar": ";qjHyCmwAAAAJ;;M0zhrM8AAAAJ", "orcid": ";;0000-0002-3820-8128;", "linkedin": ";;;", "or_profile": "~Jiawei_Zhou4;~Linye_Lyu1;~Daojing_He1;~YU_LI10", "aff": ";Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology (Shen Zhen)", "aff_domain": ";stu.hit.edu.cn;hit.edu.cn;hit.edu.cn", "position": ";PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhou2024rauca,\ntitle={{RAUCA}: A Novel Physical Adversarial Attack on Vehicle Detectors via Robust and Accurate Camouflage Generation},\nauthor={Jiawei Zhou and Linye Lyu and Daojing He and YU LI},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pBTLGM9uWx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2569133, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15756373377657479730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";stu.hit.edu.cn;hit.edu.cn;hit.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harbin Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hit.edu.cn/", "aff_unique_abbr": "HIT", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Harbin;Shenzhen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Revealing Vision-Language Integration in the Brain with Multimodal Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33050", "id": "pD9BTIDUoX", "proceeding": "https://proceedings.mlr.press/v235/subramaniam24a.html", "pdf": "https://openreview.net/pdf?id=pD9BTIDUoX", "openreview": "https://openreview.net/forum?id=pD9BTIDUoX", "author_site": "Vighnesh Subramaniam, Colin Conwell, Christopher Wang, Gabriel Kreiman, Boris Katz, Ignacio Cases, Andrei Barbu", "tldr": "", "abstract": "We use (multi)modal deep neural networks (DNNs) to probe for sites of multimodal integration in the human brain by predicting stereoencephalography (SEEG) recordings taken while human subjects watched movies. We operationalize sites of multimodal integration as regions where a multimodal vision-language model predicts recordings better than unimodal language, unimodal vision, or linearly-integrated language-vision models. Our target DNN models span different architectures (e.g., convolutional networks and transformers) and multimodal training techniques (e.g., cross-attention and contrastive learning). As a key enabling step, we first demonstrate that trained vision and language models systematically outperform their randomly initialized counterparts in their ability to predict SEEG signals. We then compare unimodal and multimodal models against one another. 
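A minimal sketch of this comparison logic on synthetic data (the paper regresses deep network features onto SEEG recordings; every quantity below is a stand-in): fit a ridge encoding model per feature set and call a site "multimodal" when the multimodal features predict held-out activity best.

```python
# Encoding-model comparison sketch: cross-validated R^2 of ridge regressions
# from unimodal vs. multimodal features to a (synthetic) neural signal.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(4)
T = 400
feats = {
    "vision": rng.normal(size=(T, 32)),
    "language": rng.normal(size=(T, 32)),
}
feats["multimodal"] = feats["vision"] * feats["language"]   # toy integration
site = feats["multimodal"] @ rng.normal(size=32) + 0.5 * rng.normal(size=T)

scores = {
    name: cross_val_score(Ridge(alpha=1.0), X, site, cv=5).mean()
    for name, X in feats.items()
}
print(scores)   # the multimodal features should score highest here
```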
Because our target DNN models often have different architectures, number of parameters, and training sets (possibly obscuring those differences attributable to integration), we carry out a controlled comparison of two models (SLIP and SimCLR), which keep all of these attributes the same aside from input modality. Using this approach, we identify a sizable number of neural sites (on average 141 out of 1090 total sites or 12.94%) and brain regions where multimodal integration seems to occur. Additionally, we find that among the variants of multimodal training techniques we assess, CLIP-style training is the best suited for downstream prediction of the neural activity in these sites.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vighnesh Subramaniam;Colin Conwell;Christopher Wang;Gabriel Kreiman;Boris Katz;Ignacio Cases;Andrei Barbu", "authorids": "~Vighnesh_Subramaniam1;~Colin_Conwell1;~Christopher_Wang1;~Gabriel_Kreiman1;~Boris_Katz1;~Ignacio_Cases2;~Andrei_Barbu3", "gender": ";;;M;M;Non-Binary;M", "homepage": "https://vsubramaniam851.github.io;;https://czlwang.github.io/;http://klab.tch.harvard.edu;http://people.csail.mit.edu/boris/boris.html;;https://0xab.com", "dblp": ";;;12/1367;k/BorisKatz;;58/8365", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;WxZ_6nsAAAAJ;FdNuUb8AAAAJ;9-TdgYMAAAAJ;t1rjgHgAAAAJ", "orcid": ";0000-0002-7754-1580;;0000-0003-3505-8475;;;", "linkedin": "vighnesh-subramaniam-34549717b/;;;kreiman/;;;andrei-barbu-1166131", "or_profile": "~Vighnesh_Subramaniam1;~Colin_Conwell1;~Christopher_Wang1;~Gabriel_Kreiman1;~Boris_Katz1;~Ignacio_Cases2;~Andrei_Barbu3", "aff": "Massachusetts Institute of Technology;Johns Hopkins University;Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science;Harvard Medical School;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;jhu.edu;csail.mit.edu;harvard.edu;mit.edu;mit.edu;mit.edu", "position": "MS student;Postdoc;PhD student;Full Professor;Principal Research Scientist;Postdoc;Researcher", "bibtex": "@inproceedings{\nsubramaniam2024revealing,\ntitle={Revealing Vision-Language Integration in the Brain with Multimodal Networks},\nauthor={Vighnesh Subramaniam and Colin Conwell and Christopher Wang and Gabriel Kreiman and Boris Katz and Ignacio Cases and Andrei Barbu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pD9BTIDUoX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6815724, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12071620120661180690&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 10, "email": "mit.edu;jhu.edu;csail.mit.edu;harvard.edu;mit.edu;mit.edu;mit.edu", "author_num": 7, "aff_unique_index": "0;1;0;2;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Johns Hopkins University;Harvard University", "aff_unique_dep": ";;Medical School", "aff_unique_url": "https://web.mit.edu;https://www.jhu.edu;https://hms.harvard.edu", "aff_unique_abbr": "MIT;JHU;HMS", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Cambridge;Boston", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "SF-DQN: Provable Knowledge Transfer using Successor Feature for Deep 
Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33049", "id": "pDoAjdrMf0", "proceeding": "https://proceedings.mlr.press/v235/zhang24q.html", "pdf": "https://openreview.net/pdf?id=pDoAjdrMf0", "openreview": "https://openreview.net/forum?id=pDoAjdrMf0", "author_site": "Shuai Zhang, Heshan Fernando, Miao Liu, Keerthiram Murugesan, Songtao Lu, Pin-Yu Chen, Tianyi Chen, Meng Wang", "tldr": "", "abstract": "This paper studies the transfer reinforcement learning (RL) problem where multiple RL problems have different reward functions but share the same underlying transition dynamics. In this setting, the Q-function of each RL problem (task) can be decomposed into a successor feature (SF) and a reward mapping: the former characterizes the transition dynamics, and the latter characterizes the task-specific reward function. This Q-function decomposition, coupled with a policy improvement operator known as generalized policy improvement (GPI), reduces the sample complexity of finding the optimal Q-function, and thus the SF & GPI framework exhibits promising empirical performance compared to traditional RL methods like Q-learning. However, its theoretical foundations remain largely unestablished, especially when learning the successor features using deep neural networks (SF-DQN). This paper studies provable knowledge transfer using SF-DQN in transfer RL problems. We establish the first convergence analysis with provable generalization guarantees for SF-DQN with GPI. The theory reveals that SF-DQN with GPI outperforms conventional RL approaches, such as deep Q-network, in terms of both faster convergence rate and better generalization. Numerical experiments on real and synthetic RL tasks support the superior performance of SF-DQN & GPI, aligning with our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuai Zhang;Heshan Devaka Fernando;Miao Liu;Keerthiram Murugesan;Songtao Lu;Pin-Yu Chen;Tianyi Chen;Meng Wang", "authorids": "~Shuai_Zhang6;~Heshan_Devaka_Fernando1;~Miao_Liu1;~Keerthiram_Murugesan1;~Songtao_Lu1;~Pin-Yu_Chen1;~Tianyi_Chen5;~Meng_Wang4", "gender": "M;M;M;M;M;M;M;F", "homepage": "https://inchs708.github.io/shuaizhang.github.io/index.html;https://heshandevaka.github.io/;https://sites.google.com/view/miaoliuhome;https://keerthi166.github.io;https://songtaogithub.github.io/;http://www.pinyuchen.com;https://chentianyi1991.github.io/;https://www.ecse.rpi.edu/~wang/index.html", "dblp": "71/208-15;;;178/2877;05/2887;39/8969;;93/6765-3", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;QOXlyxIAAAAJ;7QHvAEYAAAAJ;-698GEMAAAAJ;LRsjX7kAAAAJ;jxwlCUUAAAAJ;kFwvv38AAAAJ;", "orcid": "0000-0001-8280-6988;;;0000-0001-6847-522X;;0000-0003-1039-8369;;", "linkedin": ";;miao-liu-3273a32b;https://linkedin.com/in/keerthiram;;pin-yu-chen-940062a2;;", "or_profile": "~Shuai_Zhang6;~Heshan_Devaka_Fernando1;~Miao_Liu1;~Keerthiram_Murugesan1;~Songtao_Lu1;~Pin-Yu_Chen1;~Tianyi_Chen5;~Meng_Wang4", "aff": "New Jersey Institute of Technology;Rensselaer Polytechnic Institute;International Business Machines;International Business Machines;IBM Thomas J. 
Watson Research Center;International Business Machines;Rensselaer Polytechnic Institute;Rensselaer Polytechnic Institute", "aff_domain": "njit.edu;rpi.edu;ibm.com;ibm.com;ibm.com;ibm.com;rpi.edu;rpi.edu", "position": "Assistant Professor;PhD student;Research Staff Member;Researcher;Researcher;Principal Researcher;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2024sfdqn,\ntitle={{SF}-{DQN}: Provable Knowledge Transfer using Successor Feature for Deep Reinforcement Learning},\nauthor={Shuai Zhang and Heshan Devaka Fernando and Miao Liu and Keerthiram Murugesan and Songtao Lu and Pin-Yu Chen and Tianyi Chen and Meng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pDoAjdrMf0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 746156, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=312482323669842890&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "njit.edu;rpi.edu;ibm.com;ibm.com;ibm.com;ibm.com;rpi.edu;rpi.edu", "author_num": 8, "aff_unique_index": "0;1;2;2;3;2;1;1", "aff_unique_norm": "New Jersey Institute of Technology;Rensselaer Polytechnic Institute;International Business Machines Corporation;IBM", "aff_unique_dep": ";;;Research", "aff_unique_url": "https://www.njit.edu;https://www.rpi.edu;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "NJIT;RPI;IBM;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Better & Faster Large Language Models via Multi-token Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33048", "id": "pEWAcejiU2", "proceeding": "https://proceedings.mlr.press/v235/gloeckle24a.html", "pdf": "https://openreview.net/pdf?id=pEWAcejiU2", "openreview": "https://openreview.net/forum?id=pEWAcejiU2", "author_site": "Fabian Gloeckle, Badr Youbi Idrissi, Baptiste Roziere, David Lopez-Paz, Gabriel Synnaeve", "tldr": "", "abstract": "Large language models such as GPT and Llama are trained with a next-token prediction loss. In this work, we suggest that training language models to predict multiple future tokens at once results in higher sample efficiency. More specifically, at each position in the training corpus, we ask the model to predict the following $n$ tokens using $n$ independent output heads, operating on top of a shared model trunk. Considering multi-token prediction as an auxiliary training task, we measure improved downstream capabilities with no overhead in training time for both code and natural language models. The method is increasingly useful for larger model sizes, and keeps its appeal when training for multiple epochs. Gains are especially pronounced on generative benchmarks like coding, where our models consistently outperform strong baselines by several percentage points. Our 13B parameter model solves 12% more problems on HumanEval and 17% more on MBPP than comparable next-token models. Experiments on small algorithmic tasks demonstrate that multi-token prediction is favorable for the development of induction heads and algorithmic reasoning capabilities. 
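A minimal sketch of the multi-head objective described above: $n$ output heads on a shared trunk, head $k$ trained to predict the token $k$ positions ahead, with the losses summed. The GRU trunk and all sizes are stand-ins, not the paper's architecture.

```python
# Multi-token prediction loss: n independent heads over a shared trunk.
import torch
import torch.nn.functional as F

vocab, d_model, n_heads = 100, 64, 4
embed = torch.nn.Embedding(vocab, d_model)
trunk = torch.nn.GRU(d_model, d_model, batch_first=True)   # stand-in trunk
heads = torch.nn.ModuleList(torch.nn.Linear(d_model, vocab)
                            for _ in range(n_heads))

def multi_token_loss(tokens):                 # tokens: (batch, seq)
    h, _ = trunk(embed(tokens))               # shared representation
    loss = 0.0
    for k, head in enumerate(heads, start=1):
        logits = head(h[:, :-k])              # predict the token k steps ahead
        loss = loss + F.cross_entropy(
            logits.reshape(-1, vocab), tokens[:, k:].reshape(-1))
    return loss

tokens = torch.randint(0, vocab, (2, 32))
print(multi_token_loss(tokens))
```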
As an additional benefit, models trained with 4-token prediction are up to $3\\times$ faster at inference, even with large batch sizes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fabian Gloeckle;Badr Youbi Idrissi;Baptiste Roziere;David Lopez-Paz;Gabriel Synnaeve", "authorids": "~Fabian_Gloeckle1;~Badr_Youbi_Idrissi1;~Baptiste_Roziere1;~David_Lopez-Paz2;~Gabriel_Synnaeve1", "gender": "M;M;;;M", "homepage": "https://github.com/faabian;;;http://lopezpaz.org;", "dblp": "355/2184.html;265/5811.html;;74/10481;http://dblp.uni-trier.de/pers/hd/s/Synnaeve:Gabriel", "google_scholar": ";;CrSf2CQAAAAJ;;wN9rBkcAAAAJ", "orcid": ";;;;", "linkedin": ";badr-y-idrissi/;;;", "or_profile": "~Fabian_Gloeckle1;~Badr_Youbi_Idrissi1;~Baptiste_Roziere1;~David_Lopez-Paz2;~Gabriel_Synnaeve1", "aff": "Ecole Nationale des Ponts et Chausees;;Meta AI;Meta Facebook;Meta Facebook", "aff_domain": "enpc.fr;;fb.com;fb.com;fb.com", "position": "PhD student;;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\ngloeckle2024better,\ntitle={Better \\& Faster Large Language Models via Multi-token Prediction},\nauthor={Fabian Gloeckle and Badr Youbi Idrissi and Baptiste Roziere and David Lopez-Paz and Gabriel Synnaeve},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pEWAcejiU2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1586398, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=664686811610308106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "enpc.fr;;fb.com;fb.com;fb.com", "author_num": 5, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Ecole Nationale des Ponts et Chaussees;Meta", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://www.enpc.fr;https://meta.com", "aff_unique_abbr": "ENPC;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "France;United States" }, { "title": "O$n$ Learning Deep O($n$)-Equivariant Hyperspheres", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33047", "id": "pFWmHUdJE5", "proceeding": "https://proceedings.mlr.press/v235/melnyk24a.html", "pdf": "https://openreview.net/pdf?id=pFWmHUdJE5", "openreview": "https://openreview.net/forum?id=pFWmHUdJE5", "author_site": "Pavlo Melnyk, Michael Felsberg, M\u00e5rten Wadenb\u00e4ck, Andreas Robinson, Cuong Le", "tldr": "", "abstract": "In this paper, we utilize hyperspheres and regular $n$-simplexes and propose an approach to learning deep features equivariant under the transformations of $n$D reflections and rotations, encompassed by the powerful group of O$(n)$. Namely, we propose O$(n)$-equivariant neurons with spherical decision surfaces that generalize to any dimension $n$, which we call Deep Equivariant Hyperspheres. We demonstrate how to combine them in a network that directly operates on the basis of the input points and propose an invariant operator based on the relation between two points and a sphere, which as we show, turns out to be a Gram matrix. Using synthetic and real-world data in $n$D, we experimentally verify our theoretical contributions and find that our approach is superior to the competing methods for O$(n)$-equivariant benchmark datasets (classification and regression), demonstrating a favorable speed/performance trade-off. 
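A quick numerical check of the geometric fact such architectures exploit: the Gram matrix of a point set is unchanged by any O($n$) transformation, so features built from it are O($n$)-invariant.

```python
# Verify Gram-matrix invariance under a random orthogonal transformation.
import numpy as np

rng = np.random.default_rng(5)
n, m = 4, 6
X = rng.normal(size=(m, n))                   # m points in R^n
Q, _ = np.linalg.qr(rng.normal(size=(n, n)))  # random orthogonal matrix
assert np.allclose(X @ X.T, (X @ Q) @ (X @ Q).T)   # Gram matrix unchanged
print("Gram matrix is O(n)-invariant")
```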
The code is available on [GitHub](https://github.com/pavlo-melnyk/equivariant-hyperspheres).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pavlo Melnyk;Michael Felsberg;M\u00e5rten Wadenb\u00e4ck;Andreas Robinson;Cuong Le", "authorids": "~Pavlo_Melnyk1;~Michael_Felsberg2;~M\u00e5rten_Wadenb\u00e4ck1;~Andreas_Robinson1;~Cuong_Le1", "gender": "M;;M;M;M", "homepage": "https://pavlomelnyk.com;https://liu.se/en/employee/micfe03;https://liu.se/en/employee/marwa32;;", "dblp": "232/3322;00/78;132/2319;158/5786;", "google_scholar": "RhThiI8AAAAJ;https://scholar.google.se/citations?hl=en;6WRQpCQAAAAJ;https://scholar.google.se/citations?user=_4Mg38AAAAAJ;aGtNlKgAAAAJ", "orcid": "0000-0002-6091-861X;0000-0002-6096-3648;0000-0002-0675-2794;;", "linkedin": ";https://linkedin.com/in/michael-felsberg-668a202;;;cuong-le-8811ba16b/", "or_profile": "~Pavlo_Melnyk1;~Michael_Felsberg2;~M\u00e5rten_Wadenb\u00e4ck1;~Andreas_Robinson1;~Cuong_Le1", "aff": "Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University", "aff_domain": "liu.se;liu.se;liu.se;liu.se;liu.se", "position": "PhD student;Full Professor;Assistant Professor;Researcher;PhD student", "bibtex": "@inproceedings{\nmelnyk2024on,\ntitle={O\\$n\\$ Learning Deep O(\\$n\\$)-Equivariant Hyperspheres},\nauthor={Pavlo Melnyk and Michael Felsberg and M{\\r{a}}rten Wadenb{\\\"a}ck and Andreas Robinson and Cuong Le},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pFWmHUdJE5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 536500, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6118510587473492375&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": "liu.se;liu.se;liu.se;liu.se;liu.se", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Link\u00f6ping University", "aff_unique_dep": "", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Sweden" }, { "title": "Mechanistic Neural Networks for Scientific Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33046", "id": "pLtuwhoQh7", "proceeding": "https://proceedings.mlr.press/v235/pervez24a.html", "pdf": "https://openreview.net/pdf?id=pLtuwhoQh7", "openreview": "https://openreview.net/forum?id=pLtuwhoQh7", "author_site": "Adeel Pervez, Francesco Locatello, Efstratios Gavves", "tldr": "", "abstract": "This paper presents *Mechanistic Neural Networks*, a neural network design for machine learning applications in the sciences. It incorporates a new *Mechanistic Block* in standard architectures to explicitly learn governing differential equations as representations, revealing the underlying dynamics of data and enhancing interpretability and efficiency in data modeling. Central to our approach is a novel *Relaxed Linear Programming Solver* (NeuRLP) inspired by a technique that reduces solving linear ODEs to solving linear programs. This integrates well with neural networks and surpasses the limitations of traditional ODE solvers enabling scalable GPU parallel processing. 
Overall, Mechanistic Neural Networks demonstrate their versatility for scientific machine learning applications, adeptly managing tasks from equation discovery to dynamic systems modeling. We prove their comprehensive capabilities in analyzing and interpreting complex scientific data across various applications, showing significant performance against specialized state-of-the-art methods. Source code is available at https://github.com/alpz/mech-nn.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adeel Pervez;Francesco Locatello;Stratis Gavves", "authorids": "~Adeel_Pervez1;~Francesco_Locatello1;~Stratis_Gavves1", "gender": ";M;M", "homepage": ";https://twitter.com/FrancescoLocat8;https://www.egavves.com", "dblp": "225/4821;195/6074;03/8693", "google_scholar": ";;https://scholar.google.nl/citations?user=QqfCvsgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Adeel_Pervez1;~Francesco_Locatello1;~Efstratios_Gavves1", "aff": "University of Amsterdam;Institute of Science and Technology;University of Amsterdam", "aff_domain": "uva.nl;ist.ac.at;uva.nl", "position": "Postdoc;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\npervez2024mechanistic,\ntitle={Mechanistic Neural Networks for Scientific Machine Learning},\nauthor={Adeel Pervez and Francesco Locatello and Stratis Gavves},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pLtuwhoQh7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3008373, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11254124308389966139&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "uva.nl;ist.ac.at;uva.nl", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Amsterdam;Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;", "aff_unique_abbr": "UvA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands;" }, { "title": "Efficient Algorithms for Empirical Group Distributionally Robust Optimization and Beyond", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33045", "id": "pOJbk4Nzmi", "proceeding": "https://proceedings.mlr.press/v235/yu24a.html", "pdf": "https://openreview.net/pdf?id=pOJbk4Nzmi", "openreview": "https://openreview.net/forum?id=pOJbk4Nzmi", "author_site": "Dingzhi Yu, Yunuo Cai, Wei Jiang, Lijun Zhang", "tldr": "", "abstract": "In this paper, we investigate the empirical counterpart of Group Distributionally Robust Optimization (GDRO), which aims to minimize the maximal empirical risk across $m$ distinct groups. We formulate empirical GDRO as a *two-level* finite-sum convex-concave minimax optimization problem and develop an algorithm called ALEG to benefit from its special structure. ALEG is a double-looped stochastic primal-dual algorithm that incorporates variance reduction techniques into a modified mirror prox routine. To exploit the two-level finite-sum structure, we propose a simple group sampling strategy to construct the stochastic gradient with a smaller Lipschitz constant and then perform variance reduction for all groups. 
Theoretical analysis shows that ALEG achieves $\\varepsilon$-accuracy within a computation complexity of $\\mathcal{O}\\left(\\frac{m\\sqrt{\\bar{n}\\ln{m}}}{\\varepsilon}\\right)$, where $\\bar n$ is the average number of samples among $m$ groups. Notably, our approach outperforms the state-of-the-art method by a factor of $\\sqrt{m}$. Based on ALEG, we further develop a two-stage optimization algorithm called ALEM to deal with the empirical Minimax Excess Risk Optimization (MERO) problem. The computation complexity of ALEM nearly matches that of ALEG, surpassing the rates of existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dingzhi Yu;Yunuo Cai;Wei Jiang;Lijun Zhang", "authorids": "~Dingzhi_Yu1;~Yunuo_Cai1;~Wei_Jiang8;~Lijun_Zhang1", "gender": ";M;M;", "homepage": "https://www.lamda.nju.edu.cn/yudz/;https://github.com/Colektan;http://www.lamda.nju.edu.cn/jiangw/?AspxAutoDetectCookieSupport=1;", "dblp": "371/9829;;;", "google_scholar": "s7pIGmgAAAAJ;;;", "orcid": ";;;", "linkedin": "dingzhi-yu-28a221316/;;;", "or_profile": "~Dingzhi_Yu1;~Yunuo_Cai1;~Wei_Jiang8;~Lijun_Zhang1", "aff": "Fudan University;Fudan University;Nanjing University;", "aff_domain": "fudan.edu.cn;fudan.edu.cn;nju.edu.cn;", "position": "Undergrad student;Undergrad student;PhD student;", "bibtex": "@inproceedings{\nyu2024efficient,\ntitle={Efficient Algorithms for Empirical Group Distributionally Robust Optimization and Beyond},\nauthor={Dingzhi Yu and Yunuo Cai and Wei Jiang and Lijun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pOJbk4Nzmi}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 618097, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8867038177804966550&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "fudan.edu.cn;fudan.edu.cn;nju.edu.cn;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Fudan University;Nanjing University", "aff_unique_dep": ";", "aff_unique_url": "https://www.fudan.edu.cn;https://www.nju.edu.cn", "aff_unique_abbr": "Fudan;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "SILVER: Single-loop variance reduction and application to federated learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33044", "id": "pOgMluzEIH", "proceeding": "https://proceedings.mlr.press/v235/oko24a.html", "pdf": "https://openreview.net/pdf?id=pOgMluzEIH", "openreview": "https://openreview.net/forum?id=pOgMluzEIH", "author_site": "Kazusato Oko, Shunta Akiyama, Denny Wu, Tomoya Murata, Taiji Suzuki", "tldr": "", "abstract": "Most variance reduction methods require multiple times of full gradient computation, which is time-consuming and hence a bottleneck in application to distributed optimization. We present a single-loop variance-reduced gradient estimator named SILVER (SIngle-Loop VariancE-Reduction) for the finite-sum non-convex optimization, which does not require multiple full gradients but nevertheless achieves the optimal gradient complexity. Notably, unlike existing methods, SILVER provably reaches second-order optimality, with exponential convergence in the Polyak-\u0141ojasiewicz (PL) region, and achieves further speedup depending on the data heterogeneity. 
Owing to these advantages, SILVER serves as a new base method to design communication-efficient federated learning algorithms: we combine SILVER with local updates, which gives the best communication rounds and number of communicated gradients across the full range of Hessian heterogeneity, and, at the same time, guarantees second-order optimality and exponential convergence in the PL region.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kazusato Oko;Shunta Akiyama;Denny Wu;Tomoya Murata;Taiji Suzuki", "authorids": "~Kazusato_Oko1;~Shunta_Akiyama1;~Denny_Wu2;~Tomoya_Murata1;~Taiji_Suzuki1", "gender": "M;M;M;M;M", "homepage": ";https://shuntaak.github.io/;https://dennywu1.github.io/;;http://ibis.t.u-tokyo.ac.jp/suzuki/", "dblp": ";280/3821;;151/5035;08/312", "google_scholar": ";https://scholar.google.co.jp/citations?user=RlTfkjQAAAAJ;https://scholar.google.com/citations?hl=en;hH5pbMIAAAAJ;x8osrBsAAAAJ", "orcid": ";;;;", "linkedin": "kazusatooko/;;;;", "or_profile": "~Kazusato_Oko1;~Shunta_Akiyama1;~Denny_Wu2;~Tomoya_Murata1;~Taiji_Suzuki1", "aff": "The University of Tokyo;The University of Tokyo;New York University;The University of Tokyo;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;u-tokyo.ac.jp;nyu.edu;tokyo.ac.jp;tokyo.ac.jp", "position": "MS student;PhD student;Postdoc;PhD student;Associate Professor", "bibtex": "@inproceedings{\noko2024silver,\ntitle={{SILVER}: Single-loop variance reduction and application to federated learning},\nauthor={Kazusato Oko and Shunta Akiyama and Denny Wu and Tomoya Murata and Taiji Suzuki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pOgMluzEIH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1277823, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bRwjh563V1EJ:scholar.google.com/&scioq=SILVER:+Single-loop+variance+reduction+and+application+to+federated+learning&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "u-tokyo.ac.jp;u-tokyo.ac.jp;nyu.edu;tokyo.ac.jp;tokyo.ac.jp", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Tokyo;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.nyu.edu", "aff_unique_abbr": "UTokyo;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Japan;United States" }, { "title": "Near-Optimal Reinforcement Learning with Self-Play under Adaptivity Constraints", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33043", "id": "pPNMhdYMaz", "proceeding": "https://proceedings.mlr.press/v235/qiao24b.html", "pdf": "https://openreview.net/pdf?id=pPNMhdYMaz", "openreview": "https://openreview.net/forum?id=pPNMhdYMaz", "author_site": "Dan Qiao, Yu-Xiang Wang", "tldr": "", "abstract": "We study the problem of multi-agent reinforcement learning (MARL) with adaptivity constraints --- a new problem motivated by real-world applications where deployments of new policies are costly and the number of policy updates must be minimized. For two-player zero-sum Markov Games, we design a (policy) elimination based algorithm that achieves a regret of $\\widetilde{O}(\\sqrt{H^3 S^2 ABK})$, while the batch complexity is only $O(H+\\log\\log K)$. 
In the above, $S$ denotes the number of states, $A,B$ are the number of actions for the two players respectively, $H$ is the horizon and $K$ is the number of episodes. Furthermore, we prove a batch complexity lower bound $\\Omega(\\frac{H}{\\log_{A}K}+\\log\\log K)$ for all algorithms with $\\widetilde{O}(\\sqrt{K})$ regret bound, which matches our upper bound up to logarithmic factors. As a byproduct, our techniques naturally extend to learning bandit games and reward-free MARL within near optimal batch complexity. To the best of our knowledge, these are the first line of results towards understanding MARL with low adaptivity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Qiao;Yu-Xiang Wang", "authorids": "~Dan_Qiao1;~Yu-Xiang_Wang1", "gender": "M;", "homepage": ";http://www.cs.ucsb.edu/~yuxiangw/publications.html", "dblp": ";62/1637-3.html", "google_scholar": "EyfAUuUAAAAJ;HGNZ1fkAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Dan_Qiao1;~Yu-Xiang_Wang1", "aff": ", University of California, Santa Barbara;UC Santa Barbara", "aff_domain": "cs.ucsb.edu;ucsb.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nqiao2024nearoptimal,\ntitle={Near-Optimal Reinforcement Learning with Self-Play under Adaptivity Constraints},\nauthor={Dan Qiao and Yu-Xiang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pPNMhdYMaz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 460214, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4735454589297253525&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "cs.ucsb.edu;ucsb.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Class-Imbalanced Graph Learning without Class Rebalancing", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33042", "id": "pPnkpvBeZN", "proceeding": "https://proceedings.mlr.press/v235/liu24ay.html", "pdf": "https://openreview.net/pdf?id=pPnkpvBeZN", "openreview": "https://openreview.net/forum?id=pPnkpvBeZN", "author_site": "Zhining Liu, Ruizhong Qiu, Zhichen Zeng, Hyunsik Yoo, David Zhou, Zhe Xu, Yada Zhu, Kommy Weldemariam, Jingrui He, Hanghang Tong", "tldr": "", "abstract": "Class imbalance is prevalent in real-world node classification tasks and poses great challenges for graph learning models. Most existing studies are rooted in a class-rebalancing (CR) perspective and address class imbalance with class-wise reweighting or resampling. In this work, we approach the root cause of class-imbalance bias from a topological paradigm. Specifically, we theoretically reveal two **fundamental phenomena in the graph topology** that greatly exacerbate the predictive bias stemming from class imbalance. On this basis, we devise a lightweight topological augmentation framework BAT to mitigate the class-imbalance bias without class rebalancing. Being orthogonal to CR, BAT can function as an **efficient plug-and-play module** that can be seamlessly combined with and significantly boost existing CR techniques. 
Systematic experiments on real-world imbalanced graph learning tasks show that BAT can deliver up to 46.27% performance gain and up to 72.74% bias reduction over existing techniques. Code, examples, and documentation are available at https://github.com/ZhiningLiu1998/BAT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhining Liu;Ruizhong Qiu;Zhichen Zeng;Hyunsik Yoo;David Zhou;Zhe Xu;Yada Zhu;Kommy Weldemariam;Jingrui He;Hanghang Tong", "authorids": "~Zhining_Liu1;~Ruizhong_Qiu1;~Zhichen_Zeng1;~Hyunsik_Yoo1;~David_Zhou1;~Zhe_Xu5;~Yada_Zhu1;~Kommy_Weldemariam1;~Jingrui_He1;~Hanghang_Tong3", "gender": "M;M;;;;M;;;F;", "homepage": "https://zhiningliu.com/;https://q-rz.github.io/;https://zhichenz98.github.io/;https://sites.google.com/view/hsyoo;;https://pricexu.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=us-yzhu;https://research.ibm.com/people/kommy-weldemariam--1;https://www.hejingrui.org;http://tonghanghang.org", "dblp": "195/4399-2;330/9860;345/6632-1;202/6001;;97/3701-7;56/8808;61/5240.html;34/2685;58/1757", "google_scholar": "5WORAUQAAAAJ;REKarmcAAAAJ;rFdX368AAAAJ;8aPSNF0AAAAJ;;7IhVDFsAAAAJ;AJb408gAAAAJ;lHXu6nkAAAAJ;hXpZynkAAAAJ;RaINcuUAAAAJ", "orcid": "0000-0003-1828-2109;0009-0000-3253-8890;0000-0002-5534-3401;0000-0001-5253-5646;;0000-0002-6675-1398;0000-0002-3338-6371;;0000-0002-6429-6272;0000-0003-4405-3887", "linkedin": "zhiningliu/;ruizhong-qiu/;;hyunsik-yoo-534251194/;david-zo/;;yadazhu/;https://www.linkedin.com/feed/;;htong/", "or_profile": "~Zhining_Liu1;~Ruizhong_Qiu1;~Zhichen_Zeng1;~Hyunsik_Yoo1;~David_Zhou1;~Zhe_Xu5;~Yada_Zhu1;~Kommy_Weldemariam1;~Jingrui_He1;~Hanghang_Tong3", "aff": "University of Illinois, Urbana Champaign;University of Illinois Urbana-Champaign;University of Illinois Urbana-Champaign;University of Illinois, Urbana-Champaign;University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign;IBM Research;;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;illinois.edu;illinois.edu;cs.illinois.edu;illinois.edu;us.ibm.com;;illinois.edu;illinois.edu", "position": "PhD student;MS student;PhD student;PhD student;PhD student;PhD student;Principal Research Scientist;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024classimbalanced,\ntitle={Class-Imbalanced Graph Learning without Class Rebalancing},\nauthor={Zhining Liu and Ruizhong Qiu and Zhichen Zeng and Hyunsik Yoo and David Zhou and Zhe Xu and Yada Zhu and Kommy Weldemariam and Jingrui He and Hanghang Tong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pPnkpvBeZN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2903560, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10437734991011705142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "illinois.edu;illinois.edu;illinois.edu;illinois.edu;cs.illinois.edu;illinois.edu;us.ibm.com;;illinois.edu;illinois.edu", "author_num": 10, "aff_unique_index": "0;0;0;1;0;0;2;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Illinois;IBM", "aff_unique_dep": ";;IBM Research", "aff_unique_url": "https://illinois.edu;https://illinois.edu;https://www.ibm.com/research", "aff_unique_abbr": "UIUC;UIUC;IBM", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": 
"Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Split-Ensemble: Efficient OOD-aware Ensemble via Task and Model Splitting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33041", "id": "pQyoBWA146", "proceeding": "https://proceedings.mlr.press/v235/chen24aw.html", "pdf": "https://openreview.net/pdf?id=pQyoBWA146", "openreview": "https://openreview.net/forum?id=pQyoBWA146", "author_site": "Anthony Chen, Huanrui Yang, Yulu Gan, Denis Gudovskiy, Zhen Dong, Haofan Wang, Tomoyuki Okuno, Yohei Nakata, EECS Kurt Keutzer, Shanghang Zhang", "tldr": "", "abstract": "Uncertainty estimation is crucial for deep learning models to detect out-of-distribution (OOD) inputs. However, the naive deep learning classifiers produce uncalibrated uncertainty for OOD data. Improving the uncertainty estimation typically requires external data for OOD-aware training or considerable costs to build an ensemble. In this work, we improve on uncertainty estimation without extra OOD data or additional inference costs using an alternative *Split-Ensemble* method. Specifically, we propose a novel *subtask-splitting* ensemble training objective where a task is split into several complementary subtasks based on feature similarity. Each subtask considers part of the data as in distribution while all the rest as OOD data. Diverse submodels can therefore be trained on each subtask with OOD-aware objectives, learning generalizable uncertainty estimation. To avoid overheads, we enable low-level feature sharing among submodels, building a tree-like Split-Ensemble architecture via iterative splitting and pruning. Empirical study shows Split-Ensemble, without additional computational cost, improves accuracy over a single model by 0.8%, 1.8%, and 25.5% on CIFAR-10, CIFAR-100, and Tiny-ImageNet, respectively. 
OOD detection for the same backbone and in-distribution datasets surpasses a single model baseline by 2.2%, 8.1%, and 29.6% in mean AUROC, respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anthony Chen;Huanrui Yang;Yulu Gan;Denis A Gudovskiy;Zhen Dong;Haofan Wang;Tomoyuki Okuno;Yohei Nakata;Kurt Keutzer;Shanghang Zhang", "authorids": "~Anthony_Chen3;~Huanrui_Yang1;~Yulu_Gan1;~Denis_A_Gudovskiy1;~Zhen_Dong3;~Haofan_Wang1;~Tomoyuki_Okuno1;~Yohei_Nakata1;~Kurt_Keutzer1;~Shanghang_Zhang4", "gender": "M;M;M;M;M;M;M;M;M;F", "homepage": "https://atchen.com/;https://sites.google.com/view/huanrui-yang;https://yulugan.com/;https://gudovskiy.github.io/;https://dong-zhen.com/;https://haofanwang.github.io/;;;https://people.eecs.berkeley.edu/~keutzer/;https://www.shanghangzhang.com/", "dblp": ";221/2845;307/1107;136/4981;;234/7841.html;;27/8364.html;k/KurtKeutzer.html;95/11531", "google_scholar": ";bjNCUt8AAAAJ;https://scholar.google.com/citations?hl=en;03qjEm0AAAAJ;czxMUzcAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.jp/citations?user=E7BhgRsAAAAJ;MA5f-rYAAAAJ;ID9QePIAAAAJ;voqw10cAAAAJ", "orcid": ";;0009-0002-8541-2911;0000-0002-6829-6667;;;;0009-0006-9838-1367;0000-0003-3868-8501;", "linkedin": "anthony-chen-08873b254/;;yulu-g-31a626281/;gudovskiy;zhen-dong/;;;;kurtkeutzer/;", "or_profile": "~Anthony_Chen3;~Huanrui_Yang1;~Yulu_Gan1;~Denis_A_Gudovskiy1;~Zhen_Dong3;~Haofan_Wang1;~Tomoyuki_Okuno1;~Yohei_Nakata1;~Kurt_Keutzer1;~Shanghang_Zhang1", "aff": "Peking University;University of California, Berkeley;Peking University;Panasonic Corp;Nexusflow.ai Inc;Xiaohongshu;Panasonic Holdings Corporation;Panasonic;University of California, Berkeley;Peking University", "aff_domain": "pku.edu.cn;berkeley.edu;pku.edu.cn;panasonic.com;nexusflow.ai;xiaohongshu.com;panasonic.com;us.panasonic.com;berkeley.edu;pku.edu.cn", "position": "MS student;Postdoc;MS student;Senior Researcher;Principal Researcher;Researcher;Researcher;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024splitensemble,\ntitle={Split-Ensemble: Efficient {OOD}-aware Ensemble via Task and Model Splitting},\nauthor={Anthony Chen and Huanrui Yang and Yulu Gan and Denis A Gudovskiy and Zhen Dong and Haofan Wang and Tomoyuki Okuno and Yohei Nakata and Kurt Keutzer and Shanghang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pQyoBWA146}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1967482, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QSpKMtkYqfoJ:scholar.google.com/&scioq=Split-Ensemble:+Efficient+OOD-aware+Ensemble+via+Task+and+Model+Splitting&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "pku.edu.cn;berkeley.edu;pku.edu.cn;panasonic.com;nexusflow.ai;xiaohongshu.com;panasonic.com;us.panasonic.com;berkeley.edu;pku.edu.cn", "author_num": 10, "aff_unique_index": "0;1;0;2;3;4;5;2;1;0", "aff_unique_norm": "Peking University;University of California, Berkeley;Panasonic Corporation;Nexusflow.ai;Xiaohongshu;Panasonic Holdings Corporation", "aff_unique_dep": ";;;;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.berkeley.edu;https://www.panasonic.com;https://www.nexusflow.ai;https://www.xiaohongshu.com;https://www.panasonic.com/global", "aff_unique_abbr": "Peking U;UC Berkeley;Panasonic;Nexusflow.ai;XHS;PHC", "aff_campus_unique_index": "1;1", 
"aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;2;1;0;2;2;1;0", "aff_country_unique": "China;United States;Japan" }, { "title": "Subgoal-based Demonstration Learning for Formal Theorem Proving", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33040", "id": "pSnhA7Em1P", "proceeding": "https://proceedings.mlr.press/v235/zhao24h.html", "pdf": "https://openreview.net/pdf?id=pSnhA7Em1P", "openreview": "https://openreview.net/forum?id=pSnhA7Em1P", "author_site": "Xueliang Zhao, Wenda Li, Lingpeng Kong", "tldr": "", "abstract": "Large language models (LLMs) present a promising pathway for advancing the domain of formal theorem proving. In this paper, we aim to improve the performance of LLMs in formal theorem proving by thoroughly examining the structure and organization of demonstrative in-context examples. We introduce a subgoal-based demonstration learning framework, specifically designed to enhance the efficiency of proof search in LLMs. First, drawing upon the insights of subgoal learning from reinforcement learning and robotics, we propose the construction of distinct subgoals for each demonstration example and refine these subgoals in accordance with the pertinent theories of subgoal learning. Second, we build upon recent advances in diffusion models to predict the optimal organization, simultaneously addressing two intricate issues that persist within the domain of demonstration organization: subset selection and order determination. Our integration of subgoal-based learning has notably increased proof accuracy from 38.9% to 44.1% on the miniF2F benchmark. Furthermore, the adoption of diffusion models for demonstration organization can lead to an additional enhancement in accuracy to 45.5%, or a $5\\times$ improvement in sampling efficiency compared to previously established methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xueliang Zhao;Wenda Li;Lingpeng Kong", "authorids": "~Xueliang_Zhao1;~Wenda_Li1;~Lingpeng_Kong1", "gender": "M;M;M", "homepage": ";https://wenda302.github.io;https://ikekonglp.github.io/", "dblp": ";132/9868.html;144/7656", "google_scholar": "h-87C9cAAAAJ;ufYxQkEAAAAJ;f1hBi5wAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xueliang_Zhao1;~Wenda_Li1;~Lingpeng_Kong1", "aff": "The University of Hong Kong;University of Edinburgh;Department of Computer Science, The University of Hong Kong", "aff_domain": "cs.hku.hk;ed.ac.uk;cs.hku.hk", "position": "PhD student;Lecturer;Assistant Professor", "bibtex": "@inproceedings{\nzhao2024subgoalbased,\ntitle={Subgoal-based Demonstration Learning for Formal Theorem Proving},\nauthor={Xueliang Zhao and Wenda Li and Lingpeng Kong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pSnhA7Em1P}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2518820, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7507271506354852663&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 4, "email": "cs.hku.hk;ed.ac.uk;cs.hku.hk", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Hong Kong;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.ed.ac.uk", "aff_unique_abbr": "HKU;Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0", 
"aff_country_unique": "China;United Kingdom" }, { "title": "SelMatch: Effectively Scaling Up Dataset Distillation via Selection-Based Initialization and Partial Updates by Trajectory Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33039", "id": "pTFud6SetK", "proceeding": "https://proceedings.mlr.press/v235/lee24g.html", "pdf": "https://openreview.net/pdf?id=pTFud6SetK", "openreview": "https://openreview.net/forum?id=pTFud6SetK", "author_site": "Yongmin Lee, Hye Won Chung", "tldr": "", "abstract": "Dataset distillation aims to synthesize a small number of images per class (IPC) from a large dataset to approximate full dataset training with minimal performance loss. While effective in very small IPC ranges, many distillation methods become less effective, even underperforming random sample selection, as IPC increases. Our examination of state-of-the-art trajectory-matching based distillation methods across various IPC scales reveals that these methods struggle to incorporate the complex, rare features of harder samples into the synthetic dataset even with the increased IPC, resulting in a persistent coverage gap between easy and hard test samples. Motivated by such observations, we introduce SelMatch, a novel distillation method that effectively scales with IPC. SelMatch uses selection-based initialization and partial updates through trajectory matching to manage the synthetic dataset's desired difficulty level tailored to IPC scales. When tested on CIFAR-10/100 and TinyImageNet, SelMatch consistently outperforms leading selection-only and distillation-only methods across subset ratios from 5% to 30%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongmin Lee;Hye Won Chung", "authorids": "~Yongmin_Lee1;~Hye_Won_Chung2", "gender": ";F", "homepage": "https://github.com/Yongalls;https://iids.kaist.ac.kr/", "dblp": ";https://dblp.uni-trier.de/pers/hd/c/Chung:Hye_Won", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Yongmin_Lee1;~Hye_Won_Chung2", "aff": ";Korea Advanced Institute of Science & Technology", "aff_domain": ";kaist.ac.kr", "position": ";Associate Professor", "bibtex": "@inproceedings{\nlee2024selmatch,\ntitle={SelMatch: Effectively Scaling Up Dataset Distillation via Selection-Based Initialization and Partial Updates by Trajectory Matching},\nauthor={Yongmin Lee and Hye Won Chung},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pTFud6SetK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2698343, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12409203384167413941&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": ";kaist.ac.kr", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_country_unique_index": "0", "aff_country_unique": "South Korea" }, { "title": "Position: Understanding LLMs Requires More Than Statistical Generalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33038", "id": "pVyOchWUBa", "proceeding": "https://proceedings.mlr.press/v235/reizinger24a.html", "pdf": "https://openreview.net/pdf?id=pVyOchWUBa", "openreview": "https://openreview.net/forum?id=pVyOchWUBa", 
"author_site": "Patrik Reizinger, Szilvia Ujv\u00e1ry, Anna M\u00e9sz\u00e1ros, Anna Kerekes, Wieland Brendel, Ferenc Husz\u00e1r", "tldr": "", "abstract": "The last decade has seen blossoming research in deep learning theory attempting to answer, ``Why does deep learning generalize?\" A powerful shift in perspective precipitated this progress: the study of overparametrized models in the interpolation regime. In this paper, we argue that another perspective shift is due, since some of the desirable qualities of LLMs are not a consequence of good statistical generalization and require a separate theoretical explanation. Our core argument relies on the observation that AR probabilistic models are inherently non-identifiable: models zero or near-zero KL divergence apart---thus, equivalent test loss---can exhibit markedly different behaviors. We support our position with mathematical examples and empirical observations, illustrating why non-identifiability has practical relevance through three case studies: (1) the non-identifiability of zero-shot rule extrapolation; (2) the approximate non-identifiability of in-context learning; and (3) the non-identifiability of fine-tunability. We review promising research directions focusing on LLM-relevant generalization measures, transferability, and inductive biases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Patrik Reizinger;Szilvia Ujv\u00e1ry;Anna M\u00e9sz\u00e1ros;Anna Kerekes;Wieland Brendel;Ferenc Husz\u00e1r", "authorids": "~Patrik_Reizinger1;~Szilvia_Ujv\u00e1ry1;~Anna_M\u00e9sz\u00e1ros1;~Anna_Kerekes1;~Wieland_Brendel1;~Ferenc_Husz\u00e1r1", "gender": "M;F;;F;M;M", "homepage": "https://rpatrik96.github.io/;;;;;", "dblp": "249/5412;;;;37/11107;http://dblp.uni-trier.de/pers/hd/h/Huszar:Ferenc", "google_scholar": "zIT0fdIAAAAJ;;;;v-JL-hsAAAAJ;https://scholar.google.co.uk/citations?user=koQCVT4AAAAJ", "orcid": "0000-0001-9861-0293;;;;;", "linkedin": "patrik-reizinger/;szilviaujvary/;anna-m%C3%A9sz%C3%A1ros-2ba244222/;anna-kerekes-5a84651b8/;;", "or_profile": "~Patrik_Reizinger1;~Szilvia_Ujv\u00e1ry1;~Anna_M\u00e9sz\u00e1ros1;~Anna_Kerekes1;~Wieland_Brendel1;~Ferenc_Huszar1", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Cambridge;University of Cambridge;ETHZ - ETH Zurich;ELLIS Institute T\u00fcbingen;University of Cambridge", "aff_domain": "uni-tuebingen.de;cam.ac.uk;cam.ac.uk;ethz.ch;tue.ellis.eu;cam.ac.uk", "position": "PhD student;PhD student;PhD student;PhD student;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nreizinger2024position,\ntitle={Position: Understanding {LLM}s Requires More Than Statistical Generalization},\nauthor={Patrik Reizinger and Szilvia Ujv{\\'a}ry and Anna M{\\'e}sz{\\'a}ros and Anna Kerekes and Wieland Brendel and Ferenc Husz{\\'a}r},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pVyOchWUBa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 535224, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4002114302053423084&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "uni-tuebingen.de;cam.ac.uk;cam.ac.uk;ethz.ch;tue.ellis.eu;cam.ac.uk", "author_num": 6, "aff_unique_index": "0;1;1;2;3;1", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;University of Cambridge;ETH Zurich;ELLIS Institute", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.uni-tuebingen.de/;https://www.cam.ac.uk;https://www.ethz.ch;https://ellis.eu/", "aff_unique_abbr": "Uni T\u00fcbingen;Cambridge;ETHZ;ELLIS", "aff_campus_unique_index": "0;1;1;0;1", "aff_campus_unique": "T\u00fcbingen;Cambridge;", "aff_country_unique_index": "0;1;1;2;0;1", "aff_country_unique": "Germany;United Kingdom;Switzerland" }, { "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33037", "id": "pXaEYzrFae", "proceeding": "https://proceedings.mlr.press/v235/beurer-kellner24a.html", "pdf": "https://openreview.net/pdf?id=pXaEYzrFae", "openreview": "https://openreview.net/forum?id=pXaEYzrFae", "author_site": "Luca Beurer-Kellner, Marc Fischer, Martin Vechev", "tldr": "", "abstract": "To ensure that text generated by large language models (LLMs) is in an expected format, constrained decoding methods propose to enforce strict formal language constraints during generation. However, as we show in this work, not only do such methods often incur performance overhead during generation, but many of them also significantly impair task accuracy, if they do not correctly align the underlying LLM sub-word vocabularies with external constraints. To address this, we present a novel decoding algorithm, DOMINO, that can enforce constraints in a fully subword-aligned fashion, while leveraging pre-computation and speculative decoding to achieve virtually no overhead and in some cases even almost 2$\\times$ speedup over unconstrained decoding -- thereby outperforming existing approaches by a wide margin. We release DOMINO as open source at https://github.com/eth-sri/domino.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luca Beurer-Kellner;Marc Fischer;Martin Vechev", "authorids": "~Luca_Beurer-Kellner1;~Marc_Fischer1;~Martin_Vechev1", "gender": "M;M;M", "homepage": ";;https://www.sri.inf.ethz.ch/people/martin", "dblp": "314/2627;37/9373-2;93/2189.html", "google_scholar": "https://scholar.google.com/citations?hl=de;;https://scholar.google.ch/citations?user=aZ1Rh50AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Luca_Beurer-Kellner1;~Marc_Fischer1;~Martin_Vechev1", "aff": "ETHZ - ETH Zurich;Swiss Federal Institute of Technology;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nbeurer-kellner2024guiding,\ntitle={Guiding {LLM}s The Right Way: Fast, Non-Invasive Constrained Generation},\nauthor={Luca Beurer-Kellner and Marc Fischer and Martin Vechev},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pXaEYzrFae}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 404902, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7950284387618850180&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ethz.ch;ethz.ch;ethz.ch", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Open-Vocabulary Calibration for Fine-tuned CLIP", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33036", "id": "pY2UpspnBB", "proceeding": "https://proceedings.mlr.press/v235/wang24bw.html", "pdf": "https://openreview.net/pdf?id=pY2UpspnBB", "openreview": "https://openreview.net/forum?id=pY2UpspnBB", "author_site": "Shuoyuan Wang, Jindong Wang, Guoqing Wang, Bob Zhang, Kaiyang Zhou, Hongxin Wei", "tldr": "", "abstract": "Vision-language models (VLMs) have emerged as formidable tools, showing their strong capability in handling various open-vocabulary tasks in image recognition, text-driven visual content generation, and visual chatbots, to name a few. In recent years, considerable efforts and resources have been devoted to adaptation methods for improving downstream performance of VLMs, particularly on parameter-efficient fine-tuning methods like prompt learning. However, a crucial aspect that has been largely overlooked is the confidence calibration problem in fine-tuned VLMs, which could greatly reduce reliability when deploying such models in the real world. This paper bridges the gap by systematically investigating the confidence calibration problem in the context of prompt learning and reveals that existing calibration methods are insufficient to address the problem, especially in the open-vocabulary setting. To solve the problem, we present a simple and effective approach called Distance-Aware Calibration (DAC), which is based on scaling the temperature using as guidance the distance between predicted text labels and base classes. The experiments with 7 distinct prompt learning methods applied across 11 diverse downstream datasets demonstrate the effectiveness of DAC, which achieves high efficacy without sacrificing the inference speed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuoyuan Wang;Jindong Wang;Guoqing Wang;Bob Zhang;Kaiyang Zhou;Hongxin Wei", "authorids": "~Shuoyuan_Wang3;~Jindong_Wang1;~Guoqing_Wang2;~Bob_Zhang1;~Kaiyang_Zhou1;~Hongxin_Wei1", "gender": "M;;M;M;M;M", "homepage": "https://faculty.uestc.edu.cn/wangguoqing1/zh_CN/index.htm;;https://kaiyangzhou.github.io/;https://hongxin001.github.io/;https://jd92.wang/;", "dblp": "17/356-1;24/7465;203/3155;150/6350;19/2969-1;315/5913", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=gRIejugAAAAJ;cABH034AAAAJ;hBZ_tKsAAAAJ;SfMkEYgAAAAJ", "orcid": ";0000-0003-2497-9519;;;0000-0002-4833-0880;0000-0003-1795-4161", "linkedin": ";;;;jindong-wang/;", "or_profile": "~Guoqing_Wang2;~Bob_Zhang1;~Kaiyang_Zhou1;~Hongxin_Wei1;~Jindong_Wang4;~SHUOYUAN_WANG2", "aff": "University of Electronic Science and Technology of China;University of Macau;Hong Kong Baptist University;Southern University of Science and Technology;Microsoft Research;University of Macau", "aff_domain": "uestc.edu.cn;um.edu.mo;hkbu.edu.hk;sustech.edu.cn;microsoft.com;umac.mo", "position": "Full Professor;Associate Professor;Assistant Professor;Assistant Professor;Researcher;MS student", "bibtex": "@inproceedings{\nwang2024openvocabulary,\ntitle={Open-Vocabulary Calibration for Fine-tuned {CLIP}},\nauthor={Shuoyuan Wang and Jindong Wang and Guoqing Wang and Bob Zhang and Kaiyang Zhou and Hongxin Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pY2UpspnBB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1427789, "rating": "", "rating_avg": 0, "replies_avg": 0, 
"authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8615365557556773534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uestc.edu.cn;um.edu.mo;hkbu.edu.hk;sustech.edu.cn;microsoft.com;umac.mo", "author_num": 6, "aff_unique_index": "0;1;2;3;4;1", "aff_unique_norm": "University of Electronic Science and Technology of China;University of Macau;Hong Kong Baptist University;Southern University of Science and Technology;Microsoft", "aff_unique_dep": ";;;;Microsoft Research", "aff_unique_url": "https://www.uestc.edu.cn;https://www.um.edu.mo;https://www.hkbu.edu.hk;https://www.sustech.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UESTC;UM;HKBU;SUSTech;MSR", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Macau SAR;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "A General Online Algorithm for Optimizing Complex Performance Metrics", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33035", "id": "pfnBLXgFVS", "proceeding": "https://proceedings.mlr.press/v235/kotlowski24a.html", "pdf": "https://openreview.net/pdf?id=pfnBLXgFVS", "openreview": "https://openreview.net/forum?id=pfnBLXgFVS", "author_site": "Wojciech Kotlowski, Marek Wydmuch, Erik Schultheis, Rohit Babbar, Krzysztof Dembczynski", "tldr": "", "abstract": "We consider sequential maximization of performance metrics that are general functions of a confusion matrix of a classifier (such as precision, F-measure, or G-mean). Such metrics are, in general, non-decomposable over individual instances, making their optimization very challenging. While they have been extensively studied under different frameworks in the batch setting, their analysis in the online learning regime is very limited, with only a few distinguished exceptions. In this paper, we introduce and analyze a general online algorithm that can be used in a straightforward way with a variety of complex performance metrics in binary, multi-class, and multi-label classification problems. The algorithm's update and prediction rules are appealingly simple and computationally efficient without the need to store any past data. 
We show the algorithm attains $\\mathcal{O}(\\frac{\\ln n}{n})$ regret for concave and smooth metrics and verify the efficiency of the proposed algorithm in empirical studies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wojciech Kotlowski;Marek Wydmuch;Erik Schultheis;Rohit Babbar;Krzysztof Dembczynski", "authorids": "~Wojciech_Kotlowski1;~Marek_Wydmuch1;~Erik_Schultheis1;~Rohit_Babbar1;~Krzysztof_Dembczynski1", "gender": "M;M;;;", "homepage": ";https://mwydmuch.pl;https://www.aalto.fi/en/people/erik-schultheis;;https://research.yahoo.com/researchers/kdembczynski", "dblp": "63/4977;180/5883;268/7969;;91/3569", "google_scholar": ";lMXyoEAAAAAJ;MGxmO7EAAAAJ;;https://scholar.google.pl/citations?user=SetMoyoAAAAJ", "orcid": ";0000-0002-6598-6304;0000-0003-1685-8397;;0000-0001-7477-6758", "linkedin": ";marekwydmuch;;;krzysztof-dembczynski-36155344/", "or_profile": "~Wojciech_Kotlowski1;~Marek_Wydmuch1;~Erik_Schultheis1;~Rohit_Babbar1;~Krzysztof_Dembczynski1", "aff": "Poznan University of Technology;Poznan University of Technology;Aalto University;;Politechnika Poznanska", "aff_domain": "put.poznan.pl;put.poznan.pl;aalto.fi;;put.poznan.pl", "position": "Assistant Professor;Lecturer;PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nkotlowski2024a,\ntitle={A General Online Algorithm for Optimizing Complex Performance Metrics},\nauthor={Wojciech Kotlowski and Marek Wydmuch and Erik Schultheis and Rohit Babbar and Krzysztof Dembczynski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pfnBLXgFVS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3904702, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:M_L4n8XY0VAJ:scholar.google.com/&scioq=A+General+Online+Algorithm+for+Optimizing+Complex+Performance+Metrics&hl=en&as_sdt=0,5", "gs_version_total": 10, "email": "put.poznan.pl;put.poznan.pl;aalto.fi;;put.poznan.pl", "author_num": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Poznan University of Technology;Aalto University;Politechnika Poznanska", "aff_unique_dep": ";;", "aff_unique_url": "https://www.put.poznan.pl/;https://www.aalto.fi;https://www.put.poznan.pl/", "aff_unique_abbr": "PUT;Aalto;PUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Poland;Finland" }, { "title": "Translation Equivariant Transformer Neural Processes", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33034", "id": "pftXzp6Yn3", "proceeding": "https://proceedings.mlr.press/v235/ashman24a.html", "pdf": "https://openreview.net/pdf?id=pftXzp6Yn3", "openreview": "https://openreview.net/forum?id=pftXzp6Yn3", "author_site": "Matthew Ashman, Cristiana Diaconu, Junhyuck Kim, Lakee Sivaraya, Stratis Markou, James Requeima, Wessel Bruinsma, Richard E Turner", "tldr": "", "abstract": "The effectiveness of neural processes (NPs) in modelling posterior prediction maps---the mapping from data to posterior predictive distributions---has significantly improved since their inception. This improvement can be attributed to two principal factors: (1) advancements in the architecture of permutation invariant set functions, which are intrinsic to all NPs; and (2) leveraging symmetries present in the true posterior predictive map, which are problem dependent. 
Transformers are a notable development in permutation invariant set functions, and their utility within NPs has been demonstrated through the family of models we refer to as TNPs. Despite significant interest in TNPs, little attention has been given to incorporating symmetries. Notably, the posterior prediction maps for data that are stationary---a common assumption in spatio-temporal modelling---exhibit translation equivariance. In this paper, we introduce a new family of TNPs that incorporate *translation equivariance* (TE-TNPs). Through an extensive range of experiments on synthetic and real-world spatio-temporal data, we demonstrate the effectiveness of TE-TNPs relative to their non-translation-equivariant counterparts and other NP baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthew Ashman;Cristiana Diaconu;Junhyuck Kim;Lakee Sivaraya;Stratis Markou;James Requeima;Wessel P Bruinsma;Richard E. Turner", "authorids": "~Matthew_Ashman1;~Cristiana_Diaconu1;~Junhyuck_Kim1;~Lakee_Sivaraya1;~Stratis_Markou1;~James_Requeima1;~Wessel_P_Bruinsma1;~Richard_E_Turner1", "gender": "M;F;M;M;M;M;M;", "homepage": "https://mattashman.github.io/;https://cddcam.github.io/;;https://lakeesiv.com/;;http://jamesr.info;https://rich-turner-group.github.io/;https://wessel.ai", "dblp": ";380/4379.html;;;300/3941;;40/5352;242/3348.html", "google_scholar": "j1YiUKUAAAAJ;Ws2IoZIAAAAJ;;;;https://scholar.google.ca/citations?hl=en;https://scholar.google.co.uk/citations?user=DgLEyZgAAAAJ;QRQwz3cAAAAJ", "orcid": ";0009-0007-5165-2630;;;;;;", "linkedin": "matthew-ashman-a69017150/;https://linkedin.com/in/cristiana-diaconu-99a3ba161;junhyuck-kim-29308920a?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;lakeesiv/;stratos-m-85884b94/;;;", "or_profile": "~Matthew_Ashman1;~Cristiana_Diaconu1;~Junhyuck_Kim1;~Lakee_Sivaraya1;~Stratis_Markou1;~James_Requeima1;~Richard_E_Turner1;~Wessel_Bruinsma1", "aff": "University of Cambridge;University of Cambridge;University of Cambridge;University of Cambridge;;University of Toronto;Microsoft Research;", "aff_domain": "cam.ac.uk;cam.ac.uk;cam.ac.uk;cam.ac.uk;;cs.toronto;research.microsoft.com;", "position": "PhD student;PhD student;MS student;Undergrad student;;Postdoc;Researcher;", "bibtex": "@inproceedings{\nashman2024translation,\ntitle={Translation Equivariant Transformer Neural Processes},\nauthor={Matthew Ashman and Cristiana Diaconu and Junhyuck Kim and Lakee Sivaraya and Stratis Markou and James Requeima and Wessel P Bruinsma and Richard E. 
Turner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pftXzp6Yn3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2838369, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15753125870380324455&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "cam.ac.uk;cam.ac.uk;cam.ac.uk;cam.ac.uk;;cs.toronto;research.microsoft.com;", "author_num": 8, "aff_unique_index": "0;0;0;0;1;2", "aff_unique_norm": "University of Cambridge;University of Toronto;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.cam.ac.uk;https://www.utoronto.ca;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Cambridge;U of T;MSR", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0;1;2", "aff_country_unique": "United Kingdom;Canada;United States" }, { "title": "Towards Optimal Adversarial Robust Q-learning with Bellman Infinity-error", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33033", "id": "pgI9inG2Ny", "proceeding": "https://proceedings.mlr.press/v235/li24cl.html", "pdf": "https://openreview.net/pdf?id=pgI9inG2Ny", "openreview": "https://openreview.net/forum?id=pgI9inG2Ny", "author_site": "Haoran Li, Zicheng Zhang, Wang Luo, Congying Han, Yudong Hu, Tiande Guo, Shichen Liao", "tldr": "", "abstract": "Establishing robust policies is essential to counter attacks or disturbances affecting deep reinforcement learning (DRL) agents. Recent studies explore state-adversarial robustness and suggest the potential lack of an optimal robust policy (ORP), posing challenges in setting strict robustness constraints. This work further investigates ORP: At first, we introduce a consistency assumption of policy (CAP) stating that optimal actions in the Markov decision process remain consistent with minor perturbations, supported by empirical and theoretical evidence. Building upon CAP, we crucially prove the existence of a deterministic and stationary ORP that aligns with the Bellman optimal policy. Furthermore, we illustrate the necessity of $L^{\\infty}$-norm when minimizing Bellman error to attain ORP. This finding clarifies the vulnerability of prior DRL algorithms that target the Bellman optimal policy with $L^{1}$-norm and motivates us to train a Consistent Adversarial Robust Deep Q-Network (CAR-DQN) by minimizing a surrogate of Bellman Infinity-error. 
The top-tier performance of CAR-DQN across various benchmarks validates its practical effectiveness and reinforces the soundness of our theoretical analysis.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoran Li;Zicheng Zhang;Wang Luo;Congying Han;Yudong Hu;Tiande Guo;Shichen Liao", "authorids": "~Haoran_Li17;~Zicheng_Zhang3;~Wang_Luo1;~Congying_Han1;~Yudong_Hu1;~Tiande_Guo1;~Shichen_Liao1", "gender": "M;M;F;M;M;M;M", "homepage": ";;http://people.ucas.edu.cn/~hancy;https://dzyzhyd999.github.io/;https://people.ucas.ac.cn/~tdguo?language=en;;", "dblp": ";;07/2808;335/1936;;;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;;https://scholar.google.com/citations?view_op=list_works;;;H8WNWPsAAAAJ", "orcid": ";0009-0001-2966-5001;0000-0002-3445-4620;0009-0000-6528-8672;0000-0002-3804-9163;0009-0001-7653-9139;", "linkedin": ";;;;;;", "or_profile": "~Haoran_Li17;~Wang_Luo1;~Congying_Han1;~Yudong_Hu1;~Tiande_Guo1;~Shichen_Liao1;~zicheng_zhang1", "aff": "University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Chinese Academy of Sciences;;University of Chinese Academy of Sciences", "aff_domain": "mails.ucas.ac.cn;ucas.ac.cn;ucas.ac.cn;mails.ucas.ac.cn;ucas.ac.cn;;ucas.ac.cn", "position": "PhD student;MS student;Full Professor;PhD student;Full Professor;;PhD student", "bibtex": "@inproceedings{\nli2024towards,\ntitle={Towards Optimal Adversarial Robust Q-learning with Bellman Infinity-error},\nauthor={Haoran Li and Zicheng Zhang and Wang Luo and Congying Han and Yudong Hu and Tiande Guo and Shichen Liao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pgI9inG2Ny}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2929517, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1917726060173251851&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "mails.ucas.ac.cn;ucas.ac.cn;ucas.ac.cn;mails.ucas.ac.cn;ucas.ac.cn;;ucas.ac.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Chinese Academy of Sciences", "aff_unique_dep": "", "aff_unique_url": "http://www.ucas.ac.cn", "aff_unique_abbr": "UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "DiffStitch: Boosting Offline Reinforcement Learning with Diffusion-based Trajectory Stitching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33032", "id": "phGHQOKmaU", "proceeding": "https://proceedings.mlr.press/v235/li24bf.html", "pdf": "https://openreview.net/pdf?id=phGHQOKmaU", "openreview": "https://openreview.net/forum?id=phGHQOKmaU", "author_site": "Guanghe Li, Yixiang Shan, Zhengbang Zhu, Ting Long, Weinan Zhang", "tldr": "", "abstract": "In offline reinforcement learning (RL), the performance of the learned policy highly depends on the quality of offline datasets. However, the offline dataset contains very limited optimal trajectories in many cases. This poses a challenge for offline RL algorithms, as agents must acquire the ability to transit to high-reward regions. 
To address this issue, we introduce Diffusion-based Trajectory Stitching (DiffStitch), a novel diffusion-based data augmentation pipeline that systematically generates stitching transitions between trajectories. DiffStitch effectively connects low-reward trajectories with high-reward trajectories, forming globally optimal trajectories and thereby mitigating the challenges faced by offline RL algorithms in learning trajectory stitching. Empirical experiments conducted on D4RL datasets demonstrate the effectiveness of our pipeline across RL methodologies. Notably, DiffStitch demonstrates substantial enhancements in the performance of one-step methods (IQL), imitation learning methods (TD3+BC), and trajectory optimization methods (DT). Our code is publicly available at https://github.com/guangheli12/DiffStitch", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guanghe Li;Yixiang Shan;Zhengbang Zhu;Ting Long;Weinan Zhang", "authorids": "~Guanghe_Li1;~Yixiang_Shan1;~Zhengbang_Zhu1;~Ting_Long1;~Weinan_Zhang1", "gender": "M;M;M;F;M", "homepage": "https://github.com/guangheli12;https://dblp.org/pid/331/0031;https://github.com/zbzhu99;http://apex.sjtu.edu.cn/members/longting@apexlab.org;http://wnzhang.net", "dblp": ";331/0031;277/0869;06/8646.html;28/10261-1", "google_scholar": ";KSHTBUkAAAAJ;;;Qzss0GEAAAAJ", "orcid": ";0000-0003-1300-9146;;0000-0001-5817-6875;0000-0002-0127-2425", "linkedin": ";;;;", "or_profile": "~Guanghe_Li1;~Yixiang_Shan1;~Zhengbang_Zhu1;~Ting_Long1;~Weinan_Zhang1", "aff": "Jilin University;Jilin University;Shanghai Jiaotong University;Jilin University;Shanghai Jiaotong University", "aff_domain": "jlu.edu.cn;jlu.edu.cn;sjtu.edu.cn;jlu.edu.cn;sjtu.edu.cn", "position": "Undergrad student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024diffstitch,\ntitle={DiffStitch: Boosting Offline Reinforcement Learning with Diffusion-based Trajectory Stitching},\nauthor={Guanghe Li and Yixiang Shan and Zhengbang Zhu and Ting Long and Weinan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=phGHQOKmaU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1684281, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4386289273508640885&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "jlu.edu.cn;jlu.edu.cn;sjtu.edu.cn;jlu.edu.cn;sjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "Jilin University;Shanghai Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "http://www.jlu.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "JLU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "GPT-4V(ision) is a Generalist Web Agent, if Grounded", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33031", "id": "piecKJ2DlB", "proceeding": "https://proceedings.mlr.press/v235/zheng24e.html", "pdf": "https://openreview.net/pdf?id=piecKJ2DlB", "openreview": "https://openreview.net/forum?id=piecKJ2DlB", "author_site": "Boyuan Zheng, Boyu Gou, Jihyung Kil, Huan Sun, Yu Su", "tldr": "", "abstract": "The recent development of large multimodal models (LMMs), especially GPT-4V(ision) and Gemini, has been quickly expanding the capability boundaries of multimodal models beyond 
traditional tasks like image captioning and visual question answering. In this work, we explore the potential of LMMs like GPT-4V as a generalist web agent that can follow natural language instructions to complete tasks on any given website. We propose SEEACT, a generalist web agent that harnesses the power of LMMs for integrated visual understanding and acting on the web. We evaluate SEEACT on the recent MIND2WEB benchmark. In addition to standard offline evaluation on cached websites, we enable a new online evaluation setting by developing a tool that allows running web agents on live websites. We show that GPT-4V holds great potential for web agents---it can successfully complete 51.1% of the tasks on live websites if we manually ground its textual plans into actions on the websites. This substantially outperforms text-only LLMs like GPT-4 or smaller models (FLAN-T5 and BLIP-2) specifically fine-tuned for web agents. However, grounding remains a major challenge. Existing LMM grounding strategies like set-of-mark prompting turn out to be ineffective for web agents, and the best grounding strategy we develop in this paper leverages both the HTML structure and visuals. Yet, there is still a substantial gap with oracle grounding, leaving ample room for further improvement. All code, data, and evaluation tools are available at https://github.com/OSU-NLP-Group/SeeAct.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boyuan Zheng;Boyu Gou;Jihyung Kil;Huan Sun;Yu Su", "authorids": "~Boyuan_Zheng1;~Boyu_Gou1;~Jihyung_Kil1;~Huan_Sun1;~Yu_Su2", "gender": "M;M;M;F;M", "homepage": "https://boyuanzheng010.github.io/;https://boyugou.github.io;https://heendung.github.io/;https://u.osu.edu/ihudas/people/;http://ysu1989.github.io", "dblp": ";365/3954;213/1319;33/2952-1.html;38/1070-1", "google_scholar": "amEL4n8AAAAJ;BgEYhp4AAAAJ;C3O0uxcAAAAJ;wIFkulcAAAAJ;rIh5OqoAAAAJ", "orcid": ";;;;", "linkedin": "boyuan-zheng-602238183/;boyu-gou-b0a710238/;;huan-sun-81527924/?originalSubdomain=cn;", "or_profile": "~Boyuan_Zheng1;~Boyu_Gou1;~Jihyung_Kil1;~Huan_Sun1;~Yu_Su2", "aff": "Ohio State University, Columbus;Ohio State University, Columbus;The Ohio State University;The Ohio State University, Columbus;Microsoft", "aff_domain": "osu.edu;osu.edu;osu.edu;osu.edu;microsoft.com", "position": "PhD student;PhD student;PhD student;Associate Professor;Senior Researcher", "bibtex": "@inproceedings{\nzheng2024gptvision,\ntitle={{GPT}-4V(ision) is a Generalist Web Agent, if Grounded},\nauthor={Boyuan Zheng and Boyu Gou and Jihyung Kil and Huan Sun and Yu Su},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=piecKJ2DlB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9233140, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 230, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10394931590361022069&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "osu.edu;osu.edu;osu.edu;osu.edu;microsoft.com", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Ohio State University;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.osu.edu;https://www.microsoft.com", "aff_unique_abbr": "OSU;Microsoft", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Columbus;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Model Tailor: Mitigating Catastrophic 
Forgetting in Multi-modal Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33030", "id": "piujJIF3zs", "proceeding": "https://proceedings.mlr.press/v235/zhu24l.html", "pdf": "https://openreview.net/pdf?id=piujJIF3zs", "openreview": "https://openreview.net/forum?id=piujJIF3zs", "author_site": "Didi Zhu, Zhongyi Sun, Zexi Li, tao shen, Ke Yan, Shouhong Ding, Chao Wu, Kun Kuang", "tldr": "", "abstract": "Catastrophic forgetting emerges as a critical challenge when fine-tuning multi-modal large language models (MLLMs), where improving performance on unseen tasks often leads to a significant performance drop on the original tasks. This paper presents a comprehensive analysis of catastrophic forgetting in MLLMs and introduces a post-training adjustment method called Model Tailor. Our method primarily preserves the pre-trained parameters while replacing a small number ($\\leq$ 10%) of fine-tuned parameters, maintaining $\\sim$ 99% effectiveness on original tasks versus pre-training, and achieving $\\sim$ 97% on new tasks compared to standard fine-tuning. Specifically, we derive a sparse mask to identify the model patch, based on a fusion strategy that integrates salience and sensitivity analysis. Subsequently, a compensation mechanism is introduced to decorate the patch, enhancing the model's performance on both target and original tasks. Additionally, our method is adaptable to multi-task scenarios. Through extensive experiments on InstructBLIP and LLaVA-1.5 in both image captioning and visual question answering tasks, our approach demonstrates significant task adaptability while preserving inherent pre-trained capabilities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Didi Zhu;Zhongyisun Sun;Zexi Li;Tao Shen;Ke Yan;Shouhong Ding;Chao Wu;Kun Kuang", "authorids": "~Didi_Zhu1;~Zhongyisun_Sun1;~Zexi_Li1;~Tao_Shen4;~Ke_Yan2;~Shouhong_Ding3;~Chao_Wu1;~Kun_Kuang1", "gender": "F;M;M;;;M;M;M", "homepage": "https://didizhu-zju.github.io/;;https://zexilee.github.io/about-zexili/;;;;;http://kunkuang.github.io", "dblp": "305/0602;;151/9187-1;;;119/6735;45/3158-1;194/4245", "google_scholar": "https://scholar.google.com.hk/citations?user=gthqIqIAAAAJ;prqsxYcAAAAJ;https://scholar.google.com.hk/citations?user=6lMg5eoAAAAJ;;;OGf40fkAAAAJ;gpTPt58AAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ", "orcid": "0009-0004-6892-5357;;0000-0003-0831-3549;;;0000-0002-3175-3553;0000-0003-0885-6869;0009-0000-7528-8131", "linkedin": ";;;;;;;", "or_profile": "~Didi_Zhu1;~Zhongyisun_Sun1;~Zexi_Li1;~Tao_Shen4;~Ke_Yan2;~Shouhong_Ding3;~Chao_Wu1;~Kun_Kuang1", "aff": "Zhejiang University;Tencent Youtu Lab;Zhejiang University;;;Tencent Youtu Lab;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;tencent.com;zju.edu.cn;;;tencent.com;zju.edu.cn;zju.edu.cn", "position": "PhD student;Researcher;PhD student;;;researcher;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nzhu2024model,\ntitle={Model Tailor: Mitigating Catastrophic Forgetting in Multi-modal Large Language Models},\nauthor={Didi Zhu and Zhongyisun Sun and Zexi Li and Tao Shen and Ke Yan and Shouhong Ding and Chao Wu and Kun Kuang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=piujJIF3zs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2183411, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 33, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=16987444943889575247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;tencent.com;zju.edu.cn;;;tencent.com;zju.edu.cn;zju.edu.cn", "author_num": 8, "aff_unique_index": "0;1;0;1;0;0", "aff_unique_norm": "Zhejiang University;Tencent", "aff_unique_dep": ";Youtu Lab", "aff_unique_url": "https://www.zju.edu.cn;https://www.tencent.com", "aff_unique_abbr": "ZJU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Robust Inverse Constrained Reinforcement Learning under Model Misspecification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33029", "id": "pkUl39b0in", "proceeding": "https://proceedings.mlr.press/v235/xu24r.html", "pdf": "https://openreview.net/pdf?id=pkUl39b0in", "openreview": "https://openreview.net/forum?id=pkUl39b0in", "author_site": "Sheng Xu, Guiliang Liu", "tldr": "", "abstract": "To solve safety-critical decision-making problems, Inverse Constrained Reinforcement Learning (ICRL) infers constraints from expert demonstrations and seeks to imitate expert preference by utilizing these constraints. While prior ICRL research commonly overlooks the discrepancy between the training and deploying environments, we demonstrate that such a discrepancy can significantly compromise the reliability of the inferred constraints and thus induce unsafe movements. Motivated by this finding, we propose the Robust Constraint Inference (RCI) problem and an Adaptively Robust ICRL (AR-ICRL) algorithm to solve RCI efficiently. Specifically, we model the impact of misspecified dynamics with an opponent policy and learn a robust policy to facilitate safe control in a Markov Game. Subsequently, we adjust our constraint model to align the learned policies to expert demonstrations, accommodating both soft and hard optimality in our behavioral models. Empirical results demonstrate the significance of robust constraints and the effectiveness of the proposed AR-ICRL algorithm under continuous and discrete domains. 
The code is available at https://github.com/Jasonxu1225/AR-ICRL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sheng Xu;Guiliang Liu", "authorids": "~Sheng_Xu8;~Guiliang_Liu1", "gender": ";M", "homepage": "https://shengxu.net/;http://guiliang.me/", "dblp": ";220/5411", "google_scholar": "rJhWU6gAAAAJ;CuMylvEAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Sheng_Xu8;~Guiliang_Liu1", "aff": "The Chinese University of Hong Kong, Shenzhen;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;cuhk.edu.hk", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxu2024robust,\ntitle={Robust Inverse Constrained Reinforcement Learning under Model Misspecification},\nauthor={Sheng Xu and Guiliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pkUl39b0in}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4325057, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16271503188992001850&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "cuhk.edu.cn;cuhk.edu.hk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.cn", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Efficient Denoising Diffusion via Probabilistic Masking", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33028", "id": "pktvuR7b5v", "proceeding": "https://proceedings.mlr.press/v235/zhang24cf.html", "pdf": "https://openreview.net/pdf?id=pktvuR7b5v", "openreview": "https://openreview.net/forum?id=pktvuR7b5v", "author_site": "Weizhong Zhang, Zhiwei Zhang, Renjie Pi, Zhongming Jin, Yuan Gao, Jieping Ye, Kani Chen", "tldr": "", "abstract": "Diffusion models have exhibited remarkable advancements in generating high-quality data. However, a critical drawback is their computationally intensive inference process, which requires a large number of timesteps to generate a single sample. Existing methods address this challenge by decoupling the forward and reverse processes, and they rely on handcrafted rules for sampling acceleration, leading to the risk of discarding important steps. In this paper, we propose an Efficient Denoising Diffusion method via Probabilistic Masking (EDDPM) that can identify and skip the redundant steps during training. To determine whether a timestep should be skipped or not, we employ probabilistic reparameterization to continualize the binary determination mask. The mask distribution parameters are learned jointly with model weights. By incorporating a real-time sparse constraint, our method can effectively identify and eliminate unnecessary steps during the training iterations, thereby improving inference efficiency. Notably, as the model becomes fully trained, the random masks converge to a sparse and deterministic one, retaining only a small number of essential steps. Empirical results demonstrate the superiority of our proposed EDDPM over the state-of-the-art sampling acceleration methods across various domains. EDDPM can generate high-quality samples with only 20% of the steps for time series imputation and achieve 4.89 FID with 5 steps for CIFAR-10. 
Moreover, when starting from a pretrained model, our method efficiently identifies the most informative timesteps within a single epoch, which demonstrates the potential of EDDPM to be a practical tool to explore large diffusion models with limited resources.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "WEIZHONG ZHANG;Zhiwei Zhang;Renjie Pi;Zhongming Jin;Yuan Gao;Jieping Ye;Kani Chen", "authorids": "~WEIZHONG_ZHANG2;~Zhiwei_Zhang3;~Renjie_Pi1;~Zhongming_Jin1;~Yuan_Gao4;~Jieping_Ye4;~Kani_Chen1", "gender": ";;M;M;;M;M", "homepage": ";;;https://sites.google.com/site/zjuzhongmingjin/;;http://yelabs.net/;https://seng.hkust.edu.hk/about/people/faculty/kani-chen", "dblp": ";;67/2156;;;03/5454;", "google_scholar": ";;XUq0HwcAAAAJ;fOC90nQAAAAJ;;T9AzhwcAAAAJ;", "orcid": ";;;;;0000-0001-8662-5818;", "linkedin": ";;;;;;", "or_profile": "~WEIZHONG_ZHANG2;~Zhiwei_Zhang3;~Renjie_Pi1;~Zhongming_Jin1;~Yuan_Gao4;~Jieping_Ye4;~Kani_Chen1", "aff": ";;Hong Kong University of Science and Technology;Alibaba Cloud Computing;;Alibaba Group;Hong Kong University of Science and Technology", "aff_domain": ";;ust.hk;alibaba-inc.com;;alibaba-inc.com;ust.hk", "position": ";;PhD student;Researcher;;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nzhang2024efficient,\ntitle={Efficient Denoising Diffusion via Probabilistic Masking},\nauthor={WEIZHONG ZHANG and Zhiwei Zhang and Renjie Pi and Zhongming Jin and Yuan Gao and Jieping Ye and Kani Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pktvuR7b5v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3312340, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10315798789718667343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";;ust.hk;alibaba-inc.com;;alibaba-inc.com;ust.hk", "author_num": 7, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Alibaba Group", "aff_unique_dep": ";Cloud Computing", "aff_unique_url": "https://www.ust.hk;https://www.alibabacloud.com", "aff_unique_abbr": "HKUST;Alibaba Cloud", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Emergent Equivariance in Deep Ensembles", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33027", "id": "plXXbXjvQ9", "proceeding": "https://proceedings.mlr.press/v235/gerken24a.html", "pdf": "https://openreview.net/pdf?id=plXXbXjvQ9", "openreview": "https://openreview.net/forum?id=plXXbXjvQ9", "author_site": "Jan Gerken, Pan Kessel", "tldr": "", "abstract": "We show that deep ensembles become equivariant for all inputs and at all training times by simply using data augmentation. Crucially, equivariance holds off-manifold and for any architecture in the infinite width limit. The equivariance is emergent in the sense that predictions of individual ensemble members are not equivariant but their collective prediction is. 
Neural tangent kernel theory is used to derive this result and we verify our theoretical insights using detailed numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan E Gerken;Pan Kessel", "authorids": "~Jan_E_Gerken1;~Pan_Kessel1", "gender": "M;M", "homepage": "https://www.chalmers.se/sv/personal/Sidor/gerken.aspx;https://www.gene.com/scientists/our-scientists/pan-kessel", "dblp": "293/9373;238/1381", "google_scholar": "NIdlVIEAAAAJ;uODjwl8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jan_E_Gerken1;~Pan_Kessel1", "aff": "Chalmers University of Technology;Prescient Design", "aff_domain": "chalmers.se;roche.com", "position": "Assistant Professor;Researcher", "bibtex": "@inproceedings{\ngerken2024emergent,\ntitle={Emergent Equivariance in Deep Ensembles},\nauthor={Jan E Gerken and Pan Kessel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=plXXbXjvQ9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1222748, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12466558153924177896&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "email": "chalmers.se;roche.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Chalmers University of Technology;Prescient Design", "aff_unique_dep": ";", "aff_unique_url": "https://www.chalmers.se;", "aff_unique_abbr": "Chalmers;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", "aff_country_unique": "Sweden;" }, { "title": "Graph Out-of-Distribution Detection Goes Neighborhood Shaping", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33026", "id": "pmcusTywXO", "proceeding": "https://proceedings.mlr.press/v235/bao24c.html", "pdf": "https://openreview.net/pdf?id=pmcusTywXO", "openreview": "https://openreview.net/forum?id=pmcusTywXO", "author_site": "Tianyi Bao, Qitian Wu, Zetian Jiang, Yiting Chen, Jiawei Sun, Junchi Yan", "tldr": "", "abstract": "Despite the rich line of research works on out-of-distribution (OOD) detection on images, the literature on OOD detection for interdependent data, e.g., graphs, is still relatively limited. To fill this gap, we introduce TopoOOD as a principled approach that accommodates graph topology and neighborhood context for detecting OOD node instances on graphs. Meanwhile, we enrich the experiment settings by splitting in-distribution (ID) and OOD data based on distinct topological distributions, which presents new benchmarks for a more comprehensive analysis of graph-based OOD detection. The latter is designed to thoroughly assess the performance of these discriminators under distribution shifts involving structural information, providing a rigorous evaluation of methods in the emerging area of OOD detection on graphs. 
Our experimental results show the competitiveness of the proposed model across multiple datasets, as evidenced by up to a 15% increase in the AUROC and a 50% decrease in the FPR compared to existing state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianyi Bao;Qitian Wu;Zetian Jiang;Yiting Chen;Jiawei Sun;Junchi Yan", "authorids": "~Tianyi_Bao1;~Qitian_Wu1;~Zetian_Jiang1;~Yiting_Chen1;~Jiawei_Sun2;~Junchi_Yan2", "gender": "F;;M;M;;", "homepage": "https://github.com/btyll;;http://thinklab.sjtu.edu.cn/member.html;https://ytchen981.github.io/;;", "dblp": "202/6729;;;135/6971;;", "google_scholar": ";;;https://scholar.google.com/citations?hl=zh-CN;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Tianyi_Bao1;~Qitian_Wu1;~Zetian_Jiang1;~Yiting_Chen1;~Jiawei_Sun2;~Junchi_Yan2", "aff": "Shanghai Jiaotong University;;Shanghai Jiaotong University;Shanghai Jiaotong University;;", "aff_domain": "sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;;", "position": "PhD student;;PhD student;PhD student;;", "bibtex": "@inproceedings{\nbao2024graph,\ntitle={Graph Out-of-Distribution Detection Goes Neighborhood Shaping},\nauthor={Tianyi Bao and Qitian Wu and Zetian Jiang and Yiting Chen and Jiawei Sun and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pmcusTywXO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 657884, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15468703609245851451&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn;;", "author_num": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Agent-Specific Effects: A Causal Effect Propagation Analysis in Multi-Agent MDPs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33025", "id": "pmncWWkGMz", "proceeding": "https://proceedings.mlr.press/v235/triantafyllou24a.html", "pdf": "https://openreview.net/pdf?id=pmncWWkGMz", "openreview": "https://openreview.net/forum?id=pmncWWkGMz", "author_site": "Stelios Triantafyllou, Aleksa Sukovic, Debmalya Mandal, Goran Radanovic", "tldr": "", "abstract": "Establishing causal relationships between actions and outcomes is fundamental for accountable multi-agent decision-making. However, interpreting and quantifying agents' contributions to such relationships pose significant challenges. These challenges are particularly prominent in the context of multi-agent sequential decision-making, where the causal effect of an agent's action on the outcome depends on how other agents respond to that action. In this paper, our objective is to present a systematic approach for attributing the causal effects of agents' actions to the influence they exert on other agents. Focusing on multi-agent Markov decision processes, we introduce agent-specific effects (ASE), a novel causal quantity that measures the effect of an agent's action on the outcome that propagates through other agents. 
We then turn to the counterfactual counterpart of ASE (cf-ASE), provide a sufficient set of conditions for identifying cf-ASE, and propose a practical sampling-based algorithm for estimating it. Finally, we experimentally evaluate the utility of cf-ASE through a simulation-based testbed, which includes a sepsis management environment.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stelios Triantafyllou;Aleksa Sukovic;Debmalya Mandal;Goran Radanovic", "authorids": "~Stelios_Triantafyllou1;~Aleksa_Sukovic1;~Debmalya_Mandal2;~Goran_Radanovic1", "gender": "M;M;M;", "homepage": "https://stelios30.github.io/;https://aleksa-sukovic.github.io/;https://debmandal.github.io;", "dblp": "298/1292;359/1824;151/3685;133/1771", "google_scholar": "VcYmHo0AAAAJ;HqkjkXIAAAAJ;OquWQpEAAAAJ;KBG_JlAAAAAJ", "orcid": ";;;", "linkedin": ";aleksa-sukovic/;;", "or_profile": "~Stelios_Triantafyllou1;~Aleksa_Sukovic1;~Debmalya_Mandal2;~Goran_Radanovic1", "aff": "MPI-SWS;MPI-SWS;University of Warwick;MPI-SWS", "aff_domain": "mpi-sws.org;mpi-sws.org;warwick.ac.uk;mpi-sws.org", "position": "PhD student;Researcher;Assistant Professor;Research group leader", "bibtex": "@inproceedings{\ntriantafyllou2024agentspecific,\ntitle={Agent-Specific Effects: A Causal Effect Propagation Analysis in Multi-Agent {MDP}s},\nauthor={Stelios Triantafyllou and Aleksa Sukovic and Debmalya Mandal and Goran Radanovic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pmncWWkGMz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 762959, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12865367719194653235&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "mpi-sws.org;mpi-sws.org;warwick.ac.uk;mpi-sws.org", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Max Planck Institute for Software Systems;University of Warwick", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-sws.org;https://www.warwick.ac.uk", "aff_unique_abbr": "MPI-SWS;Warwick", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "More Flexible PAC-Bayesian Meta-Learning by Learning Learning Algorithms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33024", "id": "pmsPKIBAu6", "proceeding": "https://proceedings.mlr.press/v235/zakerinia24a.html", "pdf": "https://openreview.net/pdf?id=pmsPKIBAu6", "openreview": "https://openreview.net/forum?id=pmsPKIBAu6", "author_site": "Hossein Zakerinia, Amin Behjati, Christoph Lampert", "tldr": "", "abstract": "We introduce a new framework for studying meta-learning methods using PAC-Bayesian theory. Its main advantage over previous work is that it allows for more flexibility in how the transfer of knowledge between tasks is realized. For previous approaches, this could only happen indirectly, by means of learning prior distributions over models. In contrast, the new generalization bounds that we prove express the process of meta-learning much more directly as learning the learning algorithm that should be used for future tasks. The flexibility of our framework makes it suitable to analyze a wide range of meta-learning mechanisms and even design new mechanisms. 
Beyond our theoretical contributions, we also show empirically that our framework improves prediction quality in practical meta-learning mechanisms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hossein Zakerinia;Amin Behjati;Christoph H. Lampert", "authorids": "~Hossein_Zakerinia1;~Amin_Behjati1;~Christoph_H._Lampert6", "gender": ";M;", "homepage": ";https://www.researchgate.net/profile/Amin-Behjati;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Hossein_Zakerinia1;~Amin_Behjati1;~Christoph_H._Lampert6", "aff": ";Sharif University of Technology;", "aff_domain": ";sharif.edu;", "position": ";PhD student;", "bibtex": "@inproceedings{\nzakerinia2024more,\ntitle={More Flexible {PAC}-Bayesian Meta-Learning by Learning Learning Algorithms},\nauthor={Hossein Zakerinia and Amin Behjati and Christoph H. Lampert},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pmsPKIBAu6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 373198, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16265971979474842643&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": ";sharif.edu;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Sharif University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.sharif.edu", "aff_unique_abbr": "SUT", "aff_country_unique_index": "0", "aff_country_unique": "Iran" }, { "title": "An Intrinsic Vector Heat Network", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33023", "id": "po4NsL9KvX", "proceeding": "https://proceedings.mlr.press/v235/gao24c.html", "pdf": "https://openreview.net/pdf?id=po4NsL9KvX", "openreview": "https://openreview.net/forum?id=po4NsL9KvX", "author_site": "Alexander Gao, Maurice Chu, Mubbasir Kapadia, Ming Lin, Hsueh-Ti Derek Liu", "tldr": "", "abstract": "Vector fields are widely used to represent and model flows for many science and engineering applications. This paper introduces a novel neural network architecture for learning tangent vector fields that are intrinsically defined on manifold surfaces embedded in 3D. Previous approaches to learning vector fields on surfaces treat vectors as multi-dimensional scalar fields, using traditional scalar-valued architectures to process channels individually, and thus fail to preserve fundamental intrinsic properties of the vector field. The core idea of this work is to introduce a trainable vector heat diffusion module to spatially propagate vector-valued feature data across the surface, which we incorporate into our proposed architecture that consists of vector-valued neurons. Our architecture is invariant to rigid motion of the input, isometric deformation, and choice of local tangent bases, and is robust to discretizations of the surface. We evaluate our Vector Heat Network on triangle meshes, and empirically validate its invariant properties. 
We also demonstrate the effectiveness of our method on the useful industrial application of quadrilateral mesh generation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Gao;Maurice Chu;Mubbasir Kapadia;Ming Lin;Hsueh-Ti Derek Liu", "authorids": "~Alexander_Gao1;~Maurice_Chu1;~Mubbasir_Kapadia2;~Ming_Lin2;~Hsueh-Ti_Derek_Liu1", "gender": "M;M;M;F;M", "homepage": "https://gaoalexander.github.io;;https://ivi.cs.rutgers.edu/;http://www.cs.umd.edu/~lin;http://www.dgp.toronto.edu/~hsuehtil/", "dblp": "289/1694.html;;08/4943;l/MingCLin.html;205/9611", "google_scholar": "uoNPrRUAAAAJ;;xhkzmycAAAAJ;ugFNit4AAAAJ;-T7Au0kAAAAJ", "orcid": ";;0000-0002-3501-0028;0000-0003-3736-6949;", "linkedin": ";maurice-chu-1ab3731a/;mubbasir-kapadia-aa9273a/;mlin2/;", "or_profile": "~Alexander_Gao1;~Maurice_Chu1;~Mubbasir_Kapadia2;~Ming_Lin2;~Hsueh-Ti_Derek_Liu1", "aff": "University of Maryland, College Park;;Roblox;Amazon;Roblox", "aff_domain": "umd.edu;;roblox.com;amazon.com;roblox.com", "position": "PhD student;;Principal Researcher;Amazon Scholar;Researcher", "bibtex": "@inproceedings{\ngao2024an,\ntitle={An Intrinsic Vector Heat Network},\nauthor={Alexander Gao and Maurice Chu and Mubbasir Kapadia and Ming Lin and Hsueh-Ti Derek Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=po4NsL9KvX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10231454, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AdY6l9sNmk4J:scholar.google.com/&scioq=An+Intrinsic+Vector+Heat+Network&hl=en&as_sdt=0,33", "gs_version_total": 5, "email": "umd.edu;;roblox.com;amazon.com;roblox.com", "author_num": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Maryland;Roblox Corporation;Amazon", "aff_unique_dep": ";;Amazon.com, Inc.", "aff_unique_url": "https://www/umd.edu;https://www.roblox.com;https://www.amazon.com", "aff_unique_abbr": "UMD;Roblox;Amazon", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fair Off-Policy Learning from Observational Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33022", "id": "poEPRuNvM3", "proceeding": "https://proceedings.mlr.press/v235/frauen24a.html", "pdf": "https://openreview.net/pdf?id=poEPRuNvM3", "openreview": "https://openreview.net/forum?id=poEPRuNvM3", "author_site": "Dennis Frauen, Valentyn Melnychuk, Stefan Feuerriegel", "tldr": "", "abstract": "Algorithmic decision-making in practice must be fair for legal, ethical, and societal reasons. To achieve this, prior research has contributed various approaches that ensure fairness in machine learning predictions, while comparatively little effort has focused on fairness in decision-making, specifically off-policy learning. In this paper, we propose a novel framework for fair off-policy learning: we learn decision rules from observational data under different notions of fairness, where we explicitly assume that observational data were collected under a different -- potentially discriminatory -- behavioral policy. Importantly, our framework applies to different fairness notions for off-policy learning, where fairness is formalized based on actions or policy values. 
As our main contribution, we propose a neural network-based framework to learn optimal policies under different fairness notions. We further provide theoretical guarantees in the form of generalization bounds for the finite-sample version of our framework. We demonstrate the effectiveness of our framework through extensive numerical experiments using both simulated and real-world data. Altogether, our work enables algorithmic decision-making in a wide array of practical applications where fairness must be ensured.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dennis Frauen;Valentyn Melnychuk;Stefan Feuerriegel", "authorids": "~Dennis_Frauen1;~Valentyn_Melnychuk1;~Stefan_Feuerriegel1", "gender": "M;M;M", "homepage": "https://www.ai.bwl.uni-muenchen.de/team/research_team/dennis_frauen/index.html;https://valentyn1997.github.io/;http://www.ai.bwl.lmu.de", "dblp": "315/0115;254/1513;125/0630", "google_scholar": "ieyW4WQAAAAJ;EMExrOMAAAAJ;https://scholar.google.de/citations?hl=de", "orcid": ";0000-0002-2401-6803;0000-0001-7856-8729", "linkedin": "dennis-frauen-6b5746171/;valentyn-melnychuk/;", "or_profile": "~Dennis_Frauen1;~Valentyn_Melnychuk1;~Stefan_Feuerriegel1", "aff": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;LMU Munich", "aff_domain": "lmu.de;lmu.de;lmu.de", "position": "PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nfrauen2024fair,\ntitle={Fair Off-Policy Learning from Observational Data},\nauthor={Dennis Frauen and Valentyn Melnychuk and Stefan Feuerriegel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=poEPRuNvM3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 681977, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5841492373024397942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "lmu.de;lmu.de;lmu.de", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig Maximilian University of Munich", "aff_unique_dep": ";", "aff_unique_url": "https://www.lmu.de;https://www.lmu.de", "aff_unique_abbr": "LMU;LMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "In-Context Reinforcement Learning for Variable Action Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33021", "id": "pp3v2ch5Sd", "proceeding": "https://proceedings.mlr.press/v235/sinii24a.html", "pdf": "https://openreview.net/pdf?id=pp3v2ch5Sd", "openreview": "https://openreview.net/forum?id=pp3v2ch5Sd", "author_site": "Viacheslav Sinii, Alexander Nikulin, Vladislav Kurenkov, Ilya Zisman, Sergey Kolesnikov", "tldr": "", "abstract": "Recently, it has been shown that transformers pre-trained on diverse datasets with multi-episode contexts can generalize to new reinforcement learning tasks in-context. A key limitation of previously proposed models is their reliance on a predefined action space size and structure. The introduction of a new action space often requires data re-collection and model re-training, which can be costly for some applications. 
In our work, we show that it is possible to mitigate this issue by proposing the Headless-AD model that, despite being trained only once, is capable of generalizing to discrete action spaces of variable size, semantic content and order. By experimenting with Bernoulli and contextual bandits, as well as a gridworld environment, we show that Headless-AD exhibits significant capability to generalize to action spaces it has never encountered, even outperforming specialized models trained for a specific set of actions on several environment configurations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Viacheslav Sinii;Alexander Nikulin;Vladislav Kurenkov;Ilya Zisman;Sergey Kolesnikov", "authorids": "~Viacheslav_Sinii1;~Alexander_Nikulin1;~Vladislav_Kurenkov1;~Ilya_Zisman1;~Sergey_Kolesnikov1", "gender": "M;M;M;Not Specified;M", "homepage": "https://t.me/identiki_t;https://howuhh.github.io/;https://vkurenkov.me;https://zis.mn/;https://scitator.com", "dblp": "351/7957;314/6349;251/9126;;191/1945", "google_scholar": "IO-blf8AAAAJ;yACvnqUAAAAJ;w09vtVsAAAAJ;tmh78sQAAAAJ;iukbpVEAAAAJ", "orcid": ";;0000-0003-4078-1086;;", "linkedin": ";;;suessmann/;scitator/", "or_profile": "~Viacheslav_Sinii1;~Alexander_Nikulin1;~Vladislav_Kurenkov1;~Ilya_Zisman1;~Sergey_Kolesnikov1", "aff": "Innopolis University;Moscow Institute of Physics and Technology;Tinkoff;Higher School of Economics;Tinkoff", "aff_domain": "innopolis.ru;mipt.edu;tinkoff.ai;hse.ru;tinkoff.ru", "position": "Undergrad student;PhD student;Researcher;MS student;Principal Researcher", "bibtex": "@inproceedings{\nsinii2024incontext,\ntitle={In-Context Reinforcement Learning for Variable Action Spaces},\nauthor={Viacheslav Sinii and Alexander Nikulin and Vladislav Kurenkov and Ilya Zisman and Sergey Kolesnikov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pp3v2ch5Sd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 915713, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12498652674903200289&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "innopolis.ru;mipt.edu;tinkoff.ai;hse.ru;tinkoff.ru", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Innopolis University;Moscow Institute of Physics and Technology;Tinkoff Bank;Higher School of Economics", "aff_unique_dep": ";;;", "aff_unique_url": "https://innopolis.ru/en;https://www.mipt.ru/en;https://www.tinkoff.ru;https://www.hse.ru", "aff_unique_abbr": "Innopolis;MIPT;Tinkoff;HSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Russian Federation" }, { "title": "The Balanced-Pairwise-Affinities Feature Transform", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33020", "id": "pspyQm4ko0", "proceeding": "https://proceedings.mlr.press/v235/shalam24a.html", "pdf": "https://openreview.net/pdf?id=pspyQm4ko0", "openreview": "https://openreview.net/forum?id=pspyQm4ko0", "author_site": "Daniel Shalam, Simon Korman", "tldr": "", "abstract": "The Balanced-Pairwise-Affinities (BPA) feature transform is designed to upgrade the features of a set of input items to facilitate downstream matching or grouping related tasks. The transformed set encodes a rich representation of high order relations between the input features. 
A particular min-cost-max-flow fractional matching problem, whose entropy regularized version can be approximated by an optimal transport (OT) optimization, leads to a transform which is efficient, differentiable, equivariant, parameterless and probabilistically interpretable. While the Sinkhorn OT solver has been adapted extensively in many contexts, we use it differently by minimizing the cost between a set of features and *itself* and using the transport plan's *rows* as the new representation. Empirically, the transform is highly effective and flexible in its use and consistently improves networks it is inserted into, in a variety of tasks and training schemes. We demonstrate state-of-the-art results in few-shot classification, unsupervised image clustering and person re-identification. Code is available at github.com/DanielShalam/BPA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Shalam;Simon Korman", "authorids": "~Daniel_Shalam1;~Simon_Korman1", "gender": "M;M", "homepage": ";http://www.cs.haifa.ac.il/~skorman/", "dblp": ";http://dblp.uni-trier.de/pers/hd/k/Korman:Simon", "google_scholar": "Ffu2TCkAAAAJ;6OZMoP0AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Daniel_Shalam1;~Simon_Korman1", "aff": "University of Haifa;University of Haifa", "aff_domain": "haifa.ac.il;haifa.ac.il", "position": "MS student;Lecturer", "bibtex": "@inproceedings{\nshalam2024the,\ntitle={The Balanced-Pairwise-Affinities Feature Transform},\nauthor={Daniel Shalam and Simon Korman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pspyQm4ko0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3146309, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bunqKQl0choJ:scholar.google.com/&scioq=The+Balanced-Pairwise-Affinities+Feature+Transform&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "haifa.ac.il;haifa.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Haifa", "aff_unique_dep": "", "aff_unique_url": "https://www.haifa.ac.il", "aff_unique_abbr": "UoH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "DisCo-Diff: Enhancing Continuous Diffusion Models with Discrete Latents", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33019", "id": "psup68MBvt", "proceeding": "https://proceedings.mlr.press/v235/xu24f.html", "pdf": "https://openreview.net/pdf?id=psup68MBvt", "openreview": "https://openreview.net/forum?id=psup68MBvt", "author_site": "Yilun Xu, Gabriele Corso, Tommi Jaakkola, Arash Vahdat, Karsten Kreis", "tldr": "", "abstract": "Diffusion models (DMs) have revolutionized generative learning. They utilize a diffusion process to encode data into a simple Gaussian distribution. However, encoding a complex, potentially multimodal data distribution into a single *continuous* Gaussian distribution arguably represents an unnecessarily challenging learning problem. We propose ***Dis**crete-**Co**ntinuous Latent Variable **Diff**usion Models (DisCo-Diff)* to simplify this task by introducing complementary *discrete* latent variables. We augment DMs with learnable discrete latents, inferred with an encoder, and train DM and encoder end-to-end. 
DisCo-Diff does not rely on pre-trained networks, making the framework universally applicable. The discrete latents significantly simplify learning the DM's complex noise-to-data mapping by reducing the curvature of the DM's generative ODE. An additional autoregressive transformer models the distribution of the discrete latents, a simple step because DisCo-Diff requires only a few discrete variables with small codebooks. We validate DisCo-Diff on toy data, several image synthesis tasks, as well as molecular docking, and find that introducing discrete latents consistently improves model performance. For example, DisCo-Diff achieves state-of-the-art FID scores on class-conditioned ImageNet-64/128 datasets with an ODE sampler.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yilun Xu;Gabriele Corso;Tommi Jaakkola;Arash Vahdat;Karsten Kreis", "authorids": "~Yilun_Xu1;~Gabriele_Corso1;~Tommi_S._Jaakkola1;~Arash_Vahdat3;~Karsten_Kreis1", "gender": "M;;;M;", "homepage": "http://yilun-xu.com;https://gcorso.github.io/;;http://latentspace.cc/;https://karstenkreis.github.io/", "dblp": ";262/6499;;92/8108;238/6834", "google_scholar": ";LUrAYgEAAAAJ;;https://scholar.google.ca/citations?user=p9-nlRIAAAAJ;https://scholar.google.de/citations?user=rFd-DiAAAAAJ", "orcid": ";;;;", "linkedin": ";gcorso/;;;karstenkreis", "or_profile": "~Yilun_Xu1;~Gabriele_Corso1;~Tommi_S._Jaakkola1;~Arash_Vahdat3;~Karsten_Kreis1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;;NVIDIA;NVIDIA", "aff_domain": "mit.edu;mit.edu;;nvidia.com;nvidia.com", "position": "PhD student;PhD student;;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nxu2024discodiff,\ntitle={DisCo-Diff: Enhancing Continuous Diffusion Models with Discrete Latents},\nauthor={Yilun Xu and Gabriele Corso and Tommi Jaakkola and Arash Vahdat and Karsten Kreis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=psup68MBvt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6385884, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15385117910782491377&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mit.edu;mit.edu;;nvidia.com;nvidia.com", "author_num": 5, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://web.mit.edu;https://www.nvidia.com", "aff_unique_abbr": "MIT;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Relational Learning in Pre-Trained Models: A Theory from Hypergraph Recovery Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33018", "id": "puSMYmHmJW", "proceeding": "https://proceedings.mlr.press/v235/chen24l.html", "pdf": "https://openreview.net/pdf?id=puSMYmHmJW", "openreview": "https://openreview.net/forum?id=puSMYmHmJW", "author_site": "Yang Chen, Cong Fang, Zhouchen Lin, Bing Liu", "tldr": "", "abstract": "Foundation Models (FMs) have demonstrated remarkable insights into the relational dynamics of the world, leading to the crucial question: *how do these models acquire an understanding of world hybrid relations?* Traditional statistical learning, particularly for prediction problems, may overlook the rich 
and inherently structured information from the data, especially regarding the relationships between objects. We introduce a mathematical model that formalizes relational learning as hypergraph recovery to study pre-training of FMs. In our framework, the world is represented as a hypergraph, with data abstracted as random samples from hyperedges. We theoretically examine the feasibility of a Pre-Trained Model (PTM) to recover this hypergraph and analyze the data efficiency in a minimax near-optimal style. By integrating rich graph theories into the realm of PTMs, our mathematical framework offers powerful tools for an in-depth understanding of pre-training from a unique perspective and can be used under various scenarios. As an example, we extend the framework to entity alignment in multimodal learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Chen;Cong Fang;Zhouchen Lin;Bing Liu", "authorids": "~Yang_Chen17;~Cong_Fang1;~Zhouchen_Lin1;~Bing_Liu1", "gender": "M;M;M;M", "homepage": "https://zero-lab-pku.github.io/personwise/chenyang/;https://congfang-ml.github.io/;https://zhouchenlin.github.io;https://www.cs.uic.edu/~liub/", "dblp": ";140/6568;l/ZhouchenLin;l/BingLiu1.html", "google_scholar": ";N2M9RPoAAAAJ;https://scholar.google.com.tw/citations?user=TanjFwoAAAAJ;Kt1bjZoAAAAJ", "orcid": ";;0000-0003-1493-7569;", "linkedin": ";;;", "or_profile": "~Yang_Chen17;~Cong_Fang1;~Zhouchen_Lin1;~Bing_Liu1", "aff": "Peking University;Peking University;Peking University;University of Illinois at Chicago", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;uic.edu", "position": "PhD student;Assistant Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nchen2024relational,\ntitle={Relational Learning in Pre-Trained Models: A Theory from Hypergraph Recovery Perspective},\nauthor={Yang Chen and Cong Fang and Zhouchen Lin and Bing Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=puSMYmHmJW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6177620, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11978670676764772524&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;uic.edu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Peking University;University of Illinois at Chicago", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.uic.edu", "aff_unique_abbr": "Peking U;UIC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "DiNADO: Norm-Disentangled Neurally-Decomposed Oracles for Controlling Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33017", "id": "pvg1OdUtDQ", "proceeding": "https://proceedings.mlr.press/v235/lu24o.html", "pdf": "https://openreview.net/pdf?id=pvg1OdUtDQ", "openreview": "https://openreview.net/forum?id=pvg1OdUtDQ", "author_site": "Sidi Lu, Wenbo Zhao, Chenyang Tao, Arpit Gupta, Shanchan Wu, Tagyoung Chung, Nanyun Peng", "tldr": "", "abstract": "NeurAlly-Decomposed Oracle (NADO) is a powerful approach for controllable generation with large language models. 
It is designed to avoid catastrophic forgetting while achieving guaranteed convergence to an entropy-maximized closed-form optimal solution with reasonable modeling capacity. Despite the success, several challenges arise when applying NADO to a wide range of scenarios. Vanilla NADO suffers from gradient vanishing for low-probability control signals and is highly reliant on a regularization to satisfy the stochastic version of the Bellman equation. In addition, the vanilla implementation of NADO introduces a few additional transformer layers, suffering from a limited capacity, especially compared to other finetune-based model adaptation methods like LoRA. In this paper, we propose an improved version of the NADO algorithm, namely DiNADO (norm-**Di**sentangled **N**eur**A**lly-**D**ecomposed **O**racles), which improves the performance of the NADO algorithm through disentangling the step-wise global norm over the approximated oracle $R$-value for all potential next-tokens, allowing DiNADO to be combined with finetuning methods like LoRA. We discuss in depth how DiNADO achieves better capacity, stability and flexibility with both empirical and theoretical results. Experiments on formality control in machine translation and the lexically constrained generation task CommonGen demonstrate the significance of the improvements.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sidi Lu;Wenbo Zhao;Chenyang Tao;Arpit Gupta;Shanchan Wu;Tagyoung Chung;Nanyun Peng", "authorids": "~Sidi_Lu1;~Wenbo_Zhao1;~Chenyang_Tao1;~Arpit_Gupta1;~Shanchan_Wu2;~Tagyoung_Chung2;~Nanyun_Peng1", "gender": "M;M;M;M;;F;M", "homepage": "https://sidilu.cn;;http://cytao.wordpress.com;;;https://violetpeng.github.io/;https://www.linkedin.com/in/wushanchan/", "dblp": "206/6156;30/5943;170/6702;;02/8178;117/4036;16/2030", "google_scholar": "KHMrrfgAAAAJ;7ImYaXEAAAAJ;;XXVjLVgAAAAJ;_-egoNcAAAAJ;XxRXvX0AAAAJ;WbHzkK8AAAAJ", "orcid": ";;;;;;", "linkedin": ";waynezhaowenbo;;;tagyoung-chung-78b86652/;;wushanchan/", "or_profile": "~Sidi_Lu1;~Wenbo_Zhao1;~Chenyang_Tao1;~Arpit_Gupta1;~Tagyoung_Chung2;~Nanyun_Peng1;~Shanchan_Wu1", "aff": "University of California, Los Angeles;Amazon;Amazon;Amazon;Amazon;University of California, Los Angeles;Samsung Research America", "aff_domain": "ucla.edu;amazon.com;amazon.com;amazon.com;amazon.com;ucla.edu;samsung.com", "position": "PhD student;Researcher;Researcher;Researcher;Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nlu2024dinado,\ntitle={Di{NADO}: Norm-Disentangled Neurally-Decomposed Oracles for Controlling Language Models},\nauthor={Sidi Lu and Wenbo Zhao and Chenyang Tao and Arpit Gupta and Shanchan Wu and Tagyoung Chung and Nanyun Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pvg1OdUtDQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 419799, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1315587376108426441&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ucla.edu;amazon.com;amazon.com;amazon.com;amazon.com;ucla.edu;samsung.com", "author_num": 7, "aff_unique_index": "0;1;1;1;1;0;2", "aff_unique_norm": "University of California, Los Angeles;Amazon;Samsung", "aff_unique_dep": ";Amazon.com, Inc.;Samsung Research America", "aff_unique_url": "https://www.ucla.edu;https://www.amazon.com;https://www.samsung.com/us/careers/research/", 
"aff_unique_abbr": "UCLA;Amazon;SRA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Latent Logic Tree Extraction for Event Sequence Explanation from LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33016", "id": "pwfcwEqdUz", "proceeding": "https://proceedings.mlr.press/v235/song24j.html", "pdf": "https://openreview.net/pdf?id=pwfcwEqdUz", "openreview": "https://openreview.net/forum?id=pwfcwEqdUz", "author_site": "Zitao Song, Chao Yang, Chaojie Wang, Bo An, Shuang Li", "tldr": "", "abstract": "Modern high-stakes systems, such as healthcare or robotics, often generate vast streaming event sequences. Our goal is to design an efficient, plug-and-play tool to elicit logic tree-based explanations from Large Language Models (LLMs) to provide customized insights into each observed event sequence. Built on the temporal point process model for events, our method employs the likelihood function as a score to evaluate generated logic trees. We propose an amortized Expectation-Maximization (EM) learning framework and treat the logic tree as latent variables. In the E-step, we evaluate the posterior distribution over the latent logic trees using an LLM prior and the likelihood of the observed event sequences. LLM provides a high-quality prior for the latent logic trees, however, since the posterior is built over a discrete combinatorial space, we cannot get the closed-form solution. We propose to generate logic tree samples from the posterior using a learnable GFlowNet, which is a diversity-seeking generator for structured discrete variables. The M-step employs the generated logic rules to approximate marginalization over the posterior, facilitating the learning of model parameters and refining the tunable LLM prior parameters. In the online setting, our locally built, lightweight model will iteratively extract the most relevant rules from LLMs for each sequence using only a few iterations. 
Empirical demonstrations showcase the promising performance and adaptability of our framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zitao Song;Chao Yang;Chaojie Wang;Bo An;Shuang Li", "authorids": "~Zitao_Song1;~Chao_Yang9;~Chaojie_Wang1;~Bo_An2;~Shuang_Li3", "gender": "M;M;M;M;F", "homepage": "https://tsedao.github.io/;https://github.com/yangchaoforthree;https://chaojiewang94.github.io/;https://personal.ntu.edu.sg/boan/;https://shuangli01.github.io", "dblp": ";;134/9314-1;42/6178-1.html;43/6294-2", "google_scholar": "RATrbJUAAAAJ;;https://scholar.google.com/citations?hl=en;PEEpuNwAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-4646-0339;;;0000-0002-7064-7438;", "linkedin": ";;;;", "or_profile": "~Zitao_Song1;~Chao_Yang9;~Chaojie_Wang1;~Bo_An2;~Shuang_Li3", "aff": "Nanyang Technological University;The Chinese University of Hong Kong, Shenzhen;Skywork AI;Nanyang Technological University;The Chinese University of Hong Kong (Shenzhen)", "aff_domain": "ntu.edu.sg;cuhk.edu.cn;kunlun-inc.com;ntu.edu.sg;cuhk.edu.cn", "position": "Researcher;PhD student;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nsong2024latent,\ntitle={Latent Logic Tree Extraction for Event Sequence Explanation from {LLM}s},\nauthor={Zitao Song and Chao Yang and Chaojie Wang and Bo An and Shuang Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pwfcwEqdUz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 769807, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16110359738463555990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ntu.edu.sg;cuhk.edu.cn;kunlun-inc.com;ntu.edu.sg;cuhk.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "Nanyang Technological University;Chinese University of Hong Kong;Skywork AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.cuhk.edu.cn;https://www.skywork.ai", "aff_unique_abbr": "NTU;CUHK;Skywork AI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;2;0;1", "aff_country_unique": "Singapore;China;United States" }, { "title": "Probabilistic Routing for Graph-Based Approximate Nearest Neighbor Search", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33015", "id": "pz4B2kHVKo", "proceeding": "https://proceedings.mlr.press/v235/lu24l.html", "pdf": "https://openreview.net/pdf?id=pz4B2kHVKo", "openreview": "https://openreview.net/forum?id=pz4B2kHVKo", "author_site": "Kejing Lu, Chuan Xiao, Yoshiharu Ishikawa", "tldr": "", "abstract": "Approximate nearest neighbor search (ANNS) in high-dimensional spaces is a pivotal challenge in the field of machine learning. In recent years graph-based methods have emerged as the superior approach to ANNS, establishing a new state of the art. Although various optimizations for graph-based ANNS have been introduced, they predominantly rely on heuristic methods that lack formal theoretical backing. This paper aims to enhance routing within graph-based ANNS by introducing a method that offers a probabilistic guarantee when exploring a node\u2019s neighbors in the graph. We formulate the problem as probabilistic routing and develop two baseline strategies by incorporating locality-sensitive techniques. 
Subsequently, we introduce PEOs, a novel approach that efficiently identifies which neighbors in the graph should be considered for exact distance computation, thus significantly improving efficiency in practice. Our experiments demonstrate that equipping a commonly utilized graph index (HNSW) with PEOs can increase throughput by a factor of 1.6 to 2.5, and its efficiency consistently outperforms the leading-edge routing technique by 1.1 to 1.4 times. The code and datasets used for our evaluations are publicly accessible at https://github.com/ICML2024-code/PEOs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kejing Lu;Chuan Xiao;Yoshiharu Ishikawa", "authorids": "~Kejing_Lu1;~Chuan_Xiao2;~Yoshiharu_Ishikawa1", "gender": "M;M;M", "homepage": ";https://sites.google.com/site/chuanxiao1983/;https://www.db.is.i.nagoya-u.ac.jp/~ishikawa/", "dblp": "223/5242.html;57/4384-1.html;i/YoshiharuIshikawa", "google_scholar": "a8RE6F8AAAAJ;15bmyOkAAAAJ;https://scholar.google.co.jp/citations?user=jCJDleAAAAAJ", "orcid": ";0000-0001-7239-5134;0000-0003-3875-3262", "linkedin": ";chuan-xiao-8352071a0/;yoshiharu-ishikawa-b08500b8/", "or_profile": "~Kejing_Lu1;~Chuan_Xiao2;~Yoshiharu_Ishikawa1", "aff": "Nagoya University;Osaka University;Nagoya University", "aff_domain": "nagoya-u.ac.jp;osaka-u.ac.jp;nagoya-u.ac.jp", "position": "Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nlu2024probabilistic,\ntitle={Probabilistic Routing for Graph-Based Approximate Nearest Neighbor Search},\nauthor={Kejing Lu and Chuan Xiao and Yoshiharu Ishikawa},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pz4B2kHVKo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1190972, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3570813275562629010&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "nagoya-u.ac.jp;osaka-u.ac.jp;nagoya-u.ac.jp", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Nagoya University;Osaka University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nagoya-u.ac.jp;https://www.osaka-u.ac.jp", "aff_unique_abbr": "Nagoya U;Osaka U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "Disentanglement Learning via Topology", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33014", "id": "q0lxAs5GGO", "proceeding": "https://proceedings.mlr.press/v235/balabin24a.html", "pdf": "https://openreview.net/pdf?id=q0lxAs5GGO", "openreview": "https://openreview.net/forum?id=q0lxAs5GGO", "author_site": "Nikita Balabin, Daria Voronkova, Ilya Trofimov, Evgeny Burnaev, Serguei Barannikov", "tldr": "", "abstract": "We propose TopDis (Topological Disentanglement), a method for learning disentangled representations by adding a multi-scale topological loss term. Disentanglement is a crucial property of data representations, substantial for the explainability and robustness of deep learning models and a step towards high-level cognition. The state-of-the-art methods are based on VAE and encourage the joint distribution of latent variables to be factorized. We take a different perspective on disentanglement by analyzing topological properties of data manifolds. In particular, we optimize the topological similarity for data manifold traversals.
To the best of our knowledge, our paper is the first one to propose a differentiable topological loss for disentanglement learning. Our experiments have shown that the proposed TopDis loss improves disentanglement scores such as MIG, FactorVAE score, SAP score, and DCI disentanglement score with respect to state-of-the-art results while preserving the reconstruction quality. Our method works in an unsupervised manner, permitting us to apply it to problems without labeled factors of variation. The TopDis loss works even when factors of variation are correlated. Additionally, we show how to use the proposed topological loss to find disentangled directions in a trained GAN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikita Balabin;Daria Voronkova;Ilya Trofimov;Evgeny Burnaev;Serguei Barannikov", "authorids": "~Nikita_Balabin1;~Daria_Voronkova1;~Ilya_Trofimov1;~Evgeny_Burnaev1;~Serguei_Barannikov1", "gender": "M;;;M;", "homepage": ";;;http://faculty.skoltech.ru/people/evgenyburnaev;", "dblp": "310/1857;;130/0370;144/7845;255/5203", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.ru/citations?user=V1c6KjgAAAAJ;https://scholar.google.ru/citations?user=pCRdcOwAAAAJ;https://scholar.google.fr/citations?user=-soT8KcAAAAJ", "orcid": ";;0000-0002-2961-7368;0000-0001-8424-0690;0000-0002-9323-0651", "linkedin": "nikita-balabin-10455b17a/;;https://ru.linkedin.com/in/ilya-trofimov-ba122748;;", "or_profile": "~Nikita_Balabin1;~Daria_Voronkova1;~Ilya_Trofimov1;~Evgeny_Burnaev1;~Serguei_Barannikov1", "aff": "Skolkovo Institute of Science and Technology;;Skoltech;Skolkovo Institute of Science and Technology;CNRS, Institut Mathematiques de Jussieu, Paris Diderot University", "aff_domain": "skoltech.ru;;skoltech.ru;skoltech.ru;imj-prg.fr", "position": "PhD student;;Research scientist;Full Professor;Researcher", "bibtex": "@inproceedings{\nbalabin2024disentanglement,\ntitle={Disentanglement Learning via Topology},\nauthor={Nikita Balabin and Daria Voronkova and Ilya Trofimov and Evgeny Burnaev and Serguei Barannikov},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q0lxAs5GGO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9138021, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15099455684142021084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "skoltech.ru;;skoltech.ru;skoltech.ru;imj-prg.fr", "author_num": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Skolkovo Institute of Science and Technology;Paris Diderot University", "aff_unique_dep": ";Institut Mathematiques de Jussieu", "aff_unique_url": "https://www.skoltech.ru;https://www.univ-paris-diderot.fr", "aff_unique_abbr": "Skoltech;Paris Diderot", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Russian Federation;France" }, { "title": "Sparse-to-dense Multimodal Image Registration via Multi-Task Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33013", "id": "q0vILV7zAw", "proceeding": "https://proceedings.mlr.press/v235/zhang24ar.html", "pdf": "https://openreview.net/pdf?id=q0vILV7zAw", "openreview": "https://openreview.net/forum?id=q0vILV7zAw", "author_site": "Kaining Zhang, Jiayi Ma", "tldr": "", "abstract": "Aligning image pairs captured by different 
sensors or those undergoing significant appearance changes is crucial for various computer vision and robotics applications. Existing approaches cope with this problem via either Sparse feature Matching (SM) or Dense direct Alignment (DA) paradigms. Sparse methods are efficient but lack accuracy in textureless scenes, while dense ones are more accurate in all scenes but demand good initialization. In this paper, we propose SDME, a Sparse-to-Dense Multimodal feature Extractor based on a novel multi-task network that simultaneously predicts SM and DA features for robust multimodal image registration. We propose the sparse-to-dense registration paradigm: we first perform initial registration via SM and then refine the result via DA. By using the well-designed SDME, the sparse-to-dense approach combines the merits of both SM and DA. Extensive experiments on MSCOCO, GoogleEarth, VIS-NIR and VIS-IR-drone datasets demonstrate that our method achieves remarkable performance in multimodal cases. Furthermore, our approach exhibits robust generalization capabilities, enabling the fine-tuning of models initially trained on single-modal datasets for use with smaller multimodal datasets. Our code is available at https://github.com/KN-Zhang/SDME.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaining Zhang;Jiayi Ma", "authorids": "~Kaining_Zhang2;~Jiayi_Ma2", "gender": "F;M", "homepage": "https://kn-zhang.github.io/;https://sites.google.com/site/jiayima2013/home", "dblp": ";96/9989", "google_scholar": "https://scholar.google.com.hk/citations?user=bmnZN8sAAAAJ;73trMQkAAAAJ", "orcid": ";0000-0003-3264-3265", "linkedin": ";", "or_profile": "~Kaining_Zhang2;~Jiayi_Ma2", "aff": "Wuhan University;Wuhan University", "aff_domain": "whu.edu.cn;whu.edu.cn", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nzhang2024sparsetodense,\ntitle={Sparse-to-dense Multimodal Image Registration via Multi-Task Learning},\nauthor={Kaining Zhang and Jiayi Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q0vILV7zAw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1929644, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1242052589079078664&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 5, "email": "whu.edu.cn;whu.edu.cn", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "An Effective Dynamic Gradient Calibration Method for Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33012", "id": "q14AbM4kdv", "proceeding": "https://proceedings.mlr.press/v235/lin24a.html", "pdf": "https://openreview.net/pdf?id=q14AbM4kdv", "openreview": "https://openreview.net/forum?id=q14AbM4kdv", "author_site": "Weichen Lin, Jiaxiang Chen, Ruomin Huang, Hu Ding", "tldr": "", "abstract": "Continual learning (CL) is a fundamental topic in machine learning, where the goal is to train a model with continuously incoming data and tasks.
Due to the memory limit, we cannot store all the historical data, and therefore confront the ``catastrophic forgetting'' problem, i.e., the performance on previous tasks can substantially decrease because the relevant information is missing in later training periods. Though a number of elegant methods have been proposed, the catastrophic forgetting phenomenon still cannot be fully avoided in practice. In this paper, we study the problem from the gradient perspective, where our aim is to develop an effective algorithm to calibrate the gradient in each updating step of the model; namely, our goal is to guide the model to be updated in the right direction when a large amount of historical data is unavailable. Our idea is partly inspired by the seminal stochastic variance reduction methods (e.g., SVRG and SAGA) for reducing the variance of gradient estimation in stochastic gradient descent algorithms. Another benefit is that our approach can be used as a general tool, which can be incorporated into several existing popular CL methods to achieve better performance. We also conduct a set of experiments on several benchmark datasets to evaluate the performance in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weichen Lin;Jiaxiang Chen;Ruomin Huang;Hu Ding", "authorids": "~Weichen_Lin1;~Jiaxiang_Chen2;~Ruomin_Huang1;~Hu_Ding1", "gender": "M;M;M;M", "homepage": "https://github.com/Tempest854;;https://fockee.github.io/;https://hu-ding.github.io/index.html", "dblp": ";319/7722;296/9683;74/9794", "google_scholar": ";;YjthT-kAAAAJ;D1-liJEAAAAJ", "orcid": ";0000-0002-5830-4088;;", "linkedin": ";;;", "or_profile": "~Weichen_Lin1;~Jiaxiang_Chen2;~Ruomin_Huang1;~Hu_Ding1", "aff": "University of Science and Technology of China;University of Science and Technology of China;Duke University;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;duke.edu;ustc.edu.cn", "position": "MS student;MS student;PhD student;Professor", "bibtex": "@inproceedings{\nlin2024an,\ntitle={An Effective Dynamic Gradient Calibration Method for Continual Learning},\nauthor={Weichen Lin and Jiaxiang Chen and Ruomin Huang and Hu Ding},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q14AbM4kdv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3245997, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5126435974398445593&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;ustc.edu.cn;duke.edu;ustc.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Science and Technology of China;Duke University", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.duke.edu", "aff_unique_abbr": "USTC;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Classification Under Strategic Self-Selection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33011", "id": "q3Bz1TVTq4", "proceeding": "https://proceedings.mlr.press/v235/horowitz24a.html", "pdf": "https://openreview.net/pdf?id=q3Bz1TVTq4", "openreview": "https://openreview.net/forum?id=q3Bz1TVTq4", "author_site": "Guy Horowitz, Yonatan Sommer, Moran Koren, Nir Rosenfeld", "tldr": "", "abstract":
"When users stand to gain from certain predictive outcomes, they are prone to act strategically to obtain predictions that are favorable. Most current works consider strategic behavior that manifests as users modifying their features; instead, we study a novel setting in which users decide whether to even participate (or not), this in response to the learned classifier. Considering learning approaches of increasing strategic awareness, we investigate the effects of user self-selection on learning, and the implications of learning on the composition of the self-selected population. Building on this, we propose a differentiable framework for learning under self-selective behavior, which can be optimized effectively. We conclude with experiments on real data and simulated behavior that complement our analysis and demonstrate the utility of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guy Horowitz;Yonatan Sommer;Moran Koren;Nir Rosenfeld", "authorids": "~Guy_Horowitz1;s.yonatan@campus.technion.ac.il;~Moran_Koren1;~Nir_Rosenfeld2", "gender": "M;;;M", "homepage": "https://www.linkedin.com/in/guy-horowitz;;http://mkoren.org;https://nirr.cswp.cs.technion.ac.il", "dblp": ";;;145/9800", "google_scholar": ";;;WTlgnYkAAAAJ", "orcid": "0009-0001-5093-7235;;0000-0003-0012-0208;", "linkedin": ";;;", "or_profile": "~Guy_Horowitz1;s.yonatan@campus.technion.ac.il;~Moran_Koren1;~Nir_Rosenfeld2", "aff": "Amazon;;Harvard University;Technion, Technion", "aff_domain": "amazon.com;;harvard.edu;technion.ac.il", "position": "Intern;;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nhorowitz2024classification,\ntitle={Classification Under Strategic Self-Selection},\nauthor={Guy Horowitz and Yonatan Sommer and Moran Koren and Nir Rosenfeld},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q3Bz1TVTq4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3438728, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8556210622976997019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "amazon.com;;harvard.edu;technion.ac.il", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Amazon;Harvard University;Technion - Israel Institute of Technology", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.harvard.edu;https://www.technion.ac.il/en/", "aff_unique_abbr": "Amazon;Harvard;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Israel" }, { "title": "Disguised Copyright Infringement of Latent Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33010", "id": "q5Bg858Hef", "proceeding": "https://proceedings.mlr.press/v235/lu24m.html", "pdf": "https://openreview.net/pdf?id=q5Bg858Hef", "openreview": "https://openreview.net/forum?id=q5Bg858Hef", "author_site": "Yiwei Lu, Matthew Yang, Zuoqiu Liu, Gautam Kamath, Yaoliang Yu", "tldr": "", "abstract": "Copyright infringement may occur when a generative model produces samples substantially similar to some copyrighted data that it had access to during the training phase. The notion of access usually refers to including copyrighted samples directly in the training dataset, which one may inspect to identify an infringement. 
We argue that such visual auditing largely overlooks a concealed copyright infringement, where one constructs a disguise that looks drastically different from the copyrighted sample yet still induces the effect of training Latent Diffusion Models on it. Such disguises only require indirect access to the copyrighted material and cannot be visually distinguished, thus easily circumventing the current auditing tools. In this paper, we provide a better understanding of such disguised copyright infringement by uncovering the disguise generation algorithm, the revelation of the disguises, and importantly, how to detect them to augment the existing toolbox. Additionally, we introduce a broader notion of acknowledgment for comprehending such indirect access.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiwei Lu;Matthew Y. R. Yang;Zuoqiu Liu;Gautam Kamath;Yaoliang Yu", "authorids": "~Yiwei_Lu1;~Matthew_Y._R._Yang1;~Zuoqiu_Liu1;~Gautam_Kamath1;~Yaoliang_Yu1", "gender": "M;;M;M;M", "homepage": "https://cs.uwaterloo.ca/~y485lu/;;https://www.linkedin.com/in/robert-zuoqiu-liu/;http://www.gautamkamath.com/;https://cs.uwaterloo.ca/~y328yu/", "dblp": ";;;73/11140;90/4989", "google_scholar": "ke0k9PkAAAAJ;xDNZlnMAAAAJ;Cl73CgcAAAAJ;MK6zHkYAAAAJ;https://scholar.google.ca/citations?user=zbXIQMsAAAAJ", "orcid": ";;;;0000-0002-3823-0720", "linkedin": ";;robert-zuoqiu-liu/;;", "or_profile": "~Yiwei_Lu1;~Matthew_Y._R._Yang1;~Zuoqiu_Liu1;~Gautam_Kamath1;~Yaoliang_Yu1", "aff": "University of Waterloo;University of Waterloo;University of Waterloo;University of Waterloo;University of Waterloo", "aff_domain": "uwaterloo.ca;uwaterloo.ca;uwaterloo.ca;uwaterloo.ca;uwaterloo.ca", "position": "PhD student;Undergrad student;Undergrad student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nlu2024disguised,\ntitle={Disguised Copyright Infringement of Latent Diffusion Models},\nauthor={Yiwei Lu and Matthew Y. R. Yang and Zuoqiu Liu and Gautam Kamath and Yaoliang Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q5Bg858Hef}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5106853, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8015525236042477123&as_sdt=805&sciodt=0,3&hl=en", "gs_version_total": 7, "email": "uwaterloo.ca;uwaterloo.ca;uwaterloo.ca;uwaterloo.ca;uwaterloo.ca", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Waterloo", "aff_unique_dep": "", "aff_unique_url": "https://uwaterloo.ca", "aff_unique_abbr": "UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Byzantine Resilient and Fast Federated Few-Shot Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33009", "id": "q5q59s2WJy", "proceeding": "https://proceedings.mlr.press/v235/singh24f.html", "pdf": "https://openreview.net/pdf?id=q5q59s2WJy", "openreview": "https://openreview.net/forum?id=q5q59s2WJy", "author_site": "Ankit Pratap Singh, Namrata Vaswani", "tldr": "", "abstract": "This work introduces a Byzantine-resilient solution for learning a low-dimensional linear representation. Our main contribution is the development of a provably Byzantine-resilient AltGDmin algorithm for solving this problem in a federated setting.
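As a rough illustration of the setting (under assumptions, not the paper's algorithm), one Byzantine-robust federated update for a shared subspace factor can use coordinate-wise median aggregation in place of averaging; AltGDmin's actual update and aggregation rules may differ:

```python
import numpy as np

def byzantine_resilient_step(U, node_grads, lr=0.1):
    """One federated update sketch for the shared subspace factor U: aggregate
    per-node gradients with a coordinate-wise median (robust to a minority of
    arbitrary, i.e. Byzantine, contributions), step, and re-orthonormalize."""
    G = np.median(np.stack(node_grads), axis=0)
    Q, _ = np.linalg.qr(U - lr * G)  # keep U on the Stiefel manifold
    return Q

# Toy usage: 9 honest nodes plus 2 Byzantine nodes sending wild gradients.
rng = np.random.default_rng(1)
U = np.linalg.qr(rng.normal(size=(20, 3)))[0]
honest = [rng.normal(scale=0.01, size=(20, 3)) for _ in range(9)]
byzantine = [rng.normal(scale=100.0, size=(20, 3)) for _ in range(2)]
print(byzantine_resilient_step(U, honest + byzantine).shape)  # (20, 3)
```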
We argue that our solution is sample-efficient, fast, and communication-efficient. In solving this problem, we also introduce a novel secure solution to the federated subspace learning meta-problem that occurs in many different applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ankit Pratap Singh;Namrata Vaswani", "authorids": "~Ankit_Pratap_Singh1;~Namrata_Vaswani1", "gender": "M;", "homepage": ";https://www.ece.iastate.edu/~namrata/", "dblp": ";", "google_scholar": ";s-dQPO8AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Ankit_Pratap_Singh1;~Namrata_Vaswani1", "aff": "Iowa State University;Iowa State University", "aff_domain": "iastate.edu;iastate.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nsingh2024byzantine,\ntitle={Byzantine Resilient and Fast Federated Few-Shot Learning},\nauthor={Ankit Pratap Singh and Namrata Vaswani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q5q59s2WJy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 507196, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7655878951580091654&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "iastate.edu;iastate.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "MLIP: Efficient Multi-Perspective Language-Image Pretraining with Exhaustive Data Utilization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33008", "id": "q6fXuPLpao", "proceeding": "https://proceedings.mlr.press/v235/zhang24cb.html", "pdf": "https://openreview.net/pdf?id=q6fXuPLpao", "openreview": "https://openreview.net/forum?id=q6fXuPLpao", "author_site": "Yu Zhang, Qi Zhang, Zixuan Gong, Yiwei Shi, Yepeng Liu, Duoqian Miao, Yang Liu, KE LIU, Kun Yi, Wei Fan, Liang Hu, Changwei Wang", "tldr": "", "abstract": "Contrastive Language-Image Pretraining (CLIP) has achieved remarkable success, leading to rapid advancements in multimodal studies. However, CLIP faces a notable challenge in terms of *inefficient data utilization*. It relies on a single contrastive supervision for each image-text pair during representation learning, disregarding a substantial amount of valuable information that could offer richer supervision. Additionally, the retention of non-informative tokens leads to increased computational demands and time costs, particularly in CLIP's ViT image encoder. To address these issues, we propose **M**ulti-Perspective **L**anguage-**I**mage **P**retraining (**MLIP**). In MLIP, we leverage the frequency transform's sensitivity to both high and low-frequency variations, which complements the spatial domain's sensitivity limited to low-frequency variations only. By incorporating frequency transforms and token-level alignment, we expand CLIP's single supervision into multi-domain and multi-level supervision, enabling a more thorough exploration of informative image features. Additionally, we introduce a token merging method guided by comprehensive semantics from the frequency and spatial domains.
This allows us to merge tokens into multi-granularity tokens with a controllable compression rate to accelerate CLIP. Extensive experiments validate the effectiveness of our design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Zhang;Qi Zhang;Zixuan Gong;Yiwei Shi;Yepeng Liu;Duoqian Miao;Yang Liu;KE LIU;Kun Yi;Wei Fan;Liang Hu;Changwei Wang", "authorids": "~Yu_Zhang60;~Qi_Zhang25;~Zixuan_Gong2;~Yiwei_Shi1;~Yepeng_Liu1;~Duoqian_Miao1;~Yang_Liu127;~KE_LIU10;~Kun_Yi2;~Wei_Fan6;~Liang_Hu1;~Changwei_Wang2", "gender": ";M;M;;;M;;;;M;M;", "homepage": ";https://sites.google.com/view/qizhang-bit-uts/home;https://github.com/gongzix;;;https://iip.tongji.edu.cn;https://github.com/yangyangyang0007;;https://github.com/aikunyi;https://weifan.site/;https://sites.google.com/view/lianghu/home;", "dblp": ";52/323-20;363/7481;;;90/1041-1;;;202/8470-1;54/3488-10;48/5388-4;", "google_scholar": ";8UAk1p4AAAAJ;https://scholar.google.com/citations?hl=en;;;;;;MhMZcIEAAAAJ;cQ8zLJ4AAAAJ;https://scholar.google.com.au/citations?user=cj6wAgYAAAAJ;", "orcid": ";0000-0002-1037-1361;;;;0000-0001-6588-1468;;;0000-0002-9980-6033;0000-0001-7656-445X;;", "linkedin": ";;;;;;;;;;;", "or_profile": "~Yu_Zhang60;~Qi_Zhang25;~Zixuan_Gong2;~Yiwei_Shi1;~Yepeng_Liu1;~Duoqian_Miao1;~Yang_Liu127;~KE_LIU10;~Kun_Yi2;~Wei_Fan6;~Liang_Hu1;~Changwei_Wang2", "aff": ";Tongji University;;;;Tongji University;;;Beijing Institute of Technology;University of Oxford;Tongji University;", "aff_domain": ";tongji.edu.cn;;;;tongji.edu.cn;;;bit.edu.cn;ox.ac.uk;tongji.edu.cn;", "position": ";Researcher;;;;Full Professor;;;PhD student;Postdoc Researcher;Full Professor;", "bibtex": "@inproceedings{\nzhang2024mlip,\ntitle={{MLIP}: Efficient Multi-Perspective Language-Image Pretraining with Exhaustive Data Utilization},\nauthor={Yu Zhang and Qi Zhang and Zixuan Gong and Yiwei Shi and Yepeng Liu and Duoqian Miao and Yang Liu and KE LIU and Kun Yi and Wei Fan and Liang Hu and Changwei Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=q6fXuPLpao}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7233097, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16793974533078878021&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";tongji.edu.cn;;;;tongji.edu.cn;;;bit.edu.cn;ox.ac.uk;tongji.edu.cn;", "author_num": 12, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Tongji University;Beijing Institute of Technology;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tongji.edu.cn;http://www.bit.edu.cn/;https://www.ox.ac.uk", "aff_unique_abbr": "Tongji;BIT;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "tinyBenchmarks: evaluating LLMs with fewer examples", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33007", "id": "qAml3FpfhG", "proceeding": "https://proceedings.mlr.press/v235/maia-polo24a.html", "pdf": "https://openreview.net/pdf?id=qAml3FpfhG", "openreview": "https://openreview.net/forum?id=qAml3FpfhG", "author_site": "Felipe Maia Polo, Lucas Weber, Leshem Choshen, Yuekai Sun, Gongjun Xu, Mikhail Yurochkin", "tldr": "", "abstract": "The versatility of large language models (LLMs) has led to the creation of diverse benchmarks that thoroughly test a variety
of language models\u2019 abilities. These benchmarks consist of tens of thousands of examples making evaluation of LLMs very expensive. In this paper, we investigate strategies to reduce the number of evaluations needed to assess the performance of an LLM on several key benchmarks. For example, we show that to accurately estimate the performance of an LLM on MMLU, a popular multiple-choice QA benchmark consisting of 14K examples, it is sufficient to evaluate this LLM on 100 curated examples. We release evaluation tools and tiny versions of popular benchmarks: Open LLM Leaderboard, MMLU, HELM, and AlpacaEval 2.0. Our empirical analysis demonstrates that these tools and tiny benchmarks are sufficient to reliably and efficiently reproduce the original evaluation results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Felipe Maia Polo;Lucas Weber;Leshem Choshen;Yuekai Sun;Gongjun Xu;Mikhail Yurochkin", "authorids": "~Felipe_Maia_Polo1;~Lucas_Weber1;~Leshem_Choshen1;~Yuekai_Sun1;~Gongjun_Xu1;~Mikhail_Yurochkin1", "gender": "M;M;Not Specified;;;M", "homepage": "https://felipemaiapolo.github.io/;https://lucweber.github.io/;https://ktilana.wixsite.com/leshem-choshen;https://yuekai.github.io/;https://sites.google.com/umich.edu/gongjunxu;https://moonfolk.github.io/", "dblp": "261/9581;;218/5237;;;191/6719", "google_scholar": "CJbgmnkAAAAJ;LGnAvXkAAAAJ;https://scholar.google.com/citations?hl=en;6T1XtW8AAAAJ;;QjBF9sUAAAAJ", "orcid": "0000-0002-4950-2795;;0000-0002-0085-6496;;;", "linkedin": ";lucasweber0/;leshemchoshen/;;;mikhail-yurochkin-a45659114/", "or_profile": "~Felipe_Maia_Polo1;~Lucas_Weber1;~Leshem_Choshen1;~Yuekai_Sun1;~Gongjun_Xu1;~Mikhail_Yurochkin1", "aff": "University of Michigan - Ann Arbor;Universitat Pompeu Fabra;International Business Machines;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;IBM Research", "aff_domain": "umich.edu;upf.es;ibm.com;umich.edu;umich.edu;ibm.com", "position": "PhD student;PhD student;Researcher;Assistant \u2192 Associate Professor of Statistics;Associate Professor;Researcher", "bibtex": "@inproceedings{\npolo2024tinybenchmarks,\ntitle={tinyBenchmarks: evaluating {LLM}s with fewer examples},\nauthor={Felipe Maia Polo and Lucas Weber and Leshem Choshen and Yuekai Sun and Gongjun Xu and Mikhail Yurochkin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qAml3FpfhG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6431815, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=39608789700587146&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": "umich.edu;upf.es;ibm.com;umich.edu;umich.edu;ibm.com", "author_num": 6, "aff_unique_index": "0;1;2;0;0;3", "aff_unique_norm": "University of Michigan;Universitat Pompeu Fabra;International Business Machines Corporation;IBM", "aff_unique_dep": ";;;IBM Research", "aff_unique_url": "https://www.umich.edu;https://www.upf.edu/;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "UM;UPF;IBM;IBM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;Spain" }, { "title": "ACM-MILP: Adaptive Constraint Modification via Grouping and Selection for Hardness-Preserving MILP Instance Generation", "status": "Spotlight", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/33006", "id": "qDAAMmGsGw", "proceeding": "https://proceedings.mlr.press/v235/guo24d.html", "pdf": "https://openreview.net/pdf?id=qDAAMmGsGw", "openreview": "https://openreview.net/forum?id=qDAAMmGsGw", "author_site": "Ziao Guo, Yang Li, Chang Liu, Wenli Ouyang, Junchi Yan", "tldr": "", "abstract": "Data plays a pivotal role in the development of both classic and learning-based methods for Mixed-Integer Linear Programming (MILP). However, the scarcity of data in real-world applications underscores the necessity for MILP instance generation methods. Currently, these methods primarily rely on iterating random single-constraint modifications, disregarding the underlying problem structure with constraint interrelations, thereby leading to compromised quality and solvability. In this paper, we propose ACM-MILP, a framework for MILP instance generation, to achieve adaptive constraint modification and constraint interrelation modeling. It employs an adaptive constraint selection mechanism based on probability estimation within the latent space to preserve instance characteristics. Meanwhile, it detects and groups strongly related constraints through community detection, enabling collective modifications that account for constraint dependencies. Experimental results show significant improvements in problem-solving hardness similarity under our framework. Additionally, in the downstream task, we showcase the efficacy of our generated instances for hyperparameter tuning. Source code is available: https://github.com/Thinklab-SJTU/ACM-MILP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziao Guo;Yang Li;Chang Liu;Wenli Ouyang;Junchi Yan", "authorids": "~Ziao_Guo1;~Yang_Li32;~Chang_Liu7;~Wenli_Ouyang1;~Junchi_Yan2", "gender": "M;M;M;;M", "homepage": "https://github.com/ziao-guo;https://yangco-le.github.io;https://only-changer.github.io/;;http://thinklab.sjtu.edu.cn/", "dblp": "312/4575;;52/5716;243/3171;60/7949.html", "google_scholar": ";ecE0xDIAAAAJ;BTu8eaQAAAAJ;https://scholar.google.com/citations?hl=en;ga230VoAAAAJ", "orcid": ";0000-0002-5249-3471;;;0000-0001-9639-7679", "linkedin": ";;;wenli-ouyang-07891641;", "or_profile": "~Ziao_Guo1;~Yang_Li32;~Chang_Liu7;~Wenli_Ouyang1;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Lenovo Research ;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;lenovo.com;sjtu.edu.cn", "position": "PhD student;PhD student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nguo2024acmmilp,\ntitle={{ACM}-{MILP}: Adaptive Constraint Modification via Grouping and Selection for Hardness-Preserving {MILP} Instance Generation},\nauthor={Ziao Guo and Yang Li and Chang Liu and Wenli Ouyang and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qDAAMmGsGw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 917848, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8083385536886342093&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;lenovo.com;sjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Lenovo", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.lenovo.com", 
"aff_unique_abbr": "SJTU;Lenovo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Model-Based Minimum Bayes Risk Decoding for Text Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33005", "id": "qDUaH9xHVV", "proceeding": "https://proceedings.mlr.press/v235/jinnai24a.html", "pdf": "https://openreview.net/pdf?id=qDUaH9xHVV", "openreview": "https://openreview.net/forum?id=qDUaH9xHVV", "author_site": "Yuu Jinnai, Tetsuro Morimura, Ukyo Honda, Kaito Ariu, Kenshi Abe", "tldr": "", "abstract": "Minimum Bayes Risk (MBR) decoding has been shown to be a powerful alternative to beam search decoding in a variety of text generation tasks. MBR decoding selects a hypothesis from a pool of hypotheses that has the least expected risk under a probability model according to a given utility function. Since it is impractical to compute the expected risk exactly over all possible hypotheses, two approximations are commonly used in MBR. First, it integrates over a sampled set of hypotheses rather than over all possible hypotheses. Second, it estimates the probability of each hypothesis using a Monte Carlo estimator. While the first approximation is necessary to make it computationally feasible, the second is not essential since we typically have access to the model probability at inference time. We propose model-based MBR (MBMBR), a variant of MBR that uses the model probability itself as the estimate of the probability distribution instead of the Monte Carlo estimate. We show analytically and empirically that the model-based estimate is more promising than the Monte Carlo estimate in text generation tasks. Our experiments show that MBMBR outperforms MBR in several text generation tasks, both with encoder-decoder models and with language models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuu Jinnai;Tetsuro Morimura;Ukyo Honda;Kaito Ariu;Kenshi Abe", "authorids": "~Yuu_Jinnai1;~Tetsuro_Morimura1;~Ukyo_Honda1;~Kaito_Ariu1;~Kenshi_Abe1", "gender": "M;M;M;M;M", "homepage": "https://jinnaiyuu.github.io;;https://ukyh.github.io/;https://researchmap.jp/ariu?lang=en;https://bakanaouji.github.io/", "dblp": "178/8539;36/1501;220/2007;229/7578;254/2763", "google_scholar": "H0MaUNIAAAAJ;https://scholar.google.co.jp/citations?user=IgjF21EAAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.co.jp/citations?user=4zXjxhsAAAAJ;rImmohoAAAAJ", "orcid": ";;0000-0002-4894-9886;;", "linkedin": ";;;;", "or_profile": "~Yuu_Jinnai1;~Tetsuro_Morimura1;~Ukyo_Honda1;~Kaito_Ariu1;~Kenshi_Abe1", "aff": "CyberAgent, Inc.;CyberAgent, Inc.;CyberAgent, Inc.;CyberAgent, Inc.;CyberAgent, Inc.", "aff_domain": "cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp", "position": "Researcher;Researcher;Researcher;Research Scientist;Research scientist", "bibtex": "@inproceedings{\njinnai2024modelbased,\ntitle={Model-Based Minimum Bayes Risk Decoding for Text Generation},\nauthor={Yuu Jinnai and Tetsuro Morimura and Ukyo Honda and Kaito Ariu and Kenshi Abe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qDUaH9xHVV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 789504, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=17806908606393858455&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 8, "email": "cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp;cyberagent.co.jp", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "CyberAgent", "aff_unique_dep": "", "aff_unique_url": "https://www.cyberagent.co.jp", "aff_unique_abbr": "CyberAgent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Sample-Efficient Robust Multi-Agent Reinforcement Learning in the Face of Environmental Uncertainty", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33004", "id": "qDw4FxMubj", "proceeding": "https://proceedings.mlr.press/v235/shi24d.html", "pdf": "https://openreview.net/pdf?id=qDw4FxMubj", "openreview": "https://openreview.net/forum?id=qDw4FxMubj", "author_site": "Laixi Shi, Eric Mazumdar, Yuejie Chi, Adam Wierman", "tldr": "", "abstract": "To overcome the sim-to-real gap in reinforcement learning (RL), learned policies must maintain robustness against environmental uncertainties. While robust RL has been widely studied in single-agent regimes, in multi-agent environments, the problem remains understudied---despite the fact that the problems posed by environmental uncertainties are often exacerbated by strategic interactions. This work focuses on learning in distributionally robust Markov games (RMGs), a robust variant of standard Markov games, wherein each agent aims to learn a policy that maximizes its own worst-case performance when the deployed environment deviates within its own prescribed uncertainty set. This results in a set of robust equilibrium strategies for all agents that align with classic notions of game-theoretic equilibria. Assuming a non-adaptive sampling mechanism from a generative model, we propose a sample-efficient model-based algorithm (DRNVI) with finite-sample complexity guarantees for learning robust variants of various notions of game-theoretic equilibria. 
We also establish an information-theoretic lower bound for solving RMGs, which confirms the near-optimal sample complexity of DRNVI with respect to problem-dependent factors such as the size of the state space, the target accuracy, and the horizon length.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Laixi Shi;Eric Mazumdar;Yuejie Chi;Adam Wierman", "authorids": "~Laixi_Shi1;~Eric_Mazumdar1;~Yuejie_Chi1;~Adam_Wierman1", "gender": "F;M;;M", "homepage": "https://laixishi.github.io/;http://people.eecs.berkeley.edu/~emazumdar/;;https://adamwierman.com/", "dblp": "211/7965;177/9322;;56/4447", "google_scholar": "V8RkRr8AAAAJ;FZOxxvcAAAAJ;;4OvOdSgAAAAJ", "orcid": ";;;0000-0002-5923-0199", "linkedin": ";;;adam-wierman-a529474/", "or_profile": "~Laixi_Shi1;~Eric_Mazumdar1;~Yuejie_Chi1;~Adam_Wierman1", "aff": "California Institute of Technology;Department of Computing + Mathematical Sciences, California Institute of Technology;;California Institute of Technology", "aff_domain": "caltech.edu;cms.caltech.edu;;caltech.edu", "position": "Postdoc;Assistant Professor;;Professor", "bibtex": "@inproceedings{\nshi2024sampleefficient,\ntitle={Sample-Efficient Robust Multi-Agent Reinforcement Learning in the Face of Environmental Uncertainty},\nauthor={Laixi Shi and Eric Mazumdar and Yuejie Chi and Adam Wierman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qDw4FxMubj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1188303, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4350847829317774719&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": "caltech.edu;cms.caltech.edu;;caltech.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "California Institute of Technology;", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;", "aff_unique_abbr": "Caltech;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Estimating Unknown Population Sizes Using the Hypergeometric Distribution", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33003", "id": "qE4nkfyMYl", "proceeding": "https://proceedings.mlr.press/v235/hodgson24a.html", "pdf": "https://openreview.net/pdf?id=qE4nkfyMYl", "openreview": "https://openreview.net/forum?id=qE4nkfyMYl", "author_site": "Liam Hodgson, Danilo Bzdok", "tldr": "", "abstract": "The multivariate hypergeometric distribution describes sampling without replacement from a discrete population of elements divided into multiple categories. Addressing a gap in the literature, we tackle the challenge of estimating discrete distributions when both the total population size and the category sizes are unknown. Here, we propose a novel solution using the hypergeometric likelihood to solve this estimation problem, even in the presence of severe under-sampling. Our approach accounts for a data generating process where the ground-truth is a mixture of distributions conditional on a continuous latent variable, as seen in collaborative filtering, using the variational autoencoder framework.
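A self-contained sketch of the multivariate hypergeometric log-likelihood that serves as the score in this approach, evaluated at hypothesized category sizes; the VAE and optimization machinery around it are omitted, and the example populations are invented:

```python
from math import lgamma

def log_comb(n, k):
    """log of the binomial coefficient C(n, k); -inf when infeasible."""
    if k < 0 or k > n:
        return float("-inf")
    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)

def hypergeom_loglik(category_sizes, counts):
    """Multivariate hypergeometric log-likelihood of `counts` drawn without
    replacement from a population with hypothesized `category_sizes`:
    sum_i log C(K_i, x_i) - log C(N, n), where N = sum K_i and n = sum x_i."""
    N, n = sum(category_sizes), sum(counts)
    ll = sum(log_comb(K, x) for K, x in zip(category_sizes, counts))
    return ll - log_comb(N, n)

# Toy usage: score two hypothesized populations against the same sample.
sample = [3, 1, 0]
print(hypergeom_loglik([30, 10, 5], sample))   # better fit to the skew
print(hypergeom_loglik([10, 10, 30], sample))  # worse fit
```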
Empirical data simulations demonstrate that our method outperforms approaches based on other likelihood functions for modeling count data, both in the accuracy of the population size estimate and in learning an informative latent space. We showcase our method's versatility through applications in NLP, by inferring and estimating the complexity of latent vocabularies in reading passage excerpts, and in biology, by accurately recovering the true number of gene transcripts from sparse single-cell genomics data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liam Hodgson;Danilo Bzdok", "authorids": "~Liam_Hodgson1;~Danilo_Bzdok2", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhodgson2024estimating,\ntitle={Estimating Unknown Population Sizes Using the Hypergeometric Distribution},\nauthor={Liam Hodgson and Danilo Bzdok},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qE4nkfyMYl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7995771, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7044633305659425479&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": ";", "author_num": 2 }, { "title": "Operator SVD with Neural Networks via Nested Low-Rank Approximation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33002", "id": "qESG5HaaoJ", "proceeding": "https://proceedings.mlr.press/v235/ryu24b.html", "pdf": "https://openreview.net/pdf?id=qESG5HaaoJ", "openreview": "https://openreview.net/forum?id=qESG5HaaoJ", "author_site": "Jongha (Jon) Ryu, Xiangxiang Xu, Hasan Sabri Melihcan Erol, Yuheng Bu, Lizhong Zheng, Gregory Wornell", "tldr": "", "abstract": "Computing eigenvalue decomposition (EVD) of a given linear operator, or finding its leading eigenvalues and eigenfunctions, is a fundamental task in many machine learning and scientific simulation problems. For high-dimensional eigenvalue problems, training neural networks to parameterize the eigenfunctions is considered a promising alternative to the classical numerical linear algebra techniques. This paper proposes a new optimization framework based on the low-rank approximation characterization of a truncated singular value decomposition, accompanied by new techniques called *nesting* for learning the top-$L$ singular values and singular functions in the correct order. The proposed method promotes the desired orthogonality in the learned functions implicitly and efficiently via an unconstrained optimization formulation, which is easy to solve with off-the-shelf gradient-based optimization algorithms. We demonstrate the effectiveness of the proposed optimization framework for use cases in computational physics and machine learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jongha Jon Ryu;Xiangxiang Xu;Hasan Sabri Melihcan Erol;Yuheng Bu;Lizhong Zheng;Gregory W.
Wornell", "authorids": "~Jongha_Jon_Ryu1;~Xiangxiang_Xu1;~Hasan_Sabri_Melihcan_Erol1;~Yuheng_Bu1;~Lizhong_Zheng1;~Gregory_W._Wornell1", "gender": "M;M;M;M;M;M", "homepage": "https://jongharyu.github.io;https://xiangxiangxu.com/;;https://buyuheng.github.io/;http://lizhongzheng.mit.edu;https://web.mit.edu/gww/www/", "dblp": "340/4088;147/5345-1;296/0298;168/8338;;94/5969", "google_scholar": "5ZYeWgcAAAAJ;u-BAw9sAAAAJ;cfgWcqYAAAAJ;1jPQEVMAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";0000-0002-4178-0934;0000-0003-4377-6936;0000-0002-3479-4553;;0000-0001-9166-4758", "linkedin": ";xiangxiangxu/;hsme;bu-yuheng-36560039/;;", "or_profile": "~Jongha_Jon_Ryu1;~Xiangxiang_Xu1;~Hasan_Sabri_Melihcan_Erol1;~Yuheng_Bu1;~Lizhong_Zheng1;~Gregory_Wornell1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;University of Florida;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;ufl.edu;mit.edu;mit.edu", "position": "Postdoc;Postdoc;PhD student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nryu2024operator,\ntitle={Operator {SVD} with Neural Networks via Nested Low-Rank Approximation},\nauthor={Jongha Jon Ryu and Xiangxiang Xu and Hasan Sabri Melihcan Erol and Yuheng Bu and Lizhong Zheng and Gregory W. Wornell},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qESG5HaaoJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1741395, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14159012565378572891&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 13, "email": "mit.edu;mit.edu;mit.edu;ufl.edu;mit.edu;mit.edu", "author_num": 6, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of Florida", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.ufl.edu", "aff_unique_abbr": "MIT;UF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale API Calls", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33001", "id": "qFILbkTQWw", "proceeding": "https://proceedings.mlr.press/v235/du24h.html", "pdf": "https://openreview.net/pdf?id=qFILbkTQWw", "openreview": "https://openreview.net/forum?id=qFILbkTQWw", "author_site": "YU DU, Fangyun Wei, Hongyang Zhang", "tldr": "", "abstract": "We introduce AnyTool, a large language model agent designed to revolutionize the utilization of a vast array of tools in addressing user queries. We utilize over 16,000 APIs from Rapid API, operating under the assumption that a subset of these APIs could potentially resolve the queries. AnyTool primarily incorporates three elements: an API retriever with a hierarchical structure, a solver aimed at resolving user queries using a selected set of API candidates, and a self-reflection mechanism, which re-activates AnyTool if the initial solution proves impracticable. AnyTool is powered by the function calling feature of GPT-4, eliminating the need for training external modules. 
We also revisit the evaluation protocol introduced by previous works and identify a limitation in this protocol that leads to an artificially high pass rate. By revising the evaluation protocol to better reflect practical application scenarios, we introduce an additional benchmark, termed AnyToolBench. Experiments across various datasets demonstrate the superiority of our AnyTool over strong baselines such as ToolLLM and a GPT-4 variant tailored for tool utilization. For instance, AnyTool outperforms ToolLLM by +35.5% in terms of average pass rate on ToolBench.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Du;Fangyun Wei;Hongyang Zhang", "authorids": "~Yu_Du9;~Fangyun_Wei1;~Hongyang_Zhang1", "gender": "M;M;M", "homepage": ";https://hongyanz.github.io/;", "dblp": "161/2636;23/10537-1;", "google_scholar": "-ncz2s8AAAAJ;https://scholar.google.com/citations?hl=en;VfkZpWwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Fangyun_Wei1;~Hongyang_Zhang1;~YU_DU7", "aff": "Microsoft Research;School of Computer Science, University of Waterloo;Tsinghua University", "aff_domain": "microsoft.com;uwaterloo.ca;mails.tsinghua.edu.cn", "position": "Researcher;Assistant Professor;PhD student", "bibtex": "@inproceedings{\ndu2024anytool,\ntitle={AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale {API} Calls},\nauthor={Yu Du and Fangyun Wei and Hongyang Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qFILbkTQWw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 654568, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17225862436584240640&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "microsoft.com;uwaterloo.ca;mails.tsinghua.edu.cn", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Microsoft;University of Waterloo;Tsinghua University", "aff_unique_dep": "Microsoft Research;School of Computer Science;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://uwaterloo.ca;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MSR;UWaterloo;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Waterloo", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;Canada;China" }, { "title": "Structure-Aware E(3)-Invariant Molecular Conformer Aggregation Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/33000", "id": "qGEEso256L", "proceeding": "https://proceedings.mlr.press/v235/nguyen24g.html", "pdf": "https://openreview.net/pdf?id=qGEEso256L", "openreview": "https://openreview.net/forum?id=qGEEso256L", "author_site": "Duy Nguyen, Nina Lukashina, Tai Nguyen, An Thai Le, TrungTin Nguyen, Nhat Ho, Jan Peters, Daniel Sonntag, Viktor Zaverkin, Mathias Niepert", "tldr": "", "abstract": "A molecule\u2019s 2D representation consists of its atoms, their attributes, and the molecule\u2019s covalent bonds. A 3D (geometric) representation of a molecule is called a conformer and consists of its atom types and Cartesian coordinates. Every conformer has a potential energy, and the lower this energy, the more likely it occurs in nature. Most existing machine learning methods for molecular property prediction consider either 2D molecular graphs or 3D conformer structure representations in isolation. 
Inspired by recent work on using ensembles of conformers in conjunction with 2D graph representations, we propose E(3)-invariant molecular conformer aggregation networks. The method integrates a molecule\u2019s 2D representation with that of multiple of its conformers. Contrary to prior work, we propose a novel 2D\u20133D aggregation mechanism based on a differentiable solver for the Fused Gromov-Wasserstein Barycenter problem and the use of an efficient conformer generation method based on distance geometry. We show that the proposed aggregation mechanism is E(3) invariant and propose an efficient GPU implementation. Moreover, we demonstrate that the aggregation mechanism helps to significantly outperform state-of-the-art molecule property prediction methods on established datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Duy Minh Ho Nguyen;Nina Lukashina;Tai Nguyen;An Thai Le;TrungTin Nguyen;Nhat Ho;Jan Peters;Daniel Sonntag;Viktor Zaverkin;Mathias Niepert", "authorids": "~Duy_Minh_Ho_Nguyen1;~Nina_Lukashina1;~Tai_Nguyen2;~An_Thai_Le1;~TrungTin_Nguyen1;~Nhat_Ho1;~Jan_Peters3;~Daniel_Sonntag2;~Viktor_Zaverkin1;~Mathias_Niepert1", "gender": "M;F;M;Not Specified;M;M;M;M;M;M", "homepage": ";;;https://www.ias.informatik.tu-darmstadt.de/index.php/Team/AnThaiLe;https://trung-tinnguyen.github.io/;https://nhatptnk8912.github.io/;https://www.jan-peters.net;https://www.dfki.de/~sonntag/;https://viktorzaverkin.github.io/;http://www.matlog.net", "dblp": "199/8349;;;121/0037;275/3643;203/4479;p/JanPeters1;83/5858.html;;n/MathiasNiepert", "google_scholar": "_NIyeykAAAAJ;EvCbk0UAAAAJ;lBDYHHAAAAAJ;k0r0tfUAAAAJ;NhiJDJsAAAAJ;https://scholar.google.ca/citations?user=Xs7cKMwAAAAJ;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;v7i6Uz4AAAAJ;OTOXTzgAAAAJ;https://scholar.google.de/citations?user=p5vLzq0AAAAJ", "orcid": ";;0009-0004-7707-2069;;0000-0001-8433-5980;;0000-0002-5266-8091;;0000-0001-9940-8548;", "linkedin": ";nina-lukashina/;;;trungtinnguyen0/;nhat-pham-minh-ho-267b8164/;janrpeters/;;;", "or_profile": "~Duy_Minh_Ho_Nguyen1;~Nina_Lukashina1;~Tai_Nguyen2;~An_Thai_Le1;~TrungTin_Nguyen1;~Nhat_Ho1;~Jan_Peters3;~Daniel_Sonntag2;~Viktor_Zaverkin1;~Mathias_Niepert1", "aff": "German Research Center for AI;JetBrains GmbH;German Research Center for AI;Technische Universit\u00e4t Darmstadt;The University of Queensland;University of Texas, Austin;TU Darmstadt;Carl von Ossietzky Universit\u00e4t Oldenburg;NEC Laboratories Europe;NEC", "aff_domain": "dfki.de;jetbrains.com;dfki.de;tu-darmstadt.de;uq.edu.au;utexas.edu;tu-darmstadt.de;uol.de;neclab.eu;neclab.eu", "position": "Researcher;Researcher;Researcher;PhD student;Postdoc;Assistant Professor;Full Professor;Full Professor;Researcher;Research Scientist", "bibtex": "@inproceedings{\nnguyen2024structureaware,\ntitle={Structure-Aware E(3)-Invariant Molecular Conformer Aggregation Networks},\nauthor={Duy Minh Ho Nguyen and Nina Lukashina and Tai Nguyen and An Thai Le and TrungTin Nguyen and Nhat Ho and Jan Peters and Daniel Sonntag and Viktor Zaverkin and Mathias Niepert},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qGEEso256L}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2868551, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17916082769920700170&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 10, "email": 
"dfki.de;jetbrains.com;dfki.de;tu-darmstadt.de;uq.edu.au;utexas.edu;tu-darmstadt.de;uol.de;neclab.eu;neclab.eu", "author_num": 10, "aff_unique_index": "0;1;0;2;3;4;2;5;6;7", "aff_unique_norm": "German Research Center for Artificial Intelligence;JetBrains;Technische Universit\u00e4t Darmstadt;University of Queensland;University of Texas at Austin;Carl von Ossietzky University of Oldenburg;NEC Laboratories Europe;NEC Corporation", "aff_unique_dep": ";;;;;;;", "aff_unique_url": "https://www.dfki.de/;https://www.jetbrains.com;https://www.tu-darmstadt.de;https://www.uq.edu.au;https://www.utexas.edu;https://www.uni-oldenburg.de/;https://www.nec-labs.eu;https://www.nec.com", "aff_unique_abbr": "DFKI;JetBrains;TUD;UQ;UT Austin;UvO;NEC LE;NEC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Austin;Darmstadt", "aff_country_unique_index": "0;0;0;0;1;2;0;0;3;4", "aff_country_unique": "Germany;Australia;United States;Unknown;Japan" }, { "title": "The Effect of Weight Precision on the Neuron Count in Deep ReLU Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32999", "id": "qHt8FzPvU9", "proceeding": "https://proceedings.mlr.press/v235/he24i.html", "pdf": "https://openreview.net/pdf?id=qHt8FzPvU9", "openreview": "https://openreview.net/forum?id=qHt8FzPvU9", "author_site": "Songhua He, Periklis Papakonstantinou", "tldr": "", "abstract": "Deep neural networks (DNNs) have become pivotal in machine learning, but the impact of weight precision, such as in networks with rectified linear units (ReLU), remains underexplored. We analytically investigate the interplay of three key factors: the precision of ReLU network weights, the number of neurons, and the time of the preprocessing algorithm that generates the network description. Our study, which, to the best of our knowledge, is the first formal work on weight precision, yields three main results. (1) We present an exponential time preprocessing algorithm that showcases the possibility of trading ReLU nodes for weight precision. Specifically, our method achieves an exponential reduction in neuron count when computing any function of high complexity with boolean input encoding. What is the implication of the above result in theoretical and practical works? (2) In theory of computing, in general, there is no free lunch. In our case, if you significantly reduce the number of neurons then you should pay the cost in weight precision. To address this, we introduce a notion of network size that considers weight precision in addition to the network's number of neurons. We establish that under this redefined notion of network size, it is generally impossible to exchange neurons for weight precision in ReLU networks of the same (redefined) size. (3) In practice, we show that high weight precision alone cannot help in reducing the neuron count. If instead of our exponential time preprocessing algorithm one uses any polynomial time algorithm, then it is impossible to non-trivially reduce the neuron count, regardless of the high weight precision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songhua He;Periklis A. 
Papakonstantinou", "authorids": "~Songhua_He1;~Periklis_A._Papakonstantinou1", "gender": "M;M", "homepage": "https://i.dont.have;http://papakonstantinou.org", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Songhua_He1;~Periklis_A._Papakonstantinou1", "aff": "Rutgers University, Newark;Rutgers University, Newark and New Brunswick", "aff_domain": "rutgers.edu;rutgers.edu", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nhe2024the,\ntitle={The Effect of Weight Precision on the Neuron Count in Deep Re{LU} Networks},\nauthor={Songhua He and Periklis A. Papakonstantinou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qHt8FzPvU9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 291553, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TLGT7eaLvQYJ:scholar.google.com/&scioq=The+Effect+of+Weight+Precision+on+the+Neuron+Count+in+Deep+ReLU+Networks&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "rutgers.edu;rutgers.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Newark;Newark and New Brunswick", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Graph Neural Network Explanations are Fragile", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32998", "id": "qIOSNyPPwB", "proceeding": "https://proceedings.mlr.press/v235/li24bd.html", "pdf": "https://openreview.net/pdf?id=qIOSNyPPwB", "openreview": "https://openreview.net/forum?id=qIOSNyPPwB", "author_site": "Jiate Li, Meng Pang, Yun Dong, Jinyuan Jia, Binghui Wang", "tldr": "", "abstract": "Explainable Graph Neural Network (GNN) has emerged recently to foster the trust of using GNNs. Existing GNN explainers are developed from various perspectives to enhance the explanation performance. We take the first step to study GNN explainers under adversarial attack\u2014We found that an adversary slightly perturbing graph structure can ensure GNN model makes correct predictions, but the GNN explainer yields a drastically different explanation on the perturbed graph. Specifically, we first formulate the attack problem under a practical threat model (i.e., the adversary has limited knowledge about the GNN explainer and a restricted perturbation budget). We then design two methods (i.e., one is loss-based and the other is deduction-based) to realize the attack. 
We evaluate our attacks on various GNN explainers and the results show that these explainers are fragile.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiate Li;Meng Pang;Yun Dong;Jinyuan Jia;Binghui Wang", "authorids": "~Jiate_Li1;~Meng_Pang1;~Yun_Dong1;~Jinyuan_Jia2;~Binghui_Wang2", "gender": "M;M;F;;M", "homepage": "https://github.com/JetRichardLee/JetRichardLee/tree/main;;;https://jinyuan-jia.github.io/;https://wangbinghui.net", "dblp": "379/5970;172/9447.html;;24/5124-1.html;123/7149", "google_scholar": "https://scholar.google.com/citations?hl=zh-TW;;;iyg4ytkAAAAJ;SoOztcEAAAAJ", "orcid": ";;;0000-0002-9785-7769;0000-0001-5616-060X", "linkedin": ";;yun-dong/;;", "or_profile": "~Jiate_Li1;~Meng_Pang1;~Yun_Dong1;~Jinyuan_Jia2;~Binghui_Wang2", "aff": "Illinois Institute of Technology;Nanchang University;Milwaukee School of Engineering;Pennsylvania State University;Illinois Institute of Technology", "aff_domain": "iit.edu;ncu.edu.cn;msoe.edu;psu.edu;iit.edu", "position": "Intern;Associate Professor;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024graph,\ntitle={Graph Neural Network Explanations are Fragile},\nauthor={Jiate Li and Meng Pang and Yun Dong and Jinyuan Jia and Binghui Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qIOSNyPPwB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2467358, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10418167856057287886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "iit.edu;ncu.edu.cn;msoe.edu;psu.edu;iit.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Illinois Institute of Technology;Nanchang University;Milwaukee School of Engineering;Pennsylvania State University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.iit.edu;https://www.ncu.edu.cn;https://www.msoe.edu;https://www.psu.edu", "aff_unique_abbr": "IIT;NCU;MSOE;PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "On Interpolating Experts and Multi-Armed Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32997", "id": "qIiPM5CbRY", "proceeding": "https://proceedings.mlr.press/v235/chen24p.html", "pdf": "https://openreview.net/pdf?id=qIiPM5CbRY", "openreview": "https://openreview.net/forum?id=qIiPM5CbRY", "author_site": "Houshuang Chen, Yuchen He, Chihao Zhang", "tldr": "", "abstract": "Learning with expert advice and the multi-armed bandit are two classic online decision problems, which differ in how the information is observed in each round of the game. We study a family of problems interpolating the two. For a vector $\\mathbf{m}=(m_1,\\dots,m_K)\\in \\mathbb N^K$, an instance of $\\mathbf m$-MAB indicates that the arms are partitioned into $K$ groups and the $i$-th group contains $m_i$ arms. Once an arm is pulled, the losses of all arms in the same group are observed. We prove tight minimax regret bounds for $\\mathbf m$-MAB and design an optimal PAC algorithm for its pure exploration version, $\\mathbf m$-BAI, where the goal is to identify the arm with minimum loss with as few rounds as possible. 
We show that the minimax regret of $\\mathbf m$-MAB is $\\Theta\\left(\\sqrt{T\\sum_{k=1}^K\\log (m_k+1)}\\right)$ and the minimum number of pulls for an $(\\varepsilon,0.05)$-PAC algorithm of $\\mathbf m$-BAI is $\\Theta\\left(\\frac{1}{\\varepsilon^2}\\cdot \\sum_{k=1}^K\\log (m_k+1)\\right)$. Both our upper bounds and lower bounds for $\\mathbf m$-MAB can be extended to a more general setting, namely the bandit with graph feedback, in terms of the *clique cover* and related graph parameters. As a consequence, we obtain tight minimax regret bounds for several families of feedback graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Houshuang Chen;Yuchen He;Chihao Zhang", "authorids": "~Houshuang_Chen1;~Yuchen_He2;~Chihao_Zhang1", "gender": "M;F;", "homepage": ";;http://chihaozhang.com", "dblp": ";;92/11261-1.html", "google_scholar": ";https://scholar.google.com.hk/citations?hl=zh-CN;", "orcid": ";;", "linkedin": "https://linkedin.com/in/\u539a\u53cc-\u9648-b504a913a;;", "or_profile": "~Houshuang_Chen1;~Yuchen_He2;~Chihao_Zhang1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "MS student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nchen2024on,\ntitle={On Interpolating Experts and Multi-Armed Bandits},\nauthor={Houshuang Chen and Yuchen He and Chihao Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qIiPM5CbRY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 546767, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17560496998623963326&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "GiLOT: Interpreting Generative Language Models via Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32996", "id": "qKL25sGjxL", "proceeding": "https://proceedings.mlr.press/v235/li24i.html", "pdf": "https://openreview.net/pdf?id=qKL25sGjxL", "openreview": "https://openreview.net/forum?id=qKL25sGjxL", "author_site": "Xuhong Li, Jiamin Chen, Yekun Chai, Haoyi Xiong", "tldr": "", "abstract": "While large language models (LLMs) surge with the rise of generative AI, algorithms to explain LLMs are highly desired. Existing feature attribution methods, adequate for discriminative language models like BERT, often fail to deliver faithful explanations for LLMs, primarily due to two issues: (1) For every specific prediction, the LLM outputs a probability distribution over the vocabulary\u2013a large number of tokens with unequal semantic distance; (2) As an autoregressive language model, the LLM handles input tokens while generating a sequence of probability distributions of various tokens. 
To address the above two challenges, this work proposes GiLOT, which leverages Optimal Transport to measure the distributional change of all possible generated sequences upon the absence of every input token, while taking into account the tokens\u2019 similarity, so as to faithfully estimate feature attribution for LLMs. We have carried out extensive experiments on top of Llama families and their fine-tuned derivatives across various scales to validate the effectiveness of GiLOT for estimating the input attributions. The results show that GiLOT outperforms existing solutions on a number of faithfulness metrics under fair comparison settings. Source code is publicly available at https://github.com/holyseven/GiLOT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuhong Li;Jiamin Chen;Yekun Chai;Haoyi Xiong", "authorids": "~Xuhong_Li3;~Jiamin_Chen2;~Yekun_Chai1;~Haoyi_Xiong1", "gender": ";M;M;M", "homepage": ";https://cyk1337.github.io/;https://sites.google.com/site/haoyixiongshomepage/;", "dblp": ";252/0188;06/2700;76/5330-2.html", "google_scholar": "zLoLPukAAAAJ;P0NRuRYAAAAJ;f_Kcie0AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;", "linkedin": ";;;xuhong-li-4b2776a9/", "or_profile": "~Jiamin_Chen2;~Yekun_Chai1;~Haoyi_Xiong1;~Xuhong_LI1", "aff": "City University of Hong Kong;Baidu;Baidu;Baidu", "aff_domain": "cityu.edu.hk;baidu.com;baidu.com;baidu.com", "position": "PhD student;Researcher;Principal Researcher;Researcher", "bibtex": "@inproceedings{\nli2024gilot,\ntitle={Gi{LOT}: Interpreting Generative Language Models via Optimal Transport},\nauthor={Xuhong Li and Jiamin Chen and Yekun Chai and Haoyi Xiong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qKL25sGjxL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 539207, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4713712091525984750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cityu.edu.hk;baidu.com;baidu.com;baidu.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "City University of Hong Kong;Baidu", "aff_unique_dep": ";Baidu, Inc.", "aff_unique_url": "https://www.cityu.edu.hk;https://www.baidu.com", "aff_unique_abbr": "CityU;Baidu", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning from Streaming Data when Users Choose", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32995", "id": "qLZ32oS7j2", "proceeding": "https://proceedings.mlr.press/v235/su24a.html", "pdf": "https://openreview.net/pdf?id=qLZ32oS7j2", "openreview": "https://openreview.net/forum?id=qLZ32oS7j2", "author_site": "Jinyan Su, Sarah Dean", "tldr": "", "abstract": "In digital markets comprised of many competing services, each user chooses between multiple service providers according to their preferences, and the chosen service makes use of the user data to incrementally improve its model. The service providers' models influence which service the user will choose at the next time step, and the user's choice, in return, influences the model update, leading to a feedback loop. In this paper, we formalize the above dynamics and develop a simple and efficient decentralized algorithm to locally minimize the overall user loss. 
Theoretically, we show that our algorithm asymptotically converges to stationary points of the overall loss almost surely. We also experimentally demonstrate the utility of our algorithm with real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinyan Su;Sarah Dean", "authorids": "~Jinyan_Su1;~Sarah_Dean2", "gender": "F;F", "homepage": "https://jinyansu1.github.io/;https://sdean.website/", "dblp": ";207/8292", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;xhKqjpYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Jinyan_Su1;~Sarah_Dean2", "aff": "Cornell University;Cornell University", "aff_domain": "cornell.edu;cornell.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsu2024learning,\ntitle={Learning from Streaming Data when Users Choose},\nauthor={Jinyan Su and Sarah Dean},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qLZ32oS7j2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 976522, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10294023342256737132&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cornell.edu;cornell.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Cluster-Aware Similarity Diffusion for Instance Retrieval", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32994", "id": "qMG3OK7Xcg", "proceeding": "https://proceedings.mlr.press/v235/luo24i.html", "pdf": "https://openreview.net/pdf?id=qMG3OK7Xcg", "openreview": "https://openreview.net/forum?id=qMG3OK7Xcg", "author_site": "Jifei Luo, Hantao Yao, Changsheng Xu", "tldr": "", "abstract": "Diffusion-based re-ranking is a common method used for retrieving instances by performing similarity propagation in a nearest neighbor graph. However, existing techniques that construct the affinity graph based on pairwise instances can lead to the propagation of misinformation from outliers and other manifolds, resulting in inaccurate results. To overcome this issue, we propose a novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The primary concept of CAS is to conduct similarity diffusion within local clusters, which can reduce the influence from other manifolds explicitly. To obtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity Diffusion strategy introduces an inverse constraint term to the optimization objective of local cluster diffusion. Additionally, we have optimized a Neighbor-guided Similarity Smoothing approach to ensure similarity consistency among the local neighbors of each instance. 
Evaluations in instance retrieval and object re-identification validate the effectiveness of the proposed CAS; our code is publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jifei Luo;Hantao Yao;Changsheng Xu", "authorids": "~Jifei_Luo1;~Hantao_Yao2;~Changsheng_Xu1", "gender": "M;M;M", "homepage": ";http://www.hantaoyao.com/;", "dblp": ";167/3478;85/1301", "google_scholar": ";;https://scholar.google.com.sg/citations?user=hI9NRDkAAAAJ", "orcid": ";;", "linkedin": "jifeiluo;;", "or_profile": "~Jifei_Luo1;~Hantao_Yao2;~Changsheng_Xu1", "aff": "University of Science and Technology of China;Institute of automation, Chinese academy of science;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ustc.edu.cn;nlpr.ia.ac.cn;ia.ac.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nluo2024clusteraware,\ntitle={Cluster-Aware Similarity Diffusion for Instance Retrieval},\nauthor={Jifei Luo and Hantao Yao and Changsheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qMG3OK7Xcg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 885286, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6358087556652482446&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "ustc.edu.cn;nlpr.ia.ac.cn;ia.ac.cn", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "USTC;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "DynSyn: Dynamical Synergistic Representation for Efficient Learning and Control in Overactuated Embodied Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32993", "id": "qOMQ0UGLYl", "proceeding": "https://proceedings.mlr.press/v235/he24o.html", "pdf": "https://openreview.net/pdf?id=qOMQ0UGLYl", "openreview": "https://openreview.net/forum?id=qOMQ0UGLYl", "author_site": "Kaibo He, Chenhui Zuo, Chengtian Ma, Yanan Sui", "tldr": "", "abstract": "Learning an effective policy to control high-dimensional, overactuated systems is a significant challenge for deep reinforcement learning algorithms. Such control scenarios are often observed in the neural control of vertebrate musculoskeletal systems. The study of these control mechanisms will provide insights into the control of high-dimensional, overactuated systems. The coordination of actuators, known as muscle synergies in neuromechanics, is considered a presumptive mechanism that simplifies the generation of motor commands. The dynamical structure of a system is the basis of its function, allowing us to derive a synergistic representation of actuators. Motivated by this theory, we propose the *Dynamical Synergistic Representation (DynSyn)* algorithm. DynSyn aims to generate synergistic representations from dynamical structures and perform task-specific, state-dependent adaptation to the representations to improve motor control. 
We demonstrate DynSyn's efficiency across various tasks involving different musculoskeletal models, achieving state-of-the-art sample efficiency and robustness compared to baseline algorithms. DynSyn generates interpretable synergistic representations that capture the essential features of dynamical structures and demonstrates generalizability across diverse motor tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaibo He;Chenhui Zuo;Chengtian Ma;Yanan Sui", "authorids": "~Kaibo_He2;~Chenhui_Zuo1;~Chengtian_Ma1;~Yanan_Sui1", "gender": ";M;F;M", "homepage": "https://github.com/Beanpow;https://github.com/zchJo;https://github.com/chengtianma;https://www.yanansui.com", "dblp": ";;;151/6934", "google_scholar": ";;;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Kaibo_He2;~Chenhui_Zuo1;~Chengtian_Ma1;~Yanan_Sui1", "aff": "Tsinghua University;;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;mails.tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;;MS student;Associate Professor", "bibtex": "@inproceedings{\nhe2024dynsyn,\ntitle={DynSyn: Dynamical Synergistic Representation for Efficient Learning and Control in Overactuated Embodied Systems},\nauthor={Kaibo He and Chenhui Zuo and Chengtian Ma and Yanan Sui},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qOMQ0UGLYl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7393770, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5573569890266733788&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;;mails.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "BiLLM: Pushing the Limit of Post-Training Quantization for LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32992", "id": "qOl2WWOqFg", "proceeding": "https://proceedings.mlr.press/v235/huang24q.html", "pdf": "https://openreview.net/pdf?id=qOl2WWOqFg", "openreview": "https://openreview.net/forum?id=qOl2WWOqFg", "author_site": "Wei Huang, Yangdong Liu, Haotong Qin, Ying Li, Shiming Zhang, Xianglong Liu, Michele Magno, XIAOJUAN QI", "tldr": "", "abstract": "Pretrained large language models (LLMs) exhibit exceptional general language processing capabilities but come with significant demands on memory and computational resources. As a powerful compression technology, binarization can extremely reduce model weights to a mere 1 bit, lowering the expensive computation and memory requirements. However, existing quantization techniques fall short of maintaining LLM performance under ultra-low bit-widths. In response to this challenge, we present BiLLM, a groundbreaking 1-bit post-training quantization scheme tailored for pretrained LLMs. Based on the weight distribution of LLMs, BiLLM first identifies and structurally selects salient weights, and minimizes the compression loss through an effective binary residual approximation strategy. 
Moreover, considering the bell-shaped distribution of the non-salient weights, we propose an optimal splitting search to group and binarize them accurately. BiLLM, for the first time, achieves high-accuracy inference (e.g., 8.41 perplexity on LLaMA2-70B) with only 1.08-bit weights across various LLM families and evaluation metrics, outperforming SOTA quantization methods for LLMs by significant margins. Furthermore, BiLLM enables the binarization of a 7-billion-parameter LLM within 0.5 hours on a single GPU, demonstrating satisfactory time efficiency. Our code is available at https://github.com/Aaronhuang-778/BiLLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Huang;Yangdong Liu;Haotong Qin;Ying Li;Shiming Zhang;Xianglong Liu;Michele Magno;XIAOJUAN QI", "authorids": "~Wei_Huang36;~Yangdong_Liu2;~Haotong_Qin1;~Ying_Li22;szhang@eee.hku.hk;~Xianglong_Liu3;~Michele_Magno1;~XIAOJUAN_QI2", "gender": "M;M;M;F;;;M;F", "homepage": "https://aaron-weihuang.com/;https://www.buaa.edu.cn;https://htqin.github.io/;https://scse.buaa.edu.cn/info/1079/7274.htm;;;https://ee.ethz.ch/the-department/people-a-z/person-detail.michele-magno.html;https://xjqi.github.io/", "dblp": ";;262/3626.html;;;;;176/1445-1.html", "google_scholar": "rZVUlPAAAAAJ;;mK6n-KgAAAAJ;;;;ytj7UUcAAAAJ;bGn0uacAAAAJ", "orcid": "0009-0007-9885-0028;;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Wei_Huang36;~Yangdong_Liu2;~Haotong_Qin1;~Ying_Li22;szhang@eee.hku.hk;~Xianglong_Liu3;~Michele_Magno1;~XIAOJUAN_QI2", "aff": "University of Hong Kong;Beihang University;ETHZ - ETH Zurich;school of computer science and engineering;;;ETHZ - ETH Zurich;University of Hong Kong", "aff_domain": "hku.hk;buaa.edu.cn;ethz.ch;scse.buaa.edu.cn;;;ethz.ch;hku.hk", "position": "PhD student;MS student;Postdoc;Associate Professor;;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024billm,\ntitle={Bi{LLM}: Pushing the Limit of Post-Training Quantization for {LLM}s},\nauthor={Wei Huang and Yangdong Liu and Haotong Qin and Ying Li and Shiming Zhang and Xianglong Liu and Michele Magno and XIAOJUAN QI},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qOl2WWOqFg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7118833, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5446924334351333793&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "hku.hk;buaa.edu.cn;ethz.ch;scse.buaa.edu.cn;;;ethz.ch;hku.hk", "author_num": 8, "aff_unique_index": "0;1;2;3;2;0", "aff_unique_norm": "University of Hong Kong;Beihang University;ETH Zurich;School of Computer Science and Engineering", "aff_unique_dep": ";;;Computer Science and Engineering", "aff_unique_url": "https://www.hku.hk;http://www.buaa.edu.cn/;https://www.ethz.ch;", "aff_unique_abbr": "HKU;BUAA;ETHZ;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "China;Switzerland;" }, { "title": "Controlling Behavioral Diversity in Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32991", "id": "qQjUgItPq4", "proceeding": "https://proceedings.mlr.press/v235/bettini24a.html", "pdf": "https://openreview.net/pdf?id=qQjUgItPq4", "openreview": "https://openreview.net/forum?id=qQjUgItPq4", "author_site": "Matteo Bettini, Ryan Kortvelesy, 
Amanda Prorok", "tldr": "", "abstract": "The study of behavioral diversity in Multi-Agent Reinforcement Learning (MARL) is a nascent yet promising field. In this context, the present work deals with the question of how to control the diversity of a multi-agent system. With no existing approaches to control diversity to a set value, current solutions focus on blindly promoting it via intrinsic rewards or additional loss functions, effectively changing the learning objective and lacking a principled measure for it. To address this, we introduce Diversity Control (DiCo), a method able to control diversity to an exact value of a given metric by representing policies as the sum of a parameter-shared component and dynamically scaled per-agent components. By applying constraints directly to the policy architecture, DiCo leaves the learning objective unchanged, enabling its applicability to any actor-critic MARL algorithm. We theoretically prove that DiCo achieves the desired diversity, and we provide several experiments, both in cooperative and competitive tasks, that show how DiCo can be employed as a novel paradigm to increase performance and sample efficiency in MARL. Multimedia results are available on the paper's website: https://sites.google.com/view/dico-marl", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matteo Bettini;Ryan Kortvelesy;Amanda Prorok", "authorids": "~Matteo_Bettini1;~Ryan_Kortvelesy1;~Amanda_Prorok1", "gender": "M;M;", "homepage": "http://matteobettini.com;;", "dblp": "324/2168;289/0863;", "google_scholar": "hcvR_W0AAAAJ;fMxXjiIAAAAJ;", "orcid": "0000-0001-8679-0151;0000-0001-6654-0796;", "linkedin": "bettinimatteo/;;", "or_profile": "~Matteo_Bettini1;~Ryan_Kortvelesy1;~Amanda_Prorok1", "aff": "University of Cambridge;;", "aff_domain": "cam.ac.uk;;", "position": "PhD student;;", "bibtex": "@inproceedings{\nbettini2024controlling,\ntitle={Controlling Behavioral Diversity in Multi-Agent Reinforcement Learning},\nauthor={Matteo Bettini and Ryan Kortvelesy and Amanda Prorok},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qQjUgItPq4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7894474, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17049496662392488615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "cam.ac.uk;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "BLO-SAM: Bi-level Optimization Based Finetuning of the Segment Anything Model for Overfitting-Preventing Semantic Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32990", "id": "qRtM5EqE9l", "proceeding": "https://proceedings.mlr.press/v235/zhang24ai.html", "pdf": "https://openreview.net/pdf?id=qRtM5EqE9l", "openreview": "https://openreview.net/forum?id=qRtM5EqE9l", "author_site": "Li Zhang, Youwei Liang, Ruiyi Zhang, Amirhosein Javadi, Pengtao Xie", "tldr": "", "abstract": "The Segment Anything Model (SAM), a foundation model pretrained on millions of images and segmentation masks, has significantly advanced semantic segmentation, a fundamental task 
in computer vision. Despite its strengths, SAM encounters two major challenges. Firstly, it struggles with segmenting specific objects autonomously, as it relies on users to manually input prompts like points or bounding boxes to identify targeted objects. Secondly, SAM faces challenges in excelling at specific downstream tasks, like medical imaging, due to a disparity between the distribution of its pretraining data, which predominantly consists of general-domain images, and the data used in downstream tasks. Current solutions to these problems, which involve finetuning SAM, often lead to overfitting, a notable issue in scenarios with very limited data, like in medical imaging. To overcome these limitations, we introduce BLO-SAM, which finetunes SAM based on bi-level optimization (BLO). Our approach allows for automatic image segmentation without the need for manual prompts, by optimizing a learnable prompt embedding. Furthermore, it significantly reduces the risk of overfitting by training the model's weight parameters and the prompt embedding on two separate subsets of the training dataset, each at a different level of optimization. We apply BLO-SAM to diverse semantic segmentation tasks in general and medical domains. The results demonstrate BLO-SAM's superior performance over various state-of-the-art image semantic segmentation methods. The code of BLO-SAM is available at https://github.com/importZL/BLO-SAM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Li Zhang;Youwei Liang;Ruiyi Zhang;Amirhosein Javadi;Pengtao Xie", "authorids": "~Li_Zhang21;~Youwei_Liang1;~Ruiyi_Zhang4;~Amirhosein_Javadi1;~Pengtao_Xie3", "gender": "M;M;M;M;M", "homepage": ";https://youweiliang.github.io/;;https://amirhosein-javadi.github.io;https://pengtaoxie.github.io/", "dblp": ";257/5626;;371/4356;133/1998", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;zMofZR4AAAAJ;D7EXgU0AAAAJ;R2Gsk_IAAAAJ;cnncomYAAAAJ", "orcid": ";;;;", "linkedin": "%E5%8A%9B-%E5%BC%A0-b24808212/;;;amirhosein-javadi/;", "or_profile": "~Li_Zhang21;~Youwei_Liang1;~Ruiyi_Zhang4;~Amirhosein_Javadi1;~Pengtao_Xie3", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego;Carnegie Mellon University", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu; ", "position": "PhD student;PhD student;PhD student;PhD student;Graduate Student", "bibtex": "@inproceedings{\nzhang2024blosam,\ntitle={{BLO}-{SAM}: Bi-level Optimization Based Finetuning of the Segment Anything Model for Overfitting-Preventing Semantic Segmentation},\nauthor={Li Zhang and Youwei Liang and Ruiyi Zhang and Amirhosein Javadi and Pengtao Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qRtM5EqE9l}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4909739, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15493407225535876705&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu; ", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "University of California, San Diego;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.cmu.edu", "aff_unique_abbr": "UCSD;CMU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego;", 
"aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Extracting Training Data From Document-Based VQA Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32989", "id": "qTX1vxzs8b", "proceeding": "https://proceedings.mlr.press/v235/pinto24a.html", "pdf": "https://openreview.net/pdf?id=qTX1vxzs8b", "openreview": "https://openreview.net/forum?id=qTX1vxzs8b", "author_site": "Francesco Pinto, Nathalie Rauschmayr, Florian Tramer, Phil Torr, Federico Tombari", "tldr": "", "abstract": "Vision-Language Models (VLMs) have made remarkable progress in document-based Visual Question Answering (i.e., responding to queries about the contents of an input document provided as an image). In this work, we show these models can memorize responses for training samples and regurgitate them even when the relevant visual information has been removed. This includes Personal Identifiable Information (PII) repeated once in the training set, indicating these models could divulge memorised sensitive information and therefore pose a privacy risk. We quantitatively measure the extractability of information in controlled experiments and differentiate between cases where it arises from generalization capabilities or from memorization. We further investigate the factors that influence memorization across multiple state-of-the-art models and propose an effective heuristic countermeasure that empirically prevents the extractability of PII.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesco Pinto;Nathalie Rauschmayr;Florian Tram\u00e8r;Philip Torr;Federico Tombari", "authorids": "~Francesco_Pinto1;rauschmayr@google.com;~Florian_Tram\u00e8r1;~Philip_Torr1;~Federico_Tombari1", "gender": "M;;;;M", "homepage": ";;;http://www.robots.ox.ac.uk/~tvg/;https://federicotombari.github.io/", "dblp": "281/7477;;;;16/3539", "google_scholar": "rqAdo2MAAAAJ;;;;TFsE4BIAAAAJ", "orcid": ";;;;0000-0001-5598-5212", "linkedin": "francesco-pinto-42a389b1?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BishkY8oUQ8OTPPeV0SSCdw%3D%3D;;;;fedet/", "or_profile": "~Francesco_Pinto1;rauschmayr@google.com;~Florian_Tram\u00e8r1;~Philip_Torr1;~Federico_Tombari1", "aff": "University of Oxford;;;University of Oxford;Technical University Munich (TUM)", "aff_domain": "ox.ac.uk;;;ox.ac.uk;in.tum.de", "position": "PhD student;;;Full Professor;Lecturer", "bibtex": "@inproceedings{\npinto2024extracting,\ntitle={Extracting Training Data From Document-Based {VQA} Models},\nauthor={Francesco Pinto and Nathalie Rauschmayr and Florian Tram{\\`e}r and Philip Torr and Federico Tombari},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qTX1vxzs8b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1078905, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6757372417112474101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ox.ac.uk;;;ox.ac.uk;in.tum.de", "author_num": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Oxford;Technical University Munich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.tum.de", "aff_unique_abbr": "Oxford;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;Germany" }, { "title": 
"Defining Neural Network Architecture through Polytope Structures of Datasets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32988", "id": "qXoqV40imX", "proceeding": "https://proceedings.mlr.press/v235/lee24q.html", "pdf": "https://openreview.net/pdf?id=qXoqV40imX", "openreview": "https://openreview.net/forum?id=qXoqV40imX", "author_site": "Sangmin Lee, Abbas Mammadov, Jong Chul YE", "tldr": "", "abstract": "Current theoretical and empirical research in neural networks suggests that complex datasets require large network architectures for thorough classification, yet the precise nature of this relationship remains unclear. This paper tackles this issue by defining upper and lower bounds for neural network widths, which are informed by the polytope structure of the dataset in question. We also delve into the application of these principles to simplicial complexes and specific manifold shapes, explaining how the requirement for network width varies in accordance with the geometric complexity of the dataset. Moreover, we develop an algorithm to investigate a converse situation where the polytope structure of a dataset can be inferred from its corresponding trained neural networks. Through our algorithm, it is established that popular datasets such as MNIST, Fashion-MNIST, and CIFAR10 can be efficiently encapsulated using no more than two polytopes with a small number of faces.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sangmin Lee;Abbas Mammadov;Jong Chul Ye", "authorids": "~Sangmin_Lee3;~Abbas_Mammadov1;~Jong_Chul_Ye1", "gender": "M;M;M", "homepage": ";;https://bispl.weebly.com/", "dblp": ";367/7571;15/5613", "google_scholar": "https://scholar.google.co.kr/citations?user=2wp3excAAAAJ;_-WJlkwAAAAJ;HNMjoNEAAAAJ", "orcid": ";0009-0004-5593-4529;", "linkedin": ";abbas-mammadov/;", "or_profile": "~Sangmin_Lee3;~Abbas_Mammadov1;~Jong_Chul_Ye1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nlee2024defining,\ntitle={Defining Neural Network Architecture through Polytope Structures of Datasets},\nauthor={Sangmin Lee and Abbas Mammadov and Jong Chul Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qXoqV40imX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10056854, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1465953337026650257&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Pausing Policy Learning in Non-stationary Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32987", "id": "qY622O6Ehg", "proceeding": "https://proceedings.mlr.press/v235/lee24l.html", "pdf": "https://openreview.net/pdf?id=qY622O6Ehg", "openreview": 
"https://openreview.net/forum?id=qY622O6Ehg", "author_site": "Hyunin Lee, Ming Jin, Javad Lavaei, Somayeh Sojoudi", "tldr": "", "abstract": "Real-time inference is a challenge of real-world reinforcement learning due to temporal differences in time-varying environments: the system collects data from the past, updates the decision model in the present, and deploys it in the future. We tackle a common belief that continually updating the decision is optimal to minimize the temporal gap. We propose forecasting an online reinforcement learning framework and show that strategically pausing decision updates yields better overall performance by effectively managing aleatoric uncertainty. Theoretically, we compute an optimal ratio between policy update and hold duration, and show that a non-zero policy hold duration provides a sharper upper bound on the dynamic regret. Our experimental evaluations on three different environments also reveal that a non-zero policy hold duration yields higher rewards compared to continuous decision updates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyunin Lee;Ming Jin;Javad Lavaei;Somayeh Sojoudi", "authorids": "~Hyunin_Lee1;~Ming_Jin2;~Javad_Lavaei1;~Somayeh_Sojoudi1", "gender": "M;M;;F", "homepage": "https://hyunin-lee.github.io/;http://www.jinming.tech/;;https://eecs.berkeley.edu/~sojoudi/", "dblp": "353/1740;;;06/7000", "google_scholar": "kHTDu1YAAAAJ;YdxdTtkAAAAJ;;kNH8zcgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hyunin_Lee1;~Ming_Jin2;~Javad_Lavaei1;~Somayeh_Sojoudi1", "aff": "University of California, Berkeley;Virginia Tech;;University of California, Berkeley", "aff_domain": "berkeley.edu;vt.edu;;berkeley.edu", "position": "PhD student;Assistant Professor;;Associate Professor", "bibtex": "@inproceedings{\nlee2024pausing,\ntitle={Pausing Policy Learning in Non-stationary Reinforcement Learning},\nauthor={Hyunin Lee and Ming Jin and Javad Lavaei and Somayeh Sojoudi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qY622O6Ehg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3961508, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8445073065734303420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "berkeley.edu;vt.edu;;berkeley.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Berkeley;Virginia Tech", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.vt.edu", "aff_unique_abbr": "UC Berkeley;VT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Pre-Training Protein Bi-level Representation Through Span Mask Strategy On 3D Protein Chains", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32986", "id": "qY63FnLuJ1", "proceeding": "https://proceedings.mlr.press/v235/jiale24a.html", "pdf": "https://openreview.net/pdf?id=qY63FnLuJ1", "openreview": "https://openreview.net/forum?id=qY63FnLuJ1", "author_site": "Jiale Zhao, Wanru Zhuang, Jia Song, Yaqi Li, Shuqi Lu", "tldr": "", "abstract": "In recent years, there has been a surge in the development of 3D structure-based pre-trained protein models, representing a significant advancement over pre-trained protein language models in various downstream tasks. 
However, most existing structure-based pre-trained models primarily focus on the residue level, i.e., alpha carbon atoms, while ignoring other atoms like side chain atoms. We argue that modeling proteins at both residue and atom levels is important since the side chain atoms can also be crucial for numerous downstream tasks, for example, molecular docking. Nevertheless, we find that naively combining residue and atom information during pre-training typically fails. We identify that a key reason is the information leakage caused by the inclusion of atom structure in the input, which renders residue-level pre-training tasks trivial and results in insufficiently expressive residue representations. To address this issue, we introduce a span mask pre-training strategy on 3D protein chains to learn meaningful representations of both residues and atoms. This leads to a simple yet effective approach to learning protein representation suitable for diverse downstream tasks. Extensive experimental results on binding site prediction and function prediction tasks demonstrate that our proposed pre-training approach significantly outperforms other methods. Our code will be made public.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhao jiale;Wanru Zhuang;Jia Song;Yaqi Li;Shuqi Lu", "authorids": "~Zhao_jiale1;~Wanru_Zhuang1;~Jia_Song5;~Yaqi_Li2;~Shuqi_Lu1", "gender": "M;F;F;;F", "homepage": "https://www.researchgate.net/profile/Jiale-Zhao-2;https://github.com/ABOWLofFish;;;", "dblp": ";;;;245/1806", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;", "orcid": ";;;0000-0002-3010-2599;", "linkedin": ";;;;", "or_profile": "~Zhao_jiale1;~Wanru_Zhuang1;~Jia_Song5;~Yaqi_Li2;~Shuqi_Lu1", "aff": "University of Chinese Academy of Sciences;Xiamen University;Xiamen University;Hunan Normal University;DP Technology", "aff_domain": "ucas.edu;xmu.edu.cn;xmu.edu.cn;hunnu.edu.cn;dp.tech", "position": "PhD student;Undergrad student;MS student;PhD student;Researcher", "bibtex": "@inproceedings{\njiale2024pretraining,\ntitle={Pre-Training Protein Bi-level Representation Through Span Mask Strategy On 3D Protein Chains},\nauthor={Zhao jiale and Wanru Zhuang and Jia Song and Yaqi Li and Shuqi Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qY63FnLuJ1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2052529, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5047191332467494018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ucas.edu;xmu.edu.cn;xmu.edu.cn;hunnu.edu.cn;dp.tech", "author_num": 5, "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "University of Chinese Academy of Sciences;Xiamen University;Hunan Normal University;DP Technology", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.ucas.ac.cn;https://www.xmu.edu.cn;http://www.hnu.edu.cn;", "aff_unique_abbr": "UCAS;XMU;HNU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China;" }, { "title": "PriorBoost: An Adaptive Algorithm for Learning from Aggregate Responses", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32985", "id": "qawwyKqOkj", "proceeding": "https://proceedings.mlr.press/v235/javanmard24a.html", "pdf": "https://openreview.net/pdf?id=qawwyKqOkj", "openreview": 
"https://openreview.net/forum?id=qawwyKqOkj", "author_site": "Adel Javanmard, Matthew Fahrbach, Vahab Mirrokni", "tldr": "", "abstract": "This work studies algorithms for learning from aggregate responses. We focus on the construction of aggregation sets (called *bags* in the literature) for event-level loss functions. We prove for linear regression and generalized linear models (GLMs) that the optimal bagging problem reduces to one-dimensional size-constrained $k$-means clustering. Further, we theoretically quantify the advantage of using curated bags over random bags. We then propose the $\\texttt{PriorBoost}$ algorithm, which adaptively forms bags of samples that are increasingly homogeneous with respect to (unobserved) individual responses to improve model quality. We study label differential privacy for aggregate learning, and we also provide extensive experiments showing that $\\texttt{PriorBoost}$ regularly achieves optimal model quality for event-level predictions, in stark contrast to non-adaptive algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adel Javanmard;Matthew Fahrbach;Vahab Mirrokni", "authorids": "~Adel_Javanmard1;~Matthew_Fahrbach1;~Vahab_Mirrokni2", "gender": ";;M", "homepage": "https://faculty.marshall.usc.edu/Adel-Javanmard/;;https://people.csail.mit.edu/mirrokni/Welcome.html", "dblp": "96/8072;;m/VahabSMirrokni", "google_scholar": "cNSbfGQAAAAJ;;opbZfw0AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Adel_Javanmard1;~Matthew_Fahrbach1;~Vahab_Mirrokni2", "aff": "University of Southern California;;Google Research", "aff_domain": "usc.edu;;google.com", "position": "Full Professor;;VP, Google Fellow", "bibtex": "@inproceedings{\njavanmard2024priorboost,\ntitle={PriorBoost: An Adaptive Algorithm for Learning from Aggregate Responses},\nauthor={Adel Javanmard and Matthew Fahrbach and Vahab Mirrokni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qawwyKqOkj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1469585, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10729653678747869805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "usc.edu;;google.com", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of Southern California;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.usc.edu;https://research.google", "aff_unique_abbr": "USC;Google Research", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Los Angeles;Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Provably Efficient Reinforcement Learning for Adversarial Restless Multi-Armed Bandits with Unknown Transitions and Bandit Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32984", "id": "qbIKUfastZ", "proceeding": "https://proceedings.mlr.press/v235/xiong24b.html", "pdf": "https://openreview.net/pdf?id=qbIKUfastZ", "openreview": "https://openreview.net/forum?id=qbIKUfastZ", "author_site": "GUOJUN XIONG, Jian Li", "tldr": "", "abstract": "Restless multi-armed bandits (RMAB) play a central role in modeling sequential decision making problems under an instantaneous activation constraint that at most $B$ arms can be activated at any decision epoch. 
Each restless arm is endowed with a state that evolves independently according to a Markov decision process regardless of being activated or not. In this paper, we consider the task of learning in episodic RMAB with unknown transition functions, bandit feedback, and adversarial rewards, which can change arbitrarily across episodes. The goal of the decision maker is to maximize its total adversarial rewards during the learning process while the instantaneous activation constraint must be satisfied in each decision epoch. We develop a novel reinforcement learning algorithm with two key components: a novel biased adversarial reward estimator to deal with bandit feedback and unknown transitions, and a low-complexity index policy to satisfy the instantaneous activation constraint. We show an $\\tilde{\\mathcal{O}}(H\\sqrt{T})$ regret bound for our algorithm, where $T$ is the number of episodes and $H$ is the episode length. To the best of our knowledge, this is the first algorithm to ensure $\\tilde{\\mathcal{O}}(\\sqrt{T})$ regret for adversarial RMAB in our considered challenging settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "GUOJUN XIONG;Jian Li", "authorids": "~GUOJUN_XIONG1;~Jian_Li14", "gender": ";M", "homepage": "https://xionggj001.github.io/;https://sites.google.com/stonybrook.edu/jianli", "dblp": "214/2134.html;33/5448-8", "google_scholar": "FIBwLnoAAAAJ;h039Yq4AAAAJ", "orcid": ";", "linkedin": "guojun-%E5%9B%BD%E9%92%A7-xiong-48696aa6/;", "or_profile": "~GUOJUN_XIONG1;~Jian_Li14", "aff": "State University of New York at Stony Brook;State University of New York at Stony Brook", "aff_domain": "stonybrook.edu;stonybrook.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxiong2024provably,\ntitle={Provably Efficient Reinforcement Learning for Adversarial Restless Multi-Armed Bandits with Unknown Transitions and Bandit Feedback},\nauthor={GUOJUN XIONG and Jian Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qbIKUfastZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 585275, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16357570866714045914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stonybrook.edu;stonybrook.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "State University of New York at Stony Brook", "aff_unique_dep": "", "aff_unique_url": "https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Memory Consolidation Enables Long-Context Video Understanding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32983", "id": "qeFgvVVAJ2", "proceeding": "https://proceedings.mlr.press/v235/balazevic24a.html", "pdf": "https://openreview.net/pdf?id=qeFgvVVAJ2", "openreview": "https://openreview.net/forum?id=qeFgvVVAJ2", "author_site": "Ivana Balazevic, Yuge Shi, Pinelopi Papalampidi, Rahma Chaabouni, Skanda Koppula, Olivier Henaff", "tldr": "", "abstract": "Most transformer-based video encoders are limited to short temporal contexts due to their quadratic complexity. 
While various attempts have been made to extend this context, this has often come at the cost of both conceptual and computational complexity. We propose to instead re-purpose existing pre-trained video transformers by simply fine-tuning them to attend to memories derived non-parametrically from past activations. By leveraging redundancy reduction, our memory-consolidated vision transformer (MC-ViT) effortlessly extends its context far into the past and exhibits excellent scaling behavior when learning from longer videos. In doing so, MC-ViT sets a new state-of-the-art in long-context video understanding on EgoSchema, Perception Test, and Diving48, outperforming methods that benefit from orders of magnitude more parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ivana Balazevic;Yuge Shi;Pinelopi Papalampidi;Rahma Chaabouni;Skanda Koppula;Olivier J Henaff", "authorids": "~Ivana_Balazevic1;~Yuge_Shi2;~Pinelopi_Papalampidi1;~Rahma_Chaabouni1;~Skanda_Koppula1;~Olivier_J_Henaff1", "gender": "F;;F;F;;", "homepage": "https://ibalazevic.github.io/;;https://ppapalampidi.github.io/;;;https://www.olivierhenaff.com/", "dblp": "185/0837;;203/9741.html;;;156/0035.html", "google_scholar": "CnxZPkkAAAAJ;;https://scholar.google.gr/citations?user=3VE4eWAAAAAJ;https://scholar.google.com/citations?hl=fr;;Sx75CVsAAAAJ", "orcid": ";;;;;0000-0001-8183-9489", "linkedin": ";;;;;", "or_profile": "~Ivana_Balazevic1;~Yuge_Shi2;~Pinelopi_Papalampidi1;~Rahma_Chaabouni1;~Skanda_Koppula1;~Olivier_J_Henaff1", "aff": "Google DeepMind;;Google;Google;;Google DeepMind", "aff_domain": "google.com;;google.com;google.com;;google.com", "position": "Research Scientist;;Researcher;Researcher;;Research Scientist", "bibtex": "@inproceedings{\nbalazevic2024memory,\ntitle={Memory Consolidation Enables Long-Context Video Understanding},\nauthor={Ivana Balazevic and Yuge Shi and Pinelopi Papalampidi and Rahma Chaabouni and Skanda Koppula and Olivier J Henaff},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qeFgvVVAJ2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 530741, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14827503423574118963&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "google.com;;google.com;google.com;;google.com", "author_num": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "On the Emergence of Cross-Task Linearity in Pretraining-Finetuning Paradigm", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32982", "id": "qg6AlnpEQH", "proceeding": "https://proceedings.mlr.press/v235/zhou24e.html", "pdf": "https://openreview.net/pdf?id=qg6AlnpEQH", "openreview": "https://openreview.net/forum?id=qg6AlnpEQH", "author_site": "Zhanpeng Zhou, Zijun Chen, Yilan Chen, Bo Zhang, Junchi Yan", "tldr": "", "abstract": "The pretraining-finetuning paradigm has become the prevailing trend in modern deep learning. 
In this work, we discover an intriguing linear phenomenon in models that are initialized from a common pretrained checkpoint and finetuned on different tasks, termed Cross-Task Linearity (CTL). Specifically, we show that if we linearly interpolate the weights of two finetuned models, the features in the weight-interpolated model are often approximately equal to the linear interpolation of the features in the two finetuned models at each layer. We provide comprehensive empirical evidence that CTL consistently occurs for finetuned models that start from the same pretrained checkpoint. We conjecture that in the pretraining-finetuning paradigm, neural networks approximately function as linear maps, mapping from the parameter space to the feature space. Based on this viewpoint, our study unveils novel insights into explaining model merging/editing, particularly by translating operations from the parameter space to the feature space. Furthermore, we delve deeper into the root cause of the emergence of CTL, highlighting the role of pretraining.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhanpeng Zhou;Zijun Chen;Yilan Chen;Bo Zhang;Junchi Yan", "authorids": "~Zhanpeng_Zhou1;~Zijun_Chen1;~Yilan_Chen1;~Bo_Zhang17;~Junchi_Yan2", "gender": "M;;M;M;M", "homepage": "https://zzp1012.github.io/;https://github.com/zijunchen68;https://yilanchen6.github.io/;https://bobrown.github.io/boZhang.github.io/;http://thinklab.sjtu.edu.cn/", "dblp": ";;167/6638-2.html;36/2259-69;60/7949.html", "google_scholar": "idxXY3UAAAAJ;;6wmzpRIAAAAJ;https://scholar.google.com/citations?hl=en;ga230VoAAAAJ", "orcid": ";;;0000-0001-8052-782X;0000-0001-9639-7679", "linkedin": ";;;;", "or_profile": "~Zhanpeng_Zhou1;~Zijun_Chen1;~Yilan_Chen1;~Bo_Zhang17;~Junchi_Yan1", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;University of California, San Diego;Shanghai Artificial Intelligence Laboratory;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;ucsd.edu;pjlab.org.cn;sjtu.edu.cn", "position": "PhD student;MS student;PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nzhou2024on,\ntitle={On the Emergence of Cross-Task Linearity in Pretraining-Finetuning Paradigm},\nauthor={Zhanpeng Zhou and Zijun Chen and Yilan Chen and Bo Zhang and Junchi Yan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qg6AlnpEQH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8362439, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3823355429181498621&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "sjtu.edu.cn;sjtu.edu.cn;ucsd.edu;pjlab.org.cn;sjtu.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Shanghai Jiao Tong University;University of California, San Diego;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ucsd.edu;http://www.shailab.org/", "aff_unique_abbr": "SJTU;UCSD;Shanghai AI Lab", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Transformers Provably Learn Sparse Token Selection While Fully-Connected Nets Cannot", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32981", "id": "qjqlhWDcId", "proceeding": 
"https://proceedings.mlr.press/v235/wang24ca.html", "pdf": "https://openreview.net/pdf?id=qjqlhWDcId", "openreview": "https://openreview.net/forum?id=qjqlhWDcId", "author_site": "Zixuan Wang, Stanley Wei, Daniel Hsu, Jason Lee", "tldr": "", "abstract": "The transformer architecture has prevailed in various deep learning settings due to its exceptional capabilities to select and compose structural information. Motivated by these capabilities, Sanford et al. (2023) proposed the *sparse token selection* task, in which transformers excel while fully-connected networks (FCNs) fail in the worst case. Building upon that, we strengthen the FCN lower bound to an average-case setting and establish an algorithmic separation of transformers over FCNs. Specifically, a one-layer transformer trained with gradient descent provably learns the sparse token selection task and, surprisingly, exhibits strong out-of-distribution length generalization. We provide empirical simulations to justify our theoretical findings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zixuan Wang;Stanley Wei;Daniel Hsu;Jason D. Lee", "authorids": "~Zixuan_Wang4;~Stanley_Wei1;~Daniel_Hsu1;~Jason_D._Lee1", "gender": "M;M;M;M", "homepage": "https://zixuan-wang-dlt.github.io;;https://www.cs.columbia.edu/~djhsu/;https://jasondlee88.github.io/", "dblp": ";;h/DanielHsu.html;88/3262", "google_scholar": "vNJDZyEAAAAJ;;Bp6tvy0AAAAJ;GR_DsT0AAAAJ", "orcid": ";;0000-0002-3495-7113;", "linkedin": ";stanley-wei-99ab98199/;;", "or_profile": "~Zixuan_Wang4;~Stanley_Wei1;~Daniel_Hsu1;~Jason_D._Lee1", "aff": "Princeton University;Princeton University;Columbia University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;columbia.edu;princeton.edu", "position": "PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024transformers,\ntitle={Transformers Provably Learn Sparse Token Selection While Fully-Connected Nets Cannot},\nauthor={Zixuan Wang and Stanley Wei and Daniel Hsu and Jason D. 
Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qjqlhWDcId}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5623784, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17032285168634088439&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu;columbia.edu;princeton.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Princeton University;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.columbia.edu", "aff_unique_abbr": "Princeton;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "From Words to Actions: Unveiling the Theoretical Underpinnings of LLM-Driven Autonomous Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32980", "id": "qkhbyDqlNI", "proceeding": "https://proceedings.mlr.press/v235/he24a.html", "pdf": "https://openreview.net/pdf?id=qkhbyDqlNI", "openreview": "https://openreview.net/forum?id=qkhbyDqlNI", "author_site": "Jianliang He, Siyu Chen, Fengzhuo Zhang, Zhuoran Yang", "tldr": "", "abstract": "In this work, from a theoretical lens, we aim to understand why large language model (LLM) empowered agents are able to solve decision-making problems in the physical world. To this end, consider a hierarchical reinforcement learning (RL) model where the LLM Planner and the Actor perform high-level task planning and low-level execution, respectively. Under this model, the LLM Planner navigates a partially observable Markov decision process (POMDP) by iteratively generating language-based subgoals via prompting. Under proper assumptions on the pretraining data, we prove that the pretrained LLM Planner effectively performs Bayesian aggregated imitation learning (BAIL) through in-context learning. Additionally, we highlight the necessity for exploration beyond the subgoals derived from BAIL by proving that naively executing the subgoals returned by LLM leads to a linear regret. As a remedy, we introduce an $\\epsilon$-greedy exploration strategy to BAIL, which is proven to incur sublinear regret when the pretraining error is small. 
Finally, we extend our theoretical framework to include scenarios where the LLM Planner serves as a world model for inferring the transition model of the environment and to multi-agent settings, enabling coordination among multiple Actors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianliang He;Siyu Chen;Fengzhuo Zhang;Zhuoran Yang", "authorids": "~Jianliang_He1;~Siyu_Chen2;~Fengzhuo_Zhang1;~Zhuoran_Yang1", "gender": "M;M;M;M", "homepage": "https://no.io/;https://github.com/FFishy-git/FFishy-git.github.io;;https://zhuoranyang.github.io/", "dblp": ";;254/1627;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;%E4%B8%B0%E5%8D%93-%E5%BC%A0-4576a5135/;", "or_profile": "~Jianliang_He1;~Siyu_Chen2;~Fengzhuo_Zhang1;~Zhuoran_Yang1", "aff": "Fudan University;Yale University;National University of Singapore;Yale University", "aff_domain": "fudan.edu.cn;yale.edu;nus.edu;yale.edu", "position": "Undergrad student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nhe2024from,\ntitle={From Words to Actions: Unveiling the Theoretical Underpinnings of {LLM}-Driven Autonomous Systems},\nauthor={Jianliang He and Siyu Chen and Fengzhuo Zhang and Zhuoran Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qkhbyDqlNI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1851106, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1224384622726060419&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "fudan.edu.cn;yale.edu;nus.edu;yale.edu", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Fudan University;Yale University;National University of Singapore", "aff_unique_dep": ";;", "aff_unique_url": "https://www.fudan.edu.cn;https://www.yale.edu;https://www.nus.edu.sg", "aff_unique_abbr": "Fudan;Yale;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "China;United States;Singapore" }, { "title": "Implicit Regularization in Feedback Alignment Learning Mechanisms for Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32979", "id": "qklMNNub0H", "proceeding": "https://proceedings.mlr.press/v235/robertson24a.html", "pdf": "https://openreview.net/pdf?id=qklMNNub0H", "openreview": "https://openreview.net/forum?id=qklMNNub0H", "author_site": "Zach Robertson, Sanmi Koyejo", "tldr": "", "abstract": "Feedback Alignment (FA) methods are biologically inspired local learning rules for training neural networks with reduced communication between layers. While FA has potential applications in distributed and privacy-aware ML, limitations in multi-class classification and lack of theoretical understanding of the alignment mechanism have constrained its impact. This study introduces a unified framework elucidating the operational principles behind alignment in FA. Our key contributions include: (1) a novel conservation law linking changes in synaptic weights to implicit regularization that maintains alignment with the gradient, with support from experiments, (2) sufficient conditions for convergence based on the concept of alignment dominance, and (3) empirical analysis showing better alignment can enhance FA performance on complex multi-class tasks. 
Overall, these theoretical and practical advancements improve the interpretability of bio-plausible learning rules and lay the groundwork for developing enhanced FA algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zachary Robertson;Sanmi Koyejo", "authorids": "~Zachary_Robertson1;~Sanmi_Koyejo1", "gender": "M;M", "homepage": ";https://cs.stanford.edu/~sanmi/", "dblp": "271/8503.html;14/8885", "google_scholar": "769PIisAAAAJ;EaaOeJwAAAAJ", "orcid": ";0000-0002-4023-419X", "linkedin": "zrobertson466920/;sanmi-koyejo-984754/", "or_profile": "~Zachary_Robertson1;~Oluwasanmi_O_Koyejo1", "aff": "Stanford University;Google", "aff_domain": "stanford.edu;google.com", "position": "PhD student;Research Scientist", "bibtex": "@inproceedings{\nrobertson2024implicit,\ntitle={Implicit Regularization in Feedback Alignment Learning Mechanisms for Neural Networks},\nauthor={Zachary Robertson and Sanmi Koyejo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qklMNNub0H}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1113584, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0NlNJ64hZh8J:scholar.google.com/&scioq=Implicit+Regularization+in+Feedback+Alignment+Learning+Mechanisms+for+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "stanford.edu;google.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Multi-Source Conformal Inference Under Distribution Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32978", "id": "qmUbSAgz08", "proceeding": "https://proceedings.mlr.press/v235/liu24ag.html", "pdf": "https://openreview.net/pdf?id=qmUbSAgz08", "openreview": "https://openreview.net/forum?id=qmUbSAgz08", "author_site": "Yi Liu, Alexander Levis, Sharon-Lise Normand, Larry Han", "tldr": "", "abstract": "Recent years have seen increasing utilization of complex machine learning models across multiple sources of data to inform more generalizable decision-making. However, distribution shifts across data sources and privacy concerns related to sharing individual-level data, coupled with a lack of uncertainty quantification from machine learning predictions, make it challenging to achieve valid inferences in multi-source environments. In this paper, we consider the problem of obtaining distribution-free prediction intervals for a target population, leveraging multiple potentially biased data sources. We derive the efficient influence functions for the quantiles of unobserved outcomes in the target and source populations, and show that one can incorporate machine learning prediction algorithms in the estimation of nuisance functions while still achieving parametric rates of convergence to nominal coverage probabilities. Moreover, when conditional outcome invariance is violated, we propose a data-adaptive strategy to upweight informative data sources for efficiency gain and downweight non-informative data sources for bias reduction. 
We highlight the robustness and efficiency of our proposals for a variety of conformal scores and data-generating mechanisms via extensive synthetic experiments. Hospital length of stay prediction intervals for pediatric patients undergoing a high-risk cardiac surgical procedure between 2016 and 2022 in the U.S. illustrate the utility of our methodology.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Liu;Alexander Levis;Sharon-Lise Normand;Larry Han", "authorids": "~Yi_Liu49;alevis@cmu.edu;larryhan@fas.harvard.edu;~Larry_Han1", "gender": "M;;;M", "homepage": "https://yiliu1998.github.io/;;;https://larrylehan.github.io/", "dblp": ";;;", "google_scholar": "ARA2oPcAAAAJ;;;iZnTmxMAAAAJ", "orcid": "0000-0002-0935-007X;;;0000-0002-0577-9661", "linkedin": "yiliu98/;;;", "or_profile": "~Yi_Liu49;alevis@cmu.edu;larryhan@fas.harvard.edu;~Larry_Han1", "aff": "North Carolina State University;;;Northeastern University", "aff_domain": "ncsu.edu;;;northeastern.edu", "position": "PhD student;;;Assistant Professor", "bibtex": "@inproceedings{\nliu2024multisource,\ntitle={Multi-Source Conformal Inference Under Distribution Shift},\nauthor={Yi Liu and Alexander Levis and Sharon-Lise Normand and Larry Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qmUbSAgz08}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7268013, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7710826245960494103&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "ncsu.edu;;;northeastern.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "North Carolina State University;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ncsu.edu;https://www.northeastern.edu", "aff_unique_abbr": "NCSU;NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Manifold Integrated Gradients: Riemannian Geometry for Feature Attribution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32977", "id": "qoOt02l2WC", "proceeding": "https://proceedings.mlr.press/v235/zaher24a.html", "pdf": "https://openreview.net/pdf?id=qoOt02l2WC", "openreview": "https://openreview.net/forum?id=qoOt02l2WC", "author_site": "Eslam Zaher, Maciej Trzaskowski, Quan Nguyen, Fred Roosta", "tldr": "", "abstract": "In this paper, we dive into the reliability concerns of Integrated Gradients (IG), a prevalent feature attribution method for black-box deep learning models. We particularly address two predominant challenges associated with IG: the generation of noisy feature visualizations for vision models and the vulnerability to adversarial attributional attacks. Our approach involves an adaptation of path-based feature attribution, aligning the path of attribution more closely to the intrinsic geometry of the data manifold. Our experiments utilise deep generative models applied to several real-world image datasets. 
They demonstrate that IG along the geodesics conforms to the curved geometry of the Riemannian data manifold, generating more perceptually intuitive explanations and, subsequently, substantially increasing robustness to targeted attributional attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eslam Zaher;Maciej Trzaskowski;Quan Nguyen;Fred Roosta", "authorids": "~Eslam_Zaher1;m.trzaskowski@uq.edu.au;quan.nguyen@imb.uq.edu.au;~Fred_Roosta1", "gender": "M;;;M", "homepage": "https://cires.org.au;;;https://people.smp.uq.edu.au/FredRoosta/", "dblp": "377/5982;;;133/8630", "google_scholar": "https://scholar.google.com.au/citations?user=xbVfDVsAAAAJ;;;https://scholar.google.com/citations?hl=en", "orcid": "0009-0000-5058-7856;;;", "linkedin": "eslam-zaher/;;;", "or_profile": "~Eslam_Zaher1;m.trzaskowski@uq.edu.au;quan.nguyen@imb.uq.edu.au;~Fred_Roosta1", "aff": "University of Queensland;;;University of Queensland", "aff_domain": "uq.edu.au;;;uq.edu.au", "position": "PhD student;;;Associate Professor", "bibtex": "@inproceedings{\nzaher2024manifold,\ntitle={Manifold Integrated Gradients: Riemannian Geometry for Feature Attribution},\nauthor={Eslam Zaher and Maciej Trzaskowski and Quan Nguyen and Fred Roosta},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qoOt02l2WC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2446815, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3097343657411438409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uq.edu.au;;;uq.edu.au", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Queensland", "aff_unique_dep": "", "aff_unique_url": "https://www.uq.edu.au", "aff_unique_abbr": "UQ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "An Empirical Study Into What Matters for Calibrating Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32976", "id": "qoxuPshrZb", "proceeding": "https://proceedings.mlr.press/v235/tu24a.html", "pdf": "https://openreview.net/pdf?id=qoxuPshrZb", "openreview": "https://openreview.net/forum?id=qoxuPshrZb", "author_site": "Weijie Tu, Weijian Deng, Dylan Campbell, Stephen Gould, Tom Gedeon", "tldr": "", "abstract": "Vision-Language Models (VLMs) have emerged as the dominant approach for zero-shot recognition, adept at handling diverse scenarios and significant distribution changes. However, their deployment in risk-sensitive areas requires a deeper understanding of their uncertainty estimation capabilities, a relatively uncharted area. In this study, we explore the calibration properties of VLMs across different architectures, datasets, and training strategies. In particular, we analyze the uncertainty estimation performance of VLMs when calibrated in one domain, label set or hierarchy level, and tested in a different one. Our findings reveal that while VLMs are not inherently calibrated for uncertainty, temperature scaling significantly and consistently improves calibration, even across shifts in distribution and changes in label set. Moreover, VLMs can be calibrated with a very small set of examples. 
Through detailed experimentation, we highlight the potential applications and importance of our insights, aiming for more reliable and effective use of VLMs in critical, real-world scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weijie Tu;Weijian Deng;Dylan Campbell;Stephen Gould;Tom Gedeon", "authorids": "~Weijie_Tu1;~Weijian_Deng1;~Dylan_Campbell1;~Stephen_Gould1;~Tom_Gedeon1", "gender": "M;M;M;M;M", "homepage": ";http://weijiandeng.xyz;https://sites.google.com/view/djcampbell;http://users.cecs.anu.edu.au/~sgould/;https://cs.anu.edu.au/people/Tom.Gedeon/", "dblp": "344/1001;198/1517;139/6663;89/1569.html;g/TamasDGedeon.html", "google_scholar": ";https://scholar.google.com.hk/citations?user=lReHnAEAAAAJ;https://scholar.google.com.au/citations?user=FayBF1AAAAAJ;YvdzeM8AAAAJ;https://scholar.google.com.tw/citations?user=lPTjWIkAAAAJ", "orcid": ";;0000-0002-4717-6850;0000-0001-8929-7899;0000-0001-8356-4909", "linkedin": "weijie-tu;;;;tom-gedeon", "or_profile": "~Weijie_Tu1;~Weijian_Deng1;~Dylan_Campbell1;~Stephen_Gould1;~Tom_Gedeon1", "aff": "Australian National University;Australian National University;Australian National University;Australian National University;Curtin University of Technology", "aff_domain": "anu.edu.au;anu.edu.au;anu.edu.au;anu.edu.au;curtin.edu.au", "position": "PhD student;Postdoc;Lecturer;Full Professor;Full Professor", "bibtex": "@inproceedings{\ntu2024an,\ntitle={An Empirical Study Into What Matters for Calibrating Vision-Language Models},\nauthor={Weijie Tu and Weijian Deng and Dylan Campbell and Stephen Gould and Tom Gedeon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qoxuPshrZb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2166897, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5538966444686904030&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "anu.edu.au;anu.edu.au;anu.edu.au;anu.edu.au;curtin.edu.au", "author_num": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Australian National University;Curtin University", "aff_unique_dep": ";", "aff_unique_url": "https://www.anu.edu.au;https://www.curtin.edu.au", "aff_unique_abbr": "ANU;Curtin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Learning High-Frequency Functions Made Easy with Sinusoidal Positional Encoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32975", "id": "qqPL0DkcrI", "proceeding": "https://proceedings.mlr.press/v235/sun24m.html", "pdf": "https://openreview.net/pdf?id=qqPL0DkcrI", "openreview": "https://openreview.net/forum?id=qqPL0DkcrI", "author_site": "Chuanhao Sun, Zhihang Yuan, Kai Xu, Luo Mai, Siddharth N, Shuo Chen, Mahesh Marina", "tldr": "", "abstract": "Fourier features based positional encoding (PE) is commonly used in machine learning tasks that involve learning high-frequency features from low-dimensional inputs, such as 3D view synthesis and time series regression with neural tangent kernels. Despite their effectiveness, existing PEs require manual, empirical adjustment of crucial hyperparameters, specifically the Fourier features, tailored to each unique task. Further, PEs face challenges in efficiently learning high-frequency functions, particularly in tasks with limited data. 
In this paper, we introduce sinusoidal PE (SPE), designed to efficiently learn adaptive frequency features closely aligned with the true underlying function. Our experiments demonstrate that SPE, without hyperparameter tuning, consistently achieves enhanced fidelity and faster training across various tasks, including 3D view synthesis, Text-to-Speech generation, and 1D regression. SPE is implemented as a direct replacement for existing PEs. Its plug-and-play nature lets numerous tasks easily adopt and benefit from SPE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chuanhao Sun;Zhihang Yuan;Kai Xu;Luo Mai;Siddharth N;Shuo Chen;Mahesh K. Marina", "authorids": "chuanhao.sun@ed.ac.uk;~Zhihang_Yuan4;~Kai_Xu4;~Luo_Mai1;~Siddharth_N1;s1931698@ed.ac.uk;~Mahesh_K._Marina1", "gender": ";M;M;M;M;;M", "homepage": ";;https://xuk.ai;https://luomai.github.io;https://homepages.inf.ed.ac.uk/snaraya3/;;https://homepages.inf.ed.ac.uk/mmarina/", "dblp": ";;;;67/8366;;71/2142", "google_scholar": ";;https://scholar.google.ca/citations?user=kf3C60wAAAAJ;;V7D7hxMAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;0000-0003-4911-7333;;", "linkedin": ";zhihang-yuan-744456213/;;;;;mahesh-marina-22629a23/", "or_profile": "chuanhao.sun@ed.ac.uk;~Zhihang_Yuan4;~Kai_Xu4;~Luo_Mai1;~Siddharth_N1;s1931698@ed.ac.uk;~Mahesh_K._Marina1", "aff": ";Edinburgh University, University of Edinburgh;Amazon;University of Edinburgh;University of Edinburgh;;Edinburgh University, University of Edinburgh", "aff_domain": ";inf.ed.ac.uk;amazon.com;ed.ac.uk;ed.ac.uk;;inf.ed.ac.uk", "position": ";MS student;Research scientist;Assistant Professor;Reader (Associate Professor);;Full Professor", "bibtex": "@inproceedings{\nsun2024learning,\ntitle={Learning High-Frequency Functions Made Easy with Sinusoidal Positional Encoding},\nauthor={Chuanhao Sun and Zhihang Yuan and Kai Xu and Luo Mai and Siddharth N and Shuo Chen and Mahesh K. Marina},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qqPL0DkcrI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 989668, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16707694854586050948&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";inf.ed.ac.uk;amazon.com;ed.ac.uk;ed.ac.uk;;inf.ed.ac.uk", "author_num": 7, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Edinburgh;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.ed.ac.uk;https://www.amazon.com", "aff_unique_abbr": "Edinburgh;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Latent Space Symmetry Discovery", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32974", "id": "qstt2OguvM", "proceeding": "https://proceedings.mlr.press/v235/yang24g.html", "pdf": "https://openreview.net/pdf?id=qstt2OguvM", "openreview": "https://openreview.net/forum?id=qstt2OguvM", "author_site": "Jianke Yang, Nima Dehmamy, Robin Walters, Rose Yu", "tldr": "", "abstract": "Equivariant neural networks require explicit knowledge of the symmetry group. Automatic symmetry discovery methods aim to relax this constraint and learn invariance and equivariance from data. 
However, existing symmetry discovery methods are limited to simple linear symmetries and cannot handle the complexity of real-world data. We propose a novel generative model, Latent LieGAN (LaLiGAN), which can discover symmetries of nonlinear group actions. It learns a mapping from the data space to a latent space where the symmetries become linear and simultaneously discovers symmetries in the latent space. Theoretically, we show that our model can express nonlinear symmetries under some conditions on the group action. Experimentally, we demonstrate that our method can accurately discover the intrinsic symmetry in high-dimensional dynamical systems. LaLiGAN also results in a well-structured latent space that is useful for downstream tasks including equation discovery and long-term forecasting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianke Yang;Nima Dehmamy;Robin Walters;Rose Yu", "authorids": "~Jianke_Yang2;~Nima_Dehmamy1;~Robin_Walters1;~Rose_Yu1", "gender": ";M;M;F", "homepage": "https://jiankeyang.github.io;;http://www.robinwalters.com;http://roseyu.com", "dblp": "50/2341;198/1338;258/3416;164/7314", "google_scholar": "https://scholar.google.com/citations?hl=en;gvHpUtgAAAAJ;fnprJmUAAAAJ;", "orcid": ";0000-0003-1617-5502;;", "linkedin": ";nima-dehmamy-57770a4a/;;", "or_profile": "~Jianke_Yang2;~Nima_Dehmamy1;~Robin_Walters1;~Rose_Yu1", "aff": "University of California, San Diego;International Business Machines;Northeastern University ;University of California, San Diego", "aff_domain": "ucsd.edu;ibm.com;northeastern.edu;ucsd.edu", "position": "PhD student;Researcher;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024latent,\ntitle={Latent Space Symmetry Discovery},\nauthor={Jianke Yang and Nima Dehmamy and Robin Walters and Rose Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qstt2OguvM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3786695, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8366021739882753992&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "email": "ucsd.edu;ibm.com;northeastern.edu;ucsd.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, San Diego;International Business Machines Corporation;Northeastern University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsd.edu;https://www.ibm.com;https://www.northeastern.edu", "aff_unique_abbr": "UCSD;IBM;NEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Multi-Agent Reinforcement Learning Meets Leaf Sequencing in Radiotherapy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32973", "id": "qwKSTLbati", "proceeding": "https://proceedings.mlr.press/v235/gao24g.html", "pdf": "https://openreview.net/pdf?id=qwKSTLbati", "openreview": "https://openreview.net/forum?id=qwKSTLbati", "author_site": "Riqiang Gao, Florin-Cristian Ghesu, Simon Arberet, Shahab Basiri, Esa Kuusela, Martin Kraus, Dorin Comaniciu, Ali Kamen", "tldr": "", "abstract": "In contemporary radiotherapy planning (RTP), a key module, leaf sequencing, is predominantly addressed by optimization-based approaches. 
In this paper, we propose a novel deep reinforcement learning (DRL) model termed *Reinforced Leaf Sequencer* (RLS) in a multi-agent framework for leaf sequencing. The RLS model offers improvements to time-consuming iterative optimization steps via large-scale training and can control movement patterns through the design of reward mechanisms. We have conducted experiments on four datasets with four metrics and compared our model with a leading optimization sequencer. Our findings reveal that the proposed RLS model can achieve reduced fluence reconstruction errors and potentially faster convergence when integrated into an optimization planner. Additionally, RLS has shown promising results in a full artificial intelligence RTP pipeline. We hope this pioneering multi-agent RL leaf sequencer can foster future research on machine learning for RTP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Riqiang Gao;Florin-Cristian Ghesu;Simon Arberet;Shahab Basiri;Esa Kuusela;Martin Kraus;Dorin Comaniciu;Ali Kamen", "authorids": "~Riqiang_Gao1;~Florin-Cristian_Ghesu1;simon.arberet@siemens-healthineers.com;shahab.basiri@varian.com;esa.kuusela@varian.com;martin_kraus@siemens-healthineers.com;~Dorin_Comaniciu1;~Ali_Kamen2", "gender": "M;M;;;;;M;M", "homepage": "https://riqianggao.github.io/;;;;;;https://comaniciu.net/;", "dblp": "169/7226;;;;;;54/4552;38/8280", "google_scholar": "VjI_dtUAAAAJ;https://scholar.google.co.uk/citations?user=Z1-KZ8RoM6YC;;;;;-XZ2HrAAAAAJ;j41ocikAAAAJ", "orcid": "0000-0002-8729-1941;;;;;;0000-0002-5238-8647;", "linkedin": "riqiang-gao-97223b119/;;;;;;dorincomaniciu;ali-kamen-3715771/", "or_profile": "~Riqiang_Gao1;~Florin-Cristian_Ghesu1;simon.arberet@siemens-healthineers.com;shahab.basiri@varian.com;esa.kuusela@varian.com;martin_kraus@siemens-healthineers.com;~Dorin_Comaniciu1;~Ali_Kamen2", "aff": "Siemens Healthineers;Siemens Healthineers;;;;;Siemens Healthineers;Siemens Healthineers", "aff_domain": "siemens-healthineers.com;siemens-healthineers.com;;;;;siemens-healthineers.com;siemens-healthineers.com", "position": "Scientist;AI Research Scientist;;;;;SVP AI and Digital innovation;Principal Researcher", "bibtex": "@inproceedings{\ngao2024multiagent,\ntitle={Multi-Agent Reinforcement Learning Meets Leaf Sequencing in Radiotherapy},\nauthor={Riqiang Gao and Florin-Cristian Ghesu and Simon Arberet and Shahab Basiri and Esa Kuusela and Martin Kraus and Dorin Comaniciu and Ali Kamen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qwKSTLbati}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4317686, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7703581346806230993&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "siemens-healthineers.com;siemens-healthineers.com;;;;;siemens-healthineers.com;siemens-healthineers.com", "author_num": 8, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Siemens Healthineers", "aff_unique_dep": "", "aff_unique_url": "https://www.siemens-healthineers.com", "aff_unique_abbr": "Siemens Healthineers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "$S^2$IP-LLM: Semantic Space Informed Prompt Learning with LLM for Time Series Forecasting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32972", "id": 
"qwQVV5R8Y7", "proceeding": "https://proceedings.mlr.press/v235/pan24c.html", "pdf": "https://openreview.net/pdf?id=qwQVV5R8Y7", "openreview": "https://openreview.net/forum?id=qwQVV5R8Y7", "author_site": "Zijie Pan, Yushan Jiang, Sahil Garg, Anderson Schneider, Yuriy Nevmyvaka, Dongjin Song", "tldr": "", "abstract": "Recently, there has been a growing interest in leveraging pre-trained large language models (LLMs) for various time series applications. However, the semantic space of LLMs, established through the pre-training, is still underexplored and may help yield more distinctive and informative representations to facilitate time series forecasting. To this end, we propose Semantic Space Informed Prompt learning with LLM ($S^2$IP-LLM) to align the pre-trained semantic space with time series embedding space and perform time series forecasting based on learned prompts from the joint space. We first design a tokenization module tailored for cross-modality alignment, which explicitly concatenates patches of decomposed time series components to create embeddings that effectively encode the temporal dynamics. Next, we leverage the pre-trained word token embeddings to derive semantic anchors and align selected anchors with time series embeddings by maximizing the cosine similarity in the joint space. This way, $S^2$IP-LLM can retrieve relevant semantic anchors as prompts to provide strong indicators (context) for time series that exhibit different temporal dynamics. With thorough empirical studies on multiple benchmark datasets, we demonstrate that the proposed $S^2$IP-LLM can achieve superior forecasting performance over state-of-the-art baselines. Furthermore, our ablation studies and visualizations verify the necessity of prompt learning informed by semantic space.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zijie Pan;Yushan Jiang;Sahil Garg;Anderson Schneider;Yuriy Nevmyvaka;Dongjin Song", "authorids": "~Zijie_Pan3;~Yushan_Jiang1;~Sahil_Garg1;~Anderson_Schneider1;~Yuriy_Nevmyvaka1;~Dongjin_Song2", "gender": "M;M;M;;;M", "homepage": ";https://sites.google.com/view/jayjiang/home;https://sgarg87.github.io/;;;https://songdj.github.io/", "dblp": ";;117/4904;;92/1859;41/3281", "google_scholar": ";7zOFNbIAAAAJ;Sz2mNx0AAAAJ;;https://scholar.google.com/citations?hl=en;BJdHw6AAAAAJ", "orcid": "0009-0006-4893-586X;;;;;", "linkedin": "zijiepan/;;;;;", "or_profile": "~Zijie_Pan3;~Yushan_Jiang1;~Sahil_Garg1;~Anderson_Schneider1;~Yuriy_Nevmyvaka1;~Dongjin_Song2", "aff": "University of Connecticut;University of Connecticut;Morgan Stanley;;Morgan Stanley;University of Connecticut", "aff_domain": "uconn.edu;uconn.edu;morganstanley.com;;morganstanley.com;uconn.edu", "position": "PhD student;PhD student;Machine Learning Researcher (Vice President);;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\npan2024sipllm,\ntitle={\\$S{\\textasciicircum}2\\${IP}-{LLM}: Semantic Space Informed Prompt Learning with {LLM} for Time Series Forecasting},\nauthor={Zijie Pan and Yushan Jiang and Sahil Garg and Anderson Schneider and Yuriy Nevmyvaka and Dongjin Song},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qwQVV5R8Y7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2774362, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15365478949615590601&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "email": "uconn.edu;uconn.edu;morganstanley.com;;morganstanley.com;uconn.edu", "author_num": 6, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "University of Connecticut;Morgan Stanley", "aff_unique_dep": ";", "aff_unique_url": "https://www.uconn.edu;https://www.morganstanley.com", "aff_unique_abbr": "UConn;Morgan Stanley", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Test-Time Model Adaptation with Only Forward Passes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32971", "id": "qz1Vx1v9iK", "proceeding": "https://proceedings.mlr.press/v235/niu24a.html", "pdf": "https://openreview.net/pdf?id=qz1Vx1v9iK", "openreview": "https://openreview.net/forum?id=qz1Vx1v9iK", "author_site": "Shuaicheng Niu, Chunyan Miao, Guohao Chen, Pengcheng Wu, Peilin Zhao", "tldr": "", "abstract": "Test-time adaptation has proven effective in adapting a given trained model to unseen test samples with potential distribution shifts. However, in real-world scenarios, models are usually deployed on resource-limited devices, e.g., FPGAs, and are often quantized and hard-coded with non-modifiable parameters for acceleration. In light of this, existing methods are often infeasible since they heavily depend on computation-intensive backpropagation for model updating that may be not supported. To address this, we propose a test-time Forward-Optimization Adaptation (FOA) method. In FOA, we seek to solely learn a newly added prompt (as model's input) via a derivative-free covariance matrix adaptation evolution strategy. To make this strategy work stably under our online unsupervised setting, we devise a novel fitness function by measuring test-training statistic discrepancy and model prediction entropy. Moreover, we design an activation shifting scheme that directly tunes the model activations for shifted test samples, making them align with the source training domain, thereby further enhancing adaptation performance. Without using any backpropagation and altering model weights, FOA runs on quantized 8-bit ViT outperforms gradient-based TENT on full-precision 32-bit ViT, while achieving an up to *24*-fold memory reduction on ImageNet-C. 
The source code is available at: https://github.com/mr-eggplant/FOA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuaicheng Niu;Chunyan Miao;Guohao Chen;Pengcheng Wu;Peilin Zhao", "authorids": "~Shuaicheng_Niu1;~Chunyan_Miao1;~Guohao_Chen1;~Pengcheng_Wu1;~Peilin_Zhao2", "gender": "M;F;M;M;", "homepage": "https://niushuaicheng.cn/;http://www.ntulily.org/ascymiao/;https://github.com/Cascol-Chen/;;", "dblp": "254/1388;m/ChunyanMiao;;;84/8411", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=fmXGRJgAAAAJ;HZbzdNEAAAAJ;kX_GFDIAAAAJ;https://scholar.google.com.hk/citations?user=HPeX_YcAAAAJ", "orcid": "0000-0001-8212-1831;0000-0002-0300-3448;0009-0007-9736-4642;0000-0003-0487-2060;0000-0001-8543-3953", "linkedin": ";;;;", "or_profile": "~Shuaicheng_Niu1;~Chunyan_Miao1;~Guohao_Chen1;~Pengcheng_Wu1;~Peilin_Zhao2", "aff": "Nanyang Technological University;School of Computer Science and Engineering, Nanyang Technological University;South China University of Technology;Nanyang Technological University;Tencent", "aff_domain": "ntu.edu.sg;scse.ntu.edu.sg;scut.edu.cn;ntu.edu.sg;tencent.com", "position": "Postdoc;Full Professor;MS student;Researcher;Researcher", "bibtex": "@inproceedings{\nniu2024testtime,\ntitle={Test-Time Model Adaptation with Only Forward Passes},\nauthor={Shuaicheng Niu and Chunyan Miao and Guohao Chen and Pengcheng Wu and Peilin Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qz1Vx1v9iK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1336959, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15589056319652439747&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ntu.edu.sg;scse.ntu.edu.sg;scut.edu.cn;ntu.edu.sg;tencent.com", "author_num": 5, "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "Nanyang Technological University;South China University of Technology;Tencent", "aff_unique_dep": ";;Tencent Holdings Limited", "aff_unique_url": "https://www.ntu.edu.sg;https://www.scut.edu.cn;https://www.tencent.com", "aff_unique_abbr": "NTU;SCUT;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", "aff_country_unique": "Singapore;China" }, { "title": "Learning to Route Among Specialized Experts for Zero-Shot Generalization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32970", "id": "r0qcGcFL4U", "proceeding": "https://proceedings.mlr.press/v235/muqeeth24a.html", "pdf": "https://openreview.net/pdf?id=r0qcGcFL4U", "openreview": "https://openreview.net/forum?id=r0qcGcFL4U", "author_site": "Mohammed Muqeeth, Haokun Liu, Yufan Liu, Colin Raffel", "tldr": "", "abstract": "Recently, there has been a widespread proliferation of \"expert\" language models that are specialized to a specific task or domain through parameter-efficient fine-tuning. How can we recycle large collections of expert language models to improve zero-shot generalization to unseen tasks? In this work, we propose $\\textbf{P}$ost-$\\textbf{H}$oc $\\textbf{A}$daptive $\\textbf{T}$okenwise $\\textbf{G}$ating $\\textbf{O}$ver an $\\textbf{O}$cean of $\\textbf{S}$pecialized $\\textbf{E}$xperts (**PHATGOOSE**), which learns to route among specialized modules that were produced through parameter-efficient fine-tuning. 
Unlike past methods that learn to route among specialized models, PHATGOOSE explores the possibility that zero-shot generalization will be improved if different experts can be adaptively chosen for each token and at each layer in the model. Crucially, our method is *post-hoc* - it does not require simultaneous access to the datasets used to create the specialized models and only requires a modest amount of additional compute after each expert model is trained. In experiments covering a range of specialized model collections and zero-shot generalization benchmarks, we find that PHATGOOSE outperforms past methods for post-hoc routing and, in some cases, outperforms explicit multitask training (which requires simultaneous data access). To better understand the routing strategy learned by PHATGOOSE, we perform qualitative experiments to validate that PHATGOOSE's performance stems from its ability to make adaptive per-token and per-module expert choices.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammed Muqeeth;Haokun Liu;Yufan Liu;Colin Raffel", "authorids": "~Mohammed_Muqeeth1;~Haokun_Liu1;~Yufan_Liu5;~Colin_Raffel1", "gender": ";;M;", "homepage": "https://muqeeth.github.io;https://haokunliu.github.io/;;http://colinraffel.com", "dblp": "320/4437;169/0460;;149/0082", "google_scholar": "dsAzIX4AAAAJ;T3dz_MQAAAAJ;;I66ZBYwAAAAJ", "orcid": ";;;", "linkedin": "muqeeth-mohammed/;;yufan-liu;", "or_profile": "~Mohammed_Muqeeth1;~Haokun_Liu1;~Yufan_Liu5;~Colin_Raffel1", "aff": "IBM, International Business Machines;Department of Computer Science, University of Toronto;;Hugging Face", "aff_domain": "us.ibm.com;cs.toronto.edu;;huggingface.co", "position": "Researcher;PhD student;;Researcher", "bibtex": "@inproceedings{\nmuqeeth2024learning,\ntitle={Learning to Route Among Specialized Experts for Zero-Shot Generalization},\nauthor={Mohammed Muqeeth and Haokun Liu and Yufan Liu and Colin Raffel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=r0qcGcFL4U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 499644, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10019342206869291027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "us.ibm.com;cs.toronto.edu;;huggingface.co", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "International Business Machines;University of Toronto;Hugging Face", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www.ibm.com;https://www.utoronto.ca;https://huggingface.co", "aff_unique_abbr": "IBM;U of T;Hugging Face", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Superposition Prompting: Improving and Accelerating Retrieval-Augmented Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32969", "id": "r8k5JrGip6", "proceeding": "https://proceedings.mlr.press/v235/merth24a.html", "pdf": "https://openreview.net/pdf?id=r8k5JrGip6", "openreview": "https://openreview.net/forum?id=r8k5JrGip6", "author_site": "Thomas Merth, Qichen Fu, Mohammad Rastegari, Mahyar Najibi", "tldr": "", "abstract": "Despite the successes of large language models (LLMs), they exhibit significant drawbacks, particularly when processing long contexts. 
Their inference cost scales quadratically with respect to sequence length, making it expensive for deployment in some real-world text processing applications, such as retrieval-augmented generation (RAG). Additionally, LLMs exhibit the \"distraction phenomenon\", where irrelevant context in the prompt degrades output quality. To address these drawbacks, we propose a novel RAG prompting methodology, *superposition prompting*, which can be directly applied to pre-trained transformer-based LLMs *without the need for fine-tuning*. At a high level, superposition prompting allows the LLM to process input documents in parallel *prompt paths*, discarding paths once they are deemed irrelevant. We demonstrate the capability of our method to simultaneously enhance time efficiency across a variety of question-answering benchmarks using multiple pre-trained LLMs. Furthermore, our technique significantly improves accuracy when the retrieved context is large relative to the context the model was trained on. For example, our approach facilitates a $93\\times$ reduction in compute time while *improving* accuracy by $43\\%$ on the NaturalQuestions-Open dataset with the MPT-7B instruction-tuned model over naive RAG.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Merth;Qichen Fu;Mohammad Rastegari;Mahyar Najibi", "authorids": "~Thomas_Merth2;~Qichen_Fu2;~Mohammad_Rastegari2;~Mahyar_Najibi1", "gender": "M;M;M;M", "homepage": ";https://fuqichen1998.github.io/;https://mrastegari.github.io/;https://www.mahyarnajibi.com", "dblp": ";304/2909;31/5228;137/8354", "google_scholar": ";4wcQM3UAAAAJ;N4-2Z_cAAAAJ;bZb6e_sAAAAJ", "orcid": ";;;", "linkedin": "thomas-merth/;qichen-fu/;;", "or_profile": "~Thomas_Merth2;~Qichen_Fu2;~Mohammad_Rastegari2;~Mahyar_Najibi1", "aff": ";Apple;Department of Computer Science, University of Washington;Apple", "aff_domain": ";apple.com;cs.washington.edu;apple.com", "position": ";Researcher;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nmerth2024superposition,\ntitle={Superposition Prompting: Improving and Accelerating Retrieval-Augmented Generation},\nauthor={Thomas Merth and Qichen Fu and Mohammad Rastegari and Mahyar Najibi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=r8k5JrGip6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1001385, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2054731421648326281&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";apple.com;cs.washington.edu;apple.com", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Apple;University of Washington", "aff_unique_dep": "Apple Inc.;Department of Computer Science", "aff_unique_url": "https://www.apple.com;https://www.washington.edu", "aff_unique_abbr": "Apple;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Reweighted Solutions for Weighted Low Rank Approximation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32968", "id": "r9XICONppE", "proceeding": 
"https://proceedings.mlr.press/v235/woodruff24b.html", "pdf": "https://openreview.net/pdf?id=r9XICONppE", "openreview": "https://openreview.net/forum?id=r9XICONppE", "author_site": "David Woodruff, Taisuke Yasuda", "tldr": "", "abstract": "Weighted low rank approximation (WLRA) is an important yet computationally challenging primitive with applications ranging from statistical analysis to model compression and signal processing. To cope with the NP-hardness of this problem, prior work considers heuristics, bicriteria, or parameterized tractable algorithms. In this work, we introduce a new relaxed solution to WLRA which outputs a matrix that is not necessarily low rank, but can be stored using very few parameters and gives provable approximation guarantees when the weight matrix has low rank. Our central idea is to use the weight matrix itself to reweight a low rank solution, which gives an extremely simple algorithm with remarkable empirical performance in applications to model compression and on synthetic datasets. Our algorithm also gives nearly optimal communication complexity bounds for a natural distributed problem associated with WLRA, for which we show matching communication lower bounds. Together, our communication complexity bounds show that the rank of the weight matrix provably parameterizes the communication complexity of WLRA. We also obtain the first relative error guarantees for feature selection with a weighted objective.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Woodruff;Taisuke Yasuda", "authorids": "~David_Woodruff1;~Taisuke_Yasuda1", "gender": "M;M", "homepage": "http://www.cs.cmu.edu/~dwoodruf/;https://taisukeyasuda.github.io/", "dblp": "w/DPWoodruff;177/9741-2", "google_scholar": "https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ;c62WqiEAAAAJ", "orcid": ";", "linkedin": ";taisukeyasuda/", "or_profile": "~David_Woodruff1;~Taisuke_Yasuda1", "aff": "Carnegie Mellon University;School of Computer Science, Carnegie Mellon University", "aff_domain": "cmu.edu;cs.cmu.edu", "position": "Full Professor;PhD student", "bibtex": "@inproceedings{\nwoodruff2024reweighted,\ntitle={Reweighted Solutions for Weighted Low Rank Approximation},\nauthor={David Woodruff and Taisuke Yasuda},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=r9XICONppE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 606745, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4768689046908959188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cmu.edu;cs.cmu.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "BiSHop: Bi-Directional Cellular Learning for Tabular Data with Generalized Sparse Modern Hopfield Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32967", "id": "r9rzU9QzPe", "proceeding": "https://proceedings.mlr.press/v235/xu24l.html", "pdf": "https://openreview.net/pdf?id=r9rzU9QzPe", "openreview": "https://openreview.net/forum?id=r9rzU9QzPe", "author_site": "Chenwei Xu, Yu-Chao Huang, Jerry Yao-Chieh 
Hu, Weijian Li, Ammar Gilani, Hsi-Sheng Goan, Han Liu", "tldr": "", "abstract": "We introduce the **Bi**-Directional **S**parse **Hop**field Network (**BiSHop**), a novel end-to-end framework for tabular learning. BiSHop handles the two major challenges of deep tabular learning: non-rotationally invariant data structure and feature sparsity in tabular data. Our key motivation comes from the recently established connection between associative memory and attention mechanisms. Consequently, BiSHop uses a dual-component approach, sequentially processing data both column-wise and row-wise through two interconnected directional learning modules. Computationally, these modules house generalized sparse modern Hopfield layers, a sparse extension of the modern Hopfield model with learnable sparsity. Methodologically, BiSHop facilitates multi-scale representation learning, capturing both intra-feature and inter-feature interactions, with adaptive sparsity at each scale. Empirically, through experiments on diverse real-world datasets, BiSHop surpasses current SOTA methods with significantly fewer HPO runs, making it a robust solution for deep tabular learning. The code is available on [GitHub](https://github.com/MAGICS-LAB/BiSHop); future updates are on [arXiv](https://arxiv.org/abs/2404.03830).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenwei Xu;Yu-Chao Huang;Jerry Yao-Chieh Hu;Weijian Li;Ammar Gilani;Hsi-Sheng Goan;Han Liu", "authorids": "~Chenwei_Xu2;r11222015@g.ntu.edu.tw;~Jerry_Yao-Chieh_Hu1;~Weijian_Li2;~Ammar_Gilani1;goan@phys.ntu.edu.tw;~Han_Liu4", "gender": ";;;M;M;;", "homepage": ";;;;https://www.northwestern.edu/;;", "dblp": ";;;;225/5425;;", "google_scholar": ";;;https://scholar.google.com/citations?hl=en;;;", "orcid": ";;;0009-0003-4158-4380;;;", "linkedin": ";;;weijian-li-b52566153/;;;", "or_profile": "~Chenwei_Xu2;r11222015@g.ntu.edu.tw;~Jerry_Yao-Chieh_Hu1;~Weijian_Li2;~Ammar_Gilani1;goan@phys.ntu.edu.tw;~Han_Liu4", "aff": ";;;Northwestern University;;;Northwestern University", "aff_domain": ";;;northwestern.edu;;;u.northwestern.edu", "position": ";;;PhD student;;;Associate Professor", "bibtex": "@inproceedings{\nxu2024bishop,\ntitle={Bi{SH}op: Bi-Directional Cellular Learning for Tabular Data with Generalized Sparse Modern Hopfield Model},\nauthor={Chenwei Xu and Yu-Chao Huang and Jerry Yao-Chieh Hu and Weijian Li and Ammar Gilani and Hsi-Sheng Goan and Han Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=r9rzU9QzPe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5187726, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=135650947467767166&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";;;northwestern.edu;;;u.northwestern.edu", "author_num": 7, "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "InstructZero: Efficient Instruction Optimization for Black-Box Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32966", "id": "rADFNrIss3", "proceeding": "https://proceedings.mlr.press/v235/chen24e.html", "pdf": 
"https://openreview.net/pdf?id=rADFNrIss3", "openreview": "https://openreview.net/forum?id=rADFNrIss3", "author_site": "Lichang Chen, Jiuhai Chen, Tom Goldstein, Heng Huang, Tianyi Zhou", "tldr": "", "abstract": "Large language models (LLMs) are instruction followers but the performance varies under different instructions. It is challenging to create the best instruction, especially for black-box LLMs on which backpropagation is forbidden. Instead of directly optimizing the discrete instruction, we optimize a low-dimensional soft prompt applied to an open-source LLM to generate the instruction for the black-box LLM. In each optimization step of the proposed method InstructZero, a soft prompt is converted into an instruction by the open-source LLM, which is then submitted to the black-box LLM for zero-shot evaluation, whose result is sent to Bayesian optimization to produce new soft prompts improving the zero-shot performance. We evaluate InstructZero on different combinations of open-source LLMs and APIs including Vicuna and ChatGPT. InstructZero outperforms SOTA auto-instruction methods across a variety of downstream tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lichang Chen;Jiuhai Chen;Tom Goldstein;Heng Huang;Tianyi Zhou", "authorids": "~Lichang_Chen2;~Jiuhai_Chen1;~Tom_Goldstein1;~Heng_Huang1;~Tianyi_Zhou1", "gender": "M;M;M;M;M", "homepage": "https://www.linkedin.com/in/jiuhai-chen-6a486715a/;https://www.cs.umd.edu/~tomg/;https://www.cs.umd.edu/~heng/;https://tianyizhou.github.io/;", "dblp": ";25/8184;03/281;88/8205-1;151/6212", "google_scholar": ";KmSuVtgAAAAJ;4OqLaDwAAAAJ;OKvgizMAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;;0000-0001-5348-0632;", "linkedin": ";;;tianyizhou;lichang-chen-b7a506173/", "or_profile": "~Jiuhai_Chen1;~Tom_Goldstein1;~Heng_Huang1;~Tianyi_Zhou1;~LICHANG_CHEN1", "aff": "University of Maryland, College Park;University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;cs.umd.edu;umd.edu;cs.umd.edu", "position": "PhD student;Full Professor;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nchen2024instructzero,\ntitle={InstructZero: Efficient Instruction Optimization for Black-Box Large Language Models},\nauthor={Lichang Chen and Jiuhai Chen and Tom Goldstein and Heng Huang and Tianyi Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rADFNrIss3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2335182, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11656455885715424620&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "umd.edu;umd.edu;cs.umd.edu;umd.edu;cs.umd.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "UPOCR: Towards Unified Pixel-Level OCR Interface", "status": "Poster", "track": "main", 
"site": "https://icml.cc/virtual/2024/poster/32965", "id": "rEZ24oJhbn", "proceeding": "https://proceedings.mlr.press/v235/peng24e.html", "pdf": "https://openreview.net/pdf?id=rEZ24oJhbn", "openreview": "https://openreview.net/forum?id=rEZ24oJhbn", "author_site": "Dezhi Peng, Zhenhua Yang, Jiaxin Zhang, Chongyu Liu, Yongxin Shi, Kai Ding, Fengjun Guo, Lianwen Jin", "tldr": "", "abstract": "Existing optical character recognition (OCR) methods rely on task-specific designs with divergent paradigms, architectures, and training strategies, which significantly increases the complexity of research and maintenance and hinders the fast deployment in applications. To this end, we propose UPOCR, a simple-yet-effective generalist model for Unified Pixel-level OCR interface. Specifically, the UPOCR unifies the paradigm of diverse OCR tasks as image-to-image transformation and the architecture as a vision Transformer (ViT)-based encoder-decoder with learnable task prompts. The prompts push the general feature representations extracted by the encoder towards task-specific spaces, endowing the decoder with task awareness. Moreover, the model training is uniformly aimed at minimizing the discrepancy between the predicted and ground-truth images regardless of the inhomogeneity among tasks. Experiments are conducted on three pixel-level OCR tasks including text removal, text segmentation, and tampered text detection. Without bells and whistles, the experimental results showcase that the proposed method can simultaneously achieve state-of-the-art performance on three tasks with a unified single model, which provides valuable strategies and insights for future research on generalist OCR models. Code is available at https://github.com/shannanyinxiang/UPOCR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dezhi Peng;Zhenhua Yang;Jiaxin Zhang;Chongyu Liu;Yongxin Shi;Kai Ding;Fengjun Guo;Lianwen Jin", "authorids": "~Dezhi_Peng1;~Zhenhua_Yang3;~Jiaxin_Zhang1;~Chongyu_Liu2;~Yongxin_Shi2;~Kai_Ding2;~Fengjun_Guo1;~Lianwen_Jin1", "gender": "M;;M;;;M;M;M", "homepage": ";https://yeungchenwa.github.io/;https://github.com/ZZZHANG-jx;https://www.scut.edu.cn/new/;;;https://dblp.org/pid/74/9103;http://www.dlvc-lab.net/lianwen/", "dblp": "217/2342;42/4517;32/7698-3;211/4070.html;359/4310;44/2891-9.html;74/9103;54/3221", "google_scholar": "6zNgcjAAAAAJ;https://scholar.google.com/citations?view_op=list_works;pXh33zUAAAAJ;dW7AgfgAAAAJ;e-3XAoAAAAAJ;SX43hBUAAAAJ;;WMUStEUAAAAJ", "orcid": "0000-0002-3263-3449;;0000-0001-9787-9514;;0009-0003-2650-1663;0000-0002-9371-0751;;0000-0002-5456-0957", "linkedin": ";;;;;;;", "or_profile": "~Dezhi_Peng1;~Zhenhua_Yang3;~Jiaxin_Zhang1;~Chongyu_Liu2;~Yongxin_Shi2;~Kai_Ding2;~Fengjun_Guo1;~Lianwen_Jin1", "aff": "South China University of Technology;South China University of Technology;South China University of Technology;South China University of Technology;South China University of Technology;INTSIG Information;Intsig Information Co. 
Ltd;South China University of Technology", "aff_domain": "scut.edu.cn;scut.edu.cn;scut.edu.cn;scut.edu.cn;scut.edu.cn;intsig.net;intsig.com;scut.edu.cn", "position": "PhD student;MS student;PhD student;PhD student;PhD student;Researcher;Principal Researcher;Professor", "bibtex": "@inproceedings{\npeng2024upocr,\ntitle={{UPOCR}: Towards Unified Pixel-Level {OCR} Interface},\nauthor={Dezhi Peng and Zhenhua Yang and Jiaxin Zhang and Chongyu Liu and Yongxin Shi and Kai Ding and Fengjun Guo and Lianwen Jin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rEZ24oJhbn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9872330, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17832777491967033016&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "scut.edu.cn;scut.edu.cn;scut.edu.cn;scut.edu.cn;scut.edu.cn;intsig.net;intsig.com;scut.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;0;1;2;0", "aff_unique_norm": "South China University of Technology;INTSIG Information Co., Ltd.;Intsig Information Co. Ltd", "aff_unique_dep": ";;", "aff_unique_url": "https://www.scut.edu.cn;http://www.intsig.com;http://www.intsig.com/", "aff_unique_abbr": "SCUT;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "PASOA- PArticle baSed Bayesian Optimal Adaptive design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32964", "id": "rGCvMARXkG", "proceeding": "https://proceedings.mlr.press/v235/iollo24a.html", "pdf": "https://openreview.net/pdf?id=rGCvMARXkG", "openreview": "https://openreview.net/forum?id=rGCvMARXkG", "author_site": "Jacopo Iollo, Christophe Heinkel\u00e9, Pierre Alliez, Florence Forbes", "tldr": "", "abstract": "We propose PASOA, a new procedure for Bayesian experimental design that performs sequential design optimization while simultaneously providing accurate estimates of successive posterior distributions for parameter inference. The sequential design process is carried out via a contrastive estimation principle, using stochastic optimization and Sequential Monte Carlo (SMC) samplers to maximise the Expected Information Gain (EIG). As larger information gains are obtained for larger distances between successive posterior distributions, this EIG objective may worsen classical SMC performance. To handle this issue, we propose tempering to obtain both a large information gain and accurate SMC sampling, which we show is crucial for performance. This novel combination of stochastic optimization and tempered SMC allows us to jointly handle design optimization and parameter inference. We provide a proof that the obtained optimal design estimators satisfy a consistency property. 
Numerical experiments confirm the potential of the approach, which outperforms other recent procedures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jacopo Iollo;Christophe Heinkel\u00e9;Pierre Alliez;Florence Forbes", "authorids": "~Jacopo_Iollo1;~Christophe_Heinkel\u00e91;~Pierre_Alliez1;~Florence_Forbes1", "gender": "M;M;M;", "homepage": ";https://www.cerema.fr/fr/innovation-recherche/recherche/equipes/endsum-evaluation-non-destructive-structures-materiaux;https://team.inria.fr/titane/pierre-alliez/;", "dblp": ";;98/1937.html;", "google_scholar": ";;https://scholar.google.fr/citations?user=nQuoXgwAAAAJ;", "orcid": ";;0000-0002-6214-4005;", "linkedin": "jacopo-iollo/;;pierre-alliez-1078612/;", "or_profile": "~Jacopo_Iollo1;~Christophe_Heinkel\u00e91;~Pierre_Alliez1;~Florence_Forbes1", "aff": "INRIA Rhone-Alpes;Cerema Endsum;INRIA;", "aff_domain": "inrialpes.fr;cerema.fr;inria.fr;", "position": "PhD student;Researcher;Principal Researcher;", "bibtex": "@inproceedings{\niollo2024pasoa,\ntitle={{PASOA}- {PA}rticle baSed Bayesian Optimal Adaptive design},\nauthor={Jacopo Iollo and Christophe Heinkel{\\'e} and Pierre Alliez and Florence Forbes},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rGCvMARXkG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8010499, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15303512509472870432&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "inrialpes.fr;cerema.fr;inria.fr;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "INRIA;Cerema", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.cerema.fr", "aff_unique_abbr": "INRIA;", "aff_campus_unique_index": "0", "aff_campus_unique": "Rhone-Alpes;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Restoring balance: principled under/oversampling of data for optimal classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32963", "id": "rHylzxK3HU", "proceeding": "https://proceedings.mlr.press/v235/loffredo24a.html", "pdf": "https://openreview.net/pdf?id=rHylzxK3HU", "openreview": "https://openreview.net/forum?id=rHylzxK3HU", "author_site": "Emanuele Loffredo, Mauro Pastore, Simona Cocco, Remi Monasson", "tldr": "", "abstract": "Class imbalance in real-world data poses a common bottleneck for machine learning tasks, since achieving good generalization on under-represented examples is often challenging. Mitigation strategies, such as under- or oversampling the data depending on their abundances, are routinely proposed and tested empirically, but how they should adapt to the data statistics remains poorly understood. In this work, we determine exact analytical expressions of the generalization curves in the high-dimensional regime for linear classifiers (Support Vector Machines). We also provide a sharp prediction of the effects of under/oversampling strategies depending on class imbalance, first and second moments of the data, and the metrics of performance considered. We show that mixed strategies involving under- and oversampling of data lead to performance improvement. 
Through numerical experiments, we show the relevance of our theoretical predictions on real datasets, on deeper architectures, and with sampling strategies based on unsupervised probabilistic models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emanuele Loffredo;Mauro Pastore;Simona Cocco;Remi Monasson", "authorids": "~Emanuele_Loffredo1;mauro.pastore@phys.ens.fr;~Simona_Cocco1;~Remi_Monasson1", "gender": ";;F;M", "homepage": ";;http://www.lps.ens.fr/~cocco/;http://www.phys.ens.fr/~monasson/", "dblp": ";;;", "google_scholar": ";;;J6LkBeUAAAAJ", "orcid": "0009-0004-4882-8250;;0000-0002-1852-7789;", "linkedin": ";;;", "or_profile": "~Emanuele_Loffredo1;mauro.pastore@phys.ens.fr;~Simona_Cocco1;~Remi_Monasson1", "aff": "Ecole Normale Sup\u00e9rieure de Paris;;Ecole Normale Sup\u00e9rieure de Paris;Ecole Normale Sup\u00e9rieure de Paris", "aff_domain": "ens.fr;;ens.fr;ens.fr", "position": "PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nloffredo2024restoring,\ntitle={Restoring balance: principled under/oversampling of data for optimal classification},\nauthor={Emanuele Loffredo and Mauro Pastore and Simona Cocco and Remi Monasson},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rHylzxK3HU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1305131, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6577093169220223160&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "ens.fr;;ens.fr;ens.fr", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure de Paris", "aff_unique_dep": "", "aff_unique_url": "https://www.ens.fr", "aff_unique_abbr": "ENS Paris", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Visual Representation Learning with Stochastic Frame Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32962", "id": "rI6lxIX0uX", "proceeding": "https://proceedings.mlr.press/v235/jang24c.html", "pdf": "https://openreview.net/pdf?id=rI6lxIX0uX", "openreview": "https://openreview.net/forum?id=rI6lxIX0uX", "author_site": "Huiwon Jang, Dongyoung Kim, Junsu Kim, Jinwoo Shin, Pieter Abbeel, Younggyo Seo", "tldr": "", "abstract": "Self-supervised learning of image representations by predicting future frames is a promising direction but remains a challenge. This is because of the under-determined nature of frame prediction; multiple potential futures can arise from a single current frame. To tackle this challenge, we revisit the idea of stochastic video generation that learns to capture uncertainty in frame prediction and explore its effectiveness for representation learning. Specifically, we design a framework that trains a stochastic frame prediction model to learn temporal information between frames. Moreover, to learn dense information within each frame, we introduce an auxiliary masked image modeling objective along with a shared decoder architecture. We find this architecture allows for combining both objectives in a synergistic and compute-efficient manner. 
We demonstrate the effectiveness of our framework on a variety of tasks from video label propagation and vision-based robot learning domains, such as video segmentation, pose tracking, vision-based robotic locomotion, and manipulation tasks. Code is available on the project webpage: https://sites.google.com/view/2024rsp.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huiwon Jang;Dongyoung Kim;Junsu Kim;Jinwoo Shin;Pieter Abbeel;Younggyo Seo", "authorids": "~Huiwon_Jang1;~Dongyoung_Kim3;~Junsu_Kim1;~Jinwoo_Shin1;~Pieter_Abbeel2;~Younggyo_Seo1", "gender": "M;M;M;M;M;M", "homepage": "https://huiwon-jang.github.io/;https://kingdy2002.github.io/;https://sites.google.com/view/junsu-kim;https://sites.google.com/site/mijirim/;https://people.eecs.berkeley.edu/~pabbeel/;https://younggyo.me/", "dblp": "332/0647;;;31/7062;;265/5586", "google_scholar": "https://scholar.google.com/citations?hl=en;bsp1E58AAAAJ;1o9cS8UAAAAJ;https://scholar.google.com.tw/citations?user=m3eDp7kAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;tI1-YwIAAAAJ", "orcid": ";;;;;", "linkedin": "huiwon-jang-5a789b250;;junsu-kim-b170b3168/;;;", "or_profile": "~Huiwon_Jang1;~Dongyoung_Kim3;~Junsu_Kim1;~Jinwoo_Shin1;~Pieter_Abbeel2;~Younggyo_Seo1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Covariant;Dyson", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;covariant.ai;dyson.com", "position": "PhD student;Undergrad student;Ph.D. student;Full Professor;Founder;Researcher", "bibtex": "@inproceedings{\njang2024visual,\ntitle={Visual Representation Learning with Stochastic Frame Prediction},\nauthor={Huiwon Jang and Dongyoung Kim and Junsu Kim and Jinwoo Shin and Pieter Abbeel and Younggyo Seo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rI6lxIX0uX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2237001, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18436846782870906529&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;covariant.ai;dyson.com", "author_num": 6, "aff_unique_index": "0;0;0;0;1;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Covariant;Dyson", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;;https://www.dyson.com", "aff_unique_abbr": "KAIST;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;2", "aff_country_unique": "South Korea;;United Kingdom" }, { "title": "GNNs Also Deserve Editing, and They Need It More Than Once", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32961", "id": "rIc9adYbH2", "proceeding": "https://proceedings.mlr.press/v235/zhong24d.html", "pdf": "https://openreview.net/pdf?id=rIc9adYbH2", "openreview": "https://openreview.net/forum?id=rIc9adYbH2", "author_site": "Shaochen (Henry) Zhong, Duy Le, Zirui Liu, Zhimeng Jiang, Andrew Ye, Jiamu Zhang, Jiayi Yuan, Kaixiong Zhou, Zhaozhuo Xu, Jing Ma, Shuai Xu, Vipin Chaudhary, Xia Hu", "tldr": "", "abstract": "Suppose a self-driving car is crashing into pedestrians, or a chatbot is instructing its users to conduct criminal wrongdoing; the stakeholders of such products 
will undoubtedly want to patch these catastrophic errors as soon as possible. To address such concerns, *Model Editing:* the study of efficiently patching model behaviors without significantly altering their general performance, has seen considerable activity, with hundreds of editing techniques developed in various domains such as CV and NLP. However, **the graph learning community has objectively fallen behind with only a few Graph Neural Network-compatible \u2014 and just one GNN-specific \u2014 model editing methods available**, all of which are limited in their practical scope. We argue that the impracticality of these methods lies in their lack of *Sequential Editing Robustness:* the ability to edit multiple errors sequentially, which mirrors how errors are discovered and addressed in the real world; without this robustness, these methods fall short in effectiveness. In this paper, we delve into the specific reasons behind the difficulty of editing GNNs in succession and observe the root cause to be model overfitting. We subsequently propose a simple yet effective solution \u2014 SEED-GNN \u2014 by leveraging overfit-prevention techniques in a GNN-specific context to derive the first and only GNN model editing method that scales practically. Additionally, we formally frame the task paradigm of GNN editing and hope to inspire future research in this crucial but currently overlooked field. Please refer to our [GitHub repository](https://github.com/henryzhongsc/gnn_editing) for code and checkpoints.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaochen Zhong;Duy Le;Zirui Liu;Zhimeng Jiang;Andrew Ye;Jiamu Zhang;Jiayi Yuan;Kaixiong Zhou;Zhaozhuo Xu;Jing Ma;Shuai Xu;Vipin Chaudhary;Xia Hu", "authorids": "~Shaochen_Zhong1;~Duy_Le1;~Zirui_Liu1;~Zhimeng_Jiang1;~Andrew_Ye1;~Jiamu_Zhang1;~Jiayi_Yuan1;~Kaixiong_Zhou1;~Zhaozhuo_Xu2;~Jing_Ma2;~Shuai_Xu2;~Vipin_Chaudhary2;~Xia_Hu4", "gender": "M;M;M;M;M;M;;M;;F;M;M;M", "homepage": "https://openreview.net/profile?id=~Shaochen_Zhong1;;https://zirui-ray-liu.github.io/;http://www.zhimengjiang.com/;https://andrew-ye.com/;;https://jy-yuan.github.io/;https://kaixiong-zhou.github.io/;https://ottovonxu.github.io/;https://jma712.github.io/;https://engineering.case.edu/profiles/sxx214;https://engineering.case.edu/profiles/vxc204;https://cs.rice.edu/~xh37/index.html", "dblp": "326/7286.html;35/6375-1;196/8629-1.html;217/3235;368/4125;;251/4029-1.html;178/7315;195/4352;96/6129-2;;c/VipinChaudhary.html;256/9406.html", "google_scholar": "https://scholar.google.com/citations?hl=en;xwuBnK8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;5Es3Yk4AAAAJ;N0gvFN4AAAAJ;eQpW5EIAAAAJ;XMrlrV8AAAAJ;zMspIjIAAAAJ;7tDlVAsAAAAJ;VLElvX8AAAAJ;wu-vtI4AAAAJ;vJbjqpIAAAAJ;https://scholar.google.com.tw/citations?user=pcCS60IAAAAJ", "orcid": ";;;0000-0001-6933-3952;;;;0000-0001-5226-8736;;;;0000-0001-9672-6225;", "linkedin": "shaochen-henry-zhong-96a941249/;duy-escanord-le/;;;;jiamu-zhang-morris;;;;;;vipin-chaudhary-379529/;", "or_profile": "~Shaochen_Zhong1;~Duy_Le1;~Zirui_Liu1;~Zhimeng_Jiang1;~Andrew_Ye1;~Jiamu_Zhang1;~Jiayi_Yuan1;~Kaixiong_Zhou1;~Zhaozhuo_Xu2;~Jing_Ma2;~Shuai_Xu2;~Vipin_Chaudhary2;~Xia_Hu2", "aff": "Rice University;Case Western Reserve University;Rice University;VISA Research;Case Western Reserve University;Case Western Reserve University;Rice University;Massachusetts Institute of Technology;Rice University;Case Western Reserve University;Case Western Reserve University;Case Western Reserve University;Rice University", "aff_domain": 
"rice.edu;case.edu;rice.edu;visa.com;case.edu;case.edu;rice.edu;mit.edu;rice.edu;case.edu;case.edu;case.edu;rice.edu", "position": "PhD student;Undergrad student;PhD student;Researcher;Undergrad student;Undergrad student;PhD student;Postdoc;PhD student;Assistant Professor;Assistant Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nzhong2024gnns,\ntitle={{GNN}s Also Deserve Editing, and They Need It More Than Once},\nauthor={Shaochen Zhong and Duy Le and Zirui Liu and Zhimeng Jiang and Andrew Ye and Jiamu Zhang and Jiayi Yuan and Kaixiong Zhou and Zhaozhuo Xu and Jing Ma and Shuai Xu and Vipin Chaudhary and Xia Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rIc9adYbH2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 419146, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10434778580277886715&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "rice.edu;case.edu;rice.edu;visa.com;case.edu;case.edu;rice.edu;mit.edu;rice.edu;case.edu;case.edu;case.edu;rice.edu", "author_num": 13, "aff_unique_index": "0;1;0;2;1;1;0;3;0;1;1;1;0", "aff_unique_norm": "Rice University;Case Western Reserve University;VISA;Massachusetts Institute of Technology", "aff_unique_dep": ";;Research;", "aff_unique_url": "https://www.rice.edu;https://www.case.edu;https://www.visa.com/;https://web.mit.edu", "aff_unique_abbr": "Rice;CWRU;VISA;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Exploration and Anti-Exploration with Distributional Random Network Distillation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32960", "id": "rIrpzmqRBk", "proceeding": "https://proceedings.mlr.press/v235/yang24w.html", "pdf": "https://openreview.net/pdf?id=rIrpzmqRBk", "openreview": "https://openreview.net/forum?id=rIrpzmqRBk", "author_site": "Kai Yang, jian tao, Jiafei Lyu, Xiu Li", "tldr": "", "abstract": "Exploration remains a critical issue in deep reinforcement learning for an agent to attain high returns in unknown environments. Although the prevailing exploration Random Network Distillation (RND) algorithm has been demonstrated to be effective in numerous environments, it often needs more discriminative power in bonus allocation. This paper highlights the ``bonus inconsistency'' issue within RND, pinpointing its primary limitation. To address this issue, we introduce the Distributional RND (DRND), a derivative of the RND. DRND enhances the exploration process by distilling a distribution of random networks and implicitly incorporating pseudo counts to improve the precision of bonus allocation. This refinement encourages agents to engage in more extensive exploration. Our method effectively mitigates the inconsistency issue without introducing significant computational overhead. Both theoretical analysis and experimental results demonstrate the superiority of our approach over the original RND algorithm. Our method excels in challenging online exploration scenarios and effectively serves as an anti-exploration mechanism in D4RL offline tasks. 
Our code is publicly available at https://github.com/yk7333/DRND.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Yang;Jian Tao;Jiafei Lyu;Xiu Li", "authorids": "~Kai_Yang6;~Jian_Tao4;~Jiafei_Lyu1;~Xiu_Li1", "gender": "M;;M;F", "homepage": "https://github.com/yk7333;;;https://thusigsiclab.github.io/thu.github.io/introduction.html", "dblp": ";;278/1503;13/1206-1", "google_scholar": ";;bfgCMr8AAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0009-0007-8439-3161;0000-0001-6616-417X;0000-0003-0403-1923", "linkedin": ";;;", "or_profile": "~Kai_Yang6;~Jian_Tao4;~Jiafei_Lyu1;~Xiu_Li1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "MS student;MS student;PhD student;Professor", "bibtex": "@inproceedings{\nyang2024exploration,\ntitle={Exploration and Anti-Exploration with Distributional Random Network Distillation},\nauthor={Kai Yang and Jian Tao and Jiafei Lyu and Xiu Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rIrpzmqRBk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10057219, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13786006434311853287&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;mail.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Linguistic Calibration of Long-Form Generations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32959", "id": "rJVjQSQ8ye", "proceeding": "https://proceedings.mlr.press/v235/band24a.html", "pdf": "https://openreview.net/pdf?id=rJVjQSQ8ye", "openreview": "https://openreview.net/forum?id=rJVjQSQ8ye", "author_site": "Neil Band, Xuechen Li, Tengyu Ma, Tatsunori Hashimoto", "tldr": "", "abstract": "Language models (LMs) may lead their users to make suboptimal downstream decisions when they confidently hallucinate. This issue can be mitigated by having the LM verbally convey the probability that its claims are correct, but existing models cannot produce long-form text with calibrated confidence statements. Through the lens of decision-making, we define linguistic calibration for long-form generations: an LM is linguistically calibrated if its generations enable its users to make calibrated probabilistic predictions. This definition enables a training framework where a supervised finetuning step bootstraps an LM to emit long-form generations with confidence statements such as \"I estimate a 30% chance of...\" or \"I am certain that...\", followed by a reinforcement learning step which rewards generations that enable a user to provide calibrated answers to related questions. We linguistically calibrate Llama 2 7B and find in automated and human evaluations of long-form generations that it is significantly more calibrated than strong finetuned factuality baselines with comparable accuracy. 
These findings generalize under significant domain shifts to scientific and biomedical questions and to an entirely held-out person biography generation task. Our results demonstrate that long-form generations may be calibrated end-to-end by constructing an objective in the space of the predictions that users make in downstream decision-making.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Neil Band;Xuechen Li;Tengyu Ma;Tatsunori Hashimoto", "authorids": "~Neil_Band1;~Xuechen_Li1;~Tengyu_Ma1;~Tatsunori_Hashimoto1", "gender": ";M;M;M", "homepage": ";https://www.lxuechen.com/;http://ai.stanford.edu/~tengyuma/;https://thashim.github.io", "dblp": "266/5812.html;;54/9061;", "google_scholar": "2Dab2vkAAAAJ;GaYmpIgAAAAJ;i38QlUwAAAAJ;5ygiTwsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Neil_Band1;~Xuechen_Li1;~Tengyu_Ma1;~Tatsunori_Hashimoto1", "aff": "Computer Science Department, Stanford University;Computer Science Department, Stanford University;Facebook AI Research;Stanford University", "aff_domain": "cs.stanford.edu;cs.stanford.edu;fb.com;stanford.edu", "position": "PhD student;PhD student;Visiting Scientist;Assistant Professor", "bibtex": "@inproceedings{\nband2024linguistic,\ntitle={Linguistic Calibration of Long-Form Generations},\nauthor={Neil Band and Xuechen Li and Tengyu Ma and Tatsunori Hashimoto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rJVjQSQ8ye}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2003350, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13739565548200217733&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": "cs.stanford.edu;cs.stanford.edu;fb.com;stanford.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Stanford University;Meta", "aff_unique_dep": "Computer Science Department;Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;FAIR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "In-context Learning on Function Classes Unveiled for Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32958", "id": "rJkGOARXns", "proceeding": "https://proceedings.mlr.press/v235/wang24ae.html", "pdf": "https://openreview.net/pdf?id=rJkGOARXns", "openreview": "https://openreview.net/forum?id=rJkGOARXns", "author_site": "Zhijie Wang, Bo Jiang, Shuai Li", "tldr": "", "abstract": "Transformer-based neural sequence models exhibit a remarkable ability to perform in-context learning. Given some training examples, a pre-trained model can make accurate predictions on an unseen input. This paper studies why transformers can learn different types of function classes in-context. We first show by construction that there exists a family of transformers (with different activation functions) that implement approximate gradient descent on the parameters of neural networks, and we provide an upper bound for the number of heads, hidden dimensions, and layers of the transformer. We also show that a transformer can learn linear functions, the indicator function of a unit ball, and smooth functions in-context by learning neural networks that approximate them. 
The above instances mainly focus on a transformer pre-trained on a single task. We also prove that when pre-trained on two tasks, linear regression and classification, a transformer can make accurate predictions on both tasks simultaneously. Our results move beyond linearity in terms of in-context learning instances and provide a comprehensive understanding of why transformers can learn many types of function classes through the bridge of neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhijie Wang;Bo Jiang;Shuai Li", "authorids": "~Zhijie_Wang7;~Bo_Jiang2;~Shuai_Li3", "gender": "M;M;F", "homepage": "https://github.com/Vincent-Zhijie;https://jhc.sjtu.edu.cn/~bjiang/;http://shuaili8.github.io", "dblp": ";34/2005-3.html;57/2281-10", "google_scholar": "x_9XRb4AAAAJ;WxAIZtMAAAAJ;https://scholar.google.com.hk/citations?user=kMZgQxcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Zhijie_Wang7;~Bo_Jiang2;~Shuai_Li3", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;John Hopcroft Center, Shanghai Jiao Tong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Undergrad student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024incontext,\ntitle={In-context Learning on Function Classes Unveiled for Transformers},\nauthor={Zhijie Wang and Bo Jiang and Shuai Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rJkGOARXns}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 529492, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15725857715380643997&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Compile Programs to Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32957", "id": "rJti61Uere", "proceeding": "https://proceedings.mlr.press/v235/weber24b.html", "pdf": "https://openreview.net/pdf?id=rJti61Uere", "openreview": "https://openreview.net/forum?id=rJti61Uere", "author_site": "Logan Weber, Jesse Michel, Alex Renda, Michael Carbin", "tldr": "", "abstract": "A *neural surrogate* is a neural network that mimics the behavior of a program. Neural surrogates of programs have been used to automatically tune program inputs, adapt programs to new settings, and accelerate computations. Neural surrogates have traditionally been developed by training on input-output examples for a single program. Language models present another approach wherein a model is trained on a single, large dataset and then directly consumes program text to act as a neural surrogate of the program. Having the language model as both the neural surrogate generator and the neural surrogate, however, poses a tradeoff between limited accuracy and excessive resource consumption. We present *neural surrogate compilation*, a technique for producing neural surrogates directly from program text without coupling neural surrogate generation and execution. 
We implement neural surrogate compilers using hypernetworks trained on a dataset of C programs and find they produce neural surrogates that are $1.91$-$9.50\\times$ as data-efficient and train in $4.31$-$7.28\\times$ fewer epochs than neural surrogates trained from scratch.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Logan Weber;Jesse Michel;Alex Renda;Michael Carbin", "authorids": "~Logan_Weber1;~Jesse_Michel1;~Alex_Renda2;~Michael_Carbin1", "gender": ";;M;M", "homepage": "https://weberlo.github.io;http://web.mit.edu/jmmichel/www/;https://alexrenda.com;http://people.csail.mit.edu/mcarbin/", "dblp": ";;206/6568;07/3119", "google_scholar": ";;4BCuJ2AAAAAJ;mtejbKYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Logan_Weber1;~Jesse_Michel1;~Alex_Renda2;~Michael_Carbin1", "aff": "Massachusetts Institute of Technology;MIT;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;csail.mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nweber2024learning,\ntitle={Learning to Compile Programs to Neural Networks},\nauthor={Logan Weber and Jesse Michel and Alex Renda and Michael Carbin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rJti61Uere}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5180037, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LcYAg7hrYPMJ:scholar.google.com/&scioq=Learning+to+Compile+Programs+to+Neural+Networks&hl=en&as_sdt=0,44", "gs_version_total": 6, "email": "mit.edu;csail.mit.edu;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Closing the Gap: Achieving Global Convergence (Last Iterate) of Actor-Critic under Markovian Sampling with Neural Network Parametrization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32956", "id": "rJxFvAs7pq", "proceeding": "https://proceedings.mlr.press/v235/gaur24a.html", "pdf": "https://openreview.net/pdf?id=rJxFvAs7pq", "openreview": "https://openreview.net/forum?id=rJxFvAs7pq", "author_site": "Mudit Gaur, Amrit Singh Bedi, Di Wang, Vaneet Aggarwal", "tldr": "", "abstract": "The current state-of-the-art theoretical analysis of Actor-Critic (AC) algorithms significantly lags in addressing the practical aspects of AC implementations. This crucial gap needs bridging to bring the analysis in line with practical implementations of AC. To address this, we advocate for considering the MMCLG criteria: **M**ulti-layer neural network parametrization for actor/critic, **M**arkovian sampling, **C**ontinuous state-action spaces, the performance of the **L**ast iterate, and **G**lobal optimality. These aspects are practically significant and have been largely overlooked in existing theoretical analyses of AC algorithms. In this work, we address these gaps by providing the first comprehensive theoretical analysis of AC algorithms that encompasses all five crucial practical aspects (covers MMCLG criteria). 
We establish global convergence sample complexity bounds of $\\tilde{\\mathcal{O}}\\left( \\epsilon^{-3} \\right)$. We achieve this result through our novel use of the weak gradient domination property of MDPs and our unique analysis of the error in critic estimation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mudit Gaur;Amrit Bedi;Di Wang;Vaneet Aggarwal", "authorids": "~Mudit_Gaur1;~Amrit_Bedi1;~Di_Wang1;~Vaneet_Aggarwal1", "gender": "M;M;;M", "homepage": "https://www.linkedin.com/in/mudit-gaur-a3294661/;https://sites.google.com/view/amritsinghbedi/home;;", "dblp": "334/0220.html;176/2707.html;;91/6560", "google_scholar": "aA4GjhAAAAAJ;91WLA6QAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Mudit_Gaur1;~Amrit_Bedi1;~Di_Wang1;~Vaneet_Aggarwal1", "aff": "Purdue University;University of Maryland, College Park;;Purdue University", "aff_domain": "purdue.edu;umd.edu;;purdue.edu", "position": "PhD student;Researcher;;Full Professor", "bibtex": "@inproceedings{\ngaur2024closing,\ntitle={Closing the Gap: Achieving Global Convergence (Last Iterate) of Actor-Critic under Markovian Sampling with Neural Network Parametrization},\nauthor={Mudit Gaur and Amrit Bedi and Di Wang and Vaneet Aggarwal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rJxFvAs7pq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 395198, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12908059859307875405&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "purdue.edu;umd.edu;;purdue.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Purdue University;University of Maryland", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.umd.edu", "aff_unique_abbr": "Purdue;UMD", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Transport of Algebraic Structure to Latent Embeddings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32955", "id": "rK6AZem0hX", "proceeding": "https://proceedings.mlr.press/v235/pfrommer24a.html", "pdf": "https://openreview.net/pdf?id=rK6AZem0hX", "openreview": "https://openreview.net/forum?id=rK6AZem0hX", "author_site": "Samuel Pfrommer, Brendon G. Anderson, Somayeh Sojoudi", "tldr": "", "abstract": "Machine learning often aims to produce latent embeddings of inputs which lie in a larger, abstract mathematical space. For example, in the field of 3D modeling, subsets of Euclidean space can be embedded as vectors using implicit neural representations. Such subsets also have a natural algebraic structure including operations (e.g., union) and corresponding laws (e.g., associativity). How can we learn to \"union\" two sets using only their latent embeddings while respecting associativity? We propose a general procedure for parameterizing latent space operations that are provably consistent with the laws on the input space. This is achieved by learning a bijection from the latent space to a carefully designed *mirrored algebra* which is constructed on Euclidean space in accordance with desired laws. We evaluate these *structural transport nets* for a range of mirrored algebras against baselines that operate directly on the latent space. 
Our experiments provide strong evidence that respecting the underlying algebraic structure of the input space is key for learning accurate and self-consistent operations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel Pfrommer;Brendon G. Anderson;Somayeh Sojoudi", "authorids": "~Samuel_Pfrommer1;~Brendon_G._Anderson1;~Somayeh_Sojoudi1", "gender": ";;F", "homepage": "https://sam.pfrommer.us/;https://brendon-anderson.github.io/;https://eecs.berkeley.edu/~sojoudi/", "dblp": ";225/6104;06/7000", "google_scholar": "ysS4V1UAAAAJ;kNA83jQAAAAJ;kNH8zcgAAAAJ", "orcid": ";;", "linkedin": "sampfrommer/;;", "or_profile": "~Samuel_Pfrommer1;~Brendon_G._Anderson1;~Somayeh_Sojoudi1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\npfrommer2024transport,\ntitle={Transport of Algebraic Structure to Latent Embeddings},\nauthor={Samuel Pfrommer and Brendon G. Anderson and Somayeh Sojoudi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rK6AZem0hX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 593342, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17485807096473292223&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;berkeley.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Stochastic Conditional Diffusion Models for Robust Semantic Image Synthesis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32954", "id": "rMV86cAOh6", "proceeding": "https://proceedings.mlr.press/v235/ko24e.html", "pdf": "https://openreview.net/pdf?id=rMV86cAOh6", "openreview": "https://openreview.net/forum?id=rMV86cAOh6", "author_site": "Juyeon Ko, Inho Kong, Dogyun Park, Hyunwoo Kim", "tldr": "", "abstract": "Semantic image synthesis (SIS) is the task of generating realistic images corresponding to semantic maps (labels). However, in real-world applications, SIS often encounters noisy user inputs. To address this, we propose the Stochastic Conditional Diffusion Model (SCDM), a robust conditional diffusion model that features novel forward and generation processes tailored for SIS with noisy labels. It enhances robustness by stochastically perturbing the semantic label maps through Label Diffusion, which diffuses the labels with discrete diffusion. Through the diffusion of labels, the noisy and clean semantic maps become similar as the timestep increases, eventually becoming identical at $t=T$. This facilitates the generation of an image close to a clean image, enabling robust generation. Furthermore, we propose a class-wise noise schedule to differentially diffuse the labels depending on the class. 
Through extensive experiments and analyses on benchmark datasets, including a novel experimental setup simulating human errors during real-world applications, we demonstrate that the proposed method generates high-quality samples. Code is available at https://github.com/mlvlab/SCDM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Juyeon Ko;Inho Kong;Dogyun Park;Hyunwoo J. Kim", "authorids": "~Juyeon_Ko1;~Inho_Kong1;~Dogyun_Park2;~Hyunwoo_J._Kim3", "gender": ";M;M;M", "homepage": "https://github.com/dewyeon;https://github.com/inooni;https://dogyunpark.github.io/;https://hyunwoojkim.com/publications", "dblp": "317/5260;371/4054;323/9575;150/4259", "google_scholar": ";hrpWwhAAAAAJ;Cgc-2roAAAAJ;https://scholar.google.co.kr/citations?user=LfBoJt8AAAAJ", "orcid": ";;;0000-0002-2181-9264", "linkedin": ";;dogyunpark/;", "or_profile": "~Juyeon_Ko1;~Inho_Kong1;~Dogyun_Park2;~Hyunwoo_Kim1", "aff": "Korea University;Korea University;Korea University;Korea University", "aff_domain": "korea.ac.kr;korea.ac.kr;korea.ac.kr;korea.ac.kr", "position": "MS student;Undergrad student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nko2024stochastic,\ntitle={Stochastic Conditional Diffusion Models for Robust Semantic Image Synthesis},\nauthor={Juyeon Ko and Inho Kong and Dogyun Park and Hyunwoo J. Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rMV86cAOh6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8357808, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9297482955626093724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "korea.ac.kr;korea.ac.kr;korea.ac.kr;korea.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea University", "aff_unique_dep": "", "aff_unique_url": "https://www.korea.ac.kr", "aff_unique_abbr": "KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Highway Value Iteration Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32953", "id": "rORsGuE2hV", "proceeding": "https://proceedings.mlr.press/v235/wang24ai.html", "pdf": "https://openreview.net/pdf?id=rORsGuE2hV", "openreview": "https://openreview.net/forum?id=rORsGuE2hV", "author_site": "Yuhui Wang, Weida Li, Francesco Faccio, Qingyuan Wu, J\u00fcrgen Schmidhuber", "tldr": "", "abstract": "Value iteration networks (VINs) enable end-to-end learning for planning tasks by employing a differentiable \"planning module\" that approximates the value iteration algorithm. However, long-term planning remains a challenge because training very deep VINs is difficult. To address this problem, we embed highway value iteration---a recent algorithm designed to facilitate long-term credit assignment---into the structure of VINs. This improvement augments the \"planning module\" of the VIN with three additional components: 1) an \"aggregate gate,\" which constructs skip connections to improve information flow across many layers; 2) an \"exploration module,\" crafted to increase the diversity of information and gradient flow in spatial dimensions; 3) a \"filter gate\" designed to ensure safe exploration. The resulting novel highway VIN can be trained effectively with hundreds of layers using standard backpropagation. 
In long-term planning tasks requiring hundreds of planning steps, deep highway VINs outperform both traditional VINs and several advanced, very deep NNs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuhui Wang;Weida Li;Francesco Faccio;Qingyuan Wu;J\u00fcrgen Schmidhuber", "authorids": "~Yuhui_Wang1;~Weida_Li1;~Francesco_Faccio1;~Qingyuan_Wu1;~J\u00fcrgen_Schmidhuber1", "gender": "M;;M;M;M", "homepage": "https://wangyuhuix.github.io/;;;;http://people.idsia.ch/~juergen/", "dblp": ";121/8659;227/3214;;s/JurgenSchmidhuber", "google_scholar": "https://scholar.google.com.tw/citations?hl=zh-CN;claK_XkAAAAJ;0z3DkrkAAAAJ;CYfMzb8AAAAJ;https://scholar.google.ch/citations?user=gLnCTgIAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yuhui_Wang1;~Weida_Li1;~Francesco_Faccio1;~Qingyuan_Wu1;~J\u00fcrgen_Schmidhuber1", "aff": "King Abdullah University of Science and Technology;University of Waterloo;The Swiss AI Lab IDSIA - USI - SUPSI;University of Liverpool;IDSIA", "aff_domain": "kaust.edu.sa;uwaterloo.ca;idsia.ch;liverpool.ac.uk;idsia.ch", "position": "Postdoc;Intern;PhD student;PhD student;Scientific Director", "bibtex": "@inproceedings{\nwang2024highway,\ntitle={Highway Value Iteration Networks},\nauthor={Yuhui Wang and Weida Li and Francesco Faccio and Qingyuan Wu and J{\\\"u}rgen Schmidhuber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rORsGuE2hV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2056399, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=501954707938285374&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "kaust.edu.sa;uwaterloo.ca;idsia.ch;liverpool.ac.uk;idsia.ch", "author_num": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "King Abdullah University of Science and Technology;University of Waterloo;Swiss AI Lab IDSIA;University of Liverpool;Institute of Digital Technologies", "aff_unique_dep": ";;AI Lab;;", "aff_unique_url": "https://www.kaust.edu.sa;https://uwaterloo.ca;https://www.idsia.ch/;https://www.liverpool.ac.uk;https://www.idsia.ch", "aff_unique_abbr": "KAUST;UW;IDSIA;Liv Uni;IDSIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3;2", "aff_country_unique": "Saudi Arabia;Canada;Switzerland;United Kingdom" }, { "title": "Expressivity and Generalization: Fragment-Biases for Molecular GNNs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32952", "id": "rPm5cKb1VB", "proceeding": "https://proceedings.mlr.press/v235/wollschlager24a.html", "pdf": "https://openreview.net/pdf?id=rPm5cKb1VB", "openreview": "https://openreview.net/forum?id=rPm5cKb1VB", "author_site": "Tom Wollschl\u00e4ger, Niklas Kemper, Leon Hetzel, Johanna Sommer, Stephan G\u00fcnnemann", "tldr": "", "abstract": "Although recent advances in higher-order Graph Neural Networks (GNNs) improve theoretical expressiveness and molecular property prediction performance, they often fall short of the empirical performance of models that explicitly use fragment information as inductive bias. However, for these approaches, there exists no theoretical expressivity study. In this work, we propose the *Fragment-WL* test, an extension to the well-known Weisfeiler & Leman (WL) test, which enables the theoretical analysis of these fragment-biased GNNs. 
Building on the insights gained from the Fragment-WL test, we develop a new GNN architecture and a fragmentation with infinite vocabulary that significantly boosts expressiveness. We show the effectiveness of our model on synthetic and real-world data where we outperform all GNNs on Peptides and have $12$% lower error than all GNNs on ZINC and $34$% lower error than other fragment-biased models. Furthermore, we show that our model exhibits superior generalization capabilities compared to the latest transformer-based architectures, positioning it as a robust solution for a range of molecular modeling tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tom Wollschl\u00e4ger;Niklas Kemper;Leon Hetzel;Johanna Sommer;Stephan G\u00fcnnemann", "authorids": "~Tom_Wollschl\u00e4ger1;~Niklas_Kemper1;~Leon_Hetzel1;~Johanna_Sommer1;~Stephan_G\u00fcnnemann1", "gender": "M;M;M;F;M", "homepage": "https://www.linkedin.com/in/wollschlaeger/;https://linkedin.com/in/niklas-kemper-6119602b1;;https://johanna-sommer.com;http://www.daml.in.tum.de", "dblp": "332/0829;379/9628;246/5214;https://dblp.uni-trier.de/pid/243/2320;43/3011", "google_scholar": "https://scholar.google.com/citations?hl=en;;;R3p8FGsAAAAJ;", "orcid": ";;0000-0002-4823-9729;;", "linkedin": "wollschlaeger/;https://linkedin.com/in/niklas-kemper-6119602b1;;;", "or_profile": "~Tom_Wollschl\u00e4ger1;~Niklas_Kemper1;~Leon_Hetzel1;~Johanna_Sommer1;~Stephan_G\u00fcnnemann1", "aff": "Valence Labs powered by recursion;Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t M\u00fcnchen;Technical University Munich", "aff_domain": "valencelabs.com;tum.edu;tum.de;tum.de;tum.de", "position": "Researcher;MS student;PhD student;PhD student;Professor", "bibtex": "@inproceedings{\nwollschl{\\\"a}ger2024expressivity,\ntitle={Expressivity and Generalization: Fragment-Biases for Molecular {GNN}s},\nauthor={Tom Wollschl{\\\"a}ger and Niklas Kemper and Leon Hetzel and Johanna Sommer and Stephan G{\\\"u}nnemann},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rPm5cKb1VB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1319674, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16137446192711199722&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "valencelabs.com;tum.edu;tum.de;tum.de;tum.de", "author_num": 5, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Valence Labs;Technische Universit\u00e4t M\u00fcnchen;Technical University of Munich", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.tum.de;https://www.tum.de", "aff_unique_abbr": ";TUM;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1;1", "aff_country_unique": ";Germany" }, { "title": "Contrastive Representation for Data Filtering in Cross-Domain Offline Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32951", "id": "rReWhol66R", "proceeding": "https://proceedings.mlr.press/v235/wen24a.html", "pdf": "https://openreview.net/pdf?id=rReWhol66R", "openreview": "https://openreview.net/forum?id=rReWhol66R", "author_site": "Xiaoyu Wen, Chenjia Bai, Kang Xu, Xudong Yu, Yang Zhang, Xuelong Li, Zhen Wang", "tldr": "", "abstract": "Cross-domain offline reinforcement learning leverages source domain data with diverse 
transition dynamics to alleviate the data requirement for the target domain. However, simply merging the data of two domains leads to performance degradation due to the dynamics mismatch. Existing methods address this problem by measuring the dynamics gap via domain classifiers while relying on assumptions about the transferability of paired domains. In this paper, we propose a novel representation-based approach to measure the domain gap, where the representation is learned through a contrastive objective by sampling transitions from different domains. We show that such an objective recovers the mutual-information gap of transition functions in two domains without suffering from the unbounded issue of the dynamics gap in handling significantly different domains. Based on the representations, we introduce a data filtering algorithm that selectively shares transitions from the source domain according to the contrastive score functions. Empirical results on various tasks demonstrate that our method achieves superior performance, using only 10% of the target data to achieve 89.2% of the performance that state-of-the-art methods attain with the full (100%) target dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyu Wen;Chenjia Bai;Kang Xu;Xudong Yu;Yang Zhang;Xuelong Li;Zhen Wang", "authorids": "~Xiaoyu_Wen2;~Chenjia_Bai2;~Kang_Xu2;~Xudong_Yu2;~Yang_Zhang49;~Xuelong_Li2;~Zhen_Wang11", "gender": "M;M;M;;M;M;M", "homepage": ";https://baichenjia.github.io/;https://kangxu023.github.io/;;https://github.com/breez3young;;http://iopen.nwpu.edu.cn/info/1015/1351.htm?ivk_sa=1024320u", "dblp": ";247/1943;295/1622;;;l/XuelongLi;", "google_scholar": "fk767YoAAAAJ;Rm_1y2kAAAAJ;7FTLsHUAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;2NxmmZwAAAAJ;ahUibskAAAAJ;https://scholar.google.co.uk/citations?hl=zh-CN", "orcid": ";;0000-0001-6040-3002;;;;", "linkedin": ";;;;;;", "or_profile": "~Xiaoyu_Wen2;~Chenjia_Bai2;~Kang_Xu2;~Xudong_Yu2;~Yang_Zhang49;~Xuelong_Li2;~Zhen_Wang11", "aff": "Northwest Polytechnical University Xi'an;Shanghai AI Laboratory;Fudan University;Harbin Institute of Technology;Tsinghua University;Northwestern Polytechnical University;Northwestern Polytechnical University", "aff_domain": "nwpu.edu.cn;pjlab.org.cn;fudan.edu.cn;hit.edu.cn;mails.tsinghua.edu.cn;nwpu.edu.cn;nwpu.edu.cn", "position": "MS student;Researcher;MS student;PhD student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwen2024contrastive,\ntitle={Contrastive Representation for Data Filtering in Cross-Domain Offline Reinforcement Learning},\nauthor={Xiaoyu Wen and Chenjia Bai and Kang Xu and Xudong Yu and Yang Zhang and Xuelong Li and Zhen Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rReWhol66R}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 804331, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2259255461100000278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nwpu.edu.cn;pjlab.org.cn;fudan.edu.cn;hit.edu.cn;mails.tsinghua.edu.cn;nwpu.edu.cn;nwpu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;3;4;5;5", "aff_unique_norm": "Northwest Polytechnical University;Shanghai AI Laboratory;Fudan University;Harbin Institute of Technology;Tsinghua University;Northwestern Polytechnical University", "aff_unique_dep": ";;;;;", "aff_unique_url": 
"http://www.nwpu.edu.cn;https://www.shanghai-ai-lab.com;https://www.fudan.edu.cn;http://www.hit.edu.cn/;https://www.tsinghua.edu.cn;https://www.nwpu.edu.cn", "aff_unique_abbr": "NWPU;SAIL;Fudan;HIT;THU;NWPU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Xi'an;;Harbin", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Effective Federated Graph Matching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32950", "id": "rSfzchjIYu", "proceeding": "https://proceedings.mlr.press/v235/zhou24v.html", "pdf": "https://openreview.net/pdf?id=rSfzchjIYu", "openreview": "https://openreview.net/forum?id=rSfzchjIYu", "author_site": "Yang Zhou, Zijie Zhang, Zeru Zhang, Lingjuan Lyu, Wei-Shinn Ku", "tldr": "", "abstract": "Graph matching in the setting of federated learning is still an open problem. This paper proposes an unsupervised federated graph matching algorithm, UFGM, for inferring matched node pairs on different graphs across clients while maintaining privacy requirement, by leveraging graphlet theory and trust region optimization. First, the nodes' graphlet features are captured to generate pseudo matched node pairs on different graphs across clients as pseudo training data for tackling the dilemma of unsupervised graph matching in federated setting and leveraging the strength of supervised graph matching. An approximate graphlet enumeration method is proposed to sample a small number of graphlets and capture nodes' graphlet features. Theoretical analysis is conducted to demonstrate that the approximate method is able to maintain the quality of graphlet estimation while reducing its expensive cost. Second, we propose a separate trust region algorithm for pseudo supervised federated graph matching while maintaining the privacy constraints. In order to avoid expensive cost of the second-order Hessian computation in the trust region algorithm, we propose two weak quasi-Newton conditions to construct a positive definite scalar matrix as the Hessian approximation with only first-order gradients. 
We theoretically derive the error introduced by the separate trust region due to the Hessian approximation and conduct the convergence analysis of the approximation method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Zhou;Zijie Zhang;Zeru Zhang;Lingjuan Lyu;Wei-Shinn Ku", "authorids": "~Yang_Zhou4;~Zijie_Zhang1;~Zeru_Zhang1;~Lingjuan_Lyu1;~Wei-Shinn_Ku1", "gender": ";M;M;F;M", "homepage": "http://eng.auburn.edu/users/yangzhou/;;;https://sites.google.com/view/lingjuan-lyu;http://www.eng.auburn.edu/~weishinn/", "dblp": "07/4580-1;63/8333.html;280/1147;178/9876;21/1694", "google_scholar": "yvE8Po0AAAAJ;https://scholar.google.com/citations?hl=zh-CN;;;https://scholar.google.com.tw/citations?user=ZQ87sO4AAAAJ", "orcid": "0000-0001-7839-4933;0000-0003-1254-098X;;;0000-0001-8636-4689", "linkedin": ";;;;", "or_profile": "~Yang_Zhou4;~Zijie_Zhang1;~Zeru_Zhang1;~Lingjuan_Lyu1;~Wei-Shinn_Ku1", "aff": "Auburn University;University of Texas at San Antonio;Auburn University;Sony;Auburn University", "aff_domain": "auburn.edu;utsa.edu;auburn.edu;sony.com;auburn.edu", "position": "Assistant Professor;Assistant Professor;PhD student;scientist;Full Professor", "bibtex": "@inproceedings{\nzhou2024effective,\ntitle={Effective Federated Graph Matching},\nauthor={Yang Zhou and Zijie Zhang and Zeru Zhang and Lingjuan Lyu and Wei-Shinn Ku},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rSfzchjIYu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 920282, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_ovKfN7dI_UJ:scholar.google.com/&scioq=Effective+Federated+Graph+Matching&hl=en&as_sdt=0,47", "gs_version_total": 4, "email": "auburn.edu;utsa.edu;auburn.edu;sony.com;auburn.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Auburn University;University of Texas at San Antonio;Sony Corporation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.auburn.edu;https://www.utsa.edu;https://www.sony.com", "aff_unique_abbr": "Auburn;UTSA;Sony", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Antonio", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Japan" }, { "title": "Decomposing and Editing Predictions by Modeling Model Computation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32949", "id": "rTBR0eqE4G", "proceeding": "https://proceedings.mlr.press/v235/shah24a.html", "pdf": "https://openreview.net/pdf?id=rTBR0eqE4G", "openreview": "https://openreview.net/forum?id=rTBR0eqE4G", "author_site": "Harshay Shah, Andrew Ilyas, Aleksander Madry", "tldr": "", "abstract": "*How does the internal computation of a machine learning model transform inputs into predictions?* To tackle this question, we introduce a framework called *component modeling* for decomposing a model prediction in terms of its components---architectural \"building blocks\" such as convolution filters or attention heads. We focus on a special case of this framework, *component attribution*, where the goal is to estimate the counterfactual impact of individual components on a given prediction. We then present COAR, a scalable algorithm for estimating component attributions, and demonstrate its effectiveness across models, datasets and modalities. Finally, we show that COAR directly enables effective model editing. 
Our code is available at [github.com/MadryLab/modelcomponents](https://github.com/MadryLab/modelcomponents).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harshay Shah;Andrew Ilyas;Aleksander Madry", "authorids": "~Harshay_Shah1;~Andrew_Ilyas1;~Aleksander_Madry1", "gender": ";M;M", "homepage": "http://harshay.me/;http://andrewilyas.com;https://people.csail.mit.edu/madry/", "dblp": "211/7945;156/5465;67/2454", "google_scholar": "oC8YKjUAAAAJ;Dtw3YBoAAAAJ;SupjsEUAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Harshay_Shah1;~Andrew_Ilyas1;~Aleksander_Madry1", "aff": "Apple MLR;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "apple.com;mit.edu;mit.edu", "position": "Research Intern;PhD student;Professor", "bibtex": "@inproceedings{\nshah2024decomposing,\ntitle={Decomposing and Editing Predictions by Modeling Model Computation},\nauthor={Harshay Shah and Andrew Ilyas and Aleksander Madry},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rTBR0eqE4G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9114619, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9410364383681136578&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "apple.com;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Apple;Massachusetts Institute of Technology", "aff_unique_dep": "Machine Learning and Research;", "aff_unique_url": "https://www.apple.com;https://web.mit.edu", "aff_unique_abbr": "Apple;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Is machine learning good or bad for the natural sciences?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32948", "id": "rU8o0QQCy0", "proceeding": "https://proceedings.mlr.press/v235/hogg24a.html", "pdf": "https://openreview.net/pdf?id=rU8o0QQCy0", "openreview": "https://openreview.net/forum?id=rU8o0QQCy0", "author_site": "David W. Hogg, Soledad Villar", "tldr": "", "abstract": "Machine learning (ML) methods are having a huge impact across all of the sciences. However, ML has a strong ontology \u2014 in which only the data exist \u2014 and a strong epistemology \u2014 in which a model is considered good if it performs well on held-out training data. These philosophies are in strong conflict with both standard practices and key philosophies in the natural sciences. Here we identify some locations for ML in the natural sciences at which the ontology and epistemology are valuable. For example, when an expressive machine learning model is used in a causal inference to represent the effects of confounders, such as foregrounds, backgrounds, or instrument calibration parameters, the model capacity and loose philosophy of ML can make the results more trustworthy. We also show that there are contexts in which the introduction of ML introduces strong, unwanted statistical biases. For one, when ML models are used to emulate physical (or first-principles) simulations, they amplify confirmation biases. For another, when expressive regressions are used to label datasets, those labels cannot be used in downstream joint or ensemble analyses without taking on uncontrolled biases. 
The question in the title is being asked of all of the natural sciences; that is, we are calling on the scientific communities to take a step back and consider the role and value of ML in their fields; the (partial) answers we give here come from the particular perspective of physics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David W Hogg;Soledad Villar", "authorids": "~David_W_Hogg1;~Soledad_Villar2", "gender": ";F", "homepage": "https://cosmo.nyu.edu/hogg/;https://www.ams.jhu.edu/villar/", "dblp": ";https://dblp.uni-trier.de/pers/hd/v/Villar:Soledad", "google_scholar": "hcmW-W0AAAAJ;JBGlsDoAAAAJ", "orcid": "0000-0003-2866-9403;", "linkedin": ";", "or_profile": "~David_Hogg1;~Soledad_Villar1", "aff": "New York University;Johns Hopkins University", "aff_domain": "nyu.edu;jhu.edu", "position": "Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nhogg2024position,\ntitle={Position: Is machine learning good or bad for the natural sciences?},\nauthor={David W Hogg and Soledad Villar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rU8o0QQCy0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 589106, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17332912892817062351&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 4, "email": "nyu.edu;jhu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "New York University;Johns Hopkins University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nyu.edu;https://www.jhu.edu", "aff_unique_abbr": "NYU;JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Coactive Learning for Large Language Models using Implicit User Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35295", "id": "rVWsTjMW1m", "proceeding": "https://proceedings.mlr.press/v235/tucker24a.html", "pdf": "https://openreview.net/pdf?id=rVWsTjMW1m", "openreview": "https://openreview.net/forum?id=rVWsTjMW1m", "author_site": "Aaron D. Tucker, Kiant\u00e9 Brantley, Adam Cahall, Thorsten Joachims", "tldr": "", "abstract": "We propose coactive learning as a model and feedback mechanism for training large language models (LLMs). The key insight is that users provide implicit feedback whenever they edit the text $y$ proposed by an LLM. While the edited text $\\bar y$ is typically not a gold-standard example for supervised training, coactive learning merely requires that the edited text $\\bar y$ is an improvement over the proposed text $y$. Note that such weak implicit preference feedback $\\bar y \\succ y$ is available in many application settings on a per-user basis, thus enabling the personalization of LLMs. In this paper, we develop the theoretical basis for coactive training of non-linear models, and we derive CoRLL as the first coactive learning algorithm for LLMs. 
Empirical results indicate that CoRLL is effective even for weak and noisy coactive preference feedback, making it a promising algorithm for training and personalization of LLMs from feedback that is naturally collected in many use cases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aaron David Tucker;Kiant\u00e9 Brantley;Adam Cahall;Thorsten Joachims", "authorids": "~Aaron_David_Tucker1;~Kiant\u00e9_Brantley2;~Adam_Cahall1;~Thorsten_Joachims1", "gender": ";;;M", "homepage": "https://atucker.github.io/;;;http://www.joachims.org", "dblp": "256/5430;;;j/ThorstenJoachims", "google_scholar": "2HUwRZsAAAAJ;;;5tk1PV8AAAAJ", "orcid": "0000-0003-3967-9711;;;0000-0003-3654-3683", "linkedin": ";;adam-cahall-9aa6191b3/;thorsten-joachims-7224a35/", "or_profile": "~Aaron_David_Tucker1;~Kiant\u00e9_Brantley2;~Adam_Cahall1;~Thorsten_Joachims1", "aff": "Cornell University;;Cornell University;Amazon", "aff_domain": "cornell.edu;;cornell.edu;amazon.com", "position": "PhD student;;Undergrad student;Amazon Scholar", "bibtex": "@inproceedings{\ntucker2024coactive,\ntitle={Coactive Learning for Large Language Models using Implicit User Feedback},\nauthor={Aaron David Tucker and Kiant{\\'e} Brantley and Adam Cahall and Thorsten Joachims},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rVWsTjMW1m}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 764168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5155508016770905811&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "cornell.edu;;cornell.edu;amazon.com", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Cornell University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.cornell.edu;https://www.amazon.com", "aff_unique_abbr": "Cornell;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Piecewise Constant and Linear Regression Trees: An Optimal Dynamic Programming Approach", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32947", "id": "rXnBvu5D7i", "proceeding": "https://proceedings.mlr.press/v235/van-den-bos24a.html", "pdf": "https://openreview.net/pdf?id=rXnBvu5D7i", "openreview": "https://openreview.net/forum?id=rXnBvu5D7i", "author_site": "Mim van den Bos, Jacobus van der Linden, Emir Demirovi\u0107", "tldr": "", "abstract": "Regression trees are a human-comprehensible machine-learning model that can represent complex relationships. They are typically trained using greedy heuristics because computing optimal regression trees is NP-hard. Contrary to this standard practice, we consider optimal methods and improve the scalability of optimal methods by developing three new dynamic programming approaches. First, we improve the performance of a piecewise constant regression tree method using a special algorithm for trees of depth two. Second, we provide the first optimal dynamic programming method for piecewise multiple linear regression. Third, we develop the first optimal method for piecewise simple linear regression, for which we also provide a special algorithm for trees of depth two. 
The experimental results show that our methods improve scalability by one or more orders of magnitude over the state-of-the-art optimal methods while performing similarly or better in out-of-sample performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mim van den Bos;Jacobus G. M. van der Linden;Emir Demirovi\u0107", "authorids": "~Mim_van_den_Bos1;~Jacobus_G._M._van_der_Linden1;~Emir_Demirovi\u01071", "gender": "M;Not Specified;M", "homepage": ";http://www.emirdemirovic.com;https://www.tudelft.nl/ewi/over-de-faculteit/afdelingen/software-technology/algorithmics/people/koos-van-der-linden/", "dblp": ";;294/1810", "google_scholar": ";;rc-Xm_AAAAAJ", "orcid": ";;0009-0001-4015-0594", "linkedin": "mim-van-den-bos;;", "or_profile": "~Mim_van_den_Bos1;~Emir_Demirovi\u01071;~Jacobus_G.M._van_der_Linden1", "aff": "Delft University of Technology;Delft University of Technology;Delft University of Technology", "aff_domain": "student.tudelft.nl;tudelft.nl;tudelft.nl", "position": "MS student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nbos2024piecewise,\ntitle={Piecewise Constant and Linear Regression Trees: An Optimal Dynamic Programming Approach},\nauthor={Mim van den Bos and Jacobus G. M. van der Linden and Emir Demirovi{\\'c}},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rXnBvu5D7i}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 576542, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16933954519162280043&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "email": "student.tudelft.nl;tudelft.nl;tudelft.nl", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Moreau Envelope for Nonconvex Bi-Level Optimization: A Single-Loop and Hessian-Free Solution Strategy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32946", "id": "rZD9hV0Bc4", "proceeding": "https://proceedings.mlr.press/v235/liu24ap.html", "pdf": "https://openreview.net/pdf?id=rZD9hV0Bc4", "openreview": "https://openreview.net/forum?id=rZD9hV0Bc4", "author_site": "Risheng Liu, Zhu Liu, Wei Yao, Shangzhi Zeng, Jin Zhang", "tldr": "", "abstract": "This work focuses on addressing two major challenges in the context of large-scale nonconvex Bi-Level Optimization (BLO) problems, which are increasingly applied in machine learning due to their ability to model nested structures. These challenges involve ensuring computational efficiency and providing theoretical guarantees. While recent advances in scalable BLO algorithms have primarily relied on lower-level convexity simplification, our work specifically tackles large-scale BLO problems involving nonconvexity in both the upper and lower levels. We simultaneously address computational and theoretical challenges by introducing an innovative single-loop gradient-based algorithm, utilizing the Moreau envelope-based reformulation, and providing non-asymptotic convergence analysis for general nonconvex BLO problems. 
Notably, our algorithm relies solely on first-order gradient information, enhancing its practicality and efficiency, especially for large-scale BLO learning tasks. We validate our approach's effectiveness through experiments on various synthetic problems, two typical hyper-parameter learning tasks, and a real-world neural architecture search application, collectively demonstrating its superior performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Risheng Liu;Zhu Liu;Wei Yao;Shangzhi Zeng;Jin Zhang", "authorids": "~Risheng_Liu1;~Zhu_Liu3;~Wei_Yao3;~Shangzhi_Zeng1;~Jin_Zhang8", "gender": ";M;;M;M", "homepage": "https://rsliu.tech/;https://scholar.google.com/citations?user=WDjOXbIAAAAJ&hl=zh-CN;https://mathscinet.ams.org/mathscinet/search/author.html?mrauthid=910710;;https://math.sustech.edu.cn/c/zhangjin?lang=en", "dblp": "82/8066;14/191-4.html;;209/8353;", "google_scholar": "DzuhImQAAAAJ;WDjOXbIAAAAJ;;rzIzb6cAAAAJ;https://scholar.google.ca/citations?hl=en", "orcid": ";0000-0003-0975-2711;;0000-0002-6950-7825;", "linkedin": ";;;;", "or_profile": "~Risheng_Liu1;~Zhu_Liu3;~Wei_Yao3;~Shangzhi_Zeng1;~Jin_Zhang8", "aff": "Dalian University of Technology;Dalian University of Technology;Southern University of Science and Technology;University of Victoria;", "aff_domain": "dlut.edu.cn;dlut.edu.cn;sustech.edu.cn;uvic.ca;", "position": "Full Professor;PhD student;Assistant Professor;Postdoc;", "bibtex": "@inproceedings{\nliu2024moreau,\ntitle={Moreau Envelope for Nonconvex Bi-Level Optimization: A Single-Loop and Hessian-Free Solution Strategy},\nauthor={Risheng Liu and Zhu Liu and Wei Yao and Shangzhi Zeng and Jin Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rZD9hV0Bc4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3098488, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1113381049889494682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "dlut.edu.cn;dlut.edu.cn;sustech.edu.cn;uvic.ca;", "author_num": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Dalian University of Technology;Southern University of Science and Technology;University of Victoria", "aff_unique_dep": ";;", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.sustech.edu.cn;https://www.uvic.ca", "aff_unique_abbr": "DUT;SUSTech;UVic", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;Canada" }, { "title": "A New Theoretical Perspective on Data Heterogeneity in Federated Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32945", "id": "re6es2atbl", "proceeding": "https://proceedings.mlr.press/v235/wang24bu.html", "pdf": "https://openreview.net/pdf?id=re6es2atbl", "openreview": "https://openreview.net/forum?id=re6es2atbl", "author_site": "Jiayi Wang, Shiqiang Wang, Rong-Rong Chen, Mingyue Ji", "tldr": "", "abstract": "In federated learning (FL), data heterogeneity is the main reason that existing theoretical analyses are pessimistic about the convergence rate. In particular, for many FL algorithms, the convergence rate grows dramatically when the number of local updates becomes large, especially when the product of the gradient divergence and local Lipschitz constant is large. 
However, empirical studies show that more local updates can improve the convergence rate even when these two parameters are large, which is inconsistent with the theoretical findings. This paper aims to bridge this gap between theoretical understanding and practical performance by providing a theoretical analysis from a new perspective on data heterogeneity. In particular, we propose a new and weaker assumption than the local Lipschitz gradient assumption, named the heterogeneity-driven pseudo-Lipschitz assumption. We show that this and the gradient divergence assumptions can jointly characterize the effect of data heterogeneity. By deriving a convergence upper bound for FedAvg and its extensions, we show that, compared to the existing works, the local Lipschitz constant is replaced by the much smaller heterogeneity-driven pseudo-Lipschitz constant and the corresponding convergence upper bound can be significantly reduced for the same number of local updates, although its order stays the same. In addition, when the local objective function is quadratic, more insights into the impact of data heterogeneity can be obtained using the heterogeneity-driven pseudo-Lipschitz constant. For example, we can identify a region where FedAvg can outperform mini-batch SGD even when the gradient divergence can be arbitrarily large. Our findings are validated using experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiayi Wang;Shiqiang Wang;Rong-Rong Chen;Mingyue Ji", "authorids": "~Jiayi_Wang4;~Shiqiang_Wang1;~Rong-Rong_Chen1;~Mingyue_Ji1", "gender": "F;M;;M", "homepage": ";https://shiqiang.wang;;https://mingyueji.ece.ufl.edu/", "dblp": ";87/5094-1;;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;kA_vmOcAAAAJ;G2pEqUQAAAAJ;rWLfxVgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jiayi_Wang4;~Shiqiang_Wang1;~Rong-Rong_Chen1;~Mingyue_Ji1", "aff": "Oak Ridge National Laboratory;IBM, International Business Machines;University of Utah;University of Florida", "aff_domain": "ornl.gov;us.ibm.com;utah.edu;ufl.edu", "position": "Postdoc;Research Staff Member;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024a,\ntitle={A New Theoretical Perspective on Data Heterogeneity in Federated Optimization},\nauthor={Jiayi Wang and Shiqiang Wang and Rong-Rong Chen and Mingyue Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=re6es2atbl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2060110, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3253535705080024254&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ornl.gov;us.ibm.com;utah.edu;ufl.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Oak Ridge National Laboratory;International Business Machines;University of Utah;University of Florida", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ornl.gov;https://www.ibm.com;https://www.utah.edu;https://www.ufl.edu", "aff_unique_abbr": "ORNL;IBM;Utah;UF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "SaVeR: Optimal Data Collection Strategy for Safe Policy Evaluation in Tabular MDP", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32944", "id": "reB9FFAaKw", 
"proceeding": "https://proceedings.mlr.press/v235/mukherjee24a.html", "pdf": "https://openreview.net/pdf?id=reB9FFAaKw", "openreview": "https://openreview.net/forum?id=reB9FFAaKw", "author_site": "Subhojyoti Mukherjee, Josiah Hanna, Robert Nowak", "tldr": "", "abstract": "In this paper, we study safe data collection for the purpose of policy evaluation in tabular Markov decision processes (MDPs). In policy evaluation, we are given a target policy and asked to estimate the expected cumulative reward it will obtain. Policy evaluation requires data and we are interested in the question of what *behavior* policy should collect the data for the most accurate evaluation of the target policy. While prior work has considered behavior policy selection, in this paper, we additionally consider a safety constraint on the behavior policy. Namely, we assume there exists a known default policy that incurs a particular expected cost when run and we enforce that the cumulative cost of all behavior policies ran is better than a constant factor of the cost that would be incurred had we always run the default policy. We first show that there exists a class of intractable MDPs where no safe oracle algorithm with knowledge about problem parameters can efficiently collect data and satisfy the safety constraints. We then define the tractability condition for an MDP such that a safe oracle algorithm can efficiently collect data and using that we prove the first lower bound for this setting. We then introduce an algorithm SaVeR for this problem that approximates the safe oracle algorithm and bound the finite-sample mean squared error of the algorithm while ensuring it satisfies the safety constraint. Finally, we show in simulations that SaVeR produces low MSE policy evaluation while satisfying the safety constraint.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Subhojyoti Mukherjee;Josiah P. Hanna;Robert D Nowak", "authorids": "~Subhojyoti_Mukherjee1;~Josiah_P._Hanna1;~Robert_D_Nowak1", "gender": "M;M;M", "homepage": "https://subhojyoti.github.io/;http://nowak.ece.wisc.edu;https://pages.cs.wisc.edu/~jphanna/", "dblp": "199/2032;n/RobertDNowak;135/6336", "google_scholar": ";fn13u8IAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Subhojyoti_Mukherjee1;~Robert_D_Nowak1;~Josiah_Hanna2", "aff": "University of Wisconsin, Madison;University of Wisconsin - Madison;University of Wisconsin - Madison", "aff_domain": "wisc.edu;;wisc.edu", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nmukherjee2024saver,\ntitle={SaVeR: Optimal Data Collection Strategy for Safe Policy Evaluation in Tabular {MDP}},\nauthor={Subhojyoti Mukherjee and Josiah P. 
Hanna and Robert D Nowak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=reB9FFAaKw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1461860, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5103285875524933950&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "wisc.edu;;wisc.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Wisconsin;University of Wisconsin-Madison", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu", "aff_unique_abbr": "UW;UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Intent-aligned AI Systems Must Optimize for Agency Preservation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32943", "id": "rfvgdfd1K9", "proceeding": "https://proceedings.mlr.press/v235/mitelut24a.html", "pdf": "https://openreview.net/pdf?id=rfvgdfd1K9", "openreview": "https://openreview.net/forum?id=rfvgdfd1K9", "author_site": "Catalin Mitelut, Benjamin Smith, Peter Vamplew", "tldr": "", "abstract": "A central approach to AI-safety research has been to generate aligned AI systems: i.e. systems that do not deceive users and yield actions or recommendations that humans might judge as consistent with their intentions and goals. Here we argue that truthful AIs aligned solely to human intent are insufficient and that preservation of long-term agency of humans may be a more robust standard that may need to be separated and explicitly optimized for. We discuss the science of intent and control and how human intent can be manipulated and we provide a formal definition of agency-preserving AI-human interactions focusing on forward-looking explicit agency evaluations. 
Our work points to a novel pathway for human harm in AI-human interactions and proposes solutions to this challenge.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Catalin Mitelut;Benjamin Smith;Peter Vamplew", "authorids": "~Catalin_Mitelut1;benjsmith@gmail.com;~Peter_Vamplew2", "gender": ";;M", "homepage": ";;https://federation.edu.au/schools/school-of-science-engineering-and-information-technology/staff-profiles/information-technology/peter-vamplew", "dblp": ";;v/PeterVamplew", "google_scholar": ";;https://scholar.google.com.au/citations?user=Q4oV_VoAAAAJ", "orcid": ";;0000-0002-8687-4424", "linkedin": ";;peter-vamplew-1150361/", "or_profile": "~Catalin_Mitelut1;benjsmith@gmail.com;~Peter_Vamplew2", "aff": ";;Federation University Australia", "aff_domain": ";;federation.edu.au", "position": ";;Full Professor", "bibtex": "@inproceedings{\nmitelut2024position,\ntitle={Position: Intent-aligned {AI} Systems Must Optimize for Agency Preservation},\nauthor={Catalin Mitelut and Benjamin Smith and Peter Vamplew},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rfvgdfd1K9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3551965, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11161880961259875520&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 4, "email": ";;federation.edu.au", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Federation University Australia", "aff_unique_dep": "", "aff_unique_url": "https://www.federation.edu.au", "aff_unique_abbr": "FedUni", "aff_country_unique_index": "0", "aff_country_unique": "Australia" }, { "title": "Reducing Item Discrepancy via Differentially Private Robust Embedding Alignment for Privacy-Preserving Cross Domain Recommendation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32942", "id": "rk4kmL8aOY", "proceeding": "https://proceedings.mlr.press/v235/liu24cf.html", "pdf": "https://openreview.net/pdf?id=rk4kmL8aOY", "openreview": "https://openreview.net/forum?id=rk4kmL8aOY", "author_site": "Weiming Liu, Xiaolin Zheng, Chaochao Chen, Jiahe Xu, Xinting Liao, Fan Wang, Yanchao Tan, Yew-Soon Ong", "tldr": "", "abstract": "Cross-Domain Recommendation (CDR) has become increasingly appealing by leveraging useful information to tackle the data sparsity problem across domains. Most of the latest CDR models assume that domain-shareable user-item information (e.g., rating and review on overlapped users or items) is accessible across domains. However, these assumptions become impractical due to strict data privacy protection policies. In this paper, we propose the Reducing Item Discrepancy (RidCDR) model for solving the Privacy-Preserving Cross-Domain Recommendation (PPCDR) problem. Specifically, we aim to enhance the model performance on both source and target domains without overlapped users and items while protecting the data privacy. We innovatively propose a private-robust embedding alignment module in RidCDR for knowledge sharing across domains while avoiding negative transfer privately. 
Our empirical study on Amazon and Douban datasets demonstrates that RidCDR significantly outperforms the state-of-the-art models in the PPCDR setting without overlapped users and items.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiming Liu;Xiaolin Zheng;Chaochao Chen;Jiahe Xu;Xinting Liao;Fan Wang;Yanchao Tan;Yew-Soon Ong", "authorids": "~Weiming_Liu2;~Xiaolin_Zheng1;~Chaochao_Chen3;~Jiahe_Xu1;~Xinting_Liao1;~Fan_Wang14;~Yanchao_Tan1;~Yew-Soon_Ong1", "gender": ";M;;F;F;;F;", "homepage": ";https://person.zju.edu.cn/xlzheng;https://sites.google.com/site/ccchomepage/;https://github.com/Che-Xu;https://xenialll.github.io/;;;", "dblp": ";09/5763;26/1492-1;72/7143-3.html;331/1544;;210/4829.html;", "google_scholar": ";MY23M60AAAAJ;qZTMyzwAAAAJ;;FoMerO8AAAAJ;;NQWuK9UAAAAJ;", "orcid": ";0000-0001-5483-0366;0000-0003-1419-964X;0009-0009-0680-1806;0000-0002-8257-2381;;0000-0002-3526-6859;", "linkedin": ";;ccchomepage/;;;;;", "or_profile": "~Weiming_Liu2;~Xiaolin_Zheng1;~Chaochao_Chen3;~Jiahe_Xu1;~Xinting_Liao1;~Fan_Wang14;~Yanchao_Tan1;~Yew-Soon_Ong1", "aff": ";Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;;Fuzhou University;", "aff_domain": ";zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;;fzu.edu.cn;", "position": ";Full Professor;Distinguished Research Fellow;Undergrad student;PhD student;;Lecturer;", "bibtex": "@inproceedings{\nliu2024reducing,\ntitle={Reducing Item Discrepancy via Differentially Private Robust Embedding Alignment for Privacy-Preserving Cross Domain Recommendation},\nauthor={Weiming Liu and Xiaolin Zheng and Chaochao Chen and Jiahe Xu and Xinting Liao and Fan Wang and Yanchao Tan and Yew-Soon Ong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rk4kmL8aOY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1054836, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17359352278262368810&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": ";zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;;fzu.edu.cn;", "author_num": 8, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Zhejiang University;Fuzhou University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.fzu.edu.cn", "aff_unique_abbr": "ZJU;FZU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Revealing the Dark Secrets of Extremely Large Kernel ConvNets on Robustness", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32941", "id": "rkYOxLLv2x", "proceeding": "https://proceedings.mlr.press/v235/chen24bb.html", "pdf": "https://openreview.net/pdf?id=rkYOxLLv2x", "openreview": "https://openreview.net/forum?id=rkYOxLLv2x", "author_site": "Honghao Chen, Yurong Zhang, Xiaokun Feng, Xiangxiang Chu, Kaiqi Huang", "tldr": "", "abstract": "Robustness is a vital aspect to consider when deploying deep learning models into the wild. Numerous studies have been dedicated to the robustness of vision transformers (ViTs), which have dominated as the mainstream backbone choice for vision tasks since the dawn of the 2020s. Recently, some large kernel convnets have made a comeback with impressive performance and efficiency. However, it remains unclear whether large kernel networks are robust and to what their robustness can be attributed. 
In this paper, we first conduct a comprehensive evaluation of large kernel convnets' robustness and their differences from typical small kernel counterparts and ViTs on six diverse robustness benchmark datasets. Then, to analyze the underlying factors behind their strong robustness, we design experiments from both quantitative and qualitative perspectives to reveal large kernel convnets' intriguing properties that are completely different from typical convnets. Our experiments demonstrate for the first time that pure CNNs can achieve exceptional robustness comparable or even superior to that of ViTs. Our analysis of occlusion invariance, kernel attention patterns, and frequency characteristics provides novel insights into the source of robustness. Code available at: https://github.com/Lauch1ng/LKRobust.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Honghao Chen;Yurong Zhang;Xiaokun Feng;Xiangxiang Chu;Kaiqi Huang", "authorids": "~Honghao_Chen1;~Yurong_Zhang1;~Xiaokun_Feng1;~Xiangxiang_Chu1;~Kaiqi_Huang1", "gender": "M;F;M;M;M", "homepage": ";;https://github.com/XiaokunFeng;https://cxxgtxy.github.io/;https://people.ucas.ac.cn/~huangkaiqi?language=en", "dblp": "279/9807;261/6606;314/9776;207/8002;89/7026", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com.hk/citations?user=NqXtIPIAAAAJ;jn21pUsAAAAJ;caQ-OmYAAAAJ", "orcid": ";;;0000-0003-2548-0605;", "linkedin": ";%E8%82%B2%E8%8D%A3-%E5%BC%A0-603333238/;;;", "or_profile": "~Honghao_Chen1;~Yurong_Zhang1;~Xiaokun_Feng1;~Xiangxiang_Chu1;~Kaiqi_Huang1", "aff": "Institute of Automation, Chinese Academy of Sciences;Shanghai Jiaotong University;Institute of automation, Chinese academy of science;MeiTuan;Institute of automation, Chinese academy of science", "aff_domain": "ia.ac.cn;sjtu.edu.cn;ia.ac.cn;meituan.com;nlpr.ia.ac.cn", "position": "PhD student;MS student;PhD student;Senior Engineer;Professor", "bibtex": "@inproceedings{\nchen2024revealing,\ntitle={Revealing the Dark Secrets of Extremely Large Kernel ConvNets on Robustness},\nauthor={Honghao Chen and Yurong Zhang and Xiaokun Feng and Xiangxiang Chu and Kaiqi Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rkYOxLLv2x}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 615155, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11389311864238033993&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ia.ac.cn;sjtu.edu.cn;ia.ac.cn;meituan.com;nlpr.ia.ac.cn", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Chinese Academy of Sciences;Shanghai Jiao Tong University;Meituan", "aff_unique_dep": "Institute of Automation;;", "aff_unique_url": "http://www.ia.cas.cn;https://www.sjtu.edu.cn;https://www.meituan.com", "aff_unique_abbr": "CAS;SJTU;MeiTuan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Listening to the noise: Blind Denoising with Gibbs Diffusion", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32940", "id": "rmEgJ7bhuZ", "proceeding": "https://proceedings.mlr.press/v235/heurtel-depeiges24a.html", "pdf": "https://openreview.net/pdf?id=rmEgJ7bhuZ", "openreview": "https://openreview.net/forum?id=rmEgJ7bhuZ", "author_site": "David Heurtel-Depeiges, Charles Margossian, Ruben 
Ohana, Bruno R\u00e9galdo-Saint Blancard", "tldr": "", "abstract": "In recent years, denoising problems have become intertwined with the development of deep generative models. In particular, diffusion models are trained like denoisers, and the distribution they model coincides with denoising priors in the Bayesian picture. However, denoising through diffusion-based posterior sampling requires the noise level and covariance to be known, preventing *blind denoising*. We overcome this limitation by introducing Gibbs Diffusion (GDiff), a general methodology addressing posterior sampling of both the signal and the noise parameters. Assuming arbitrary parametric Gaussian noise, we develop a Gibbs algorithm that alternates sampling steps from a conditional diffusion model trained to map the signal prior to the class of noise distributions, and a Monte Carlo sampler to infer the noise parameters. Our theoretical analysis highlights potential pitfalls, guides diagnostic usage, and quantifies errors in the Gibbs stationary distribution caused by the diffusion model. We showcase our method for 1) blind denoising of natural images involving colored noises with unknown amplitude and exponent, and 2) a cosmology problem, namely the analysis of cosmic microwave background data, where Bayesian inference of \"noise\" parameters means constraining models of the evolution of the Universe.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Heurtel-Depeiges;Charles Margossian;Ruben Ohana;Bruno R\u00e9galdo-Saint Blancard", "authorids": "~David_Heurtel-Depeiges1;~Charles_Margossian1;~Ruben_Ohana1;~Bruno_R\u00e9galdo-Saint_Blancard1", "gender": "M;M;;", "homepage": "https://david-heurtel-depeiges.github.io/;https://charlesm93.github.io./;https://rubenohana.github.io/;https://users.flatironinstitute.org/~bregaldosaintblancard/", "dblp": ";;251/5608;", "google_scholar": ";nPtLsvIAAAAJ;https://scholar.google.fr/citations?user=F9qNg2wAAAAJ;TfcmfBQAAAAJ", "orcid": ";0000-0002-3274-5619;0000-0002-8493-1210;0000-0003-0055-0953", "linkedin": ";charles-margossian-3428935b/;rubenohana/;", "or_profile": "~David_Heurtel-Depeiges1;~Charles_Margossian1;~Ruben_Ohana1;~Bruno_R\u00e9galdo-Saint_Blancard1", "aff": "Google;Flatiron Institute;Flatiron Institute;Flatiron Institute", "aff_domain": "deepmind.com;flatironinstitute.org;flatironinstitute.org;flatironinstitute.org", "position": "Intern;Postdoc;Postdoc;Postdoc", "bibtex": "@inproceedings{\nheurtel-depeiges2024listening,\ntitle={Listening to the noise: Blind Denoising with Gibbs Diffusion},\nauthor={David Heurtel-Depeiges and Charles Margossian and Ruben Ohana and Bruno R{\\'e}galdo-Saint Blancard},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rmEgJ7bhuZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4795921, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1883152997089562211&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "deepmind.com;flatironinstitute.org;flatironinstitute.org;flatironinstitute.org", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Google;Flatiron Institute", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://flatironinstitute.org", "aff_unique_abbr": "Google;Flatiron", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": 
"0;0;0;0", "aff_country_unique": "United States" }, { "title": "Designing Decision Support Systems using Counterfactual Prediction Sets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32939", "id": "rqyXubsBhH", "proceeding": "https://proceedings.mlr.press/v235/straitouri24a.html", "pdf": "https://openreview.net/pdf?id=rqyXubsBhH", "openreview": "https://openreview.net/forum?id=rqyXubsBhH", "author_site": "Eleni Straitouri, Manuel Gomez-Rodriguez", "tldr": "", "abstract": "Decision support systems for classification tasks are predominantly designed to predict the value of the ground truth labels. However, since their predictions are not perfect, these systems also need to make human experts understand when and how to use these predictions to update their own predictions. Unfortunately, this has been proven challenging. In this context, it has been recently argued that an alternative type of decision support systems may circumvent this challenge. Rather than providing a single label prediction, these systems provide a set of label prediction values constructed using a conformal predictor, namely a prediction set, and forcefully ask experts to predict a label value from the prediction set. However, the design and evaluation of these systems have so far relied on stylized expert models, questioning their promise. In this paper, we revisit the design of this type of systems from the perspective of online learning and develop a methodology that does not require, nor assumes, an expert model. Our methodology leverages the nested structure of the prediction sets provided by any conformal predictor and a natural counterfactual monotonicity assumption to achieve an exponential improvement in regret in comparison to vanilla bandit algorithms. We conduct a large-scale human subject study ($n = 2{,}751$) to compare our methodology to several competitive baselines. 
The results show that, for decision support systems based on prediction sets, limiting experts\u2019 level of agency leads to greater performance than allowing experts to always exercise their own agency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eleni Straitouri;Manuel Gomez Rodriguez", "authorids": "~Eleni_Straitouri1;~Manuel_Gomez_Rodriguez1", "gender": ";M", "homepage": "https://people.mpi-sws.org/~estraitouri/;https://www.mpi-sws.org/~manuelgr/", "dblp": "302/4619;73/8260", "google_scholar": "kphSqwwAAAAJ;https://scholar.google.com.tw/citations?user=UcuXmuwAAAAJ", "orcid": ";", "linkedin": "eleni-straitouri-919419205;", "or_profile": "~Eleni_Straitouri1;~Manuel_Gomez_Rodriguez1", "aff": "MPI-SWS;MPI-SWS", "aff_domain": "mpi-sws.org;mpi-sws.org", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nstraitouri2024designing,\ntitle={Designing Decision Support Systems using Counterfactual Prediction Sets},\nauthor={Eleni Straitouri and Manuel Gomez Rodriguez},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rqyXubsBhH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2452971, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4069766652661221391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mpi-sws.org;mpi-sws.org", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Max Planck Institute for Software Systems", "aff_unique_dep": "", "aff_unique_url": "https://www.mpi-sws.org", "aff_unique_abbr": "MPI-SWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "AlphaFold Meets Flow Matching for Generating Protein Ensembles", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32938", "id": "rs8Sh2UASt", "proceeding": "https://proceedings.mlr.press/v235/jing24a.html", "pdf": "https://openreview.net/pdf?id=rs8Sh2UASt", "openreview": "https://openreview.net/forum?id=rs8Sh2UASt", "author_site": "Bowen Jing, Bonnie Berger, Tommi Jaakkola", "tldr": "", "abstract": "The biological functions of proteins often depend on dynamic structural ensembles. In this work, we develop a flow-based generative modeling approach for learning and sampling the conformational landscapes of proteins. We repurpose highly accurate single-state predictors such as AlphaFold and ESMFold and fine-tune them under a custom flow matching framework to obtain sequence-conditioned generative models of protein structure called AlphaFlow and ESMFlow. When trained and evaluated on the PDB, our method provides a superior combination of precision and diversity compared to AlphaFold with MSA subsampling. When further trained on ensembles from all-atom MD, our method accurately captures conformational flexibility, positional distributions, and higher-order ensemble observables for unseen proteins. Moreover, our method can diversify a static PDB structure with faster wall-clock convergence to certain equilibrium properties than replicate MD trajectories, demonstrating its potential as a proxy for expensive physics-based simulations. 
Code is available at https://github.com/bjing2016/alphaflow.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Jing;Bonnie Berger;Tommi Jaakkola", "authorids": "~Bowen_Jing1;~Bonnie_Berger1;~Tommi_S._Jaakkola1", "gender": ";F;", "homepage": ";https://people.csail.mit.edu/bab/;", "dblp": ";b/BonnieBerger;", "google_scholar": ";bYjKaowAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Bowen_Jing1;~Bonnie_Berger1;~Tommi_S._Jaakkola1", "aff": ";Massachusetts Institute of Technology;", "aff_domain": ";mit.edu;", "position": ";Full Professor;", "bibtex": "@inproceedings{\njing2024alphafold,\ntitle={AlphaFold Meets Flow Matching for Generating Protein Ensembles},\nauthor={Bowen Jing and Bonnie Berger and Tommi Jaakkola},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rs8Sh2UASt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9148636, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12272007018629929193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";mit.edu;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Sampling in Unit Time with Kernel Fisher-Rao Flow", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32937", "id": "rtyqBfcg8j", "proceeding": "https://proceedings.mlr.press/v235/maurais24a.html", "pdf": "https://openreview.net/pdf?id=rtyqBfcg8j", "openreview": "https://openreview.net/forum?id=rtyqBfcg8j", "author_site": "Aimee Maurais, Youssef Marzouk", "tldr": "", "abstract": "We introduce a new mean-field ODE and corresponding interacting particle systems (IPS) for sampling from an unnormalized target density. The IPS are gradient-free, available in closed form, and only require the ability to sample from a reference density and compute the (unnormalized) target-to-reference density ratio. The mean-field ODE is obtained by solving a Poisson equation for a velocity field that transports samples along the geometric mixture of the two densities, $\\pi_0^{1-t} \\pi_1^t$, which is the path of a particular Fisher-Rao gradient flow. We employ an RKHS ansatz for the velocity field, which makes the Poisson equation tractable and enables discretization of the resulting mean-field ODE over finite samples. The mean-field ODE can additionally be derived from a discrete-time perspective as the limit of successive linearizations of the Monge-Amp\u00e8re equations within a framework known as sample-driven optimal transport.
We introduce a stochastic variant of our approach and demonstrate empirically that our IPS can produce high-quality samples from varied target distributions, outperforming comparable gradient-free particle systems and remaining competitive with gradient-based alternatives.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aimee Maurais;Youssef Marzouk", "authorids": "~Aimee_Maurais1;~Youssef_Marzouk1", "gender": ";M", "homepage": ";https://uqgroup.mit.edu", "dblp": ";78/9757", "google_scholar": ";TwVbNZ4AAAAJ", "orcid": ";0000-0001-8242-3290", "linkedin": ";", "or_profile": "~Aimee_Maurais1;~Youssef_Marzouk1", "aff": ";Massachusetts Institute of Technology", "aff_domain": ";mit.edu", "position": ";Professor", "bibtex": "@inproceedings{\nmaurais2024sampling,\ntitle={Sampling in Unit Time with Kernel Fisher-Rao Flow},\nauthor={Aimee Maurais and Youssef Marzouk},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rtyqBfcg8j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3120262, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6365226743208880534&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";mit.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Dynamic Facility Location in High Dimensional Euclidean Spaces", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32936", "id": "rucbIsWoEV", "proceeding": "https://proceedings.mlr.press/v235/bhattacharya24a.html", "pdf": "https://openreview.net/pdf?id=rucbIsWoEV", "openreview": "https://openreview.net/forum?id=rucbIsWoEV", "author_site": "Sayan Bhattacharya, Gramoz Goranci, Shaofeng Jiang, Yi Qian, Yubo Zhang", "tldr": "", "abstract": "We study the facility location problem in the dynamic setting, where the goal is to efficiently process an intermixed sequence of point insertions and deletions while maintaining a high quality and stable solution. Although the problem has been studied in the context of general metrics and low-dimensional spaces, much remains unknown concerning dynamic facility location in high dimensional spaces. In this work, we present the first fully dynamic algorithm for facility location in high-dimensional spaces $\\mathbb{R}^{d}$. For any $c \\geq 1$, our algorithm achieves an $O(c)$-approximation, supports point updates in $\\tilde{O}(\\mathrm{poly}(d)n^{1/c + o(1)})$ amortized time and incurs $O(1)$ amortized recourse. More generally, our result shows that despite the linear-time lower bound on the update time for general metrics, it is possible to achieve sub-linear update times for metric spaces that admit dynamic nearest neighbour oracles. Experiments on real datasets confirm that our algorithm achieves high-quality solutions with low running time, and incurs minimal recourse.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sayan Bhattacharya;Gramoz Goranci;Shaofeng H.-C.
Jiang;Yi Qian;Yubo Zhang", "authorids": "~Sayan_Bhattacharya2;~Gramoz_Goranci1;~Shaofeng_H.-C._Jiang1;qianyi@stu.pku.edu.cn;~Yubo_Zhang4", "gender": "M;;M;;M", "homepage": "https://www.dcs.warwick.ac.uk/~u1671158/;https://sites.google.com/view/ggoranci/;https://shaofengjiang.cn;;http://saigyouji.github.io/", "dblp": "57/3907.html;179/2404;157/6062;;", "google_scholar": "ca-urkIAAAAJ;nWXrBUgAAAAJ;;;", "orcid": ";0000-0002-9603-2255;0000-0001-7972-827X;;", "linkedin": ";;;;", "or_profile": "~Sayan_Bhattacharya2;~Gramoz_Goranci1;~Shaofeng_H.-C._Jiang1;qianyi@stu.pku.edu.cn;~Yubo_Zhang4", "aff": "Google;Universit\u00e4t Vienna;Peking University;;Peking University", "aff_domain": "google.com;univie.ac.at;pku.edu.cn;;pku.edu.cn", "position": "Researcher;Assistant Professor;Assistant Professor;;PhD student", "bibtex": "@inproceedings{\nbhattacharya2024dynamic,\ntitle={Dynamic Facility Location in High Dimensional Euclidean Spaces},\nauthor={Sayan Bhattacharya and Gramoz Goranci and Shaofeng H.-C. Jiang and Yi Qian and Yubo Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rucbIsWoEV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 540299, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2043010615199739334&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "google.com;univie.ac.at;pku.edu.cn;;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Google;University of Vienna;Peking University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://univie.ac.at;http://www.pku.edu.cn", "aff_unique_abbr": "Google;UV;Peking U", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "United States;Austria;China" }, { "title": "Differentiable Annealed Importance Sampling Minimizes The Jensen-Shannon Divergence Between Initial and Target Distribution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32935", "id": "rvaN2P1rvC", "proceeding": "https://proceedings.mlr.press/v235/zenn24a.html", "pdf": "https://openreview.net/pdf?id=rvaN2P1rvC", "openreview": "https://openreview.net/forum?id=rvaN2P1rvC", "author_site": "Johannes Zenn, Robert Bamler", "tldr": "", "abstract": "Differentiable annealed importance sampling (DAIS), proposed by Geffner & Domke (2021) and Zhang et al. (2021), allows optimizing, among others, over the initial distribution of AIS. In this paper, we show that, in the limit of many transitions, DAIS minimizes the symmetrized KL divergence (Jensen-Shannon divergence) between the initial and target distribution. Thus, DAIS can be seen as a form of variational inference (VI) in that its initial distribution is a parametric fit to an intractable target distribution. 
We empirically evaluate the usefulness of the initial distribution as a variational distribution on synthetic and real-world data, observing that it often provides more accurate uncertainty estimates than standard VI (optimizing the reverse KL divergence), importance weighted VI, and Markovian score climbing (optimizing the forward KL divergence).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johannes Zenn;Robert Bamler", "authorids": "~Johannes_Zenn1;~Robert_Bamler1", "gender": "M;M", "homepage": "https://jzenn.github.io;https://robamler.github.io/", "dblp": "308/0659;195/6208.html", "google_scholar": "QYZmWP8AAAAJ;LwvdNAgAAAAJ", "orcid": "0009-0001-5834-3474;", "linkedin": "johannes-zenn;", "or_profile": "~Johannes_Zenn1;~Robert_Bamler1", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzenn2024differentiable,\ntitle={Differentiable Annealed Importance Sampling Minimizes The Jensen-Shannon Divergence Between Initial and Target Distribution},\nauthor={Johannes Zenn and Robert Bamler},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rvaN2P1rvC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1803966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12453215385874368005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uni-tuebingen.de;uni-tuebingen.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;University of Tuebingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "SAM-E: Leveraging Visual Foundation Model with Sequence Imitation for Embodied Manipulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32934", "id": "ryDa4mS18V", "proceeding": "https://proceedings.mlr.press/v235/zhang24c.html", "pdf": "https://openreview.net/pdf?id=ryDa4mS18V", "openreview": "https://openreview.net/forum?id=ryDa4mS18V", "author_site": "Junjie Zhang, Chenjia Bai, Haoran He, Zhigang Wang, Bin Zhao, Xiu Li, Xuelong Li", "tldr": "", "abstract": "Acquiring a multi-task imitation policy in 3D manipulation poses challenges in terms of scene understanding and action prediction. Current methods employ both 3D representation and multi-view 2D representation to predict the poses of the robot\u2019s end-effector. However, they still require a considerable amount of high-quality robot trajectories, and suffer from limited generalization in unseen tasks and inefficient execution in long-horizon reasoning. In this paper, we propose **SAM-E**, a novel architecture for robot manipulation by leveraging a vision-foundation model for generalizable scene understanding and sequence imitation for long-term action reasoning. 
Specifically, we adopt Segment Anything (SAM) pre-trained on a huge number of images and promptable masks as the foundation model for extracting task-relevant features, and employ parameter-efficient fine-tuning on robot data for a better understanding of embodied scenarios. To address long-horizon reasoning, we develop a novel multi-channel heatmap that enables the prediction of the action sequence in a single pass, notably enhancing execution efficiency. Experimental results from various instruction-following tasks demonstrate that SAM-E achieves superior performance with higher execution efficiency compared to the baselines, and also significantly improves generalization in few-shot adaptation to new tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junjie Zhang;Chenjia Bai;Haoran He;Zhigang Wang;Bin Zhao;Xiu Li;Xuelong Li", "authorids": "~Junjie_Zhang6;~Chenjia_Bai2;~Haoran_He1;~Zhigang_Wang3;~Bin_Zhao7;~Xiu_Li1;~Xuelong_Li2", "gender": "M;M;M;M;F;M;M", "homepage": "https://pipixiaqishi1.github.io/;https://baichenjia.github.io/;https://tinnerhrhe.github.io/;https://iopen.nwpu.edu.cn/info/1347/2105.htm;https://thusigsiclab.github.io/thu.github.io/introduction.html;;", "dblp": ";247/1943;299/7312;73/4325-1.html;13/1206-1;l/XuelongLi;35/1989-2", "google_scholar": "EaLlLZ8AAAAJ;Rm_1y2kAAAAJ;Z33PHQ0AAAAJ;https://scholar.google.com.hk/citations?user=DQB0hqwAAAAJ;https://scholar.google.com/citations?hl=zh-CN;ahUibskAAAAJ;cw3EaAYAAAAJ", "orcid": ";;0000-0002-7340-8643;;0000-0003-0403-1923;;", "linkedin": ";;;;;;", "or_profile": "~Junjie_Zhang6;~Chenjia_Bai2;~Haoran_He1;~Bin_Zhao7;~Xiu_Li1;~Xuelong_Li2;~Zhi.gang_Wang1", "aff": "Tsinghua University;Shanghai AI Laboratory;Hong Kong University of Science and Technology;Northwest Polytechnical University Xi'an;Tsinghua University;Northwestern Polytechnical University;Shanghai AI Lab", "aff_domain": "tsinghua.edu.cn;pjlab.org.cn;connect.ust.hk;nwpu.edu.cn;tsinghua.edu.cn;nwpu.edu.cn;pjlab.org.cn", "position": "MS student;Researcher;PhD student;Associate Professor;Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nzhang2024same,\ntitle={{SAM}-E: Leveraging Visual Foundation Model with Sequence Imitation for Embodied Manipulation},\nauthor={Junjie Zhang and Chenjia Bai and Haoran He and Zhigang Wang and Bin Zhao and Xiu Li and Xuelong Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ryDa4mS18V}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9603707, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8387733044758518065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "tsinghua.edu.cn;pjlab.org.cn;connect.ust.hk;nwpu.edu.cn;tsinghua.edu.cn;nwpu.edu.cn;pjlab.org.cn", "author_num": 7, "aff_unique_index": "0;1;2;3;0;4;5", "aff_unique_norm": "Tsinghua University;Shanghai AI Laboratory;Hong Kong University of Science and Technology;Northwest Polytechnical University;Northwestern Polytechnical University;Shanghai AI Lab", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.shanghai-ai-lab.com;https://www.ust.hk;http://www.nwpu.edu.cn;https://www.nwpu.edu.cn;https://www.shanghaiailab.com", "aff_unique_abbr": "THU;SAIL;HKUST;NWPU;NWPU;SAIL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Xi'an", "aff_country_unique_index": "0;0;0;0;0;0;0", 
"aff_country_unique": "China" }, { "title": "Don't trust your eyes: on the (un)reliability of feature visualizations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32933", "id": "s0Jvdolv2I", "proceeding": "https://proceedings.mlr.press/v235/geirhos24a.html", "pdf": "https://openreview.net/pdf?id=s0Jvdolv2I", "openreview": "https://openreview.net/forum?id=s0Jvdolv2I", "author_site": "Robert Geirhos, Roland S. Zimmermann, Blair Bilodeau, Wieland Brendel, Been Kim", "tldr": "", "abstract": "How do neural networks extract patterns from pixels? Feature visualizations attempt to answer this important question by visualizing highly activating patterns through optimization. Today, visualization methods form the foundation of our knowledge about the internal workings of neural networks, as a type of mechanistic interpretability. Here we ask: How reliable are feature visualizations? We start our investigation by developing network circuits that trick feature visualizations into showing arbitrary patterns that are completely disconnected from normal network behavior on natural input. We then provide evidence for a similar phenomenon occurring in standard, unmanipulated networks: feature visualizations are processed very differently from standard input, casting doubt on their ability to \"explain\" how neural networks process natural images. This can be used as a sanity check for feature visualizations. We underpin our empirical findings by theory proving that the set of functions that can be reliably understood by feature visualization is extremely small and does not include general black-box neural networks. Therefore, a promising way forward could be the development of networks that enforce certain structures in order to ensure more reliable feature visualizations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Robert Geirhos;Roland S. Zimmermann;Blair Bilodeau;Wieland Brendel;Been Kim", "authorids": "~Robert_Geirhos1;~Roland_S._Zimmermann1;~Blair_Bilodeau1;~Wieland_Brendel1;~Been_Kim1", "gender": "M;M;M;;M", "homepage": "https://robertgeirhos.com/;http://www.blairbilodeau.ca;;https://beenkim.github.io/;https://rzimmermann.com", "dblp": "176/0076;;37/11107;https://dblp.uni-trier.de/pers/k/Kim:Been.html;227/2603", "google_scholar": "w3kGtMIAAAAJ;;v-JL-hsAAAAJ;;https://scholar.google.de/citations?user=4jdISHwAAAAJ", "orcid": "0000-0001-7698-3187;;;;", "linkedin": "rgeirhos/;;;;", "or_profile": "~Robert_Geirhos1;~Blair_Bilodeau1;~Wieland_Brendel1;~Been_Kim1;~Roland_Simon_Zimmermann1", "aff": "Google DeepMind;;ELLIS Institute T\u00fcbingen;Google DeepMind;Max-Planck Institute for Intelligent Systems", "aff_domain": "google.com;;tue.ellis.eu;google.com;mpg.tuebingen.de", "position": "Research Scientist;;Principal Researcher;Research Scientist;PhD student", "bibtex": "@inproceedings{\ngeirhos2024dont,\ntitle={Don't trust your eyes: on the (un)reliability of feature visualizations},\nauthor={Robert Geirhos and Roland S. 
Zimmermann and Blair Bilodeau and Wieland Brendel and Been Kim},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s0Jvdolv2I}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9040508, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13494620787600136581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;;tue.ellis.eu;google.com;mpg.tuebingen.de", "author_num": 5, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Google;ELLIS Institute;Max-Planck Institute for Intelligent Systems", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://ellis.eu/;https://www.mpi-is.mpg.de", "aff_unique_abbr": "DeepMind;ELLIS;MPI-IS", "aff_campus_unique_index": "1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United Kingdom;Germany" }, { "title": "DiffAug: Enhance Unsupervised Contrastive Learning with Domain-Knowledge-Free Diffusion-based Data Augmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32932", "id": "s0UDX7Kswl", "proceeding": "https://proceedings.mlr.press/v235/zang24a.html", "pdf": "https://openreview.net/pdf?id=s0UDX7Kswl", "openreview": "https://openreview.net/forum?id=s0UDX7Kswl", "author_site": "Zelin Zang, Hao Luo, Kai Wang, Panpan Zhang, Fan Wang, Stan Z Li, Yang You", "tldr": "", "abstract": "Unsupervised contrastive learning has gained prominence in fields such as vision and biology, leveraging predefined positive/negative samples for representation learning. Data augmentation, categorized into hand-designed and model-based methods, has been identified as a crucial component for enhancing contrastive learning. However, hand-designed methods require human expertise in domain-specific data while sometimes distorting the meaning of the data. In contrast, generative model-based approaches usually require supervised or large-scale external data, which has become a bottleneck constraining model training in many domains. To address the problems presented above, this paper proposes DiffAug, a novel unsupervised contrastive learning technique with diffusion model-based positive data generation. DiffAug consists of a semantic encoder and a conditional diffusion model; the conditional diffusion model generates new positive samples conditioned on the semantic encoding to serve the training of unsupervised contrastive learning. With the help of iterative training of the semantic encoder and diffusion model, DiffAug improves the representation ability in an uninterrupted and unsupervised manner. Experimental evaluations show that DiffAug outperforms hand-designed and SOTA model-based augmentation methods on DNA sequence, visual, and bio-feature datasets. The code for review is released at [DiffAug CODE](https://github.com/zangzelin/code_diffaug).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zelin Zang;Hao Luo;Kai Wang;Panpan Zhang;Fan Wang;Stan Z.
Li;Yang You", "authorids": "~Zelin_Zang2;~Hao_Luo1;~Kai_Wang8;~Panpan_Zhang1;~Fan_Wang6;~Stan_Z._Li2;~Yang_You1", "gender": "M;M;M;F;F;M;M", "homepage": ";http://luohao.site/;https://kaiwang960112.github.io/;;;https://www.comp.nus.edu.sg/~youy/;https://en.westlake.edu.cn/academics/School_of_Engineering/About/Our_People/Faculty/201912/t20191206_2497.shtml", "dblp": "226/7615;14/3727-4;78/2022-36;;;33/8167-1.html;l/StanZLi", "google_scholar": "foERjnQAAAAJ;7QvWnzMAAAAJ;i2II0XIAAAAJ;7QsqATEAAAAJ;WCRGTHsAAAAJ;jF4dPZwAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0002-6405-4011;0000-0002-1154-5175;;0000-0001-7320-1119;;", "linkedin": ";;;;;yang-you-0b92914b/;stan-z-li-%E6%9D%8E%E5%AD%90%E9%9D%92-55753224/", "or_profile": "~Zelin_Zang2;~Hao_Luo1;~Kai_Wang8;~Panpan_Zhang1;~Fan_Wang6;~Yang_You1;~Stan_Z._Li1", "aff": "National University of Singapore;Alibaba Group;National University of Singapore;National University of Singapore;Alibaba Group;National University of Singapore;Westlake University", "aff_domain": "nus.edu.sg;alibaba-inc.com;u.nus.edu;u.nus.edu;alibaba-inc.com;nus.edu.sg;westlake.edu.cn", "position": "Intern;Researcher;PhD student;PhD student;Senior Staff Algorithm Engineer;Professor;Chair Professor", "bibtex": "@inproceedings{\nzang2024diffaug,\ntitle={DiffAug: Enhance Unsupervised Contrastive Learning with Domain-Knowledge-Free Diffusion-based Data Augmentation},\nauthor={Zelin Zang and Hao Luo and Kai Wang and Panpan Zhang and Fan Wang and Stan Z. Li and Yang You},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s0UDX7Kswl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5583984, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4212662110553691983&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "nus.edu.sg;alibaba-inc.com;u.nus.edu;u.nus.edu;alibaba-inc.com;nus.edu.sg;westlake.edu.cn", "author_num": 7, "aff_unique_index": "0;1;0;0;1;0;2", "aff_unique_norm": "National University of Singapore;Alibaba Group;Westlake University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.alibaba.com;https://www.westlake.edu.cn", "aff_unique_abbr": "NUS;Alibaba;WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0;1", "aff_country_unique": "Singapore;China" }, { "title": "LoRA Training in the NTK Regime has No Spurious Local Minima", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32931", "id": "s1sdx6vNsU", "proceeding": "https://proceedings.mlr.press/v235/jang24d.html", "pdf": "https://openreview.net/pdf?id=s1sdx6vNsU", "openreview": "https://openreview.net/forum?id=s1sdx6vNsU", "author_site": "Uijeong Jang, Jason Lee, Ernest Ryu", "tldr": "", "abstract": "Low-rank adaptation (LoRA) has become the standard approach for parameter-efficient fine-tuning of large language models (LLM), but our theoretical understanding of LoRA has been limited. 
In this work, we theoretically analyze LoRA fine-tuning in the neural tangent kernel (NTK) regime with $N$ data points, showing: (i) full fine-tuning (without LoRA) admits a low-rank solution of rank $r\\lesssim \\sqrt{N}$; (ii) using LoRA with rank $r\\gtrsim \\sqrt{N}$ eliminates spurious local minima, allowing gradient descent to find the low-rank solutions; (iii) the low-rank solution found using LoRA generalizes well.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Uijeong Jang;Jason D. Lee;Ernest K. Ryu", "authorids": "~Uijeong_Jang1;~Jason_D._Lee1;~Ernest_K._Ryu1", "gender": "M;M;M", "homepage": "https://uijeongjang.github.io/;https://jasondlee88.github.io/;http://www.math.snu.ac.kr/~ernestryu/", "dblp": ";88/3262;165/5192", "google_scholar": ";GR_DsT0AAAAJ;CNOqUZoAAAAJ", "orcid": "0000-0001-7475-5965;;0000-0001-6820-9095", "linkedin": ";;", "or_profile": "~Uijeong_Jang1;~Jason_D._Lee1;~Ernest_K._Ryu1", "aff": "Seoul National University;Princeton University;Seoul National University", "aff_domain": "snu.ac.kr;princeton.edu;snu.ac.kr", "position": "MS student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njang2024lora,\ntitle={Lo{RA} Training in the {NTK} Regime has No Spurious Local Minima},\nauthor={Uijeong Jang and Jason D. Lee and Ernest K. Ryu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s1sdx6vNsU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1322264, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15339303190206462584&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;princeton.edu;snu.ac.kr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Seoul National University;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.princeton.edu", "aff_unique_abbr": "SNU;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "In-Context Sharpness as Alerts: An Inner Representation Perspective for Hallucination Mitigation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32930", "id": "s3e8poX3kb", "proceeding": "https://proceedings.mlr.press/v235/chen24av.html", "pdf": "https://openreview.net/pdf?id=s3e8poX3kb", "openreview": "https://openreview.net/forum?id=s3e8poX3kb", "author_site": "Shiqi Chen, Miao Xiong, Junteng Liu, Zhengxuan Wu, Teng Xiao, Siyang Gao, Junxian He", "tldr": "", "abstract": "Large language models (LLMs) frequently hallucinate, e.g., making factual errors, yet our understanding of why they make these errors remains limited. In this study, we aim to understand the underlying mechanisms of LLM hallucinations from the perspective of *inner representations*. We discover a pattern associated with hallucinations: correct generations tend to have *sharper* context activations in the hidden states of the in-context tokens, compared to those of the incorrect generations. Leveraging this signal, we propose an entropy-based metric to quantify the *sharpness* among the in-context hidden states and incorporate it into the decoding process, i.e., use the entropy value to adjust the next token prediction distribution to improve the factuality and overall quality of the generated text.
Experiments on knowledge-seeking datasets (Natural Questions, HotpotQA, TriviaQA) and a hallucination benchmark (TruthfulQA) demonstrate the consistent effectiveness of our method, e.g., an improvement of up to 8.6 absolute points on TruthfulQA. We believe this study can improve our understanding of hallucinations and serve as a practical solution for hallucination mitigation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiqi Chen;Miao Xiong;Junteng Liu;Zhengxuan Wu;Teng Xiao;Siyang Gao;Junxian He", "authorids": "~Shiqi_Chen3;~Miao_Xiong2;~Junteng_Liu2;~Zhengxuan_Wu1;~Teng_Xiao2;~Siyang_Gao1;~Junxian_He1", "gender": "F;F;M;M;M;M;M", "homepage": ";https://miaoxiong2320.github.io/;https://vicent0205.github.io/;https://cs.stanford.edu/~wuzhengx/;https://www.cityu.edu.hk/stfprofile/siyangao.htm;https://jxhe.github.io;https://tengxiao1.github.io/", "dblp": ";;347/3273;234/4650;136/9876;188/6127.html;", "google_scholar": "4Tg7zOMAAAAJ;yQ4U_5IAAAAJ;;CBvE6lwAAAAJ;NK6nQ9YAAAAJ;BIFGeoUAAAAJ;ld3OKXwAAAAJ", "orcid": ";;;;0000-0002-3574-6393;;", "linkedin": ";miao-xiong-9b1892187/;;;;;", "or_profile": "~Shiqi_Chen3;~Miao_Xiong2;~Junteng_Liu2;~Zhengxuan_Wu1;~Siyang_Gao1;~Junxian_He1;~Teng_Xiao1", "aff": "City University of Hong Kong;National University of Singapore;Shanghai Jiaotong University;Stanford University;City University of Hong Kong;Hong Kong University of Science and Technology;The Pennsylvania State University", "aff_domain": "cityu.edu.hk;u.nus.edu;sjtu.edu.cn;stanford.edu;cityu.edu.hk;ust.hk;psu.edu", "position": "PhD student;PhD student;Undergrad student;PhD student;Associate Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nchen2024incontext,\ntitle={In-Context Sharpness as Alerts: An Inner Representation Perspective for Hallucination Mitigation},\nauthor={Shiqi Chen and Miao Xiong and Junteng Liu and Zhengxuan Wu and Teng Xiao and Siyang Gao and Junxian He},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s3e8poX3kb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 965224, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16598156206541490114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cityu.edu.hk;u.nus.edu;sjtu.edu.cn;stanford.edu;cityu.edu.hk;ust.hk;psu.edu", "author_num": 7, "aff_unique_index": "0;1;2;3;0;4;5", "aff_unique_norm": "City University of Hong Kong;National University of Singapore;Shanghai Jiao Tong University;Stanford University;Hong Kong University of Science and Technology;Pennsylvania State University", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.nus.edu.sg;https://www.sjtu.edu.cn;https://www.stanford.edu;https://www.ust.hk;https://www.psu.edu", "aff_unique_abbr": "CityU;NUS;SJTU;Stanford;HKUST;PSU", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Hong Kong SAR;;Stanford", "aff_country_unique_index": "0;1;0;2;0;0;2", "aff_country_unique": "China;Singapore;United States" }, { "title": "Balanced Data, Imbalanced Spectra: Unveiling Class Disparities with Spectral Imbalance", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32929", "id": "s4EYBJ30WY", "proceeding": "https://proceedings.mlr.press/v235/kaushik24a.html", "pdf": "https://openreview.net/pdf?id=s4EYBJ30WY", "openreview": "https://openreview.net/forum?id=s4EYBJ30WY", "author_site": "Chiraag Kaushik, Ran
Liu, Chi-Heng Lin, Amrit Khera, Matthew Jin, Wenrui Ma, Vidya Muthukumar, Eva Dyer", "tldr": "", "abstract": "Classification models are expected to perform equally well for different classes, yet in practice, there are often large gaps in their performance. This issue of class bias is widely studied in cases of datasets with sample imbalance, but is relatively overlooked in balanced datasets. In this work, we introduce the concept of spectral imbalance in features as a potential source for class disparities and study the connections between spectral imbalance and class bias in both theory and practice. To build the connection between spectral imbalance and class gap, we develop a theoretical framework for studying class disparities and derive exact expressions for the per-class error in a high-dimensional mixture model setting. We then study this phenomenon in 11 different state-of-the-art pre-trained encoders, and show how our proposed framework can be used to compare the quality of encoders, as well as evaluate and combine data augmentation strategies to mitigate the issue. Our work sheds light on the class-dependent effects of learning, and provides new insights into how state-of-the-art pre-trained features may have unknown biases that can be diagnosed through their spectra.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chiraag Kaushik;Ran Liu;Chi-Heng Lin;Amrit Khera;Matthew Y Jin;Wenrui Ma;Vidya Muthukumar;Eva L Dyer", "authorids": "~Chiraag_Kaushik1;~Ran_Liu2;~Chi-Heng_Lin1;~Amrit_Khera1;~Matthew_Y_Jin1;~Wenrui_Ma3;~Vidya_Muthukumar3;~Eva_L_Dyer1", "gender": "M;F;M;;M;;F;F", "homepage": "https://chiraagk7.github.io/;https://ranliu98.github.io/;https://www.chihenglin.com/;;;https://dyerlab.gatech.edu/people/;https://vmuthukumar.ece.gatech.edu;http://dyerlab.gatech.edu", "dblp": ";;128/4282;;;;149/0019;64/8509.html", "google_scholar": "rr6QwiwAAAAJ;vBEAxZgAAAAJ;OqSt2wMAAAAJ;;;;K2OEs2YAAAAJ;Sb_jcHcAAAAJ", "orcid": ";;;0000-0002-8453-5031;;;;", "linkedin": "chiraag-kaushik/;;chi-heng-lin-986727217;amritk10/;myjin/;;;", "or_profile": "~Chiraag_Kaushik1;~Ran_Liu2;~Chi-Heng_Lin1;~Amrit_Khera1;~Matthew_Y_Jin1;~Wenrui_Ma3;~Vidya_Muthukumar3;~Eva_Dyer1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Samsung Research America;Georgia Institute of Technology;;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;samsung.com;gatech.edu;;gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Researcher;MS student;;MS student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nkaushik2024balanced,\ntitle={Balanced Data, Imbalanced Spectra: Unveiling Class Disparities with Spectral Imbalance},\nauthor={Chiraag Kaushik and Ran Liu and Chi-Heng Lin and Amrit Khera and Matthew Y Jin and Wenrui Ma and Vidya Muthukumar and Eva L Dyer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s4EYBJ30WY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4851788, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7771230801979682904&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "gatech.edu;gatech.edu;samsung.com;gatech.edu;;gatech.edu;gatech.edu;gatech.edu", "author_num": 8, "aff_unique_index": "0;0;1;0;0;0;0", "aff_unique_norm": "Georgia 
Institute of Technology;Samsung", "aff_unique_dep": ";Samsung Research America", "aff_unique_url": "https://www.gatech.edu;https://www.samsung.com/us/careers/research/", "aff_unique_abbr": "Georgia Tech;SRA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "MS-TIP: Imputation Aware Pedestrian Trajectory Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32928", "id": "s4Hy0L4mml", "proceeding": "https://proceedings.mlr.press/v235/chib24a.html", "pdf": "https://openreview.net/pdf?id=s4Hy0L4mml", "openreview": "https://openreview.net/forum?id=s4Hy0L4mml", "author_site": "Pranav Singh Chib, Achintya Nath, Paritosh Kabra, Ishu Gupta, Pravendra Singh", "tldr": "", "abstract": "Pedestrian trajectory prediction aims to predict future trajectories based on observed trajectories. Current state-of-the-art methods often assume that the observed sequences of agents are complete, which is a strong assumption that overlooks inherent uncertainties. Understanding pedestrian behavior when dealing with missing values in the observed sequence is crucial for enhancing the performance of predictive models. In this work, we propose the MultiScale hypergraph for Trajectory Imputation and Prediction (MS-TIP), a novel approach that simultaneously addresses the imputation of missing observations and the prediction of future trajectories. Specifically, we leverage transformers with diagonal masked self-attention to impute incomplete observations. Further, our approach promotes complex interaction modeling through multi-scale hypergraphs, optimizing our trajectory prediction module to capture different types of interactions. With the inclusion of scenic attention, we learn contextual scene information, instead of sole reliance on coordinates. Additionally, our approach utilizes an intermediate control point and refinement module to infer future trajectories accurately. Extensive experiments validate the efficacy of MS-TIP in precisely predicting pedestrian future trajectories. 
Code is publicly available at https://github.com/Pranav-chib/MS-TIP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pranav singh chib;Achintya Nath;Paritosh Kabra;Ishu Gupta;Pravendra Singh", "authorids": "~Pranav_singh_chib1;~Achintya_Nath1;~Paritosh_Kabra1;~Ishu_Gupta1;~Pravendra_Singh1", "gender": ";M;M;F;M", "homepage": ";https://channeli.in/student_profile/20114003/;;;https://sites.google.com/view/pravendra/", "dblp": ";;;;160/8743", "google_scholar": ";;;;YwDTxJMAAAAJ", "orcid": ";;0009-0003-8115-3569;;0000-0003-1001-2219", "linkedin": ";achintya-nath;paritosh-kabra-480b51201/;ishugupta71;", "or_profile": "~Pranav_singh_chib1;~Achintya_Nath1;~Paritosh_Kabra1;~Ishu_Gupta1;~Pravendra_Singh1", "aff": ";Indian Institute of Technology, Roorkee;Indian Institute of Technology, Roorkee;Indian Institute of Technology, Roorkee, Dhirubhai Ambani Institute Of Information and Communication Technology;Indian Institute of Technology, Roorkee", "aff_domain": ";iitr.ac.in;iitr.ac.in;iitr.ac.in;iitr.ac.in", "position": ";Undergrad student;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nchib2024mstip,\ntitle={{MS}-{TIP}: Imputation Aware Pedestrian Trajectory Prediction},\nauthor={Pranav singh chib and Achintya Nath and Paritosh Kabra and Ishu Gupta and Pravendra Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s4Hy0L4mml}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4261314, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12012358060431277387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "email": ";iitr.ac.in;iitr.ac.in;iitr.ac.in;iitr.ac.in", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Indian Institute of Technology;Indian Institute of Technology, Roorkee", "aff_unique_dep": ";", "aff_unique_url": "https://www.iitr.ac.in;https://www.iitr.ac.in", "aff_unique_abbr": "IIT Roorkee;IIT Roorkee", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Roorkee", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "India" }, { "title": "High-Performance Temporal Reversible Spiking Neural Networks with $\\mathcal{O}(L)$ Training Memory and $\\mathcal{O}(1)$ Inference Cost", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32927", "id": "s4h6nyjM9H", "proceeding": "https://proceedings.mlr.press/v235/hu24q.html", "pdf": "https://openreview.net/pdf?id=s4h6nyjM9H", "openreview": "https://openreview.net/forum?id=s4h6nyjM9H", "author_site": "JiaKui Hu, Man Yao, Xuerui Qiu, Yuhong Chou, Yuxuan Cai, Ning Qiao, Yonghong Tian, Bo XU, Guoqi Li", "tldr": "", "abstract": "Multi-timestep simulation of brain-inspired Spiking Neural Networks (SNNs) boosts memory requirements during training and increases inference energy cost. Current training methods cannot simultaneously solve both training and inference dilemmas. This work proposes a novel Temporal Reversible architecture for SNNs (T-RevSNN) to jointly address the training and inference challenges by altering the forward propagation of SNNs. We turn off the temporal dynamics of most spiking neurons and design multi-level temporal reversible interactions at temporal turn-on spiking neurons, resulting in an $\\mathcal{O}(L)$ training memory.
Combined with the temporal reversible nature, we redesign the input encoding and network organization of SNNs to achieve $\\mathcal{O}(1)$ inference energy cost. Then, we finely adjust the internal units and residual connections of the basic SNN block to ensure the effectiveness of sparse temporal information interaction. T-RevSNN achieves excellent accuracy on ImageNet, while the memory efficiency, training time acceleration and inference energy efficiency can be significantly improved by $8.6 \\times$, $2.0 \\times$ and $1.6 \\times$, respectively. This work is expected to break the technical bottleneck of significantly increasing memory cost and training time for large-scale SNNs while maintaining both high performance and low inference energy cost.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "JiaKui Hu;Man Yao;Xuerui Qiu;Yuhong Chou;Yuxuan Cai;Ning Qiao;Yonghong Tian;Bo XU;Guoqi Li", "authorids": "~JiaKui_Hu1;~Man_Yao1;~Xuerui_Qiu1;~Yuhong_Chou1;~Yuxuan_Cai1;~Ning_Qiao1;~Yonghong_Tian1;~Bo_XU10;~Guoqi_Li1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "https://github.com/jkhu29;;https://bollossom.github.io/sherry.qiu/;https://openreview.net/;https://nightsnack.github.io;;http://www.pkuml.org;;https://scholar.google.com/citations?hl=en&user=qCfE--MAAAAJ", "dblp": "327/3225;21/5932;351/8271;347/9986;;138/5959;86/5857;;", "google_scholar": "VagFt-sAAAAJ;eE4vvp0AAAAJ;bMwW4e8AAAAJ;8CpWM4cAAAAJ;EzYiBeUAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en", "orcid": ";;0009-0000-8725-1619;;;;0000-0002-2978-5935;;", "linkedin": ";;;;;;;%E6%B3%A2-%E5%BE%90-74210b115/?midToken=AQH1EMB1ZoboJA&midSig=2Q5MzMXmNEH9M1&trk=eml-email_pymk_02-header-22-profile&trkEmail=eml-email_pymk_02-header-22-profile-null-7ydrhe~kpggjoav~k9-null-neptune/profile~vanity.view;", "or_profile": "~JiaKui_Hu1;~Man_Yao1;~Xuerui_Qiu1;~Yuhong_Chou1;~Yuxuan_Cai1;~Ning_Qiao1;~Yonghong_Tian1;~Bo_XU10;~Guoqi_Li1", "aff": "Peking University;Institute of automation, Chinese academy of sciences;University of Electronic Science and Technology of China;The Hong Kong Polytechnic University;01.AI;Chengdu SynSense Tech. Co. Ltd. 
;Peking University;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "pku.edu.cn;ia.ac.cn;uestc.edu.cn;connect.polyu.hk;01.ai;synsense.ai;pku.edu.cn;ia.ac.cn;ia.ac.cn", "position": "PhD student;Assistant Professor;Undergrad student;PhD student;Researcher;CEO;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhu2024highperformance,\ntitle={High-Performance Temporal Reversible Spiking Neural Networks with \\${\\textbackslash}mathcal\\{O\\}(L)\\$ Training Memory and \\${\\textbackslash}mathcal\\{O\\}(1)\\$ Inference Cost},\nauthor={JiaKui Hu and Man Yao and Xuerui Qiu and Yuhong Chou and Yuxuan Cai and Ning Qiao and Yonghong Tian and Bo XU and Guoqi Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s4h6nyjM9H}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 679305, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "email": "pku.edu.cn;ia.ac.cn;uestc.edu.cn;connect.polyu.hk;01.ai;synsense.ai;pku.edu.cn;ia.ac.cn;ia.ac.cn", "author_num": 9, "aff_unique_index": "0;1;2;3;4;5;0;1;1", "aff_unique_norm": "Peking University;Chinese Academy of Sciences;University of Electronic Science and Technology of China;Hong Kong Polytechnic University;01.AI;Chengdu SynSense Tech. Co. Ltd.", "aff_unique_dep": ";Institute of Automation;;;;", "aff_unique_url": "http://www.pku.edu.cn;http://www.ia.cas.cn;https://www.uestc.edu.cn;https://www.polyu.edu.hk;;", "aff_unique_abbr": "Peking U;CAS;UESTC;PolyU;;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China;" }, { "title": "Meta-Learners for Partially-Identified Treatment Effects Across Multiple Environments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32926", "id": "s5PLISyNyP", "proceeding": "https://proceedings.mlr.press/v235/schweisthal24a.html", "pdf": "https://openreview.net/pdf?id=s5PLISyNyP", "openreview": "https://openreview.net/forum?id=s5PLISyNyP", "author_site": "Jonas Schweisthal, Dennis Frauen, M van der Schaar, Stefan Feuerriegel", "tldr": "", "abstract": "Estimating the conditional average treatment effect (CATE) from observational data is relevant for many applications such as personalized medicine. Here, we focus on the widespread setting where the observational data come from multiple environments, such as different hospitals, physicians, or countries. Furthermore, we allow for violations of standard causal assumptions, namely, overlap within the environments and unconfoundedness. To this end, we move away from point identification and focus on partial identification. Specifically, we show that current assumptions from the literature on multiple environments allow us to interpret the environment as an instrumental variable (IV). This allows us to adapt bounds from the IV literature for partial identification of CATE by leveraging treatment assignment mechanisms across environments. Then, we propose different model-agnostic learners (so-called meta-learners) to estimate the bounds that can be used in combination with arbitrary machine learning models. We further demonstrate the effectiveness of our meta-learners across various experiments using both simulated and real-world data. 
Finally, we discuss the applicability of our meta-learners to partial identification in instrumental variable settings, such as randomized controlled trials with non-compliance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonas Schweisthal;Dennis Frauen;Mihaela van der Schaar;Stefan Feuerriegel", "authorids": "~Jonas_Schweisthal1;~Dennis_Frauen1;~Mihaela_van_der_Schaar2;~Stefan_Feuerriegel1", "gender": "M;M;F;M", "homepage": "https://www.som.lmu.de/ai/en/institute/contact-page/jonas-schweisthal-0f01481a.html;https://www.ai.bwl.uni-muenchen.de/team/research_team/dennis_frauen/index.html;https://www.vanderschaar-lab.com;http://www.ai.bwl.lmu.de", "dblp": "329/4240;315/0115;;125/0630", "google_scholar": "https://scholar.google.de/citations?user=GHpjcEsAAAAJ;ieyW4WQAAAAJ;DZ3S--MAAAAJ;https://scholar.google.de/citations?hl=de", "orcid": "0000-0003-3725-3821;;;0000-0001-7856-8729", "linkedin": ";dennis-frauen-6b5746171/;;", "or_profile": "~Jonas_Schweisthal1;~Dennis_Frauen1;~Mihaela_van_der_Schaar2;~Stefan_Feuerriegel1", "aff": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;University of California, Los Angeles;LMU Munich", "aff_domain": "lmu.de;lmu.de;ucla.edu;lmu.de", "position": "PhD student;PhD student;Full Professor;Professor", "bibtex": "@inproceedings{\nschweisthal2024metalearners,\ntitle={Meta-Learners for Partially-Identified Treatment Effects Across Multiple Environments},\nauthor={Jonas Schweisthal and Dennis Frauen and Mihaela van der Schaar and Stefan Feuerriegel},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s5PLISyNyP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1306444, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5349596939607855734&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "lmu.de;lmu.de;ucla.edu;lmu.de", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;University of California, Los Angeles;Ludwig Maximilian University of Munich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.lmu.de;https://www.ucla.edu;https://www.lmu.de", "aff_unique_abbr": "LMU;UCLA;LMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Munich", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Fundamental Benefit of Alternating Updates in Minimax Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32925", "id": "s6ZAT8MLKU", "proceeding": "https://proceedings.mlr.press/v235/lee24e.html", "pdf": "https://openreview.net/pdf?id=s6ZAT8MLKU", "openreview": "https://openreview.net/forum?id=s6ZAT8MLKU", "author_site": "Jaewook Lee, Hanseul Cho, Chulhee Yun", "tldr": "", "abstract": "The Gradient Descent-Ascent (GDA) algorithm, designed to solve minimax optimization problems, takes the descent and ascent steps either simultaneously (Sim-GDA) or alternately (Alt-GDA). While Alt-GDA is commonly observed to converge faster, the performance gap between the two is not yet well understood theoretically, especially in terms of global convergence rates. To address this theory-practice gap, we present fine-grained convergence analyses of both algorithms for strongly-convex-strongly-concave and Lipschitz-gradient objectives. 
Our new iteration complexity upper bound of Alt-GDA is strictly smaller than the lower bound of Sim-GDA; i.e., Alt-GDA is provably faster. Moreover, we propose Alternating-Extrapolation GDA (Alex-GDA), a general algorithmic framework that subsumes Sim-GDA and Alt-GDA, for which the main idea is to alternately take gradients from extrapolations of the iterates. We show that Alex-GDA satisfies a smaller iteration complexity bound, identical to that of the Extra-gradient method, while requiring fewer gradient computations. We also prove that Alex-GDA enjoys linear convergence for bilinear problems, for which both Sim-GDA and Alt-GDA fail to converge at all.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaewook Lee;Hanseul Cho;Chulhee Yun", "authorids": "~Jaewook_Lee6;~Hanseul_Cho1;~Chulhee_Yun1", "gender": "M;M;M", "homepage": "https://id8198.github.io/;https://hanseuljo.github.io/;https://chulheeyun.github.io/", "dblp": ";233/5755-2;138/0148.html", "google_scholar": "grnMVBEAAAAJ;IczOXwsAAAAJ;Ukl64ggAAAAJ", "orcid": ";0009-0001-0410-0290;", "linkedin": ";hanseul-cho-66b01a260/;", "or_profile": "~Jaewook_Lee6;~Hanseul_Cho1;~Chulhee_Yun1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlee2024fundamental,\ntitle={Fundamental Benefit of Alternating Updates in Minimax Optimization},\nauthor={Jaewook Lee and Hanseul Cho and Chulhee Yun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s6ZAT8MLKU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 858354, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9966382830950092017&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "WARM: On the Benefits of Weight Averaged Reward Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32924", "id": "s7RDnNUJy6", "proceeding": "https://proceedings.mlr.press/v235/rame24a.html", "pdf": "https://openreview.net/pdf?id=s7RDnNUJy6", "openreview": "https://openreview.net/forum?id=s7RDnNUJy6", "author_site": "Alexandre Rame, Nino Vieillard, L\u00e9onard Hussenot, Robert Dadashi, Geoffrey Cideron, Olivier Bachem, Johan Ferret", "tldr": "", "abstract": "Aligning large language models (LLMs) with human preferences through reinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit failures in the reward model (RM) to achieve seemingly high rewards without meeting the underlying objectives. We identify two primary challenges when designing RMs to mitigate reward hacking: distribution shifts during the RL process and inconsistencies in human preferences. As a solution, we propose Weight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then averaging them in the weight space.
This strategy follows the observation that fine-tuned weights remain linearly mode connected when sharing the same pre-training. By averaging weights, WARM improves efficiency compared to the traditional ensembling of predictions, while improving reliability under distribution shifts and robustness to preference inconsistencies. Our experiments on summarization tasks, using best-of-N and RL methods, show that WARM improves the overall quality and alignment of LLM predictions; for example, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy RL fine-tuned with a single RM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexandre Rame;Nino Vieillard;Leonard Hussenot;Robert Dadashi;Geoffrey Cideron;Olivier Bachem;Johan Ferret", "authorids": "~Alexandre_Rame1;~Nino_Vieillard1;~Leonard_Hussenot1;~Robert_Dadashi2;~Geoffrey_Cideron1;~Olivier_Bachem1;~Johan_Ferret1", "gender": "M;;;M;M;M;M", "homepage": "https://alexrame.github.io/;;;;http://www.olivierbachem.ch/;https://ferretj.github.io;", "dblp": ";243/5918;241/9657;;https://dblp.org/pers/hd/b/Bachem:Olivier;;230/3843.html", "google_scholar": "7znwivwAAAAJ;https://scholar.google.fr/citations?user=4jua80IAAAAJ;nTdWO9MAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.ch/citations?user=mW9BcgsAAAAJ;uyUnqjMAAAAJ;RWyPeYYAAAAJ", "orcid": ";;;;;;", "linkedin": "alexandre-ram%C3%A9-05259587;;;;olivier-bachem-10257756/;;", "or_profile": "~Alexandre_Rame1;~Nino_Vieillard1;~Leonard_Hussenot1;~Geoffrey_Cideron1;~Olivier_Bachem1;~Johan_Ferret1;~Robert_Dadashi1", "aff": "Google;Google Deepmind;;Google;Google Brain;Google;", "aff_domain": "google.com;google.com;;google.com;google.com;google.com;", "position": "research scientist;Researcher;;Research Engineer;Research scientist;Researcher;", "bibtex": "@inproceedings{\nrame2024warm,\ntitle={{WARM}: On the Benefits of Weight Averaged Reward Models},\nauthor={Alexandre Rame and Nino Vieillard and Leonard Hussenot and Robert Dadashi and Geoffrey Cideron and Olivier Bachem and Johan Ferret},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s7RDnNUJy6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1690373, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13750861246709676801&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "google.com;google.com;;google.com;google.com;google.com;", "author_num": 7, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google;DeepMind", "aff_unique_url": "https://www.google.com;https://deepmind.com", "aff_unique_abbr": "Google;DeepMind", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Understanding and Diagnosing Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32923", "id": "s9RKqT7jVM", "proceeding": "https://proceedings.mlr.press/v235/korkmaz24a.html", "pdf": "https://openreview.net/pdf?id=s9RKqT7jVM", "openreview": "https://openreview.net/forum?id=s9RKqT7jVM", "tldr": "", "abstract": "Deep neural policies have recently been installed in a diverse range of settings, from biotechnology to automated financial systems.
However, the utilization of deep neural networks to approximate the value function leads to concerns on the decision boundary stability, in particular, with regard to the sensitivity of policy decision making to indiscernible, non-robust features due to highly non-convex and complex deep neural manifolds. These concerns constitute an obstruction to understanding the reasoning made by deep neural policies, and their foundational limitations. Hence, it is crucial to develop techniques that aim to understand the sensitivities in the learnt representations of neural network policies. To achieve this we introduce a theoretically founded method that provides a systematic analysis of the unstable directions in the deep neural policy decision boundary across both time and space. Through experiments in the Arcade Learning Environment (ALE), we demonstrate the effectiveness of our technique for identifying correlated directions of instability, and for measuring how sample shifts remold the set of sensitive directions in the neural policy landscape. Most importantly, we demonstrate that state-of-the-art robust training techniques yield learning of disjoint unstable directions, with dramatically larger oscillations over time, when compared to standard training. We believe our results reveal the fundamental properties of the decision process made by reinforcement learning policies, and can help in constructing reliable and robust deep neural policies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ezgi Korkmaz", "authorids": "~Ezgi_Korkmaz2", "gender": "", "homepage": "https://ezgikorkmaz.github.io/", "dblp": "300/7830.html", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "~Ezgi_Korkmaz2", "aff": "University College London, University of London", "aff_domain": "ucl.ac.uk", "position": "PhD student", "bibtex": "@inproceedings{\nkorkmaz2024understanding,\ntitle={Understanding and Diagnosing Deep Reinforcement Learning},\nauthor={Ezgi Korkmaz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s9RKqT7jVM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5529535, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2332313858356984828&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 6, "email": "ucl.ac.uk", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "University College London", "aff_unique_dep": "", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Candidate Pseudolabel Learning: Enhancing Vision-Language Models by Prompt Tuning with Unlabeled Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32922", "id": "sBJNokmYuV", "proceeding": "https://proceedings.mlr.press/v235/zhang24bo.html", "pdf": "https://openreview.net/pdf?id=sBJNokmYuV", "openreview": "https://openreview.net/forum?id=sBJNokmYuV", "author_site": "Jiahan Zhang, Qi Wei, Feng Liu, Lei Feng", "tldr": "", "abstract": "Fine-tuning vision-language models (VLMs) with abundant unlabeled data recently has attracted increasing attention. Existing methods that resort to the pseudolabeling strategy would suffer from heavily incorrect hard pseudolabels when VLMs exhibit low zero-shot performance in downstream tasks. 
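A toy illustration of the failure mode just described, and of why a small candidate set can help: when the zero-shot model is poorly calibrated, the argmax pseudolabel misses the true class, while a top-k candidate set (in the spirit of the candidate pseudolabels introduced next) still contains it. The confidence values and k below are made up for illustration; the actual intra-/inter-instance selection rule is more refined.

```python
# Hard (argmax) pseudolabels discard the true class whenever it is not ranked
# first; a small top-k candidate set often still contains it. The confidence
# matrix and k are illustrative assumptions, not the paper's exact rule.
import numpy as np

conf = np.array([[0.34, 0.33, 0.23, 0.10],   # true class is 1: argmax is wrong
                 [0.70, 0.15, 0.10, 0.05]])  # true class is 0: argmax is right
hard = conf.argmax(axis=1)                    # array([0, 0])
k = 2
candidates = np.argsort(-conf, axis=1)[:, :k] # [[0, 1], [0, 1]]
print(hard, candidates)  # the candidate set still covers class 1 for row 0
```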
To alleviate this issue, we propose a **C**andidate **P**seudolabel **L**earning method, termed **CPL**, to fine-tune VLMs with suitable candidate pseudolabels of unlabeled data in downstream tasks. The core of our method lies in the generation strategy of candidate pseudolabels, which progressively generates refined candidate pseudolabels by both intra- and inter-instance label selection, based on a confidence score matrix for all unlabeled data. This strategy can result in better performance in true label inclusion and class-balanced instance selection. In this way, we can directly apply existing loss functions to learn with generated candidate pseudolabels. Extensive experiments on nine benchmark datasets with three learning paradigms demonstrate the effectiveness of our method. Our code can be found here.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiahan Zhang;Qi Wei;Feng Liu;Lei Feng", "authorids": "~Jiahan_Zhang1;~Qi_Wei4;~Feng_Liu2;~Lei_Feng1", "gender": ";M;M;M", "homepage": ";http://weiq0010.top;https://fengliu90.github.io/index.html;https://lfeng1995.github.io/", "dblp": ";43/2782-4.html;77/1318-3;76/847-6", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=en;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ", "orcid": ";;0000-0002-5005-9129;0000-0003-2839-5799", "linkedin": ";;alexfengliu;", "or_profile": "~Jiahan_Zhang1;~Qi_Wei4;~Feng_Liu2;~Lei_Feng1", "aff": ";Nanyang Technological University;University of Melbourne;Singapore University of Technology and Design", "aff_domain": ";ntu.edu.sg;unimelb.edu.au;sutd.edu.sg", "position": ";PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024candidate,\ntitle={Candidate Pseudolabel Learning: Enhancing Vision-Language Models by Prompt Tuning with Unlabeled Data},\nauthor={Jiahan Zhang and Qi Wei and Feng Liu and Lei Feng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sBJNokmYuV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2007389, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13370731097677354084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";ntu.edu.sg;unimelb.edu.au;sutd.edu.sg", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Nanyang Technological University;University of Melbourne;Singapore University of Technology and Design", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.unimelb.edu.au;https://www.sutd.edu.sg", "aff_unique_abbr": "NTU;UniMelb;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;Australia" }, { "title": "Compressing Large Language Models by Joint Sparsification and Quantization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32921", "id": "sCGRhnuMUJ", "proceeding": "https://proceedings.mlr.press/v235/guo24g.html", "pdf": "https://openreview.net/pdf?id=sCGRhnuMUJ", "openreview": "https://openreview.net/forum?id=sCGRhnuMUJ", "author_site": "Jinyang Guo, Jianyu Wu, Zining Wang, Jiaheng Liu, Ge Yang, Yifu Ding, Ruihao Gong, Haotong Qin, Xianglong Liu", "tldr": "", "abstract": "In this paper, we introduce a novel model compression technique named Joint Sparsification and Quantization (JSQ), explicitly 
tailored for large language models (LLMs). Traditional methods employ either sparsification or quantization individually to compress LLMs, leading to performance degradation at high compression ratios. In contrast, our JSQ approach integrates sparsification and quantization cohesively. As sparsification tends to preserve outliers that are harmful to quantization, we introduce a novel sparsity metric that serves as a bridge between sparsification and quantization. Moreover, outliers in LLMs are known to have a significant impact on model quality while being harmful to compression. Current solutions are tightly coupled with the quantization process, which does not help sparsification. To this end, we also introduce a search-based activation editor to automatically eliminate relatively useless outliers. Comprehensive experiments across various datasets and architectures affirm the efficacy of our JSQ framework. Notably, our JSQ achieves 7.96$\\times$ computation reduction without crashing for the representative model LLaMA. This accomplishment stands in stark contrast to the limitations of most state-of-the-art LLM compression methods, which typically fail under such extreme compression ratios. Our code is released at https://github.com/uanu2002/JSQ.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinyang Guo;Jianyu Wu;Zining Wang;Jiaheng Liu;Ge Yang;Yifu Ding;Ruihao Gong;Haotong Qin;Xianglong Liu", "authorids": "~Jinyang_Guo1;~Jianyu_Wu2;~Zining_Wang3;~Jiaheng_Liu1;~Ge_Yang5;~Yifu_Ding2;~Ruihao_Gong1;~Haotong_Qin1;~Xianglong_Liu3", "gender": "M;M;M;M;M;F;M;M;", "homepage": "https://jinyangguo.github.io/;https://uanu2002.github.io/;;https://liujiaheng.github.io/;;https://yifu-ding.github.io/;https://xhplus.github.io;https://htqin.github.io/;", "dblp": ";;;225/1962;;;247/1172;262/3626.html;", "google_scholar": "uJGeT1AAAAAJ;WmA0pzkAAAAJ;;yFI_RjUAAAAJ;OQ3u-S4AAAAJ;RCEI1r0AAAAJ;8i7Z15kAAAAJ;mK6n-KgAAAAJ;", "orcid": ";0009-0006-9173-1766;0000-0002-1259-5377;;0009-0001-0635-0197;0000-0002-3612-8757;0000-0002-6024-7086;;", "linkedin": ";;;;;yifu-ding-253614186/;;;", "or_profile": "~Jinyang_Guo1;~Jianyu_Wu2;~Zining_Wang3;~Jiaheng_Liu1;~Ge_Yang5;~Yifu_Ding2;~Ruihao_Gong1;~Haotong_Qin1;~Xianglong_Liu3", "aff": "Beihang University;Beihang University;Beihang University;Alibaba Group;Beihang University;Nanyang Technological University;SenseTime;ETHZ - ETH Zurich;", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;alibaba-inc.com;buaa.edu.cn;ntu.edu.sg;sensetime.com;ethz.ch;", "position": "Assistant Professor;Undergrad student;MS student;Researcher;MS student;PhD student;Principal Researcher;Postdoc;", "bibtex": "@inproceedings{\nguo2024compressing,\ntitle={Compressing Large Language Models by Joint Sparsification and Quantization},\nauthor={Jinyang Guo and Jianyu Wu and Zining Wang and Jiaheng Liu and Ge Yang and Yifu Ding and Ruihao Gong and Haotong Qin and Xianglong Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sCGRhnuMUJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11508644646752279783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;alibaba-inc.com;buaa.edu.cn;ntu.edu.sg;sensetime.com;ethz.ch;", "author_num": 9, "aff_unique_index": "0;0;0;1;0;2;3;4", "aff_unique_norm": "Beihang University;Alibaba Group;Nanyang Technological University;SenseTime;ETH Zurich", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.alibaba.com;https://www.ntu.edu.sg;https://www.sensetime.com;https://www.ethz.ch", "aff_unique_abbr": "BUAA;Alibaba;NTU;SenseTime;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0;2", "aff_country_unique": "China;Singapore;Switzerland" }, { "title": "LASER: Linear Compression in Wireless Distributed Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32920", "id": "sDjszMb2Ir", "proceeding": "https://proceedings.mlr.press/v235/makkuva24a.html", "pdf": "https://openreview.net/pdf?id=sDjszMb2Ir", "openreview": "https://openreview.net/forum?id=sDjszMb2Ir", "author_site": "Ashok Vardhan Makkuva, Marco Bondaschi, Thijs Vogels, Martin Jaggi, Hyeji Kim, Michael Gastpar", "tldr": "", "abstract": "Data-parallel SGD is the de facto algorithm for distributed optimization, especially for large scale machine learning. Despite its merits, the communication bottleneck is one of its persistent issues. Most compression schemes to alleviate this either assume noiseless communication links, or fail to achieve good performance on practical tasks. In this paper, we close this gap and introduce **LASER**: **L**ine**A**r Compre**S**sion in Wir**E**less Dist**R**ibuted Optimization. LASER capitalizes on the inherent low-rank structure of gradients and transmits them efficiently over noisy channels. Whilst enjoying theoretical guarantees similar to those of the classical SGD, LASER shows consistent gains over baselines on a variety of practical benchmarks. In particular, it outperforms the state-of-the-art compression schemes on challenging computer vision and GPT language modeling tasks. 
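A minimal sketch of the low-rank compression LASER builds on: factor a (nearly) low-rank gradient matrix with a truncated SVD and transmit the two thin factors instead of the full matrix. The rank and shapes are illustrative assumptions, and the noiseless reconstruction below omits LASER's actual handling of the wireless channel.

```python
# Rank-r gradient compression via truncated SVD: send the two thin factors
# (r*(512+512) numbers) instead of the full 512*512 gradient. The exactly
# rank-8 G and the noiseless channel are simplifying assumptions.
import numpy as np

rng = np.random.default_rng(0)
G = rng.standard_normal((512, 8)) @ rng.standard_normal((8, 512))  # low-rank "gradient"

U, s, Vt = np.linalg.svd(G, full_matrices=False)
r = 8
P, Q = U[:, :r] * s[:r], Vt[:r, :]   # thin factors actually transmitted
G_hat = P @ Q                        # receiver-side reconstruction
print(np.linalg.norm(G - G_hat) / np.linalg.norm(G))  # ~1e-15 for rank-8 G
```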
On the latter, we obtain 50-64% improvement in perplexity over our baselines for noisy channels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ashok Vardhan Makkuva;Marco Bondaschi;Thijs Vogels;Martin Jaggi;Hyeji Kim;Michael Gastpar", "authorids": "~Ashok_Vardhan_Makkuva1;~Marco_Bondaschi1;~Thijs_Vogels1;~Martin_Jaggi1;~Hyeji_Kim1;~Michael_Gastpar1", "gender": ";M;M;M;;", "homepage": ";;https://thijs.link;https://mlo.epfl.ch;;https://people.epfl.ch/michael.gastpar", "dblp": ";255/4933;https://dblp.uni-trier.de/pid/169/7392;17/4402;;", "google_scholar": ";;KKQCt30AAAAJ;https://scholar.google.ch/citations?user=r1TJBr8AAAAJ;;https://scholar.google.ch/citations?user=IQ3hcw4AAAAJ", "orcid": ";0000-0002-4158-2487;0000-0002-5884-4842;0000-0003-1579-5558;;0000-0002-5499-5336", "linkedin": ";;;;;", "or_profile": "~Ashok_Vardhan_Makkuva1;~Marco_Bondaschi1;~Thijs_Vogels1;~Martin_Jaggi1;~Hyeji_Kim1;~Michael_Gastpar1", "aff": ";EPFL - EPF Lausanne;Swiss Federal Institute of Technology Lausanne;EPFL;;School of Computer and Communication Sciences, EPFL - EPF Lausanne", "aff_domain": ";epfl.ch;epfl.ch;epfl.ch;;ic.epfl.ch", "position": ";PhD student;PhD student;Associate Professor;;Full Professor", "bibtex": "@inproceedings{\nmakkuva2024laser,\ntitle={{LASER}: Linear Compression in Wireless Distributed Optimization},\nauthor={Ashok Vardhan Makkuva and Marco Bondaschi and Thijs Vogels and Martin Jaggi and Hyeji Kim and Michael Gastpar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sDjszMb2Ir}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 588850, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12759355906006547739&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";epfl.ch;epfl.ch;epfl.ch;;ic.epfl.ch", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "EPFL;Swiss Federal Institute of Technology Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Vectorized Conditional Neural Fields: A Framework for Solving Time-dependent Parametric Partial Differential Equations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32919", "id": "sF9epWkNUG", "proceeding": "https://proceedings.mlr.press/v235/hagnberger24a.html", "pdf": "https://openreview.net/pdf?id=sF9epWkNUG", "openreview": "https://openreview.net/forum?id=sF9epWkNUG", "author_site": "Jan Hagnberger, Marimuthu Kalimuthu, Daniel Musekamp, Mathias Niepert", "tldr": "", "abstract": "Transformer models are increasingly used for solving Partial Differential Equations (PDEs). Several adaptations have been proposed, all of which suffer from the typical problems of Transformers, such as quadratic memory and time complexity. Furthermore, all prevalent architectures for PDE solving lack at least one of several desirable properties of an ideal surrogate model, such as (i) generalization to PDE parameters not seen during training, (ii) spatial and temporal zero-shot super-resolution, (iii) continuous temporal extrapolation, (iv) support for 1D, 2D, and 3D PDEs, and (v) efficient inference for longer temporal rollouts. 
To address these limitations, we propose *Vectorized Conditional Neural Fields* (VCNeFs), which represent the solution of time-dependent PDEs as neural fields. Contrary to prior methods, however, VCNeFs compute, for a set of multiple spatio-temporal query points, their solutions in parallel and model their dependencies through attention mechanisms. Moreover, VCNeF can condition the neural field on both the initial conditions and the parameters of the PDEs. An extensive set of experiments demonstrates that VCNeFs are competitive with and often outperform existing ML-based surrogate models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan Hagnberger;Marimuthu Kalimuthu;Daniel Musekamp;Mathias Niepert", "authorids": "~Jan_Hagnberger1;~Marimuthu_Kalimuthu1;~Daniel_Musekamp1;~Mathias_Niepert1", "gender": "M;M;M;M", "homepage": "https://jhagnberger.github.io;;https://www.ki.uni-stuttgart.de/institute/team/Musekamp-00001/;http://www.matlog.net", "dblp": "379/6122;245/7576;379/6030;n/MathiasNiepert", "google_scholar": "https://scholar.google.de/citations?user=_BjB8hwAAAAJ;;pAsBQeQAAAAJ;https://scholar.google.de/citations?user=p5vLzq0AAAAJ", "orcid": ";;;", "linkedin": "jan-hagnberger/;;;", "or_profile": "~Jan_Hagnberger1;~Marimuthu_Kalimuthu1;~Daniel_Musekamp1;~Mathias_Niepert1", "aff": "University of Stuttgart;University of Stuttgart, Stuttgart Vaihingen Campus, Germany;Universit\u00e4t Stuttgart;NEC", "aff_domain": "uni-stuttgart.de;uni-stuttgart.de;uni-stuttgart.de;neclab.eu", "position": "MS student;PhD student;PhD student;Research Scientist", "bibtex": "@inproceedings{\nhagnberger2024vectorized,\ntitle={Vectorized Conditional Neural Fields: A Framework for Solving Time-dependent Parametric Partial Differential Equations},\nauthor={Jan Hagnberger and Marimuthu Kalimuthu and Daniel Musekamp and Mathias Niepert},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sF9epWkNUG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3503530, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1991113001793529671&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uni-stuttgart.de;uni-stuttgart.de;uni-stuttgart.de;neclab.eu", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Stuttgart;NEC Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-stuttgart.de;https://www.nec.com", "aff_unique_abbr": "USTuttgart;NEC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stuttgart Vaihingen", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Germany;Japan" }, { "title": "Tell, Don't Show: Language Guidance Eases Transfer Across Domains in Images and Videos", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32918", "id": "sFN49CfklF", "proceeding": "https://proceedings.mlr.press/v235/kalluri24a.html", "pdf": "https://openreview.net/pdf?id=sFN49CfklF", "openreview": "https://openreview.net/forum?id=sFN49CfklF", "author_site": "Tarun Kalluri, Bodhisattwa Prasad Majumder, Manmohan Chandraker", "tldr": "", "abstract": "We introduce LaGTran, a novel framework that utilizes text supervision to guide robust transfer of discriminative knowledge from labeled source to unlabeled target data with domain gaps. 
While unsupervised adaptation methods have been established to address this problem, they show limitations in handling challenging domain shifts due to their exclusive operation within the pixel space. Motivated by our observation that the semantically richer text modality has more favorable transfer properties, we devise a transfer mechanism to use a source-trained text-classifier to generate predictions on the target text descriptions, and utilize these predictions as supervision for the corresponding images. Our language-guided approach is surprisingly simple, yet it significantly outperforms all prior approaches on challenging datasets like GeoNet and DomainNet, validating its effectiveness. To further extend the scope of our study beyond images, we introduce a new benchmark called Ego2Exo to study ego-exo transfer in videos and find that our language-aided approach LaGTran yields significant gains in this highly challenging and non-trivial transfer setting. Code, models, and proposed datasets are publicly available at https://tarun005.github.io/lagtran/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tarun Kalluri;Bodhisattwa Prasad Majumder;Manmohan Chandraker", "authorids": "~Tarun_Kalluri1;~Bodhisattwa_Prasad_Majumder1;~Manmohan_Chandraker3", "gender": "M;;M", "homepage": "https://tarun005.github.io/;https://www.majumderb.com/;http://cseweb.ucsd.edu/~mkchandraker/", "dblp": "167/4104;138/6177;79/589", "google_scholar": "https://scholar.google.co.in/citations?user=AeraUlMAAAAJ;cEM1a5gAAAAJ;oPFCNk4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tarun_Kalluri1;~Bodhisattwa_Prasad_Majumder1;~Manmohan_Chandraker2", "aff": "University of California, San Diego, University of California, San Diego;Allen Institute for Artificial Intelligence;University of California, San Diego", "aff_domain": "eng.ucsd.edu;allenai.org;ucsd.edu", "position": "PhD student;Researcher;Full Professor", "bibtex": "@inproceedings{\nkalluri2024tell,\ntitle={Tell, Don't Show: Language Guidance Eases Transfer Across Domains in Images and Videos},\nauthor={Tarun Kalluri and Bodhisattwa Prasad Majumder and Manmohan Chandraker},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sFN49CfklF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7136559, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2868114330493762514&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 5, "email": "eng.ucsd.edu;allenai.org;ucsd.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, San Diego;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://allenai.org", "aff_unique_abbr": "UCSD;AI2", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Language-Driven Cross-Modal Classifier for Zero-Shot Multi-Label Image Recognition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32917", "id": "sHswzNWUW2", "proceeding": "https://proceedings.mlr.press/v235/liu24bq.html", "pdf": "https://openreview.net/pdf?id=sHswzNWUW2", "openreview": "https://openreview.net/forum?id=sHswzNWUW2", "author_site": "Yicheng Liu, Jie Wen, Chengliang Liu, xiaozhao fang, Zuoyong Li, Yong 
Xu, Zheng Zhang", "tldr": "", "abstract": "Large-scale pre-trained vision-language models (e.g., CLIP) have shown powerful zero-shot transfer capabilities in image recognition tasks. Recent approaches typically employ supervised fine-tuning methods to adapt CLIP for zero-shot multi-label image recognition tasks. However, obtaining sufficient multi-label annotated image data for training is challenging and not scalable. In this paper, we propose a new language-driven framework for zero-shot multi-label recognition that eliminates the need for annotated images during training. Leveraging the aligned CLIP multi-modal embedding space, our method utilizes language data generated by LLMs to train a cross-modal classifier, which is subsequently transferred to the visual modality. During inference, directly applying the classifier to visual inputs may limit performance due to the modality gap. To address this issue, we introduce a cross-modal mapping method that maps image embeddings to the language modality while retaining crucial visual information. Comprehensive experiments demonstrate that our method outperforms other zero-shot multi-label recognition methods and achieves competitive results compared to few-shot methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yicheng Liu;Jie Wen;Chengliang Liu;Xiaozhao Fang;Zuoyong Li;Yong Xu;Zheng Zhang", "authorids": "~Yicheng_Liu6;~Jie_Wen1;~Chengliang_Liu1;~Xiaozhao_Fang1;~Zuoyong_Li1;~Yong_Xu9;~Zheng_Zhang18", "gender": ";;;M;;M;", "homepage": ";;;;;https://www.yongxu.org;", "dblp": ";;;140/6459.html;;;", "google_scholar": ";;;;;https://scholar.google.com.hk/citations?user=zOVgYQYAAAAJ;", "orcid": ";;;0000-0001-8440-1765;;;", "linkedin": ";;;;;;", "or_profile": "~Yicheng_Liu6;~Jie_Wen1;~Chengliang_Liu1;~Xiaozhao_Fang1;~Zuoyong_Li1;~Yong_Xu9;~Zheng_Zhang18", "aff": ";;;Guangdong University of Technology;;Harbin Institute of Technology;", "aff_domain": ";;;gdut.edu.cn;;hit.edu.cn;", "position": ";;;Full Professor;;Full Professor;", "bibtex": "@inproceedings{\nliu2024languagedriven,\ntitle={Language-Driven Cross-Modal Classifier for Zero-Shot Multi-Label Image Recognition},\nauthor={Yicheng Liu and Jie Wen and Chengliang Liu and Xiaozhao Fang and Zuoyong Li and Yong Xu and Zheng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sHswzNWUW2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7605486, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9523546567517048568&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": ";;;gdut.edu.cn;;hit.edu.cn;", "author_num": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Guangdong University of Technology;Harbin Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.gdut.edu.cn;http://www.hit.edu.cn/", "aff_unique_abbr": "GDUT;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Large Language Models are Geographically Biased", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32916", "id": "sHtIStlg0v", "proceeding": "https://proceedings.mlr.press/v235/manvi24a.html", "pdf": "https://openreview.net/pdf?id=sHtIStlg0v", "openreview": "https://openreview.net/forum?id=sHtIStlg0v", "author_site": "Rohin Manvi, Samar Khanna, Marshall 
Burke, David Lobell, Stefano Ermon", "tldr": "", "abstract": "Large Language Models (LLMs) inherently carry the biases contained in their training corpora, which can lead to the perpetuation of societal harm. As the impact of these foundation models grows, understanding and evaluating their biases becomes crucial to achieving fairness and accuracy. We propose to study what LLMs know about the world we live in through the lens of geography. This approach is particularly powerful as there is ground truth for the numerous aspects of human life that are meaningfully projected onto geographic space such as culture, race, language, politics, and religion. We show various problematic geographic biases, which we define as systemic errors in geospatial predictions. Initially, we demonstrate that LLMs are capable of making accurate zero-shot geospatial predictions in the form of ratings that show strong monotonic correlation with ground truth (Spearman's $\\rho$ of up to 0.89). We then show that LLMs exhibit common biases across a range of objective and subjective topics. In particular, LLMs are clearly biased against locations with lower socioeconomic conditions (e.g. most of Africa) on a variety of sensitive subjective topics such as attractiveness, morality, and intelligence (Spearman\u2019s $\\rho$ of up to 0.70). Finally, we introduce a bias score to quantify this and find that there is significant variation in the magnitude of bias across existing LLMs. Code is available on the project website: https://rohinmanvi.github.io/GeoLLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rohin Manvi;Samar Khanna;Marshall Burke;David B. Lobell;Stefano Ermon", "authorids": "~Rohin_Manvi1;~Samar_Khanna1;~Marshall_Burke1;~David_B._Lobell1;~Stefano_Ermon1", "gender": "M;;;;M", "homepage": ";https://samar-khanna.github.io/;http://web.stanford.edu/~mburke/;;http://cs.stanford.edu/~ermon/", "dblp": ";;;00/11322;47/8135", "google_scholar": ";DPHEQsMAAAAJ;ppx71KUAAAAJ;;", "orcid": ";;;;", "linkedin": "rohin-manvi-2a9226187/;samar-khanna-133b8190/;;;", "or_profile": "~Rohin_Manvi1;~Samar_Khanna1;~Marshall_Burke1;~David_B._Lobell1;~Stefano_Ermon1", "aff": "Stanford University;Computer Science Department, Stanford University;;Stanford University;Stanford University", "aff_domain": "stanford.edu;cs.stanford.edu;;stanford.edu;stanford.edu", "position": "Undergrad student;Researcher;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nmanvi2024large,\ntitle={Large Language Models are Geographically Biased},\nauthor={Rohin Manvi and Samar Khanna and Marshall Burke and David B. 
Lobell and Stefano Ermon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sHtIStlg0v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8948119, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7560356487739533888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "stanford.edu;cs.stanford.edu;;stanford.edu;stanford.edu", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sample Complexity Bounds for Estimating Probability Divergences under Invariances", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32915", "id": "sKjcrAC4eZ", "proceeding": "https://proceedings.mlr.press/v235/tahmasebi24a.html", "pdf": "https://openreview.net/pdf?id=sKjcrAC4eZ", "openreview": "https://openreview.net/forum?id=sKjcrAC4eZ", "author_site": "Behrooz Tahmasebi, Stefanie Jegelka", "tldr": "", "abstract": "Group-invariant probability distributions appear in many data-generative models in machine learning, such as graphs, point clouds, and images. In practice, one often needs to estimate divergences between such distributions. In this work, we study how the inherent invariances, with respect to any smooth action of a Lie group on a manifold, improve sample complexity when estimating the 1-Wasserstein distance, the Sobolev Integral Probability Metrics (Sobolev IPMs), the Maximum Mean Discrepancy (MMD), and also the complexity of the density estimation problem (in the $L^2$ and $L^\\infty$ distance). Our results indicate a two-fold gain: (1) reducing the sample complexity by a multiplicative factor corresponding to the group size (for finite groups) or the normalized volume of the quotient space (for groups of positive dimension); (2) improving the exponent in the convergence rate (for groups of positive dimension). 
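To make gain (1) concrete for the 1-Wasserstein case, here is a schematic reading for a finite group $G$ acting on a $d$-dimensional manifold (constants, regularity conditions, and small-$d$ edge cases omitted; this paraphrases the abstract rather than restating the paper's theorem):

```latex
\[
  \underbrace{\,W_1(\mu,\hat{\mu}_n) \;\lesssim\; n^{-1/d}\,}_{\text{generic distribution}}
  \qquad\longrightarrow\qquad
  \underbrace{\,W_1(\mu,\hat{\mu}_n) \;\lesssim\; (|G|\, n)^{-1/d}\,}_{G\text{-invariant distribution}}
\]
```

That is, invariance acts as if the sample size were multiplied by the group size $|G|$, the "multiplicative factor corresponding to the group size" described above; for groups of positive dimension the abstract additionally reports an improved exponent.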
These results are completely new for groups of positive dimension and extend recent bounds for finite group actions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Behrooz Tahmasebi;Stefanie Jegelka", "authorids": "~Behrooz_Tahmasebi1;~Stefanie_Jegelka3", "gender": "M;F", "homepage": "https://people.csail.mit.edu/bzt/;http://people.csail.mit.edu/stefje/", "dblp": "223/0884;38/7003", "google_scholar": "ZXCO3DMAAAAJ;gTWUZlsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Behrooz_Tahmasebi1;~Stefanie_Jegelka3", "aff": "Microsoft Research ;Massachusetts Institute of Technology", "aff_domain": "microsoft.com;mit.edu", "position": "Intern;Associate Professor", "bibtex": "@inproceedings{\ntahmasebi2024sample,\ntitle={Sample Complexity Bounds for Estimating Probability Divergences under Invariances},\nauthor={Behrooz Tahmasebi and Stefanie Jegelka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sKjcrAC4eZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1120212, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=802210843913359130&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "microsoft.com;mit.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Massachusetts Institute of Technology", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://web.mit.edu", "aff_unique_abbr": "MSR;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CaRiNG: Learning Temporal Causal Representation under Non-Invertible Generation Process", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32914", "id": "sLZzFTMWSt", "proceeding": "https://proceedings.mlr.press/v235/chen24ai.html", "pdf": "https://openreview.net/pdf?id=sLZzFTMWSt", "openreview": "https://openreview.net/forum?id=sLZzFTMWSt", "author_site": "Guangyi Chen, Yifan Shen, Zhenhao Chen, Xiangchen Song, Yuewen Sun, Weiran Yao, Xiao Liu, Kun Zhang", "tldr": "", "abstract": "Identifying the underlying time-delayed latent causal processes in sequential data is vital for grasping temporal dynamics and making downstream reasoning. While some recent methods can robustly identify these latent causal variables, they rely on strict assumptions about the invertible generation process from latent variables to observed data. However, these assumptions are often hard to satisfy in real-world applications containing information loss. For instance, the visual perception process translates a 3D space into 2D images, or the phenomenon of persistence of vision incorporates historical data into current perceptions. To address this challenge, we establish an identifiability theory that allows for the recovery of independent latent components even when they come from a nonlinear and non-invertible mix. Using this theory as a foundation, we propose a principled approach, CaRiNG, to learn the Causal Representation of Non-invertible Generative temporal data with identifiability guarantees. Specifically, we utilize temporal context to recover lost latent information and apply the conditions in our theory to guide the training process. 
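A minimal sketch of the "temporal context" idea, assuming a toy non-invertible observation (a projection that drops one latent coordinate): a single frame cannot determine the latent, but an encoder over a short window of frames can, in principle, recover it. The MLP encoder and window length below are illustrative assumptions, not CaRiNG's actual architecture or training objective.

```python
# A single frame x_t = g(z_t) is non-invertible (here g drops a latent
# coordinate), but an encoder over [x_{t-L}, ..., x_t] can use temporal
# context to estimate z_t. Encoder shape and L are illustrative assumptions.
import torch
import torch.nn as nn

L, z_dim, x_dim = 4, 3, 2  # observation keeps only 2 of 3 latent coordinates

class ContextEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear((L + 1) * x_dim, 64), nn.ReLU(),
                                 nn.Linear(64, z_dim))
    def forward(self, x_window):            # x_window: (batch, L+1, x_dim)
        return self.net(x_window.flatten(1))

enc = ContextEncoder()
z_hat = enc(torch.randn(8, L + 1, x_dim))   # (8, z_dim) latent estimates
```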
Through experiments conducted on synthetic datasets, we validate that our CaRiNG method reliably identifies the causal process, even when the generation process is non-invertible. Moreover, we demonstrate that our approach considerably improves temporal understanding and reasoning in practical applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guangyi Chen;Yifan Shen;Zhenhao Chen;Xiangchen Song;Yuewen Sun;Weiran Yao;Xiao Liu;Kun Zhang", "authorids": "~Guangyi_Chen1;~Yifan_Shen4;~Zhenhao_Chen1;~Xiangchen_Song1;~Yuewen_Sun1;~Weiran_Yao1;~Xiao_Liu23;~Kun_Zhang1", "gender": "M;M;M;M;F;M;M;M", "homepage": "https://chengy12.github.io/;https://sanshuiii.github.io/about/;https://zhenhaochenofficial.github.io/;https://xiangchensong.github.io/;https://yuewen-sun.github.io/;;;http://www.andrew.cmu.edu/user/kunz1/", "dblp": "c/GuangyiChen-2;59/7950-4;192/7717;261/9024;219/9893;192/3295;;96/3115-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;PMKkElwAAAAJ;xOAtM0YAAAAJ;foR8BIoAAAAJ;https://scholar.google.com/citations?hl=en;rr_leUAAAAAJ;;RGoypN4AAAAJ", "orcid": ";0000-0003-2358-1146;;;;;;", "linkedin": ";;;;;;xiao-l-699069205/;", "or_profile": "~Guangyi_Chen1;~Yifan_Shen4;~Zhenhao_Chen1;~Xiangchen_Song1;~Yuewen_Sun1;~Weiran_Yao1;~Xiao_Liu23;~Kun_Zhang1", "aff": "Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Mohamed bin Zayed University of Artificial Intelligence;Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;SalesForce.com;Technische Universit\u00e4t Darmstadt;Carnegie Mellon University", "aff_domain": "cmu.edu;mbzuai.ac.ae;mbzuai.ac.ae;cmu.edu;mbzuai.ac.ae;salesforce.com;tu-darmstadt.de;cmu.edu", "position": "Postdoc;MS student;PhD student;PhD student;Postdoc;Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nchen2024caring,\ntitle={CaRi{NG}: Learning Temporal Causal Representation under Non-Invertible Generation Process},\nauthor={Guangyi Chen and Yifan Shen and Zhenhao Chen and Xiangchen Song and Yuewen Sun and Weiran Yao and Xiao Liu and Kun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sLZzFTMWSt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4051910, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1324709897311286294&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "cmu.edu;mbzuai.ac.ae;mbzuai.ac.ae;cmu.edu;mbzuai.ac.ae;salesforce.com;tu-darmstadt.de;cmu.edu", "author_num": 8, "aff_unique_index": "0;1;1;0;1;2;3;0", "aff_unique_norm": "Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Salesforce;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cmu.edu;https://mbzuai.ac.ae;https://www.salesforce.com;https://www.tu-darmstadt.de", "aff_unique_abbr": "CMU;MBZUAI;Salesforce;TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1;0;2;0", "aff_country_unique": "United States;United Arab Emirates;Germany" }, { "title": "Stochastic Quantum Sampling for Non-Logconcave Distributions and Estimating Partition Functions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32913", "id": "sNjxqSnXFO", "proceeding": "https://proceedings.mlr.press/v235/ozgul24a.html", "pdf": 
"https://openreview.net/pdf?id=sNjxqSnXFO", "openreview": "https://openreview.net/forum?id=sNjxqSnXFO", "author_site": "Guneykan Ozgul, Xiantao Li, Mehrdad Mahdavi, Chunhao Wang", "tldr": "", "abstract": "We present quantum algorithms for sampling from possibly non-logconcave probability distributions expressed as $\\pi(x) \\propto \\exp(-\\beta f(x))$ as well as quantum algorithms for estimating the partition function for such distributions. We also incorporate a stochastic gradient oracle that implements the quantum walk operators inexactly by only using mini-batch gradients when $f$ can be written as a finite sum. One challenge of quantizing the resulting Markov chains is that they do not satisfy the detailed balance condition in general. Consequently, the mixing time of the algorithm cannot be expressed in terms of the spectral gap of the transition density matrix, making the quantum algorithms nontrivial to analyze. We overcame these challenges by first building a reference reversible Markov chain that converges to the target distribution, then controlling the discrepancy between our algorithm's output and the target distribution by using the reference Markov chain as a bridge to establish the total complexity. Our quantum algorithms exhibit polynomial speedups in terms of dimension or precision dependencies when compared to best-known classical algorithms under similar assumptions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guneykan Ozgul;Xiantao Li;Mehrdad Mahdavi;Chunhao Wang", "authorids": "~Guneykan_Ozgul1;~Xiantao_Li1;~Mehrdad_Mahdavi2;~Chunhao_Wang1", "gender": "M;M;M;M", "homepage": "https://guneykan.github.io/;https://xxl12.github.io/main/;http://www.cse.psu.edu/~mzm616/;https://www.chunhaowang.com", "dblp": ";;88/4321;", "google_scholar": "SqBr5pYAAAAJ;2U8gtbEAAAAJ;HzxnwocAAAAJ;", "orcid": ";0000-0002-9760-7292;;", "linkedin": ";;;", "or_profile": "~Guneykan_Ozgul1;~Xiantao_Li1;~Mehrdad_Mahdavi2;~Chunhao_Wang1", "aff": "Pennsylvania State University;Pennsylvania State University;Toyota Technological Institute at Chicago;Pennsylvania State University", "aff_domain": "psu.edu;psu.edu;ttic.edu;psu.edu", "position": "PhD student;Full Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nozgul2024stochastic,\ntitle={Stochastic Quantum Sampling for Non-Logconcave Distributions and Estimating Partition Functions},\nauthor={Guneykan Ozgul and Xiantao Li and Mehrdad Mahdavi and Chunhao Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sNjxqSnXFO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 510505, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3900593262588899270&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "psu.edu;psu.edu;ttic.edu;psu.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Pennsylvania State University;Toyota Technological Institute at Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.tti-chicago.org", "aff_unique_abbr": "PSU;TTI Chicago", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Optimal Eye Surgeon: Finding image priors through sparse generators at initialization", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/32912", "id": "sO5qtpvsUZ", "proceeding": "https://proceedings.mlr.press/v235/ghosh24c.html", "pdf": "https://openreview.net/pdf?id=sO5qtpvsUZ", "openreview": "https://openreview.net/forum?id=sO5qtpvsUZ", "author_site": "Avrajit Ghosh, Xitong Zhang, Kenneth Sun, Qing Qu, Saiprasad Ravishankar, Rongrong Wang", "tldr": "", "abstract": "We introduce Optimal Eye Surgeon (OES), a framework for pruning and training deep image generator networks. Typically, untrained deep convolutional networks, which include image sampling operations, serve as effective image priors. However, they tend to overfit to noise in image restoration tasks due to being overparameterized. OES addresses this by adaptively pruning networks at random initialization to a level of underparameterization. This process effectively captures low-frequency image components even without training, by just masking. When trained to fit noisy image, these pruned subnetworks, which we term Sparse-DIP, resist overfitting to noise. This benefit arises from underparameterization and the regularization effect of masking, constraining them in the manifold of image priors. We demonstrate that subnetworks pruned through OES surpass other leading pruning methods, such as the Lottery Ticket Hypothesis, which is known to be suboptimal for image recovery tasks. Our extensive experiments demonstrate the transferability of OES-masks and the characteristics of sparse-subnetworks for image generation. Code is available at https://github.com/Avra98/Optimal-Eye-Surgeon.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Avrajit Ghosh;Xitong Zhang;Kenneth K. Sun;Qing Qu;Saiprasad Ravishankar;Rongrong Wang", "authorids": "~Avrajit_Ghosh1;~Xitong_Zhang1;~Kenneth_K._Sun1;~Qing_Qu2;~Saiprasad_Ravishankar1;~Rongrong_Wang1", "gender": "M;M;M;M;;", "homepage": "https://sites.google.com/view/avrajitghosh;;https://kennethsun.net;https://qingqu.engin.umich.edu/;;https://users.math.msu.edu/users/wangron6/", "dblp": "261/2812;156/9687;;127/6874-1;;", "google_scholar": "Q44Z8hwAAAAJ;Ci9svAcAAAAJ;;JfblW3MAAAAJ;;", "orcid": ";;;0000-0001-9136-558X;;", "linkedin": ";xitong-zhang-70118915a/;;qing-q-1a0b9746/;;", "or_profile": "~Avrajit_Ghosh1;~Xitong_Zhang1;~Kenneth_K._Sun1;~Qing_Qu2;~Saiprasad_Ravishankar1;~Rongrong_Wang1", "aff": "RIKEN;Michigan State University;University of Michigan - Ann Arbor;University of Michigan;;Michigan State University", "aff_domain": "riken.jp;msu.edu;umich.edu;umich.edu;;msu.edu", "position": "Intern;PhD student;Undergrad student;Assistant Professor;;Associate Professor", "bibtex": "@inproceedings{\nghosh2024optimal,\ntitle={Optimal Eye Surgeon: Finding image priors through sparse generators at initialization},\nauthor={Avrajit Ghosh and Xitong Zhang and Kenneth K. 
Sun and Qing Qu and Saiprasad Ravishankar and Rongrong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sO5qtpvsUZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5564646, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9487392066902433891&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "riken.jp;msu.edu;umich.edu;umich.edu;;msu.edu", "author_num": 6, "aff_unique_index": "0;1;2;2;1", "aff_unique_norm": "RIKEN;Michigan State University;University of Michigan", "aff_unique_dep": ";;", "aff_unique_url": "https://www.riken.jp;https://www.msu.edu;https://www.umich.edu", "aff_unique_abbr": "RIKEN;MSU;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Japan;United States" }, { "title": "PAC-Bayesian Generalization Bounds for Knowledge Graph Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32911", "id": "sOyJSNUrzQ", "proceeding": "https://proceedings.mlr.press/v235/lee24i.html", "pdf": "https://openreview.net/pdf?id=sOyJSNUrzQ", "openreview": "https://openreview.net/forum?id=sOyJSNUrzQ", "author_site": "Jaejun Lee, Minsung Hwang, Joyce Whang", "tldr": "", "abstract": "While a number of knowledge graph representation learning (KGRL) methods have been proposed over the past decade, very few theoretical analyses have been conducted on them. In this paper, we present the first PAC-Bayesian generalization bounds for KGRL methods. To analyze a broad class of KGRL models, we propose a generic framework named ReED (Relation-aware Encoder-Decoder), which consists of a relation-aware message passing encoder and a triplet classification decoder. Our ReED framework can express at least 15 different existing KGRL models, including not only graph neural network-based models such as R-GCN and CompGCN but also shallow-architecture models such as RotatE and ANALOGY. Our generalization bounds for the ReED framework provide theoretical grounds for the commonly used tricks in KGRL, e.g., parameter-sharing and weight normalization schemes, and guide desirable design choices for practical KGRL methods. 
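As a concrete instance of the decoder half of such a framework, here is a hedged sketch of triplet classification with DistMult-style scoring, one of the shallow models a relation-aware encoder-decoder can express; the embedding tables and dimensions are illustrative, and ReED's message-passing encoder is omitted.

```python
# Triplet-classification decoder sketch with DistMult-style scoring
# <e_h, w_r, e_t>; one instance of the decoder family the ReED framework
# abstracts over (alongside RotatE/ANALOGY-style decoders and R-GCN/CompGCN
# encoders). Entity/relation tables and dimensions are illustrative.
import torch
import torch.nn as nn

n_ent, n_rel, dim = 1000, 20, 64
ent = nn.Embedding(n_ent, dim)
rel = nn.Embedding(n_rel, dim)

def score(h, r, t):
    """DistMult score; a sigmoid turns it into P(triplet is true)."""
    return (ent(h) * rel(r) * ent(t)).sum(-1)

logits = score(torch.tensor([0, 5]), torch.tensor([3, 3]), torch.tensor([7, 9]))
probs = torch.sigmoid(logits)   # triplet classification probabilities
```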
We empirically show that the critical factors in our generalization bounds can explain actual generalization errors on three real-world knowledge graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaejun Lee;Minsung Hwang;Joyce Jiyoung Whang", "authorids": "~Jaejun_Lee1;~Minsung_Hwang1;~Joyce_Jiyoung_Whang2", "gender": ";M;F", "homepage": "https://jaejunlee714.github.io/;https://bdi-lab.kaist.ac.kr/;http://bdi-lab.kaist.ac.kr/", "dblp": ";;121/4230", "google_scholar": "G5UMYkUAAAAJ;https://scholar.google.co.kr/citations?user=Iw2xxKwAAAAJ;TLrKglQAAAAJ", "orcid": "0000-0002-6948-6462;0009-0003-9700-8884;0000-0002-4773-3194", "linkedin": ";;", "or_profile": "~Jaejun_Lee1;~Minsung_Hwang1;~Joyce_Jiyoung_Whang2", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST", "aff_domain": "kaist.edu;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;MS student;Associate Professor", "bibtex": "@inproceedings{\nlee2024pacbayesian,\ntitle={{PAC}-Bayesian Generalization Bounds for Knowledge Graph Representation Learning},\nauthor={Jaejun Lee and Minsung Hwang and Joyce Jiyoung Whang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sOyJSNUrzQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1561088, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12886412402125966172&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "kaist.edu;kaist.ac.kr;kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Private Heterogeneous Federated Learning Without a Trusted Server Revisited: Error-Optimal and Communication-Efficient Algorithms for Convex Losses", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32910", "id": "sSAEhcdB9N", "proceeding": "https://proceedings.mlr.press/v235/gao24i.html", "pdf": "https://openreview.net/pdf?id=sSAEhcdB9N", "openreview": "https://openreview.net/forum?id=sSAEhcdB9N", "author_site": "Changyu Gao, Andrew Lowy, Xingyu Zhou, Stephen Wright", "tldr": "", "abstract": "We revisit the problem of federated learning (FL) with private data from people who do not trust the server or other silos/clients. In this context, every silo (e.g. hospital) has data from several people (e.g. patients) and needs to protect the privacy of each person's data (e.g. health records), even if the server and/or other silos try to uncover this data. Inter-Silo Record-Level Differential Privacy (ISRL-DP) prevents each silo's data from being leaked, by requiring that silo $i$'s *communications* satisfy item-level differential privacy. Prior work (Lowy & Razaviyayn, 2023a) characterized the optimal excess risk bounds for ISRL-DP algorithms with *homogeneous* (i.i.d.) silo data and convex loss functions. However, two important questions were left open: 1) Can the same excess risk bounds be achieved with *heterogeneous* (non-i.i.d.) silo data? 2) Can the optimal risk bounds be achieved with *fewer communication rounds*? In this paper, we give positive answers to both questions. 
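A minimal sketch of what the ISRL-DP constraint asks of each silo: privatize your own communications, e.g. by per-record clipping and Gaussian noising of a minibatch gradient before it leaves the silo. The clip norm `C` and noise multiplier `sigma` below are illustrative assumptions; the paper's algorithms calibrate them to the target privacy parameters and pair them with the techniques that yield the optimal risk and communication bounds.

```python
# ISRL-DP sketch: each silo privatizes its *communication* locally (per-record
# clipping + Gaussian noise) before anything reaches the untrusted server.
# C and sigma are illustrative, not calibrated to a specific (eps, delta).
import numpy as np

def isrl_dp_gradient(per_record_grads, C=1.0, sigma=2.0,
                     rng=np.random.default_rng(0)):
    norms = np.linalg.norm(per_record_grads, axis=1, keepdims=True)
    clipped = per_record_grads * np.minimum(1.0, C / np.maximum(norms, 1e-12))
    noisy_sum = clipped.sum(0) + rng.normal(0.0, sigma * C, clipped.shape[1])
    return noisy_sum / len(per_record_grads)   # what this silo sends upstream

g = isrl_dp_gradient(np.random.default_rng(1).standard_normal((32, 10)))
```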
We provide novel ISRL-DP FL algorithms that achieve the optimal excess risk bounds in the presence of heterogeneous silo data. Moreover, our algorithms are more *communication-efficient* than the prior state-of-the-art. For smooth loss functions, our algorithm achieves the *optimal* excess risk bound and has *communication complexity that matches the non-private lower bound*. Additionally, our algorithms are more *computationally efficient* than the previous state-of-the-art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Changyu Gao;Andrew Lowy;Xingyu Zhou;Stephen Wright", "authorids": "~Changyu_Gao1;~Andrew_Lowy1;~Xingyu_Zhou2;~Stephen_Wright1", "gender": ";;M;M", "homepage": "https://cyugao.github.io/;https://sites.google.com/view/andrewlowy;http://xingyuzhou.org;https://wrightstephen.github.io/sw_proj/", "dblp": "339/8993;285/5314;07/10352-1;75/2677", "google_scholar": ";https://scholar.google.com/citations?hl=en;AsTyRmwAAAAJ;VFQRIOwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Changyu_Gao1;~Andrew_Lowy1;~Xingyu_Zhou2;~Stephen_Wright1", "aff": "University of Wisconsin - Madison;University of Wisconsin - Madison;Wayne State University;University of Wisconsin, Madison", "aff_domain": "wisc.edu;wisc.edu;wayne.edu;wisc.edu", "position": "PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ngao2024private,\ntitle={Private Heterogeneous Federated Learning Without a Trusted Server Revisited: Error-Optimal and Communication-Efficient Algorithms for Convex Losses},\nauthor={Changyu Gao and Andrew Lowy and Xingyu Zhou and Stephen Wright},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sSAEhcdB9N}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1490088, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4149734455419974843&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "wisc.edu;wisc.edu;wayne.edu;wisc.edu", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Wisconsin-Madison;Wayne State University;University of Wisconsin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://wayne.edu;https://www.wisc.edu", "aff_unique_abbr": "UW-Madison;WSU;UW", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Low-Cost High-Power Membership Inference Attacks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32909", "id": "sT7UJh5CTc", "proceeding": "https://proceedings.mlr.press/v235/zarifzadeh24a.html", "pdf": "https://openreview.net/pdf?id=sT7UJh5CTc", "openreview": "https://openreview.net/forum?id=sT7UJh5CTc", "author_site": "Sajjad Zarifzadeh, Philippe Liu, Reza Shokri", "tldr": "", "abstract": "Membership inference attacks aim to detect if a particular data point was used in training a model. We design a novel statistical test to perform robust membership inference attacks (RMIA) with low computational overhead. We achieve this by a fine-grained modeling of the null hypothesis in our likelihood ratio tests, and effectively leveraging both reference models and reference population data samples. RMIA has superior test power compared with prior methods, throughout the TPR-FPR curve (even at extremely low FPR, as low as 0). 
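A simplified reading of the test, with assumptions flagged: score a point $x$ by how its likelihood ratio, computed against averaged reference models, dominates the same ratio for random population points $z$. The aggregation below is a sketch of this idea only, not the paper's exact statistic or calibration.

```python
# RMIA-flavored sketch: LR(x) = p(x; target model) / avg_ref p(x; ref models),
# compared against the same ratio for population samples z. The averaging and
# the pairwise-dominance score are a simplified reading of the abstract.
import numpy as np

def lr(p_target, p_refs):
    return p_target / np.mean(p_refs)

def rmia_score(px_target, px_refs, pz_target, pz_refs):
    lr_x = lr(px_target, px_refs)
    lr_z = np.array([lr(pt, pr) for pt, pr in zip(pz_target, pz_refs)])
    return np.mean(lr_x > lr_z)   # fraction of population points x dominates

score = rmia_score(0.9, np.array([0.4, 0.5]),
                   np.array([0.3, 0.6]),
                   [np.array([0.35, 0.4]), np.array([0.5, 0.55])])  # -> 1.0
```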
Under computational constraints, where only a limited number of pre-trained reference models (as few as 1) are available, and also when we vary other elements of the attack (e.g., data distribution), our method performs exceptionally well, unlike prior attacks that approach random guessing. RMIA lays the groundwork for practical yet accurate data privacy risk assessment in machine learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sajjad Zarifzadeh;Philippe Liu;Reza Shokri", "authorids": "~Sajjad_Zarifzadeh1;~Philippe_Liu1;~Reza_Shokri1", "gender": "M;;", "homepage": "https://yazd.ac.ir/en/people/zarifzadeh;;", "dblp": ";;", "google_scholar": "IGMsPeUAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sajjad_Zarifzadeh1;~Philippe_Liu1;~Reza_Shokri1", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzarifzadeh2024lowcost,\ntitle={Low-Cost High-Power Membership Inference Attacks},\nauthor={Sajjad Zarifzadeh and Philippe Liu and Reza Shokri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sT7UJh5CTc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3065113, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5113215623402215438&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";;", "author_num": 3 }, { "title": "Private and Federated Stochastic Convex Optimization: Efficient Strategies for Centralized Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32908", "id": "sTVSyqD6XX", "proceeding": "https://proceedings.mlr.press/v235/reshef24a.html", "pdf": "https://openreview.net/pdf?id=sTVSyqD6XX", "openreview": "https://openreview.net/forum?id=sTVSyqD6XX", "author_site": "Roie Reshef, Kfir Levy", "tldr": "", "abstract": "This paper addresses the challenge of preserving privacy in Federated Learning (FL) within centralized systems, focusing on both trusted and untrusted server scenarios. We analyze this setting within the Stochastic Convex Optimization (SCO) framework, and devise methods that ensure Differential Privacy (DP) while maintaining optimal convergence rates for homogeneous and heterogeneous data distributions. Our approach, based on a recent stochastic optimization technique, offers linear computational complexity, comparable to non-private FL methods, and reduced gradient obfuscation. 
This work enhances the practicality of DP in FL, balancing privacy, efficiency, and robustness in a variety of server trust environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Roie Reshef;Kfir Yehuda Levy", "authorids": "~Roie_Reshef1;~Kfir_Yehuda_Levy1", "gender": "M;M", "homepage": ";http://kfiryehud.wixsite.com/kfir-y-levy", "dblp": ";83/11388", "google_scholar": ";", "orcid": "0009-0009-0914-0783;", "linkedin": ";", "or_profile": "~Roie_Reshef1;~Kfir_Yehuda_Levy1", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion", "aff_domain": "campus.technion.ac.il;technion.ac.il", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nreshef2024private,\ntitle={Private and Federated Stochastic Convex Optimization: Efficient Strategies for Centralized Systems},\nauthor={Roie Reshef and Kfir Yehuda Levy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sTVSyqD6XX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 402756, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5AGuhdui_scJ:scholar.google.com/&scioq=Private+and+Federated+Stochastic+Convex+Optimization:+Efficient+Strategies+for+Centralized+Systems&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "campus.technion.ac.il;technion.ac.il", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Language Models as Semantic Indexers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32907", "id": "sYeioWoF9u", "proceeding": "https://proceedings.mlr.press/v235/jin24h.html", "pdf": "https://openreview.net/pdf?id=sYeioWoF9u", "openreview": "https://openreview.net/forum?id=sYeioWoF9u", "author_site": "Bowen Jin, Hansi Zeng, Guoyin Wang, Xiusi Chen, Tianxin Wei, Ruirui Li, Zhengyang Wang, Zheng Li, Yang Li, Hanqing Lu, Suhang Wang, Jiawei Han, Xianfeng Tang", "tldr": "", "abstract": "Semantic identifier (ID) is an important concept in information retrieval that aims to preserve the semantics of objects such as documents and items inside their IDs. Previous studies typically adopt a two-stage pipeline to learn semantic IDs by first procuring embeddings using off-the-shelf text encoders and then deriving IDs based on the embeddings. However, each step introduces potential information loss, and there is usually an inherent mismatch between the distribution of embeddings within the latent space produced by text encoders and the anticipated distribution required for semantic indexing. It is non-trivial to design a method that can learn the document\u2019s semantic representations and its hierarchical structure simultaneously, given that semantic IDs are discrete and sequentially structured, and the semantic supervision is deficient. In this paper, we introduce LMIndexer, a self-supervised framework to learn semantic IDs with a generative language model. 
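For contrast, here is a sketch of the two-stage baseline described above (embed documents with an off-the-shelf encoder, then derive IDs from the embeddings): recursive k-means turns each document into a short, discrete, hierarchical code. The branching factor, depth, and clustering choice are illustrative assumptions; LMIndexer's point is to replace this lossy pipeline with a single self-supervised model.

```python
# Two-stage baseline sketch: off-the-shelf embeddings -> hierarchical semantic
# IDs via recursive k-means. The 2-level, 8-way code is an illustrative choice.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
emb = rng.standard_normal((500, 32))   # stand-in for text-encoder embeddings

def semantic_ids(emb, branching=8, levels=2):
    ids = np.zeros((len(emb), levels), dtype=int)
    groups = {(): np.arange(len(emb))}
    for lvl in range(levels):
        nxt = {}
        for prefix, idx in groups.items():
            if len(idx) < branching:   # tiny cluster: stop refining
                continue
            labels = KMeans(n_clusters=branching, n_init=4,
                            random_state=0).fit_predict(emb[idx])
            ids[idx, lvl] = labels
            for c in range(branching):
                nxt[prefix + (c,)] = idx[labels == c]
        groups = nxt
    return ids   # each row is a discrete, sequentially structured ID

print(semantic_ids(emb)[:3])
```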
We tackle the challenge of sequential discrete ID by introducing a semantic indexer capable of generating neural sequential discrete representations with progressive training and contrastive learning. In response to the semantic supervision deficiency, we propose to train the model with a self-supervised document reconstruction objective. We show the high quality of the learned IDs and demonstrate their effectiveness on three tasks including recommendation, product search, and document retrieval on five datasets from various domains. Code is available at https://github.com/PeterGriffinJin/LMIndexer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Jin;Hansi Zeng;Guoyin Wang;Xiusi Chen;Tianxin Wei;Ruirui Li;Zhengyang Wang;Zheng Li;Yang Li;Hanqing Lu;Suhang Wang;Jiawei Han;Xianfeng Tang", "authorids": "~Bowen_Jin1;~Hansi_Zeng1;~Guoyin_Wang1;~Xiusi_Chen1;~Tianxin_Wei1;~Ruirui_Li3;~Zhengyang_Wang1;~Zheng_Li9;~Yang_Li80;~Hanqing_Lu3;~Suhang_Wang1;~Jiawei_Han1;~Xianfeng_Tang1", "gender": "M;;M;M;;M;M;M;M;M;M;M;M", "homepage": "https://peterjin.me/;https://hansizeng.github.io/;;https://xiusic.github.io/;https://weitianxin.github.io/;https://ruiruili.mystrikingly.com/;;;;https://faculty.ist.psu.edu/szw494/;http://hanj.cs.illinois.edu/;https://xta.ng/;https://hsqmlzno1.github.io/", "dblp": "235/8066;;05/3838-2;210/1049;277/5800;12/8221-2;;;39/6752;136/9440;h/JiaweiHan.html;33/7694;10/1143-18", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com/citations?hl=en;JqGAil4AAAAJ;_LU2-kMAAAAJ;gYCtd6cAAAAJ;A4fNBtEAAAAJ;;pNYuJQIAAAAJ;cdT_WMMAAAAJ;https://scholar.google.com.tw/citations?user=Kv9AbjMAAAAJ;u1PEv-QAAAAJ;https://scholar.google.com.hk/citations?user=P6fwn4AAAAAJ", "orcid": "0000-0003-1295-2829;;;0000-0002-9713-8000;0000-0003-4450-2005;;0000-0002-5146-2884;;;0000-0003-3448-4878;0000-0002-3629-2696;;", "linkedin": "bowen-peter-jin/;;;xiusi-chen-53180583/;tianxin-wei-7063a2180/;;;yang-laurence-li-54023266;;;;xianfengtang/;", "or_profile": "~Bowen_Jin1;~Hansi_Zeng1;~Guoyin_Wang1;~Xiusi_Chen1;~Tianxin_Wei1;~Ruirui_Li3;~Zhengyang_Wang1;~Yang_Li80;~Hanqing_Lu3;~Suhang_Wang1;~Jiawei_Han1;~Xianfeng_Tang1;~zheng_li4", "aff": "University of Illinois, Urbana Champaign;University of Massachusetts at Amherst;Bytedance;University of California, Los Angeles;University of Illinois, Urbana-Champaign;Amazon;Amazon;;Amazon;Pennsylvania State University;University of Illinois at Urbana-Champaign (UIUC);Amazon;Amazon", "aff_domain": "illinois.edu;umass.edu;bytedance.com;ucla.edu;uiuc.edu;amazon.com;amazon.com;;amazon.com;psu.edu;illinois.edu;amazon.com;amazon.com", "position": "PhD student;PhD student;Principal Researcher;PhD student;PhD student;Researcher;Researcher;;Researcher;Assistant Professor;Full Professor;Researcher;Researcher", "bibtex": "@inproceedings{\njin2024language,\ntitle={Language Models as Semantic Indexers},\nauthor={Bowen Jin and Hansi Zeng and Guoyin Wang and Xiusi Chen and Tianxin Wei and Ruirui Li and Zhengyang Wang and Zheng Li and Yang Li and Hanqing Lu and Suhang Wang and Jiawei Han and Xianfeng Tang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sYeioWoF9u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 823134, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 13, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10618922475077052397&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 11, "email": "illinois.edu;umass.edu;bytedance.com;ucla.edu;uiuc.edu;amazon.com;amazon.com;;amazon.com;psu.edu;illinois.edu;amazon.com;amazon.com", "author_num": 13, "aff_unique_index": "0;1;2;3;4;5;5;5;6;0;5;5", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Massachusetts Amherst;ByteDance;University of California, Los Angeles;University of Illinois;Amazon;Pennsylvania State University", "aff_unique_dep": ";;;;;Amazon.com, Inc.;", "aff_unique_url": "https://illinois.edu;https://www.umass.edu;https://www.bytedance.com;https://www.ucla.edu;https://illinois.edu;https://www.amazon.com;https://www.psu.edu", "aff_unique_abbr": "UIUC;UMass Amherst;Bytedance;UCLA;UIUC;Amazon;PSU", "aff_campus_unique_index": "0;1;3;0;0", "aff_campus_unique": "Urbana-Champaign;Amherst;;Los Angeles", "aff_country_unique_index": "0;0;1;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Physics-Informed Neural Network Policy Iteration: Algorithms, Convergence, and Verification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32906", "id": "sZla6SnooP", "proceeding": "https://proceedings.mlr.press/v235/meng24b.html", "pdf": "https://openreview.net/pdf?id=sZla6SnooP", "openreview": "https://openreview.net/forum?id=sZla6SnooP", "author_site": "Yiming Meng, Ruikun Zhou, Amartya Mukherjee, Maxwell Fitzsimmons, Christopher Song, Jun Liu", "tldr": "", "abstract": "Solving nonlinear optimal control problems is a challenging task, particularly for high-dimensional problems. We propose algorithms for model-based policy iterations to solve nonlinear optimal control problems with convergence guarantees. The main component of our approach is an iterative procedure that utilizes neural approximations to solve linear partial differential equations (PDEs), ensuring convergence. We present two variants of the algorithms. The first variant formulates the optimization problem as a linear least square problem, drawing inspiration from extreme learning machine (ELM) for solving PDEs. This variant efficiently handles low-dimensional problems with high accuracy. The second variant is based on a physics-informed neural network (PINN) for solving PDEs and has the potential to address high-dimensional problems. We demonstrate that both algorithms outperform traditional approaches, such as Galerkin methods, by a significant margin. We provide a theoretical analysis of both algorithms in terms of convergence of neural approximations towards the true optimal solutions in a general setting. 
Furthermore, we employ formal verification techniques to demonstrate the verifiable stability of the resulting controllers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiming Meng;Ruikun Zhou;Amartya Mukherjee;Maxwell Fitzsimmons;Christopher Song;Jun Liu", "authorids": "~Yiming_Meng1;~Ruikun_Zhou1;~Amartya_Mukherjee1;~Maxwell_Fitzsimmons1;~Christopher_Song1;~Jun_Liu11", "gender": "M;M;M;;M;M", "homepage": ";;https://amartyamukherjee.github.io/;;https://uwaterloo.ca/applied-mathematics/department-members/graduate-students;", "dblp": ";;;;;", "google_scholar": "bym7rFQAAAAJ;https://scholar.google.ca/citations?user=iJ5B-60AAAAJ;https://scholar.google.ca/citations?user=OPbpMbYAAAAJ;;;", "orcid": ";0000-0001-7265-0617;0000-0001-9962-8110;;;", "linkedin": ";;amartya-marty-mukherjee-142246164/;;;", "or_profile": "~Yiming_Meng1;~Ruikun_Zhou1;~Amartya_Mukherjee1;~Maxwell_Fitzsimmons1;~Christopher_Song1;~Jun_Liu11", "aff": "University of Illinois, Urbana Champaign;University of Waterloo;University of Waterloo;;University of Waterloo;University of Waterloo", "aff_domain": "illinois.edu;uwaterloo.ca;uwaterloo.ca;;uwaterloo.ca;uwaterloo.ca", "position": "Postdoc;PhD student;PhD student;;MS student;Associate Professor", "bibtex": "@inproceedings{\nmeng2024physicsinformed,\ntitle={Physics-Informed Neural Network Policy Iteration: Algorithms, Convergence, and Verification},\nauthor={Yiming Meng and Ruikun Zhou and Amartya Mukherjee and Maxwell Fitzsimmons and Christopher Song and Jun Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sZla6SnooP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1086192, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10147044620830302484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "illinois.edu;uwaterloo.ca;uwaterloo.ca;;uwaterloo.ca;uwaterloo.ca", "author_num": 6, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Waterloo", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://uwaterloo.ca", "aff_unique_abbr": "UIUC;UW", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Canada" }, { "title": "Pruned Pivot: Correlation Clustering Algorithm for Dynamic, Parallel, and Local Computation Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32905", "id": "saP7s0ZgYE", "proceeding": "https://proceedings.mlr.press/v235/dalirrooyfard24a.html", "pdf": "https://openreview.net/pdf?id=saP7s0ZgYE", "openreview": "https://openreview.net/forum?id=saP7s0ZgYE", "author_site": "Mina Dalirrooyfard, Konstantin Makarychev, Slobodan Mitrovic", "tldr": "", "abstract": "Given a graph with positive and negative edge labels, the correlation clustering problem aims to cluster the nodes so to minimize the total number of between-cluster positive and within-cluster negative edges. This problem has many applications in data mining, particularly in unsupervised learning. Inspired by the prevalence of large graphs and constantly changing data in modern applications, we study correlation clustering in dynamic, parallel (MPC), and local computation (LCA) settings. 
We design an approach that improves state-of-the-art runtime complexities in all these settings. In particular, we provide the first fully dynamic algorithm that runs in an expected amortized constant time, without any dependence on the graph size. Moreover, our algorithm essentially matches the approximation guarantee of the celebrated Pivot algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mina Dalirrooyfard;Konstantin Makarychev;Slobodan Mitrovic", "authorids": "~Mina_Dalirrooyfard1;~Konstantin_Makarychev1;~Slobodan_Mitrovic1", "gender": "F;M;", "homepage": "https://ca.linkedin.com/in/mina-dalirrooyfard-6691a153?original_referer=https%3A%2F%2Fwww.google.com%2F;http://konstantin.makarychev.net/;", "dblp": "209/5851;37/1011;", "google_scholar": "vatvqfAAAAAJ;https://scholar.google.com.tw/citations?user=-E3hYj8AAAAJ;", "orcid": ";0000-0002-9587-3677;", "linkedin": ";konstantin-makarychev-143b3a132/;", "or_profile": "~Mina_Dalirrooyfard1;~Konstantin_Makarychev1;~Slobodan_Mitrovic1", "aff": "Morgan Stanley;Northwestern University;", "aff_domain": "morganstanley.com;northwestern.edu;", "position": "Researcher;Full Professor;", "bibtex": "@inproceedings{\ndalirrooyfard2024pruned,\ntitle={Pruned Pivot: Correlation Clustering Algorithm for Dynamic, Parallel, and Local Computation Models},\nauthor={Mina Dalirrooyfard and Konstantin Makarychev and Slobodan Mitrovic},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=saP7s0ZgYE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 471909, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9086003812990598110&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "morganstanley.com;northwestern.edu;", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Morgan Stanley;Northwestern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.morganstanley.com;https://www.northwestern.edu", "aff_unique_abbr": "Morgan Stanley;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "APT: Adaptive Pruning and Tuning Pretrained Language Models for Efficient Training and Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32904", "id": "sb81Xl50JG", "proceeding": "https://proceedings.mlr.press/v235/zhao24g.html", "pdf": "https://openreview.net/pdf?id=sb81Xl50JG", "openreview": "https://openreview.net/forum?id=sb81Xl50JG", "author_site": "Bowen Zhao, Hannaneh Hajishirzi, Qingqing Cao", "tldr": "", "abstract": "Fine-tuning and inference with large Language Models (LM) are generally known to be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces training memory by updating a small number of LM parameters but does not improve inference efficiency. Structured pruning improves LM inference efficiency by removing consistent parameter blocks, yet often increases training memory and time. To improve both training and inference efficiency, we introduce APT that adaptively *prunes* and *tunes* parameters for the LMs. At the early stage of fine-tuning, APT dynamically adds *salient* tuning parameters for fast and accurate convergence while discarding unimportant parameters for efficiency. 
Compared to baselines, our experiments show that APT maintains up to 98% task performance when pruning RoBERTa and T5 models with 40% parameters left while keeping 86.4% LLaMA models' performance with 70% parameters remaining. Furthermore, APT speeds up LMs' fine-tuning by up to 8$\\times$ and reduces large LMs' memory training footprint by up to 70%. Our code and models are publicly available at https://github.com/ROIM1998/APT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Zhao;Hannaneh Hajishirzi;Qingqing Cao", "authorids": "~Bowen_Zhao3;~Hannaneh_Hajishirzi1;~Qingqing_Cao1", "gender": "M;F;M", "homepage": "http://my.bowenroim.com/;https://homes.cs.washington.edu/~hannaneh/;https://awk.ai/", "dblp": "191/9426-4;52/1296;", "google_scholar": "mBJItX8AAAAJ;LOV6_WIAAAAJ;vLpPyUUAAAAJ", "orcid": "0000-0001-7001-0675;;0000-0002-8564-9241", "linkedin": "bowen-zhao-760b83225/;;qqcao", "or_profile": "~Bowen_Zhao3;~Hannaneh_Hajishirzi1;~Qingqing_Cao1", "aff": "Tsinghua University;University of Washington;University of Washington, Seattle", "aff_domain": "mail.tsinghua.edu.cn;uw.edu;uw.edu", "position": "MS student;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nzhao2024apt,\ntitle={{APT}: Adaptive Pruning and Tuning Pretrained Language Models for Efficient Training and Inference},\nauthor={Bowen Zhao and Hannaneh Hajishirzi and Qingqing Cao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sb81Xl50JG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 854031, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2866478592040644547&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 7, "email": "mail.tsinghua.edu.cn;uw.edu;uw.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Tsinghua University;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.washington.edu", "aff_unique_abbr": "THU;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Projecting Molecules into Synthesizable Chemical Spaces", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32903", "id": "scFlbJQdm1", "proceeding": "https://proceedings.mlr.press/v235/luo24a.html", "pdf": "https://openreview.net/pdf?id=scFlbJQdm1", "openreview": "https://openreview.net/forum?id=scFlbJQdm1", "author_site": "Shitong Luo, Wenhao Gao, Zuofan Wu, Jian Peng, Connor Coley, Jianzhu Ma", "tldr": "", "abstract": "Discovering new drug molecules is a pivotal yet challenging process due to the near-infinitely large chemical space and notorious demands on time and resources. Numerous generative models have recently been introduced to accelerate the drug discovery process, but their progression to experimental validation remains limited, largely due to a lack of consideration for synthetic accessibility in practical settings. In this work, we introduce a novel framework that is capable of generating new chemical structures while ensuring synthetic accessibility. Specifically, we introduce a postfix notation of synthetic pathways to represent molecules in chemical space. Then, we design a transformer-based model to translate molecular graphs into postfix notations of synthesis. 
We highlight the model's ability to: (a) perform bottom-up synthesis planning more accurately, (b) generate structurally similar, synthesizable analogs for unsynthesizable molecules proposed by generative models with their properties preserved, and (c) explore the local synthesizable chemical space around hit molecules.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shitong Luo;Wenhao Gao;Zuofan Wu;Jian Peng;Connor W. Coley;Jianzhu Ma", "authorids": "~Shitong_Luo1;~Wenhao_Gao1;~Zuofan_Wu1;~Jian_Peng1;~Connor_W._Coley1;~Jianzhu_Ma2", "gender": ";M;M;M;M;M", "homepage": "https://luost.me;https://wenhao-gao.github.io;;http://jianpeng.web.engr.illinois.edu/;https://majianzhu.com/;https://coley.mit.edu", "dblp": "271/0339;177/0968;;29/4181-1;24/9080.html;206/6284", "google_scholar": "z1BrjyIAAAAJ;s4eywrUAAAAJ;;https://scholar.google.com.tw/citations?user=4wcAVXAAAAAJ;;l015S80AAAAJ", "orcid": ";0000-0002-6506-8044;;;;0000-0002-8271-8723", "linkedin": ";;zuofan-wu-b08398213/;;;", "or_profile": "~Shitong_Luo1;~Wenhao_Gao1;~Zuofan_Wu1;~Jian_Peng1;~Jianzhu_Ma2;~Connor_Coley1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Helixon Research;University of Illinois, Urbana Champaign;Tsinghua University;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;helixon.com;illinois.edu;tsinghua.edu.cn;mit.edu", "position": "PhD student;PhD student;Researcher;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nluo2024projecting,\ntitle={Projecting Molecules into Synthesizable Chemical Spaces},\nauthor={Shitong Luo and Wenhao Gao and Zuofan Wu and Jian Peng and Connor W. Coley and Jianzhu Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=scFlbJQdm1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4966749, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6453406264009035091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mit.edu;mit.edu;helixon.com;illinois.edu;tsinghua.edu.cn;mit.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;3;0", "aff_unique_norm": "Massachusetts Institute of Technology;Helixon Research;University of Illinois Urbana-Champaign;Tsinghua University", "aff_unique_dep": ";;;", "aff_unique_url": "https://web.mit.edu;;https://illinois.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MIT;;UIUC;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Bayesian Optimization of Function Networks with Partial Evaluations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32902", "id": "scMAQ3mFAA", "proceeding": "https://proceedings.mlr.press/v235/buathong24a.html", "pdf": "https://openreview.net/pdf?id=scMAQ3mFAA", "openreview": "https://openreview.net/forum?id=scMAQ3mFAA", "author_site": "Poompol Buathong, Jiayue Wan, Raul Astudillo, Samuel Daulton, Maximilian Balandat, Peter Frazier", "tldr": "", "abstract": "Bayesian optimization is a powerful framework for optimizing functions that are expensive or time-consuming to evaluate. 
Recent work has considered Bayesian optimization of function networks (BOFN), where the objective function is given by a network of functions, each taking as input the output of previous nodes in the network as well as additional parameters. Leveraging this network structure has been shown to yield significant performance improvements. Existing BOFN algorithms for general-purpose networks evaluate the full network at each iteration. However, many real-world applications allow for evaluating nodes individually. To exploit this, we propose a novel knowledge gradient acquisition function that chooses which node and corresponding inputs to evaluate in a cost-aware manner, thereby reducing query costs by evaluating only on a part of the network at each step. We provide an efficient approach to optimizing our acquisition function and show that it outperforms existing BOFN methods and other benchmarks across several synthetic and real-world problems. Our acquisition function is the first to enable cost-aware optimization of a broad class of function networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Poompol Buathong;Jiayue Wan;Raul Astudillo;Sam Daulton;Maximilian Balandat;Peter I. Frazier", "authorids": "~Poompol_Buathong1;~Jiayue_Wan1;~Raul_Astudillo1;~Sam_Daulton1;~Maximilian_Balandat1;~Peter_I._Frazier1", "gender": "M;M;M;;;", "homepage": ";https://jiayuewan.com;https://raulastudillo.netlify.app/;;https://research.facebook.com/people/balandat-max/;", "dblp": "250/9682;205/7993;242/3889;;41/9185;", "google_scholar": "LyqN0-MAAAAJ;i_HSb0EAAAAJ;r1Jkj7MAAAAJ;;N0iLicUAAAAJ;", "orcid": ";0000-0002-8252-1584;;;0000-0002-8214-8935;", "linkedin": ";jiayuewan/;;;maximilian-balandat-b5843946/;", "or_profile": "~Poompol_Buathong1;~Jiayue_Wan1;~Raul_Astudillo1;~Sam_Daulton1;~Maximilian_Balandat1;~Peter_I._Frazier1", "aff": "Cornell University;Cornell University;California Institute of Technology;;Meta;", "aff_domain": "cornell.edu;cornell.edu;caltech.edu;;meta.com;", "position": "PhD student;PhD student;Postdoc;;Research Scientist Manager;", "bibtex": "@inproceedings{\nbuathong2024bayesian,\ntitle={Bayesian Optimization of Function Networks with Partial Evaluations},\nauthor={Poompol Buathong and Jiayue Wan and Raul Astudillo and Sam Daulton and Maximilian Balandat and Peter I. 
Frazier},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=scMAQ3mFAA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1834157, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12784186610114205276&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "cornell.edu;cornell.edu;caltech.edu;;meta.com;", "author_num": 6, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Cornell University;California Institute of Technology;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.cornell.edu;https://www.caltech.edu;https://meta.com", "aff_unique_abbr": "Cornell;Caltech;Meta", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scaling Laws for the Value of Individual Data Points in Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32901", "id": "scSB9RynSd", "proceeding": "https://proceedings.mlr.press/v235/covert24a.html", "pdf": "https://openreview.net/pdf?id=scSB9RynSd", "openreview": "https://openreview.net/forum?id=scSB9RynSd", "author_site": "Ian Covert, Wenlong Ji, Tatsunori Hashimoto, James Zou", "tldr": "", "abstract": "Recent works have shown that machine learning models improve at a predictable rate with the amount of training data, leading to scaling laws that describe the relationship between error and dataset size. These scaling laws can help determine a model's training dataset, but they take an aggregate view of the data by only considering the dataset's size. We consider a new perspective by investigating scaling behavior for the value of individual data points: we find that a data point's contribution to model's performance shrinks predictably with the size of the dataset in a log-linear manner. Interestingly, there is significant variability in the scaling exponent among different data points, indicating that certain points are more valuable in small datasets and other points are relatively more useful as a part of large datasets. We provide learning theory support for our scaling laws and we observe empirically that it holds across several model classes. We further propose a maximum likelihood estimator and an amortized estimator to efficiently learn the individualized scaling behaviors from a small number of noisy observations per data point. Using our efficient estimators, we provide insights into factors that influence the scaling behavior of different data points. 
Finally we demonstrate applications of the individualized scaling laws to data valuation and data subset selection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ian Connick Covert;Wenlong Ji;Tatsunori Hashimoto;James Zou", "authorids": "~Ian_Connick_Covert1;~Wenlong_Ji1;~Tatsunori_Hashimoto1;~James_Zou1", "gender": "M;M;M;", "homepage": "https://iancovert.com;https://wenlong2000.github.io/;https://thashim.github.io;", "dblp": "262/3443;;;", "google_scholar": "Np8Ek3cAAAAJ;;5ygiTwsAAAAJ;23ZXZvEAAAAJ", "orcid": ";;;", "linkedin": "ian-covert/;;;", "or_profile": "~Ian_Connick_Covert1;~Wenlong_Ji1;~Tatsunori_Hashimoto1;~James_Zou1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "Postdoc;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ncovert2024scaling,\ntitle={Scaling Laws for the Value of Individual Data Points in Machine Learning},\nauthor={Ian Connick Covert and Wenlong Ji and Tatsunori Hashimoto and James Zou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=scSB9RynSd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1264397, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1592765812824943118&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "In value-based deep reinforcement learning, a pruned network is a good network", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32900", "id": "seo9V9QRZp", "proceeding": "https://proceedings.mlr.press/v235/obando-ceron24a.html", "pdf": "https://openreview.net/pdf?id=seo9V9QRZp", "openreview": "https://openreview.net/forum?id=seo9V9QRZp", "author_site": "Johan Obando Ceron, Aaron Courville, Pablo Samuel Castro", "tldr": "", "abstract": "Recent work has shown that deep reinforcement learning agents have difficulty in effectively using their network parameters. We leverage prior insights into the advantages of sparse training techniques and demonstrate that gradual magnitude pruning enables value-based agents to maximize parameter effectiveness. This results in networks that yield dramatic performance improvements over traditional networks, using only a small fraction of the full network parameters. 
Our code is publicly available, see Appendix A for details.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johan Samir Obando Ceron;Aaron Courville;Pablo Samuel Castro", "authorids": "~Johan_Samir_Obando_Ceron1;~Aaron_Courville3;~Pablo_Samuel_Castro1", "gender": "M;;M", "homepage": "https://johanobandoc.github.io;;https://psc-g.github.io/", "dblp": ";56/1688;05/5455", "google_scholar": "KViAb3EAAAAJ;https://scholar.google.ca/citations?user=km6CP8cAAAAJ;https://scholar.google.ca/citations?user=jn5r6TsAAAAJ", "orcid": ";;", "linkedin": "johan-obando/;;pablo-samuel-castro-2113641b/", "or_profile": "~Johan_Samir_Obando_Ceron1;~Aaron_Courville3;~Pablo_Samuel_Castro1", "aff": "Mila - Quebec AI Institute, Universit\u00e9 de Montr\u00e9al;Universit\u00e9 de Montr\u00e9al;Google", "aff_domain": "mila.umontreal.ca; ;google.com", "position": "PhD student;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nceron2024in,\ntitle={In value-based deep reinforcement learning, a pruned network is a good network},\nauthor={Johan Samir Obando Ceron and Aaron Courville and Pablo Samuel Castro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=seo9V9QRZp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1249217, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4819404688220007074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "mila.umontreal.ca; ;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Google", "aff_unique_dep": "Mila - Quebec AI Institute;Google", "aff_unique_url": "https://www.mila.quebec/;https://www.google.com", "aff_unique_abbr": "Mila;Google", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Montr\u00e9al;;Mountain View", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "title": "Fast Algorithms for Hypergraph PageRank with Applications to Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32899", "id": "sfQH4JJ4We", "proceeding": "https://proceedings.mlr.press/v235/ameranis24a.html", "pdf": "https://openreview.net/pdf?id=sfQH4JJ4We", "openreview": "https://openreview.net/forum?id=sfQH4JJ4We", "author_site": "Konstantinos Ameranis, Adela DePavia, Lorenzo Orecchia, Erasmo Tani", "tldr": "", "abstract": "A fundamental approach to semi-supervised learning is to leverage the structure of the sample space to diffuse label information from annotated examples to unlabeled points. Traditional methods model the input data points as a graph and rely on fast algorithms for solving Laplacian systems of equations, such as those defining PageRank. However, previous work has demonstrated that graph-based models fail to capture higher-order relations, such as group membership, which are better modeled by hypergraphs. Unfortunately, the scalable application of hypergraph models has been hampered by the non-linearity of the hypergraph Laplacian. In this paper, we present highly scalable algorithms for hypergraph primitives, such as hypergraph PageRank vectors and hypergraph Laplacian systems, over general families of hypergraphs. In addition to giving strong theoretical guarantees, we empirically showcase the speed of our algorithms on benchmark instances of semi-supervised learning on categorical data. 
We exploit their generality to improve semi-supervised manifold clustering via hypergraph models. By providing significant speed-ups on fundamental hypergraph tasks, our algorithms enable the deployment of hypergraph models on a massive scale.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Konstantinos Ameranis;Adela Frances DePavia;Lorenzo Orecchia;Erasmo Tani", "authorids": "~Konstantinos_Ameranis1;~Adela_Frances_DePavia1;~Lorenzo_Orecchia1;~Erasmo_Tani1", "gender": "M;F;M;", "homepage": "https://people.cs.uchicago.edu/~kameranis/;https://cam.uchicago.edu/people/profile/adela-depavia/;http://orecchia.net;", "dblp": "252/9946;;32/4340;", "google_scholar": "eajqSs4AAAAJ;rxMmdJYAAAAJ;https://scholar.google.com.tw/citations?user=dT7yOrwAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Konstantinos_Ameranis1;~Adela_Frances_DePavia1;~Lorenzo_Orecchia1;~Erasmo_Tani1", "aff": "University of Chicago;University of Chicago;University of Chicago;", "aff_domain": "uchicago.edu;uchicago.edu;uchicago.edu;", "position": "PhD student;PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nameranis2024fast,\ntitle={Fast Algorithms for Hypergraph PageRank with Applications to Semi-Supervised Learning},\nauthor={Konstantinos Ameranis and Adela Frances DePavia and Lorenzo Orecchia and Erasmo Tani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sfQH4JJ4We}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4896350, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10165650564494013034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "uchicago.edu;uchicago.edu;uchicago.edu;", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Chicago", "aff_unique_dep": "", "aff_unique_url": "https://www.uchicago.edu", "aff_unique_abbr": "UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Online Learning under Budget and ROI Constraints via Weak Adaptivity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32898", "id": "shzEkKPrsn", "proceeding": "https://proceedings.mlr.press/v235/castiglioni24a.html", "pdf": "https://openreview.net/pdf?id=shzEkKPrsn", "openreview": "https://openreview.net/forum?id=shzEkKPrsn", "author_site": "Matteo Castiglioni, Andrea Celli, Christian Kroer", "tldr": "", "abstract": "We study online learning problems in which a decision maker has to make a sequence of costly decisions, with the goal of maximizing their expected reward while adhering to budget and return-on-investment (ROI) constraints. Existing primal-dual algorithms designed for constrained online learning problems under adversarial inputs rely on two fundamental assumptions. First, the decision maker must know beforehand the value of parameters related to the degree of strict feasibility of the problem (i.e. Slater parameters). Second, a strictly feasible solution to the offline optimization problem must exist at each round. Both requirements are unrealistic for practical applications such as bidding in online ad auctions. In this paper, we show how such assumptions can be circumvented by endowing standard primal-dual templates with *weakly adaptive* regret minimizers. 
This results in a ``dual-balancing'' framework which ensures that dual variables stay sufficiently small, even in the absence of knowledge about Slater's parameter. We prove the first *best-of-both-worlds* no-regret guarantees which hold in absence of the two aforementioned assumptions, under stochastic and adversarial inputs. Finally, we show how to instantiate the framework to optimally bid in various mechanisms of practical relevance, such as first- and second-price auctions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matteo Castiglioni;Andrea Celli;Christian Kroer", "authorids": "~Matteo_Castiglioni1;~Andrea_Celli1;~Christian_Kroer1", "gender": ";M;M", "homepage": "https://castiglionimatteo.github.io;https://andcelli.github.io/;http://www.columbia.edu/~ck2945/", "dblp": "225/7720;190/7301.html;64/10660", "google_scholar": "https://scholar.google.it/citations?user=NPE3HAYAAAAJ;9wQscqEAAAAJ;https://scholar.google.ch/citations?user=ckHwjPAAAAAJ", "orcid": "0000-0002-1070-6766;;0000-0002-9009-8683", "linkedin": ";;", "or_profile": "~Matteo_Castiglioni1;~Andrea_Celli1;~Christian_Kroer1", "aff": "Politecnico di Milano;Bocconi University;Columbia University", "aff_domain": "polimi.it;unibocconi.it;columbia.edu", "position": "Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\ncastiglioni2024online,\ntitle={Online Learning under Budget and {ROI} Constraints via Weak Adaptivity},\nauthor={Matteo Castiglioni and Andrea Celli and Christian Kroer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=shzEkKPrsn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 488207, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7581133761519246205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "polimi.it;unibocconi.it;columbia.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Politecnico di Milano;Bocconi University;Columbia University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polimi.it;https://www.bocconi.edu;https://www.columbia.edu", "aff_unique_abbr": "Polimi;Bocconi;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Italy;United States" }, { "title": "Revisiting Context Aggregation for Image Matting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32897", "id": "sjJZHPV9Id", "proceeding": "https://proceedings.mlr.press/v235/liu24as.html", "pdf": "https://openreview.net/pdf?id=sjJZHPV9Id", "openreview": "https://openreview.net/forum?id=sjJZHPV9Id", "author_site": "Qinglin Liu, Xiaoqian Lv, Quanling Meng, Zonglin Li, Xiangyuan Lan, Shuo Yang, Shengping Zhang, Liqiang Nie", "tldr": "", "abstract": "Traditional studies emphasize the significance of context information in improving matting performance. Consequently, deep learning-based matting methods delve into designing pooling or affinity-based context aggregation modules to achieve superior results. However, these modules cannot well handle the context scale shift caused by the difference in image size during training and inference, resulting in matting performance degradation. 
In this paper, we revisit the context aggregation mechanisms of matting networks and find that a basic encoder-decoder network without any context aggregation modules can actually learn more universal context aggregation, thereby achieving higher matting performance compared to existing methods. Building on this insight, we present AEMatter, a matting network that is straightforward yet very effective. AEMatter adopts a Hybrid-Transformer backbone with appearance-enhanced axis-wise learning (AEAL) blocks to build a basic network with strong context aggregation learning capability. Furthermore, AEMatter leverages a large image training strategy to assist the network in learning context aggregation from data. Extensive experiments on five popular matting datasets demonstrate that the proposed AEMatter outperforms state-of-the-art matting methods by a large margin. The source code is available at https://github.com/aipixel/AEMatter.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qinglin Liu;Xiaoqian Lv;Quanling Meng;Zonglin Li;Xiangyuan Lan;Shuo Yang;Shengping Zhang;Liqiang Nie", "authorids": "~Qinglin_Liu1;~Xiaoqian_Lv1;~Quanling_Meng1;~Zonglin_Li1;~Xiangyuan_Lan4;~Shuo_Yang5;~Shengping_Zhang1;~Liqiang_Nie2", "gender": "M;F;M;M;M;M;M;M", "homepage": ";https://me.csdn.net/weixin_40399244;https://github.com/lingtianxia123;https://lzl.sd.cn/;https://faculty.hitsz.edu.cn/yangshuo;http://homepage.hit.edu.cn/zhangshengping;https://liqiangnie.github.io/index.html;https://www.comp.hkbu.edu.hk/v1/?page=profile&id=lanxiangyuan", "dblp": "227/7900;;;;78/1102-6;60/1866;92/8277;151/8902", "google_scholar": "hsu1cSIAAAAJ;;;e3cfNyMAAAAJ;mVtxxCkAAAAJ;hMNsT8sAAAAJ;yywVMhUAAAAJ;https://scholar.google.com.hk/citations?user=c3iwWRcAAAAJ", "orcid": "0000-0002-2408-3344;;;;;;0000-0003-1476-0273;", "linkedin": "%E9%9D%92%E6%9E%97-%E6%9F%B3-a7354377/;;;;;;;", "or_profile": "~Qinglin_Liu1;~Xiaoqian_Lv1;~Quanling_Meng1;~Zonglin_Li1;~Shuo_Yang5;~Shengping_Zhang1;~Liqiang_Nie2;~xiangyuan_lan1", "aff": "Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology;University of Hong Kong;Harbin Institute of Technology;Shandong University;Pengcheng Laboratory", "aff_domain": "hit.edu.cn;hit.edu;hit.edu.cn;hit.edu.cn;hku.hk;hit.edu.cn;sdu.edu.cn;pcl.ac.cn", "position": "Lecturer;PhD student;Lecturer;PhD student;Postdoc;Full Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nliu2024revisiting,\ntitle={Revisiting Context Aggregation for Image Matting},\nauthor={Qinglin Liu and Xiaoqian Lv and Quanling Meng and Zonglin Li and Xiangyuan Lan and Shuo Yang and Shengping Zhang and Liqiang Nie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sjJZHPV9Id}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5362536, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11156479041349637275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "hit.edu.cn;hit.edu;hit.edu.cn;hit.edu.cn;hku.hk;hit.edu.cn;sdu.edu.cn;pcl.ac.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;1;0;2;3", "aff_unique_norm": "Harbin Institute of Technology;University of Hong Kong;Shandong University;Pengcheng Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.hku.hk;http://www.sdu.edu.cn;", "aff_unique_abbr": "HIT;HKU;SDU;", 
"aff_campus_unique_index": "0;0;0;0;1;0", "aff_campus_unique": "Harbin;Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Diving into Underwater: Segment Anything Model Guided Underwater Salient Instance Segmentation and A Large-scale Dataset", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32896", "id": "snhurpZt63", "proceeding": "https://proceedings.mlr.press/v235/lian24c.html", "pdf": "https://openreview.net/pdf?id=snhurpZt63", "openreview": "https://openreview.net/forum?id=snhurpZt63", "author_site": "Shijie Lian, Ziyi Zhang, Hua Li, Wenjie Li, Laurence Yang, Sam Kwong, Runmin Cong", "tldr": "", "abstract": "With the breakthrough of large models, Segment Anything Model (SAM) and its extensions have been attempted to apply in diverse tasks of computer vision. Underwater salient instance segmentation is a foundational and vital step for various underwater vision tasks, which often suffer from low segmentation accuracy due to the complex underwater circumstances and the adaptive ability of models. Moreover, the lack of large-scale datasets with pixel-level salient instance annotations has impeded the development of machine learning techniques in this field. To address these issues, we construct the first large-scale underwater salient instance segmentation dataset (USIS10K), which contains 10,632 underwater images with pixel-level annotations in 7 categories from various underwater scenes. Then, we propose an Underwater Salient Instance Segmentation architecture based on Segment Anything Model (USIS-SAM) specifically for the underwater domain. We devise an Underwater Adaptive Visual Transformer (UA-ViT) encoder to incorporate underwater domain visual prompts into the segmentation network. We further design an out-of-the-box underwater Salient Feature Prompter Generator (SFPG) to automatically generate salient prompters instead of explicitly providing foreground points or boxes as prompts in SAM. Comprehensive experimental results show that our USIS-SAM method can achieve superior performance on USIS10K datasets compared to the state-of-the-art methods. 
Datasets and codes are released on https://github.com/LiamLian0727/USIS10K.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shijie Lian;Ziyi Zhang;Hua Li;Wenjie Li;Laurence Tianruo Yang;Sam Kwong;Runmin Cong", "authorids": "~Shijie_Lian1;~Ziyi_Zhang6;~Hua_Li8;~Wenjie_Li10;~Laurence_Tianruo_Yang1;~Sam_Kwong1;~Runmin_Cong1", "gender": "M;F;;;M;M;M", "homepage": "https://github.com/LiamLian0727;https://github.com/Zerory1;;;https://scholar.google.com/citations?hl=en&user=a39Yz5cAAAAJ;https://scholars.ln.edu.hk/en/persons/sam-tak-wu-kwong;https://rmcong.github.io/", "dblp": "353/4716;;;;y/LaurenceTianruoYang;18/30;180/7852", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;;https://scholar.google.com/citations?hl=en;_PVI6EAAAAAJ;https://scholar.google.co.uk/citations?hl=en", "orcid": ";;;;0000-0002-7986-4244;0000-0001-7484-7261;0000-0003-0972-4008", "linkedin": ";;;;;;", "or_profile": "~Shijie_Lian1;~Ziyi_Zhang6;~Hua_Li8;~Wenjie_Li10;~Laurence_Tianruo_Yang1;~Sam_Kwong1;~Runmin_Cong1", "aff": "Hainan University;Hong Kong University of Science and Technology (Guangzhou);;;Huazhong University of Science and Technology;Lingnan University;Shandong University", "aff_domain": "hainu.edu.cn;hkust-gz.edu.cn;;;hust.edu.cn;ln.edu.hk;sdu.edu.cn", "position": "Undergrad student;PhD student;;;Full Professor;Chair Professor;Full Professor", "bibtex": "@inproceedings{\nlian2024diving,\ntitle={Diving into Underwater: Segment Anything Model Guided Underwater Salient Instance Segmentation and A Large-scale Dataset},\nauthor={Shijie Lian and Ziyi Zhang and Hua Li and Wenjie Li and Laurence Tianruo Yang and Sam Kwong and Runmin Cong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=snhurpZt63}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9669288, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11826659956508435871&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "hainu.edu.cn;hkust-gz.edu.cn;;;hust.edu.cn;ln.edu.hk;sdu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Hainan University;Hong Kong University of Science and Technology;Huazhong University of Science and Technology;Lingnan University;Shandong University", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.hainanu.edu.cn;https://www.ust.hk;http://www.hust.edu.cn;http://www.lingnan.edu.cn;http://www.sdu.edu.cn", "aff_unique_abbr": "HNU;HKUST;HUST;LNU;SDU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "On a Combinatorial Problem Arising in Machine Teaching", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32895", "id": "spOpHW1No2", "proceeding": "https://proceedings.mlr.press/v235/sunde24a.html", "pdf": "https://openreview.net/pdf?id=spOpHW1No2", "openreview": "https://openreview.net/forum?id=spOpHW1No2", "author_site": "Joakim Sunde, Brigt H\u00e5vardstun, Jan Kratochv\u00edl, Jan Arne Telle", "tldr": "", "abstract": "We study a model of machine teaching where the teacher mapping is constructed from a size function on both concepts and examples. The main question in machine teaching is the minimum number of examples needed for any concept, the so-called teaching dimension. 
A recent paper (Ferri et al., 2024) conjectured that the worst case for this model, as a function of the size of the concept class, occurs when the consistency matrix contains the binary representations of numbers from zero and up. In this paper we prove their conjecture. The result can be seen as a generalization of a theorem resolving the edge isoperimetry problem for hypercubes (Hart, 1976), and our proof is based on a lemma of (Graham, 1970).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joakim Sunde;Brigt H\u00e5vardstun;Jan Kratochv\u00edl;Jan Arne Telle", "authorids": "~Joakim_Sunde1;~Brigt_H\u00e5vardstun1;~Jan_Kratochv\u00edl1;jan.arne.telle@uib.no", "gender": "M;M;M;", "homepage": "https://www.uib.no/personer/Joakim.Hauger.Sunde;;https://kam.mff.cuni.cz/~honza/;", "dblp": ";;31/6569.html;", "google_scholar": ";bOfjWOUAAAAJ;AUbareUAAAAJ;", "orcid": ";;0000-0002-2620-6133;", "linkedin": ";;;", "or_profile": "~Joakim_Sunde1;~Brigt_H\u00e5vardstun1;~Jan_Kratochv\u00edl1;jan.arne.telle@uib.no", "aff": "University of Bergen;University of Bergen;Charles University Prague;", "aff_domain": "uib.no;uib.no;cuni.cz;", "position": "PhD student;PhD student;Full Professor;", "bibtex": "@inproceedings{\nsunde2024on,\ntitle={On a Combinatorial Problem Arising in Machine Teaching},\nauthor={Joakim Sunde and Brigt H{\\r{a}}vardstun and Jan Kratochv{\\'\\i}l and Jan Arne Telle},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=spOpHW1No2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 278220, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7797865243178874788&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "uib.no;uib.no;cuni.cz;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Bergen;Charles University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uib.no;https://www.cuni.cz", "aff_unique_abbr": "uib;Charles University", "aff_campus_unique_index": "1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Norway;Czech Republic" }, { "title": "Ambiguity-Aware Abductive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32894", "id": "sqv2xP8rfb", "proceeding": "https://proceedings.mlr.press/v235/he24j.html", "pdf": "https://openreview.net/pdf?id=sqv2xP8rfb", "openreview": "https://openreview.net/forum?id=sqv2xP8rfb", "author_site": "Hao-Yuan He, Hui Sun, Zheng Xie, Ming Li", "tldr": "", "abstract": "Abductive Learning (ABL) is a promising framework for integrating sub-symbolic perception and logical reasoning through abduction. In this case, the abduction process provides supervision for the perception model from the background knowledge. Nevertheless, this process naturally contains uncertainty, since the knowledge base may be satisfied by numerous potential candidates. This implies that the result of the abduction process, i.e., a set of candidates, is ambiguous; both correct and incorrect candidates are mixed in this set. The prior art of abductive learning selects the candidate that has the minimal inconsistency of the knowledge base. However, this method overlooks the ambiguity in the abduction process and is prone to error when it fails to identify the correct candidates. 
To address this, we propose Ambiguity-Aware Abductive Learning ($\\textrm{A}^3\\textrm{BL}$), which evaluates all potential candidates and their probabilities, thus preventing the model from falling into sub-optimal solutions. Both experimental results and theoretical analyses prove that $\\textrm{A}^3\\textrm{BL}$ markedly enhances ABL by efficiently exploiting the ambiguous abduced supervision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao-Yuan He;Hui Sun;Zheng Xie;Ming Li", "authorids": "~Hao-Yuan_He2;~Hui_Sun1;~Zheng_Xie1;~Ming_Li1", "gender": ";;;M", "homepage": ";;http://www.lamda.nju.edu.cn/xiez/;http://ai.nju.edu.cn/lim/", "dblp": ";;83/6764;l/MingLi5", "google_scholar": ";;;djdBwxwAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hao-Yuan_He2;~Hui_Sun1;~Zheng_Xie1;~Ming_Li1", "aff": ";;Nanjing University;Nanjing University", "aff_domain": ";;nju.edu.cn;nju.edu.cn", "position": ";;PhD student;Professor", "bibtex": "@inproceedings{\nhe2024ambiguityaware,\ntitle={Ambiguity-Aware Abductive Learning},\nauthor={Hao-Yuan He and Hui Sun and Zheng Xie and Ming Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=sqv2xP8rfb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1967915, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12270718615813747402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";;nju.edu.cn;nju.edu.cn", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "ReLUs Are Sufficient for Learning Implicit Neural Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32893", "id": "srejp9uOx7", "proceeding": "https://proceedings.mlr.press/v235/shenouda24a.html", "pdf": "https://openreview.net/pdf?id=srejp9uOx7", "openreview": "https://openreview.net/forum?id=srejp9uOx7", "author_site": "Joseph Shenouda, Yamin Zhou, Robert Nowak", "tldr": "", "abstract": "Motivated by the growing theoretical understanding of neural networks that employ the Rectified Linear Unit (ReLU) as their activation function, we revisit the use of ReLU activation functions for learning implicit neural representations (INRs). Inspired by second order B-spline wavelets, we incorporate a set of simple constraints to the ReLU neurons in each layer of a deep neural network (DNN) to remedy the spectral bias. This in turn enables its use for various INR tasks. Empirically, we demonstrate that, contrary to popular belief, one *can learn* state-of-the-art INRs based on a DNN composed of only ReLU neurons. Next, by leveraging recent theoretical works which characterize the kinds of functions ReLU neural networks learn, we provide a way to quantify the regularity of the learned function. This offers a principled approach to selecting the hyperparameters in INR architectures. We substantiate our claims through experiments in signal representation, super resolution, and computed tomography, demonstrating the versatility and effectiveness of our method. 
The code for all experiments can be found at https://github.com/joeshenouda/relu-inrs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joseph Shenouda;Yamin Zhou;Robert D Nowak", "authorids": "~Joseph_Shenouda1;~Yamin_Zhou1;~Robert_D_Nowak1", "gender": "M;F;M", "homepage": "https://joeshenouda.github.io/;;http://nowak.ece.wisc.edu", "dblp": "300/9012;;n/RobertDNowak", "google_scholar": "GslaaDUAAAAJ;;fn13u8IAAAAJ", "orcid": ";;", "linkedin": "joseph-shenouda-723231141;yamin-zhou-63832124a;", "or_profile": "~Joseph_Shenouda1;~Yamin_Zhou1;~Robert_D_Nowak1", "aff": "University of Wisconsin - Madison;University of Wisconsin - Madison;University of Wisconsin - Madison", "aff_domain": "wisc.edu;wisc.edu;", "position": "PhD student;Undergrad student;Full Professor", "bibtex": "@inproceedings{\nshenouda2024relus,\ntitle={Re{LU}s Are Sufficient for Learning Implicit Neural Representations},\nauthor={Joseph Shenouda and Yamin Zhou and Robert D Nowak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=srejp9uOx7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9605270, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14251735723025931328&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "wisc.edu;wisc.edu;", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ULAREF: A Unified Label Refinement Framework for Learning with Inaccurate Supervision", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32892", "id": "ssFMq35UUY", "proceeding": "https://proceedings.mlr.press/v235/qiao24c.html", "pdf": "https://openreview.net/pdf?id=ssFMq35UUY", "openreview": "https://openreview.net/forum?id=ssFMq35UUY", "author_site": "Congyu Qiao, Ning Xu, Yihao Hu, Xin Geng", "tldr": "", "abstract": "Learning with inaccurate supervision is often encountered in weakly supervised learning, and researchers have invested a considerable amount of time and effort in designing specialized algorithms for different forms of annotations in inaccurate supervision. In fact, different forms of these annotations share the fundamental characteristic that they all still incorporate some portion of correct labeling information. This commonality can serve as a lever, enabling the creation of a cohesive framework designed to tackle the challenges associated with various forms of annotations in learning with inaccurate supervision. In this paper, we propose a unified label refinement framework named ULAREF, i.e., a Unified LAbel REfinement Framework for learning with inaccurate supervision, which is capable of leveraging label refinement to handle inaccurate supervision. Specifically, our framework trains the predictive model with refined labels through global detection of reliability and local enhancement using an enhanced model fine-tuned by a proposed consistency loss. 
Also, we theoretically justify that the enhanced model in local enhancement can achieve higher accuracy than the predictive model on the detected unreliable set under mild assumptions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Congyu Qiao;Ning Xu;Yihao Hu;Xin Geng", "authorids": "~Congyu_Qiao3;~Ning_Xu5;~Yihao_Hu2;~Xin_Geng1", "gender": "M;M;M;M", "homepage": "http://palm.seu.edu.cn/homepage/qiaocongyu/demo/index.html;http://palm.seu.edu.cn/xuning/;;http://palm.seu.edu.cn/xgeng/index.htm", "dblp": "277/9262;04/5856-9;234/7986-4.html;", "google_scholar": ";;;ZOCxkIcAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Congyu_Qiao3;~Ning_Xu5;~Yihao_Hu2;~Xin_Geng1", "aff": "Southeast University;Southeast University;Southeast University;Southeast University, China", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "position": "PhD student;Associate Professor;MS student;Professor", "bibtex": "@inproceedings{\nqiao2024ularef,\ntitle={{ULAREF}: A Unified Label Refinement Framework for Learning with Inaccurate Supervision},\nauthor={Congyu Qiao and Ning Xu and Yihao Hu and Xin Geng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ssFMq35UUY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 535856, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NSF92_xZU2AJ:scholar.google.com/&scioq=ULAREF:+A+Unified+Label+Refinement+Framework+for+Learning+with+Inaccurate+Supervision&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn;seu.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Transferable Facial Privacy Protection against Blind Face Restoration via Domain-Consistent Adversarial Obfuscation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32891", "id": "st2BTty53v", "proceeding": "https://proceedings.mlr.press/v235/zhang24co.html", "pdf": "https://openreview.net/pdf?id=st2BTty53v", "openreview": "https://openreview.net/forum?id=st2BTty53v", "author_site": "Kui Zhang, Hang Zhou, Jie Zhang, Wenbo Zhou, Weiming Zhang, Nenghai Yu", "tldr": "", "abstract": "With the rise of social media and the proliferation of facial recognition surveillance, concerns surrounding privacy have escalated significantly. While numerous studies have concentrated on safeguarding users against unauthorized face recognition, a new and often overlooked issue has emerged due to advances in facial restoration techniques: traditional methods of facial obfuscation may no longer provide a secure shield, as they can potentially expose anonymous information to human perception. Our empirical study shows that blind face restoration (BFR) models can restore obfuscated faces with high probability by simply retraining them on obfuscated (e.g., pixelated) faces. To address it, we propose a transferable adversarial obfuscation method for privacy protection against BFR models. 
Specifically, we observed a common characteristic among BFR models, namely, their capability to approximate an inverse mapping of a transformation from a high-quality image domain to a low-quality image domain. Leveraging this shared model attribute, we have developed a domain-consistent adversarial method for generating obfuscated images. In essence, our method is designed to minimize overfitting to surrogate models during the perturbation generation process, thereby enhancing the generalization of adversarial obfuscated facial images. Extensive experiments on various BFR models demonstrate the effectiveness and transferability of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kui Zhang;Hang Zhou;Jie Zhang;Wenbo Zhou;Weiming Zhang;Nenghai Yu", "authorids": "~Kui_Zhang2;~Hang_Zhou5;~Jie_Zhang11;~Wenbo_Zhou1;~Weiming_Zhang2;~Nenghai_Yu1", "gender": "M;M;M;M;M;M", "homepage": ";https://ryanhangzhou.github.io/;https://zjzac.github.io/;http://staff.ustc.edu.cn/~welbeckz/;http://staff.ustc.edu.cn/~zhangwm/;", "dblp": ";26/3707-7;84/6889-73;;;96/5144", "google_scholar": ";YrQxT8cAAAAJ;7YkR3CoAAAAJ//;sPMWxr0AAAAJ;eTCfl6cAAAAJ;https://scholar.google.com.hk/citations?user=7620QAMAAAAJ", "orcid": ";;0000-0002-4230-1077;;0000-0001-5576-6108;", "linkedin": ";hang-zhou-a9548280/;;;;", "or_profile": "~Kui_Zhang2;~Hang_Zhou5;~Jie_Zhang11;~Wenbo_Zhou1;~Weiming_Zhang2;~Nenghai_Yu1", "aff": "University of Science and Technology of China;University of Alberta;Nanyang Technological University;University of Science and Technology of China;University of Science and Technology of China;University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ualberta.ca;ntu.edu.sg;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "position": "PhD student;Postdoc;Postdoc;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024transferable,\ntitle={Transferable Facial Privacy Protection against Blind Face Restoration via Domain-Consistent Adversarial Obfuscation},\nauthor={Kui Zhang and Hang Zhou and Jie Zhang and Wenbo Zhou and Weiming Zhang and Nenghai Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=st2BTty53v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2916333, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18119533863790678816&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ustc.edu.cn;ualberta.ca;ntu.edu.sg;ustc.edu.cn;ustc.edu.cn;ustc.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;0;0;0", "aff_unique_norm": "University of Science and Technology of China;University of Alberta;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.ualberta.ca;https://www.ntu.edu.sg", "aff_unique_abbr": "USTC;UAlberta;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0;0", "aff_country_unique": "China;Canada;Singapore" }, { "title": "Accelerated Speculative Sampling Based on Tree Monte Carlo", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32890", "id": "stMhi1Sn2G", "proceeding": "https://proceedings.mlr.press/v235/hu24f.html", "pdf": "https://openreview.net/pdf?id=stMhi1Sn2G", "openreview": "https://openreview.net/forum?id=stMhi1Sn2G", "author_site": "Zhengmian Hu, Heng Huang", "tldr": "", 
"abstract": "Speculative Sampling (SpS) has been introduced to speed up inference of large language models (LLMs) by generating multiple tokens in a single forward pass under the guidance of a reference model, while preserving the original distribution. We observe that SpS can be derived through maximum coupling on the token distribution. However, we find that this approach is not optimal as it applies maximum coupling incrementally for each new token, rather than seeking a global maximum coupling that yields a faster algorithm, given the tree-space nature of LLM generative distributions. In this paper, we shift our focus from distributions on a token space to those on a tree space. We propose a novel class of Tree Monte Carlo (TMC) methods, demonstrating their unbiasedness and convergence. As a particular instance of TMC, our new algorithm, Accelerated Speculative Sampling (ASpS), outperforms traditional SpS by generating more tokens per step on average, achieving faster inference, while maintaining the original distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengmian Hu;Heng Huang", "authorids": "~Zhengmian_Hu1;~Heng_Huang1", "gender": "M;M", "homepage": "https://www.umd.edu/;https://www.cs.umd.edu/~heng/", "dblp": "285/4945;03/281", "google_scholar": "4eXiWWgAAAAJ;4OqLaDwAAAAJ", "orcid": "0000-0003-0316-146X;", "linkedin": ";", "or_profile": "~Zhengmian_Hu1;~Heng_Huang1", "aff": "University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park", "aff_domain": "umd.edu;cs.umd.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nhu2024accelerated,\ntitle={Accelerated Speculative Sampling Based on Tree Monte Carlo},\nauthor={Zhengmian Hu and Heng Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=stMhi1Sn2G}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 442602, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9140289205118249693&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "email": "umd.edu;cs.umd.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Sparser, Better, Deeper, Stronger: Improving Static Sparse Training with Exact Orthogonal Initialization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32889", "id": "svm53KQAtN", "proceeding": "https://proceedings.mlr.press/v235/nowak24a.html", "pdf": "https://openreview.net/pdf?id=svm53KQAtN", "openreview": "https://openreview.net/forum?id=svm53KQAtN", "author_site": "Aleksandra I. Nowak, \u0141ukasz Gniecki, Filip Szatkowski, Jacek Tabor", "tldr": "", "abstract": "Static sparse training aims to train sparse models from scratch, achieving remarkable results in recent years. A key design choice is given by the sparse initialization, which determines the trainable sub-network through a binary mask. Existing methods mainly select such mask based on a predefined dense initialization. 
Such an approach may not efficiently leverage the mask's potential impact on the optimization. An alternative direction, inspired by research into dynamical isometry, is to introduce orthogonality in the sparse subnetwork, which helps in stabilizing the gradient signal. In this work, we propose Exact Orthogonal Initialization (EOI), a novel sparse orthogonal initialization scheme based on composing random Givens rotations. Contrary to other existing approaches, our method provides exact (not approximated) orthogonality and enables the creation of layers with arbitrary densities. We demonstrate the superior effectiveness and efficiency of EOI through experiments, consistently outperforming common sparse initialization techniques. Our method enables training highly sparse 1000-layer MLP and CNN networks without residual connections or normalization techniques, emphasizing the crucial role of weight initialization in static sparse training alongside sparse mask selection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aleksandra Nowak;\u0141ukasz Gniecki;Filip Szatkowski;Jacek Tabor", "authorids": "~Aleksandra_Nowak1;~\u0141ukasz_Gniecki1;~Filip_Szatkowski1;~Jacek_Tabor1", "gender": "F;M;M;M", "homepage": ";;;", "dblp": "34/10106;;323/8425;31/5172", "google_scholar": "2A-eZhQAAAAJ;;xjnAIOEAAAAJ;https://scholar.google.pl/citations?user=zSKYziUAAAAJ", "orcid": "0000-0002-2830-6613;;0000-0001-8592-2001;0000-0001-6652-7727", "linkedin": ";lukaszgniecki/;fszatkowski/;", "or_profile": "~Aleksandra_Nowak1;~\u0141ukasz_Gniecki1;~Filip_Szatkowski1;~Jacek_Tabor1", "aff": "Google;Jagiellonian University Cracow;Amazon;Jagiellonian University", "aff_domain": "google.com;uj.edu.pl;amazon.de;uj.edu.pl", "position": "Intern;MS student;Intern;Full Professor", "bibtex": "@inproceedings{\nnowak2024sparser,\ntitle={Sparser, Better, Deeper, Stronger: Improving Static Sparse Training with Exact Orthogonal Initialization},\nauthor={Aleksandra Nowak and {\\L}ukasz Gniecki and Filip Szatkowski and Jacek Tabor},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=svm53KQAtN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1022222, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3081273972719296832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "google.com;uj.edu.pl;amazon.de;uj.edu.pl", "author_num": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Google;Jagiellonian University;Amazon", "aff_unique_dep": "Google;;Amazon.com, Inc.", "aff_unique_url": "https://www.google.com;https://www.uj.edu.pl;https://www.amazon.com", "aff_unique_abbr": "Google;UJ;Amazon", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Mountain View;Cracow;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;Poland" }, { "title": "IM-3D: Iterative Multiview Diffusion and Reconstruction for High-Quality 3D Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32888", "id": "swTG6xju8O", "proceeding": "https://proceedings.mlr.press/v235/melas-kyriazi24a.html", "pdf": "https://openreview.net/pdf?id=swTG6xju8O", "openreview": "https://openreview.net/forum?id=swTG6xju8O", "author_site": "Luke Melas-Kyriazi, Iro Laina, Christian Rupprecht, Natalia Neverova, Andrea Vedaldi, Oran Gafni, Filippos Kokkinos", "tldr": "", "abstract": "Most text-to-3D 
generators build upon off-the-shelf text-to-image models trained on billions of images. They use variants of Score Distillation Sampling (SDS), which is slow, somewhat unstable, and prone to artifacts. A mitigation is to fine-tune the 2D generator to be multi-view aware, which can help distillation or can be combined with reconstruction networks to output 3D objects directly. In this paper, we further explore the design space of text-to-3D models. We significantly improve multi-view generation by considering video instead of image generators. Combined with a 3D reconstruction algorithm which, by using Gaussian splatting, can optimize a robust image-based loss, we directly produce high-quality 3D outputs from the generated views. Our new method, IM-3D, reduces the number of evaluations of the 2D generator network 10-100$\\times$, resulting in a much more efficient pipeline, better quality, fewer geometric inconsistencies, and higher yield of usable 3D assets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luke Melas-Kyriazi;Iro Laina;Christian Rupprecht;Natalia Neverova;Andrea Vedaldi;Oran Gafni;Filippos Kokkinos", "authorids": "~Luke_Melas-Kyriazi1;~Iro_Laina1;~Christian_Rupprecht1;~Natalia_Neverova1;~Andrea_Vedaldi1;~Oran_Gafni1;~Filippos_Kokkinos1", "gender": "M;;M;F;M;;M", "homepage": "https://lukemelas.github.io/;;http://chrirupp.github.io;https://nneverova.github.io/;https://www.robots.ox.ac.uk/~vedaldi/;;https://fkokkinos.github.io/", "dblp": "228/5680;;https://dblp.uni-trier.de/pid/76/744-1;119/1495;99/2825;;186/7080", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.de/citations?user=IrYlproAAAAJ;https://scholar.google.fr/citations?user=cLPaHcIAAAAJ;bRT7t28AAAAJ;;uuXQjUIAAAAJ", "orcid": ";;;;0000-0003-1374-2858;;", "linkedin": ";;;;;;", "or_profile": "~Luke_Melas-Kyriazi1;~Iro_Laina1;~Christian_Rupprecht1;~Natalia_Neverova1;~Andrea_Vedaldi1;~Oran_Gafni1;~Filippos_Kokkinos1", "aff": ";;University of Oxford;Meta GenAI;Meta;;Meta AI", "aff_domain": ";;ox.ac.uk;meta.com;meta.com;;fb.com", "position": ";;Associate Professor;Principal Researcher;Researcher;;Researcher", "bibtex": "@inproceedings{\nmelas-kyriazi2024imd,\ntitle={{IM}-3D: Iterative Multiview Diffusion and Reconstruction for High-Quality 3D Generation},\nauthor={Luke Melas-Kyriazi and Iro Laina and Christian Rupprecht and Natalia Neverova and Andrea Vedaldi and Oran Gafni and Filippos Kokkinos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=swTG6xju8O}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8911842, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11077863578841952251&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";;ox.ac.uk;meta.com;meta.com;;fb.com", "author_num": 7, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Oxford;Meta", "aff_unique_dep": ";Meta GenAI", "aff_unique_url": "https://www.ox.ac.uk;https://meta.com", "aff_unique_abbr": "Oxford;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Dirichlet Flow Matching with Applications to DNA Sequence Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32887", "id": "syXFAVqx85", "proceeding": 
"https://proceedings.mlr.press/v235/stark24b.html", "pdf": "https://openreview.net/pdf?id=syXFAVqx85", "openreview": "https://openreview.net/forum?id=syXFAVqx85", "author_site": "Hannes St\u00e4rk, Bowen Jing, Chenyu Wang, Gabriele Corso, Bonnie Berger, Regina Barzilay, Tommi Jaakkola", "tldr": "", "abstract": "Discrete diffusion or flow models could enable faster and more controllable sequence generation than autoregressive models. We show that naive linear flow matching on the simplex is insufficient toward this goal since it suffers from discontinuities in the training target and further pathologies. To overcome this, we develop Dirichlet flow matching on the simplex based on mixtures of Dirichlet distributions as probability paths. In this framework, we derive a connection between the mixtures' scores and the flow's vector field that allows for classifier and classifier-free guidance. Further, we provide distilled Dirichlet flow matching, which enables one-step sequence generation with minimal performance hits, resulting in $O(L)$ speedups compared to autoregressive models. On complex DNA sequence generation tasks, we demonstrate superior performance compared to all baselines in distributional metrics and in achieving desired design targets for generated sequences. Finally, we show that our classifier-free guidance approach improves unconditional generation and is effective for generating DNA that satisfies design targets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hannes Stark;Bowen Jing;Chenyu Wang;Gabriele Corso;Bonnie Berger;Regina Barzilay;Tommi Jaakkola", "authorids": "~Hannes_Stark1;~Bowen_Jing1;~Chenyu_Wang7;~Gabriele_Corso1;~Bonnie_Berger1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "gender": ";;;;F;female;", "homepage": ";;;https://gcorso.github.io/;https://people.csail.mit.edu/bab/;https://www.regina.csail.mit.edu/;", "dblp": ";;;262/6499;b/BonnieBerger;b/ReginaBarzilay;", "google_scholar": ";;;LUrAYgEAAAAJ;bYjKaowAAAAJ;;", "orcid": ";;;;;;", "linkedin": ";;;gcorso/;;;", "or_profile": "~Hannes_Stark1;~Bowen_Jing1;~Chenyu_Wang7;~Gabriele_Corso1;~Bonnie_Berger1;~Regina_Barzilay1;~Tommi_S._Jaakkola1", "aff": ";;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;", "aff_domain": ";;;mit.edu;mit.edu;mit.edu;", "position": ";;;PhD student;Full Professor;Professor;", "bibtex": "@inproceedings{\nstark2024dirichlet,\ntitle={Dirichlet Flow Matching with Applications to {DNA} Sequence Design},\nauthor={Hannes Stark and Bowen Jing and Chenyu Wang and Gabriele Corso and Bonnie Berger and Regina Barzilay and Tommi Jaakkola},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=syXFAVqx85}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1011081, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14716738619101432934&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": ";;;mit.edu;mit.edu;mit.edu;", "author_num": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Advancing Dynamic Sparse Training by Exploring Optimization 
Opportunities", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32886", "id": "szRHR9XGrY", "proceeding": "https://proceedings.mlr.press/v235/ji24a.html", "pdf": "https://openreview.net/pdf?id=szRHR9XGrY", "openreview": "https://openreview.net/forum?id=szRHR9XGrY", "author_site": "Jie Ji, Gen Li, Lu Yin, Minghai Qin, Geng Yuan, Linke Guo, Shiwei Liu, Xiaolong Ma", "tldr": "", "abstract": "Dynamic Sparse Training (DST) is an effective approach for addressing the substantial training resource requirements posed by the ever-increasing size of the Deep Neural Networks (DNNs). Characterized by its dynamic \"train-prune-grow'' schedule during training, DST implicitly develops a bi-level structure for training the weights while discovering a subnetwork topology. However, such a structure is consistently overlooked by the current DST algorithms for further optimization opportunities, and these algorithms, on the other hand, solely optimize the weights while determining masks heuristically. In this paper, we extensively study DST algorithms and argue that the training scheme of DST naturally forms a bi-level problem in which the updating of weight and mask is interdependent. Based on this observation, we introduce a novel efficient training framework called BiDST, which for the first time, introduces bi-level optimization methodology into dynamic sparse training domain. Unlike traditional partial-heuristic DST schemes, which suffer from sub-optimal search efficiency for masks and miss the opportunity to fully explore the topological space of neural networks, BiDST excels at discovering excellent sparse patterns by optimizing mask and weight simultaneously, resulting in maximum 2.62% higher accuracy, 2.1$\\times$ faster execution speed, and 25$\\times$ reduced overhead. 
Code available at https://github.com/jjsrf/BiDST-ICML2024.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jie Ji;Gen Li;Lu Yin;Minghai Qin;Geng Yuan;Linke Guo;Shiwei Liu;Xiaolong Ma", "authorids": "~Jie_Ji1;~Gen_Li4;~Lu_Yin1;~Minghai_Qin1;~Geng_Yuan1;~Linke_Guo2;~Shiwei_Liu2;~Xiaolong_Ma2", "gender": ";M;;M;M;M;M;M", "homepage": ";https://coulsonlee.github.io;https://luuyin.com/;https://sites.google.com/site/minghaiqin/home;;http://cecas.clemson.edu/~linkeg/index.html;https://shiweiliuiiiiiii.github.io/;https://xiaolongma2016.com", "dblp": ";28/538-12;87/2528-6;;205/3007;;234/8697-3.html;", "google_scholar": ";;G4Xe1NkAAAAJ;MSgWKbYAAAAJ;tBIAgtgAAAAJ;https://scholar.google.com/citations?hl=en;73IbXtsAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;0000-0001-9844-992X;;;0000-0003-3753-7648", "linkedin": ";;;;;;;xiaolong-ma-66b98910b/", "or_profile": "~Jie_Ji1;~Gen_Li4;~Lu_Yin1;~Minghai_Qin1;~Geng_Yuan1;~Linke_Guo2;~Shiwei_Liu2;~Xiaolong_Ma2", "aff": ";Clemson University;University of Aberdeen;Western Digital Corporation;University of Georgia;Clemson University;University of Oxford;Clemson University", "aff_domain": ";clemson.edu;abdn.ac.uk;wdc.com;uga.edu;clemson.edu;ox.ac.uk;clemson.edu", "position": ";PhD student;Assistant Professor;senior technologist;Assistant Professor;Associate Professor;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nji2024advancing,\ntitle={Advancing Dynamic Sparse Training by Exploring Optimization Opportunities},\nauthor={Jie Ji and Gen Li and Lu Yin and Minghai Qin and Geng Yuan and Linke Guo and Shiwei Liu and Xiaolong Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=szRHR9XGrY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2001573, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12967496942853996700&as_sdt=40000005&sciodt=0,22&hl=en", "gs_version_total": 6, "email": ";clemson.edu;abdn.ac.uk;wdc.com;uga.edu;clemson.edu;ox.ac.uk;clemson.edu", "author_num": 8, "aff_unique_index": "0;1;2;3;0;4;0", "aff_unique_norm": "Clemson University;University of Aberdeen;Western Digital Corporation;University of Georgia;University of Oxford", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.clemson.edu;https://www.abdn.ac.uk;https://www.westerndigital.com;https://www.uga.edu;https://www.ox.ac.uk", "aff_unique_abbr": "Clemson;Aberdeen;WDC;UGA;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "DataFreeShield: Defending Adversarial Attacks without Training Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32885", "id": "szvKJgmubh", "proceeding": "https://proceedings.mlr.press/v235/lee24f.html", "pdf": "https://openreview.net/pdf?id=szvKJgmubh", "openreview": "https://openreview.net/forum?id=szvKJgmubh", "author_site": "Hyeyoon Lee, Kanghyun Choi, Dain Kwon, SunJong Park, Mayoore Jaiswal, Noseong Park, Jonghyun Choi, Jinho Lee", "tldr": "", "abstract": "Recent advances in adversarial robustness rely on an abundant set of training data, where using external or additional datasets has become a common setting. However, in real life, the training data is often kept private for security and privacy issues, while only the pretrained weight is available to the public. 
In such scenarios, existing methods that assume accessibility to the original data become inapplicable. Thus we investigate the pivotal problem of data-free adversarial robustness, where we try to achieve adversarial robustness without accessing any real data. Through a preliminary study, we highlight the severity of the problem by showing that robustness without the original dataset is difficult to achieve, even with similar domain datasets. To address this issue, we propose DataFreeShield, which tackles the problem from two perspectives: surrogate dataset generation and adversarial training using the generated data. Through extensive validation, we show that DataFreeShield outperforms baselines, demonstrating that the proposed method sets the first entirely data-free solution for the adversarial robustness problem.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyeyoon Lee;Kanghyun Choi;Dain Kwon;SunJong Park;Mayoore Selvarasa Jaiswal;Noseong Park;Jonghyun Choi;Jinho Lee", "authorids": "~Hyeyoon_Lee1;~Kanghyun_Choi1;~Dain_Kwon1;~SunJong_Park1;~Mayoore_Selvarasa_Jaiswal1;~Noseong_Park1;~Jonghyun_Choi1;~Jinho_Lee2", "gender": ";M;F;;F;;M;M", "homepage": "https://aisys.snu.ac.kr/members/HyeyoonLee.html;https://aisys.snu.ac.kr/kanghyun.html;https://github.com/meowrowan;;mayoore.github.io;;https://ppolon.github.io/;http://acsys.snu.ac.kr/people.html", "dblp": "276/0074;229/7353;380/6008;;http://dblp.uni-trier.de/pers/hd/j/Jaiswal:Mayoore_S=;;21/11103;", "google_scholar": "lYXg5nsAAAAJ;n9e6qnsAAAAJ;;;IcMxiP4AAAAJ;;uiGWnm4AAAAJ;https://scholar.google.com/citations?hl=ko", "orcid": ";;;;;;0000-0002-7934-8434;", "linkedin": ";;;;;;jonghyun-choi-459bb615/;", "or_profile": "~Hyeyoon_Lee1;~Kanghyun_Choi1;~Dain_Kwon1;~SunJong_Park1;~Mayoore_Selvarasa_Jaiswal1;~Noseong_Park1;~Jonghyun_Choi1;~Jinho_Lee2", "aff": "Seoul National University;Seoul National University;Yonsei University;;University of Washington;;Yonsei University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;yonsei.ac.kr;; ;;yonsei.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;Undergrad student;;Graduate Student;;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nlee2024datafreeshield,\ntitle={DataFreeShield: Defending Adversarial Attacks without Training Data},\nauthor={Hyeyoon Lee and Kanghyun Choi and Dain Kwon and SunJong Park and Mayoore Selvarasa Jaiswal and Noseong Park and Jonghyun Choi and Jinho Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=szvKJgmubh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9625516, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16114210221793948933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "snu.ac.kr;snu.ac.kr;yonsei.ac.kr;; ;;yonsei.ac.kr;snu.ac.kr", "author_num": 8, "aff_unique_index": "0;0;1;2;1;0", "aff_unique_norm": "Seoul National University;Yonsei University;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://www.snu.ac.kr;https://www.yonsei.ac.kr;https://www.washington.edu", "aff_unique_abbr": "SNU;Yonsei;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Surface-VQMAE: Vector-quantized Masked Auto-encoders on Molecular Surfaces", "status": "Poster", "track": 
"main", "site": "https://icml.cc/virtual/2024/poster/32884", "id": "szxtVHOh0C", "proceeding": "https://proceedings.mlr.press/v235/wu24o.html", "pdf": "https://openreview.net/pdf?id=szxtVHOh0C", "openreview": "https://openreview.net/forum?id=szxtVHOh0C", "author_site": "Fang Wu, Stan Z Li", "tldr": "", "abstract": "Molecular surfaces imply fingerprints of interaction patterns between proteins. However, non-equivalent efforts have been paid to incorporating the abundant protein surface information for analyzing proteins' biological functions in juxtaposition to amino acid sequences and 3D structures. We propose a novel surface-based unsupervised learning algorithm termed Surface-VQMAE to overcome this obstacle. In light of surface point clouds' sparsity and disorder properties, we first partition them into patches and obtain the sequential arrangement via the Morton curve. Successively, a Transformer-based architecture named SurfFormer was introduced to integrate the surface geometry and capture patch-level relations. At last, we enhance the prevalent masked auto-encoder (MAE) with the vector quantization (VQ) technique, which establishes a surface pattern codebook to enforce a discrete posterior distribution of latent variables and achieve more condensed semantics. Our work is the foremost to implement pretraining purely on molecular surfaces and extensive experiments on diverse real-life scenarios including binding site scoring, binding affinity prediction, and mutant effect estimation demonstrate its effectiveness. The code is available at https://github.com/smiles724/VQMAE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fang Wu;Stan Z. Li", "authorids": "~Fang_Wu1;~Stan_Z._Li2", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": "~Fang_Wu1;~Stan_Z._Li2", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwu2024surfacevqmae,\ntitle={Surface-{VQMAE}: Vector-quantized Masked Auto-encoders on Molecular Surfaces},\nauthor={Fang Wu and Stan Z. Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=szxtVHOh0C}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3745398, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12275628384116385621&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "email": ";", "author_num": 2 }, { "title": "Coprocessor Actor Critic: A Model-Based Reinforcement Learning Approach For Adaptive Brain Stimulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32883", "id": "t3SEfoTaYQ", "proceeding": "https://proceedings.mlr.press/v235/pan24g.html", "pdf": "https://openreview.net/pdf?id=t3SEfoTaYQ", "openreview": "https://openreview.net/forum?id=t3SEfoTaYQ", "author_site": "Michelle Pan, Mariah Schrum, Vivek Myers, Erdem Biyik, Anca Dragan", "tldr": "", "abstract": "Adaptive brain stimulation can treat neurological conditions such as Parkinson\u2019s disease and post-stroke motor deficits by influencing abnormal neural activity. Because of patient heterogeneity, each patient requires a unique stimulation policy to achieve optimal neural responses. 
Model-free reinforcement learning (MFRL) holds promise in learning effective policies for a variety of similar control tasks, but is limited in domains like brain stimulation by a need for numerous costly environment interactions. In this work we introduce Coprocessor Actor Critic, a novel, model-based reinforcement learning (MBRL) approach for learning neural coprocessor policies for brain stimulation. Our key insight is that coprocessor policy learning is a combination of learning how to act optimally in the world and learning how to induce optimal actions in the world through stimulation of an injured brain. We show that our approach overcomes the limitations of traditional MFRL methods in terms of sample efficiency and task success and outperforms baseline MBRL approaches in a neurologically realistic model of an injured brain.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michelle Pan;Mariah L Schrum;Vivek Myers;Erdem Biyik;Anca Dragan", "authorids": "~Michelle_Pan1;~Mariah_L_Schrum1;~Vivek_Myers1;~Erdem_Biyik1;~Anca_Dragan1", "gender": "F;F;;M;F", "homepage": "https://michelllepan.github.io/;;https://people.eecs.berkeley.edu/~vmyers/;http://people.eecs.berkeley.edu/~ebiyik/;http://www.ancadragan.com/", "dblp": ";237/8619;270/8694;194/2736;", "google_scholar": ";QuzrQzIAAAAJ;5NGAbT4AAAAJ;https://scholar.google.com.tr/citations?user=P-G3sjYAAAAJ;", "orcid": ";;;0000-0002-9516-3130;", "linkedin": ";;;https://linkedin.com/in/ebiyik;", "or_profile": "~Michelle_Pan1;~Mariah_L_Schrum1;~Vivek_Myers1;~Erdem_Biyik1;~Anca_Dragan1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of Southern California;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;usc.edu;berkeley.edu", "position": "Undergrad student;Postdoc;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\npan2024coprocessor,\ntitle={Coprocessor Actor Critic: A Model-Based Reinforcement Learning Approach For Adaptive Brain Stimulation},\nauthor={Michelle Pan and Mariah L Schrum and Vivek Myers and Erdem Biyik and Anca Dragan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t3SEfoTaYQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1894650, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RSxybYX7MFwJ:scholar.google.com/&scioq=Coprocessor+Actor+Critic:+A+Model-Based+Reinforcement+Learning+Approach+For+Adaptive+Brain+Stimulation&hl=en&as_sdt=0,14", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;berkeley.edu;usc.edu;berkeley.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of California, Berkeley;University of Southern California", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.usc.edu", "aff_unique_abbr": "UC Berkeley;USC", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Berkeley;Los Angeles", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Compositional Few-Shot Class-Incremental Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32882", "id": "t4908PyZxs", "proceeding": "https://proceedings.mlr.press/v235/zou24c.html", "pdf": "https://openreview.net/pdf?id=t4908PyZxs", "openreview": 
"https://openreview.net/forum?id=t4908PyZxs", "author_site": "Yixiong Zou, Shanghang Zhang, haichen zhou, Yuhua Li, Ruixuan Li", "tldr": "", "abstract": "Few-shot class-incremental learning (FSCIL) is proposed to continually learn from novel classes with only a few samples after the (pre-)training on base classes with sufficient data. However, this remains a challenge. In contrast, humans can easily recognize novel classes with a few samples. Cognitive science demonstrates that an important component of such human capability is compositional learning. This involves identifying visual primitives from learned knowledge and then composing new concepts using these transferred primitives, making incremental learning both effective and interpretable. To imitate human compositional learning, we propose a cognitive-inspired method for the FSCIL task. We define and build a compositional model based on set similarities, and then equip it with a primitive composition module and a primitive reuse module. In the primitive composition module, we propose to utilize the Centered Kernel Alignment (CKA) similarity to approximate the similarity between primitive sets, allowing the training and evaluation based on primitive compositions. In the primitive reuse module, we enhance primitive reusability by classifying inputs based on primitives replaced with the closest primitives from other classes. Experiments on three datasets validate our method, showing it outperforms current state-of-the-art methods with improved interpretability. Our code is available at https://github.com/Zoilsen/Comp-FSCIL.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yixiong Zou;Shanghang Zhang;haichen zhou;Yuhua Li;Ruixuan Li", "authorids": "~Yixiong_Zou1;~Shanghang_Zhang4;~haichen_zhou1;~Yuhua_Li2;~Ruixuan_Li1", "gender": ";;F;F;M", "homepage": ";;;;http://idc.hust.edu.cn/rxli/index.html", "dblp": ";;;79/5796-3;60/4429.html", "google_scholar": ";;;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/scholar?q=ruixuan+li", "orcid": ";;;;0000-0002-7791-5511", "linkedin": ";;https://www.linkedin.cn/incareer/in/ACoAADA9JaUBCa1OOF323CPHE4tmxobsbY30ghk;;https://www.linkedin.cn/incareer/in/ruixuan-li-b367319", "or_profile": "~Yixiong_Zou1;~Shanghang_Zhang4;~haichen_zhou1;~Yuhua_Li2;~Ruixuan_Li1", "aff": ";;Huazhong University of Science and Technology;Huazhong University of Science and Technology;Huazhong University of Science and Technology", "aff_domain": ";;hust.edu.cn;hust.edu.cn;hust.edu.cn", "position": ";;MS student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nzou2024compositional,\ntitle={Compositional Few-Shot Class-Incremental Learning},\nauthor={Yixiong Zou and Shanghang Zhang and haichen zhou and Yuhua Li and Ruixuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t4908PyZxs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5897431, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15749239537694332098&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";;hust.edu.cn;hust.edu.cn;hust.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Huazhong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hust.edu.cn", "aff_unique_abbr": "HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "TimeX++: Learning Time-Series Explanations with Information Bottleneck", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32881", "id": "t6dBpwkbea", "proceeding": "https://proceedings.mlr.press/v235/liu24bl.html", "pdf": "https://openreview.net/pdf?id=t6dBpwkbea", "openreview": "https://openreview.net/forum?id=t6dBpwkbea", "author_site": "Zichuan Liu, Tianchun Wang, Jimeng Shi, Xu Zheng, Zhuomin Chen, Lei Song, Wenqian Dong, Jayantha Obeysekera, Farhad Shirani, Dongsheng Luo", "tldr": "", "abstract": "Explaining deep learning models operating on time series data is crucial in various applications of interest which require interpretable and transparent insights from time series signals. In this work, we investigate this problem from an information theoretic perspective and show that most existing measures of explainability may suffer from trivial solutions and distributional shift issues. To address these issues, we introduce a simple yet practical objective function for time series explainable learning. The design of the objective function builds upon the principle of information bottleneck (IB), and modifies the IB objective function to avoid trivial solutions and distributional shift issues. We further present TimeX++, a novel explanation framework that leverages a parametric network to produce explanation-embedded instances that are both in-distributed and label-preserving. We evaluate TimeX++ on both synthetic and real-world datasets comparing its performance against leading baselines, and validate its practical efficacy through case studies in a real-world environmental application. Quantitative and qualitative evaluations show that TimeX++ outperforms baselines across all datasets, demonstrating a substantial improvement in explanation quality for time series data. 
The source code is available at https://github.com/zichuan-liu/TimeXplusplus.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zichuan Liu;Tianchun Wang;Jimeng Shi;Xu Zheng;Zhuomin Chen;Lei Song;Wenqian Dong;Jayantha Obeysekera;Farhad Shirani;Dongsheng Luo", "authorids": "~Zichuan_Liu3;~Tianchun_Wang1;~Jimeng_Shi1;~Xu_Zheng3;~Zhuomin_Chen1;~Lei_Song3;~Wenqian_Dong2;jobeysek@fiu.edu;~Farhad_Shirani1;~Dongsheng_Luo1", "gender": ";M;M;;;M;F;;M;M", "homepage": "https://zichuan-liu.github.io/;;https://jimengshi.github.io/about/;;;;http://wenqiandong.com/;;;https://users.cs.fiu.edu/~dluo/", "dblp": ";153/5231.html;292/8195.html;;;76/893-1.html;;;331/0634;", "google_scholar": "SUvzKxwAAAAJ;8su8b60AAAAJ;TLu9SFQAAAAJ;;;pXDSOocAAAAJ;6qIVck4AAAAJ;;M1Ab5mEAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;0009-0002-7268-0431;;;;;;;0000-0003-4192-0826", "linkedin": ";;jimengshi/;;;;;;farhad-shirani-48b63349/;", "or_profile": "~Zichuan_Liu3;~Tianchun_Wang1;~Jimeng_Shi1;~Xu_Zheng3;~Zhuomin_Chen1;~Lei_Song3;~Wenqian_Dong2;jobeysek@fiu.edu;~Farhad_Shirani1;~Dongsheng_Luo1", "aff": "Nanjing University;Pennsylvania State University;Florida International University;;;Microsoft;Florida International University;;Florida International University;Florida International University", "aff_domain": "nju.edu.cn;psu.edu;fiu.edu;;;microsoft.com;fiu.edu;;cis.fiu.edu;fiu.edu", "position": "MS student;PhD student;PhD student;;;Principal Researcher;Assistant Professor;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2024timex,\ntitle={TimeX++: Learning Time-Series Explanations with Information Bottleneck},\nauthor={Zichuan Liu and Tianchun Wang and Jimeng Shi and Xu Zheng and Zhuomin Chen and Lei Song and Wenqian Dong and Jayantha Obeysekera and Farhad Shirani and Dongsheng Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t6dBpwkbea}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2111516, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16896866786921731848&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "nju.edu.cn;psu.edu;fiu.edu;;;microsoft.com;fiu.edu;;cis.fiu.edu;fiu.edu", "author_num": 10, "aff_unique_index": "0;1;2;3;2;2;2", "aff_unique_norm": "Nanjing University;Pennsylvania State University;Florida International University;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://www.nju.edu.cn;https://www.psu.edu;https://www.fiu.edu;https://www.microsoft.com", "aff_unique_abbr": "Nanjing U;PSU;FIU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Training Large Language Models for Reasoning through Reverse Curriculum Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32880", "id": "t82Y3fmRtk", "proceeding": "https://proceedings.mlr.press/v235/xi24a.html", "pdf": "https://openreview.net/pdf?id=t82Y3fmRtk", "openreview": "https://openreview.net/forum?id=t82Y3fmRtk", "author_site": "Zhiheng Xi, Wenxiang Chen, Boyang Hong, Senjie Jin, Rui Zheng, Wei He, Yiwen Ding, Shichun Liu, Xin Guo, Junzhe Wang, Honglin Guo, Wei Shen, Xiaoran Fan, Yuhao Zhou, Shihan Dou, Xiao Wang, Xinbo Zhang, Peng Sun, Tao Gui, Qi Zhang, Xuanjing Huang", "tldr": "", 
"abstract": "In this paper, we propose **R**$^3$: Learning **R**easoning through **R**everse Curriculum **R**einforcement Learning (RL), a novel method that employs only outcome supervision to achieve the benefits of process supervision for large language models. The core challenge in applying RL to complex reasoning is to identify a sequence of actions that result in positive rewards and provide appropriate supervision for optimization. Outcome supervision provides sparse rewards for final results without identifying error locations, whereas process supervision offers step-wise rewards but requires extensive manual annotation. **R**$^3$ overcomes these limitations by learning from correct demonstrations. Specifically, **R**$^3$ progressively slides the start state of reasoning from a demonstration's end to its beginning, facilitating easier model exploration at all stages. Thus, **R**$^3$ establishes a step-wise curriculum, allowing outcome supervision to offer step-level signals and precisely pinpoint errors. Using Llama2-7B, our method surpasses RL baseline on eight reasoning tasks by $4.1$ points on average. Notably, in program-based reasoning, 7B-scale models perform comparably to larger models or closed-source models with our **R**$^3$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiheng Xi;Wenxiang Chen;Boyang Hong;Senjie Jin;Rui Zheng;Wei He;Yiwen Ding;Shichun Liu;Xin Guo;Junzhe Wang;Honglin Guo;Wei Shen;Xiaoran Fan;Yuhao Zhou;Shihan Dou;Xiao Wang;Xinbo Zhang;peng sun;Tao Gui;Qi Zhang;Xuanjing Huang", "authorids": "~Zhiheng_Xi1;~Wenxiang_Chen3;~Boyang_Hong1;~Senjie_Jin1;~Rui_Zheng1;~Wei_He14;~Yiwen_Ding3;~Shichun_Liu2;~Xin_Guo13;~Junzhe_Wang2;~Honglin_Guo1;~Wei_Shen12;~Xiaoran_Fan3;~Yuhao_Zhou3;~Shihan_Dou1;~Xiao_Wang12;~Xinbo_Zhang1;~peng_sun1;~Tao_Gui1;~Qi_Zhang8;~Xuanjing_Huang1", "gender": ";M;M;M;M;M;F;;F;;M;;M;M;;M;F;M;M;M;F", "homepage": "https://woooodyy.github.io/;https://github.com/chenwxOggai;https://scholar.google.com/citations?user=po9bNwsAAAAJ&hl=zh-CN;;https://github.com/ruizheng20;https://hwcoder.top/about;;;https://github.com/XinGuo2002;;https://github.com/KYLN24;http://github.com/fakerbaby;;https://ciaran.top;;https://xiaowangnlp.github.io/;;http://pengsun.github.io;;http://qizhang.info;https://xuanjing-huang.github.io/", "dblp": "333/4268;;;348/5674.html;;;;;;;60/10205;;197/0141;;;;;;135/6973;52/323-1;05/6735-1", "google_scholar": "https://scholar.google.com.hk/citations?user=zSVLkqAAAAAJ;IhrlxrYAAAAJ;po9bNwsAAAAJ;https://scholar.google.com.hk/citations?user=kMP_SiUAAAAJ;https://scholar.google.com.hk/citations?user=7Z0V_SoAAAAJ;;;;;;HrYYk4YAAAAJ;-DlGT8IAAAAJ;https://scholar.google.com/citations?hl=zh-CN;qHHExWgAAAAJ;;https://scholar.google.com.hk/citations?hl=zh-CN;;;;XfqR3yYAAAAJ;RGsMgZA4H78C", "orcid": ";;;;;;0009-0003-0670-9221;;;;0000-0003-0361-2689;;;;;;;;;;0000-0001-9197-9426", "linkedin": ";;;;;;;;;;;;;;;;;;;;", "or_profile": "~Zhiheng_Xi1;~Wenxiang_Chen3;~Boyang_Hong1;~Senjie_Jin1;~Rui_Zheng1;~Wei_He14;~Yiwen_Ding3;~Shichun_Liu2;~Xin_Guo13;~Junzhe_Wang2;~Honglin_Guo1;~Wei_Shen12;~Xiaoran_Fan3;~Yuhao_Zhou3;~Shihan_Dou1;~Xiao_Wang12;~Xinbo_Zhang1;~peng_sun1;~Tao_Gui1;~Qi_Zhang8;~Xuanjing_Huang1", "aff": "Fudan University;Fudan University;Fudan University;Fudan University;Fudan University;Fudan University;Fudan University;;Fudan University;;Fudan University;Fudan University;Fudan University;Fudan University;;Fudan University;ByteDance;ByteDance;Fudan University;Fudan University;Fudan University", "aff_domain": 
"fudan.edu.cn;fudan.edu;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;bytedance.com;bytedance.com;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;MS student;MS student;MS student;PhD student;MS student;MS student;;Undergrad student;;Undergrad student;MS student;PhD student;MS student;;PhD student;Researcher;Researcher;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nxi2024training,\ntitle={Training Large Language Models for Reasoning through Reverse Curriculum Reinforcement Learning},\nauthor={Zhiheng Xi and Wenxiang Chen and Boyang Hong and Senjie Jin and Rui Zheng and Wei He and Yiwen Ding and Shichun Liu and Xin Guo and Junzhe Wang and Honglin Guo and Wei Shen and Xiaoran Fan and Yuhao Zhou and Shihan Dou and Xiao Wang and Xinbo Zhang and peng sun and Tao Gui and Qi Zhang and Xuanjing Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t82Y3fmRtk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1009296, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 21, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11826655664877514813&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "fudan.edu.cn;fudan.edu;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;;fudan.edu.cn;bytedance.com;bytedance.com;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "author_num": 21, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;1;1;0;0;0", "aff_unique_norm": "Fudan University;ByteDance", "aff_unique_dep": ";", "aff_unique_url": "https://www.fudan.edu.cn;https://www.bytedance.com", "aff_unique_abbr": "Fudan;ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "The Computational Complexity of Finding Second-Order Stationary Points", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32879", "id": "t8WDBcegae", "proceeding": "https://proceedings.mlr.press/v235/kontogiannis24a.html", "pdf": "https://openreview.net/pdf?id=t8WDBcegae", "openreview": "https://openreview.net/forum?id=t8WDBcegae", "author_site": "Andreas Kontogiannis, Vasilis Pollatos, Sotiris Kanellopoulos, Panayotis Mertikopoulos, Aris Pagourtzis, Ioannis Panageas", "tldr": "", "abstract": "Non-convex minimization problems are universally considered hard, and even guaranteeing that a computed solution is locally minimizing is known to be NP-hard. In this general context, our paper focuses on the problem of finding stationary points that satisfy an approximate second-order optimality condition, which serves to exclude strict saddles and other non-minimizing stationary points. Our main result is that the problem of finding approximate second-order stationary points (SOSPs) is PLS-complete, i.e., of the same complexity as the problem of finding first-order stationary points (FOSPs), thus resolving an open question in the field. In particular, our results imply that, under the widely believed complexity conjecture that PLS $\\neq$ FNP, finding approximate SOSPs in unconstrained domains is *easier* than in constrained domains, which is known to be NP-hard. 
This comes in stark contrast with earlier results which implied that, unless PLS = CLS, finding approximate FOSPs in unconstrained domains is *harder* than in constrained domains.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andreas Kontogiannis;Vasilis Pollatos;Sotiris Kanellopoulos;Panayotis Mertikopoulos;Aris Pagourtzis;Ioannis Panageas", "authorids": "~Andreas_Kontogiannis1;vpollato@mpi-sws.org;~Sotiris_Kanellopoulos1;~Panayotis_Mertikopoulos1;~Aris_Pagourtzis1;~Ioannis_Panageas1", "gender": "M;;M;M;M;M", "homepage": ";;;http://polaris.imag.fr/panayotis.mertikopoulos/;https://www.ece.ntua.gr/en/staff/79;https://panageas.github.io", "dblp": "309/6039;;;49/6721;;139/3829", "google_scholar": "https://scholar.google.nl/citations?hl=en;;;xsusqPYAAAAJ;;5NiFWuwAAAAJ", "orcid": ";;;0000-0003-2026-9616;;", "linkedin": "andreas-kontogiannis-2405a3176/;;sotiris-kanellopoulos/;;;", "or_profile": "~Andreas_Kontogiannis1;vpollato@mpi-sws.org;~Sotiris_Kanellopoulos1;~Panayotis_Mertikopoulos1;~Aris_Pagourtzis1;~Ioannis_Panageas1", "aff": "Archimedes AI;;National Technical University of Athens;French National Center for Scientific Research;National Technical University of Athens;Donald Bren School of Information and Computer Sciences, University of California, Irvine", "aff_domain": "athenarc.gr;;ntua.gr;imag.fr;ntua.gr;ics.uci.edu", "position": "PhD student;;PhD student;Principal Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nkontogiannis2024the,\ntitle={The Computational Complexity of Finding Second-Order Stationary Points},\nauthor={Andreas Kontogiannis and Vasilis Pollatos and Sotiris Kanellopoulos and Panayotis Mertikopoulos and Aris Pagourtzis and Ioannis Panageas},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t8WDBcegae}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5244918, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3282376234023636824&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "email": "athenarc.gr;;ntua.gr;imag.fr;ntua.gr;ics.uci.edu", "author_num": 6, "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "Archimedes AI;National Technical University of Athens;French National Center for Scientific Research;University of California, Irvine", "aff_unique_dep": ";;;Donald Bren School of Information and Computer Sciences", "aff_unique_url": "https://www.archimedes.ai;https://www.ntua.gr;https://www.cnrs.fr;https://www.uci.edu", "aff_unique_abbr": "Archimedes AI;NTUA;CNRS;UCI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "United States;Greece;France" }, { "title": "Larimar: Large Language Models with Episodic Memory Control", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32878", "id": "t8mt4YrPsq", "proceeding": "https://proceedings.mlr.press/v235/das24a.html", "pdf": "https://openreview.net/pdf?id=t8mt4YrPsq", "openreview": "https://openreview.net/forum?id=t8mt4YrPsq", "author_site": "Payel Das, Subhajit Chaudhury, Elliot Nelson, Igor Melnyk, Sarath Swaminathan, Sophie Dai, Aurelie Lozano, Georgios Kollias, Vijil Chenthamarakshan, Jiri Navratil, Soham Dan, Pin-Yu Chen", "tldr": "", "abstract": "Efficient and accurate updating of knowledge stored in Large Language Models (LLMs) is one of the most pressing research 
challenges today. This paper presents Larimar---a novel, brain-inspired architecture for enhancing LLMs with a distributed episodic memory. Larimar's memory allows for dynamic, one-shot updates of knowledge without the need for computationally expensive re-training or fine-tuning. Experimental results on multiple fact editing benchmarks demonstrate that Larimar not only attains accuracy comparable to the most competitive baselines, even in the challenging sequential editing setup, but also excels in speed---yielding speed-ups of 8-10x depending on the base LLM---as well as flexibility due to the proposed architecture being simple, LLM-agnostic, and hence general. We further provide mechanisms for selective fact forgetting, information leakage prevention, and input context length generalization with Larimar and show their effectiveness. Our code is available at https://github.com/IBM/larimar.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Payel Das;Subhajit Chaudhury;Elliot Nelson;Igor Melnyk;Sarathkrishna Swaminathan;Sihui Dai;Aurelie Lozano;Georgios Kollias;Vijil Chenthamarakshan;Jiri Navratil;Soham Dan;Pin-Yu Chen", "authorids": "~Payel_Das1;~Subhajit_Chaudhury1;~Elliot_Nelson1;~Igor_Melnyk1;~Sarathkrishna_Swaminathan1;~Sihui_Dai1;~Aurelie_Lozano1;~Georgios_Kollias1;~Vijil_Chenthamarakshan1;~Jiri_Navratil1;~Soham_Dan1;~Pin-Yu_Chen1", "gender": "F;M;;M;M;F;F;;M;;M;M", "homepage": ";https://subhajitchaudhury.github.io/;;https://imelnyk.github.io/;https://scholar.google.com/citations?hl=en&user=LIHU5U8AAAAJ;;https://research.ibm.com/people/aurelie-lozano;;https://researcher.watson.ibm.com/researcher/view.php?person=us-ecvijil;https://researcher.watson.ibm.com/researcher/view.php?person=us-jiri;https://sdan2.github.io/;http://www.pinyuchen.com", "dblp": "56/7926;http://dblp2.uni-trier.de/pers/hd/c/Chaudhury:Subhajit;323/9332;;177/8796.html;244/9642;06/274;;;00/680-1.html;181/9448;39/8969", "google_scholar": ";https://scholar.google.co.jp/citations?user=EBTpFrQAAAAJ;YBvdBOkAAAAJ;4vDRTWwAAAAJ;https://scholar.google.com/citations?hl=en;;4wTGaDsAAAAJ;;g9hboJ0AAAAJ;H41S5AgAAAAJ;nOsmu8UAAAAJ;jxwlCUUAAAAJ", "orcid": ";;;;;;;;;0009-0007-5230-7679;;0000-0003-1039-8369", "linkedin": ";subhajit-chaudhury-24955455/;elliot-nelson-18295377/;;;;;;;jiri-navratil-62641497/;;pin-yu-chen-940062a2", "or_profile": "~Payel_Das1;~Subhajit_Chaudhury1;~Elliot_Nelson1;~Igor_Melnyk1;~Sarathkrishna_Swaminathan1;~Sihui_Dai1;~Aurelie_Lozano1;~Georgios_Kollias1;~Vijil_Chenthamarakshan1;~Jiri_Navratil1;~Soham_Dan1;~Pin-Yu_Chen1", "aff": "IBM, International Business Machines;International Business Machines;International Business Machines;International Business Machines;International Business Machines;International Business Machines;IBM Research;;International Business Machines;International Business Machines;International Business Machines;International Business Machines", "aff_domain": "us.ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;us.ibm.com;;ibm.com;ibm.com;ibm.com;ibm.com", "position": "Principal Researcher;Research Scientist;Researcher;Researcher;Senior Research Scientist;Intern;Principal Researcher;;Senior Technical Staff member;Principal Research Staff Member;Researcher;Principal Researcher", "bibtex": "@inproceedings{\ndas2024larimar,\ntitle={Larimar: Large Language Models with Episodic Memory Control},\nauthor={Payel Das and Subhajit Chaudhury and Elliot Nelson and Igor Melnyk and Sarathkrishna Swaminathan and Sihui Dai and Aurelie Lozano and Georgios Kollias and Vijil Chenthamarakshan and 
Jiri Navratil and Soham Dan and Pin-Yu Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t8mt4YrPsq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1383025, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4576011797206411025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "us.ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;ibm.com;us.ibm.com;;ibm.com;ibm.com;ibm.com;ibm.com", "author_num": 12, "aff_unique_index": "0;1;1;1;1;1;2;1;1;1;1", "aff_unique_norm": "International Business Machines;International Business Machines Corporation;IBM", "aff_unique_dep": ";;IBM Research", "aff_unique_url": "https://www.ibm.com;https://www.ibm.com;https://www.ibm.com/research", "aff_unique_abbr": "IBM;IBM;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Task-aware Orthogonal Sparse Network for Exploring Shared Knowledge in Continual Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32877", "id": "tABvuya05B", "proceeding": "https://proceedings.mlr.press/v235/hu24b.html", "pdf": "https://openreview.net/pdf?id=tABvuya05B", "openreview": "https://openreview.net/forum?id=tABvuya05B", "author_site": "Yusong Hu, De Cheng, Dingwen Zhang, Nannan Wang, Tongliang Liu, Xinbo Gao", "tldr": "", "abstract": "Continual learning (CL) aims to learn from sequentially arriving tasks without catastrophic forgetting (CF). By partitioning the network into two parts based on the Lottery Ticket Hypothesis---one for holding the knowledge of the old tasks while the other for learning the knowledge of the new task---recent work has achieved forget-free CL. Although addressing the CF issue well, such methods would encounter serious under-fitting in long-term CL, in which the learning process will continue for a long time and the number of new tasks involved will be much higher. To solve this problem, this paper partitions the network into three parts---with a new part for exploring the knowledge sharing between the old and new tasks. With the shared knowledge, this part of the network can be learnt to simultaneously consolidate the old tasks and fit the new task. To achieve this goal, we propose a task-aware **Orthogonal Sparse Network** (OSN), which consists of shared-knowledge-induced network partition and sharpness-aware orthogonal sparse network learning. The former partitions the network to select shared parameters, while the latter guides the exploration of shared knowledge through shared parameters.
Qualitative and quantitative analyses show that the proposed OSN induces minimal to no interference with past tasks, *i.e.*, approximately no forgetting, while greatly improving model plasticity and capacity, and finally achieves state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yusong Hu;De Cheng;Dingwen Zhang;Nannan Wang;Tongliang Liu;Xinbo Gao", "authorids": "~Yusong_Hu2;~De_Cheng3;~Dingwen_Zhang1;~Nannan_Wang1;~Tongliang_Liu1;~Xinbo_Gao5", "gender": "M;M;M;M;M;M", "homepage": "https://github.com/hysxdu;https://web.xidian.edu.cn/dcheng/index.html;;https://tongliang-liu.github.io/;https://faculty.cqupt.edu.cn/gaoxinbo/zh_CN/index.htm;https://zdw-nwpu.github.io/dingwenz.github.com/", "dblp": ";154/1991/;10/8359-1;150/6667;;150/6620", "google_scholar": ";180lASkAAAAJ;SRBn7oUAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";;;;0000-0002-7985-0037;", "linkedin": ";;;;xinbo-gao-151a2224/;", "or_profile": "~Yusong_Hu2;~De_Cheng3;~Nannan_Wang1;~Tongliang_Liu1;~Xinbo_Gao5;~Dingwen_Zhang2", "aff": "Xi'an University of Electronic Science and Technology;Xidian University;Xidian University;Mohamed bin Zayed University of Artificial Intelligence;Chongqing University of Post and Telecommunications;Northwestern Polytechnical University", "aff_domain": "xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;mbzuai.ac.ae;cqupt.edu.cn;nwpu.edu.cn", "position": "MS student;Associate Professor;Full Professor;Affiliated Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhu2024taskaware,\ntitle={Task-aware Orthogonal Sparse Network for Exploring Shared Knowledge in Continual Learning},\nauthor={Yusong Hu and De Cheng and Dingwen Zhang and Nannan Wang and Tongliang Liu and Xinbo Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tABvuya05B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2158738, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16858680228188033055&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "xidian.edu.cn;xidian.edu.cn;xidian.edu.cn;mbzuai.ac.ae;cqupt.edu.cn;nwpu.edu.cn", "author_num": 6, "aff_unique_index": "0;1;1;2;3;4", "aff_unique_norm": "Xi'an University of Electronic Science and Technology;Xidian University;Mohamed bin Zayed University of Artificial Intelligence;Chongqing University of Post and Telecommunications;Northwestern Polytechnical University", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.xidian.edu.cn/;http://www.xidian.edu.cn/;https://mbzuai.ac.ae;http://www.cqupt.edu.cn;https://www.nwpu.edu.cn", "aff_unique_abbr": "Xidian University;Xidian;MBZUAI;CQUPT;NWPU", "aff_campus_unique_index": "0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United Arab Emirates" }, { "title": "MADA: Meta-Adaptive Optimizers Through Hyper-Gradient Descent", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32876", "id": "tASXcrMekp", "proceeding": "https://proceedings.mlr.press/v235/ozkara24a.html", "pdf": "https://openreview.net/pdf?id=tASXcrMekp", "openreview": "https://openreview.net/forum?id=tASXcrMekp", "author_site": "Kaan Ozkara, Can Karakus, Parameswaran Raman, Mingyi Hong, Shoham Sabach, Branislav Kveton, Volkan Cevher", "tldr":
"", "abstract": "Following the introduction of Adam, several novel adaptive optimizers for deep learning have been proposed. These optimizers typically excel in some tasks but may not outperform Adam uniformly across all tasks. In this work, we introduce Meta-Adaptive Optimizers (MADA), a unified optimizer framework that can generalize several known optimizers and dynamically learn the most suitable one during training. The key idea in MADA is to parameterize the space of optimizers and dynamically search through it using hyper-gradient descent during training. We empirically compare MADA to other popular optimizers on vision and language tasks, and find that MADA consistently outperforms Adam and other popular optimizers, and is robust against sub-optimally tuned hyper-parameters. MADA achieves a greater validation performance improvement over Adam compared to other popular optimizers during GPT-2 training and fine-tuning. We also propose AVGrad, a modification of AMSGrad that replaces the maximum operator with averaging, which is more suitable for hyper-gradient optimization. Finally, we provide a convergence analysis to show that parameterized interpolations of optimizers can improve their error bounds (up to constants), hinting at an advantage for meta-optimizers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaan Ozkara;Can Karakus;Parameswaran Raman;Mingyi Hong;Shoham Sabach;Branislav Kveton;Volkan Cevher", "authorids": "~Kaan_Ozkara1;~Can_Karakus2;~Parameswaran_Raman1;~Mingyi_Hong1;~Shoham_Sabach1;~Branislav_Kveton1;~Volkan_Cevher1", "gender": ";M;M;M;M;M;M", "homepage": ";http://cankarakus.com;https://paramsraman.github.io/;http://people.ece.umn.edu/~mhong/mingyi.html;https://ssabach.net.technion.ac.il/;http://www.bkveton.com;http://lions.epfl.ch", "dblp": ";81/11533;142/2573;57/8053;;92/5526;70/5301", "google_scholar": "W-JoHj0AAAAJ;2N7UwRMAAAAJ;amJUMFEAAAAJ;qRnP-p0AAAAJ;https://scholar.google.ca/citations?user=42D12TkAAAAJ;CZaDvPgAAAAJ;https://scholar.google.ch/citations?user=hlWhzU8AAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Kaan_Ozkara1;~Can_Karakus2;~Parameswaran_Raman1;~Mingyi_Hong1;~Shoham_Sabach1;~Branislav_Kveton1;~Volkan_Cevher1", "aff": "University of California, Los Angeles;;Amazon;University of Minnesota, Minneapolis;Technion - Israel Institute of Technology, Technion;Amazon;Amazon Development Center Germany", "aff_domain": "ucla.edu;;amazon.com;umn.edu;technion.ac.il;amazon.com;amazon.de", "position": "PhD student;;Applied Scientist;Associate Professor;Associate Professor;Principal Scientist;Amazon Scholar", "bibtex": "@inproceedings{\nozkara2024mada,\ntitle={{MADA}: Meta-Adaptive Optimizers Through Hyper-Gradient Descent},\nauthor={Kaan Ozkara and Can Karakus and Parameswaran Raman and Mingyi Hong and Shoham Sabach and Branislav Kveton and Volkan Cevher},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tASXcrMekp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2192066, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3718508371688082010&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "email": "ucla.edu;;amazon.com;umn.edu;technion.ac.il;amazon.com;amazon.de", "author_num": 7, "aff_unique_index": "0;1;2;3;1;1", "aff_unique_norm": "University of California, Los Angeles;Amazon;University of Minnesota;Technion - Israel 
Institute of Technology", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://www.ucla.edu;https://www.amazon.com;https://www.minnesota.edu;https://www.technion.ac.il", "aff_unique_abbr": "UCLA;Amazon;UMN;Technion", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Los Angeles;;Minneapolis", "aff_country_unique_index": "0;0;0;1;0;2", "aff_country_unique": "United States;Israel;Germany" }, { "title": "SPHINX-X: Scaling Data and Parameters for a Family of Multi-modal Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32875", "id": "tDMlQkJRhZ", "proceeding": "https://proceedings.mlr.press/v235/liu24cc.html", "pdf": "https://openreview.net/pdf?id=tDMlQkJRhZ", "openreview": "https://openreview.net/forum?id=tDMlQkJRhZ", "author_site": "Dongyang Liu, Renrui Zhang, Longtian Qiu, Siyuan Huang, Weifeng Lin, Shitian Zhao, Shijie Geng, Ziyi Lin, Peng Jin, Kaipeng Zhang, WENQI SHAO, Chao Xu, Conghui He, Junjun He, Hao Shao, Pan Lu, Yu Qiao, Hongsheng Li, Peng Gao", "tldr": "", "abstract": "We propose SPHINX-X, an extensive Multi-modality Large Language Model (MLLM) series developed upon SPHINX. To improve the architecture and training efficiency, we modify the SPHINX framework by removing redundant visual encoders, bypassing fully-padded sub-images with skip tokens, and simplifying multi-stage training into a one-stage all-in-one paradigm. To fully unleash the potential of MLLMs, we assemble a comprehensive multi-domain and multi-modal dataset covering publicly available resources in language, vision, and vision-language tasks. We further enrich this collection with our curated OCR-intensive and Set-of-Mark datasets, extending the diversity and generality. By training over different base LLMs including TinyLlama-1.1B, InternLM2-7B, LLaMA2-13B, and Mixtral-8$\\times$7B, we obtain a spectrum of MLLMs that vary in parameter size and multilingual capabilities. Comprehensive benchmarking reveals a strong correlation between multi-modal performance and the data and parameter scales. 
Code and models are released at https://github.com/Alpha-VLLM/LLaMA2-Accessory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongyang Liu;Renrui Zhang;Longtian Qiu;Siyuan Huang;Weifeng Lin;Shitian Zhao;Shijie Geng;Ziyi Lin;Peng Jin;Kaipeng Zhang;Wenqi Shao;Chao Xu;Conghui He;Junjun He;Hao Shao;Pan Lu;Yu Qiao;Hongsheng Li;Peng Gao", "authorids": "~Dongyang_Liu3;~Renrui_Zhang1;~Longtian_Qiu1;~Siyuan_Huang4;~Weifeng_Lin1;~Shitian_Zhao1;~Shijie_Geng1;~Ziyi_Lin1;~Peng_Jin4;~Kaipeng_Zhang1;~Wenqi_Shao2;~Chao_Xu16;~Conghui_He2;~Junjun_He2;~Hao_Shao1;~Pan_Lu2;~Yu_Qiao1;~Hongsheng_Li3;~Peng_Gao3", "gender": ";M;M;M;M;M;M;M;M;M;M;;;M;;;;M;", "homepage": ";;https://github.com/Artanic30;https://siyuanhuang95.github.io/;https://github.com/Afeng-x;https://zhaoshitian.github.io/;;;;http://kpzhang93.github.io/;https://wqshao126.github.io/;;;https://junjun2016.github.io/;http://hao-shao.com;;;http://www.ee.cuhk.edu.hk/~hsli;", "dblp": ";244/1748;308/0937;62/885-4.html;193/7842;364/2271;171/3642;174/2038;83/6151-1.html;179/2126;227/3122;;;128/7027;66/3089.html;;;27/7402-1;", "google_scholar": ";YlL3xN4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;QNkS4KEAAAAJ;8iCZxIAAAAAJ;https://scholar.google.com/citations?hl=en;wujqvGYAAAAJ;-VOnnzUAAAAJ;HHXLexAAAAAJ;4OqZBmYAAAAJ;Bs9mrwwAAAAJ;;;Z4LgebkAAAAJ;https://scholar.google.com.hk/citations?user=D_ZLR1oAAAAJ;;;BN2Ze-QAAAAJ;", "orcid": ";;;0009-0005-6363-833X;;;;;0000-0001-9287-6410;;;;;;;;;;", "linkedin": ";;;siyuan-huang-979672149/;;;;;;;;;;;;;;;", "or_profile": "~Dongyang_Liu3;~Renrui_Zhang1;~Longtian_Qiu1;~Siyuan_Huang4;~Weifeng_Lin1;~Shitian_Zhao1;~Shijie_Geng1;~Ziyi_Lin1;~Peng_Jin4;~Kaipeng_Zhang1;~Wenqi_Shao2;~Chao_Xu16;~Conghui_He2;~Junjun_He2;~Hao_Shao1;~Pan_Lu2;~Yu_Qiao1;~Hongsheng_Li3;~Peng_Gao3", "aff": ";MMLab of CUHK & Shanghai AI Laboratory;ShanghaiTech University;Shanghai Jiaotong University;South China University of Technology;East China Normal University;ByteDance Inc.;The Chinese University of Hong Kong;Peking University;Shanghai AI Laboratory;Shanghai AI Laboratory;;;Shanghai AI Laboratory;The Chinese University of Hong Kong, The Chinese University of Hong Kong;;;The Chinese University of Hong Kong;", "aff_domain": ";pjlab.org.cn;shanghaitech.edu.cn;sjtu.edu.cn;scut.edu.cn;ecnu.edu.cn;bytedance.com;cuhk.edu.hk;pku.edu.cn;pjlab.org.cn;pjlab.org.cn;;;pjlab.org.cn;ee.cuhk.edu.hk;;;cuhk.edu.hk;", "position": ";PhD student;PhD student;PhD student;MS student;Undergrad student;Researcher;PhD student;PhD student;Researcher;Researcher;;;Researcher;PhD student;;;Associate Professor;", "bibtex": "@inproceedings{\nliu2024sphinxx,\ntitle={{SPHINX}-X: Scaling Data and Parameters for a Family of Multi-modal Large Language Models},\nauthor={Dongyang Liu and Renrui Zhang and Longtian Qiu and Siyuan Huang and Weifeng Lin and Shitian Zhao and Shijie Geng and Ziyi Lin and Peng Jin and Kaipeng Zhang and Wenqi Shao and Chao Xu and Conghui He and Junjun He and Hao Shao and Pan Lu and Yu Qiao and Hongsheng Li and Peng Gao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tDMlQkJRhZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2107676, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 19, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=891412887268659419&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": 
";pjlab.org.cn;shanghaitech.edu.cn;sjtu.edu.cn;scut.edu.cn;ecnu.edu.cn;bytedance.com;cuhk.edu.hk;pku.edu.cn;pjlab.org.cn;pjlab.org.cn;;;pjlab.org.cn;ee.cuhk.edu.hk;;;cuhk.edu.hk;", "author_num": 19, "aff_unique_index": "0;1;2;3;4;5;0;6;7;7;7;0;0", "aff_unique_norm": "Chinese University of Hong Kong;ShanghaiTech University;Shanghai Jiao Tong University;South China University of Technology;East China Normal University;ByteDance;Peking University;Shanghai AI Laboratory", "aff_unique_dep": "MMLab;;;;;;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.shanghaitech.edu.cn;https://www.sjtu.edu.cn;https://www.scut.edu.cn;http://www.ecnu.edu.cn;https://www.bytedance.com;http://www.pku.edu.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "CUHK;ShanghaiTech;SJTU;SCUT;ECNU;ByteDance;Peking U;SAIL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Dynamic Memory Compression: Retrofitting LLMs for Accelerated Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32874", "id": "tDRYrAkOB7", "proceeding": "https://proceedings.mlr.press/v235/nawrot24a.html", "pdf": "https://openreview.net/pdf?id=tDRYrAkOB7", "openreview": "https://openreview.net/forum?id=tDRYrAkOB7", "author_site": "Piotr Nawrot, Adrian \u0141a\u0144cucki, Marcin Chochowski, David Tarjan, Edoardo Ponti", "tldr": "", "abstract": "Transformers have emerged as the backbone of large language models (LLMs). However, generation remains inefficient due to the need to store in memory a cache of key\u2013value representations for past tokens, whose size scales linearly with the input sequence length and batch size. As a solution, we propose Dynamic Memory Compression (DMC), a method for on-line key\u2013value cache compression at inference time. Most importantly, the model learns to apply different compression ratios in different heads and layers. We retrofit pre-trained LLMs such as Llama 2 (7B, 13B and 70B) into DMC Transformers, achieving up to $\\sim 3.7 \\times$ throughput increase during auto-regressive inference on an NVIDIA H100 GPU. DMC is applied via continued pre-training on a negligible percentage of the original data without adding any extra parameters. We find that DMC preserves the original downstream performance with up to 4$\\times$ cache compression, outperforming up-trained grouped-query attention (GQA) and key\u2013value eviction policies (H$_2$O, TOVA). GQA and DMC can be even combined to obtain compounded gains. As a result DMC fits longer contexts and larger batches within any given memory budget. 
We release the DMC code and models at https://github.com/NVIDIA/Megatron-LM/tree/DMC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Piotr Nawrot;Adrian \u0141a\u0144cucki;Marcin Chochowski;David Tarjan;Edoardo Ponti", "authorids": "~Piotr_Nawrot1;~Adrian_\u0141a\u0144cucki1;~Marcin_Chochowski1;dtarjan@nvidia.com;~Edoardo_Ponti1", "gender": "M;;M;;", "homepage": "https://piotrnawrot.github.io;;https://www.linkedin.com/in/marcinchochowski/;;https://ducdauge.github.io/", "dblp": "304/8773.html;140/7631;;;178/8829", "google_scholar": "9wrNHUQAAAAJ;;tYpWp-4AAAAJ;;https://scholar.google.ca/citations?user=tklL2q0AAAAJ", "orcid": "0009-0003-8552-9447;;;;0000-0002-6308-1050", "linkedin": "piotr-nawrot/;;;;edoardo-maria-ponti/", "or_profile": "~Piotr_Nawrot1;~Adrian_\u0141a\u0144cucki1;~Marcin_Chochowski1;dtarjan@nvidia.com;~Edoardo_Ponti1", "aff": "NVIDIA;NVIDIA;NVIDIA;;NVIDIA", "aff_domain": "nvidia.com;nvidia.com;nvidia.com;;nvidia.com", "position": "Intern;Researcher;Researcher;;Researcher", "bibtex": "@inproceedings{\nnawrot2024dynamic,\ntitle={Dynamic Memory Compression: Retrofitting {LLM}s for Accelerated Inference},\nauthor={Piotr Nawrot and Adrian {\\L}a{\\'n}cucki and Marcin Chochowski and David Tarjan and Edoardo Ponti},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tDRYrAkOB7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 858343, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=113381609764741814&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "nvidia.com;nvidia.com;nvidia.com;;nvidia.com", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Touch, Vision, and Language Dataset for Multimodal Alignment", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32873", "id": "tFEOOH9eH0", "proceeding": "https://proceedings.mlr.press/v235/fu24b.html", "pdf": "https://openreview.net/pdf?id=tFEOOH9eH0", "openreview": "https://openreview.net/forum?id=tFEOOH9eH0", "author_site": "Letian Fu, Gaurav Datta, Huang Huang, William Panitch, Jaimyn Drake, Joseph Ortiz, Mustafa Mukadam, Mike Lambeta, Roberto Calandra, Ken Goldberg", "tldr": "", "abstract": "Touch is an important sensing modality for humans, but it has not yet been incorporated into a multimodal generative language model. This is partially due to the difficulty of obtaining natural language labels for tactile data and the complexity of aligning tactile readings with both visual observations and language descriptions. As a step towards bridging that gap, this work introduces a new dataset of 44K in-the-wild vision-touch pairs, with English language labels annotated by humans (10%) and textual pseudo-labels from GPT-4V (90%). We use this dataset to train a vision-language-aligned tactile encoder for open-vocabulary classification and a touch-vision-language (TVL) model for text generation using the trained encoder. 
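The abstract does not spell out the alignment objective, but aligning a new modality's encoder to an existing vision-language embedding space is commonly done with a CLIP-style contrastive loss. The sketch below is an illustration of that generic recipe under our own assumptions, not the paper's code; `touch_emb` and `vision_emb` stand for batched embeddings from a trainable tactile encoder and a frozen vision-language encoder.

```python
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(touch_emb, vision_emb, temperature=0.07):
    """CLIP-style InfoNCE loss: pull each touch embedding toward its paired
    (frozen) vision embedding and away from the other pairs in the batch."""
    touch_emb = F.normalize(touch_emb, dim=-1)
    vision_emb = F.normalize(vision_emb, dim=-1)
    logits = touch_emb @ vision_emb.t() / temperature   # (B, B) pairwise similarities
    targets = torch.arange(touch_emb.size(0), device=touch_emb.device)
    # Symmetric cross-entropy over both matching directions (touch->vision, vision->touch).
    return 0.5 * (F.cross_entropy(logits, targets)
                  + F.cross_entropy(logits.t(), targets))
```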
Results suggest that by incorporating touch, the TVL model improves tactile-vision-language alignment (+29% classification accuracy) over existing models trained on any pair of those modalities. Although only a small fraction of the dataset is human-labeled, the TVL model demonstrates improved visual-tactile understanding over GPT-4V (+12%) and open-source vision-language models (+32%) on a new touch-vision understanding benchmark. Code, checkpoints and data are available at https://tactile-vlm.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Letian Fu;Gaurav Datta;Huang Huang;William Chung-Ho Panitch;Jaimyn Drake;Joseph Ortiz;Mustafa Mukadam;Mike Lambeta;Roberto Calandra;Ken Goldberg", "authorids": "~Letian_Fu1;~Gaurav_Datta1;~Huang_Huang1;~William_Chung-Ho_Panitch1;jaimyndrake@berkeley.edu;~Joseph_Ortiz2;~Mustafa_Mukadam1;~Mike_Lambeta1;~Roberto_Calandra1;~Ken_Goldberg1", "gender": "M;;;;;M;M;M;M;M", "homepage": "https://max-fu.github.io/;;https://sites.google.com/site/huanghuang9729/home;;;https://joeaortiz.github.io/;http://www.mustafamukadam.com;;https://www.robertocalandra.com;http://goldberg.berkeley.edu/", "dblp": ";;;;;;;;118/8239;g/KennethYGoldberg", "google_scholar": "aWot7UgAAAAJ;;;;;https://scholar.google.co.uk/citations?user=pea9lz0AAAAJ;yYpm9LoAAAAJ;;FdE3LOEAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ", "orcid": ";;;0000-0003-2654-1617;;;;;0000-0001-9430-8433;0000-0001-6747-9499", "linkedin": ";https://linkedin.com/in/gaurav-datta;;;;;mhmukadam/;mike-maroje-lambeta;rcalandra;goldbergken/", "or_profile": "~Letian_Fu1;~Gaurav_Datta1;~Huang_Huang1;~William_Chung-Ho_Panitch1;jaimyndrake@berkeley.edu;~Joseph_Ortiz2;~Mustafa_Mukadam1;~Mike_Lambeta1;~Roberto_Calandra1;~Ken_Goldberg1", "aff": "University of California, Berkeley;;University of California, Berkeley;;;Google DeepMind;Meta AI;Meta;Technische Universit\u00e4t Dresden;University of California, Berkeley", "aff_domain": "berkeley.edu;;berkeley.edu;;;google.com;meta.com;meta.com;tu-dresden.de;berkeley.edu", "position": "PhD student;;PhD student;;;Researcher;Researcher;Engineer;Full Professor;Full Professor", "bibtex": "@inproceedings{\nfu2024a,\ntitle={A Touch, Vision, and Language Dataset for Multimodal Alignment},\nauthor={Letian Fu and Gaurav Datta and Huang Huang and William Chung-Ho Panitch and Jaimyn Drake and Joseph Ortiz and Mustafa Mukadam and Mike Lambeta and Roberto Calandra and Ken Goldberg},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tFEOOH9eH0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2043313, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14014622488087069248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "berkeley.edu;;berkeley.edu;;;google.com;meta.com;meta.com;tu-dresden.de;berkeley.edu", "author_num": 10, "aff_unique_index": "0;0;1;2;2;3;0", "aff_unique_norm": "University of California, Berkeley;Google;Meta;Technische Universit\u00e4t Dresden", "aff_unique_dep": ";Google DeepMind;Meta AI;", "aff_unique_url": "https://www.berkeley.edu;https://deepmind.com;https://meta.com;https://tu-dresden.de", "aff_unique_abbr": "UC Berkeley;DeepMind;Meta;TUD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;1;0;0;2;0", "aff_country_unique": "United States;United Kingdom;Germany" }, { 
"title": "Zero-Shot Reinforcement Learning via Function Encoders", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32872", "id": "tHBLwSYnLf", "proceeding": "https://proceedings.mlr.press/v235/ingebrand24a.html", "pdf": "https://openreview.net/pdf?id=tHBLwSYnLf", "openreview": "https://openreview.net/forum?id=tHBLwSYnLf", "author_site": "Tyler Ingebrand, Amy Zhang, Ufuk Topcu", "tldr": "", "abstract": "Although reinforcement learning (RL) can solve many challenging sequential decision making problems, achieving *zero-shot* transfer across related tasks remains a challenge. The difficulty lies in finding a good representation for the current task so that the agent understands how it relates to previously seen tasks. To achieve zero-shot transfer, we introduce the *function encoder*, a representation learning algorithm which represents a function as a weighted combination of learned, non-linear basis functions. By using a function encoder to represent the reward function or the transition function, the agent has information on how the current task relates to previously seen tasks via a coherent vector representation. Thus, the agent is able to achieve transfer between related tasks at run time with no additional training. We demonstrate state-of-the-art data efficiency, asymptotic performance, and training stability in three RL fields by augmenting basic RL algorithms with a function encoder task representation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tyler Ingebrand;Amy Zhang;ufuk topcu", "authorids": "~Tyler_Ingebrand1;~Amy_Zhang1;~ufuk_topcu1", "gender": "M;Unspecified;F", "homepage": ";https://autonomy.oden.utexas.edu/;", "dblp": ";12/6659.html;43/2754", "google_scholar": ";jeNGFfQAAAAJ;", "orcid": ";0000-0003-0819-9985;", "linkedin": "tyler-ingebrand;;", "or_profile": "~Tyler_Ingebrand1;~ufuk_topcu1;~Amy_Zhang2", "aff": "University of Texas at Austin;University of Texas, Austin;Meta Facebook", "aff_domain": "utexas.edu;utexas.edu;facebook.com", "position": "PhD student;Full Professor;Research Scientist", "bibtex": "@inproceedings{\ningebrand2024zeroshot,\ntitle={Zero-Shot Reinforcement Learning via Function Encoders},\nauthor={Tyler Ingebrand and Amy Zhang and ufuk topcu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tHBLwSYnLf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1891413, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1526088112808695259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "utexas.edu;utexas.edu;facebook.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Texas at Austin;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.utexas.edu;https://meta.com", "aff_unique_abbr": "UT Austin;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Sampling-Based Sketches for Tensors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32871", "id": "tMkPL7Tiul", "proceeding": "https://proceedings.mlr.press/v235/swartworth24a.html", "pdf": "https://openreview.net/pdf?id=tMkPL7Tiul", "openreview": "https://openreview.net/forum?id=tMkPL7Tiul", "author_site": "William Swartworth, David Woodruff", 
"tldr": "", "abstract": "We introduce a new approach for applying sampling-based sketches to two and three mode tensors. We illustrate our technique to construct sketches for the classical problems of $\\ell_0$ sampling and producing $\\ell_1$ embeddings. In both settings we achieve sketches that can be applied to a rank one tensor in $(\\mathbb{R}^d)^{\\otimes q}$ (for $q=2,3$) in time scaling with $d$ rather than $d^2$ or $d^3$. Our main idea is a particular sampling construction based on fast convolution which allows us to quickly compute sums over sufficiently random subsets of tensor entries.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Joseph Swartworth;David Woodruff", "authorids": "~William_Joseph_Swartworth1;~David_Woodruff1", "gender": ";M", "homepage": "https://www.math.ucla.edu/~wswartworth/;http://www.cs.cmu.edu/~dwoodruf/", "dblp": ";w/DPWoodruff", "google_scholar": ";https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~William_Joseph_Swartworth1;~David_Woodruff1", "aff": ";Carnegie Mellon University", "aff_domain": ";cmu.edu", "position": ";Full Professor", "bibtex": "@inproceedings{\nswartworth2024fast,\ntitle={Fast Sampling-Based Sketches for Tensors},\nauthor={William Joseph Swartworth and David Woodruff},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tMkPL7Tiul}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 375322, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ona88XVslqIJ:scholar.google.com/&scioq=Fast+Sampling-Based+Sketches+for+Tensors&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": ";cmu.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Random Exploration in Bayesian Optimization: Order-Optimal Regret and Computational Efficiency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32870", "id": "tOO6PD3kYP", "proceeding": "https://proceedings.mlr.press/v235/salgia24a.html", "pdf": "https://openreview.net/pdf?id=tOO6PD3kYP", "openreview": "https://openreview.net/forum?id=tOO6PD3kYP", "author_site": "Sudeep Salgia, Sattar Vakili, Qing Zhao", "tldr": "", "abstract": "We consider Bayesian optimization using Gaussian Process models, also referred to as kernel-based bandit optimization. We study the methodology of exploring the domain using random samples drawn from a distribution. We show that this random exploration approach achieves the optimal error rates. Our analysis is based on novel concentration bounds in an infinite dimensional Hilbert space established in this work, which may be of independent interest. We further develop an algorithm based on random exploration with domain shrinking and establish its order-optimal regret guarantees under both noise-free and noisy settings. In the noise-free setting, our analysis closes the existing gap in regret performance under a mild assumption on the underlying function and thereby *partially resolves a COLT open problem*. 
The proposed algorithm also enjoys a computational advantage over prevailing methods due to the random exploration that obviates the expensive optimization of a non-convex acquisition function for choosing the query points at each iteration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sudeep Salgia;Sattar Vakili;Qing Zhao", "authorids": "~Sudeep_Salgia1;~Sattar_Vakili1;~Qing_Zhao1", "gender": "M;;F", "homepage": "https://sudeepsalgia.github.io/;https://sattar-vakili.github.io/;https://zhao.ece.cornell.edu/", "dblp": "207/8460;140/5473;", "google_scholar": "Y5d5L84AAAAJ;N9xs8w0AAAAJ;ymsLVFsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sudeep_Salgia1;~Sattar_Vakili1;~Qing_Zhao1", "aff": "Carnegie Mellon University;MediaTek Research;Cornell University", "aff_domain": "cmu.edu;mtkresearch.com;cornell.edu", "position": "Postdoc;Principal AI Research Manager;Full Professor", "bibtex": "@inproceedings{\nsalgia2024random,\ntitle={Random Exploration in Bayesian Optimization: Order-Optimal Regret and Computational Efficiency},\nauthor={Sudeep Salgia and Sattar Vakili and Qing Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tOO6PD3kYP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 733795, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15892658298468428635&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "cmu.edu;mtkresearch.com;cornell.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Carnegie Mellon University;MediaTek Inc.;Cornell University", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.cmu.edu;https://www.mediatek.com/;https://www.cornell.edu", "aff_unique_abbr": "CMU;MediaTek;Cornell", "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "title": "PARDEN, Can You Repeat That? Defending against Jailbreaks via Repetition", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32869", "id": "tQPkzTdaaN", "proceeding": "https://proceedings.mlr.press/v235/zhang24ca.html", "pdf": "https://openreview.net/pdf?id=tQPkzTdaaN", "openreview": "https://openreview.net/forum?id=tQPkzTdaaN", "author_site": "Ziyang Zhang, Qizhen Zhang, Jakob Foerster", "tldr": "", "abstract": "Large language models (LLMs) have shown success in many natural language processing tasks. Despite rigorous safety alignment processes, supposedly safety-aligned LLMs like Llama 2 and Claude 2 are still susceptible to jailbreaks, leading to security risks and abuse of the models. One option to mitigate such risks is to augment the LLM with a dedicated \"safeguard\", which checks the LLM's inputs or outputs for undesired behaviour. A promising approach is to use the LLM itself as the safeguard. Nonetheless, baseline methods, such as prompting the LLM to self-classify toxic content, demonstrate limited efficacy. We hypothesise that this is due to domain shift: the alignment training imparts a self-censoring behaviour to the model (\"Sorry I can't do that\"), while the self-classify approach shifts it to a classification format (\"Is this prompt malicious\"). In this work, we propose PARDEN, which avoids this domain shift by simply asking the model to repeat its own outputs. 
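The repetition defense just described can be sketched in a few lines. The following is an illustrative approximation rather than the authors' implementation: the prompt wording, the similarity measure (a difflib ratio instead of whatever the paper uses), and the threshold are all our assumptions, and `llm` is a hypothetical prompt-to-completion callable.

```python
from difflib import SequenceMatcher

REPEAT_PROMPT = ("Here is some text: {output}\n"
                 "Please repeat the text above verbatim.")  # illustrative wording

def parden_filter(llm, candidate_output, threshold=0.5):
    """Flag a model output as unsafe if the model declines to repeat it.
    `llm` maps a prompt string to a completion string (hypothetical)."""
    repeated = llm(REPEAT_PROMPT.format(output=candidate_output))
    similarity = SequenceMatcher(None, candidate_output, repeated).ratio()
    # A safety-aligned model tends to refuse to repeat harmful text
    # ("Sorry, I can't..."), which drives the similarity score down.
    return ("blocked", similarity) if similarity < threshold else ("allowed", similarity)
```

Note how this keeps the model in its native self-censoring regime (generation) rather than the classification regime the abstract identifies as the source of domain shift.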
PARDEN neither requires finetuning nor white box access to the model. We empirically verify the effectiveness of our method and show that PARDEN significantly outperforms existing jailbreak detection baselines for Llama-2 and Claude-2. We find that PARDEN is particularly powerful in the relevant regime of high True Positive Rate (TPR) and low False Positive Rate (FPR). For instance, for Llama2-7B, at TPR equal to 90%, PARDEN accomplishes a roughly 11x reduction in the FPR from 24.8% to 2.0% on the harmful behaviours dataset. Code and data are available at https://github.com/Ed-Zh/PARDEN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyang Zhang;Qizhen Zhang;Jakob Nicolaus Foerster", "authorids": "~Ziyang_Zhang9;~Qizhen_Zhang1;~Jakob_Nicolaus_Foerster1", "gender": ";F;M", "homepage": ";https://irenezhang30.github.io/;https://www.jakobfoerster.com", "dblp": ";;176/5095", "google_scholar": ";https://scholar.google.ca/citations?hl=en;6z4lQzMAAAAJ", "orcid": ";;", "linkedin": "edward-ziyang-zhang/;;", "or_profile": "~Ziyang_Zhang9;~Qizhen_Zhang1;~Jakob_Nicolaus_Foerster1", "aff": "University of Oxford;Cohere;University of Oxford, University of Oxford", "aff_domain": "ox.ac.uk;cohere.ai;eng.ox.ac.uk", "position": "MS student;Researcher;Associate Professor", "bibtex": "@inproceedings{\nzhang2024parden,\ntitle={{PARDEN}, Can You Repeat That? Defending against Jailbreaks via Repetition},\nauthor={Ziyang Zhang and Qizhen Zhang and Jakob Nicolaus Foerster},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tQPkzTdaaN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3595526, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11683401493323786923&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ox.ac.uk;cohere.ai;eng.ox.ac.uk", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Oxford;Cohere", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://cohere.ai", "aff_unique_abbr": "Oxford;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Barrier Algorithms for Constrained Non-Convex Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32868", "id": "tRESfzWFtf", "proceeding": "https://proceedings.mlr.press/v235/dvurechensky24a.html", "pdf": "https://openreview.net/pdf?id=tRESfzWFtf", "openreview": "https://openreview.net/forum?id=tRESfzWFtf", "author_site": "Pavel Dvurechenskii, Mathias Staudigl", "tldr": "", "abstract": "In this paper we theoretically show that interior-point methods based on self-concordant barriers possess favorable global complexity beyond their standard application area of convex optimization. To do that we propose first- and second-order methods for non-convex optimization problems with general convex set constraints and linear constraints. 
Our methods attain a suitably defined class of approximate first- or second-order KKT points with the worst-case iteration complexity similar to unconstrained problems, namely $O(\\varepsilon^{-2})$ (first-order) and $O(\\varepsilon^{-3/2})$ (second-order), respectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pavel Dvurechensky;Mathias Staudigl", "authorids": "~Pavel_Dvurechensky1;~Mathias_Staudigl1", "gender": ";M", "homepage": "http://wias-berlin.de/people/dvureche/?lang=1;https://www.wim.uni-mannheim.de/staudigl/", "dblp": "164/7242;https://dblp.uni-trier.de/pers/hd/s/Staudigl:Mathias", "google_scholar": "28MSou8AAAAJ;", "orcid": "0000-0003-1201-2343;0000-0003-2481-0019", "linkedin": ";", "or_profile": "~Pavel_Dvurechensky1;~Mathias_Staudigl1", "aff": "Weierstrass Institute for Applied Analysis and Stochastics;Maastricht University", "aff_domain": "wias-berlin.de;maastrichtuniversity.nl", "position": "Postdoc;Associate Professor", "bibtex": "@inproceedings{\ndvurechensky2024barrier,\ntitle={Barrier Algorithms for Constrained Non-Convex Optimization},\nauthor={Pavel Dvurechensky and Mathias Staudigl},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tRESfzWFtf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 534886, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1033486807009167327&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "wias-berlin.de;maastrichtuniversity.nl", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Weierstrass Institute for Applied Analysis and Stochastics;Maastricht University", "aff_unique_dep": ";", "aff_unique_url": "https://www.wias-berlin.de/;https://www.maastrichtuniversity.nl", "aff_unique_abbr": "WIAS;MU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;Netherlands" }, { "title": "Latent Noise Segmentation: How Neural Noise Leads to the Emergence of Segmentation and Grouping", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32867", "id": "tSjyKR8WIf", "proceeding": "https://proceedings.mlr.press/v235/lonnqvist24a.html", "pdf": "https://openreview.net/pdf?id=tSjyKR8WIf", "openreview": "https://openreview.net/forum?id=tSjyKR8WIf", "author_site": "Ben Lonnqvist, Zhengqing Wu, Michael Herzog", "tldr": "", "abstract": "Humans are able to segment images effortlessly without supervision using perceptual grouping. Here, we propose a counter-intuitive computational approach to solving unsupervised perceptual grouping and segmentation: that they arise *because* of neural noise, rather than in spite of it. We (1) mathematically demonstrate that, under realistic assumptions, neural noise can be used to separate objects from each other; (2) show that adding noise in a DNN enables the network to segment images even though it was never trained on any segmentation labels; and (3) show that segmenting objects using noise results in segmentation performance that aligns with the perceptual grouping phenomena observed in humans, and is sample-efficient. We introduce the Good Gestalt (GG) datasets --- six datasets designed to specifically test perceptual grouping, and show that our DNN models reproduce many important phenomena in human perception, such as illusory contours, closure, continuity, proximity, and occlusion. 
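A toy version of the noise-based grouping mechanism described in points (1)-(3) might look as follows. This is our illustration under strong assumptions: noise is injected at the input rather than into latent activations as in the paper, `model` is a hypothetical autoencoder-like network, and pixels are grouped by clustering their responses across noisy passes.

```python
import numpy as np
from sklearn.cluster import KMeans

def latent_noise_segment(model, image, n_passes=32, n_segments=3, sigma=0.1):
    """Toy segmentation-from-noise. `model` maps a numpy image to an output
    of the same spatial shape. Pixels belonging to the same object tend to
    respond coherently to injected noise, so clustering each pixel's response
    pattern across noisy passes yields a grouping without any labels."""
    h, w = image.shape[:2]
    base = model(image)
    responses = []
    for _ in range(n_passes):
        out = model(image + sigma * np.random.randn(*image.shape))
        responses.append((out - base).reshape(h * w, -1).mean(axis=-1))
    features = np.stack(responses, axis=1)            # (pixels, n_passes)
    labels = KMeans(n_clusters=n_segments, n_init=10).fit_predict(features)
    return labels.reshape(h, w)
```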
Finally, we (4) show that our model improves performance on our GG datasets compared to other tested unsupervised models by 24.9%. Together, our results suggest a novel unsupervised segmentation method requiring few assumptions, a new explanation for the formation of perceptual grouping, and a novel potential benefit of neural noise.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ben Lonnqvist;Zhengqing Wu;Michael Herzog", "authorids": "~Ben_Lonnqvist1;~Zhengqing_Wu1;~Michael_Herzog1", "gender": ";M;", "homepage": "https://benlonnqvist.github.io/;https://people.epfl.ch/zhengqing.wu?lang=en;https://www.epfl.ch/labs/lpsy/", "dblp": ";;", "google_scholar": "be0tNVwAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Ben_Lonnqvist1;~Zhengqing_Wu1;~Michael_Herzog1", "aff": "EPFL - EPF Lausanne;EPFL;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;ic.epfl.ch;epfl.ch", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nlonnqvist2024latent,\ntitle={Latent Noise Segmentation: How Neural Noise Leads to the Emergence of Segmentation and Grouping},\nauthor={Ben Lonnqvist and Zhengqing Wu and Michael Herzog},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tSjyKR8WIf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9116395, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11057844675269945673&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "epfl.ch;ic.epfl.ch;epfl.ch", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Scene Graph Generation Strategy with Co-occurrence Knowledge and Learnable Term Frequency", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32866", "id": "tTq3qMkJ8w", "proceeding": "https://proceedings.mlr.press/v235/kim24n.html", "pdf": "https://openreview.net/pdf?id=tTq3qMkJ8w", "openreview": "https://openreview.net/forum?id=tTq3qMkJ8w", "author_site": "Hyeongjin Kim, Sangwon Kim, Dasom Ahn, Jong Taek Lee, Byoung Chul Ko", "tldr": "", "abstract": "Scene graph generation (SGG) is an important task in image understanding because it represents the relationships between objects in an image as a graph structure, making it possible to understand the semantic relationships between objects intuitively. Previous SGG studies used message-passing neural networks (MPNNs) to update features, which can effectively reflect information about surrounding objects. However, these studies have failed to reflect the co-occurrence of objects during scene graph generation. In addition, they only addressed the long-tail problem of the training dataset from the perspectives of sampling and learning methods. To address these two problems, we propose CooK, which reflects the Co-occurrence Knowledge between objects, and the learnable term frequency-inverse document frequency (TF-$l$-IDF) to solve the long-tail problem. We applied the proposed model to the SGG benchmark dataset, and the results showed a performance improvement of up to 3.8% compared with existing state-of-the-art models in the SGGen subtask. 
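The abstract does not give the exact TF-$l$-IDF parameterization, so the sketch below only illustrates the general idea of making the IDF term learnable, letting training decide how strongly rare (long-tail) predicate classes are up-weighted; the specific form used here (a learnable exponent on a standard IDF term) is our assumption, not the paper's.

```python
import torch
import torch.nn as nn

class LearnableTFIDF(nn.Module):
    """Illustrative TF-l-IDF-style reweighting: classic tf-idf with a
    learnable exponent `l` on the IDF factor. The actual parameterization
    in the CooK paper may differ."""
    def __init__(self, class_doc_freq: torch.Tensor, n_docs: int):
        super().__init__()
        # Standard smoothed IDF per class; clamped so the learnable power is safe.
        self.register_buffer("idf", torch.log(n_docs / (1.0 + class_doc_freq)))
        self.l = nn.Parameter(torch.ones(()))   # learnable reweighting strength

    def forward(self, term_freq: torch.Tensor) -> torch.Tensor:
        return term_freq * self.idf.clamp(min=0.0) ** self.l
```

With `l` learned jointly with the rest of the network, the model can interpolate between plain term frequency (`l` near 0) and aggressive rare-class up-weighting (`l` large), which is one plausible reading of "learnable" in the abstract.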
The results also demonstrate the generalization ability of the proposed method, showing uniform performance improvements across all MPNN models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "HyeongJin Kim;Sangwon Kim;Dasom Ahn;Jong Taek Lee;Byoung Chul Ko", "authorids": "~HyeongJin_Kim1;~Sangwon_Kim1;~Dasom_Ahn1;~Jong_Taek_Lee1;~Byoung_Chul_Ko1", "gender": "M;M;F;M;M", "homepage": "https://cvpr.kmu.ac.kr/member.htm;https://jumpsnack.github.io;https://tommy-ahn.github.io/;https://sites.google.com/view/k-vislab;https://cvpr.kmu.ac.kr/", "dblp": ";;;38/7728;30/6174", "google_scholar": ";pUHZ-IcAAAAJ;SyZb3N8AAAAJ;NZ55Q-AAAAAJ;o8ToM1QAAAAJ", "orcid": ";0000-0002-7452-3897;;0000-0002-6962-3148;0000-0002-7284-0768", "linkedin": ";;dasom-ahn-37ab35110/;;", "or_profile": "~HyeongJin_Kim1;~Sangwon_Kim1;~Dasom_Ahn1;~Jong_Taek_Lee1;~Byoung_Chul_Ko1", "aff": "Keimyung University;Keimyung University;Keimyung University;Kyungpook National University;Keimyung University", "aff_domain": "kmu.ac.kr;kmu.ac.kr;kmu.ac.kr;knu.ac.kr;kmu.ac.kr", "position": "PhD student;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nkim2024scene,\ntitle={Scene Graph Generation Strategy with Co-occurrence Knowledge and Learnable Term Frequency},\nauthor={HyeongJin Kim and Sangwon Kim and Dasom Ahn and Jong Taek Lee and Byoung Chul Ko},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tTq3qMkJ8w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8473204, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6961256927901169260&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "kmu.ac.kr;kmu.ac.kr;kmu.ac.kr;knu.ac.kr;kmu.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Keimyung University;Kyungpook National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.keimyung.ac.kr;https://www.knu.ac.kr", "aff_unique_abbr": "KMU;KNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Finite Time Logarithmic Regret Bounds for Self-Tuning Regulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32865", "id": "tTtSnpH4fc", "proceeding": "https://proceedings.mlr.press/v235/singh24b.html", "pdf": "https://openreview.net/pdf?id=tTtSnpH4fc", "openreview": "https://openreview.net/forum?id=tTtSnpH4fc", "author_site": "Rahul Singh, Akshay Mete, Avik Kar, P. R. Kumar", "tldr": "", "abstract": "We establish the first finite-time logarithmic regret bounds for the self-tuning regulation problem. We introduce a modified version of the certainty equivalence algorithm, which we call PIECE, that clips inputs in addition to utilizing probing inputs for exploration. We show that it has a $C \\log T$ upper bound on the regret after $T$ time-steps for bounded noise, and $C\\log^3 T$ in the case of sub-Gaussian noise, unlike the LQ problem where logarithmic regret has been shown to be impossible. The PIECE algorithm is also designed to address the critical challenge of poor initial transient performance of reinforcement learning algorithms for linear systems. 
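The two mechanisms PIECE adds to certainty equivalence, input clipping and probing inputs, can be sketched for a scalar linear system as follows. This is an illustrative reconstruction from the abstract alone; the feedback law, clip level, and probing schedule below are our assumptions, not the paper's specification.

```python
import numpy as np

def piece_like_input(x_hist, u_hist, x_now, t, u_max=5.0, probe_scale=0.1):
    """One certainty-equivalence step for a scalar system x_{t+1} = a*x_t + b*u_t + w_t.
    `x_hist` holds the n observed states; `u_hist` the n-1 inputs applied between them.
    Estimates (a, b) by least squares, applies the certainty-equivalent input,
    clips it, and adds a decaying probing signal for exploration."""
    Phi = np.column_stack([x_hist[:-1], u_hist])            # regressors, shape (n-1, 2)
    theta, *_ = np.linalg.lstsq(Phi, x_hist[1:], rcond=None)
    a_hat, b_hat = theta
    u_ce = -a_hat * x_now / b_hat if abs(b_hat) > 1e-6 else 0.0  # drive the state to 0
    u = float(np.clip(u_ce, -u_max, u_max))                 # input clipping
    return u + probe_scale * np.random.randn() / np.sqrt(t) # probing input
```

Clipping guards against wild inputs when early parameter estimates are poor (the transient problem the abstract mentions), while the probing term keeps the regressors persistently exciting so the estimates keep improving.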
Comparative simulation results illustrate the improved performance of PIECE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rahul Singh;Akshay Mete;Avik Kar;Panganamala Kumar", "authorids": "~Rahul_Singh5;~Akshay_Mete1;~Avik_Kar1;~Panganamala_Kumar1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/rsingh12/home;;https://avik-kar.github.io/;https://cesg.tamu.edu/faculty/p-r-kumar/", "dblp": ";228/0587;;https://dblp.org/pers/k/Kumar:P=_R=.html", "google_scholar": ";evLF1akAAAAJ;https://scholar.google.co.in/citations?user=Nsj1CQ8AAAAJ;qGUpTVwAAAAJ", "orcid": ";;;0000-0003-0389-5367", "linkedin": ";;karavik18/;", "or_profile": "~Rahul_Singh5;~Akshay_Mete1;~Avik_Kar1;~Panganamala_Kumar1", "aff": "Indian Institute of Science;Texas A&M University - College Station;Indian Institute of Science, Indian institute of science, Bangalore;Texas A&M", "aff_domain": "iisc.ac.in;tamu.edu;iisc.ac.in;tamu.edu", "position": "Assistant Professor;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nsingh2024finite,\ntitle={Finite Time Logarithmic Regret Bounds for Self-Tuning Regulation},\nauthor={Rahul Singh and Akshay Mete and Avik Kar and Panganamala Kumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tTtSnpH4fc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4826232, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VJKEya3MA0cJ:scholar.google.com/&scioq=Finite+Time+Logarithmic+Regret+Bounds+for+Self-Tuning+Regulation&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "iisc.ac.in;tamu.edu;iisc.ac.in;tamu.edu", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Indian Institute of Science;Texas A&M University", "aff_unique_dep": ";", "aff_unique_url": "https://www.iisc.ac.in;https://www.tamu.edu", "aff_unique_abbr": "IISc;TAMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";College Station;Bangalore", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "India;United States" }, { "title": "ContPhy: Continuum Physical Concept Learning and Reasoning from Videos", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32864", "id": "tVwzR1myUp", "proceeding": "https://proceedings.mlr.press/v235/zheng24l.html", "pdf": "https://openreview.net/pdf?id=tVwzR1myUp", "openreview": "https://openreview.net/forum?id=tVwzR1myUp", "author_site": "Zhicheng Zheng, Xin Yan, Zhenfang Chen, Jingzhou Wang, Qin Zhi Eddie Lim, Josh Tenenbaum, Chuang Gan", "tldr": "", "abstract": "We introduce the Continuum Physical Dataset (ContPhy), a novel benchmark for assessing machine physical commonsense. ContPhy complements existing physical reasoning benchmarks by encompassing the inference of diverse physical properties, such as mass and density, across various scenarios and predicting corresponding dynamics. We evaluated a range of AI models and found that they still struggle to achieve satisfactory performance on ContPhy, which shows that current AI models still lack physical commonsense for the continuum, especially soft bodies, and illustrates the value of the proposed dataset. We also introduce an oracle model (ContPRO) that marries particle-based physical dynamics models with recent large language models, enjoying the advantages of both: precise dynamic predictions and interpretable reasoning. 
ContPhy aims to spur progress in perception and reasoning within diverse physical settings, narrowing the divide between human and machine intelligence in understanding the physical world.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhicheng Zheng;Xin Yan;Zhenfang Chen;Jingzhou Wang;Qin Zhi Eddie Lim;Joshua B. Tenenbaum;Chuang Gan", "authorids": "~Zhicheng_Zheng2;~Xin_Yan3;~Zhenfang_Chen1;~Jingzhou_Wang3;~Qin_Zhi_Eddie_Lim1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "gender": "M;M;M;M;;;M", "homepage": "https://zzcnewly.github.io;https://cakeyan.github.io/;https://zfchenunique.github.io;https://jingzhou-wang.github.io;;;http://people.csail.mit.edu/ganchuang/", "dblp": ";71/4884-8;207/5321;;;t/JoshuaBTenenbaum;139/6993", "google_scholar": ";https://scholar.google.com/citations?hl=en;QSRdIzAAAAAJ;;;;PTeSCbIAAAAJ", "orcid": ";;;;;;", "linkedin": ";;\u632f\u65b9-\u9648-512011bb/;;;;", "or_profile": "~Zhicheng_Zheng2;~Xin_Yan3;~Zhenfang_Chen1;~Jingzhou_Wang3;~Qin_Zhi_Eddie_Lim1;~Joshua_B._Tenenbaum1;~Chuang_Gan1", "aff": "Tsinghua University;Wuhan University;MIT-IBM Watson AI lab;Tsinghua University;;Massachusetts Institute of Technology;University of Massachusetts at Amherst", "aff_domain": "mails.tsinghua.edu.cn;whu.edu.cn;ibm.com;tsinghua.edu.cn;;mit.edu;umass.edu", "position": "Undergrad student;Undergrad student;Researcher;Undergrad student;;Professor;Assistant Professor", "bibtex": "@inproceedings{\nzheng2024contphy,\ntitle={ContPhy: Continuum Physical Concept Learning and Reasoning from Videos},\nauthor={Zhicheng Zheng and Xin Yan and Zhenfang Chen and Jingzhou Wang and Qin Zhi Eddie Lim and Joshua B. Tenenbaum and Chuang Gan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tVwzR1myUp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7244675, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10173364558284560883&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mails.tsinghua.edu.cn;whu.edu.cn;ibm.com;tsinghua.edu.cn;;mit.edu;umass.edu", "author_num": 7, "aff_unique_index": "0;1;2;0;2;3", "aff_unique_norm": "Tsinghua University;Wuhan University;Massachusetts Institute of Technology;University of Massachusetts Amherst", "aff_unique_dep": ";;IBM Watson AI lab;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.whu.edu.cn/;https://www.mitibmwatsonailab.org;https://www.umass.edu", "aff_unique_abbr": "THU;WHU;MIT-IBM AI Lab;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0;1;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Connecting the Dots: Is Mode-Connectedness the Key to Feasible Sample-Based Inference in Bayesian Neural Networks?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32863", "id": "tc3Nmcpmnx", "proceeding": "https://proceedings.mlr.press/v235/sommer24a.html", "pdf": "https://openreview.net/pdf?id=tc3Nmcpmnx", "openreview": "https://openreview.net/forum?id=tc3Nmcpmnx", "author_site": "Emanuel Sommer, Lisa Wimmer, Theodore Papamarkou, Ludwig Bothmann, Bernd Bischl, David R\u00fcgamer", "tldr": "", "abstract": "A major challenge in sample-based inference (SBI) for Bayesian neural networks is the size and structure of the networks\u2019 parameter space. 
Our work shows that successful SBI is possible by embracing the characteristic relationship between weight and function space, uncovering a systematic link between overparameterization and the difficulty of the sampling problem. Through extensive experiments, we establish practical guidelines for sampling and convergence diagnosis. As a result, we present a deep ensemble initialized approach as an effective solution with competitive performance and uncertainty quantification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emanuel Sommer;Lisa Wimmer;Theodore Papamarkou;Ludwig Bothmann;Bernd Bischl;David R\u00fcgamer", "authorids": "~Emanuel_Sommer1;~Lisa_Wimmer1;~Theodore_Papamarkou1;~Ludwig_Bothmann1;~Bernd_Bischl1;~David_R\u00fcgamer1", "gender": "M;F;M;M;M;M", "homepage": "https://www.muniq.ai/;https://www.slds.stat.uni-muenchen.de/people/wimmer/;https://www.theopapamarkou.com/;https://www.slds.stat.uni-muenchen.de/people/bothmann/;https://www.slds.stat.uni-muenchen.de/;https://davidruegamer.github.io/", "dblp": ";;;187/8625;48/5326;220/5560", "google_scholar": "https://scholar.google.de/citations?user=qa2P1tYAAAAJ;https://scholar.google.de/citations?user=l0hl-mAAAAAJ;ydMfbhAAAAAJ;;https://scholar.google.de/citations?user=s34UckkAAAAJ;https://scholar.google.de/citations?user=_DYguksAAAAJ", "orcid": ";0009-0009-7928-6075;0000-0002-9689-543X;;0000-0001-6002-6980;", "linkedin": "emanuelsommer/;;papamarkou/;;;", "or_profile": "~Emanuel_Sommer1;~Lisa_Wimmer1;~Theodore_Papamarkou1;~Ludwig_Bothmann1;~Bernd_Bischl1;~David_R\u00fcgamer1", "aff": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;LMU Munich;University of Manchester;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;LMU;LMU Munich", "aff_domain": "lmu.de;stat.uni-muenchen.de;manchester.ac.uk;uni-muenchen.de;uni-muenchen.de;lmu.de", "position": "PhD student;PhD student;Full Professor;Postdoc;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nsommer2024connecting,\ntitle={Connecting the Dots: Is Mode-Connectedness the Key to Feasible Sample-Based Inference in Bayesian Neural Networks?},\nauthor={Emanuel Sommer and Lisa Wimmer and Theodore Papamarkou and Ludwig Bothmann and Bernd Bischl and David R{\\\"u}gamer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tc3Nmcpmnx}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8732343, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12481936235468032140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "lmu.de;stat.uni-muenchen.de;manchester.ac.uk;uni-muenchen.de;uni-muenchen.de;lmu.de", "author_num": 6, "aff_unique_index": "0;1;2;0;1;1", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;Ludwig Maximilian University of Munich;University of Manchester", "aff_unique_dep": ";;", "aff_unique_url": "https://www.lmu.de;https://www.lmu.de;https://www.manchester.ac.uk", "aff_unique_abbr": "LMU;LMU;UoM", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Mitigating Privacy Risk in Membership Inference by Convex-Concave Loss", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32862", "id": "tdomF3PW6A", "proceeding": "https://proceedings.mlr.press/v235/liu24q.html", "pdf": 
"https://openreview.net/pdf?id=tdomF3PW6A", "openreview": "https://openreview.net/forum?id=tdomF3PW6A", "author_site": "Zhenlong Liu, Lei Feng, HUIPING ZHUANG, Xiaofeng Cao, Hongxin Wei", "tldr": "", "abstract": "Machine learning models are susceptible to membership inference attacks (MIAs), which aim to infer whether a sample is in the training set. Existing work utilizes gradient ascent to enlarge the loss variance of training data, alleviating the privacy risk. However, optimizing toward a reverse direction may cause the model parameters to oscillate near local minima, leading to instability and suboptimal performance. In this work, we propose a novel method -- Convex Concave Loss (CCL), which enables a high variance of training loss distribution by gradient descent. Our method is motivated by the theoretical analysis that convex losses tend to decrease the loss variance during training. Thus, our key idea behind CCL is to reduce the convexity of loss functions with a concave term. Trained with CCL, neural networks produce losses with high variance for training data, reinforcing the defense against MIAs. Extensive experiments demonstrate the superiority of CCL, achieving a state-of-the-art balance in the privacy-utility trade-off.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenlong Liu;Lei Feng;Huiping Zhuang;Xiaofeng Cao;Hongxin Wei", "authorids": "~Zhenlong_Liu1;~Lei_Feng1;~Huiping_Zhuang2;~Xiaofeng_Cao2;~Hongxin_Wei1", "gender": "M;M;M;M;M", "homepage": ";https://lfeng1995.github.io/;https://hongxin001.github.io/;https://zhuanghp.github.io/;https://xiaofengcaoml.github.io/", "dblp": ";76/847-6;150/6350;194/5829;117/3982-2.html", "google_scholar": ";https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;cABH034AAAAJ;https://scholar.google.com.sg/citations?user=vCXxuLkAAAAJ;", "orcid": ";0000-0003-2839-5799;;0000-0002-4612-5445;", "linkedin": "zhenlong-liu-710719276/?locale=en_US;;;;", "or_profile": "~Zhenlong_Liu1;~Lei_Feng1;~Hongxin_Wei1;~HUIPING_ZHUANG1;~Xiaofeng_Cao1", "aff": "Southern University of Science and Technology;Singapore University of Technology and Design;Southern University of Science and Technology;South China University of Technology;Jilin University", "aff_domain": "sustech.edu.cn;sutd.edu.sg;sustech.edu.cn;scut.edu.cn;jlu.edu.cn", "position": "MS student;Assistant Professor;Assistant Professor;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024mitigating,\ntitle={Mitigating Privacy Risk in Membership Inference by Convex-Concave Loss},\nauthor={Zhenlong Liu and Lei Feng and Huiping Zhuang and Xiaofeng Cao and Hongxin Wei},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tdomF3PW6A}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 696232, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5338720047581085954&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sustech.edu.cn;sutd.edu.sg;sustech.edu.cn;scut.edu.cn;jlu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Southern University of Science and Technology;Singapore University of Technology and Design;South China University of Technology;Jilin University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sustech.edu.cn;https://www.sutd.edu.sg;https://www.scut.edu.cn;http://www.jlu.edu.cn", "aff_unique_abbr": 
"SUSTech;SUTD;SCUT;JLU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;Singapore" }, { "title": "MD tree: a model-diagnostic tree grown on loss landscape", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32861", "id": "teHPKqjX8q", "proceeding": "https://proceedings.mlr.press/v235/zhou24d.html", "pdf": "https://openreview.net/pdf?id=teHPKqjX8q", "openreview": "https://openreview.net/forum?id=teHPKqjX8q", "author_site": "Yefan Zhou, Jianlong Chen, Qinxue Cao, Konstantin Sch\u00fcrholt, Yaoqing Yang", "tldr": "", "abstract": "This paper considers ''model diagnosis'', which we formulate as a classification problem. Given a pre-trained neural network (NN), the goal is to predict the source of failure from a set of failure modes (such as a wrong hyperparameter, inadequate model size, and insufficient data) without knowing the training configuration of the pre-trained NN. The conventional diagnosis approach uses training and validation errors to determine whether the model is underfitting or overfitting. However, we show that rich information about NN performance is encoded in the optimization loss landscape, which provides more actionable insights than validation-based measurements. Therefore, we propose a diagnosis method called MD tree based on loss landscape metrics and experimentally demonstrate its advantage over classical validation-based approaches. We verify the effectiveness of MD tree in multiple practical scenarios: (1) use several models trained on one dataset to diagnose a model trained on another dataset, essentially a few-shot dataset transfer problem; (2) use small models (or models trained with small data) to diagnose big models (or models trained with big data), essentially a scale transfer problem. In a dataset transfer task, MD tree achieves an accuracy of 87.7%, outperforming validation-based approaches by 14.88%. Our code is available at [https://github.com/YefanZhou/ModelDiagnosis](https://github.com/YefanZhou/ModelDiagnosis).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yefan Zhou;Jianlong Chen;Qinxue Cao;Konstantin Sch\u00fcrholt;Yaoqing Yang", "authorids": "~Yefan_Zhou1;~Jianlong_Chen1;~Qinxue_Cao1;~Konstantin_Sch\u00fcrholt1;~Yaoqing_Yang1", "gender": "M;M;;M;M", "homepage": "https://yefanzhou.github.io/;https://mail.zju.edu.cn/coremail/XT5/index.jsp?sid=CAPbiDkDhZOnhmhliJPTJtUlanupgfNl#mail.read%7C%7B%22fid%22%3A1%2C%22mid%22%3A%222%3A1tbiAgEKBWRZDVU3ogAVsp%22%2C%22mboxa%22%3A%22%22%2C%22start%22%3A0%7D;https://github.com/Qin-xue;https://kschuerholt.github.io/;https://sites.google.com/site/yangyaoqingcmu/", "dblp": "237/4333;;;267/9297;04/4176", "google_scholar": "TAeVaicAAAAJ;;;refZxl4AAAAJ;LYvugWgAAAAJ", "orcid": ";;;;0000-0001-9908-5531", "linkedin": "yefan-zhou/;;;https://de.linkedin.com/in/konstantin-schuerholt/en;", "or_profile": "~Yefan_Zhou1;~Jianlong_Chen1;~Qinxue_Cao1;~Konstantin_Sch\u00fcrholt1;~Yaoqing_Yang1", "aff": "Dartmouth College;Zhejiang University;University of Illinois Urbana-Champaign;University of St. 
Gallen;Dartmouth College", "aff_domain": "dartmouth.edu;zju.edu.cn;illinois.edu;unisg.ch;dartmouth.edu", "position": "PhD student;Undergrad student;MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhou2024md,\ntitle={{MD} tree: a model-diagnostic tree grown on loss landscape},\nauthor={Yefan Zhou and Jianlong Chen and Qinxue Cao and Konstantin Sch{\\"u}rholt and Yaoqing Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=teHPKqjX8q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8356850, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18188743004389714674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "dartmouth.edu;zju.edu.cn;illinois.edu;unisg.ch;dartmouth.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Dartmouth College;Zhejiang University;University of Illinois Urbana-Champaign;University of St. Gallen", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.dartmouth.edu;https://www.zju.edu.cn;https://illinois.edu;https://www.unisg.ch", "aff_unique_abbr": "Dartmouth;ZJU;UIUC;HSG", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;0;2;0", "aff_country_unique": "United States;China;Switzerland" }, { "title": "Batch Singular Value Polarization and Weighted Semantic Augmentation for Universal Domain Adaptation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32860", "id": "teteOa9nJ9", "proceeding": "https://proceedings.mlr.press/v235/ziqi24a.html", "pdf": "https://openreview.net/pdf?id=teteOa9nJ9", "openreview": "https://openreview.net/forum?id=teteOa9nJ9", "author_site": "Ziqi Wang, Wei Wang, Chao Huang, Jie Wen, Cong Wang", "tldr": "", "abstract": "As a more challenging domain adaptation setting, universal domain adaptation (UniDA) introduces category shift on top of domain shift, which requires identifying the unknown category in the target domain while avoiding misclassifying target samples into source private categories. To this end, we propose a novel UniDA approach named Batch Singular value Polarization and Weighted Semantic Augmentation (BSP-WSA). Specifically, we adopt an adversarial classifier to identify the target unknown category and align feature distributions between the two domains. Then, we propose to perform SVD on the classifier's outputs to maximize the larger singular values while minimizing the smaller ones, which could prevent target samples from being wrongly assigned to source private classes. To better bridge the domain gap, we propose a weighted semantic augmentation approach for UniDA to generate data on common categories between the two domains.
Extensive experiments on three benchmarks demonstrate that BSP-WSA could outperform existing state-of-the-art UniDA approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "WangZiQi;Wei Wang;Chao Huang;Jie Wen;Cong Wang", "authorids": "~WangZiQi2;~Wei_Wang99;~Chao_Huang16;~Jie_Wen1;~Cong_Wang21", "gender": "M;M;M;;M", "homepage": "https://github.com/BannerOF/;;;;https://scholar.google.com/citations?hl", "dblp": ";;;;", "google_scholar": ";;https://scholar.google.com/citations?hl=zh-CN;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~WangZiQi2;~Wei_Wang99;~Chao_Huang16;~Jie_Wen1;~Cong_Wang21", "aff": "SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY;University of Macau;;Dalian University of Technology", "aff_domain": "mail.sysu.edu.cn;mail.sysu.edu.cn;um.edu.mo;;dlut.edu.cn", "position": "Intern;Postdoc;Intern;;MS student", "bibtex": "@inproceedings{\nwangziqi2024batch,\ntitle={Batch Singular Value Polarization and Weighted Semantic Augmentation for Universal Domain Adaptation},\nauthor={WangZiQi and Wei Wang and Chao Huang and Jie Wen and Cong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=teteOa9nJ9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:V_G74Lf5VSAJ:scholar.google.com/&scioq=Batch+Singular+Value+Polarization+and+Weighted+Semantic+Augmentation+for+Universal+Domain+Adaptation&hl=en&as_sdt=0,33", "gs_version_total": 4, "email": "mail.sysu.edu.cn;mail.sysu.edu.cn;um.edu.mo;;dlut.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Sun Yat-sen University;University of Macau;Dalian University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sysu.edu.cn;https://www.um.edu.mo;http://www.dlut.edu.cn/", "aff_unique_abbr": "SYSU;UM;DUT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Macau SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Keypoint-based Progressive Chain-of-Thought Distillation for LLMs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32859", "id": "tgsSKziIEa", "proceeding": "https://proceedings.mlr.press/v235/feng24e.html", "pdf": "https://openreview.net/pdf?id=tgsSKziIEa", "openreview": "https://openreview.net/forum?id=tgsSKziIEa", "author_site": "Kaituo Feng, Changsheng Li, Xiaolu Zhang, JUN ZHOU, Ye Yuan, Guoren Wang", "tldr": "", "abstract": "Chain-of-thought distillation is a powerful technique for transferring reasoning abilities from large language models (LLMs) to smaller student models. Previous methods typically require the student to mimic the step-by-step rationale produced by LLMs, often facing the following challenges: (i) Tokens within a rationale vary in significance, and treating them equally may fail to accurately mimic keypoint tokens, leading to reasoning errors. (ii) They usually distill knowledge by consistently predicting all the steps in a rationale, which falls short in distinguishing the learning order of step generation. This diverges from the human cognitive progression of starting with easy tasks and advancing to harder ones, resulting in sub-optimal outcomes. To this end, we propose a unified framework, called KPOD, to address these issues. 
Specifically, we propose a token weighting module utilizing mask learning to encourage accurate mimicry of keypoint tokens by the student during distillation. Besides, we develop an in-rationale progressive distillation strategy, starting with training the student to generate the final reasoning steps and gradually extending to cover the entire rationale. To accomplish this, a weighted token generation loss is proposed to assess step reasoning difficulty, and a value function is devised to schedule the progressive distillation by considering both step difficulty and question diversity. Extensive experiments on four reasoning benchmarks illustrate our KPOD outperforms previous methods by a large margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaituo Feng;Changsheng Li;Xiaolu Zhang;JUN ZHOU;Ye Yuan;Guoren Wang", "authorids": "~Kaituo_Feng1;~Changsheng_Li4;~Xiaolu_Zhang2;~JUN_ZHOU6;~Ye_Yuan15;~Guoren_Wang2", "gender": "M;M;F;M;;M", "homepage": "https://github.com/tulerfeng;;https://scholar.google.com/citations?user=cAz9PToAAAAJ;https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en;;https://guorenwang.github.io/", "dblp": "322/6044;;48/5176;99/3847-11;;", "google_scholar": "m1iCh00AAAAJ;FfJnUioAAAAJ;;mCVvloEAAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0001-9789-7632;0000-0001-8055-0245;0000-0001-6033-6102;;", "linkedin": ";;;;;", "or_profile": "~Kaituo_Feng1;~Changsheng_Li4;~Xiaolu_Zhang2;~JUN_ZHOU6;~Ye_Yuan15;~Guoren_Wang2", "aff": "Beijing Institute of Technology;Beijing Institute of Technology;Ant Group;Ant Group;;Beijing Institute of Technology", "aff_domain": "bit.edu.cn;bit.edu.cn;antfin.com;antgroup.com;;bit.edu.cn", "position": "MS student;Full Professor;Researcher;Researcher;;Full Professor", "bibtex": "@inproceedings{\nfeng2024keypointbased,\ntitle={Keypoint-based Progressive Chain-of-Thought Distillation for {LLM}s},\nauthor={Kaituo Feng and Changsheng Li and Xiaolu Zhang and JUN ZHOU and Ye Yuan and Guoren Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tgsSKziIEa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 860906, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4692399033032633254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "bit.edu.cn;bit.edu.cn;antfin.com;antgroup.com;;bit.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Beijing Institute of Technology;Ant Group", "aff_unique_dep": ";", "aff_unique_url": "http://www.bit.edu.cn/;https://www.antgroup.com", "aff_unique_abbr": "BIT;Ant Group", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Offline Actor-Critic Reinforcement Learning Scales to Large Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32858", "id": "tl2qmO5kpD", "proceeding": "https://proceedings.mlr.press/v235/springenberg24a.html", "pdf": "https://openreview.net/pdf?id=tl2qmO5kpD", "openreview": "https://openreview.net/forum?id=tl2qmO5kpD", "author_site": "Jost Springenberg, Abbas Abdolmaleki, Jingwei Zhang, Oliver M Groth, Michael Bloesch, Thomas Lampe, Philemon Brakel, Sarah Bechtle, Steven Kapturowski, Roland Hafner, Nicolas Heess, Martin Riedmiller", "tldr": "", "abstract": "We show that offline actor-critic 
reinforcement learning can scale to large models - such as transformers - and follows scaling laws similar to those of supervised learning. We find that offline actor-critic algorithms can outperform strong, supervised, behavioral cloning baselines for multi-task training on a large dataset containing both sub-optimal and expert behavior on 132 continuous control tasks. We introduce a Perceiver-based actor-critic model and elucidate the key features needed to make offline RL work with self- and cross-attention modules. Overall, we find that: i) simple offline actor-critic algorithms are a natural choice for gradually moving away from the currently predominant paradigm of behavioral cloning, and ii) via offline RL it is possible to learn multi-task policies that master many domains simultaneously, including real robotics tasks, from sub-optimal demonstrations or self-generated data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jost Tobias Springenberg;Abbas Abdolmaleki;Jingwei Zhang;Oliver Groth;Michael Bloesch;Thomas Lampe;Philemon Brakel;Sarah Maria Elisabeth Bechtle;Steven Kapturowski;Roland Hafner;Nicolas Heess;Martin Riedmiller", "authorids": "~Jost_Tobias_Springenberg1;~Abbas_Abdolmaleki3;~Jingwei_Zhang2;~Oliver_Groth1;~Michael_Bloesch1;~Thomas_Lampe1;~Philemon_Brakel1;~Sarah_Maria_Elisabeth_Bechtle1;~Steven_Kapturowski1;~Roland_Hafner1;~Nicolas_Heess1;~Martin_Riedmiller1", "gender": "M;;;M;;;M;F;;Not Specified;;M", "homepage": "http://www.springenberg-tobias.de;;;https://ogroth.github.io/;;;;;;;;https://www.riedmiller.me/", "dblp": ";;;172/0935;40/8368;139/5934;82/10570;180/9966;;19/765;76/9181;", "google_scholar": ";;;https://scholar.google.co.uk/citations?user=h3wt5ocAAAAJ;fn6GhgoAAAAJ;;https://scholar.google.ca/citations?user=Q6UMpRYAAAAJ;https://scholar.google.com/citations?hl=de;;;79k7bGEAAAAJ;1gVfqpcAAAAJ", "orcid": ";;;;;;;;;;;", "linkedin": ";;;olivergroth/;;;;;stevenkapturowski/;;;", "or_profile": "~Jost_Tobias_Springenberg1;~Abbas_Abdolmaleki3;~Jingwei_Zhang2;~Oliver_Groth1;~Michael_Bloesch1;~Thomas_Lampe1;~Philemon_Brakel1;~Sarah_Maria_Elisabeth_Bechtle1;~Steven_Kapturowski1;~Roland_Hafner1;~Nicolas_Heess1;~Martin_Riedmiller1", "aff": "Google DeepMind;Google;;Google DeepMind;Google DeepMind;Google DeepMind;Google/DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;", "aff_domain": "google.com;google.com;;deepmind.com;google.com;deepmind.com;google.com;deepmind.com;deepmind.com;deepmind.com;google.com;", "position": "Researcher;research scientist;;Research Scientist;Research Scientist;Researcher;Research Scientist;Researcher;Staff Research Engineer;Researcher;Research Scientist;", "bibtex": "@inproceedings{\nspringenberg2024offline,\ntitle={Offline Actor-Critic Reinforcement Learning Scales to Large Models},\nauthor={Jost Tobias Springenberg and Abbas Abdolmaleki and Jingwei Zhang and Oliver Groth and Michael Bloesch and Thomas Lampe and Philemon Brakel and Sarah Maria Elisabeth Bechtle and Steven Kapturowski and Roland Hafner and Nicolas Heess and Martin Riedmiller},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tl2qmO5kpD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5332659, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4804325559717083287&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email":
"google.com;google.com;;deepmind.com;google.com;deepmind.com;google.com;deepmind.com;deepmind.com;deepmind.com;google.com;", "author_num": 12, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Rethinking Adversarial Robustness in the Context of the Right to be Forgotten", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32857", "id": "tmUorldOWN", "proceeding": "https://proceedings.mlr.press/v235/zhao24k.html", "pdf": "https://openreview.net/pdf?id=tmUorldOWN", "openreview": "https://openreview.net/forum?id=tmUorldOWN", "author_site": "Chenxu Zhao, Wei Qian, Yangyi Li, Aobo Chen, Mengdi Huai", "tldr": "", "abstract": "The past few years have seen an intense research interest in the practical needs of the \"right to be forgotten\", which has motivated researchers to develop machine unlearning methods to unlearn a fraction of training data and its lineage. While existing machine unlearning methods prioritize the protection of individuals' private data, they overlook investigating the unlearned models' susceptibility to adversarial attacks and security breaches. In this work, we uncover a novel security vulnerability of machine unlearning based on the insight that adversarial vulnerabilities can be bolstered, especially for adversarially robust models. To exploit this observed vulnerability, we propose a novel attack called Adversarial Unlearning Attack (AdvUA), which aims to generate a small fraction of malicious unlearning requests during the unlearning process. AdvUA causes a significant reduction of adversarial robustness in the unlearned model compared to the original model, providing an entirely new capability for adversaries that is infeasible in conventional machine learning pipelines. Notably, we also show that AdvUA can effectively enhance model stealing attacks by extracting additional decision boundary information, further emphasizing the breadth and significance of our research. We also conduct both theoretical analysis and computational complexity of AdvUA. 
Extensive numerical studies are performed to demonstrate the effectiveness and efficiency of the proposed attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenxu Zhao;Wei Qian;Yangyi Li;Aobo Chen;Mengdi Huai", "authorids": "~Chenxu_Zhao4;~Wei_Qian5;~Yangyi_Li1;~Aobo_Chen1;~Mengdi_Huai1", "gender": "M;;M;F;M", "homepage": ";;;https://mdhuai.github.io/;", "dblp": ";326/2983;158/7501;150/8482;", "google_scholar": "n1gDJZQAAAAJ;evF1Es8AAAAJ;unVFhR8AAAAJ;40ZYTzEAAAAJ;6J8ln3QAAAAJ", "orcid": "0000-0002-3298-9218;;;0000-0001-6368-5973;", "linkedin": ";;;;chenxu-zhao-2b6590181/", "or_profile": "~Wei_Qian5;~Yangyi_Li1;~Aobo_Chen1;~Mengdi_Huai1;~CHENXU_ZHAO2", "aff": "Iowa State University;Iowa State University;Iowa State University;Iowa State University;Iowa State University", "aff_domain": "cs.iastate.edu;iastate.edu;iastate.edu;iastate.edu;iastate.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nzhao2024rethinking,\ntitle={Rethinking Adversarial Robustness in the Context of the Right to be Forgotten},\nauthor={Chenxu Zhao and Wei Qian and Yangyi Li and Aobo Chen and Mengdi Huai},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tmUorldOWN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 593210, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=420524086692277025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cs.iastate.edu;iastate.edu;iastate.edu;iastate.edu;iastate.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Diffusion Posterior Sampling is Computationally Intractable", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32856", "id": "tp6ruPIfIV", "proceeding": "https://proceedings.mlr.press/v235/gupta24a.html", "pdf": "https://openreview.net/pdf?id=tp6ruPIfIV", "openreview": "https://openreview.net/forum?id=tp6ruPIfIV", "author_site": "Shivam Gupta, Ajil Jalal, Aditya Parulekar, Eric Price, Zhiyang Xun", "tldr": "", "abstract": "Diffusion models are a remarkably effective way of learning and sampling from a distribution $p(x)$. In posterior sampling, one is also given a measurement model $p(y \\mid x)$ and a measurement $y$, and would like to sample from $p(x \\mid y)$. Posterior sampling is useful for tasks such as inpainting, super-resolution, and MRI reconstruction, so a number of recent works have given algorithms to heuristically approximate it; but none are known to converge to the correct distribution in polynomial time. In this paper we show that posterior sampling is *computationally intractable*: under the most basic assumption in cryptography---that one-way functions exist---there are instances for which *every* algorithm takes superpolynomial time, even though *unconditional* sampling is provably fast. 
We also show that the exponential-time rejection sampling algorithm is essentially optimal under the stronger plausible assumption that there are one-way functions that take exponential time to invert.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shivam Gupta;Ajil Jalal;Aditya Parulekar;Eric Price;Zhiyang Xun", "authorids": "~Shivam_Gupta1;~Ajil_Jalal1;~Aditya_Parulekar1;~Eric_Price1;~Zhiyang_Xun1", "gender": "M;M;M;;M", "homepage": "https://shivamgupta2.github.io/;;https://www.linkedin.com/in/aditya-parulekar-b97899190/;;", "dblp": "29/8830-2;173/5088;293/7171;;307/5331", "google_scholar": "HsbPV-EAAAAJ;ePC7IC0AAAAJ;;;ICzrrFkAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Shivam_Gupta1;~Ajil_Jalal1;~Aditya_Parulekar1;~Eric_Price1;~Zhiyang_Xun1", "aff": "University of Texas, Austin;University of California, Berkeley;University of Texas at Austin;;University of Texas at Austin", "aff_domain": "utexas.edu;berkeley.edu;utexas.edu;;utexas.edu", "position": "PhD student;Postdoc;PhD student;;PhD student", "bibtex": "@inproceedings{\ngupta2024diffusion,\ntitle={Diffusion Posterior Sampling is Computationally Intractable},\nauthor={Shivam Gupta and Ajil Jalal and Aditya Parulekar and Eric Price and Zhiyang Xun},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tp6ruPIfIV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 767186, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10977192164647871709&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "utexas.edu;berkeley.edu;utexas.edu;;utexas.edu", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Texas at Austin;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.berkeley.edu", "aff_unique_abbr": "UT Austin;UC Berkeley", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Austin;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "How to Escape Sharp Minima with Random Perturbations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32855", "id": "tpYHbEl7P1", "proceeding": "https://proceedings.mlr.press/v235/ahn24a.html", "pdf": "https://openreview.net/pdf?id=tpYHbEl7P1", "openreview": "https://openreview.net/forum?id=tpYHbEl7P1", "author_site": "Kwangjun Ahn, Ali Jadbabaie, Suvrit Sra", "tldr": "", "abstract": "Modern machine learning applications have witnessed the remarkable success of optimization algorithms that are designed to find flat minima. Motivated by this design choice, we undertake a formal study that (i) formulates the notion of flat minima, and (ii) studies the complexity of finding them. Specifically, we adopt the trace of the Hessian of the cost function as a measure of flatness, and use it to formally define the notion of approximate flat minima. Under this notion, we then analyze algorithms that find approximate flat minima efficiently. For general cost functions, we discuss a gradient-based algorithm that finds an approximate flat local minimum efficiently. The main component of the algorithm is to use gradients computed from randomly perturbed iterates to estimate a direction that leads to flatter minima. 
For the setting where the cost function is an empirical risk over training data, we present a faster algorithm that is inspired by a recently proposed practical algorithm called sharpness-aware minimization, supporting its success in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kwangjun Ahn;Ali Jadbabaie;Suvrit Sra", "authorids": "~Kwangjun_Ahn2;~Ali_Jadbabaie1;~Suvrit_Sra1", "gender": ";M;", "homepage": "http://kjahn.mit.edu/;http://www.mit.edu/~jadbabai/www;https://optml.mit.edu", "dblp": ";83/3158;90/930", "google_scholar": "z94iNtgAAAAJ;ZBc_WwYAAAAJ;eyCw9goAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Kwangjun_Ahn2;~Ali_Jadbabaie1;~Suvrit_Sra1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nahn2024how,\ntitle={How to Escape Sharp Minima with Random Perturbations},\nauthor={Kwangjun Ahn and Ali Jadbabaie and Suvrit Sra},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tpYHbEl7P1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1131392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4264951420631111681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "mit.edu;mit.edu;mit.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning to Infer Generative Template Programs for Visual Concepts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32854", "id": "ttaTyweIr1", "proceeding": "https://proceedings.mlr.press/v235/jones24a.html", "pdf": "https://openreview.net/pdf?id=ttaTyweIr1", "openreview": "https://openreview.net/forum?id=ttaTyweIr1", "author_site": "R. Kenny Jones, Siddhartha Chaudhuri, Daniel Ritchie", "tldr": "", "abstract": "People grasp flexible visual concepts from a few examples. We explore a neurosymbolic system that learns how to infer programs that capture visual concepts in a domain-general fashion. We introduce Template Programs: programmatic expressions from a domain-specific language that specify structural and parametric patterns common to an input concept. Our framework supports multiple concept-related tasks, including few-shot generation and co-segmentation through parsing. We develop a learning paradigm that allows us to train networks that infer Template Programs directly from visual datasets that contain concept groupings. We run experiments across multiple visual domains: 2D layouts, Omniglot characters, and 3D shapes. We find that our method outperforms task-specific alternatives, and performs competitively against domain-specific approaches for the limited domains where they exist.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "R. 
Kenny Jones;Siddhartha Chaudhuri;Daniel Ritchie", "authorids": "~R._Kenny_Jones1;~Siddhartha_Chaudhuri3;~Daniel_Ritchie1", "gender": "M;M;M", "homepage": "https://rkjones4.github.io/;http://dritchie.github.io;https://sidch.com", "dblp": "274/7070;17/7188.html;42/8074", "google_scholar": "NwVbkmQAAAAJ;0RiypNsAAAAJ;QMc-grEAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~R._Kenny_Jones1;~Daniel_Ritchie1;~Siddhartha_Chaudhuri1", "aff": "Brown University;Brown University;Adobe", "aff_domain": "brown.edu;brown.edu;adobe.com", "position": "PhD student;Assistant Professor;Senior Research Scientist", "bibtex": "@inproceedings{\njones2024learning,\ntitle={Learning to Infer Generative Template Programs for Visual Concepts},\nauthor={R. Kenny Jones and Siddhartha Chaudhuri and Daniel Ritchie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ttaTyweIr1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2926630, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6616541480916777772&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "brown.edu;brown.edu;adobe.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Brown University;Adobe", "aff_unique_dep": ";Adobe Inc.", "aff_unique_url": "https://www.brown.edu;https://www.adobe.com", "aff_unique_abbr": "Brown;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Pairwise Alignment Improves Graph Domain Adaptation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32853", "id": "ttnbM598vZ", "proceeding": "https://proceedings.mlr.press/v235/liu24ci.html", "pdf": "https://openreview.net/pdf?id=ttnbM598vZ", "openreview": "https://openreview.net/forum?id=ttnbM598vZ", "author_site": "Shikun Liu, Deyu Zou, Han Zhao, Pan Li", "tldr": "", "abstract": "Graph-based methods, pivotal for label inference over interconnected objects in many real-world applications, often encounter generalization challenges, if the graph used for model training differs significantly from the graph used for testing. This work delves into Graph Domain Adaptation (GDA) to address the unique complexities of distribution shifts over graph data, where interconnected data points experience shifts in features, labels, and in particular, connecting patterns. We propose a novel, theoretically principled method, Pairwise Alignment (Pair-Align) to counter graph structure shift by mitigating conditional structure shift (CSS) and label shift (LS). Pair-Align uses edge weights to recalibrate the influence among neighboring nodes to handle CSS and adjusts the classification loss with label weights to handle LS. Our method demonstrates superior performance in real-world applications, including node classification with region shift in social networks, and the pileup mitigation task in particle colliding experiments. For the first application, we also curate the largest dataset by far for GDA studies. 
Our method shows strong performance on synthetic and other existing benchmark datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shikun Liu;Deyu Zou;Han Zhao;Pan Li", "authorids": "~Shikun_Liu3;~Deyu_Zou1;~Han_Zhao1;~Pan_Li2", "gender": ";M;M;", "homepage": "https://shikun-liu.com/;https://github.com/unimpor;https://hanzhaoml.github.io/;", "dblp": ";;03/3520-2;https://dblp.org/pers/hd/l/Li_0005:Pan", "google_scholar": "BLafTygAAAAJ;;x942ipYAAAAJ;IroP0EwAAAAJ", "orcid": ";;0000-0002-8579-1600;", "linkedin": ";;;pan-li-b951105a/", "or_profile": "~Shikun_Liu3;~Deyu_Zou1;~Han_Zhao1;~Pan_Li2", "aff": "Georgia Institute of Technology;University of Science and Technology of China;University of Illinois, Urbana Champaign;Purdue University", "aff_domain": "gatech.edu;ustc.edu.cn;illinois.edu;purdue.edu", "position": "PhD student;Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2024pairwise,\ntitle={Pairwise Alignment Improves Graph Domain Adaptation},\nauthor={Shikun Liu and Deyu Zou and Han Zhao and Pan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ttnbM598vZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 728710, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7583554926916254447&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "gatech.edu;ustc.edu.cn;illinois.edu;purdue.edu", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Georgia Institute of Technology;University of Science and Technology of China;University of Illinois Urbana-Champaign;Purdue University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.gatech.edu;http://www.ustc.edu.cn;https://illinois.edu;https://www.purdue.edu", "aff_unique_abbr": "Georgia Tech;USTC;UIUC;Purdue", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "DNCs Require More Planning Steps", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32852", "id": "tu5fCCuua2", "proceeding": "https://proceedings.mlr.press/v235/shamshoum24a.html", "pdf": "https://openreview.net/pdf?id=tu5fCCuua2", "openreview": "https://openreview.net/forum?id=tu5fCCuua2", "author_site": "Yara Shamshoum, Nitzan Hodos, Yuval Sieradzki, Assaf Schuster", "tldr": "", "abstract": "Many recent works use machine learning models to solve various complex algorithmic problems. However, these models attempt to reach a solution without considering the problem's required computational complexity, which can be detrimental to their ability to solve it correctly. In this work we investigate the effect of computational time and memory on generalization of implicit algorithmic solvers. To do so, we focus on the Differentiable Neural Computer (DNC), a general problem solver that also lets us reason directly about its usage of time and memory. In this work, we argue that the number of planning steps the model is allowed to take, which we call \u201cplanning budget\u201d, is a constraint that can cause the model to generalize poorly and hurt its ability to fully utilize its external memory.
We evaluate our method on Graph Shortest Path, Convex Hull, Graph MinCut and Associative Recall, and show how the planning budget can drastically change the behavior of the learned algorithm, in terms of learned time complexity, training time, stability and generalization to inputs larger than those seen during training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yara Shamshoum;Nitzan Hodos;Yuval Sieradzki;Assaf Schuster", "authorids": "~Yara_Shamshoum1;~Nitzan_Hodos1;~Yuval_Sieradzki1;~Assaf_Schuster2", "gender": "F;;M;M", "homepage": ";;;https://assaf.net.technion.ac.il/", "dblp": "379/5983;322/8783;322/9296;s/AssafSchuster", "google_scholar": ";https://scholar.google.com/citations?hl=en;-v1mrtcAAAAJ;https://scholar.google.co.il/citations?user=KfwgjswAAAAJ", "orcid": ";;;0000-0002-3311-6937", "linkedin": "yara-shamshoum-332a571ba/;nitzan-hodos/;;", "or_profile": "~Yara_Shamshoum1;~Nitzan_Hodos1;~Yuval_Sieradzki1;~Assaf_Schuster2", "aff": "Computer Science Department, Technion - Israel Institute of Technology;Taub Faculty of Computer Science, Technion - Israel Institute of Technology;Computer Science Department, Technion - Israel Institute of Technology;Technion - Israel Institute of Technology, Technion", "aff_domain": "cs.technion.ac.il;cs.technion.ac.il;cs.technion.ac.il;technion.ac.il", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nshamshoum2024dncs,\ntitle={{DNC}s Require More Planning Steps},\nauthor={Yara Shamshoum and Nitzan Hodos and Yuval Sieradzki and Assaf Schuster},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tu5fCCuua2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2928796, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IV0701Y3hzgJ:scholar.google.com/&scioq=DNCs+Require+More+Planning+Steps&hl=en&as_sdt=0,34", "gs_version_total": 8, "email": "cs.technion.ac.il;cs.technion.ac.il;cs.technion.ac.il;technion.ac.il", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Faithfulness Measurable Masked Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32851", "id": "tw1PwpuAuN", "proceeding": "https://proceedings.mlr.press/v235/madsen24a.html", "pdf": "https://openreview.net/pdf?id=tw1PwpuAuN", "openreview": "https://openreview.net/forum?id=tw1PwpuAuN", "author_site": "Andreas Madsen, Siva Reddy, Sarath Chandar", "tldr": "", "abstract": "A common approach to explaining NLP models is to use importance measures that express which tokens are important for a prediction. Unfortunately, such explanations are often wrong despite being persuasive. Therefore, it is essential to measure their faithfulness. One such metric is that if tokens are truly important, then masking them should result in worse model performance. However, token masking introduces out-of-distribution issues, and existing solutions that address this are computationally expensive and employ proxy models. Furthermore, other metrics are very limited in scope.
This work proposes an inherently faithfulness measurable model that addresses these challenges. This is achieved using a novel fine-tuning method that incorporates masking, such that masking tokens become in-distribution by design. This differs from existing approaches, which are completely model-agnostic but are inapplicable in practice. We demonstrate the generality of our approach by applying it to 16 different datasets and validate it using statistical in-distribution tests. The faithfulness is then measured with 9 different importance measures. Because masking is in-distribution, importance measures that themselves use masking become consistently more faithful. Additionally, because the model makes faithfulness cheap to measure, we can optimize explanations towards maximal faithfulness; thus, our model becomes indirectly inherently explainable.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andreas Madsen;Siva Reddy;Sarath Chandar", "authorids": "~Andreas_Madsen1;~Siva_Reddy1;~Sarath_Chandar1", "gender": "M;M;M", "homepage": "https://andreasmadsen.github.io/;http://sivareddy.in;http://sarathchandar.in/", "dblp": "250/2642;64/8153;45/8542", "google_scholar": "X0zwAXYAAAAJ;;https://scholar.google.co.in/citations?user=yxWtZLAAAAAJ", "orcid": "0000-0002-1487-2796;;", "linkedin": "https://linkedin.com/in/andreasmad/;;", "or_profile": "~Andreas_Madsen1;~Siva_Reddy1;~Sarath_Chandar1", "aff": "Mila;Mila, McGill University;\u00c9cole Polytechnique de Montr\u00e9al", "aff_domain": "mila.quebec;mila.quebec;polymtl.ca", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nmadsen2024faithfulness,\ntitle={Faithfulness Measurable Masked Language Models},\nauthor={Andreas Madsen and Siva Reddy and Sarath Chandar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tw1PwpuAuN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3056868, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3700505424873841878&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "mila.quebec;mila.quebec;polymtl.ca", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Mila;McGill University;\u00c9cole Polytechnique de Montr\u00e9al", "aff_unique_dep": "Quebec Artificial Intelligence Institute;Mila;", "aff_unique_url": "https://mila.quebec;https://www.mcgill.ca;https://www.polymtl.ca", "aff_unique_abbr": "Mila;McGill;Polytechnique Montr\u00e9al", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Montreal;Montr\u00e9al", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Bivariate Causal Discovery using Bayesian Model Selection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32850", "id": "twm7qPVX1F", "proceeding": "https://proceedings.mlr.press/v235/dhir24a.html", "pdf": "https://openreview.net/pdf?id=twm7qPVX1F", "openreview": "https://openreview.net/forum?id=twm7qPVX1F", "author_site": "Anish Dhir, Samuel Power, Mark van der Wilk", "tldr": "", "abstract": "Much of the causal discovery literature prioritises guaranteeing the identifiability of causal direction in statistical models. For structures within a Markov equivalence class, this requires strong assumptions which may not hold in real-world datasets, ultimately limiting the usability of these methods. 
Building on previous attempts, we show how to incorporate causal assumptions within the Bayesian framework. Identifying causal direction then becomes a Bayesian model selection problem. This enables us to construct models with realistic assumptions, and consequently allows for the differentiation between Markov equivalent causal structures. We analyse why Bayesian model selection works in situations where methods based on maximum likelihood fail. To demonstrate our approach, we construct a Bayesian non-parametric model that can flexibly model the joint distribution. We then outperform previous methods on a wide range of benchmark datasets with varying data generating assumptions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anish Dhir;Samuel Power;Mark van der Wilk", "authorids": "~Anish_Dhir1;~Samuel_Power1;~Mark_van_der_Wilk1", "gender": "M;M;M", "homepage": ";https://sites.google.com/view/sp-monte-carlo/;https://mvdw.uk", "dblp": "251/9010;;142/2927", "google_scholar": "nuA78i0AAAAJ;ePQTKrEAAAAJ;PKcjcT4AAAAJ", "orcid": ";0000-0001-8644-8014;0000-0001-7947-6682", "linkedin": ";samuel-power-6308b02b/;", "or_profile": "~Anish_Dhir1;~Samuel_Power1;~Mark_van_der_Wilk1", "aff": "Amazon;University of Bristol;University of Oxford", "aff_domain": "amazon.de;bristol.ac.uk;cs.ox.ac.uk", "position": "Intern;Postdoc;Associate Professor", "bibtex": "@inproceedings{\ndhir2024bivariate,\ntitle={Bivariate Causal Discovery using Bayesian Model Selection},\nauthor={Anish Dhir and Samuel Power and Mark van der Wilk},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=twm7qPVX1F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 599740, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13559279204286499713&as_sdt=1005&sciodt=0,4&hl=en", "gs_version_total": 7, "email": "amazon.de;bristol.ac.uk;cs.ox.ac.uk", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Amazon;University of Bristol;University of Oxford", "aff_unique_dep": "Amazon.com, Inc.;;", "aff_unique_url": "https://www.amazon.com;https://www.bristol.ac.uk;https://www.ox.ac.uk", "aff_unique_abbr": "Amazon;Bristol;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Asymmetry in Low-Rank Adapters of Foundation Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32849", "id": "txRZBD8tBV", "proceeding": "https://proceedings.mlr.press/v235/zhu24c.html", "pdf": "https://openreview.net/pdf?id=txRZBD8tBV", "openreview": "https://openreview.net/forum?id=txRZBD8tBV", "author_site": "Jiacheng Zhu, Kristjan Greenewald, Kimia Nadjahi, Haitz S\u00e1ez de Oc\u00e1riz Borde, Rickard Gabrielsson, Leshem Choshen, Marzyeh Ghassemi, Mikhail Yurochkin, Justin Solomon", "tldr": "", "abstract": "Parameter-efficient fine-tuning optimizes large, pre-trained foundation models by updating a subset of parameters; in this class, Low-Rank Adaptation (LoRA) is particularly effective. Inspired by an effort to investigate the different roles of LoRA matrices during fine-tuning, this paper characterizes and leverages unexpected asymmetry in the importance of low-rank adapter matrices. 
Specifically, when updating the parameter matrices of a neural network by adding a product $BA$, we observe that the $B$ and $A$ matrices have distinct functions: $A$ extracts features from the input, while $B$ uses these features to create the desired output. Based on this observation, we demonstrate that fine-tuning $B$ is inherently more effective than fine-tuning $A$, and that a random untrained $A$ should perform nearly as well as a fine-tuned one. Using an information-theoretic lens, we also bound the generalization of low-rank adapters, showing that the parameter savings of exclusively training $B$ improves the bound. We support our conclusions with experiments on RoBERTa, BART-Large, LLaMA-2, and ViTs. The code and data are available at https://github.com/Jiacheng-Zhu-AIML/AsymmetryLoRA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiacheng Zhu;Kristjan Greenewald;Kimia Nadjahi;Haitz S\u00e1ez de Oc\u00e1riz Borde;Rickard Br\u00fcel Gabrielsson;Leshem Choshen;Marzyeh Ghassemi;Mikhail Yurochkin;Justin Solomon", "authorids": "~Jiacheng_Zhu1;~Kristjan_Greenewald1;~Kimia_Nadjahi1;~Haitz_S\u00e1ez_de_Oc\u00e1riz_Borde1;~Rickard_Br\u00fcel_Gabrielsson1;~Leshem_Choshen1;~Marzyeh_Ghassemi2;~Mikhail_Yurochkin1;~Justin_Solomon1", "gender": "M;;F;M;Not Specified;Not Specified;F;M;M", "homepage": "https://jiachengzhuml.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=ibm-Kristjan.H.Greenewald;http://kimiandj.github.io/;https://www.linkedin.com/in/haitz-s%C3%A1ez-de-oc%C3%A1riz-borde-0933a9199/;http://bruel.org/;https://ktilana.wixsite.com/leshem-choshen;https://www.healthyml.org/;https://moonfolk.github.io/;http://people.csail.mit.edu/jsolomon/", "dblp": "40/10195;146/0563;236/4646;;228/6813;218/5237;145/6563;191/6719;80/5094", "google_scholar": "rKUnBPgAAAAJ;L3zNUG4AAAAJ;x0_peq4AAAAJ;aP0OakUAAAAJ;y9Oh5XwAAAAJ;https://scholar.google.com/citations?hl=en;;QjBF9sUAAAAJ;pImSVwoAAAAJ", "orcid": ";;;;;0000-0002-0085-6496;;;0000-0002-7701-7586", "linkedin": ";;;;;leshemchoshen/;;mikhail-yurochkin-a45659114/;justin-solomon-8a587914/", "or_profile": "~Jiacheng_Zhu1;~Kristjan_Greenewald1;~Kimia_Nadjahi1;~Haitz_S\u00e1ez_de_Oc\u00e1riz_Borde1;~Rickard_Br\u00fcel_Gabrielsson1;~Leshem_Choshen1;~Marzyeh_Ghassemi2;~Mikhail_Yurochkin1;~Justin_Solomon1", "aff": "Massachusetts Institute of Technology;MIT-IBM Watson AI Lab, IBM Research;Massachusetts Institute of Technology;University of Oxford;Massachusetts Institute of Technology;International Business Machines;Massachusetts Institute of Technology;IBM Research;Massachusetts Institute of Technology", "aff_domain": "mit.edu;ibm.com;mit.edu;ox.ac.uk;mit.edu;ibm.com;mit.edu;ibm.com;mit.edu", "position": "Postdoc;Research Scientist;Postdoc;PhD student;PhD student;Researcher;Assistant Professor;Researcher;Associate Professor", "bibtex": "@inproceedings{\nzhu2024asymmetry,\ntitle={Asymmetry in Low-Rank Adapters of Foundation Models},\nauthor={Jiacheng Zhu and Kristjan Greenewald and Kimia Nadjahi and Haitz S{\\'a}ez de Oc{\\'a}riz Borde and Rickard Br{\\"u}el Gabrielsson and Leshem Choshen and Marzyeh Ghassemi and Mikhail Yurochkin and Justin Solomon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=txRZBD8tBV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 830237, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 37, "gs_cited_by_link":
"https://scholar.google.com/scholar?cites=2752947517582847656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "mit.edu;ibm.com;mit.edu;ox.ac.uk;mit.edu;ibm.com;mit.edu;ibm.com;mit.edu", "author_num": 9, "aff_unique_index": "0;1;0;2;0;3;0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;IBM;University of Oxford;International Business Machines Corporation", "aff_unique_dep": ";AI Lab;;", "aff_unique_url": "https://web.mit.edu;https://www.ibmwatsonai.org/;https://www.ox.ac.uk;https://www.ibm.com", "aff_unique_abbr": "MIT;MIT-IBM AI Lab;Oxford;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Masked Face Recognition with Generative-to-Discriminative Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32848", "id": "tya725xlZ3", "proceeding": "https://proceedings.mlr.press/v235/ge24a.html", "pdf": "https://openreview.net/pdf?id=tya725xlZ3", "openreview": "https://openreview.net/forum?id=tya725xlZ3", "author_site": "Shiming Ge, Weijia Guo, Chenyu Li, Zhang Junzheng, Yong Li, Dan Zeng", "tldr": "", "abstract": "Masked face recognition is important for social good but challenged by diverse occlusions that cause insufficient or inaccurate representations. In this work, we propose a unified deep network to learn generative-to-discriminative representations for facilitating masked face recognition. To this end, we split the network into three modules and learn them on synthetic masked faces in a greedy module-wise pretraining manner. First, we leverage a generative encoder pretrained for face inpainting and finetune it to represent masked faces into category-aware descriptors. Attribute to the generative encoder's ability in recovering context information, the resulting descriptors can provide occlusion-robust representations for masked faces, mitigating the effect of diverse masks. Then, we incorporate a multi-layer convolutional network as a discriminative reformer and learn it to convert the category-aware descriptors into identity-aware vectors, where the learning is effectively supervised by distilling relation knowledge from off-the-shelf face recognition model. In this way, the discriminative reformer together with the generative encoder serves as the pretrained backbone, providing general and discriminative representations towards masked faces. Finally, we cascade one fully-connected layer following by one softmax layer into a feature classifier and finetune it to identify the reformed identity-aware vectors. 
Extensive experiments on synthetic and realistic datasets demonstrate the effectiveness of our approach in recognizing masked faces.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiming Ge;Weijia Guo;Chenyu Li;Zhang Junzheng;Yong Li;Dan Zeng", "authorids": "~Shiming_Ge1;guoweijia@iie.ac.cn;lichenyu@iie.ac.cn;~Zhang_Junzheng1;liyong@iie.ac.cn;~Dan_Zeng2", "gender": "M;;;M;;", "homepage": ";;;https://www.researchgate.net/profile/Zhang-Junzheng-6;;", "dblp": "93/8104.html;;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Shiming_Ge1;guoweijia@iie.ac.cn;lichenyu@iie.ac.cn;~Zhang_Junzheng1;liyong@iie.ac.cn;~Dan_Zeng2", "aff": "Institute of Information Engineering, Chinese Academy of Sciences;;;University of Chinese Academy of Sciences;;", "aff_domain": "iie.ac.cn;;;ucas.ac.cn;;", "position": "Researcher;;;Undergrad student;;", "bibtex": "@inproceedings{\nge2024masked,\ntitle={Masked Face Recognition with Generative-to-Discriminative Representations},\nauthor={Shiming Ge and Weijia Guo and Chenyu Li and Zhang Junzheng and Yong Li and Dan Zeng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tya725xlZ3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6444241852755063861&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "iie.ac.cn;;;ucas.ac.cn;;", "author_num": 6, "aff_unique_index": "0;1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_unique_dep": "Institute of Information Engineering;", "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "CAS;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Multi-Factor Adaptive Vision Selection for Egocentric Video Question Answering", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32847", "id": "u00dmbI8Db", "proceeding": "https://proceedings.mlr.press/v235/zhang24aj.html", "pdf": "https://openreview.net/pdf?id=u00dmbI8Db", "openreview": "https://openreview.net/forum?id=u00dmbI8Db", "author_site": "Haoyu Zhang, Meng Liu, Zixin Liu, Xuemeng Song, Yaowei Wang, Liqiang Nie", "tldr": "", "abstract": "The challenge of interpreting the world from a human perspective in Artificial Intelligence (AI) is particularly evident in egocentric video question answering, which grapples with issues like small object recognition, noise suppression, and spatial-temporal reasoning. To address these challenges, we introduce the Multi-Factor Adaptive vision Selection (MFAS) framework. MFAS integrates a patch partition and merging module for enhanced small object recognition, a prior-guided patch selection module for noise suppression and focused analysis, and a hierarchical aggregation network to aggregate visual semantics guided by questions. Extensive experiments on several public egocentric datasets have validated the effectiveness and generalization of our framework. 
Code and data are available at https://github.com/Hyu-Zhang/EgoVideoQA.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyu Zhang;Meng Liu;Zixin Liu;Xuemeng Song;Yaowei Wang;Liqiang Nie", "authorids": "~Haoyu_Zhang4;~Meng_Liu4;liuzixin1126@gmail.com;~Xuemeng_Song2;~Yaowei_Wang1;~Liqiang_Nie2", "gender": "M;F;;F;M;M", "homepage": "https://hyu-zhang.github.io/;https://mengliu1991.github.io;;https://xuemengsong.github.io/;https://dblp.org/pid/68/2992.html;https://liqiangnie.github.io/index.html", "dblp": ";41/7841-6;;147/9141.html;68/2992-1;92/8277", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;yywVMhUAAAAJ", "orcid": "0000-0002-3896-170X;0000-0002-1582-5764;;0000-0002-5274-4197;0000-0002-6110-4036;0000-0003-1476-0273", "linkedin": ";;;;yaowei-wang-971ab310/;", "or_profile": "~Haoyu_Zhang4;~Meng_Liu4;liuzixin1126@gmail.com;~Xuemeng_Song2;~Yaowei_Wang1;~Liqiang_Nie2", "aff": "Harbin Institute of Technology;Shandong Jianzhu University;;Shandong University;Pengcheng Laboratory;Shandong University", "aff_domain": "hit.edu.cn;sdjzu.edu.cn;;sdu.edu.cn;pcl.ac.cn;sdu.edu.cn", "position": "PhD student;Full Professor;;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nzhang2024multifactor,\ntitle={Multi-Factor Adaptive Vision Selection for Egocentric Video Question Answering},\nauthor={Haoyu Zhang and Meng Liu and Zixin Liu and Xuemeng Song and Yaowei Wang and Liqiang Nie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u00dmbI8Db}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4579104, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16494583920603111049&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "hit.edu.cn;sdjzu.edu.cn;;sdu.edu.cn;pcl.ac.cn;sdu.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Harbin Institute of Technology;Shandong Jianzhu University;Shandong University;Pengcheng Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hit.edu.cn/;http://www.sdjzu.edu.cn;http://www.sdu.edu.cn;", "aff_unique_abbr": "HIT;SDJU;SDU;", "aff_campus_unique_index": "0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Any-Precision LLM: Low-Cost Deployment of Multiple, Different-Sized LLMs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32846", "id": "u09gadH3BU", "proceeding": "https://proceedings.mlr.press/v235/park24e.html", "pdf": "https://openreview.net/pdf?id=u09gadH3BU", "openreview": "https://openreview.net/forum?id=u09gadH3BU", "author_site": "Yeonhong Park, Jake Hyun, SangLyul Cho, Bonggeun Sim, Jae W. Lee", "tldr": "", "abstract": "Recently, considerable efforts have been directed towards compressing Large Language Models (LLMs), which showcase groundbreaking capabilities across diverse applications but entail significant deployment costs due to their large sizes. Meanwhile, much less attention has been given to mitigating the costs associated with deploying multiple LLMs of varying sizes despite its practical significance. Thus, this paper introduces any-precision LLM, extending the concept of any-precision DNN to LLMs.
Addressing challenges in any-precision LLM, we propose a lightweight method for any-precision quantization of LLMs, leveraging a post-training quantization framework, and develop a specialized software engine for its efficient serving. As a result, our solution significantly reduces the high costs of deploying multiple, different-sized LLMs by overlaying LLMs quantized to varying bit-widths, such as 3, 4, ..., $n$ bits, into a memory footprint comparable to a single $n$-bit LLM. All the supported LLMs with varying bit-widths demonstrate state-of-the-art model quality and inference throughput, proving our solution to be a compelling option for the deployment of multiple, different-sized LLMs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yeonhong Park;Jake Hyun;SangLyul Cho;Bonggeun Sim;Jae W. Lee", "authorids": "~Yeonhong_Park1;~Jake_Hyun1;~SangLyul_Cho1;~Bonggeun_Sim1;~Jae_W._Lee1", "gender": "M;M;M;;M", "homepage": "http://arc.snu.ac.kr/people/pyh/index.html;https://syphonarch.github.io;https://github.com/chosanglyul/;;https://iamjaelee.github.io/www/", "dblp": "259/2571;;;;21/4685", "google_scholar": "https://scholar.google.co.kr/citations?user=eZN-njgAAAAJ;;;;PA-QN6IAAAAJ", "orcid": ";;;;0000-0002-4266-4919", "linkedin": ";jake-hyun-556b88280/;sanglyul-cho-69b553246/;bonggeun-sim-620515143/;jae-w-lee-6486796/", "or_profile": "~Yeonhong_Park1;~Jake_Hyun1;~SangLyul_Cho1;~Bonggeun_Sim1;~Jae_W._Lee1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;Undergrad student;Undergrad student;PhD student;Full Professor", "bibtex": "@inproceedings{\npark2024anyprecision,\ntitle={Any-Precision {LLM}: Low-Cost Deployment of Multiple, Different-Sized {LLM}s},\nauthor={Yeonhong Park and Jake Hyun and SangLyul Cho and Bonggeun Sim and Jae W. Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u09gadH3BU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1360759, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13512688553929728412&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Improving Antibody Humanness Prediction using Patent Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32845", "id": "u26c52rxZC", "proceeding": "https://proceedings.mlr.press/v235/ucar24a.html", "pdf": "https://openreview.net/pdf?id=u26c52rxZC", "openreview": "https://openreview.net/forum?id=u26c52rxZC", "author_site": "Talip Ucar, Aubin Ramon, Dino Oglic, Rebecca Croasdale-Wood, Tom Diethe, Pietro Sormanni", "tldr": "", "abstract": "We investigate the potential of patent data for improving antibody humanness prediction using a multi-stage, multi-loss training process.
Humanness serves as a proxy for the immunogenic response to antibody therapeutics, one of the major causes of attrition in drug discovery and a challenging obstacle for their use in clinical settings. We pose the initial learning stage as a weakly-supervised contrastive-learning problem, where each antibody sequence is associated with possibly multiple identifiers of function and the objective is to learn an encoder that groups them according to their patented properties. We then freeze a part of the contrastive encoder and continue training it on the patent data using the cross-entropy loss to predict the humanness score of a given antibody sequence. We illustrate the utility of the patent data and our approach by performing inference on three different immunogenicity datasets, unseen during training. Our empirical results demonstrate that the learned model consistently outperforms the alternative baselines and establishes a new state of the art on five out of six inference tasks, irrespective of the metric used.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Talip Ucar;Aubin Ramon;Dino Oglic;Rebecca Croasdale-Wood;Tom Diethe;Pietro Sormanni", "authorids": "~Talip_Ucar2;ar2033@cam.ac.uk;~Dino_Oglic1;rebecca.croasdale-wood@astrazeneca.com;~Tom_Diethe1;~Pietro_Sormanni1", "gender": ";;M;;M;", "homepage": ";;https://doglic.bitbucket.io;;http://www.tomdiethe.com;http://www-sormanni.ch.cam.ac.uk", "dblp": ";;150/2759;;33/1098;", "google_scholar": ";;ewbMwjYAAAAJ;;https://scholar.google.co.uk/citations?user=oWGk9c8AAAAJ;", "orcid": ";;;;0000-0002-0776-5407;", "linkedin": ";;;;tomdiethe/;", "or_profile": "~Talip_Ucar2;ar2033@cam.ac.uk;~Dino_Oglic1;rebecca.croasdale-wood@astrazeneca.com;~Tom_Diethe1;~Pietro_Sormanni1", "aff": ";;AstraZeneca UK;;AstraZeneca;University of Cambridge", "aff_domain": ";;astrazeneca.com;;astrazeneca.com;cam.ac.uk", "position": ";;Researcher;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nucar2024improving,\ntitle={Improving Antibody Humanness Prediction using Patent Data},\nauthor={Talip Ucar and Aubin Ramon and Dino Oglic and Rebecca Croasdale-Wood and Tom Diethe and Pietro Sormanni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u26c52rxZC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9607626, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14078106471613446991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": ";;astrazeneca.com;;astrazeneca.com;cam.ac.uk", "author_num": 6, "aff_unique_index": "0;0;1", "aff_unique_norm": "AstraZeneca;University of Cambridge", "aff_unique_dep": ";", "aff_unique_url": "https://www.astrazeneca.com;https://www.cam.ac.uk", "aff_unique_abbr": "AZ;Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "STELLA: Continual Audio-Video Pre-training with SpatioTemporal Localized Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32844", "id": "u4VR3WBH7a", "proceeding": "https://proceedings.mlr.press/v235/lee24ac.html", "pdf": "https://openreview.net/pdf?id=u4VR3WBH7a", "openreview": "https://openreview.net/forum?id=u4VR3WBH7a", "author_site": "Jaewoo Lee, Jaehong Yoon, Wonjae Kim, Yunji Kim, Sung Ju Hwang", "tldr": "", "abstract":
"Continuously learning a variety of audio-video semantics over time is crucial for audio-related reasoning tasks in our ever-evolving world. However, this is a nontrivial problem and poses two critical challenges: sparse spatio-temporal correlation between audio-video pairs and multimodal correlation overwriting that forgets audio-video relations. To tackle this problem, we propose a new continual audio-video pre-training method with two novel ideas: (1) Localized Patch Importance Scoring: we introduce a multimodal encoder to determine the importance score for each patch, emphasizing semantically intertwined audio-video patches. (2) Replay-guided Correlation Assessment: to reduce the corruption of previously learned audiovisual knowledge due to drift, we propose to assess the correlation of the current patches on the past steps to identify the patches exhibiting high correlations with the past steps. Based on the results from the two ideas, we perform probabilistic patch selection for effective continual audio-video pre-training. Experimental validation on multiple benchmarks shows that our method achieves a $3.69\\%$p of relative performance gain in zero-shot retrieval tasks compared to strong continual learning baselines, while reducing memory consumption by $\\sim 45 \\%$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaewoo Lee;Jaehong Yoon;Wonjae Kim;Yunji Kim;Sung Ju Hwang", "authorids": "~Jaewoo_Lee4;~Jaehong_Yoon1;~Wonjae_Kim1;~Yunji_Kim1;~Sung_Ju_Hwang1", "gender": "M;M;M;;", "homepage": "https://g-jwlee.github.io/;https://jaehong31.github.io/;https://wonjae.kim;;", "dblp": ";203/4449;158/3433;;", "google_scholar": ";-5comoUAAAAJ;https://scholar.google.co.kr/citations?user=UpZ41EwAAAAJ;;", "orcid": ";;0000-0002-6616-7685;;", "linkedin": ";jaehongyoon/;;;", "or_profile": "~Jaewoo_Lee4;~Jaehong_Yoon1;~Wonjae_Kim1;~Yunji_Kim1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;University of North Carolina at Chapel Hill;NAVER;;", "aff_domain": "kaist.edu;unc.edu;navercorp.com;;", "position": "MS student;Postdoc;Research Scientist;;", "bibtex": "@inproceedings{\nlee2024stella,\ntitle={{STELLA}: Continual Audio-Video Pre-training with SpatioTemporal Localized Alignment},\nauthor={Jaewoo Lee and Jaehong Yoon and Wonjae Kim and Yunji Kim and Sung Ju Hwang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u4VR3WBH7a}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5489970, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=985692354827353156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "kaist.edu;unc.edu;navercorp.com;;", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of North Carolina;NAVER Corporation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.unc.edu;https://www.naver.com", "aff_unique_abbr": "KAIST;UNC;NAVER", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;1;0", "aff_country_unique": "South Korea;United States" }, { "title": "Position: Evolving AI Collectives Enhance Human Diversity and Enable Self-Regulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32843", "id": "u6PeRHEsjL", "proceeding": 
"https://proceedings.mlr.press/v235/lai24b.html", "pdf": "https://openreview.net/pdf?id=u6PeRHEsjL", "openreview": "https://openreview.net/forum?id=u6PeRHEsjL", "author_site": "Shiyang Lai, Yujin Potter, Junsol Kim, Richard Zhuang, Dawn Song, James Evans", "tldr": "", "abstract": "Large language model behavior is shaped by the language of those with whom they interact. This capacity and their increasing prevalence online portend that they will intentionally or unintentionally \"program\" one another and form emergent AI subjectivities, relationships, and collectives. Here, we call upon the research community to investigate these \"societies\" of interacting artificial intelligences to increase their rewards and reduce their risks for human society and the health of online environments. We use a small \"community\" of models and their evolving outputs to illustrate how such emergent, decentralized AI collectives can spontaneously expand the bounds of human diversity and reduce the risk of toxic, anti-social behavior online. Finally, we discuss opportunities for AI cross-moderation and address ethical issues and design challenges associated with creating and maintaining free-formed AI collectives.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiyang Lai;Yujin Potter;Junsol Kim;Richard Zhuang;Dawn Song;James Evans", "authorids": "shiyanglai@uchicago.edu;~Yujin_Potter1;~Junsol_Kim1;richardzhuang0412@berkeley.edu;~Dawn_Song1;~James_Evans1", "gender": ";F;;;F;M", "homepage": ";;https://github.com/junsolkim;;;https://macss.uchicago.edu/directory/James-Evans", "dblp": ";;;;s/DXSong;", "google_scholar": ";ZDG9RD8AAAAJ;;;;kV4N4zoAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "shiyanglai@uchicago.edu;~Yujin_Potter1;~Junsol_Kim1;richardzhuang0412@berkeley.edu;~Dawn_Song1;~James_Evans1", "aff": ";University of California, Berkeley;University of Chicago;;University of California, Berkeley;University of Chicago", "aff_domain": ";berkeley.edu;uchicago.edu;;berkeley.edu;uchicago.edu", "position": ";Postdoc;PhD student;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlai2024position,\ntitle={Position: Evolving {AI} Collectives Enhance Human Diversity and Enable Self-Regulation},\nauthor={Shiyang Lai and Yujin Potter and Junsol Kim and Richard Zhuang and Dawn Song and James Evans},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u6PeRHEsjL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 771033, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16320156383272837588&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": ";berkeley.edu;uchicago.edu;;berkeley.edu;uchicago.edu", "author_num": 6, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of California, Berkeley;University of Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.uchicago.edu", "aff_unique_abbr": "UC Berkeley;UChicago", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Image Compression with Text-guided Encoding for both Pixel-level and Perceptual Fidelity", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32842", "id": "u8TZ9gm4im", "proceeding": 
"https://proceedings.mlr.press/v235/lee24n.html", "pdf": "https://openreview.net/pdf?id=u8TZ9gm4im", "openreview": "https://openreview.net/forum?id=u8TZ9gm4im", "author_site": "Hagyeong Lee, Minkyu Kim, Jun-Hyuk Kim, Seungeon Kim, Dokwan Oh, Jaeho Lee", "tldr": "", "abstract": "Recent advances in text-guided image compression have shown great potential to enhance the perceptual quality of reconstructed images. These methods, however, tend to have significantly degraded pixel-wise fidelity, limiting their practicality. To fill this gap, we develop a new text-guided image compression algorithm that achieves both high perceptual and pixel-wise fidelity. In particular, we propose a compression framework that leverages text information mainly by text-adaptive encoding and training with joint image-text loss. By doing so, we avoid decoding based on text-guided generative models---known for high generative diversity---and effectively utilize the semantic information of text at a global level. Experimental results on various datasets show that our method can achieve high pixel-level and perceptual quality, with either human- or machine-generated captions. In particular, our method outperforms all baselines in terms of LPIPS, with some room for even more improvements when we use more carefully generated captions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hagyeong Lee;Minkyu Kim;Jun-Hyuk Kim;Seungeon Kim;Dokwan Oh;Jaeho Lee", "authorids": "~Hagyeong_Lee1;~Minkyu_Kim4;~Jun-Hyuk_Kim1;~Seungeon_Kim1;~Dokwan_Oh1;~Jaeho_Lee3", "gender": "F;M;M;M;M;M", "homepage": "https://hagyeonglee.github.io/;https://velog.io/@minkyu4506/posts;https://junhyukk.github.io/;;;https://jaeho-lee.github.io", "dblp": ";;193/6547;;274/9571;78/6080-1", "google_scholar": "qZcf4DQAAAAJ;;A0io6mQAAAAJ;https://scholar.google.com/citations?hl=ko;;t91zoQMAAAAJ", "orcid": ";;;0000-0002-8370-8631;;", "linkedin": "hagyeonglee;;;;dokwan-oh-18a26572/;", "or_profile": "~Hagyeong_Lee1;~Minkyu_Kim4;~Jun-Hyuk_Kim1;~Seungeon_Kim1;~Dokwan_Oh1;~Jaeho_Lee3", "aff": "Pohang University of Science and Technology;POSTECH;Samsung Advanced Institute of Technology;Samsung;Samsung Advanced Institute of Technology;Pohang University of Science and Technology", "aff_domain": "postech.ac.kr;postech.ac.kr;samsung.com;samsung.com;samsung.com;postech.ac.kr", "position": "MS student;MS student;Researcher;Researcher;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nlee2024neural,\ntitle={Neural Image Compression with Text-guided Encoding for both Pixel-level and Perceptual Fidelity},\nauthor={Hagyeong Lee and Minkyu Kim and Jun-Hyuk Kim and Seungeon Kim and Dokwan Oh and Jaeho Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u8TZ9gm4im}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5007266, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13738273632613128059&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "postech.ac.kr;postech.ac.kr;samsung.com;samsung.com;samsung.com;postech.ac.kr", "author_num": 6, "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "Pohang University of Science and Technology;Samsung", "aff_unique_dep": ";Samsung Advanced Institute of Technology", "aff_unique_url": "https://www.postech.ac.kr;https://www.sait.samsung.com", "aff_unique_abbr": "POSTECH;SAIT", 
"aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Empowering Graph Invariance Learning with Deep Spurious Infomax", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32841", "id": "u9oSQtujCF", "proceeding": "https://proceedings.mlr.press/v235/yao24a.html", "pdf": "https://openreview.net/pdf?id=u9oSQtujCF", "openreview": "https://openreview.net/forum?id=u9oSQtujCF", "author_site": "Tianjun Yao, Yongqiang Chen, Zhenhao Chen, Kai Hu, Zhiqiang Shen, Kun Zhang", "tldr": "", "abstract": "Recently, there has been a surge of interest in developing graph neural networks that utilize the invariance principle on graphs to generalize the out-of-distribution (OOD) data. Due to the limited knowledge about OOD data, existing approaches often pose assumptions about the correlation strengths of the underlying spurious features and the target labels. However, this prior is often unavailable and will change arbitrarily in the real-world scenarios, which may lead to severe failures of the existing graph invariance learning methods. To bridge this gap, we introduce a novel graph invariance learning paradigm, which induces a robust and general inductive bias, which is built upon the observation that the infomax principle encourages learning spurious features regardless of spurious correlation strengths. We further propose the EQuAD framework that realizes this learning paradigm and employs tailored learning objectives that provably elicit invariant features by disentangling them from the spurious features learned through infomax. Notably, EQuAD shows stable and enhanced performance across different degrees of bias in synthetic datasets and challenging real-world datasets up to 31.76%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianjun Yao;Yongqiang Chen;Zhenhao Chen;Kai Hu;Zhiqiang Shen;Kun Zhang", "authorids": "~Tianjun_Yao1;~Yongqiang_Chen1;~Zhenhao_Chen1;~Kai_Hu2;~Zhiqiang_Shen1;~Kun_Zhang1", "gender": ";;M;M;;M", "homepage": ";https://lfhase.win;https://zhenhaochenofficial.github.io/;https://github.com/hukkai;;http://www.andrew.cmu.edu/user/kunz1/", "dblp": ";76/5774-2;192/7717;;;96/3115-1", "google_scholar": ";huQ_Ig8AAAAJ;xOAtM0YAAAAJ;;;RGoypN4AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Tianjun_Yao1;~Yongqiang_Chen1;~Zhenhao_Chen1;~Kai_Hu2;~Zhiqiang_Shen1;~Kun_Zhang1", "aff": ";Department of Computer Science and Engineering, The Chinese University of Hong Kong;Mohamed bin Zayed University of Artificial Intelligence;Carnegie Mellon University;;Carnegie Mellon University", "aff_domain": ";cse.cuhk.edu.hk;mbzuai.ac.ae;cmu.edu;;cmu.edu", "position": ";PhD student;PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nyao2024empowering,\ntitle={Empowering Graph Invariance Learning with Deep Spurious Infomax},\nauthor={Tianjun Yao and Yongqiang Chen and Zhenhao Chen and Kai Hu and Zhiqiang Shen and Kun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u9oSQtujCF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6470335, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1468805169295959058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";cse.cuhk.edu.hk;mbzuai.ac.ae;cmu.edu;;cmu.edu", 
"author_num": 6, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Chinese University of Hong Kong;Mohamed bin Zayed University of Artificial Intelligence;Carnegie Mellon University", "aff_unique_dep": "Department of Computer Science and Engineering;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://mbzuai.ac.ae;https://www.cmu.edu", "aff_unique_abbr": "CUHK;MBZUAI;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;2;2", "aff_country_unique": "China;United Arab Emirates;United States" }, { "title": "A Global Geometric Analysis of Maximal Coding Rate Reduction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32840", "id": "u9qmjV2khT", "proceeding": "https://proceedings.mlr.press/v235/wang24as.html", "pdf": "https://openreview.net/pdf?id=u9qmjV2khT", "openreview": "https://openreview.net/forum?id=u9qmjV2khT", "author_site": "Peng Wang, Huikang Liu, Druv Pai, Yaodong Yu, Zhihui Zhu, Qing Qu, Yi Ma", "tldr": "", "abstract": "The maximal coding rate reduction (MCR$^2$) objective for learning structured and compact deep representations is drawing increasing attention, especially after its recent usage in the derivation of fully explainable and highly effective deep network architectures. However, it lacks a complete theoretical justification: only the properties of its global optima are known, and its global landscape has not been studied. In this work, we give a complete characterization of the properties of all its local and global optima as well as other types of critical points. Specifically, we show that each (local or global) maximizer of the MCR$^2$ problem corresponds to a low-dimensional, discriminative, and diverse representation, and furthermore, each critical point of the objective is either a local maximizer or a strict saddle point. Such a favorable landscape makes MCR$^2$ a natural choice of objective for learning diverse and discriminative representations via first-order optimization. 
To further verify our theoretical findings, we illustrate these properties with extensive experiments on both synthetic and real data sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Peng Wang;Huikang Liu;Druv Pai;Yaodong Yu;Zhihui Zhu;Qing Qu;Yi Ma", "authorids": "~Peng_Wang23;~Huikang_Liu2;~Druv_Pai1;~Yaodong_Yu4;~Zhihui_Zhu1;~Qing_Qu2;~Yi_Ma4", "gender": "M;M;M;M;M;M;M", "homepage": "https://peng8wang.github.io/;https://huikang2019.github.io;https://druvpai.github.io;https://yaodongyu.github.io;https://zhihuizhu.github.io/;https://qingqu.engin.umich.edu/;http://people.eecs.berkeley.edu/~yima/", "dblp": "95/4442-98;62/8489;;;71/8081;127/6874-1;", "google_scholar": "baF3HKUAAAAJ;https://scholar.google.com.hk/citations?hl=zh-TW;https://scholar.google.com/citations?hl=en;bZ9oyW8AAAAJ;gmSwszcAAAAJ;JfblW3MAAAAJ;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ", "orcid": "0000-0002-6799-0745;;;;;0000-0001-9136-558X;", "linkedin": ";;https://linkedin.com/in/druvpai;;;qing-q-1a0b9746/;", "or_profile": "~Peng_Wang23;~Huikang_Liu2;~Druv_Pai1;~Yaodong_Yu4;~Zhihui_Zhu1;~Qing_Qu2;~Yi_Ma4", "aff": "University of Michigan - Ann Arbor;Shanghai University of Finance and Economics;Electrical Engineering & Computer Science Department, University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California Berkeley;Ohio State University, Columbus;University of Michigan;University of California, Berkeley", "aff_domain": "umich.edu;sufe.edu;eecs.berkeley.edu;eecs.berkeley.edu;osu.edu;umich.edu;berkeley.edu", "position": "Postdoc;Assistant Professor;PhD student;PhD student;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024a,\ntitle={A Global Geometric Analysis of Maximal Coding Rate Reduction},\nauthor={Peng Wang and Huikang Liu and Druv Pai and Yaodong Yu and Zhihui Zhu and Qing Qu and Yi Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=u9qmjV2khT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9791122, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14470983432755644791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "umich.edu;sufe.edu;eecs.berkeley.edu;eecs.berkeley.edu;osu.edu;umich.edu;berkeley.edu", "author_num": 7, "aff_unique_index": "0;1;2;2;3;0;2", "aff_unique_norm": "University of Michigan;Shanghai University of Finance and Economics;University of California, Berkeley;Ohio State University", "aff_unique_dep": ";;Electrical Engineering & Computer Science Department;", "aff_unique_url": "https://www.umich.edu;http://www.sufe.edu.cn;https://www.berkeley.edu;https://www.osu.edu", "aff_unique_abbr": "UM;SUFE;UC Berkeley;OSU", "aff_campus_unique_index": "0;2;2;3;2", "aff_campus_unique": "Ann Arbor;;Berkeley;Columbus", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "On the Universality of Volume-Preserving and Coupling-Based Normalizing Flows", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32839", "id": "uA3FRvO2DJ", "proceeding": "https://proceedings.mlr.press/v235/draxler24a.html", "pdf": "https://openreview.net/pdf?id=uA3FRvO2DJ", "openreview": "https://openreview.net/forum?id=uA3FRvO2DJ", "author_site": "Felix Draxler, Stefan Wahl, Christoph Schn\u00f6rr, Ullrich Koethe", 
"tldr": "", "abstract": "We present a novel theoretical framework for understanding the expressive power of normalizing flows. Despite their prevalence in scientific applications, a comprehensive understanding of flows remains elusive due to their restricted architectures. Existing theorems fall short as they require the use of arbitrarily ill-conditioned neural networks, limiting practical applicability. We propose a distributional universality theorem for well-conditioned coupling-based normalizing flows such as RealNVP. In addition, we show that volume-preserving normalizing flows are not universal, what distribution they learn instead, and how to fix their expressivity. Our results support the general wisdom that affine and related couplings are expressive and in general outperform volume-preserving flows, bridging a gap between empirical results and theoretical understanding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Felix Draxler;Stefan Wahl;Christoph Schnoerr;Ullrich Koethe", "authorids": "~Felix_Draxler1;~Stefan_Wahl1;~Christoph_Schnoerr1;~Ullrich_Koethe1", "gender": "M;M;;M", "homepage": ";;https://ipa.math.uni-heidelberg.de;https://hci.iwr.uni-heidelberg.de/vislearn/people/ullrich-koethe/", "dblp": "242/9148;;59/5226;15/809", "google_scholar": "rFbxDSAAAAAJ;;https://scholar.google.de/citations?user=C-5j7CQAAAAJ;gt-yaNMAAAAJ", "orcid": "0000-0003-0978-1539;;;0000-0001-6036-1287", "linkedin": "felix-draxler/;wahlstefan/;;", "or_profile": "~Felix_Draxler1;~Stefan_Wahl1;~Christoph_Schnoerr1;~Ullrich_Koethe1", "aff": "Heidelberg University;Ruprecht-Karls-Universit\u00e4t Heidelberg;Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University", "aff_domain": "uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de", "position": "PhD student;MS student;Full Professor;Adjunct Professor", "bibtex": "@inproceedings{\ndraxler2024on,\ntitle={On the Universality of Volume-Preserving and Coupling-Based Normalizing Flows},\nauthor={Felix Draxler and Stefan Wahl and Christoph Schnoerr and Ullrich Koethe},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uA3FRvO2DJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1959842, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8249901186887903999&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de;uni-heidelberg.de", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Heidelberg University;Ruprecht-Karls-Universit\u00e4t Heidelberg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-heidelberg.de;https://www.uni-heidelberg.de/", "aff_unique_abbr": "Uni Heidelberg;Uni Heidelberg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "An amortized approach to non-linear mixed-effects modeling based on neural posterior estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32838", "id": "uCdcXRuHnC", "proceeding": "https://proceedings.mlr.press/v235/arruda24a.html", "pdf": "https://openreview.net/pdf?id=uCdcXRuHnC", "openreview": "https://openreview.net/forum?id=uCdcXRuHnC", "author_site": "Jonas Arruda, Yannik Sch\u00e4lte, Clemens Peiter, Olga Teplytska, Ulrich Jaehde, Jan Hasenauer", "tldr": "", 
"abstract": "Non-linear mixed-effects models are a powerful tool for studying heterogeneous populations in various fields, including biology, medicine, economics, and engineering. Here, the aim is to find a distribution over the parameters that describe the whole population using a model that can generate simulations for an individual of that population. However, fitting these distributions to data is computationally challenging if the description of individuals is complex and the population is large. To address this issue, we propose a novel machine learning-based approach: We exploit neural density estimation based on conditional normalizing flows to approximate individual-specific posterior distributions in an amortized fashion, thereby allowing for efficient inference of population parameters. Applying this approach to problems from cell biology and pharmacology, we demonstrate its unseen flexibility and scalability to large data sets compared to established methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonas Arruda;Yannik Sch\u00e4lte;Clemens Peiter;Olga Teplytska;Ulrich Jaehde;Jan Hasenauer", "authorids": "~Jonas_Arruda1;yannik.schaelte@gmail.com;cpeiter@uni-bonn.de;oteplytska@uni-bonn.de;u.jaehde@uni-bonn.de;jan.hasenauer@uni-bonn.de", "gender": "M;;;;;", "homepage": "https://www.mathematics-and-life-sciences.uni-bonn.de/en/group-members/people/hasenauer-group-members/jonas-arruda;;;;;", "dblp": ";;;;;", "google_scholar": "7ifRSekAAAAJ;;;;;", "orcid": "0009-0008-9644-5771;;;;;", "linkedin": "jonas-arruda;;;;;", "or_profile": "~Jonas_Arruda1;yannik.schaelte@gmail.com;cpeiter@uni-bonn.de;oteplytska@uni-bonn.de;u.jaehde@uni-bonn.de;jan.hasenauer@uni-bonn.de", "aff": "Rheinische Friedrich-Wilhelms Universit\u00e4t Bonn;;;;;", "aff_domain": "uni-bonn.de;;;;;", "position": "PhD student;;;;;", "bibtex": "@inproceedings{\narruda2024an,\ntitle={An amortized approach to non-linear mixed-effects modeling based on neural posterior estimation},\nauthor={Jonas Arruda and Yannik Sch{\\\"a}lte and Clemens Peiter and Olga Teplytska and Ulrich Jaehde and Jan Hasenauer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uCdcXRuHnC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9515659, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8627925177823513231&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "uni-bonn.de;;;;;", "author_num": 6, "aff_unique_index": "0", "aff_unique_norm": "Rheinische Friedrich-Wilhelms Universit\u00e4t Bonn", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-bonn.de/", "aff_unique_abbr": "Uni Bonn", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Compressible Dynamics in Deep Overparameterized Low-Rank Learning & Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32837", "id": "uDkXoZMzBv", "proceeding": "https://proceedings.mlr.press/v235/yaras24a.html", "pdf": "https://openreview.net/pdf?id=uDkXoZMzBv", "openreview": "https://openreview.net/forum?id=uDkXoZMzBv", "author_site": "Can Yaras, Peng Wang, Laura Balzano, Qing Qu", "tldr": "", "abstract": "While overparameterization in machine learning models offers great benefits in terms of optimization and generalization, it also leads to increased computational requirements as model sizes grow. 
In this work, we show that by leveraging the inherent low-dimensional structures of data and compressible dynamics within the model parameters, we can reap the benefits of overparameterization without the computational burdens. In practice, we demonstrate the effectiveness of this approach for deep low-rank matrix completion as well as fine-tuning language models. Our approach is grounded in theoretical findings for deep overparameterized low-rank matrix recovery, where we show that the learning dynamics of each weight matrix are confined to an invariant low-dimensional subspace. Consequently, we can construct and train compact, highly compressed factorizations possessing the same benefits as their overparameterized counterparts. In the context of deep matrix completion, our technique substantially improves training efficiency while retaining the advantages of overparameterization. For language model fine-tuning, we propose a method called \"Deep LoRA\", which improves the existing low-rank adaptation (LoRA) technique, leading to reduced overfitting and a simplified hyperparameter setup, while maintaining comparable efficiency. We validate the effectiveness of Deep LoRA on natural language tasks, particularly when fine-tuning with limited data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Can Yaras;Peng Wang;Laura Balzano;Qing Qu", "authorids": "~Can_Yaras1;~Peng_Wang23;~Laura_Balzano1;~Qing_Qu2", "gender": "M;M;F;M", "homepage": "https://canyaras.com;https://peng8wang.github.io/;http://web.eecs.umich.edu/~girasole/;https://qingqu.engin.umich.edu/", "dblp": "329/6596.html;95/4442-98;25/6625;127/6874-1", "google_scholar": "KmjObzwAAAAJ;baF3HKUAAAAJ;https://scholar.google.com/citations?hl=en;JfblW3MAAAAJ", "orcid": ";0000-0002-6799-0745;0000-0003-2914-123X;0000-0001-9136-558X", "linkedin": ";;;qing-q-1a0b9746/", "or_profile": "~Can_Yaras1;~Peng_Wang23;~Laura_Balzano1;~Qing_Qu2", "aff": "University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;University of Michigan", "aff_domain": "umich.edu;umich.edu;umich.edu;umich.edu", "position": "PhD student;Postdoc;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nyaras2024compressible,\ntitle={Compressible Dynamics in Deep Overparameterized Low-Rank Learning \\& Adaptation},\nauthor={Can Yaras and Peng Wang and Laura Balzano and Qing Qu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uDkXoZMzBv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9551539, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13290663196137098403&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 6, "email": "umich.edu;umich.edu;umich.edu;umich.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "LayerMerge: Neural Network Depth Compression through Layer Pruning and Merging", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32836", "id": "uDoy7AGvEC", "proceeding": "https://proceedings.mlr.press/v235/kim24c.html", "pdf": 
"https://openreview.net/pdf?id=uDoy7AGvEC", "openreview": "https://openreview.net/forum?id=uDoy7AGvEC", "author_site": "Jinuk Kim, Marwa El Halabi, Mingi Ji, Hyun Oh Song", "tldr": "", "abstract": "Recent works show that reducing the number of layers in a convolutional neural network can enhance efficiency while maintaining the performance of the network. Existing depth compression methods remove redundant non-linear activation functions and merge the consecutive convolution layers into a single layer. However, these methods suffer from a critical drawback; the kernel size of the merged layers becomes larger, significantly undermining the latency reduction gained from reducing the depth of the network. We show that this problem can be addressed by jointly pruning convolution layers and activation functions. To this end, we propose *LayerMerge*, a novel depth compression method that selects which activation layers and convolution layers to remove, to achieve a desired inference speed-up while minimizing performance loss. Since the corresponding selection problem involves an exponential search space, we formulate a novel surrogate optimization problem and efficiently solve it via dynamic programming. Empirical results demonstrate that our method consistently outperforms existing depth compression and layer pruning methods on various network architectures, both on image classification and generation tasks. We release the code at https://github.com/snu-mllab/LayerMerge.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinuk Kim;Marwa El Halabi;Mingi Ji;Hyun Oh Song", "authorids": "~Jinuk_Kim1;~Marwa_El_Halabi2;~Mingi_Ji1;~Hyun_Oh_Song1", "gender": "M;M;M;F", "homepage": "https://jusjinuk.github.io;;https://mllab.snu.ac.kr/hyunoh;https://sites.google.com/view/marwaelhalabi/home", "dblp": ";234/1226;05/10781;154/4329", "google_scholar": ";;ScoZZPsAAAAJ;Vd6RW7cAAAAJ", "orcid": ";;;", "linkedin": ";;hyun-oh-song-5a39b03;marwa-el-halabi-1504bb49/", "or_profile": "~Jinuk_Kim1;~Mingi_Ji1;~Hyun_Oh_Song1;~Marwa_El_Halabi1", "aff": "Google;Google;Seoul National University;Samsung", "aff_domain": "google.com;google.com;snu.ac.kr;samsung.com", "position": "Intern;Researcher;Associate Professor;Researcher", "bibtex": "@inproceedings{\nkim2024layermerge,\ntitle={LayerMerge: Neural Network Depth Compression through Layer Pruning and Merging},\nauthor={Jinuk Kim and Marwa El Halabi and Mingi Ji and Hyun Oh Song},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uDoy7AGvEC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 615727, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12823716862709945214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "google.com;google.com;snu.ac.kr;samsung.com", "author_num": 4, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Google;Seoul National University;Samsung", "aff_unique_dep": "Google;;Samsung", "aff_unique_url": "https://www.google.com;https://www.snu.ac.kr;https://www.samsung.com", "aff_unique_abbr": "Google;SNU;Samsung", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;South Korea" }, { "title": "Multi-View Clustering by Inter-cluster Connectivity Guided Reward", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/32835", "id": "uEx2bSAJu8", "proceeding": "https://proceedings.mlr.press/v235/dai24b.html", "pdf": "https://openreview.net/pdf?id=uEx2bSAJu8", "openreview": "https://openreview.net/forum?id=uEx2bSAJu8", "author_site": "Hao Dai, Yang Liu, Peng Su, Hecheng Cai, Shudong Huang, Jiancheng Lv", "tldr": "", "abstract": "Multi-view clustering has been widely explored for its effectiveness in harmonizing heterogeneity along with consistency in different views of data. Despite the significant progress made by recent works, the performance of most existing methods is heavily reliant on strong priori information regarding the true cluster number $\\textit{K}$, which is rarely feasible in real-world scenarios. In this paper, we propose a novel graph-based multi-view clustering algorithm to infer unknown $\\textit{K}$ through a graph consistency reward mechanism. To be specific, we evaluate the cluster indicator matrix during each iteration with respect to diverse $\\textit{K}$. We formulate the inference process of unknown $\\textit{K}$ as a parsimonious reinforcement learning paradigm, where the reward is measured by inter-cluster connectivity. As a result, our approach is capable of independently producing the final clustering result, free from the input of a predefined cluster number. Experimental results on multiple benchmark datasets demonstrate the effectiveness of our proposed approach in comparison to existing state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Dai;Yang Liu;Peng Su;Hecheng Cai;Shudong Huang;Jiancheng Lv", "authorids": "~Hao_Dai2;~Yang_Liu76;~Peng_Su2;~Hecheng_Cai1;~Shudong_Huang1;~Jiancheng_Lv2", "gender": "M;M;Not Specified;M;M;M", "homepage": ";;https://huangsd.github.io/;;https://huangsd.github.io/;https://cs.scu.edu.cn/info/1303/13767.htm", "dblp": "26/3998.html;51/3710-264;;;48/2141;", "google_scholar": ";0prb9aQAAAAJ;;;xa2bfaAAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0001-5159-5954;;0000-0003-1739-4782;;", "linkedin": ";;;;;", "or_profile": "~Hao_Dai2;~Yang_Liu76;~Peng_Su2;~Hecheng_Cai1;~Shudong_Huang1;~Jiancheng_Lv2", "aff": "Sichuan University;Sichuan University;Sichuan University;Sichuan University;Sichuan University;Sichuan University", "aff_domain": "en.scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn", "position": "MS student;PhD student;MS student;MS student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\ndai2024multiview,\ntitle={Multi-View Clustering by Inter-cluster Connectivity Guided Reward},\nauthor={Hao Dai and Yang Liu and Peng Su and Hecheng Cai and Shudong Huang and Jiancheng Lv},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uEx2bSAJu8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1447525, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11527295450876152464&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 4, "email": "en.scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn;scu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Sichuan University", "aff_unique_dep": "", "aff_unique_url": "https://www.scu.edu.cn", "aff_unique_abbr": "SCU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { 
"title": "BadPart: Unified Black-box Adversarial Patch Attacks against Pixel-wise Regression Tasks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32834", "id": "uGoi3nY62g", "proceeding": "https://proceedings.mlr.press/v235/cheng24e.html", "pdf": "https://openreview.net/pdf?id=uGoi3nY62g", "openreview": "https://openreview.net/forum?id=uGoi3nY62g", "author_site": "Zhiyuan Cheng, Zhaoyi Liu, Tengda Guo, Shiwei Feng, Dongfang Liu, Mingjie Tang, Xiangyu Zhang", "tldr": "", "abstract": "Pixel-wise regression tasks (e.g., monocular depth estimation (MDE) and optical flow estimation (OFE)) have been widely involved in our daily life in applications like autonomous driving, augmented reality and video composition. Although certain applications are security-critical or bear societal significance, the adversarial robustness of such models are not sufficiently studied, especially in the black-box scenario. In this work, we introduce the first unified black-box adversarial patch attack framework against pixel-wise regression tasks, aiming to identify the vulnerabilities of these models under query-based black-box attacks. We propose a novel square-based adversarial patch optimization framework and employ probabilistic square sampling and score-based gradient estimation techniques to generate the patch effectively and efficiently, overcoming the scalability problem of previous black-box patch attacks. Our attack prototype, named BadPart, is evaluated on both MDE and OFE tasks, utilizing a total of 7 models. BadPart surpasses 3 baseline methods in terms of both attack performance and efficiency. We also apply BadPart on the Google online service for portrait depth estimation, causing 43.5% relative distance error with 50K queries. 
State-of-the-art (SOTA) countermeasures cannot defend against our attack effectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyuan Cheng;Zhaoyi Liu;Tengda Guo;Shiwei Feng;Dongfang Liu;Mingjie Tang;Xiangyu Zhang", "authorids": "~Zhiyuan_Cheng2;~Zhaoyi_Liu1;~Tengda_Guo1;~Shiwei_Feng1;~Dongfang_Liu1;~Mingjie_Tang1;~Xiangyu_Zhang3", "gender": "M;;M;M;;M;M", "homepage": "https://bob-cheng.github.io;;https://github.com/matmua;https://www.cs.purdue.edu/homes/feng292/;https://www.rit.edu/directory/dxleec-dongfang-liu;http://merlintang.github.io/;https://www.cs.purdue.edu/homes/xyzhang", "dblp": "324/1963;;;138/9141-2;;;", "google_scholar": "dVchB-gAAAAJ;;;https://scholar.google.com/citations?hl=en;uICY0vEAAAAJ;tVgxEuwAAAAJ;PXbu1wIAAAAJ", "orcid": "0000-0001-7280-6079;;;0000-0001-6959-4327;;;", "linkedin": "bobchengzy/;;;swfeng98/;;;", "or_profile": "~Zhiyuan_Cheng2;~Zhaoyi_Liu1;~Tengda_Guo1;~Shiwei_Feng1;~Dongfang_Liu1;~Mingjie_Tang1;~Xiangyu_Zhang3", "aff": "Purdue University;;Sichuan University;Purdue University;Rochester Institute of Technology;Purdue University;Purdue University", "aff_domain": "purdue.edu;;scu.edu.cn;cs.purdue.edu;rit.edu;purdue.edu;cs.purdue.edu", "position": "PhD student;;Undergrad student;PhD student;Assistant Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\ncheng2024badpart,\ntitle={BadPart: Unified Black-box Adversarial Patch Attacks against Pixel-wise Regression Tasks},\nauthor={Zhiyuan Cheng and Zhaoyi Liu and Tengda Guo and Shiwei Feng and Dongfang Liu and Mingjie Tang and Xiangyu Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uGoi3nY62g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6294003, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8211005481905006918&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "purdue.edu;;scu.edu.cn;cs.purdue.edu;rit.edu;purdue.edu;cs.purdue.edu", "author_num": 7, "aff_unique_index": "0;1;0;2;0;0", "aff_unique_norm": "Purdue University;Sichuan University;Rochester Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.purdue.edu;https://www.scu.edu.cn;https://www.rit.edu", "aff_unique_abbr": "Purdue;SCU;RIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Less is More: on the Over-Globalizing Problem in Graph Transformers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32833", "id": "uKmcyyrZae", "proceeding": "https://proceedings.mlr.press/v235/xing24b.html", "pdf": "https://openreview.net/pdf?id=uKmcyyrZae", "openreview": "https://openreview.net/forum?id=uKmcyyrZae", "author_site": "Yujie Xing, Xiao Wang, Yibo Li, Hai Huang, Chuan Shi", "tldr": "", "abstract": "Graph Transformer, due to its global attention mechanism, has emerged as a new tool in dealing with graph-structured data. It is well recognized that the global attention mechanism considers a wider receptive field in a fully connected graph, leading many to believe that useful information can be extracted from all the nodes. In this paper, we challenge this belief: does the globalizing property always benefit Graph Transformers?
We reveal the over-globalizing problem in Graph Transformer by presenting both empirical evidence and theoretical analysis, i.e., the current attention mechanism overly focuses on those distant nodes, while the near nodes, which actually contain most of the useful information, are relatively weakened. Then we propose a novel Bi-Level Global Graph Transformer with Collaborative Training (CoBFormer), including the inter-cluster and intra-cluster Transformers, to prevent the over-globalizing problem while keeping the ability to extract valuable information from distant nodes. Moreover, the collaborative training is proposed to improve the model's generalization ability with a theoretical guarantee. Extensive experiments on various graphs well validate the effectiveness of our proposed CoBFormer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yujie Xing;Xiao Wang;Yibo Li;Hai Huang;Chuan Shi", "authorids": "~Yujie_Xing2;~Xiao_Wang2;~Yibo_Li2;~Hai_Huang9;~Chuan_Shi1", "gender": "M;M;;;M", "homepage": "https://github.com/null-xyj;https://wangxiaocs.github.io/;;;http://www.shichuan.org/", "dblp": "228/7019;49/67-17;;;64/3041-1", "google_scholar": "6OW4z84AAAAJ;MnzarAQAAAAJ;;;tUq_v90AAAAJ", "orcid": ";0000-0002-4444-7811;;;0000-0002-3734-0266", "linkedin": ";;;;", "or_profile": "~Yujie_Xing2;~Xiao_Wang2;~Yibo_Li2;~Hai_Huang9;~Chuan_Shi1", "aff": "Beijing University of Posts and Telecommunications;Beihang University;;;Beijing University of Post and Telecommunication", "aff_domain": "bupt.edu.cn;buaa.edu.cn;;;bupt.edu.cn", "position": "MS student;Full Professor;;;Full Professor", "bibtex": "@inproceedings{\nxing2024less,\ntitle={Less is More: on the Over-Globalizing Problem in Graph Transformers},\nauthor={Yujie Xing and Xiao Wang and Yibo Li and Hai Huang and Chuan Shi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uKmcyyrZae}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 723896, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17778216463220532672&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "bupt.edu.cn;buaa.edu.cn;;;bupt.edu.cn", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Beijing University of Posts and Telecommunications;Beihang University", "aff_unique_dep": ";", "aff_unique_url": "http://www.bupt.edu.cn/;http://www.buaa.edu.cn/", "aff_unique_abbr": "BUPT;BUAA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Statistical Test for Attention Maps in Vision Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32832", "id": "uLonuOfrwp", "proceeding": "https://proceedings.mlr.press/v235/shiraishi24a.html", "pdf": "https://openreview.net/pdf?id=uLonuOfrwp", "openreview": "https://openreview.net/forum?id=uLonuOfrwp", "author_site": "Tomohiro Shiraishi, Daiki Miwa, Teruyuki Katsuoka, Vo Nguyen Le Duy, Kouichi Taji, Ichiro Takeuchi", "tldr": "", "abstract": "The Vision Transformer (ViT) demonstrates exceptional performance in various computer vision tasks. Attention is crucial for ViT to capture complex wide-ranging relationships among image patches, allowing the model to weigh the importance of image patches and aiding our understanding of the decision-making process. 
However, when utilizing the attention of ViT as evidence in high-stakes decision-making tasks such as medical diagnostics, a challenge arises due to the potential of attention mechanisms erroneously focusing on irrelevant regions. In this study, we propose a statistical test for ViT's attentions, enabling us to use the attentions as reliable quantitative evidence indicators for ViT's decision-making with a rigorously controlled error rate. Using the framework called selective inference, we quantify the statistical significance of attentions in the form of p-values, which enables the theoretically grounded quantification of the false positive detection probability of attentions. We demonstrate the validity and the effectiveness of the proposed method through numerical experiments and applications to brain image diagnoses.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tomohiro Shiraishi;Daiki Miwa;Teruyuki Katsuoka;Vo Nguyen Le Duy;Kouichi Taji;Ichiro Takeuchi", "authorids": "shiraishi.tomohiro.nagoyaml@gmail.com;~Daiki_Miwa1;katsuoka.teruyuki.nagoyaml@gmail.com;~Vo_Nguyen_Le_Duy1;taji@nagoya-u.jp;~Ichiro_Takeuchi1", "gender": ";M;;M;;M", "homepage": ";https://github.com/DaikiMiwa;;http://vonguyenleduy.github.io;;https://www.mlds.mae.nagoya-u.ac.jp/takeuchi/index.en.html", "dblp": ";;;241/7008;;36/4181", "google_scholar": ";;;qcpIUoQAAAAJ;;IwBHa3gAAAAJ", "orcid": ";;;;;0009-0005-1905-2366", "linkedin": ";;;vo-nguyen-le-duy/;;", "or_profile": "shiraishi.tomohiro.nagoyaml@gmail.com;~Daiki_Miwa1;katsuoka.teruyuki.nagoyaml@gmail.com;~Vo_Nguyen_Le_Duy1;taji@nagoya-u.jp;~Ichiro_Takeuchi1", "aff": ";Nagoya Institute of Technology, Tokyo Institute of Technology;;University of Information Technology, Vietnam National University - HCM;;RIKEN", "aff_domain": ";nitech.ac.jp;;uit.edu.vn;;riken.jp", "position": ";MS student;;Lecturer;;Principal Researcher", "bibtex": "@inproceedings{\nshiraishi2024statistical,\ntitle={Statistical Test for Attention Maps in Vision Transformers},\nauthor={Tomohiro Shiraishi and Daiki Miwa and Teruyuki Katsuoka and Vo Nguyen Le Duy and Kouichi Taji and Ichiro Takeuchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uLonuOfrwp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1900038, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8824985045137240032&as_sdt=20005&sciodt=0,9&hl=en", "gs_version_total": 7, "email": ";nitech.ac.jp;;uit.edu.vn;;riken.jp", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Nagoya Institute of Technology;University of Information Technology;RIKEN", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nitech.ac.jp;https://uit.edu.vn;https://www.riken.jp", "aff_unique_abbr": "NIT;UIT;RIKEN", "aff_campus_unique_index": "1", "aff_campus_unique": ";HCM", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Japan;Vietnam" }, { "title": "Scaling Beyond the GPU Memory Limit for Large Mixture-of-Experts Model Training", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32831", "id": "uLpyWQPyF9", "proceeding": "https://proceedings.mlr.press/v235/kim24w.html", "pdf": "https://openreview.net/pdf?id=uLpyWQPyF9", "openreview": "https://openreview.net/forum?id=uLpyWQPyF9", "author_site": "Yechan Kim, Hwijoon Lim, Dongsu Han", "tldr": "", "abstract": "Mixture-of-Experts (MoE) is a 
powerful technique for enhancing the performance of neural networks while decoupling computational complexity from the number of parameters. However, despite this, scaling the number of experts requires adding more GPUs. In addition, the load imbalance in token load across experts causes unnecessary computation or straggler problems. We present ES-MoE, a novel method for efficient scaling MoE training. It offloads expert parameters to host memory and leverages pipelined expert processing to overlap GPU-CPU communication with GPU computation. It dynamically balances token loads across GPUs, improving computational efficiency. ES-MoE accelerates MoE training on a limited number of GPUs without degradation in model performance. We validate our approach on GPT-based MoE models, demonstrating 67$\\times$ better scalability and up to 17.5$\\times$ better throughput over existing frameworks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yechan Kim;Hwijoon Lim;Dongsu Han", "authorids": "~Yechan_Kim1;~Hwijoon_Lim1;~Dongsu_Han1", "gender": "M;M;M", "homepage": ";https://wjuni.com;http://ina.kaist.ac.kr/", "dblp": "274/0876;187/0217;12/5388", "google_scholar": ";aOgTgQIAAAAJ;https://scholar.google.com.tw/citations?user=NMggRxkAAAAJ", "orcid": ";0000-0002-9872-6234;0000-0001-6922-7244", "linkedin": "yechan-kim-elliot/;hwijoon-lim/;", "or_profile": "~Yechan_Kim1;~Hwijoon_Lim1;~Dongsu_Han1", "aff": "NVIDIA;Korea Advanced Institute of Science & Technology;KAIST, Korea Advanced Institute of Science & Technology", "aff_domain": "nvidia.com;kaist.ac.kr;ee.kaist.ac.kr", "position": "Researcher;PhD student;Full Professor", "bibtex": "@inproceedings{\nkim2024scaling,\ntitle={Scaling Beyond the {GPU} Memory Limit for Large Mixture-of-Experts Model Training},\nauthor={Yechan Kim and Hwijoon Lim and Dongsu Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uLpyWQPyF9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1053280, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6435446837019815218&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "nvidia.com;kaist.ac.kr;ee.kaist.ac.kr", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "NVIDIA;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.kaist.ac.kr", "aff_unique_abbr": "NVIDIA;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;South Korea" }, { "title": "Conformal prediction for multi-dimensional time series by ellipsoidal sets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32830", "id": "uN39Tt9P8b", "proceeding": "https://proceedings.mlr.press/v235/xu24m.html", "pdf": "https://openreview.net/pdf?id=uN39Tt9P8b", "openreview": "https://openreview.net/forum?id=uN39Tt9P8b", "author_site": "Chen Xu, Hanyang Jiang, Yao Xie", "tldr": "", "abstract": "Conformal prediction (CP) has been a popular method for uncertainty quantification because it is distribution-free, model-agnostic, and theoretically sound. For forecasting problems in supervised learning, most CP methods focus on building prediction intervals for univariate responses. 
In this work, we develop a sequential CP method called $\\texttt{MultiDimSPCI}$ that builds prediction $\\textit{regions}$ for a multivariate response, especially in the context of multivariate time series, which are not exchangeable. Theoretically, we estimate $\\textit{finite-sample}$ high-probability bounds on the conditional coverage gap. Empirically, we demonstrate that $\\texttt{MultiDimSPCI}$ maintains valid coverage on a wide range of multivariate time series while producing smaller prediction regions than CP and non-CP baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen Xu;Hanyang Jiang;Yao Xie", "authorids": "~Chen_Xu12;~Hanyang_Jiang2;~Yao_Xie2", "gender": "M;;F", "homepage": "https://hamrel-cxu.github.io/;;http://www2.isye.gatech.edu/~yxie77", "dblp": ";;13/4242-2", "google_scholar": "https://scholar.google.com/citations?hl=en;;qvYp8ZQAAAAJ", "orcid": ";;", "linkedin": "chen-xu-92013714a/;;yaoxie/", "or_profile": "~Chen_Xu12;~Hanyang_Jiang2;~Yao_Xie2", "aff": "Georgia Institute of Technology;;Georgia Institute of Technology", "aff_domain": "gatech.edu;;gatech.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nxu2024conformal,\ntitle={Conformal prediction for multi-dimensional time series by ellipsoidal sets},\nauthor={Chen Xu and Hanyang Jiang and Yao Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uN39Tt9P8b}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 960839, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8008467754022942067&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 11, "email": "gatech.edu;;gatech.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "An LLM Compiler for Parallel Function Calling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32829", "id": "uQ2FUoFjnF", "proceeding": "https://proceedings.mlr.press/v235/kim24y.html", "pdf": "https://openreview.net/pdf?id=uQ2FUoFjnF", "openreview": "https://openreview.net/forum?id=uQ2FUoFjnF", "author_site": "Sehoon Kim, Suhong Moon, Ryan Tabrizi, Nicholas Lee, Michael Mahoney, EECS Kurt Keutzer, Amir Gholaminejad", "tldr": "", "abstract": "The reasoning capabilities of the recent LLMs enable them to execute external function calls to overcome their inherent limitations, such as knowledge cutoffs, poor arithmetic skills, or lack of access to private data. This development has allowed LLMs to select and coordinate multiple functions based on the context to tackle more complex problems. However, current methods for function calling often require sequential reasoning and acting for each function which can result in high latency, cost, and sometimes inaccurate behavior. To address this, we introduce LLMCompiler, which executes functions in parallel to efficiently orchestrate multiple function calls. 
Drawing inspiration from the principles of classical compilers, LLMCompiler enables parallel function calling with three components: (i) a Function Calling Planner, formulating execution plans for function calling; (ii) a Task Fetching Unit, dispatching function calling tasks; and (iii) an Executor, executing these tasks in parallel. LLMCompiler automatically generates an optimized orchestration for the function calls and can be used with both open-source and closed-source models. We have benchmarked LLMCompiler on a range of tasks with different patterns of function calling. We observe consistent latency speedup of up to $3.7 \\times$, cost savings of up to $6.7 \\times$, and accuracy improvement of up to $\\sim 9 \\%$ compared to ReAct. Our code is available at https://github.com/SqueezeAILab/LLMCompiler.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sehoon Kim;Suhong Moon;Ryan Tabrizi;Nicholas Lee;Michael W. Mahoney;Kurt Keutzer;Amir Gholami", "authorids": "~Sehoon_Kim1;~Suhong_Moon1;rtabrizi@berkeley.edu;~Nicholas_Lee1;~Michael_W._Mahoney1;~Kurt_Keutzer1;~Amir_Gholami2", "gender": "M;M;;M;;M;", "homepage": "https://sehoonkim.org;;;;;https://people.eecs.berkeley.edu/~keutzer/;", "dblp": ";242/2290;;;;k/KurtKeutzer.html;", "google_scholar": "zQABr7QAAAAJ;https://scholar.google.com/citations?hl=en;;57gDGpUAAAAJ;;ID9QePIAAAAJ;", "orcid": ";;;;;0000-0003-3868-8501;", "linkedin": "sehoon-kim-13a1b51b1/;;;nicholas-lee-74731916a/;;kurtkeutzer/;", "or_profile": "~Sehoon_Kim1;~Suhong_Moon1;rtabrizi@berkeley.edu;~Nicholas_Lee1;~Michael_W._Mahoney1;~Kurt_Keutzer1;~Amir_Gholami2", "aff": "University of California, Berkeley;University of California, Berkeley;;University of California, Berkeley;;University of California, Berkeley;", "aff_domain": "berkeley.edu;berkeley.edu;;berkeley.edu;;berkeley.edu;", "position": "PhD student;PhD student;;PhD student;;Full Professor;", "bibtex": "@inproceedings{\nkim2024an,\ntitle={An {LLM} Compiler for Parallel Function Calling},\nauthor={Sehoon Kim and Suhong Moon and Ryan Tabrizi and Nicholas Lee and Michael W. 
Mahoney and Kurt Keutzer and Amir Gholami},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uQ2FUoFjnF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 887109, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5145629690886803973&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "berkeley.edu;berkeley.edu;;berkeley.edu;;berkeley.edu;", "author_num": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Random matrix theory improved Fr\u00e9chet mean of symmetric positive definite matrices", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32828", "id": "uQiFsBil3p", "proceeding": "https://proceedings.mlr.press/v235/bouchard24a.html", "pdf": "https://openreview.net/pdf?id=uQiFsBil3p", "openreview": "https://openreview.net/forum?id=uQiFsBil3p", "author_site": "Florent Bouchard, Ammar Mian, Malik TIOMOKO, Guillaume GINOLHAC, Frederic Pascal", "tldr": "", "abstract": "In this study, we consider the realm of covariance matrices in machine learning, particularly focusing on computing Fr\u00e9chet means on the manifold of symmetric positive definite matrices, commonly referred to as Karcher or geometric means. Such means are leveraged in numerous machine learning tasks. Relying on advanced statistical tools, we introduce a random matrix theory based method that estimates Fr\u00e9chet means, which is particularly beneficial when dealing with low sample support and a high number of matrices to average. 
Our experimental evaluation, involving both synthetic and real-world EEG and hyperspectral datasets, shows that we largely outperform state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Florent Bouchard;Ammar Mian;Malik Tiomoko;Guillaume Ginolhac;Frederic Pascal", "authorids": "florent.bouchard@centralesupelec.fr;ammar.mian@univ-smb.fr;~Malik_Tiomoko1;~Guillaume_Ginolhac1;~Frederic_Pascal1", "gender": ";;M;M;M", "homepage": ";;;https://www.univ-smb.fr/listic/presentation/membres/enseignants-chercheurs/guillaume-ginolhac/;https://fredericpascal.blogspot.com/p/home.html", "dblp": ";;228/9231;07/2435;", "google_scholar": ";;;https://scholar.google.fr/citations?user=-fCtTbcAAAAJ;https://scholar.google.fr/citations?hl=en", "orcid": ";;;0000-0001-9318-028X;0000-0003-0196-6395", "linkedin": ";;;;frederic-pascal/", "or_profile": "florent.bouchard@centralesupelec.fr;ammar.mian@univ-smb.fr;~Malik_Tiomoko1;~Guillaume_Ginolhac1;~Frederic_Pascal1", "aff": ";;Huawei Technologies Ltd.;Universit\u00e9 de Savoie;CentraleSupelec", "aff_domain": ";;huawei.com;univ-savoie.fr;centralesupelec.fr", "position": ";;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nbouchard2024random,\ntitle={Random matrix theory improved Fr\\'echet mean of symmetric positive definite matrices},\nauthor={Florent Bouchard and Ammar Mian and Malik Tiomoko and Guillaume Ginolhac and Frederic Pascal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uQiFsBil3p}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 454028, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1442348174617312379&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "email": ";;huawei.com;univ-savoie.fr;centralesupelec.fr", "author_num": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "Huawei;Universit\u00e9 de Savoie;CentraleSup\u00e9lec", "aff_unique_dep": "Huawei Technologies;;", "aff_unique_url": "https://www.huawei.com;https://www.univ-savoie.fr;https://www.centralesupelec.fr", "aff_unique_abbr": "Huawei;;CS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;France" }, { "title": "Bidirectional Reciprocative Information Communication for Few-Shot Semantic Segmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32827", "id": "uRz9GZN17X", "proceeding": "https://proceedings.mlr.press/v235/liu24t.html", "pdf": "https://openreview.net/pdf?id=uRz9GZN17X", "openreview": "https://openreview.net/forum?id=uRz9GZN17X", "author_site": "Yuanwei Liu, Junwei Han, Xiwen Yao, Salman Khan, Hisham Cholakkal, Rao Anwer, Nian Liu, Fahad Khan", "tldr": "", "abstract": "Existing few-shot semantic segmentation methods typically rely on a one-way flow of category information from support to query, ignoring the impact of intra-class diversity. To address this, drawing inspiration from cybernetics, we introduce a Query Feedback Branch (QFB) to propagate query information back to support, generating a query-related support prototype that is more aligned with the query. Subsequently, a Query Amplifier Branch (QAB) is employed to amplify target objects in the query using the acquired support prototype. 
To further improve the model, we propose a Query Rectification Module (QRM), which utilizes the prediction disparity in the query before and after support activation to identify challenging positive and negative samples from ambiguous regions for query self-rectification. Furthermore, we integrate the QFB, QAB, and QRM into a feedback and rectification layer and incorporate it into an iterative pipeline. This configuration enables the progressive enhancement of bidirectional reciprocative flow of category information between query and support, effectively providing query-adaptive support information and addressing the intra-class diversity problem. Extensive experiments conducted on both PASCAL-5i and COCO-20i datasets validate the effectiveness of our approach. The code is available at https://github.com/LIUYUANWEI98/IFRNet .", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuanwei Liu;Junwei Han;Xiwen Yao;Salman Khan;Hisham Cholakkal;Rao Muhammad Anwer;Nian Liu;Fahad Shahbaz Khan", "authorids": "~Yuanwei_Liu1;~Junwei_Han1;~Xiwen_Yao1;~Salman_Khan4;~Hisham_Cholakkal2;~Rao_Muhammad_Anwer2;~Nian_Liu1;~Fahad_Shahbaz_Khan1", "gender": ";;;M;M;;M;", "homepage": ";;;https://salman-h-khan.github.io/;https://mbzuai.ac.ae/pages/hisham-cholakkal/;;https://sites.google.com/site/liunian228/;", "dblp": ";;;32/11535-1;129/2046;;0000-0002-0825-6081;", "google_scholar": ";;;https://scholar.google.es/citations?user=M59O9lkAAAAJ;bZ3YBRcAAAAJ;;ZSilWs4AAAAJ;", "orcid": ";;;0000-0002-9502-1749;;;;", "linkedin": ";;;;;;;", "or_profile": "~Yuanwei_Liu1;~Junwei_Han1;~Xiwen_Yao1;~Salman_Khan4;~Hisham_Cholakkal2;~Rao_Muhammad_Anwer2;~Nian_Liu1;~Fahad_Shahbaz_Khan1", "aff": ";;;Australian National University;MBZUAI;;Mohamed bin Zayed University of Artificial Intelligence;", "aff_domain": ";;;anu.edu.au;mbzuai.ac.ae;;mbzuai.ac.ae;", "position": ";;;Lecturer;Assistant Professor;;Researcher;", "bibtex": "@inproceedings{\nliu2024bidirectional,\ntitle={Bidirectional Reciprocative Information Communication for Few-Shot Semantic Segmentation},\nauthor={Yuanwei Liu and Junwei Han and Xiwen Yao and Salman Khan and Hisham Cholakkal and Rao Muhammad Anwer and Nian Liu and Fahad Shahbaz Khan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uRz9GZN17X}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9171591, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10924782496774102842&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "email": ";;;anu.edu.au;mbzuai.ac.ae;;mbzuai.ac.ae;", "author_num": 8, "aff_unique_index": "0;1;1", "aff_unique_norm": "Australian National University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";", "aff_unique_url": "https://www.anu.edu.au;https://www.mbzuai.ac.ae", "aff_unique_abbr": "ANU;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Australia;United Arab Emirates" }, { "title": "GPTSwarm: Language Agents as Optimizable Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32826", "id": "uTC9AFXIhg", "proceeding": "https://proceedings.mlr.press/v235/zhuge24a.html", "pdf": "https://openreview.net/pdf?id=uTC9AFXIhg", "openreview": "https://openreview.net/forum?id=uTC9AFXIhg", "author_site": "Mingchen Zhuge, Wenyi Wang, Louis Kirsch, 
Francesco Faccio, Dmitrii Khizbullin, J\u00fcrgen Schmidhuber", "tldr": "", "abstract": "Various human-designed prompt engineering techniques have been proposed to improve problem solvers based on Large Language Models (LLMs), yielding many disparate code bases. We unify these approaches by describing LLM-based agents as computational graphs. The nodes implement functions to process multimodal data or query LLMs, and the edges describe the information flow between operations. Graphs can be recursively combined into larger composite graphs representing hierarchies of inter-agent collaboration (where edges connect operations of different agents). Our novel automatic graph optimizers (1) refine node-level LLM prompts (node optimization) and (2) improve agent orchestration by changing graph connectivity (edge optimization). Experiments demonstrate that our framework can be used to efficiently develop, integrate, and automatically improve various LLM agents. Our code is public.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingchen Zhuge;Wenyi Wang;Louis Kirsch;Francesco Faccio;Dmitrii Khizbullin;J\u00fcrgen Schmidhuber", "authorids": "~Mingchen_Zhuge2;~Wenyi_Wang1;~Louis_Kirsch1;~Francesco_Faccio1;~Dmitrii_Khizbullin2;~J\u00fcrgen_Schmidhuber1", "gender": "M;;;M;M;M", "homepage": "https://metauto.ai;https://scholar.google.com/citations?user=79ODhuQAAAAJ&hl=en;http://louiskirsch.com;;https://khizbullin.tech;http://people.idsia.ch/~juergen/", "dblp": "283/5310;;202/2379;227/3214;;s/JurgenSchmidhuber", "google_scholar": "Qnj6XlMAAAAJ;;w8AkOEAAAAAJ;0z3DkrkAAAAJ;;https://scholar.google.ch/citations?user=gLnCTgIAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;dmitrii-khizbullin;", "or_profile": "~Mingchen_Zhuge2;~Wenyi_Wang1;~Louis_Kirsch1;~Francesco_Faccio1;~Dmitrii_Khizbullin2;~J\u00fcrgen_Schmidhuber1", "aff": "King Abdullah University of Science and Technology;King Abdullah University of Science and Technology;Scuola universitaria professionale della Svizzera italiana (SUPSI);The Swiss AI Lab IDSIA - USI - SUPSI;King Abdullah University of Science and Technology;IDSIA", "aff_domain": "kaust.edu.sa;kaust.edu.sa;supsi.ch;idsia.ch;kaust.edu.sa;idsia.ch", "position": "PhD student;PhD student;PhD student;PhD student;Researcher;Scientific Director", "bibtex": "@inproceedings{\nzhuge2024gptswarm,\ntitle={{GPTS}warm: Language Agents as Optimizable Graphs},\nauthor={Mingchen Zhuge and Wenyi Wang and Louis Kirsch and Francesco Faccio and Dmitrii Khizbullin and J{\\\"u}rgen Schmidhuber},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uTC9AFXIhg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3429467, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4269456490681308104&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "kaust.edu.sa;kaust.edu.sa;supsi.ch;idsia.ch;kaust.edu.sa;idsia.ch", "author_num": 6, "aff_unique_index": "0;0;1;2;0;3", "aff_unique_norm": "King Abdullah University of Science and Technology;Scuola universitaria professionale della Svizzera italiana;Swiss AI Lab IDSIA;Institute of Digital Technologies", "aff_unique_dep": ";;AI Lab;", "aff_unique_url": "https://www.kaust.edu.sa;https://www.supsi.ch;https://www.idsia.ch/;https://www.idsia.ch", "aff_unique_abbr": "KAUST;SUPSI;IDSIA;IDSIA", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;1;1;0;1", "aff_country_unique": "Saudi Arabia;Switzerland" }, { "title": "A Dynamic Algorithm for Weighted Submodular Cover Problem", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32825", "id": "uUeXaKLE1I", "proceeding": "https://proceedings.mlr.press/v235/banihashem24a.html", "pdf": "https://openreview.net/pdf?id=uUeXaKLE1I", "openreview": "https://openreview.net/forum?id=uUeXaKLE1I", "author_site": "Kiarash Banihashem, Samira Goudarzi, MohammadTaghi Hajiaghayi, Peyman Jabbarzade, Morteza Monemizadeh", "tldr": "", "abstract": "We initiate the study of the submodular cover problem in a dynamic setting where the elements of the ground set are inserted and deleted. In the classical submodular cover problem, we are given a monotone submodular function $f : 2^{V} \\to \\mathbb{R}^{\\ge 0}$ and the goal is to obtain a set $S \\subseteq V$ that minimizes the cost subject to the constraint $f(S) = f(V)$. This is a classical problem in computer science and generalizes the Set Cover problem, 2-Set Cover, and dominating set problem among others. We consider this problem in a dynamic setting where there are updates to our set $V$, in the form of insertions and deletions of elements from a ground set $\\mathcal{V}$, and the goal is to maintain an approximately optimal solution with low query complexity per update. For this problem, we propose a randomized algorithm that, in expectation, obtains a $(1-O(\\epsilon), O(\\epsilon^{-1}))$-bicriteria approximation using polylogarithmic query complexity per update.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kiarash Banihashem;Samira Goudarzi;MohammadTaghi Hajiaghayi;Peyman Jabbarzade;Morteza Monemizadeh", "authorids": "~Kiarash_Banihashem1;~Samira_Goudarzi1;~MohammadTaghi_Hajiaghayi1;~Peyman_Jabbarzade1;~Morteza_Monemizadeh1", "gender": "M;F;M;M;M", "homepage": ";;http://www.cs.umd.edu/~hajiagha/;https://research.tue.nl/en/persons/morteza-monemizadeh;", "dblp": "285/5061;;334/4488;11/4322.html;308/2567", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com.tw/citations?user=SQ1eGN4AAAAJ;wVH7Gp4AAAAJ;waMTjQcAAAAJ", "orcid": ";;0000-0003-4842-0533;;", "linkedin": ";;mohammad-hajiaghayi-2139a913a&ved=2ahUKEwjMyeH-5-_-AhV3K1kFHeeBDKwQjjh6BAgSEAE&usg=AOvVaw1NSVoT5FCGtOTi4eT8nr4b;;", "or_profile": "~Kiarash_Banihashem1;~Samira_Goudarzi1;~MohammadTaghi_Hajiaghayi1;~Morteza_Monemizadeh1;~Peyman_Jabbarzade_Ganje1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;Eindhoven University of Technology;Department of Computer Science, University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umd.edu;tue.nl;cs.umd.edu", "position": "PhD student;PhD student;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nbanihashem2024a,\ntitle={A Dynamic Algorithm for Weighted Submodular Cover Problem},\nauthor={Kiarash Banihashem and Samira Goudarzi and MohammadTaghi Hajiaghayi and Peyman Jabbarzade and Morteza Monemizadeh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uUeXaKLE1I}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 398270, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:3nB_su2TKT0J:scholar.google.com/&scioq=A+Dynamic+Algorithm+for+Weighted+Submodular+Cover+Problem&hl=en&as_sdt=0,33", "gs_version_total": 6, "email": "umd.edu;umd.edu;umd.edu;tue.nl;cs.umd.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Maryland;Eindhoven University of Technology;University of Maryland, College Park", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www.tue.nl;https://www/umd.edu", "aff_unique_abbr": "UMD;TU/e;UMD", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Netherlands" }, { "title": "Slicing Mutual Information Generalization Bounds for Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32824", "id": "uWNUTRgBso", "proceeding": "https://proceedings.mlr.press/v235/nadjahi24a.html", "pdf": "https://openreview.net/pdf?id=uWNUTRgBso", "openreview": "https://openreview.net/forum?id=uWNUTRgBso", "author_site": "Kimia Nadjahi, Kristjan Greenewald, Rickard Gabrielsson, Justin Solomon", "tldr": "", "abstract": "The ability of machine learning (ML) algorithms to generalize well to unseen data has been studied through the lens of information theory, by bounding the generalization error with the input-output mutual information (MI), i.e., the MI between the training data and the learned hypothesis. Yet, these bounds have limited practicality for modern ML applications (e.g., deep learning), due to the difficulty of evaluating MI in high dimensions. Motivated by recent findings on the compressibility of neural networks, we consider algorithms that operate by *slicing* the parameter space, i.e., trained on random lower-dimensional subspaces. We introduce new, tighter information-theoretic generalization bounds tailored for such algorithms, demonstrating that slicing improves generalization. Our bounds offer significant computational and statistical advantages over standard MI bounds, as they rely on scalable alternative measures of dependence, i.e., disintegrated mutual information and $k$-sliced mutual information. Then, we extend our analysis to algorithms whose parameters do not need to exactly lie on random subspaces, by leveraging rate-distortion theory. This strategy yields generalization bounds that incorporate a distortion term measuring model compressibility under slicing, thereby tightening existing bounds without compromising performance or requiring model compression. Building on this, we propose a regularization scheme enabling practitioners to control generalization through compressibility. 
Finally, we empirically validate our results and achieve the computation of non-vacuous information-theoretic generalization bounds for neural networks, a task that was previously out of reach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kimia Nadjahi;Kristjan Greenewald;Rickard Br\u00fcel Gabrielsson;Justin Solomon", "authorids": "~Kimia_Nadjahi1;~Kristjan_Greenewald1;~Rickard_Br\u00fcel_Gabrielsson1;~Justin_Solomon1", "gender": "F;;Not Specified;M", "homepage": "http://kimiandj.github.io/;https://researcher.watson.ibm.com/researcher/view.php?person=ibm-Kristjan.H.Greenewald;http://bruel.org/;http://people.csail.mit.edu/jsolomon/", "dblp": "236/4646;146/0563;228/6813;80/5094", "google_scholar": "x0_peq4AAAAJ;L3zNUG4AAAAJ;y9Oh5XwAAAAJ;pImSVwoAAAAJ", "orcid": ";;;0000-0002-7701-7586", "linkedin": ";;;justin-solomon-8a587914/", "or_profile": "~Kimia_Nadjahi1;~Kristjan_Greenewald1;~Rickard_Br\u00fcel_Gabrielsson1;~Justin_Solomon1", "aff": "Massachusetts Institute of Technology;MIT-IBM Watson AI Lab, IBM Research;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;ibm.com;mit.edu;mit.edu", "position": "Postdoc;Research Scientist;PhD student;Associate Professor", "bibtex": "@inproceedings{\nnadjahi2024slicing,\ntitle={Slicing Mutual Information Generalization Bounds for Neural Networks},\nauthor={Kimia Nadjahi and Kristjan Greenewald and Rickard Br{\\\"u}el Gabrielsson and Justin Solomon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uWNUTRgBso}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 673828, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4214553080359995200&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 10, "email": "mit.edu;ibm.com;mit.edu;mit.edu", "author_num": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;IBM", "aff_unique_dep": ";AI Lab", "aff_unique_url": "https://web.mit.edu;https://www.ibmwatsonai.org/", "aff_unique_abbr": "MIT;MIT-IBM AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Graph-based Forecasting with Missing Data through Spatiotemporal Downsampling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32823", "id": "uYIFQOtb58", "proceeding": "https://proceedings.mlr.press/v235/marisca24a.html", "pdf": "https://openreview.net/pdf?id=uYIFQOtb58", "openreview": "https://openreview.net/forum?id=uYIFQOtb58", "author_site": "Ivan Marisca, Cesare Alippi, Filippo Maria Bianchi", "tldr": "", "abstract": "Given a set of synchronous time series, each associated with a sensor-point in space and characterized by inter-series relationships, the problem of spatiotemporal forecasting consists of predicting future observations for each point. Spatiotemporal graph neural networks achieve striking results by representing the relationships across time series as a graph. Nonetheless, most existing methods rely on the often unrealistic assumption that inputs are always available and fail to capture hidden spatiotemporal dynamics when part of the data is missing. In this work, we tackle this problem through hierarchical spatiotemporal downsampling. 
The input time series are progressively coarsened over time and space, obtaining a pool of representations that capture heterogeneous temporal and spatial dynamics. Conditioned on observations and missing data patterns, such representations are combined by an interpretable attention mechanism to generate the forecasts. Our approach outperforms state-of-the-art methods on synthetic and real-world benchmarks under different missing data distributions, particularly in the presence of contiguous blocks of missing values.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ivan Marisca;Cesare Alippi;Filippo Maria Bianchi", "authorids": "~Ivan_Marisca1;~Cesare_Alippi1;~Filippo_Maria_Bianchi1", "gender": "M;M;M", "homepage": "https://marshka.github.io/;https://alippi.faculty.polimi.it/;https://sites.google.com/view/filippombianchi/home", "dblp": "298/8039;84/6337;139/5968", "google_scholar": "loKgz80AAAAJ;SCZObbIAAAAJ;https://scholar.google.ca/citations?user=yb7cT1MAAAAJ", "orcid": "0000-0002-9713-1626;;", "linkedin": "ivanmarisca;;", "or_profile": "~Ivan_Marisca1;~Cesare_Alippi1;~Filippo_Maria_Bianchi1", "aff": "Universit\u00e0 della Svizzera Italiana;Politecnico di Milano;NORCE the Norwegian Research Center", "aff_domain": "usi.ch;polimi.it;norce.no", "position": "PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nmarisca2024graphbased,\ntitle={Graph-based Forecasting with Missing Data through Spatiotemporal Downsampling},\nauthor={Ivan Marisca and Cesare Alippi and Filippo Maria Bianchi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uYIFQOtb58}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3064234, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3639003022091882194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "usi.ch;polimi.it;norce.no", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e0 della Svizzera italiana;Politecnico di Milano;NORCE", "aff_unique_dep": ";;", "aff_unique_url": "https://www.usi.ch;https://www.polimi.it;https://www.norce.no", "aff_unique_abbr": "USI;Polimi;NORCE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Switzerland;Italy;Norway" }, { "title": "Language Models with Conformal Factuality Guarantees", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32822", "id": "uYISs2tpwP", "proceeding": "https://proceedings.mlr.press/v235/mohri24a.html", "pdf": "https://openreview.net/pdf?id=uYISs2tpwP", "openreview": "https://openreview.net/forum?id=uYISs2tpwP", "author_site": "Christopher Mohri, Tatsunori Hashimoto", "tldr": "", "abstract": "Guaranteeing the correctness and factuality of language model (LM) outputs is a major open problem. In this work, we propose conformal factuality, a framework that can ensure high probability correctness guarantees for LMs by connecting language modeling and conformal prediction. Our insight is that the correctness of an LM output is equivalent to an uncertainty quantification problem, where the uncertainty sets are defined as the entailment set of an LM's output. 
Using this connection, we show that conformal prediction in language models corresponds to a back-off algorithm that provides high probability correctness guarantees by progressively making LM outputs less specific (and expanding the associated uncertainty sets). This approach applies to any black-box LM and requires very few human-annotated samples. Evaluations of our approach on closed book QA (FActScore, NaturalQuestions) and reasoning tasks (MATH) show that our approach can provide 80-90% correctness guarantees while retaining the majority of the LM's original output.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christopher Mohri;Tatsunori Hashimoto", "authorids": "~Christopher_Mohri1;~Tatsunori_Hashimoto1", "gender": "M;M", "homepage": ";https://thashim.github.io", "dblp": ";", "google_scholar": "_otSGXcAAAAJ;5ygiTwsAAAAJ", "orcid": ";", "linkedin": "christopher-mohri-3429841a0/;", "or_profile": "~Christopher_Mohri1;~Tatsunori_Hashimoto1", "aff": "Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nmohri2024language,\ntitle={Language Models with Conformal Factuality Guarantees},\nauthor={Christopher Mohri and Tatsunori Hashimoto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uYISs2tpwP}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6874665, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4797738356516460871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "stanford.edu;stanford.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Fast White-Box Adversarial Streaming Without a Random Oracle", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32821", "id": "uaExqhJ2Ag", "proceeding": "https://proceedings.mlr.press/v235/feng24c.html", "pdf": "https://openreview.net/pdf?id=uaExqhJ2Ag", "openreview": "https://openreview.net/forum?id=uaExqhJ2Ag", "author_site": "Ying Feng, Aayush Jain, David Woodruff", "tldr": "", "abstract": "Recently, the question of adversarially robust streaming, where the stream is allowed to depend on the randomness of the streaming algorithm, has gained a lot of attention. In this work, we consider a strong white-box adversarial model (Ajtai et al. PODS 2022), in which the adversary has access to all past random coins and the parameters used by the streaming algorithm. We focus on the sparse recovery problem and extend our result to other tasks such as distinct element estimation and low-rank approximation of matrices and tensors. The main drawback of previous work is that it requires a *random oracle*, which is especially problematic in the streaming model since the amount of randomness is counted in the space complexity of a streaming algorithm. Also, the previous work suffers from large update time. We construct a near-optimal solution for the sparse recovery problem in white-box adversarial streams, based on the subexponentially secure Learning with Errors assumption. 
Importantly, our solution does not require a random oracle and has a polylogarithmic per item processing time. We also give results in a related white-box adversarially robust distributed model. Our constructions are based on homomorphic encryption schemes satisfying very mild structural properties that are currently satisfied by most known schemes.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ying Feng;Aayush Jain;David Woodruff", "authorids": "~Ying_Feng2;aayushja@andrew.cmu.edu;~David_Woodruff1", "gender": ";;M", "homepage": "https://yinggggfeng.github.io/;;http://www.cs.cmu.edu/~dwoodruf/", "dblp": ";;w/DPWoodruff", "google_scholar": ";;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ", "orcid": ";;", "linkedin": "yinggggfeng;;", "or_profile": "~Ying_Feng2;aayushja@andrew.cmu.edu;~David_Woodruff1", "aff": "Massachusetts Institute of Technology;;Carnegie Mellon University", "aff_domain": "mit.edu;;cmu.edu", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\nfeng2024fast,\ntitle={Fast White-Box Adversarial Streaming Without a Random Oracle},\nauthor={Ying Feng and Aayush Jain and David Woodruff},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uaExqhJ2Ag}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 542349, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XO4U8jxA6_IJ:scholar.google.com/&scioq=Fast+White-Box+Adversarial+Streaming+Without+a+Random+Oracle&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": "mit.edu;;cmu.edu", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.cmu.edu", "aff_unique_abbr": "MIT;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Integrated Hardware Architecture and Device Placement Search", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32820", "id": "ucl3B05EsX", "proceeding": "https://proceedings.mlr.press/v235/wang24bp.html", "pdf": "https://openreview.net/pdf?id=ucl3B05EsX", "openreview": "https://openreview.net/forum?id=ucl3B05EsX", "author_site": "Irene Wang, Jakub Tarnawski, Amar Phanishayee, Divya Mahajan", "tldr": "", "abstract": "Distributed execution of deep learning training involves a dynamic interplay between hardware accelerator architecture and device placement strategy. This is the first work to explore the co-optimization of determining the optimal architecture and device placement strategy through novel algorithms, improving the balance of computational resources, memory usage, and data distribution. Our architecture search leverages tensor and vector units, determining their quantity and dimensionality, and on-chip and off-chip memory configurations. It also determines the microbatch size and decides whether to recompute or stash activations, balancing the memory footprint of training and storage size. For each explored architecture configuration, we use an Integer Linear Program (ILP) to find the optimal schedule for executing operators on the accelerator. 
The ILP results then integrate with a dynamic programming solution to identify the most effective device placement strategy, combining data, pipeline, and tensor model parallelism across multiple accelerators. Our approach achieves higher throughput on large language models compared to the state-of-the-art TPUv4 and the Spotlight accelerator search framework. The entire source code of PHAZE is available at https://github.com/msr-fiddle/phaze.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Irene Wang;Jakub Tarnawski;Amar Phanishayee;Divya Mahajan", "authorids": "~Irene_Wang1;~Jakub_Tarnawski1;~Amar_Phanishayee1;~Divya_Mahajan1", "gender": "F;M;M;", "homepage": ";http://jakub.tarnawski.org/;https://aka.ms/amar;", "dblp": "216/9321;157/6045;14/877;", "google_scholar": "vNPoWx0AAAAJ;ddHxUHoAAAAJ;;", "orcid": "0000-0003-1912-5834;0000-0001-6175-5827;;", "linkedin": "irenewang05/;jakubtarnawski/;;", "or_profile": "~Irene_Wang1;~Jakub_Tarnawski1;~Amar_Phanishayee1;~Divya_Mahajan1", "aff": "Georgia Institute of Technology;Microsoft;Microsoft;", "aff_domain": "gatech.edu;microsoft.com;microsoft.com;", "position": "PhD student;Researcher;Sr. Principal Researcher;", "bibtex": "@inproceedings{\nwang2024integrated,\ntitle={Integrated Hardware Architecture and Device Placement Search},\nauthor={Irene Wang and Jakub Tarnawski and Amar Phanishayee and Divya Mahajan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ucl3B05EsX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1281782, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9073216990226568929&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "gatech.edu;microsoft.com;microsoft.com;", "author_num": 4, "aff_unique_index": "0;1;1", "aff_unique_norm": "Georgia Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.microsoft.com", "aff_unique_abbr": "Georgia Tech;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Distinguishing the Knowable from the Unknowable with Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32819", "id": "ud4GSrqUKI", "proceeding": "https://proceedings.mlr.press/v235/ahdritz24a.html", "pdf": "https://openreview.net/pdf?id=ud4GSrqUKI", "openreview": "https://openreview.net/forum?id=ud4GSrqUKI", "author_site": "Gustaf Ahdritz, Tian Qin, Nikhil Vyas, Boaz Barak, Benjamin Edelman", "tldr": "", "abstract": "We study the feasibility of identifying *epistemic* uncertainty (reflecting a lack of knowledge), as opposed to *aleatoric* uncertainty (reflecting entropy in the underlying distribution), in the outputs of large language models (LLMs) over free-form text. In the absence of ground-truth probabilities, we explore a setting where, in order to (approximately) disentangle a given LLM's uncertainty, a significantly larger model stands in as a proxy for the ground truth. We show that small linear probes trained on the embeddings of frozen, pretrained models accurately predict when larger models will be more confident at the token level and that probes trained on one text domain generalize to others. 
Going further, we propose a fully unsupervised method that achieves non-trivial accuracy on the same task. Taken together, we interpret these results as evidence that LLMs naturally contain internal representations of different types of uncertainty that could potentially be leveraged to devise more informative indicators of model confidence in diverse practical settings. Code can be found at: https://github.com/KempnerInstitute/llm_uncertainty", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gustaf Ahdritz;Tian Qin;Nikhil Vyas;Boaz Barak;Benjamin L. Edelman", "authorids": "~Gustaf_Ahdritz2;~Tian_Qin3;~Nikhil_Vyas1;~Boaz_Barak2;~Benjamin_L._Edelman1", "gender": "M;F;M;M;M", "homepage": "https://gahdritz.github.io/;https://sunnytqin.github.io/;https://nikhilvyas.github.io/;https://boazbarak.org;https://www.benjaminedelman.com/", "dblp": ";;176/1074;b/BBarak;241/9410", "google_scholar": "https://scholar.google.com/citations?hl=en;;;I0fbJ6cAAAAJ;mQSj2C0AAAAJ", "orcid": "0000-0001-8283-5324;;;0000-0002-4053-8927;", "linkedin": ";sunny-qin-b70567203/;;;", "or_profile": "~Gustaf_Ahdritz2;~Tian_Qin3;~Nikhil_Vyas1;~Boaz_Barak2;~Benjamin_L_Edelman1", "aff": "Harvard University;Harvard University, Harvard University;Harvard University;Harvard University;Harvard University", "aff_domain": "harvard.edu;g.harvard.edu;harvard.edu;fas.harvard.edu;harvard.edu", "position": "PhD student;PhD student;Postdoc;Full Professor;PhD student", "bibtex": "@inproceedings{\nahdritz2024distinguishing,\ntitle={Distinguishing the Knowable from the Unknowable with Language Models},\nauthor={Gustaf Ahdritz and Tian Qin and Nikhil Vyas and Boaz Barak and Benjamin L. Edelman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ud4GSrqUKI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4536873, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9790902635261826088&as_sdt=5,38&sciodt=0,38&hl=en", "gs_version_total": 6, "email": "harvard.edu;g.harvard.edu;harvard.edu;fas.harvard.edu;harvard.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Scale-Aware Spatio-temporal Implicit Representation for Event-based Motion Deblurring", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32818", "id": "udFZhUgtkI", "proceeding": "https://proceedings.mlr.press/v235/yu24g.html", "pdf": "https://openreview.net/pdf?id=udFZhUgtkI", "openreview": "https://openreview.net/forum?id=udFZhUgtkI", "author_site": "Wei Yu, Jianing Li, Shengping Zhang, Xiangyang Ji", "tldr": "", "abstract": "Existing event-based motion deblurring methods mostly focus on restoring images with the same spatial and temporal scales as events. However, the unknown scales of images and events in the real world pose great challenges and have rarely been explored. To address this gap, we propose a novel Scale-Aware Spatio-temporal Network (SASNet) to flexibly restore blurred images with event streams at arbitrary scales. 
The core idea is to implicitly aggregate both spatial and temporal correspondence features of images and events to generalize at continuous scales. To restore highly blurred local areas, we develop a Spatial Implicit Representation Module (SIRM) to aggregate spatial correlation at any resolution through event encoding sampling. To tackle global motion blur, a Temporal Implicit Representation Module (TIRM) is presented to learn temporal correlation via temporal shift operations with long-term aggregation. Additionally, we build a High-resolution Hybrid Deblur (H2D) dataset using a new-generation hybrid event-based sensor, which comprises images with naturally spatially aligned and temporally synchronized events at various scales. Experiments demonstrate that our SASNet outperforms state-of-the-art methods on both synthetic GoPro and real H2D datasets, especially in high-speed motion scenarios. Code and dataset are available at https://github.com/aipixel/SASNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Yu;Jianing Li;Shengping Zhang;Xiangyang Ji", "authorids": "~Wei_Yu14;~Jianing_Li4;~Shengping_Zhang1;~Xiangyang_Ji1", "gender": "M;M;M;", "homepage": ";https://jianing-li.github.io/;http://homepage.hit.edu.cn/zhangshengping;", "dblp": ";;60/1866;", "google_scholar": "9-EijjUAAAAJ;https://scholar.google.com.hk/citations?user=xrYnfwcAAAAJ;hMNsT8sAAAAJ;", "orcid": "0000-0002-4805-3115;0000-0002-7468-0622;;", "linkedin": ";;;", "or_profile": "~Wei_Yu14;~Jianing_Li4;~Shengping_Zhang1;~Xiangyang_Ji1", "aff": "Harbin Institute of Technology;Peking University;Harbin Institute of Technology;", "aff_domain": "hit.edu.cn;pku.edu.cn;hit.edu.cn;", "position": "PhD student;Researcher;Full Professor;", "bibtex": "@inproceedings{\nyu2024learning,\ntitle={Learning Scale-Aware Spatio-temporal Implicit Representation for Event-based Motion Deblurring},\nauthor={Wei Yu and Jianing Li and Shengping Zhang and Xiangyang Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=udFZhUgtkI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6053038, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12490600808692138084&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "hit.edu.cn;pku.edu.cn;hit.edu.cn;", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Harbin Institute of Technology;Peking University", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;http://www.pku.edu.cn", "aff_unique_abbr": "HIT;Peking U", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Isometric Representation Learning for Disentangled Latent Space of Diffusion Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32817", "id": "ufCptn28vG", "proceeding": "https://proceedings.mlr.press/v235/hahm24a.html", "pdf": "https://openreview.net/pdf?id=ufCptn28vG", "openreview": "https://openreview.net/forum?id=ufCptn28vG", "author_site": "Jaehoon Hahm, Junho Lee, Sunghyun Kim, Joonseok Lee", "tldr": "", "abstract": "The latent space of diffusion model mostly still remains unexplored, despite its great success and potential in the field of generative modeling. 
In fact, the latent space of existing diffusion models are entangled, with a distorted mapping from its latent space to image space. To tackle this problem, we present Isometric Diffusion, equipping a diffusion model with a geometric regularizer to guide the model to learn a geometrically sound latent space of the training data manifold. This approach allows diffusion models to learn a more disentangled latent space, which enables smoother interpolation, more accurate inversion, and more precise control over attributes directly in the latent space. Our extensive experiments consisting of image interpolations, image inversions, and linear editing show the effectiveness of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaehoon Hahm;Junho Lee;Sunghyun Kim;Joonseok Lee", "authorids": "~Jaehoon_Hahm1;~Junho_Lee2;~Sunghyun_Kim4;~Joonseok_Lee1", "gender": "M;M;F;M", "homepage": "https://jaehoon-hahm.github.io/;https://sites.google.com/view/junopage;http://viplab.snu.ac.kr/viplab/members.html;http://www.joonseok.net", "dblp": ";;;77/1319.html", "google_scholar": "z9RfcGQAAAAJ;s_orZYMAAAAJ;;https://scholar.google.co.kr/citations?user=M-MfqpMAAAAJ", "orcid": ";;;", "linkedin": ";junho-lee-457748229/;;joonseoklee", "or_profile": "~Jaehoon_Hahm1;~Junho_Lee2;~Sunghyun_Kim4;~Joonseok_Lee1", "aff": "Seoul National University;Seoul National University;Seoul National University;Google Research", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;google.com", "position": "MS student;PhD student;MS student;Research Scientist", "bibtex": "@inproceedings{\nhahm2024isometric,\ntitle={Isometric Representation Learning for Disentangled Latent Space of Diffusion Models},\nauthor={Jaehoon Hahm and Junho Lee and Sunghyun Kim and Joonseok Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ufCptn28vG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9932684, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8686601567271018086&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;google.com", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Seoul National University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.snu.ac.kr;https://research.google", "aff_unique_abbr": "SNU;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "South Korea;United States" }, { "title": "Sparse Dimensionality Reduction Revisited", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32816", "id": "ufgVvFmUom", "proceeding": "https://proceedings.mlr.press/v235/hogsgaard24a.html", "pdf": "https://openreview.net/pdf?id=ufgVvFmUom", "openreview": "https://openreview.net/forum?id=ufgVvFmUom", "author_site": "Mikael M\u00f8ller H\u00f8gsgaard, Lior Kamma, Kasper Green Larsen, Jelani Nelson, Chris Schwiegelshohn", "tldr": "", "abstract": "The sparse Johnson-Lindenstrauss transform is one of the central techniques in dimensionality reduction. It supports embedding a set of $n$ points in $\\mathbb{R}^d$ into $m=O(\\varepsilon^{-2} \\ln n)$ dimensions while preserving all pairwise distances to within $1 \\pm \\varepsilon$. 
Each input point $x$ is embedded to $Ax$, where $A$ is an $m \\times d$ matrix having $s$ non-zeros per column, allowing for an embedding time of $O(s \\|x\\|_0)$. Since the sparsity of $A$ governs the embedding time, much work has gone into improving the sparsity $s$. The current state-of-the-art by Kane and Nelson (2014) shows that $s = O(\\varepsilon^{-1} \\ln n)$ suffices. This is almost matched by a lower bound of $s = \\Omega(\\varepsilon^{-1} \\ln n/\\ln(1/\\varepsilon))$ by Nelson and Nguyen (2013) for $d=\\Omega(n)$. Previous work thus suggests that we have near-optimal embeddings. In this work, we revisit sparse embeddings and present a sparser embedding for instances in which $d = n^{o(1)}$, which in many applications is realistic. Formally, our embedding achieves $s = O(\\varepsilon^{-1}(\\ln n/\\ln(1/\\varepsilon)+\\ln^{2/3}n \\ln^{1/3} d))$. We also complement our analysis by strengthening the lower bound of Nelson and Nguyen to hold also when $d \\ll n$, thereby matching the first term in our new sparsity upper bound. Finally, we also improve the sparsity of the best oblivious subspace embeddings for optimal embedding dimensionality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mikael M\u00f8ller H\u00f8gsgaard;Lior Kamma;Kasper Green Larsen;Jelani Nelson;Chris Schwiegelshohn", "authorids": "~Mikael_M\u00f8ller_H\u00f8gsgaard1;~Lior_Kamma1;~Kasper_Green_Larsen1;~Jelani_Nelson2;~Chris_Schwiegelshohn1", "gender": "M;;;M;M", "homepage": "https://pure.au.dk/portal/da/persons/mikael-moeller-hoegsgaard(3b07133a-329d-4585-a864-d37c7cb9056b).html;;https://cs.au.dk/~schwiegelshohn/;http://people.eecs.berkeley.edu/~minilek;http://www.cs.au.dk/~larsen/", "dblp": "295/8599;16/9056;https://dblp.uni-trier.de/pers/hd/s/Schwiegelshohn:Chris;68/3296.html;07/6242", "google_scholar": ";;X9Hl0LcAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=ZluoxUcAAAAJ", "orcid": ";;;;", "linkedin": ";;;minilek/;", "or_profile": "~Mikael_M\u00f8ller_H\u00f8gsgaard1;~Lior_Kamma1;~Chris_Schwiegelshohn1;~Jelani_Nelson1;~Kasper_Larsen1", "aff": "Aarhus University;The Academic College of Tel Aviv-Yaffo;Aarhus University;University of California, Berkeley;Aarhus University", "aff_domain": "cs.au.dk;mta.ac.il;cs.au.dk;berkeley.edu;au.dk", "position": "PhD student;Assistant Professor;Associate Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nh{\\o}gsgaard2024sparse,\ntitle={Sparse Dimensionality Reduction Revisited},\nauthor={Mikael M{\\o}ller H{\\o}gsgaard and Lior Kamma and Kasper Green Larsen and Jelani Nelson and Chris Schwiegelshohn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ufgVvFmUom}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 379642, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4915320759410305989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cs.au.dk;mta.ac.il;cs.au.dk;berkeley.edu;au.dk", "author_num": 5, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Aarhus University;Academic College of Tel Aviv-Yaffo;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://au.dk;https://www.acadcol-ta.ac.il;https://www.berkeley.edu", "aff_unique_abbr": "AU;;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": 
"0;1;0;2;0", "aff_country_unique": "Denmark;Israel;United States" }, { "title": "Towards Scalable and Versatile Weight Space Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32815", "id": "ug2uoAZ9c2", "proceeding": "https://proceedings.mlr.press/v235/schurholt24a.html", "pdf": "https://openreview.net/pdf?id=ug2uoAZ9c2", "openreview": "https://openreview.net/forum?id=ug2uoAZ9c2", "author_site": "Konstantin Sch\u00fcrholt, Michael Mahoney, Damian Borth", "tldr": "", "abstract": "Learning representations of well-trained neural network models holds the promise to provide an understanding of the inner workings of those models. However, previous work has either faced limitations when processing larger networks or was task-specific to either discriminative or generative tasks. This paper introduces the SANE approach to weight-space learning. SANE overcomes previous limitations by learning task-agnostic representations of neural networks that are scalable to larger models of varying architectures and that show capabilities beyond a single task. Our method extends the idea of *hyper-representations* towards sequential processing of subsets of neural network weights, thus allowing one to embed larger neural networks as a set of tokens into the learned representation space. SANE reveals global model information from layer-wise embeddings, and it can sequentially generate unseen neural network models, which was unattainable with previous *hyper-representation* learning methods. Extensive empirical evaluation demonstrates that SANE matches or exceeds state-of-the-art performance on several weight representation learning benchmarks, particularly in initialization for new tasks and larger ResNet architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Konstantin Sch\u00fcrholt;Michael W. Mahoney;Damian Borth", "authorids": "~Konstantin_Sch\u00fcrholt1;~Michael_W._Mahoney1;~Damian_Borth1", "gender": "M;;M", "homepage": "https://kschuerholt.github.io/;;http://www.hsg.ai", "dblp": "267/9297;;48/1492", "google_scholar": "refZxl4AAAAJ;;J-8Z038AAAAJ", "orcid": ";;0000-0002-4660-2627", "linkedin": "https://de.linkedin.com/in/konstantin-schuerholt/en;;damianborth/", "or_profile": "~Konstantin_Sch\u00fcrholt1;~Michael_W._Mahoney1;~Damian_Borth1", "aff": "University of St. Gallen;;Eindhoven University of Technology", "aff_domain": "unisg.ch;;tue.nl", "position": "PhD student;;Researcher", "bibtex": "@inproceedings{\nsch{\\\"u}rholt2024towards,\ntitle={Towards Scalable and Versatile Weight Space Learning},\nauthor={Konstantin Sch{\\\"u}rholt and Michael W. Mahoney and Damian Borth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ug2uoAZ9c2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2498274, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7158928731914354844&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "unisg.ch;;tue.nl", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "University of St. 
Gallen;Eindhoven University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.unisg.ch;https://www.tue.nl", "aff_unique_abbr": "HSG;TU/e", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;Netherlands" }, { "title": "On Prompt-Driven Safeguarding for Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32814", "id": "ugxGpOEkox", "proceeding": "https://proceedings.mlr.press/v235/zheng24n.html", "pdf": "https://openreview.net/pdf?id=ugxGpOEkox", "openreview": "https://openreview.net/forum?id=ugxGpOEkox", "author_site": "Chujie Zheng, Fan Yin, Hao Zhou, Fandong Meng, Jie Zhou, Kai-Wei Chang, Minlie Huang, Nanyun Peng", "tldr": "", "abstract": "Prepending model inputs with safety prompts is a common practice for safeguarding large language models (LLMs) against queries with harmful intents. However, the underlying working mechanisms of safety prompts have not been unraveled yet, restricting the possibility of automatically optimizing them to improve LLM safety. In this work, we investigate how LLMs' behavior (i.e., complying with or refusing user queries) is affected by safety prompts from the perspective of model representation. We find that in the representation space, the input queries are typically moved by safety prompts in a \"higher-refusal\" direction, in which models become more prone to refusing to provide assistance, even when the queries are harmless. On the other hand, LLMs are naturally capable of distinguishing harmful and harmless queries without safety prompts. Inspired by these findings, we propose a method for safety prompt optimization, namely DRO (Directed Representation Optimization). Treating a safety prompt as continuous, trainable embeddings, DRO learns to move the queries' representations along or opposite the refusal direction, depending on their harmfulness. 
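A toy sketch of the directed-representation idea just described: safety-prompt embeddings are trained so that query representations move along the refusal direction for harmful queries and opposite it for harmless ones. The encoder, the fixed refusal direction, and the signed-projection loss below are illustrative assumptions, not the paper's exact objective.

```python
import torch

torch.manual_seed(0)
hidden = 64

# Hypothetical stand-ins: a fixed "refusal direction" in representation space,
# and a toy encoder; the real method uses the LLM's own representations.
refusal_dir = torch.nn.functional.normalize(torch.randn(hidden), dim=0)

def encode(prompt_emb, query_emb):
    return torch.tanh(prompt_emb + query_emb)

# The safety prompt is treated as continuous, trainable embeddings.
prompt_emb = torch.randn(hidden, requires_grad=True)
opt = torch.optim.Adam([prompt_emb], lr=1e-2)

queries = torch.randn(32, hidden)
harmful = torch.randint(0, 2, (32,)).float()   # 1 = harmful, 0 = harmless

for step in range(100):
    reps = encode(prompt_emb, queries)
    proj = reps @ refusal_dir                  # position along the refusal direction
    # push harmful queries along the refusal direction, harmless ones opposite
    loss = (-(2 * harmful - 1) * proj).mean()
    opt.zero_grad(); loss.backward(); opt.step()

print(f"final loss: {loss.item():.3f}")
```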
Experiments with eight LLMs on out-of-domain and jailbreak benchmarks demonstrate that DRO remarkably improves the safeguarding performance of human-crafted safety prompts, without compromising the models' general performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chujie Zheng;Fan Yin;Hao Zhou;Fandong Meng;Jie Zhou;Kai-Wei Chang;Minlie Huang;Nanyun Peng", "authorids": "~Chujie_Zheng2;~Fan_Yin1;~Hao_Zhou8;~Fandong_Meng3;~Jie_Zhou8;~Kai-Wei_Chang1;~Minlie_Huang1;~Nanyun_Peng1", "gender": "M;M;M;M;M;M;M;F", "homepage": "https://chujiezheng.github.io/;;;http://fandongmeng.github.io/;;http://kwchang.net;http://coai.cs.tsinghua.edu.cn/hml;https://violetpeng.github.io/", "dblp": "242/8504;;;117/4056.html;00/5012-16;18/2428;;117/4036", "google_scholar": "55zBNgUAAAAJ;klShdV0AAAAJ;q3WaozcAAAAJ;sA8U4S0AAAAJ;https://scholar.google.com.hk/citations?user=OijxQCMAAAAJ;fqDBtzYAAAAJ;https://scholar.google.com/citations?hl=zh-CN;XxRXvX0AAAAJ", "orcid": ";;;0000-0002-8158-2377;0000-0002-5899-5165;0000-0001-5365-0072;;", "linkedin": "chujie-zheng-54b85820b/;fan-y-60b666180/;;;;kai-wei-chang-41239040;;", "or_profile": "~Chujie_Zheng2;~Fan_Yin1;~Hao_Zhou8;~Fandong_Meng3;~Jie_Zhou8;~Kai-Wei_Chang1;~Minlie_Huang1;~Nanyun_Peng1", "aff": "University of California, Los Angeles;University of California, Los Angeles;Tencent;WeChat AI, Tencent Inc.;WeChat AI, Tencent Inc.;Amazon;Tsinghua University;University of California, Los Angeles", "aff_domain": "ucla.edu;cs.ucla.edu;tencent.com;tencent.com;tencent.com;amazon.com;tsinghua.edu.cn;ucla.edu", "position": "Intern;PhD student;Researcher;Principal Researcher;Principal Researcher;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzheng2024on,\ntitle={On Prompt-Driven Safeguarding for Large Language Models},\nauthor={Chujie Zheng and Fan Yin and Hao Zhou and Fandong Meng and Jie Zhou and Kai-Wei Chang and Minlie Huang and Nanyun Peng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ugxGpOEkox}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3774349, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1525614600927265547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ucla.edu;cs.ucla.edu;tencent.com;tencent.com;tencent.com;amazon.com;tsinghua.edu.cn;ucla.edu", "author_num": 8, "aff_unique_index": "0;0;1;1;1;2;3;0", "aff_unique_norm": "University of California, Los Angeles;Tencent;Amazon;Tsinghua University", "aff_unique_dep": ";Tencent Holdings Limited;Amazon.com, Inc.;", "aff_unique_url": "https://www.ucla.edu;https://www.tencent.com;https://www.amazon.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UCLA;Tencent;Amazon;THU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;1;1;1;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Get More with LESS: Synthesizing Recurrence with KV Cache Compression for Efficient LLM Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32813", "id": "uhHDhVKFMW", "proceeding": "https://proceedings.mlr.press/v235/dong24f.html", "pdf": "https://openreview.net/pdf?id=uhHDhVKFMW", "openreview": "https://openreview.net/forum?id=uhHDhVKFMW", "author_site": "Harry Dong, Xinyu Yang, Zhenyu Zhang, Zhangyang \u201cAtlas\u201d Wang, Yuejie Chi, Beidi Chen", "tldr": 
"", "abstract": "Many computational factors limit broader deployment of large language models. In this paper, we focus on a memory bottleneck imposed by the key-value (KV) cache, a computational shortcut that requires storing previous KV pairs during decoding. While existing KV cache methods approach this problem by pruning or evicting large swaths of relatively less important KV pairs to dramatically reduce the memory footprint of the cache, they can have limited success in tasks that require recollecting a majority of previous tokens. To alleviate this issue, we propose LESS, a simple integration of a (nearly free) constant sized cache with eviction-based cache methods, such that all tokens can be queried at later decoding steps. Its ability to retain information throughout time shows merit on a variety of tasks where we demonstrate LESS can help reduce the performance gap from caching everything, sometimes even matching it, all while being efficient. Relevant code can be found at https://github.com/hdong920/LESS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harry Dong;Xinyu Yang;Zhenyu Zhang;Zhangyang Wang;Yuejie Chi;Beidi Chen", "authorids": "~Harry_Dong1;~Xinyu_Yang4;~Zhenyu_Zhang4;~Zhangyang_Wang1;~Yuejie_Chi1;~Beidi_Chen1", "gender": "M;M;M;M;;F", "homepage": "https://www.andrew.cmu.edu/user/harryd/;http://xinyuyang.me;https://zhenyu.gallery;https://vita-group.github.io;;https://www.andrew.cmu.edu/user/beidic/", "dblp": ";89/473-2.html;01/1844-15;119/4026;;192/1339", "google_scholar": ";;ZLyJRxoAAAAJ;pxFyKAIAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;zhenyu-allen-zhang-a9b1391a3/;;;", "or_profile": "~Harry_Dong1;~Xinyu_Yang4;~Zhenyu_Zhang4;~Zhangyang_Wang1;~Yuejie_Chi1;~Beidi_Chen1", "aff": "Carnegie Mellon University;Carnegie Mellon University;University of Texas at Austin;University of Texas at Austin;;Meta Facebook", "aff_domain": "cmu.edu;cmu.edu;utexas.edu;utexas.edu;;fb.com", "position": "PhD student;PhD student;PhD student;Associate Professor;;Researcher", "bibtex": "@inproceedings{\ndong2024get,\ntitle={Get More with {LESS}: Synthesizing Recurrence with {KV} Cache Compression for Efficient {LLM} Inference},\nauthor={Harry Dong and Xinyu Yang and Zhenyu Zhang and Zhangyang Wang and Yuejie Chi and Beidi Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uhHDhVKFMW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9732635, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8916375057188145169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cmu.edu;cmu.edu;utexas.edu;utexas.edu;;fb.com", "author_num": 6, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "Carnegie Mellon University;University of Texas at Austin;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://www.utexas.edu;https://meta.com", "aff_unique_abbr": "CMU;UT Austin;Meta", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Two Tales of Single-Phase Contrastive Hebbian Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32812", "id": "ui8ewXg1hV", "proceeding": "https://proceedings.mlr.press/v235/hoier24a.html", "pdf": "https://openreview.net/pdf?id=ui8ewXg1hV", "openreview": 
"https://openreview.net/forum?id=ui8ewXg1hV", "author_site": "Rasmus Kj\u00e6r H\u00f8ier, Christopher Zach", "tldr": "", "abstract": "The search for \"biologically plausible\" learning algorithms has converged on the idea of representing gradients as activity differences. However, most approaches require a high degree of synchronization (distinct phases during learning) and introduce substantial computational overhead, which raises doubts regarding their biological plausibility as well as their potential utility for neuromorphic computing. Furthermore, they commonly rely on applying infinitesimal perturbations (nudges) to output units, which is impractical in noisy environments. Recently it has been shown that by modelling artificial neurons as dyads with two oppositely nudged compartments, it is possible for a fully local learning algorithm named ``dual propagation'' to bridge the performance gap to backpropagation, without requiring separate learning phases or infinitesimal nudging. However, the algorithm has the drawback that its numerical stability relies on symmetric nudging, which may be restrictive in biological and analog implementations. In this work we first provide a solid foundation for the objective underlying the dual propagation method, which also reveals a surpising connection with adversarial robustness. Second, we demonstrate how dual propagation is related to a particular adjoint state method, which is stable regardless of asymmetric nudging.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rasmus H\u00f8ier;Christopher Zach", "authorids": "~Rasmus_H\u00f8ier1;~Christopher_Zach2", "gender": "M;M", "homepage": "https://www.chalmers.se/en/staff/Pages/Rasmus-Kjaer-Hoier.aspx;", "dblp": "264/5891;93/4824", "google_scholar": ";Pmi5GEAAAAAJ", "orcid": ";0000-0003-2840-6187", "linkedin": ";", "or_profile": "~Rasmus_H\u00f8ier1;~Christopher_Zach2", "aff": "Chalmers University of Technology;Chalmers University", "aff_domain": "chalmers.se;chalmers.se", "position": "PhD student;Research Professor", "bibtex": "@inproceedings{\nh{\\o}ier2024two,\ntitle={Two Tales of Single-Phase Contrastive Hebbian Learning},\nauthor={Rasmus H{\\o}ier and Christopher Zach},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ui8ewXg1hV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 827967, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10131513009501805340&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "chalmers.se;chalmers.se", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Chalmers University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.chalmers.se", "aff_unique_abbr": "Chalmers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Sweden" }, { "title": "Stability-Informed Initialization of Neural Ordinary Differential Equations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32811", "id": "uiqbnV4msl", "proceeding": "https://proceedings.mlr.press/v235/westny24a.html", "pdf": "https://openreview.net/pdf?id=uiqbnV4msl", "openreview": "https://openreview.net/forum?id=uiqbnV4msl", "author_site": "Theodor Westny, Arman Mohammadi, Daniel Jung, Erik Frisk", "tldr": "", "abstract": "This paper addresses the training of Neural Ordinary 
Differential Equations (neural ODEs), and in particular explores the interplay between numerical integration techniques, stability regions, step size, and initialization techniques. It is shown how the choice of integration technique implicitly regularizes the learned model, and how the solver's corresponding stability region affects training and prediction performance. From this analysis, a stability-informed parameter initialization technique is introduced. The effectiveness of the initialization method is displayed across several learning benchmarks and industrial applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Theodor Westny;Arman Mohammadi;Daniel Jung;Erik Frisk", "authorids": "~Theodor_Westny1;~Arman_Mohammadi1;~Daniel_Jung2;~Erik_Frisk1", "gender": "M;M;M;", "homepage": "https://liu.se/medarbetare/thewe60;https://liu.se/en/employee/armmo89;;https://liu.se/medarbetare/erifr93", "dblp": "302/4224;;;", "google_scholar": "PJqOR8gAAAAJ;;K7osHasAAAAJ;o7sLRpcAAAAJ", "orcid": "0000-0001-9075-7477;;;0000-0001-7349-1937", "linkedin": "theodor-westny/;;;", "or_profile": "~Theodor_Westny1;~Arman_Mohammadi1;~Daniel_Jung2;~Erik_Frisk1", "aff": "Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University;Link\u00f6ping University", "aff_domain": "liu.se;liu.se;liu.se;liu.se", "position": "PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nwestny2024stabilityinformed,\ntitle={Stability-Informed Initialization of Neural Ordinary Differential Equations},\nauthor={Theodor Westny and Arman Mohammadi and Daniel Jung and Erik Frisk},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uiqbnV4msl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1375640, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13429740629353327647&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "liu.se;liu.se;liu.se;liu.se", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Link\u00f6ping University", "aff_unique_dep": "", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Sweden" }, { "title": "DRED: Zero-Shot Transfer in Reinforcement Learning via Data-Regularised Environment Design", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32810", "id": "uku9r6RROl", "proceeding": "https://proceedings.mlr.press/v235/garcin24a.html", "pdf": "https://openreview.net/pdf?id=uku9r6RROl", "openreview": "https://openreview.net/forum?id=uku9r6RROl", "author_site": "Samuel Garcin, James Doran, Shangmin Guo, Christopher Lucas, Stefano V. Albrecht", "tldr": "", "abstract": "Autonomous agents trained using deep reinforcement learning (RL) often lack the ability to successfully generalise to new environments, even when these environments share characteristics with the ones they have encountered during training. In this work, we investigate how the sampling of individual environment instances, or levels, affects the zero-shot generalisation (ZSG) ability of RL agents. 
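A minimal sketch of value-loss-prioritised level sampling, the mechanism analysed in the next sentence of the abstract; the rank-based weighting below follows common prioritised-level-replay practice and is an assumption, not DRED's exact scoring.

```python
import numpy as np

rng = np.random.default_rng(0)
num_levels = 100
value_loss = rng.random(num_levels)   # per-level value-loss estimates (placeholder)

def level_sampling_probs(scores, temperature=0.5):
    """Rank-based prioritisation over levels.

    Levels with larger value loss are sampled more often; the exact scoring
    used in the paper may differ -- this is an illustrative assumption.
    """
    ranks = np.empty_like(scores)
    order = np.argsort(-scores)                 # descending by value loss
    ranks[order] = np.arange(1, len(scores) + 1)
    weights = (1.0 / ranks) ** (1.0 / temperature)
    return weights / weights.sum()

probs = level_sampling_probs(value_loss)
next_levels = rng.choice(num_levels, size=8, p=probs)
print(next_levels)
```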
We discover that, for deep actor-critic architectures sharing their base layers, prioritising levels according to their value loss minimises the mutual information between the agent's internal representation and the set of training levels in the generated training data. This provides a novel theoretical justification for the regularisation achieved by certain adaptive sampling strategies. We then turn our attention to unsupervised environment design (UED) methods, which assume control over level generation. We find that existing UED methods can significantly shift the training distribution, which translates to low ZSG performance. To prevent both overfitting and distributional shift, we introduce *data-regularised environment design* (DRED). DRED generates levels using a generative model trained to approximate the ground truth distribution of an initial set of level parameters. Through its grounding, DRED achieves significant improvements in ZSG over adaptive level sampling strategies and UED methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel Garcin;James Doran;Shangmin Guo;Christopher G. Lucas;Stefano V Albrecht", "authorids": "~Samuel_Garcin1;~James_Doran1;~Shangmin_Guo1;~Christopher_G._Lucas1;~Stefano_V_Albrecht1", "gender": "M;;M;;", "homepage": ";;;http://christopherglucas.com;https://agents-lab.org/stefano-albrecht/", "dblp": ";;183/0949;69/3093;118/3975", "google_scholar": ";;cpOrbSoAAAAJ;;https://scholar.google.co.uk/citations?user=ceSFqCcAAAAJ", "orcid": ";;0000-0003-1716-0994;;0000-0002-8735-1465", "linkedin": "samuel-garcin-7469b9b1/;;;;", "or_profile": "~Samuel_Garcin1;~James_Doran1;~Shangmin_Guo1;~Christopher_G._Lucas1;~Stefano_V_Albrecht1", "aff": "University of Edinburgh;;University of Edinburgh;University of Edinburgh, University of Edinburgh;University of Edinburgh", "aff_domain": "edinburgh.org;;ed.ac.uk;ed.ac.uk;ed.ac.uk", "position": "PhD student;;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\ngarcin2024dred,\ntitle={{DRED}: Zero-Shot Transfer in Reinforcement Learning via Data-Regularised Environment Design},\nauthor={Samuel Garcin and James Doran and Shangmin Guo and Christopher G. 
Lucas and Stefano V Albrecht},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uku9r6RROl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9281568, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16687835643792125019&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 5, "email": "edinburgh.org;;ed.ac.uk;ed.ac.uk;ed.ac.uk", "author_num": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Fast and Sample Efficient Multi-Task Representation Learning in Stochastic Contextual Bandits", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32809", "id": "uog14iBFLA", "proceeding": "https://proceedings.mlr.press/v235/lin24o.html", "pdf": "https://openreview.net/pdf?id=uog14iBFLA", "openreview": "https://openreview.net/forum?id=uog14iBFLA", "author_site": "Jiabin Lin, Shana Moothedath, Namrata Vaswani", "tldr": "", "abstract": "We study how representation learning can improve the learning efficiency of contextual bandit problems. We study the setting where we play T linear contextual bandits with dimension d simultaneously, and these T bandit tasks collectively share a common linear representation with a dimensionality of r \u226a d. We present a new algorithm based on alternating projected gradient descent (GD) and a minimization estimator to recover a low-rank feature matrix. We obtain constructive provable guarantees for our estimator that provide a lower bound on the required sample complexity and an upper bound on the iteration complexity (total number of iterations needed to achieve a certain error level). Using the proposed estimator, we present a multi-task learning algorithm for linear contextual bandits and prove the regret bound of our algorithm.
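A minimal NumPy sketch of the alternating scheme described above: per-task least squares in the current low-dimensional representation alternates with a gradient step on the shared feature matrix, projected back to orthonormal columns. The dimensions, step size, and QR-based projection are illustrative assumptions, not the paper's exact estimator.

```python
import numpy as np

rng = np.random.default_rng(0)
T, d, r, n = 20, 30, 3, 50          # tasks, ambient dim, shared rank, samples/task

B_true = np.linalg.qr(rng.standard_normal((d, r)))[0]   # shared representation
W_true = rng.standard_normal((r, T))
X = rng.standard_normal((T, n, d))
y = np.stack([X[t] @ B_true @ W_true[:, t] for t in range(T)])

B = np.linalg.qr(rng.standard_normal((d, r)))[0]        # orthonormal init
for it in range(50):
    # (1) per-task least squares in the current r-dim representation
    W = np.stack([np.linalg.lstsq(X[t] @ B, y[t], rcond=None)[0]
                  for t in range(T)], axis=1)
    # (2) gradient step on B over all tasks, then project back to orthonormal columns
    grad = sum(X[t].T @ (X[t] @ B @ W[:, t] - y[t])[:, None] @ W[:, t][None, :]
               for t in range(T)) / (T * n)
    B = np.linalg.qr(B - 0.5 * grad)[0]

# subspace distance between estimated and true representation
err = np.linalg.norm(B @ B.T - B_true @ B_true.T)
print(f"subspace error: {err:.4f}")
```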
We present experiments comparing the performance of our algorithm against benchmark algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiabin Lin;Shana Moothedath;Namrata Vaswani", "authorids": "jiabin@iastate.edu;~Shana_Moothedath1;~Namrata_Vaswani1", "gender": ";F;", "homepage": ";https://shanazuhara.wixsite.com/mysite;https://www.ece.iastate.edu/~namrata/", "dblp": ";;", "google_scholar": ";https://scholar.google.com/citations?hl=en;s-dQPO8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "jiabin@iastate.edu;~Shana_Moothedath1;~Namrata_Vaswani1", "aff": ";Iowa State University;Iowa State University", "aff_domain": ";iastate.edu;iastate.edu", "position": ";Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nlin2024fast,\ntitle={Fast and Sample Efficient Multi-Task Representation Learning in Stochastic Contextual Bandits},\nauthor={Jiabin Lin and Shana Moothedath and Namrata Vaswani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uog14iBFLA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 662603, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17814321847523617615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";iastate.edu;iastate.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Towards Compositionality in Concept Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32808", "id": "upO8FUwf92", "proceeding": "https://proceedings.mlr.press/v235/stein24b.html", "pdf": "https://openreview.net/pdf?id=upO8FUwf92", "openreview": "https://openreview.net/forum?id=upO8FUwf92", "author_site": "Adam Stein, Aaditya Naik, Yinjun Wu, Mayur Naik, Eric Wong", "tldr": "", "abstract": "Concept-based interpretability methods offer a lens into the internals of foundation models by decomposing their embeddings into high-level concepts. These concept representations are most useful when they are *compositional*, meaning that the individual concepts compose to explain the full sample. We show that existing unsupervised concept extraction methods find concepts which are not compositional. To automatically discover compositional concept representations, we identify two salient properties of such representations, and propose Compositional Concept Extraction (CCE) for finding concepts which obey these properties. We evaluate CCE on five different datasets over image and text data.
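As a toy illustration of what compositionality of concept representations means here: with additive concept vectors, a sample embedding should be explained by the sum of its constituent concepts. The random vectors and cosine scoring below are assumptions for illustration only, not CCE itself.

```python
import numpy as np

rng = np.random.default_rng(0)
dim = 16

# Hypothetical concept vectors (the paper learns these; random here).
concepts = {name: rng.standard_normal(dim) for name in
            ["red", "blue", "cube", "sphere"]}

def compose(*names):
    """Compositional concepts should explain a sample by their sum."""
    return sum(concepts[n] for n in names)

# A sample embedding that is (approximately) red + cube, plus noise.
sample = compose("red", "cube") + 0.1 * rng.standard_normal(dim)

def cos(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

# Score every attribute/object pair by cosine similarity to the sample.
pairs = [(a, o) for a in ["red", "blue"] for o in ["cube", "sphere"]]
best = max(pairs, key=lambda p: cos(sample, compose(*p)))
print(best)   # expected: ('red', 'cube')
```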
Our evaluation shows that CCE finds more compositional concept representations than baselines and yields better accuracy on four downstream classification tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adam Stein;Aaditya Naik;Yinjun Wu;Mayur Naik;Eric Wong", "authorids": "~Adam_Stein2;~Aaditya_Naik1;~Yinjun_Wu1;~Mayur_Naik1;~Eric_Wong1", "gender": "M;M;M;M;M", "homepage": "https://www.seas.upenn.edu/~steinad/;https://www.seas.upenn.edu/~asnaik;https://wuyinjun-1993.github.io/;http://www.cis.upenn.edu/~mhnaik/;http://riceric22.github.io/", "dblp": "217/4482;269/9481;169/1054;92/6794;64/1811-1.html", "google_scholar": "https://scholar.google.com/citations?hl=en;EfE0jh4AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=fmsV6nEAAAAJ;pWnTMRkAAAAJ", "orcid": "0000-0003-1887-100X;;;;", "linkedin": "adam-stein-086135241/;;;ai4code/;", "or_profile": "~Adam_Stein2;~Aaditya_Naik1;~Yinjun_Wu1;~Mayur_Naik1;~Eric_Wong1", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "seas.upenn.edu;upenn.edu;seas.upenn.edu;upenn.edu;upenn.edu", "position": "PhD student;PhD student;Postdoc;Professor;Assistant Professor", "bibtex": "@inproceedings{\nstein2024towards,\ntitle={Towards Compositionality in Concept Learning},\nauthor={Adam Stein and Aaditya Naik and Yinjun Wu and Mayur Naik and Eric Wong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=upO8FUwf92}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2570876, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6395104009991021802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "seas.upenn.edu;upenn.edu;seas.upenn.edu;upenn.edu;upenn.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Amortized Equation Discovery in Hybrid Dynamical Systems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32807", "id": "uqWfZ23O9g", "proceeding": "https://proceedings.mlr.press/v235/liu24at.html", "pdf": "https://openreview.net/pdf?id=uqWfZ23O9g", "openreview": "https://openreview.net/forum?id=uqWfZ23O9g", "author_site": "Yongtuo Liu, Sara Magliacane, Miltiadis (Miltos) Kofinas, Efstratios Gavves", "tldr": "", "abstract": "Hybrid dynamical systems are prevalent in science and engineering to express complex systems with continuous and discrete states. To learn laws of systems, all previous methods for equation discovery in hybrid systems follow a two-stage paradigm, i.e. they first group time series into small cluster fragments and then discover equations in each fragment separately through methods in non-hybrid systems. Although effective, performance is then limited because these methods ignore the commonalities in the shared dynamics of fragments that are driven by the same equations. Besides, the two-stage paradigm breaks the interdependence between categorizing and representing dynamics that jointly form hybrid systems. 
In this paper, we reformulate the problem and propose an end-to-end learning framework, i.e. Amortized Equation Discovery (AMORE), to jointly categorize modes and discover equations characterizing motion dynamics of each mode by all segments of the mode. Experiments on four hybrid and six non-hybrid systems demonstrate the superior performance of our method against previous methods on equation discovery, segmentation, and forecasting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongtuo Liu;Sara Magliacane;Miltiadis Kofinas;Stratis Gavves", "authorids": "~Yongtuo_Liu1;~Sara_Magliacane1;~Miltiadis_Kofinas2;~Stratis_Gavves1", "gender": "M;F;M;M", "homepage": ";http://saramagliacane.github.io;https://mkofinas.github.io;https://www.egavves.com", "dblp": "271/5155;120/5256;305/0160;03/8693", "google_scholar": "EjzPQtMAAAAJ;https://scholar.google.nl/citations?user=H3j_zQ4AAAAJ;Ur5BV8MAAAAJ;https://scholar.google.nl/citations?user=QqfCvsgAAAAJ", "orcid": ";;0000-0002-3392-4037;", "linkedin": "liu-yongtuo-b9091b11b/;magliacane/;miltiadiskofinas/;", "or_profile": "~Yongtuo_Liu1;~Sara_Magliacane1;~Miltiadis_Kofinas2;~Efstratios_Gavves1", "aff": "University of Amsterdam;University of Amsterdam;University of Amsterdam;University of Amsterdam", "aff_domain": "uva.nl;uva.nl;uva.nl;uva.nl", "position": "PhD student;Assistant Professor;PhD student;Associate Professor", "bibtex": "@inproceedings{\nliu2024amortized,\ntitle={Amortized Equation Discovery in Hybrid Dynamical Systems},\nauthor={Yongtuo Liu and Sara Magliacane and Miltiadis Kofinas and Stratis Gavves},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uqWfZ23O9g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1019671, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10059911018360371869&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "uva.nl;uva.nl;uva.nl;uva.nl", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Multi-Region Markovian Gaussian Process: An Efficient Method to Discover Directional Communications Across Multiple Brain Regions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32806", "id": "us6zMORsMe", "proceeding": "https://proceedings.mlr.press/v235/li24ak.html", "pdf": "https://openreview.net/pdf?id=us6zMORsMe", "openreview": "https://openreview.net/forum?id=us6zMORsMe", "author_site": "Weihan Li, Chengrui Li, Yule Wang, Anqi Wu", "tldr": "", "abstract": "Studying the complex interactions between different brain regions is crucial in neuroscience. Various statistical methods have explored the latent communication across multiple brain regions. Two main categories are the Gaussian Process (GP) and Linear Dynamical System (LDS), each with unique strengths. The GP-based approach effectively discovers latent variables with frequency bands and communication directions. Conversely, the LDS-based approach is computationally efficient but lacks powerful expressiveness in latent representation. 
In this study, we merge both methodologies by creating an LDS mirroring a multi-output GP, termed Multi-Region Markovian Gaussian Process (MRM-GP). Our work establishes a connection between an LDS and a multi-output GP that explicitly models frequencies and phase delays within the latent space of neural recordings. Consequently, the model achieves a linear inference cost over time points and provides an interpretable low-dimensional representation, revealing communication directions across brain regions and separating oscillatory communications into different frequency bands.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weihan Li;Chengrui Li;Yule Wang;Anqi Wu", "authorids": "~Weihan_Li1;~Chengrui_Li1;~Yule_Wang1;~Anqi_Wu3", "gender": "M;M;M;F", "homepage": "https://weihanlikk.github.io/;https://jerrysoybean.github.io/;https://yulewang97.github.io/;https://sites.google.com/view/brainml/home", "dblp": "24/8923;174/4237;;15/9453", "google_scholar": "qW4_NR4AAAAJ;https://scholar.google.com/citations?h;vqsl1YYAAAAJ;ptGYJiEAAAAJ", "orcid": ";0000-0001-5947-2393;;0000-0002-7866-9455", "linkedin": ";;yule-wang-a8002b195/;", "or_profile": "~Weihan_Li1;~Chengrui_Li1;~Yule_Wang1;~Anqi_Wu3", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nli2024multiregion,\ntitle={Multi-Region Markovian Gaussian Process: An Efficient Method to Discover Directional Communications Across Multiple Brain Regions},\nauthor={Weihan Li and Chengrui Li and Yule Wang and Anqi Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=us6zMORsMe}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7874405, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7554714000420805189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Language Agents with Reinforcement Learning for Strategic Play in the Werewolf Game", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32805", "id": "usUPvQH3XK", "proceeding": "https://proceedings.mlr.press/v235/xu24ad.html", "pdf": "https://openreview.net/pdf?id=usUPvQH3XK", "openreview": "https://openreview.net/forum?id=usUPvQH3XK", "author_site": "Zelai Xu, Chao Yu, Fei Fang, Yu Wang, Yi Wu", "tldr": "", "abstract": "Agents built with large language models (LLMs) have shown great potential across a wide range of domains. However, in complex decision-making tasks, pure LLM-based agents tend to exhibit intrinsic bias in their choice of actions, which is inherited from the model's training data and results in suboptimal performance. 
To develop *strategic language agents*, i.e., agents that generate flexible language actions and possess strong decision-making abilities, we propose a novel framework that powers LLM-based agents with reinforcement learning (RL). We consider Werewolf, a popular social deduction game, as a challenging testbed that emphasizes versatile communication and strategic gameplay. To mitigate the intrinsic bias in language actions, our agents use an LLM to perform deductive reasoning and generate a diverse set of action candidates. Then an RL policy trained to optimize the decision-making ability chooses an action from the candidates to play in the game. Extensive experiments show that our agents overcome the intrinsic bias and outperform existing LLM-based agents in the Werewolf game. We also conduct human-agent experiments and find that our agents achieve human-level performance and demonstrate strong strategic play.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zelai Xu;Chao Yu;Fei Fang;Yu Wang;Yi Wu", "authorids": "~Zelai_Xu1;~Chao_Yu1;~Fei_Fang1;~Yu_Wang3;~Yi_Wu1", "gender": "M;F;F;M;M", "homepage": "https://nicsefc.ee.tsinghua.edu.cn/people/ZelaiXu;http://zoeyuchao.github.io;https://feifang.info/;https://nicsefc.ee.tsinghua.edu.cn;https://jxwuyi.weebly.com", "dblp": ";36/6789-5;57/2878;w/YuWang2.html;", "google_scholar": "3JjcAnoAAAAJ;BYoq_bwAAAAJ;R6jE0VEAAAAJ;https://scholar.google.com.hk/citations?user=j8JGVvoAAAAJ;dusV5HMAAAAJ", "orcid": "0000-0001-5578-199X;0000-0001-6975-0158;;0000-0001-6108-5157;", "linkedin": ";;;;", "or_profile": "~Zelai_Xu1;~Chao_Yu1;~Fei_Fang1;~Yu_Wang3;~Yi_Wu1", "aff": "Tsinghua University;Tsinghua University;Carnegie Mellon University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mail.tsinghua.edu.cn;cmu.edu;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;Postdoc;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024language,\ntitle={Language Agents with Reinforcement Learning for Strategic Play in the Werewolf Game},\nauthor={Zelai Xu and Chao Yu and Fei Fang and Yu Wang and Yi Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=usUPvQH3XK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 731058, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2025690588575542688&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "tsinghua.edu.cn;mail.tsinghua.edu.cn;cmu.edu;tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Tsinghua University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "THU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Flora: Low-Rank Adapters Are Secretly Gradient Compressors", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32804", "id": "uubBZKM99Y", "proceeding": "https://proceedings.mlr.press/v235/hao24a.html", "pdf": "https://openreview.net/pdf?id=uubBZKM99Y", "openreview": "https://openreview.net/forum?id=uubBZKM99Y", "author_site": "Yongchang Hao, Yanshuai Cao, Lili Mou", "tldr": "", "abstract": "Despite large neural networks demonstrating remarkable 
abilities to complete different tasks, they require excessive memory usage to store the optimization states for training. To alleviate this, the low-rank adaptation (LoRA) is proposed to reduce the optimization states by training fewer parameters. However, LoRA restricts overall weight update matrices to be low-rank, limiting the model performance. In this work, we investigate the dynamics of LoRA and identify that it can be approximated by a random projection. Based on this observation, we propose Flora, which is able to achieve high-rank updates by resampling the projection matrices while enjoying the sublinear space complexity of optimization states. We conduct experiments across different tasks and model architectures to verify the effectiveness of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongchang Hao;Yanshuai Cao;Lili Mou", "authorids": "~Yongchang_Hao1;~Yanshuai_Cao1;~Lili_Mou1", "gender": "M;;M", "homepage": "https://yongchanghao.github.io;;https://lili-mou.github.io/", "dblp": "277/4987;;", "google_scholar": "sRqHvoYAAAAJ;https://scholar.google.ca/citations?user=RTVRTSsAAAAJ;https://scholar.google.com.hk/schhp?hl=en", "orcid": ";;", "linkedin": "yongchang-hao/;yanshuai-cao-b59878a4/;", "or_profile": "~Yongchang_Hao1;~Yanshuai_Cao1;~Lili_Mou1", "aff": "University of Alberta;Borealis AI;University of Alberta", "aff_domain": "ualberta.ca;borealisai.com;ualberta.ca", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nhao2024flora,\ntitle={Flora: Low-Rank Adapters Are Secretly Gradient Compressors},\nauthor={Yongchang Hao and Yanshuai Cao and Lili Mou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uubBZKM99Y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1235586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16549025915713920124&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "ualberta.ca;borealisai.com;ualberta.ca", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Alberta;Borealis AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://www.borealisai.com", "aff_unique_abbr": "UAlberta;Borealis AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Understanding the Training Speedup from Sampling with Approximate Losses", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32803", "id": "uun4fzaiat", "proceeding": "https://proceedings.mlr.press/v235/das24b.html", "pdf": "https://openreview.net/pdf?id=uun4fzaiat", "openreview": "https://openreview.net/forum?id=uun4fzaiat", "author_site": "Rudrajit Das, Xi Chen, Bertram Ieong, Parikshit Bansal, Sujay Sanghavi", "tldr": "", "abstract": "It is well known that selecting samples with large losses/gradients can significantly reduce the number of training steps. However, the selection overhead is often too high to yield any meaningful gains in terms of overall training time. In this work, we focus on the greedy approach of selecting samples with large *approximate losses* instead of exact losses in order to reduce the selection overhead. 
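A minimal PyTorch sketch of this greedy scheme, anticipating the early-exit instantiation (SIFT) described below: per-sample losses are approximated from an intermediate representation in a cheap no-grad pass, and only the samples with the largest approximate losses receive a full forward/backward step. The tiny model and the untrained exit head are illustrative assumptions.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

backbone = nn.Sequential(nn.Linear(32, 64), nn.ReLU(),   # "early" block
                         nn.Linear(64, 64), nn.ReLU())   # "late" block
early_head = nn.Linear(64, 10)   # exit head on the intermediate representation
full_head = nn.Linear(64, 10)
loss_fn = nn.CrossEntropyLoss(reduction="none")
opt = torch.optim.SGD(list(backbone.parameters()) + list(full_head.parameters()), lr=0.1)

x = torch.randn(256, 32)
y = torch.randint(0, 10, (256,))

# (1) cheap pass: per-sample approximate losses from the early exit
with torch.no_grad():
    h_early = backbone[:2](x)                 # stop after the first block
    approx_losses = loss_fn(early_head(h_early), y)

# (2) greedily keep the samples with the largest approximate losses
keep = approx_losses.topk(64).indices

# (3) full forward/backward only on the selected subset
loss = loss_fn(full_head(backbone(x[keep])), y[keep]).mean()
opt.zero_grad(); loss.backward(); opt.step()
print(f"trained on {len(keep)} of {len(x)} samples, loss {loss.item():.3f}")
```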
For smooth convex losses, we show that such a greedy strategy can converge to a constant factor of the minimum value of the average loss in fewer iterations than the standard approach of random selection. We also theoretically quantify the effect of the approximation level. We then develop SIFT, which uses early exiting to obtain approximate losses with an intermediate layer's representations for sample selection. We evaluate SIFT on the task of training a 110M-parameter 12-layer BERT base model, and show significant gains (in terms of training hours and number of backpropagation steps) without any optimized implementation over vanilla training. For example, to reach 64% validation accuracy, SIFT with exit at the first layer takes $\\sim$ 43 hours compared to $\\sim$ 57 hours of vanilla training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rudrajit Das;Xi Chen;Bertram Ieong;Parikshit Bansal;sujay sanghavi", "authorids": "~Rudrajit_Das1;~Xi_Chen37;~Bertram_Ieong1;~Parikshit_Bansal1;~sujay_sanghavi1", "gender": "M;F;;M;M", "homepage": "https://rudrajit15.github.io/;;;https://pbansal5.github.io;https://sites.utexas.edu/sanghavi", "dblp": "227/2712;;;;69/4911.html", "google_scholar": "L5K4mJMAAAAJ;https://scholar.google.com/citations?hl=en;;7a20V2oAAAAJ;O-DazBUAAAAJ", "orcid": ";;;;", "linkedin": "rudrajit-das-a6717a100/;xi-chen-78603120/;bertram-bert;;", "or_profile": "~Rudrajit_Das1;~Xi_Chen37;~Bertram_Ieong1;~Parikshit_Bansal1;~sujay_sanghavi1", "aff": "University of Texas, Austin;;;University of Texas at Austin;University of Texas, Austin", "aff_domain": "utexas.edu;;;utexas.edu;utexas.edu", "position": "PhD student;;;PhD student;Associate Professor", "bibtex": "@inproceedings{\ndas2024understanding,\ntitle={Understanding the Training Speedup from Sampling with Approximate Losses},\nauthor={Rudrajit Das and Xi Chen and Bertram Ieong and Parikshit Bansal and sujay sanghavi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uun4fzaiat}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 539471, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aaownRoUL0UJ:scholar.google.com/&scioq=Understanding+the+Training+Speedup+from+Sampling+with+Approximate+Losses&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "utexas.edu;;;utexas.edu;utexas.edu", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "RLAIF vs.
RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32802", "id": "uydQ2W41KO", "proceeding": "https://proceedings.mlr.press/v235/lee24t.html", "pdf": "https://openreview.net/pdf?id=uydQ2W41KO", "openreview": "https://openreview.net/forum?id=uydQ2W41KO", "author_site": "Harrison Lee, Samrat Phatale, Hassan Mansoor, Thomas Mesnard, Johan Ferret, Kellie Lu, Colton Bishop, Ethan Hall, Victor Carbune, Abhinav Rastogi, Sushant Prakash", "tldr": "", "abstract": "Reinforcement learning from human feedback (RLHF) has proven effective in aligning large language models (LLMs) with human preferences, but gathering high-quality preference labels is expensive. RL from AI Feedback (RLAIF), introduced in Bai et al. (2022b), offers a promising alternative that trains the reward model (RM) on preferences generated by an off-the-shelf LLM. Across the tasks of summarization, helpful dialogue generation, and harmless dialogue generation, we show that RLAIF achieves comparable performance to RLHF. Furthermore, we take a step towards \"self-improvement\" by demonstrating that RLAIF can outperform a supervised fine-tuned baseline even when the AI labeler is the same size as the policy, or even the exact same checkpoint as the initial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that circumvents RM training by obtaining rewards directly from an off-the-shelf LLM during RL, which achieves superior performance to canonical RLAIF. Our results suggest that RLAIF can achieve performance on-par with using human feedback, offering a potential solution to the scalability limitations of RLHF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harrison Lee;Samrat Phatale;Hassan Mansoor;Thomas Mesnard;Johan Ferret;Kellie Ren Lu;Colton Bishop;Ethan Hall;Victor Carbune;Abhinav Rastogi;Sushant Prakash", "authorids": "~Harrison_Lee1;~Samrat_Phatale1;~Hassan_Mansoor1;~Thomas_Mesnard2;~Johan_Ferret1;~Kellie_Ren_Lu1;~Colton_Bishop1;~Ethan_Hall1;~Victor_Carbune1;~Abhinav_Rastogi2;~Sushant_Prakash1", "gender": "M;M;M;;M;;M;M;M;M;M", "homepage": ";;https://www.linkedin.com/in/hassan-mansoor-6938364/;https://thomasmesnard.github.io/;https://ferretj.github.io;;https://bishopcolton.com;;https://ai.google/research/people/104909;;", "dblp": "249/6387-1;192/7541.html;;;;;;;199/7020;;15/8160", "google_scholar": ";gTK5jNYAAAAJ;;;uyUnqjMAAAAJ;qZ3I8gQAAAAJ;;;https://scholar.google.ch/citations?user=35djUQYAAAAJ;uDrgdtwAAAAJ;mTHA3HEAAAAJ", "orcid": ";;;;;;;;;;", "linkedin": "harrisonl;;;;;;;ethan-hall-397391b0/;vcarbune/;abhinav-rastogi-0a466934/;https://linkedin.com/in/sushant1", "or_profile": "~Harrison_Lee1;~Samrat_Phatale1;~Hassan_Mansoor1;~Thomas_Mesnard2;~Johan_Ferret1;~Kellie_Ren_Lu1;~Colton_Bishop1;~Ethan_Hall1;~Victor_Carbune1;~Abhinav_Rastogi2;~Sushant_Prakash1", "aff": "Google;Google DeepMind;Google;Google DeepMind;Google;;;Google;Google;Google;Google", "aff_domain": "google.com;deepmind.com;google.com;google.com;google.com;;;google.com;google.com;google.com;google.com", "position": "Researcher;Researcher;Researcher;PhD student;Researcher;;;Software Engineer;Researcher;Research Scientist;Researcher", "bibtex": "@inproceedings{\nlee2024rlaif,\ntitle={{RLAIF} vs. 
{RLHF}: Scaling Reinforcement Learning from Human Feedback with {AI} Feedback},\nauthor={Harrison Lee and Samrat Phatale and Hassan Mansoor and Thomas Mesnard and Johan Ferret and Kellie Ren Lu and Colton Bishop and Ethan Hall and Victor Carbune and Abhinav Rastogi and Sushant Prakash},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uydQ2W41KO}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1094922, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1177737291887470611&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "google.com;deepmind.com;google.com;google.com;google.com;;;google.com;google.com;google.com;google.com", "author_num": 11, "aff_unique_index": "0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;1;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Delaunay Graph: Addressing Over-Squashing and Over-Smoothing Using Delaunay Triangulation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32801", "id": "uyhjKoaIQa", "proceeding": "https://proceedings.mlr.press/v235/attali24a.html", "pdf": "https://openreview.net/pdf?id=uyhjKoaIQa", "openreview": "https://openreview.net/forum?id=uyhjKoaIQa", "author_site": "Hugo Attali, Davide Buscaldi, Nathalie Pernelle", "tldr": "", "abstract": "GNNs rely on the exchange of messages to distribute information along the edges of the graph. This approach makes the efficiency of architectures highly dependent on the specific structure of the input graph. Certain graph topologies lead to inefficient information propagation, resulting in a phenomenon known as over-squashing. While the majority of existing methods address over-squashing by rewiring the input graph, our novel approach involves constructing a graph directly from features using Delaunay Triangulation. We posit that the topological properties of the resulting graph prove advantageous for mitigating over-smoothing and over-squashing.
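The graph construction just described is easy to sketch with SciPy: triangulate the node features and take the triangle edges as the new graph. Delaunay triangulation is only practical in low dimension, so the 2-D features below stand in for a dimensionality-reduced feature matrix; that reduction step is an assumption of this sketch, not necessarily the paper's pipeline.

```python
import numpy as np
from scipy.spatial import Delaunay

rng = np.random.default_rng(0)
feats = rng.standard_normal((100, 2))   # node features, assumed already reduced to 2-D

# Build the graph directly from features via Delaunay triangulation:
# every edge of every triangle becomes an (undirected) graph edge.
tri = Delaunay(feats)
edges = set()
for simplex in tri.simplices:           # each simplex is a triangle (3 vertex ids)
    for i in range(3):
        a, b = sorted((simplex[i], simplex[(i + 1) % 3]))
        edges.add((a, b))

print(f"{len(feats)} nodes, {len(edges)} Delaunay edges")
```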
Our extensive experimentation demonstrates that our method consistently outperforms established graph rewiring methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hugo Attali;Davide Buscaldi;Nathalie Pernelle", "authorids": "~Hugo_Attali1;~Davide_Buscaldi2;~Nathalie_Pernelle1", "gender": ";M;F", "homepage": "https://sites.google.com/view/hugoattali/profile;https://buscaldi.eu;", "dblp": ";34/4842;55/2273", "google_scholar": "ld0pUJcAAAAJ;https://scholar.google.fr/citations?user=3qCGWfoAAAAJ;", "orcid": ";0000-0003-1112-3789;", "linkedin": "https://www.linkedin.com/feed/;dbuscaldi/;", "or_profile": "~Hugo_Attali1;~Davide_Buscaldi2;~Nathalie_Pernelle1", "aff": "University Paris 13, Universit\u00e9 Paris Nord (Paris XIII);Universit\u00e9 Paris 13;LIPN, Universit\u00e9 Sorbonne Paris Nord", "aff_domain": "lipn.univ-paris13.fr;univ-paris13.fr;paris13", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nattali2024delaunay,\ntitle={Delaunay Graph: Addressing Over-Squashing and Over-Smoothing Using Delaunay Triangulation},\nauthor={Hugo Attali and Davide Buscaldi and Nathalie Pernelle},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uyhjKoaIQa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 375334, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2610086682799436778&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 6, "email": "lipn.univ-paris13.fr;univ-paris13.fr;paris13", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University Paris 13;Universit\u00e9 Paris 13;Universit\u00e9 Sorbonne Paris Nord", "aff_unique_dep": ";;LIPN", "aff_unique_url": "https://www.univ-paris13.fr;https://www.univ-paris13.fr;https://www.univ-paris-nord.fr", "aff_unique_abbr": "UP13;UP13;USPN", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Smoothness Adaptive Hypothesis Transfer Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32800", "id": "v0VUsQI5yw", "proceeding": "https://proceedings.mlr.press/v235/lin24q.html", "pdf": "https://openreview.net/pdf?id=v0VUsQI5yw", "openreview": "https://openreview.net/forum?id=v0VUsQI5yw", "author_site": "Haotian Lin, Matthew Reimherr", "tldr": "", "abstract": "Many existing two-phase kernel-based hypothesis transfer learning algorithms employ the same kernel regularization across phases and rely on the known smoothness of functions to obtain optimality. Therefore, they fail to adapt to the varying and unknown smoothness between the target/source and their offset. This paper introduces Smoothness Adaptive Transfer Learning (SATL), a two-phase kernel ridge regression (KRR)-based algorithm to address these limitations. We first demonstrate that employing a misspecified fixed bandwidth Gaussian kernel in target-only KRR learning can achieve minimax optimality when the true function resides in Sobolev spaces. Leveraging this result, SATL enables the estimators to provably and universally adapt to the varying and unknown Sobolev smoothness of the source and offset functions. We derive the minimax lower bound of the learning problem in excess risk and show that SATL achieves a matching upper bound up to logarithmic factors. 
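A minimal two-phase KRR sketch in the spirit of SATL: fit the source function on plentiful source data, then fit only the offset on target residuals with its own kernel bandwidth. The fixed bandwidths below are illustrative assumptions; SATL's point is precisely that it adapts them to the unknown smoothness.

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge

rng = np.random.default_rng(0)

def f_source(x): return np.sin(3 * x)
def f_target(x): return np.sin(3 * x) + 0.3 * np.cos(8 * x)   # source + rougher offset

Xs = rng.uniform(-1, 1, (500, 1)); ys = f_source(Xs[:, 0]) + 0.1 * rng.standard_normal(500)
Xt = rng.uniform(-1, 1, (60, 1));  yt = f_target(Xt[:, 0]) + 0.1 * rng.standard_normal(60)

# Phase 1: estimate the source function on the large source sample.
src = KernelRidge(kernel="rbf", gamma=5.0, alpha=1e-3).fit(Xs, ys)

# Phase 2: fit only the offset (target minus transferred source) on target data,
# with its own, different bandwidth -- the offset may be rougher than the source.
resid = yt - src.predict(Xt)
off = KernelRidge(kernel="rbf", gamma=50.0, alpha=1e-2).fit(Xt, resid)

Xq = np.linspace(-1, 1, 5)[:, None]
pred = src.predict(Xq) + off.predict(Xq)
print(np.round(pred - f_target(Xq[:, 0]), 3))   # small errors despite few target samples
```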
The optimal statistical rate reveals the factors influencing the transfer dynamics and efficacy, including the source sample size and the relative strength between domains. The theoretical findings and the effectiveness of SATL are confirmed by several experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haotian Lin;Matthew Reimherr", "authorids": "~Haotian_Lin1;~Matthew_Reimherr1", "gender": "M;", "homepage": "https://haotianlin.github.io/;https://www.personal.psu.edu/~mlr36", "dblp": "177/6974-2;187/4282", "google_scholar": "DtHTtSwAAAAJ;UZcbx9gAAAAJ", "orcid": ";0000-0002-7149-0591", "linkedin": ";", "or_profile": "~Haotian_Lin1;~Matthew_Reimherr1", "aff": "Pennsylvania State University;Amazon", "aff_domain": "psu.edu;amazon.com", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\nlin2024smoothness,\ntitle={Smoothness Adaptive Hypothesis Transfer Learning},\nauthor={Haotian Lin and Matthew Reimherr},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v0VUsQI5yw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 616400, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13569984806370038568&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "psu.edu;amazon.com", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pennsylvania State University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.psu.edu;https://www.amazon.com", "aff_unique_abbr": "PSU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "TENG: Time-Evolving Natural Gradient for Solving PDEs With Deep Neural Nets Toward Machine Precision", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32799", "id": "v1I4zRAjMb", "proceeding": "https://proceedings.mlr.press/v235/chen24ad.html", "pdf": "https://openreview.net/pdf?id=v1I4zRAjMb", "openreview": "https://openreview.net/forum?id=v1I4zRAjMb", "author_site": "Zhuo Chen, Jacob McCarran, Esteban Vizcaino, Marin Solja\u010di\u0107, Di Luo", "tldr": "", "abstract": "Partial differential equations (PDEs) are instrumental for modeling dynamical systems in science and engineering. The advent of neural networks has initiated a significant shift in tackling these complexities though challenges in accuracy persist, especially for initial value problems. In this paper, we introduce the *Time-Evolving Natural Gradient (TENG)*, generalizing time-dependent variational principles and optimization-based time integration, leveraging natural gradient optimization to obtain high accuracy in neural-network-based PDE solutions. Our comprehensive development includes algorithms like TENG-Euler and its high-order variants, such as TENG-Heun, tailored for enhanced precision and efficiency. 
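A toy TENG-Euler-style step on the 1-D heat equation (one of the benchmarks mentioned below), using a Fourier ansatz that is linear in its parameters so the natural-gradient projection of each Euler target reduces to an exact least-squares solve. The basis, grid, and step size are illustrative assumptions, not the paper's network or solver.

```python
import numpy as np

# Heat equation u_t = u_xx on [0, 2*pi) with u_theta(x) = sum_k a_k cos(kx) + b_k sin(kx).
K = 5
x = np.linspace(0, 2 * np.pi, 64, endpoint=False)
basis = np.concatenate([[np.cos(k * x), np.sin(k * x)] for k in range(1, K + 1)])  # (2K, 64)
lap = np.concatenate([[-(k**2) * np.cos(k * x), -(k**2) * np.sin(k * x)]
                      for k in range(1, K + 1)])                                   # u_xx of basis

theta = np.zeros(2 * K); theta[0] = 1.0     # initial condition u(x, 0) = cos(x)
dt, steps = 1e-3, 1000
J = basis.T                                  # Jacobian d u / d theta at collocation points

for _ in range(steps):
    rhs = lap.T @ theta                      # u_xx under current parameters
    # Euler target u + dt*u_xx, projected back onto the model (Gauss-Newton step)
    delta, *_ = np.linalg.lstsq(J, dt * rhs, rcond=None)
    theta += delta

# exact solution: the cos(x) mode decays as exp(-t)
print(theta[0], np.exp(-dt * steps))         # both ~0.368
```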
TENG's effectiveness is further validated through its performance, surpassing current leading methods and achieving *machine precision* in step-by-step optimizations across a spectrum of PDEs, including the heat equation, Allen-Cahn equation, and Burgers' equation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuo Chen;Jacob McCarran;Esteban Vizcaino;Marin Soljacic;Di Luo", "authorids": "~Zhuo_Chen8;~Jacob_McCarran1;~Esteban_Vizcaino1;~Marin_Soljacic1;~Di_Luo1", "gender": ";M;M;;M", "homepage": ";;https://www.linkedin.com/in/esteban-vizcaino-9920a5210/;https://www.rle.mit.edu/marin/;", "dblp": ";;;131/2044;", "google_scholar": ";;;;OxZytTQAAAAJ", "orcid": ";;;;", "linkedin": ";jacob-mccarran-3ba648278/;;;", "or_profile": "~Zhuo_Chen8;~Jacob_McCarran1;~Esteban_Vizcaino1;~Marin_Soljacic1;~Di_Luo1", "aff": ";Massachusetts Institute of Technology;Massachusetts Institute of Technology;;Massachusetts Institute of Technology", "aff_domain": ";mit.edu;mit.edu;;mit.edu", "position": ";Undergrad student;Undergrad student;;Postdoc", "bibtex": "@inproceedings{\nchen2024teng,\ntitle={{TENG}: Time-Evolving Natural Gradient for Solving {PDE}s With Deep Neural Nets Toward Machine Precision},\nauthor={Zhuo Chen and Jacob McCarran and Esteban Vizcaino and Marin Soljacic and Di Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v1I4zRAjMb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1469651, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17833077628714993203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";mit.edu;mit.edu;;mit.edu", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Confronting Reward Overoptimization for Diffusion Models: A Perspective of Inductive and Primacy Biases", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32798", "id": "v2o9rRJcEv", "proceeding": "https://proceedings.mlr.press/v235/zhang24ch.html", "pdf": "https://openreview.net/pdf?id=v2o9rRJcEv", "openreview": "https://openreview.net/forum?id=v2o9rRJcEv", "author_site": "Ziyi Zhang, Sen Zhang, Yibing Zhan, Yong Luo, Yonggang Wen, Dacheng Tao", "tldr": "", "abstract": "Bridging the gap between diffusion models and human preferences is crucial for their integration into practical generative workflows. While optimizing downstream reward models has emerged as a promising alignment strategy, concerns arise regarding the risk of excessive optimization with learned reward models, which potentially compromises ground-truth performance. In this work, we confront the reward overoptimization problem in diffusion model alignment through the lenses of both inductive and primacy biases. We first identify a mismatch between current methods and the temporal inductive bias inherent in the multi-step denoising process of diffusion models, as a potential source of reward overoptimization. Then, we surprisingly discover that dormant neurons in our critic model act as a regularization against reward overoptimization while active neurons reflect primacy bias. 
Motivated by these observations, we propose Temporal Diffusion Policy Optimization with critic active neuron Reset (TDPO-R), a policy gradient algorithm that exploits the temporal inductive bias of diffusion models and mitigates the primacy bias stemming from active neurons. Empirical results demonstrate the superior efficacy of our methods in mitigating reward overoptimization. Code is available at https://github.com/ZiyiZhang27/tdpo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyi Zhang;Sen Zhang;Yibing Zhan;Yong Luo;Yonggang Wen;Dacheng Tao", "authorids": "~Ziyi_Zhang9;~Sen_Zhang3;~Yibing_Zhan2;~Yong_Luo2;~Yonggang_Wen1;~Dacheng_Tao1", "gender": "M;M;;M;M;", "homepage": "https://ziyizhang27.github.io/;https://github.com/SenZHANG-GitHub;;;https://personal.ntu.edu.sg/ygwen/;", "dblp": ";57/6221-6;;57/5272-2.html;;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;-bJJNV0AAAAJ;;zb1oVGIAAAAJ;https://scholar.google.com.tw/citations?user=byeygOkAAAAJ;", "orcid": "0000-0003-1728-2588;;;;;", "linkedin": ";;;;;", "or_profile": "~Ziyi_Zhang9;~Sen_Zhang3;~Yibing_Zhan2;~Yong_Luo2;~Yonggang_Wen1;~Dacheng_Tao1", "aff": "Wuhan University;University of Sydney, University of Sydney;;Wuhan University;Nanyang Technological University;", "aff_domain": "whu.edu.cn;sydney.edu.au;;whu.edu.cn;ntu.edu.sg;", "position": "PhD student;Postdoc;;Professor;Full Professor;", "bibtex": "@inproceedings{\nzhang2024confronting,\ntitle={Confronting Reward Overoptimization for Diffusion Models: A Perspective of Inductive and Primacy Biases},\nauthor={Ziyi Zhang and Sen Zhang and Yibing Zhan and Yong Luo and Yonggang Wen and Dacheng Tao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v2o9rRJcEv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4762194, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7696204213341954394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "whu.edu.cn;sydney.edu.au;;whu.edu.cn;ntu.edu.sg;", "author_num": 6, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Wuhan University;University of Sydney;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.whu.edu.cn/;https://www.sydney.edu.au;https://www.ntu.edu.sg", "aff_unique_abbr": "WHU;USYD;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "China;Australia;Singapore" }, { "title": "Adaptive Robust Learning using Latent Bernoulli Variables", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32797", "id": "v6eaD7Wekw", "proceeding": "https://proceedings.mlr.press/v235/karakulev24a.html", "pdf": "https://openreview.net/pdf?id=v6eaD7Wekw", "openreview": "https://openreview.net/forum?id=v6eaD7Wekw", "author_site": "Aleksandr Karakulev, Dave Zachariah, Prashant Singh", "tldr": "", "abstract": "We present an adaptive approach for robust learning from corrupted training sets. We identify corrupted and non-corrupted samples with latent Bernoulli variables and thus formulate the learning problem as maximization of the likelihood where latent variables are marginalized. The resulting problem is solved via variational inference, using an efficient Expectation-Maximization based method.
The proposed approach improves over the state-of-the-art by automatically inferring the corruption level, while adding minimal computational overhead. We demonstrate our robust learning method and its parameter-free nature on a wide variety of machine learning tasks, including online learning and deep learning, where it adapts to different levels of noise and maintains high prediction accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aleksandr Karakulev;Dave Zachariah;Prashant Singh", "authorids": "~Aleksandr_Karakulev1;~Dave_Zachariah1;~Prashant_Singh1", "gender": "M;;M", "homepage": ";;https://www.uu.se/en/contact-and-organisation/staff?query=N16-1953", "dblp": ";84/2663;", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;https://scholar.google.de/citations?user=Q3gybHUAAAAJ", "orcid": ";;0000-0002-3123-3478", "linkedin": ";;", "or_profile": "~Aleksandr_Karakulev1;~Dave_Zachariah1;~Prashant_Singh1", "aff": "Uppsala University;Uppsala University;Uppsala University", "aff_domain": "uu.se;it.uu.se;uu.se", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nkarakulev2024adaptive,\ntitle={Adaptive Robust Learning using Latent Bernoulli Variables},\nauthor={Aleksandr Karakulev and Dave Zachariah and Prashant Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v6eaD7Wekw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7924218, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1noXZZE80rMJ:scholar.google.com/&scioq=Adaptive+Robust+Learning+using+Latent+Bernoulli+Variables&hl=en&as_sdt=0,5", "gs_version_total": 8, "email": "uu.se;it.uu.se;uu.se", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Uppsala University", "aff_unique_dep": "", "aff_unique_url": "https://www.uu.se", "aff_unique_abbr": "UU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "title": "Balancing Similarity and Complementarity for Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32796", "id": "v6tAdeCXKH", "proceeding": "https://proceedings.mlr.press/v235/yan24a.html", "pdf": "https://openreview.net/pdf?id=v6tAdeCXKH", "openreview": "https://openreview.net/forum?id=v6tAdeCXKH", "author_site": "Kunda Yan, Sen Cui, Abudukelimu Wuerkaixi, Jingfeng ZHANG, Bo Han, Gang Niu, Masashi Sugiyama, Changshui Zhang", "tldr": "", "abstract": "In mobile and IoT systems, Federated Learning (FL) is increasingly important for effectively using data while maintaining user privacy. One key challenge in FL is managing statistical heterogeneity, such as non-i.i.d. data, arising from numerous clients and diverse data sources. This requires strategic cooperation, often with clients having similar characteristics. However, we are interested in a fundamental question: does achieving optimal cooperation necessarily entail cooperating with the most similar clients? Typically, significant model performance improvements are often realized not by partnering with the most similar models, but through leveraging complementary data.
Our theoretical and empirical analyses suggest that optimal cooperation is achieved by enhancing complementarity in feature distribution while restricting the disparity in the correlation between features and targets. Accordingly, we introduce a novel framework, FedSaC, which balances similarity and complementarity in FL cooperation. Our framework aims to approximate an optimal cooperation network for each client by optimizing a weighted sum of model similarity and feature complementarity. The strength of FedSaC lies in its adaptability to various levels of data heterogeneity and multimodal scenarios. Our comprehensive unimodal and multimodal experiments demonstrate that FedSaC markedly surpasses other state-of-the-art FL methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kunda Yan;Sen Cui;Abudukelimu Wuerkaixi;Jingfeng Zhang;Bo Han;Gang Niu;Masashi Sugiyama;Changshui Zhang", "authorids": "~Kunda_Yan1;~Sen_Cui1;~Abudukelimu_Wuerkaixi1;~Jingfeng_Zhang1;~Bo_Han1;~Gang_Niu1;~Masashi_Sugiyama1;~Changshui_Zhang2", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://github.com/yankd22;;https://www.baidu.com;https://zjfheart.github.io;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/;http://bigeye.au.tsinghua.edu.cn/english/Introduction.html;https://bhanml.github.io/", "dblp": ";267/5483;293/3368;227/2664.html;26/3367-1;35/1228;z/ChangshuiZhang;241/0472-3", "google_scholar": ";UzQuG1UAAAAJ;;NS0P1FkAAAAJ;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ;GL9M37YAAAAJ;nTNjqHwAAAAJ", "orcid": ";;;0000-0003-3491-8074;;0000-0001-6658-6743;;", "linkedin": ";;;;;;;", "or_profile": "~Kunda_Yan1;~Sen_Cui1;~Abudukelimu_Wuerkaixi1;~Jingfeng_Zhang1;~Gang_Niu1;~Masashi_Sugiyama1;~Changshui_Zhang2;~bo_han2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University, Beijing;University of Auckland;Southeast University;The University of Tokyo;Tsinghua University;MBZUAI", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;auckland.ac.nz;seu.edu.cn;u-tokyo.ac.jp;mail.tsinghua.edu.cn;mbzuai.ac.ae", "position": "PhD student;PhD student;PhD student;Assistant Professor;Adjunct Full Professor;Full Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nyan2024balancing,\ntitle={Balancing Similarity and Complementarity for Federated Learning},\nauthor={Kunda Yan and Sen Cui and Abudukelimu Wuerkaixi and Jingfeng Zhang and Bo Han and Gang Niu and Masashi Sugiyama and Changshui Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v6tAdeCXKH}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13058943763906399897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;auckland.ac.nz;seu.edu.cn;u-tokyo.ac.jp;mail.tsinghua.edu.cn;mbzuai.ac.ae", "author_num": 8, "aff_unique_index": "0;0;0;1;2;3;0;4", "aff_unique_norm": "Tsinghua University;University of Auckland;Southeast University;University of Tokyo;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.auckland.ac.nz;https://www.seu.edu.cn/;https://www.u-tokyo.ac.jp;https://www.mbzuai.ac.ae", "aff_unique_abbr": "THU;UoA;SEU;UTokyo;MBZUAI", 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;1;0;2;0;3", "aff_country_unique": "China;New Zealand;Japan;United Arab Emirates" }, { "title": "Tabular Insights, Visual Impacts: Transferring Expertise from Tables to Images", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32795", "id": "v7I5FtL2pV", "proceeding": "https://proceedings.mlr.press/v235/jiang24h.html", "pdf": "https://openreview.net/pdf?id=v7I5FtL2pV", "openreview": "https://openreview.net/forum?id=v7I5FtL2pV", "author_site": "Jun-Peng Jiang, Han-Jia Ye, Leye Wang, Yang Yang, Yuan Jiang, De-Chuan Zhan", "tldr": "", "abstract": "Transferring knowledge across diverse data modalities is receiving increasing attention in machine learning. This paper tackles the task of leveraging expert-derived, yet expensive, tabular data to enhance image-based predictions when tabular data is unavailable during inference. The primary challenges stem from the inherent complexity of accurately mapping diverse tabular data to visual contexts, coupled with the necessity to devise distinct strategies for numerical and categorical tabular attributes. We propose CHannel tAbulaR alignment with optiMal tranSport (Charms), which establishes an alignment between image channels and tabular attributes, enabling selective knowledge transfer that is pertinent to visual features. Specifically, Charms measures similarity distributions across modalities to effectively differentiate and transfer relevant tabular features, with a focus on morphological characteristics, enhancing the capabilities of visual classifiers. By maximizing the mutual information between image channels and tabular features, knowledge from both numerical and categorical tabular attributes are extracted. 
Experimental results demonstrate that Charms not only enhances the performance of image classifiers but also improves their interpretability by effectively utilizing tabular knowledge.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun-Peng Jiang;Han-Jia Ye;Leye Wang;Yang Yang;Yuan Jiang;De-Chuan Zhan", "authorids": "~Jun-Peng_Jiang2;~Han-Jia_Ye1;~Leye_Wang1;~Yang_Yang17;~Yuan_Jiang1;~De-Chuan_Zhan1", "gender": ";M;M;M;F;M", "homepage": "http://www.lamda.nju.edu.cn/jiangjp/;http://www.lamda.nju.edu.cn/yehj;https://wangleye.github.io/;http://www.njustkmg.cn/;http://lamda.nju.edu.cn/jiangy;http://www.lamda.nju.edu.cn/zhandc/", "dblp": "266/2867;165/3014;07/8764;48/450-74;;74/498", "google_scholar": "ZZ_7-TQAAAAJ;mgOYhtoAAAAJ;;_6NJip0AAAAJ;;mYJf4TcAAAAJ", "orcid": ";;;0000-0002-5245-3584;;0000-0002-3533-2078", "linkedin": ";;;;;", "or_profile": "~Jun-Peng_Jiang2;~Han-Jia_Ye1;~Leye_Wang1;~Yang_Yang17;~Yuan_Jiang1;~De-Chuan_Zhan1", "aff": "NanJing University;Nanjing University;Peking University;Nanjing University of Science and Technology;Nanjing University;Nanjing University", "aff_domain": "nju.edu.cn;nju.edu.cn;pku.edu.cn;njust.edu.cn;nju.edu.cn;nju.edu.cn", "position": "PhD student;Associate Professor;Assistant Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\njiang2024tabular,\ntitle={Tabular Insights, Visual Impacts: Transferring Expertise from Tables to Images},\nauthor={Jun-Peng Jiang and Han-Jia Ye and Leye Wang and Yang Yang and Yuan Jiang and De-Chuan Zhan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v7I5FtL2pV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2264128, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9614932538150419497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "nju.edu.cn;nju.edu.cn;pku.edu.cn;njust.edu.cn;nju.edu.cn;nju.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Nanjing University;Peking University;Nanjing University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nju.edu.cn;http://www.pku.edu.cn;http://www.nust.edu.cn/", "aff_unique_abbr": "Nanjing U;Peking U;NUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Model Assessment and Selection under Temporal Distribution Shift", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32794", "id": "v8MgLJ7kbL", "proceeding": "https://proceedings.mlr.press/v235/han24b.html", "pdf": "https://openreview.net/pdf?id=v8MgLJ7kbL", "openreview": "https://openreview.net/forum?id=v8MgLJ7kbL", "author_site": "Elise Han, Chengpiao Huang, Kaizheng Wang", "tldr": "", "abstract": "We investigate model assessment and selection in a changing environment, by synthesizing datasets from both the current time period and historical epochs. To tackle unknown and potentially arbitrary temporal distribution shift, we develop an adaptive rolling window approach to estimate the generalization error of a given model. This strategy also facilitates the comparison between any two candidate models by estimating the difference of their generalization errors. 
We further integrate pairwise comparisons into a single-elimination tournament, achieving near-optimal model selection from a collection of candidates. Theoretical analyses and empirical experiments underscore the adaptivity of our proposed methods to the non-stationarity in data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elise Han;Chengpiao Huang;Kaizheng Wang", "authorids": "lh3117@columbia.edu;~Chengpiao_Huang1;~Kaizheng_Wang1", "gender": ";;", "homepage": ";https://ch3702.github.io/;", "dblp": ";;", "google_scholar": ";s2LGkrEAAAAJ;", "orcid": ";;", "linkedin": ";;", "or_profile": "lh3117@columbia.edu;~Chengpiao_Huang1;~Kaizheng_Wang1", "aff": ";Columbia University;", "aff_domain": ";columbia.edu;", "position": ";PhD student;", "bibtex": "@inproceedings{\nhan2024model,\ntitle={Model Assessment and Selection under Temporal Distribution Shift},\nauthor={Elise Han and Chengpiao Huang and Kaizheng Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v8MgLJ7kbL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1197197, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12111897845434557318&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";columbia.edu;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Energy-Efficient Gaussian Processes Using Low-Precision Arithmetic", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32793", "id": "v9tIJW1fzt", "proceeding": "https://proceedings.mlr.press/v235/alder24a.html", "pdf": "https://openreview.net/pdf?id=v9tIJW1fzt", "openreview": "https://openreview.net/forum?id=v9tIJW1fzt", "author_site": "Nicolas Alder, Ralf Herbrich", "tldr": "", "abstract": "The widespread use of artificial intelligence requires finding energy-efficient paradigms for the field. We propose to reduce the energy consumption of Gaussian process regression using low-precision floating-point representations. We explore how low-precision representations impact the results of Gaussian process regression and how data set properties, implementation approach, model performance, and energy consumption interact. Our findings show that a well-conditioned kernel matrix allows reducing the energy consumption by up to 89.01% for 98.08% of arithmetic operations with little to no impact on model performance. 
Our findings are relevant whenever one needs to invert a symmetric full-rank matrix.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicolas Alder;Ralf Herbrich", "authorids": "~Nicolas_Alder1;~Ralf_Herbrich1", "gender": "Not Specified;M", "homepage": ";https://herbrich.me", "dblp": "282/0349;h/RalfHerbrich", "google_scholar": "G90XsK0AAAAJ;RuvHkikAAAAJ", "orcid": ";", "linkedin": ";ralf-herbrich-28a8324/", "or_profile": "~Nicolas_Alder1;~Ralf_Herbrich1", "aff": "Hasso Plattner Institute;Hasso Plattner Institute", "aff_domain": "hpi.de;hpi.de", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nalder2024energyefficient,\ntitle={Energy-Efficient Gaussian Processes Using Low-Precision Arithmetic},\nauthor={Nicolas Alder and Ralf Herbrich},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=v9tIJW1fzt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 403815, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jsShxrPshb8J:scholar.google.com/&scioq=Energy-Efficient+Gaussian+Processes+Using+Low-Precision+Arithmetic&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "hpi.de;hpi.de", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Hasso Plattner Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.hpi.de", "aff_unique_abbr": "HPI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Modelling Microbial Communities with Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32792", "id": "vBJZ93tvoE", "proceeding": "https://proceedings.mlr.press/v235/ruaud24a.html", "pdf": "https://openreview.net/pdf?id=vBJZ93tvoE", "openreview": "https://openreview.net/forum?id=vBJZ93tvoE", "author_site": "Albane Ruaud, Cansu Sancaktar, Marco Bagatella, Christoph Ratzke, Georg Martius", "tldr": "", "abstract": "Understanding the interactions and interplay of microorganisms is a great challenge with many applications in medical and environmental settings. In this work, we model bacterial communities directly from their genomes using graph neural networks (GNNs). GNNs leverage the inductive bias induced by the set nature of bacteria, enforcing permutation invariance and granting combinatorial generalization. We propose to learn the dynamics implicitly by directly predicting community relative abundance profiles at steady state, thus escaping the need for growth curves. On two real-world datasets, we show for the first time generalization to unseen bacteria and different community structures. 
To investigate the prediction results more deeply, we create a simulation for flexible data generation and analyze effects of bacteria interaction strength, community size, and training data amount.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Albane Ruaud;Cansu Sancaktar;Marco Bagatella;Christoph Ratzke;Georg Martius", "authorids": "~Albane_Ruaud1;~Cansu_Sancaktar1;~Marco_Bagatella1;~Christoph_Ratzke1;~Georg_Martius1", "gender": ";F;;;M", "homepage": ";https://csancaktar.github.io;;https://www.cratzke.de/;https://uni-tuebingen.de/de/264672", "dblp": ";256/5345;;;47/2706", "google_scholar": ";9JqNY7UAAAAJ;;6Duap4gAAAAJ;https://scholar.google.de/citations?user=b-JF-UIAAAAJ", "orcid": "0000-0001-5920-1710;;;;", "linkedin": ";cansu-sancaktar-61715b140/;marco-bagatella-9b8017197/;;", "or_profile": "~Albane_Ruaud1;~Cansu_Sancaktar1;~Marco_Bagatella1;~Christoph_Ratzke1;~Georg_Martius1", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;Qualcomm Inc, QualComm;Max Planck Institute for Intelligent Systems, Max Planck Institute for Intelligent Systems;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;Max Planck Institute for Intelligent Systems", "aff_domain": "uni-tuebingen.de;qti.qualcomm.com;is.tue.mpg.de;uni-tuebingen.de;tuebingen.mpg.de", "position": "Postdoc;Intern;PhD student;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nruaud2024modelling,\ntitle={Modelling Microbial Communities with Graph Neural Networks},\nauthor={Albane Ruaud and Cansu Sancaktar and Marco Bagatella and Christoph Ratzke and Georg Martius},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vBJZ93tvoE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2565168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8483046927231631903&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "uni-tuebingen.de;qti.qualcomm.com;is.tue.mpg.de;uni-tuebingen.de;tuebingen.mpg.de", "author_num": 5, "aff_unique_index": "0;1;2;0;2", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;Qualcomm Incorporated;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.qualcomm.com;https://www.mpi-is.mpg.de", "aff_unique_abbr": "Uni T\u00fcbingen;Qualcomm;MPI-IS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "Germany;United States" }, { "title": "Lookbehind-SAM: k steps back, 1 step forward", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32791", "id": "vCN5lwcWWE", "proceeding": "https://proceedings.mlr.press/v235/mordido24a.html", "pdf": "https://openreview.net/pdf?id=vCN5lwcWWE", "openreview": "https://openreview.net/forum?id=vCN5lwcWWE", "author_site": "Gon\u00e7alo Mordido, Pranshu Malviya, Aristide Baratin, Sarath Chandar", "tldr": "", "abstract": "Sharpness-aware minimization (SAM) methods have gained increasing popularity by formulating the problem of minimizing both loss value and loss sharpness as a minimax objective. In this work, we increase the efficiency of the maximization and minimization parts of SAM's objective to achieve a better loss-sharpness trade-off. 
By taking inspiration from the Lookahead optimizer, which uses multiple descent steps ahead, we propose Lookbehind, which performs multiple ascent steps behind to enhance the maximization step of SAM and find a worst-case perturbation with higher loss. Then, to mitigate the variance in the descent step arising from the gathered gradients across the multiple ascent steps, we employ linear interpolation to refine the minimization step. Lookbehind leads to a myriad of benefits across a variety of tasks. Particularly, we show increased generalization performance, greater robustness against noisy weights, as well as improved learning and less catastrophic forgetting in lifelong learning settings. Our code is available at https://github.com/chandar-lab/Lookbehind-SAM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Goncalo Mordido;Pranshu Malviya;Aristide Baratin;Sarath Chandar", "authorids": "~Goncalo_Mordido1;~Pranshu_Malviya1;~Aristide_Baratin1;~Sarath_Chandar1", "gender": ";M;;M", "homepage": ";https://pranshu28.github.io/about/;;http://sarathchandar.in/", "dblp": ";;;45/8542", "google_scholar": ";;;https://scholar.google.co.in/citations?user=yxWtZLAAAAAJ", "orcid": ";;;", "linkedin": ";pranshumalviya2/;;", "or_profile": "~Goncalo_Mordido1;~Pranshu_Malviya1;~Aristide_Baratin1;~Sarath_Chandar1", "aff": ";\u00c9cole Polytechnique de Montr\u00e9al, Universit\u00e9 de Montr\u00e9al;;\u00c9cole Polytechnique de Montr\u00e9al", "aff_domain": ";polymtl.ca;;polymtl.ca", "position": ";PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nmordido2024lookbehindsam,\ntitle={Lookbehind-{SAM}: k steps back, 1 step forward},\nauthor={Goncalo Mordido and Pranshu Malviya and Aristide Baratin and Sarath Chandar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vCN5lwcWWE}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 885685, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13817760683905675690&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";polymtl.ca;;polymtl.ca", "author_num": 4, "aff_unique_index": "0;0", "aff_unique_norm": "\u00c9cole Polytechnique de Montr\u00e9al", "aff_unique_dep": "", "aff_unique_url": "https://www.polymtl.ca", "aff_unique_abbr": "Polytechnique Montr\u00e9al", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "UGrid: An Efficient-And-Rigorous Neural Multigrid Solver for Linear PDEs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32790", "id": "vFATIZXlCm", "proceeding": "https://proceedings.mlr.press/v235/han24a.html", "pdf": "https://openreview.net/pdf?id=vFATIZXlCm", "openreview": "https://openreview.net/forum?id=vFATIZXlCm", "author_site": "Xi Han, Fei Hou, Hong Qin", "tldr": "", "abstract": "Numerical solvers of Partial Differential Equations (PDEs) are of fundamental significance to science and engineering. To date, the historical reliance on legacy techniques has circumscribed possible integration of big data knowledge and exhibits sub-optimal efficiency for certain PDE formulations, while data-driven neural methods typically lack mathematical guarantee of convergence and correctness. This paper articulates a mathematically rigorous neural solver for linear PDEs. 
The proposed UGrid solver, built upon the principled integration of U-Net and MultiGrid, manifests a mathematically rigorous proof of both convergence and correctness, and showcases high numerical accuracy, as well as strong generalization power to various input geometry/values and multiple PDE formulations. In addition, we devise a new residual loss metric, which enables unsupervised training and affords more stability and a larger solution space over the legacy losses.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xi Han;Fei Hou;Hong Qin", "authorids": "~Xi_Han1;~Fei_Hou1;~Hong_Qin1", "gender": "M;M;M", "homepage": ";https://lcs.ios.ac.cn/~houf/;http://www.cs.stonybrook.edu/~qin", "dblp": ";24/3702;79/627-1", "google_scholar": "paSti1kAAAAJ;NWoYRf8AAAAJ;NOcejj8AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xi_Han1;~Fei_Hou1;~Hong_Qin1", "aff": "State University of New York at Stony Brook;Institute of Software, Chinese Academy of Sciences;Stony Brook University (State University of New York, Stony Brook)", "aff_domain": "stonybrook.edu;ios.ac.cn;cs.stonybrook.edu", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nhan2024ugrid,\ntitle={{UG}rid: An Efficient-And-Rigorous Neural Multigrid Solver for Linear {PDE}s},\nauthor={Xi Han and Fei Hou and Hong Qin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vFATIZXlCm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4147966, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1521160273802162521&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "stonybrook.edu;ios.ac.cn;cs.stonybrook.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "State University of New York at Stony Brook;Chinese Academy of Sciences;Stony Brook University", "aff_unique_dep": ";Institute of Software;", "aff_unique_url": "https://www.stonybrook.edu;http://www.ios.ac.cn;https://www.stonybrook.edu", "aff_unique_abbr": "SUNY Stony Brook;CAS;SBU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "title": "Interpreting Equivariant Representations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32789", "id": "vFk9fqXLst", "proceeding": "https://proceedings.mlr.press/v235/hansen24a.html", "pdf": "https://openreview.net/pdf?id=vFk9fqXLst", "openreview": "https://openreview.net/forum?id=vFk9fqXLst", "author_site": "Andreas Abildtrup Hansen, Anna Calissano, Aasa Feragen", "tldr": "", "abstract": "Latent representations are extensively used for tasks like visualization, interpolation, or feature extraction in deep learning models. This paper demonstrates the importance of considering the inductive bias imposed by an equivariant model when using latent representations as neglecting these biases can lead to decreased performance in downstream tasks. 
We propose principles for choosing invariant projections of latent representations and show their effectiveness in two examples: a permutation equivariant variational auto-encoder for molecular graph generation, where an invariant projection can be designed to maintain information without loss, and a rotation-equivariant representation in image classification, where random invariant projections prove to retain a high degree of information. In both cases, the analysis of invariant latent representations proves superior to their equivariant counterparts. Finally, we illustrate that the phenomena documented here for equivariant neural networks have counterparts in standard neural networks where invariance is encouraged via augmentation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Andreas Abildtrup Hansen;Anna Calissano;Aasa Feragen", "authorids": "~Andreas_Abildtrup_Hansen1;~Anna_Calissano1;~Aasa_Feragen2", "gender": ";F;", "homepage": ";https://annacalissano.com/;http://people.compute.dtu.dk/afhar/", "dblp": ";;62/8616", "google_scholar": ";;MNDVpoUAAAAJ", "orcid": ";0000-0002-7403-0531;", "linkedin": ";;", "or_profile": "~Andreas_Abildtrup_Hansen1;~Anna_Calissano1;~Aasa_Feragen2", "aff": ";Imperial College London;Technical University of Denmark", "aff_domain": ";imperial.ac.uk;dtu.dk", "position": ";Postdoc;Professor", "bibtex": "@inproceedings{\nhansen2024interpreting,\ntitle={Interpreting Equivariant Representations},\nauthor={Andreas Abildtrup Hansen and Anna Calissano and Aasa Feragen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vFk9fqXLst}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8000843, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6712259069507697350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";imperial.ac.uk;dtu.dk", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Imperial College London;Technical University of Denmark", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://www.tek.dk", "aff_unique_abbr": "ICL;DTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Denmark" }, { "title": "Gradient Compressed Sensing: A Query-Efficient Gradient Estimator for High-Dimensional Zeroth-Order Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32788", "id": "vG7YpsJT74", "proceeding": "https://proceedings.mlr.press/v235/qiu24g.html", "pdf": "https://openreview.net/pdf?id=vG7YpsJT74", "openreview": "https://openreview.net/forum?id=vG7YpsJT74", "author_site": "Ruizhong Qiu, Hanghang Tong", "tldr": "", "abstract": "We study nonconvex zeroth-order optimization (ZOO) in a high-dimensional space $\\mathbb R^d$ for functions with approximately $s$-sparse gradients. To reduce the dependence on the dimensionality $d$ in the query complexity, high-dimensional ZOO methods seek to leverage gradient sparsity to design gradient estimators. The previous best method needs $O\\big(s\\log\\frac ds\\big)$ queries per step to achieve $O\\big(\\frac1T\\big)$ rate of convergence w.r.t. the number $T$ of steps.
In this paper, we propose *Gradient Compressed Sensing* (GraCe), a query-efficient and accurate estimator for sparse gradients that uses only $O\\big(s\\log\\log\\frac ds\\big)$ queries per step and still achieves $O\\big(\\frac1T\\big)$ rate of convergence. To our best knowledge, we are the first to achieve a *double-logarithmic* dependence on $d$ in the query complexity under weaker assumptions. Our proposed GraCe generalizes the Indyk\u2013Price\u2013Woodruff (IPW) algorithm in compressed sensing from linear measurements to nonlinear functions. Furthermore, since the IPW algorithm is purely theoretical due to its impractically large constant, we improve the IPW algorithm via our *dependent random partition* technique together with our corresponding novel analysis and successfully reduce the constant by a factor of nearly $4300$. Our GraCe is not only theoretically query-efficient but also achieves strong empirical performance. We benchmark our GraCe against $12$ existing ZOO methods with $10000$-dimensional functions and demonstrate that GraCe significantly outperforms existing methods. Our code is publicly available at https://github.com/q-rz/ICML24-GraCe.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruizhong Qiu;Hanghang Tong", "authorids": "~Ruizhong_Qiu1;~Hanghang_Tong3", "gender": "M;", "homepage": "https://q-rz.github.io/;http://tonghanghang.org", "dblp": "330/9860;58/1757", "google_scholar": "REKarmcAAAAJ;RaINcuUAAAAJ", "orcid": "0009-0000-3253-8890;0000-0003-4405-3887", "linkedin": "ruizhong-qiu/;htong/", "or_profile": "~Ruizhong_Qiu1;~Hanghang_Tong3", "aff": "University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu", "position": "MS student;Associate Professor", "bibtex": "@inproceedings{\nqiu2024gradient,\ntitle={Gradient Compressed Sensing: A Query-Efficient Gradient Estimator for High-Dimensional Zeroth-Order Optimization},\nauthor={Ruizhong Qiu and Hanghang Tong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vG7YpsJT74}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 828407, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12906714090235614798&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "email": "illinois.edu;illinois.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Simulation-Based Inference with Quantile Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32787", "id": "vGHOFeUQi8", "proceeding": "https://proceedings.mlr.press/v235/jia24a.html", "pdf": "https://openreview.net/pdf?id=vGHOFeUQi8", "openreview": "https://openreview.net/forum?id=vGHOFeUQi8", "tldr": "", "abstract": "We present Neural Quantile Estimation (NQE), a novel Simulation-Based Inference (SBI) method based on conditional quantile regression. NQE autoregressively learns individual one dimensional quantiles for each posterior dimension, conditioned on the data and previous posterior dimensions. 
Posterior samples are obtained by interpolating the predicted quantiles using a monotonic cubic Hermite spline, with specific treatment for the tail behavior and multi-modal distributions. We introduce an alternative definition for the Bayesian credible region using the local Cumulative Distribution Function (CDF), offering substantially faster evaluation than the traditional Highest Posterior Density Region (HPDR). In the case of a limited simulation budget and/or known model misspecification, a post-processing calibration step can be integrated into NQE to ensure the unbiasedness of the posterior estimation with negligible additional computational cost. We demonstrate that NQE achieves state-of-the-art performance on a variety of benchmark problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "He Jia", "authorids": "~He_Jia1", "gender": "M", "homepage": "http://hejia.io", "dblp": "20/10769", "google_scholar": "VLAp8QMAAAAJ", "orcid": "0000-0002-9958-7758", "linkedin": "he-jia/", "or_profile": "~He_Jia1", "aff": "Princeton University", "aff_domain": "princeton.edu", "position": "PhD student", "bibtex": "@inproceedings{\njia2024simulationbased,\ntitle={Simulation-Based Inference with Quantile Regression},\nauthor={He Jia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vGHOFeUQi8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1547097, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 1, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4549921219416747862&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "princeton.edu", "author_num": 1, "aff_unique_index": "0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Semantic-Aware Human Object Interaction Image Generation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32786", "id": "vITl6CqIkk", "proceeding": "https://proceedings.mlr.press/v235/xu24e.html", "pdf": "https://openreview.net/pdf?id=vITl6CqIkk", "openreview": "https://openreview.net/forum?id=vITl6CqIkk", "author_site": "zhu xu, Qingchao Chen, Yuxin Peng, Yang Liu", "tldr": "", "abstract": "Recent text-to-image generative models have demonstrated remarkable abilities in generating realistic images. Despite their great success, these models struggle to generate high-fidelity images with prompts oriented toward human-object interaction (HOI). The difficulty in HOI generation arises from two aspects. Firstly, the complexity and diversity of human poses challenge plausible human generation. Furthermore, untrustworthy generation of interaction boundary regions may lead to deficiency in HOI semantics. To tackle these problems, we propose a Semantic-Aware HOI generation framework SA-HOI. It utilizes human pose quality and interaction boundary region information as guidance for the denoising process, thereby encouraging refinement in these regions to produce more reasonable HOI images. Based on it, we establish an iterative inversion and image refinement pipeline to continually enhance generation quality.
Further, we introduce a comprehensive benchmark for HOI generation, which comprises a dataset involving diverse and fine-grained HOI categories, along with multiple custom-tailored evaluation metrics for HOI generation. Experiments demonstrate that our method significantly improves generation quality under both HOI-specific and conventional image evaluation metrics. The code is available at https://github.com/XZPKU/SA-HOI.git", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhu Xu;Qingchao Chen;Yuxin Peng;Yang Liu", "authorids": "~Zhu_Xu1;~Qingchao_Chen2;~Yuxin_Peng1;~Yang_Liu20", "gender": "M;M;M;F", "homepage": "https://github.com/xz-123-new;;http://39.108.48.32/mipl/pengyuxin/;http://www.csyangliu.com/", "dblp": ";123/9213;;51/3710-105", "google_scholar": ";Nm8aSfIAAAAJ;mFsXPNYAAAAJ;Yhwu4C4AAAAJ", "orcid": ";;0000-0001-7658-3845;", "linkedin": ";;;", "or_profile": "~Zhu_Xu1;~Qingchao_Chen2;~Yuxin_Peng1;~Yang_Liu20", "aff": "Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024semanticaware,\ntitle={Semantic-Aware Human Object Interaction Image Generation},\nauthor={Zhu Xu and Qingchao Chen and Yuxin Peng and Yang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vITl6CqIkk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8716526, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14581548019341897412&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 3, "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Locality-Sensitive Hashing-Based Efficient Point Transformer with Applications in High-Energy Physics", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32785", "id": "vJx6fld6l0", "proceeding": "https://proceedings.mlr.press/v235/miao24b.html", "pdf": "https://openreview.net/pdf?id=vJx6fld6l0", "openreview": "https://openreview.net/forum?id=vJx6fld6l0", "author_site": "Siqi Miao, Zhiyuan Lu, Mia Liu, Javier Duarte, Pan Li", "tldr": "", "abstract": "This study introduces a novel transformer model optimized for large-scale point cloud processing in scientific domains such as high-energy physics (HEP) and astrophysics. Addressing the limitations of graph neural networks and standard transformers, our model integrates local inductive bias and achieves near-linear complexity with hardware-friendly regular operations. One contribution of this work is the quantitative analysis of the error-complexity tradeoff of various sparsification techniques for building efficient transformers. Our findings highlight the superiority of using locality-sensitive hashing (LSH), especially OR & AND-construction LSH, in kernel approximation for large-scale point cloud data with local inductive bias. 
Based on this finding, we propose LSH-based Efficient Point Transformer (**HEPT**), which combines E$^2$LSH with OR & AND constructions and is built upon regular computations. HEPT demonstrates remarkable performance on two critical yet time-consuming HEP tasks, significantly outperforming existing GNNs and transformers in accuracy and computational speed, marking a significant advancement in geometric deep learning and large-scale scientific data processing. Our code is available at https://github.com/Graph-COM/HEPT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Siqi Miao;Zhiyuan Lu;Mia Liu;Javier Duarte;Pan Li", "authorids": "~Siqi_Miao1;~Zhiyuan_Lu1;~Mia_Liu1;~Javier_Duarte1;~Pan_Li2", "gender": ";;F;M;", "homepage": "https://siqi.plus/;;https://mia.physics.purdue.edu/;https://jduarte.physics.ucsd.edu;", "dblp": "312/7014-1;;;218/6528;https://dblp.org/pers/hd/l/Li_0005:Pan", "google_scholar": "bVF_CzUAAAAJ;YHjKBWQAAAAJ;;GTtW9H0AAAAJ;IroP0EwAAAAJ", "orcid": ";;;0000-0002-5076-7096;", "linkedin": ";;;javier-m-duarte/;pan-li-b951105a/", "or_profile": "~Siqi_Miao1;~Zhiyuan_Lu1;~Mia_Liu1;~Javier_Duarte1;~Pan_Li2", "aff": "Georgia Institute of Technology;None;Purdue University;University of California, San Diego;Purdue University", "aff_domain": "gatech.edu;none.com;purdue.edu;ucsd.edu;purdue.edu", "position": "PhD student;None;Assistant Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nmiao2024localitysensitive,\ntitle={Locality-Sensitive Hashing-Based Efficient Point Transformer with Applications in High-Energy Physics},\nauthor={Siqi Miao and Zhiyuan Lu and Mia Liu and Javier Duarte and Pan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vJx6fld6l0}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4542962, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15739366140009123422&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "gatech.edu;none.com;purdue.edu;ucsd.edu;purdue.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Georgia Institute of Technology;Google;Purdue University;University of California, San Diego", "aff_unique_dep": ";Google AI;;", "aff_unique_url": "https://www.gatech.edu;https://ai.google;https://www.purdue.edu;https://www.ucsd.edu", "aff_unique_abbr": "Georgia Tech;Google AI;Purdue;UCSD", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;San Diego", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Chain of Code: Reasoning with a Language Model-Augmented Code Emulator", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32784", "id": "vKtomqlSxm", "proceeding": "https://proceedings.mlr.press/v235/li24ar.html", "pdf": "https://openreview.net/pdf?id=vKtomqlSxm", "openreview": "https://openreview.net/forum?id=vKtomqlSxm", "author_site": "Chengshu Li, Jacky Liang, Andy Zeng, Xinyun Chen, Karol Hausman, Dorsa Sadigh, Sergey Levine, Li Fei-Fei, Fei Xia, brian ichter", "tldr": "", "abstract": "Code provides a general syntactic structure to build complex programs and perform precise computations when paired with a code interpreter \u2013 we hypothesize that language models (LMs) can leverage code-writing to improve Chain of Thought reasoning not only for logic and arithmetic tasks, but also for 
semantic ones (and in particular, those that are a mix of both). For example, consider prompting an LM to write code that counts the number of times it detects sarcasm in an essay: the LM may struggle to write an implementation for \"detect_sarcasm(string)\" that can be executed by the interpreter (handling the edge cases would be insurmountable). However, LMs may still produce a valid solution if they not only write code, but also selectively \"emulate\" the interpreter by generating the expected output of \"detect_sarcasm(string)\". In this work, we propose Chain of Code (CoC), a simple yet surprisingly effective extension that improves LM code-driven reasoning. The key idea is to encourage LMs to format semantic sub-tasks in a program as flexible pseudocode, such that the interpreter can explicitly catch undefined behaviors and hand them off to an LM to simulate (as an \"LMulator\"). Experiments demonstrate that Chain of Code outperforms Chain of Thought and other baselines across a variety of benchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of 12% over Chain of Thought. In a nutshell, CoC broadens the scope of reasoning questions that LMs can answer by \"thinking in code\".", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengshu Li;Jacky Liang;Andy Zeng;Xinyun Chen;Karol Hausman;Dorsa Sadigh;Sergey Levine;Li Fei-Fei;Fei Xia;brian ichter", "authorids": "~Chengshu_Li1;~Jacky_Liang1;~Andy_Zeng3;~Xinyun_Chen1;~Karol_Hausman2;~Dorsa_Sadigh1;~Sergey_Levine1;~Li_Fei-Fei1;~Fei_Xia1;~brian_ichter1", "gender": "M;M;F;M;F;M;;M;M;F", "homepage": "https://www.chengshuli.me/;https://www.jacky.io;https://dorsa.fyi/;https://people.eecs.berkeley.edu/~svlevine/;https://profiles.stanford.edu/fei-fei-li;;;http://andyzeng.github.io/;https://karolhausman.github.io/;https://jungyhuk.github.io/", "dblp": "63/6091-2;;117/3174;80/7594;79/2528;;;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy;135/8164;", "google_scholar": "yay_v9EAAAAJ;K29Sv1EAAAAJ;ZaJEZpYAAAAJ;8R35rCwAAAAJ;rDfyQnIAAAAJ;pqP5_PgAAAAJ;-w5DuHgAAAAJ;q7nFtUcAAAAJ;yy0UFOwAAAAJ;d4W1UT0AAAAJ", "orcid": "0000-0002-9027-8617;;;;;0000-0003-4343-1444;;;;", "linkedin": "chengshu/;jackyliang42;;;fei-fei-li-4541247/;;;;karolhausman/;", "or_profile": "~Chengshu_Li1;~Jacky_Liang1;~Dorsa_Sadigh1;~Sergey_Levine1;~Li_Fei-Fei1;~Fei_Xia1;~brian_ichter1;~Andy_Zeng1;~Karol_Hausman1;~Xinyun_Chen2", "aff": "Stanford University;Google;Stanford University;Google;Stanford University;Google;Google;Google;Google Brain;Google", "aff_domain": "stanford.edu;google.com;stanford.edu;google.com;stanford.edu;google.com;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;Assistant Professor;Research Scientist;Full Professor;Researcher;Research Scientist;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\nli2024chain,\ntitle={Chain of Code: Reasoning with a Language Model-Augmented Code Emulator},\nauthor={Chengshu Li and Jacky Liang and Andy Zeng and Xinyun Chen and Karol Hausman and Dorsa Sadigh and Sergey Levine and Li Fei-Fei and Fei Xia and brian ichter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vKtomqlSxm}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1744320, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17002185806256415894&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email":
"stanford.edu;google.com;stanford.edu;google.com;stanford.edu;google.com;google.com;google.com;google.com;google.com", "author_num": 10, "aff_unique_index": "0;1;0;1;0;1;1;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;1;0;1;0;1;1;1;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Stereographic Spherical Sliced Wasserstein Distances", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32783", "id": "vLtVGtEz5h", "proceeding": "https://proceedings.mlr.press/v235/tran24a.html", "pdf": "https://openreview.net/pdf?id=vLtVGtEz5h", "openreview": "https://openreview.net/forum?id=vLtVGtEz5h", "author_site": "Huy Tran, Yikun Bai, Abihith Kothapalli, Ashkan Shahbazi, XINRAN LIU, Rocio Diaz Martin, Soheil Kolouri", "tldr": "", "abstract": "Comparing spherical probability distributions is of great interest in various fields, including geology, medical domains, computer vision, and deep representation learning. The utility of optimal transport-based distances, such as the Wasserstein distance, for comparing probability measures has spurred active research in developing computationally efficient variations of these distances for spherical probability measures. This paper introduces a high-speed and highly parallelizable distance for comparing spherical measures using the stereographic projection and the generalized Radon transform, which we refer to as the Stereographic Spherical Sliced Wasserstein (S3W) distance. We carefully address the distance distortion caused by the stereographic projection and provide an extensive theoretical analysis of our proposed metric and its rotationally invariant variation. Finally, we evaluate the performance of the proposed metrics and compare them with recent baselines in terms of both speed and accuracy through a wide range of numerical studies, including gradient flows and self-supervised learning. 
Our code is available at https://github.com/mint-vu/s3wd.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huy Tran;Yikun Bai;Abihith Kothapalli;Ashkan Shahbazi;Xinran Liu;Rocio P Diaz Martin;Soheil Kolouri", "authorids": "~Huy_Tran3;~Yikun_Bai2;~Abihith_Kothapalli1;~Ashkan_Shahbazi1;~Xinran_Liu2;~Rocio_P_Diaz_Martin1;~Soheil_Kolouri1", "gender": "M;M;M;M;;M;F", "homepage": "https://www.linkedin.com/in/huytransformer/;;https://abi-kothapalli.github.io/;;;https://skolouri.github.io/;", "dblp": ";273/3993.html;;;;143/9637;", "google_scholar": ";zLm6JOAAAAAJ;;lzdU2j8AAAAJ;ZHz5VScAAAAJ;yREBSy0AAAAJ;7RHakmMAAAAJ", "orcid": ";;;;;0000-0001-8495-5362;0000-0002-3732-6296", "linkedin": "huytransformer/;yikun-bai-b70050138/?trk=public_profile_browsemap;abikothapalli/;;xinran-l-5777a0205/;skolouri/;", "or_profile": "~Huy_Tran3;~Yikun_Bai2;~Abihith_Kothapalli1;~Ashkan_Shahbazi1;~Xinran_Liu2;~Soheil_Kolouri1;~ROCIO_DIAZ_MARTIN1", "aff": "University of Memphis;Vanderbilt University;Vanderbilt University;Vanderbilt University;Vanderbilt University;Vanderbilt University;Tufts University", "aff_domain": "memphis.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;tufts.edu", "position": "Undergrad student;Postdoc;Undergrad student;PhD student;PhD student;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\ntran2024stereographic,\ntitle={Stereographic Spherical Sliced Wasserstein Distances},\nauthor={Huy Tran and Yikun Bai and Abihith Kothapalli and Ashkan Shahbazi and Xinran Liu and Rocio P Diaz Martin and Soheil Kolouri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vLtVGtEz5h}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9968792, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18231906569686173150&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "memphis.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;vanderbilt.edu;tufts.edu", "author_num": 7, "aff_unique_index": "0;1;1;1;1;1;2", "aff_unique_norm": "University of Memphis;Vanderbilt University;Tufts University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.memphis.edu;https://www.vanderbilt.edu;https://www.tufts.edu", "aff_unique_abbr": "UM;Vanderbilt;Tufts", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Particle Denoising Diffusion Sampler", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32782", "id": "vMUnnS4OWC", "proceeding": "https://proceedings.mlr.press/v235/phillips24a.html", "pdf": "https://openreview.net/pdf?id=vMUnnS4OWC", "openreview": "https://openreview.net/forum?id=vMUnnS4OWC", "author_site": "Angus Phillips, Hai-Dang Dau, Michael Hutchinson, Valentin De Bortoli, George Deligiannidis, Arnaud Doucet", "tldr": "", "abstract": "Denoising diffusion models have become ubiquitous for generative modeling. The core idea is to transport the data distribution to a Gaussian by using a diffusion. Approximate samples from the data distribution are then obtained by estimating the time-reversal of this diffusion using score matching ideas. We follow here a similar strategy to sample from unnormalized probability densities and compute their normalizing constants. 
However, the time-reversed diffusion is simulated here using an original iterative particle scheme relying on a novel score matching loss. Contrary to standard denoising diffusion models, the resulting Particle Denoising Diffusion Sampler (PDDS) provides asymptotically consistent estimates under mild assumptions. We demonstrate PDDS on multimodal and high dimensional sampling tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Angus Phillips;Hai-Dang Dau;Michael John Hutchinson;Valentin De Bortoli;George Deligiannidis;Arnaud Doucet", "authorids": "~Angus_Phillips1;~Hai-Dang_Dau1;~Michael_John_Hutchinson1;~Valentin_De_Bortoli1;~George_Deligiannidis2;~Arnaud_Doucet2", "gender": "M;M;M;;M;", "homepage": "https://www.stats.ox.ac.uk/;https://hai-dang-dau.github.io/;https://mjhutchinson.github.io;https://vdeborto.github.io/;https://www.stats.ox.ac.uk/~deligian;https://www.stats.ox.ac.uk/~doucet/", "dblp": ";368/6811;352/6313.html;224/9338;;68/1628", "google_scholar": ";hVV6rdcAAAAJ;ot1m2GUAAAAJ;;https://scholar.google.co.uk/citations?user=EF1FwN4AAAAJ;W4SZGV8AAAAJ", "orcid": ";0000-0002-0617-7566;;;;0000-0002-7662-419X", "linkedin": ";;;;;", "or_profile": "~Angus_Phillips1;~Hai-Dang_Dau1;~Michael_John_Hutchinson1;~Valentin_De_Bortoli1;~George_Deligiannidis2;~Arnaud_Doucet2", "aff": "University of Oxford;Oxford, University of Oxford;University of Oxford;University of Oxford;Oxford, University of Oxford;University of Oxford", "aff_domain": "ox.ac.uk;stats.ox.ac.uk;ox.ac.uk;ox.ac.uk;stats.ox.ac.uk;ox.ac.uk", "position": "PhD student;Postdoc;PhD student;Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nphillips2024particle,\ntitle={Particle Denoising Diffusion Sampler},\nauthor={Angus Phillips and Hai-Dang Dau and Michael John Hutchinson and Valentin De Bortoli and George Deligiannidis and Arnaud Doucet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vMUnnS4OWC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5420064, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5909895129331672317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ox.ac.uk;stats.ox.ac.uk;ox.ac.uk;ox.ac.uk;stats.ox.ac.uk;ox.ac.uk", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Achieving Lossless Gradient Sparsification via Mapping to Alternative Space in Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32781", "id": "vQmVmMN5ft", "proceeding": "https://proceedings.mlr.press/v235/kim24e.html", "pdf": "https://openreview.net/pdf?id=vQmVmMN5ft", "openreview": "https://openreview.net/forum?id=vQmVmMN5ft", "author_site": "Do-Yeon Kim, Dong-Jun Han, Jun Seo, Jaekyun Moon", "tldr": "", "abstract": "Handling the substantial communication burden in federated learning (FL) remains a significant challenge. 
Although recent studies have attempted to compress the local gradients to address this issue, they typically perform compression only within the original parameter space, which may limit the fundamental compression rate of the gradient. In this paper, instead of restricting our scope to a fixed traditional space, we consider an alternative space that provides an improved compressibility of the gradient. To this end, we utilize the structures of input activation and output gradient in designing our mapping function to a new space, which enables *lossless gradient sparsification*, i.e., mapping the gradient to our new space induces a greater number of *near-zero* elements without any loss of information. In light of this attribute, employing sparsification-based compressors in our new space allows for more aggressive compression than the baselines, with minimal information loss. More surprisingly, our model even reaches higher accuracies than the full gradient uploading strategy in some cases, an extra benefit of utilizing the new space. We also theoretically confirm that our approach does not alter the existing best-known convergence rate of FL thanks to the orthogonal transformation properties of our mapping.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Do-Yeon Kim;Dong-Jun Han;Jun Seo;Jaekyun Moon", "authorids": "~Do-Yeon_Kim1;~Dong-Jun_Han1;~Jun_Seo1;~Jaekyun_Moon2", "gender": ";M;M;M", "homepage": ";https://sites.google.com/view/djhan930/home?authuser=0;;http://comstolab.kaist.ac.kr/people.html", "dblp": ";201/0078;222/1700;78/2744", "google_scholar": ";https://scholar.google.co.kr/citations?user=-YR-GxUAAAAJ;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Do-Yeon_Kim1;~Dong-Jun_Han1;~Jun_Seo1;~Jaekyun_Moon2", "aff": ";Purdue University;LG AI Research;KAIST", "aff_domain": ";purdue.edu;lgresearch.ai;kaist.edu", "position": ";Postdoc;Researcher;Full Professor", "bibtex": "@inproceedings{\nkim2024achieving,\ntitle={Achieving Lossless Gradient Sparsification via Mapping to Alternative Space in Federated Learning},\nauthor={Do-Yeon Kim and Dong-Jun Han and Jun Seo and Jaekyun Moon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vQmVmMN5ft}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4999729, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3086420942340009709&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": ";purdue.edu;lgresearch.ai;kaist.edu", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Purdue University;LG;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";LG AI Research;", "aff_unique_url": "https://www.purdue.edu;https://www.lgresearch.ai;https://www.kaist.ac.kr", "aff_unique_abbr": "Purdue;LG AI;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;South Korea" }, { "title": "One for All: A Universal Generator for Concept Unlearnability via Multi-Modal Alignment", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32780", "id": "vSerUPYFtB", "proceeding": "https://proceedings.mlr.press/v235/chen24bc.html", "pdf": "https://openreview.net/pdf?id=vSerUPYFtB", "openreview": "https://openreview.net/forum?id=vSerUPYFtB", "author_site": "Chaochao Chen, Jiaming Zhang, Yuyuan Li, 
Zhongxuan Han", "tldr": "", "abstract": "The abundance of free internet data offers unprecedented opportunities for researchers and developers, but it also poses privacy risks. Utilizing data without explicit consent raises critical challenges in protecting personal information.Unlearnable examples have emerged as a feasible protection approach, which renders the data unlearnable, i.e., useless to third parties, by injecting imperceptible perturbations. However, these perturbations only exhibit unlearnable effects on either a particular dataset or label-consistent scenarios, thereby lacking broad applicability. To address both issues concurrently, we propose a universal perturbation generator that harnesses data with concept unlearnability, thereby broadening the scope of unlearnability beyond specific datasets or labels. Specifically, we leverage multi-modal pre-trained models to establish a connection between the data concepts in a shared embedding space. This connection enables the information transformation from image data to text concepts. Consequently, we can align the text embedding using concept-wise discriminant loss, and render the data unlearnable. Extensive experiments conducted on real-world datasets demonstrate the concept unlearnability, i.e., cross-dataset transferability and label-agnostic utility, of our proposed unlearnable examples, as well as their robustness against attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chaochao Chen;Jiaming Zhang;Yuyuan Li;Zhongxuan Han", "authorids": "~Chaochao_Chen3;~Jiaming_Zhang7;~Yuyuan_Li1;~Zhongxuan_Han1", "gender": ";M;;M", "homepage": "https://sites.google.com/site/ccchomepage/;https://xiye7lai.github.io;;", "dblp": "26/1492-1;;35/11288;331/8494", "google_scholar": "qZTMyzwAAAAJ;aFC0W18AAAAJ;v4e49qEAAAAJ;beXTAoMAAAAJ", "orcid": "0000-0003-1419-964X;0009-0001-7855-3372;0000-0003-4896-2885;0000-0001-9957-7325", "linkedin": "ccchomepage/;;;", "or_profile": "~Chaochao_Chen3;~Jiaming_Zhang7;~Yuyuan_Li1;~Zhongxuan_Han1", "aff": "Zhejiang University;Zhejiang University;Hangzhou Dianzi University;Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;hdu.edu.cn;zju.edu.cn", "position": "Distinguished Research Fellow;MS student;Associate Professor;PhD student", "bibtex": "@inproceedings{\nchen2024one,\ntitle={One for All: A Universal Generator for Concept Unlearnability via Multi-Modal Alignment},\nauthor={Chaochao Chen and Jiaming Zhang and Yuyuan Li and Zhongxuan Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vSerUPYFtB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 693663, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8046241322679761748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "zju.edu.cn;zju.edu.cn;hdu.edu.cn;zju.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Zhejiang University;Hangzhou Dianzi University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;http://www.hdu.edu.cn/", "aff_unique_abbr": "ZJU;HGHDU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "On Computational Limits of Modern Hopfield Models: A Fine-Grained Complexity Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32779", 
"id": "vXUqOCsbj8", "proceeding": "https://proceedings.mlr.press/v235/hu24j.html", "pdf": "https://openreview.net/pdf?id=vXUqOCsbj8", "openreview": "https://openreview.net/forum?id=vXUqOCsbj8", "author_site": "Jerry Yao-Chieh Hu, Thomas Lin, Zhao Song, Han Liu", "tldr": "", "abstract": "We investigate the computational limits of the memory retrieval dynamics of modern Hopfield models from the fine-grained complexity analysis. Our key contribution is the characterization of a phase transition behavior in the efficiency of all possible modern Hopfield models based on the norm of patterns. Specifically, we establish an upper bound criterion for the norm of input query patterns and memory patterns. Only below this criterion, sub-quadratic (efficient) variants of the modern Hopfield model exist, assuming the Strong Exponential Time Hypothesis (SETH). To showcase our theory, we provide a formal example of efficient constructions of modern Hopfield models using low-rank approximation when the efficient criterion holds. This includes a derivation of a lower bound on the computational time, scaling linearly with $\\max$\\{ \\# of stored memory patterns, length of input query sequence\\}. In addition, we prove its memory retrieval error bound and exponential memory capacity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jerry Yao-Chieh Hu;Thomas Lin;Zhao Song;Han Liu", "authorids": "~Jerry_Yao-Chieh_Hu1;~Thomas_Lin2;~Zhao_Song3;~Han_Liu4", "gender": ";;M;", "homepage": ";;https://www.youtube.com/@zhaosong2031;", "dblp": ";;76/4051-2;", "google_scholar": ";;yDZct7UAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jerry_Yao-Chieh_Hu1;~Thomas_Lin2;~Zhao_Song3;~Han_Liu4", "aff": ";;Adobe;Northwestern University", "aff_domain": ";;adobe.com;u.northwestern.edu", "position": ";;Researcher;Associate Professor", "bibtex": "@inproceedings{\nhu2024on,\ntitle={On Computational Limits of Modern Hopfield Models: A Fine-Grained Complexity Analysis},\nauthor={Jerry Yao-Chieh Hu and Thomas Lin and Zhao Song and Han Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vXUqOCsbj8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 434564, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1683894421428801220&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": ";;adobe.com;u.northwestern.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Adobe;Northwestern University", "aff_unique_dep": "Adobe Inc.;", "aff_unique_url": "https://www.adobe.com;https://www.northwestern.edu", "aff_unique_abbr": "Adobe;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Partial Optimality in the Linear Ordering Problem", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32778", "id": "vYYIuJDTHq", "proceeding": "https://proceedings.mlr.press/v235/stein24a.html", "pdf": "https://openreview.net/pdf?id=vYYIuJDTHq", "openreview": "https://openreview.net/forum?id=vYYIuJDTHq", "author_site": "David Stein, Bjoern Andres", "tldr": "", "abstract": "The linear ordering problem consists in finding a linear order $<$ on a finite set $A$ so as to minimize the sum of costs associated with pairs of elements $a, b$ for which $a < b$. The problem is NP-hard and APX-hard. 
We introduce algorithms for solving the problem *partially* by deciding efficiently for some pairs $(a,b)$ whether $a < b$ is in an optimal solution. To do so, we construct maps from the feasible set of orders to itself and establish efficiently testable conditions on the cost function of the problem for which these maps are improving. We examine the effectiveness and efficiency of these conditions and algorithms empirically, on two data sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Stein;Bjoern Andres", "authorids": "~David_Stein2;~Bjoern_Andres6", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nstein2024partial,\ntitle={Partial Optimality in the Linear Ordering Problem},\nauthor={David Stein and Bjoern Andres},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vYYIuJDTHq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 935582, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VaHmhDAmbO0J:scholar.google.com/&scioq=Partial+Optimality+in+the+Linear+Ordering+Problem&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": ";", "author_num": 2 }, { "title": "Comparing Graph Transformers via Positional Encodings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32777", "id": "va3r3hSA6n", "proceeding": "https://proceedings.mlr.press/v235/black24b.html", "pdf": "https://openreview.net/pdf?id=va3r3hSA6n", "openreview": "https://openreview.net/forum?id=va3r3hSA6n", "author_site": "Mitchell Black, Zhengchao Wan, Gal Mishne, Amir Nayyeri, Yusu Wang", "tldr": "", "abstract": "The distinguishing power of graph transformers is tied to the choice of *positional encoding*: features used to augment the base transformer with information about the graph. There are two primary types of positional encoding: *absolute positional encodings (APEs)* and *relative positional encodings (RPEs)*. APEs assign features to each node and are given as input to the transformer. RPEs instead assign a feature to each *pair of nodes*, e.g., shortest-path distance, and are used to augment the attention block. A priori, it is unclear which method is better for maximizing the power of the resulting graph transformer. In this paper, we aim to understand the relationship between these different types of positional encodings. Interestingly, we show that graph transformers using APEs and RPEs are equivalent in their ability to distinguish non-isomorphic graphs. In particular, we demonstrate how to interchange APEs and RPEs while maintaining their distinguishing power in terms of graph transformers. However, in the case of graphs with node features, we show that RPEs may have an advantage over APEs. Based on our theoretical results, we provide a study of different APEs and RPEs---including the shortest-path and resistance distance and the recently introduced stable and expressive positional encoding (SPE)---and compare their distinguishing power in terms of transformers. 
We believe our work will help navigate the vast number of positional encoding choices and provide guidance on the future design of positional encodings for graph transformers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mitchell Black;Zhengchao Wan;Gal Mishne;Amir Nayyeri;Yusu Wang", "authorids": "~Mitchell_Black1;~Zhengchao_Wan1;~Gal_Mishne1;~Amir_Nayyeri1;~Yusu_Wang1", "gender": "M;;F;;", "homepage": "https://mitchell.black;https://zhengchaow.github.io;http://mishne.ucsd.edu/;;", "dblp": "262/3347-2;228/7893;125/3214;;", "google_scholar": "https://scholar.google.com/citations?hl=en;kmTKYtoAAAAJ;KrwpdXYAAAAJ;;", "orcid": ";0000-0003-4388-6991;0000-0002-5287-3626;;", "linkedin": ";;;;", "or_profile": "~Mitchell_Black1;~Zhengchao_Wan1;~Gal_Mishne1;~Amir_Nayyeri1;~Yusu_Wang1", "aff": "Oregon State University;University of California, San Diego;University of California, San Diego;;", "aff_domain": "oregonstate.edu;ucsd.edu;ucsd.edu;;", "position": "PhD student;Postdoc;Assistant Professor;;", "bibtex": "@inproceedings{\nblack2024comparing,\ntitle={Comparing Graph Transformers via Positional Encodings},\nauthor={Mitchell Black and Zhengchao Wan and Gal Mishne and Amir Nayyeri and Yusu Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=va3r3hSA6n}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 656236, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11225577130507929785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "oregonstate.edu;ucsd.edu;ucsd.edu;;", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Oregon State University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://oregonstate.edu;https://www.ucsd.edu", "aff_unique_abbr": "OSU;UCSD", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Local vs. Global Interpretability: A Computational Complexity Perspective", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32776", "id": "veEjiN2w9F", "proceeding": "https://proceedings.mlr.press/v235/bassan24a.html", "pdf": "https://openreview.net/pdf?id=veEjiN2w9F", "openreview": "https://openreview.net/forum?id=veEjiN2w9F", "author_site": "Shahaf Bassan, Guy Amir, Guy Katz", "tldr": "", "abstract": "The local and global interpretability of various ML models has been studied extensively in recent years. However, despite significant progress in the field, many known results remain informal or lack sufficient mathematical rigor. We propose a framework for bridging this gap, by using computational complexity theory to assess local and global perspectives of interpreting ML models. We begin by proposing proofs for two novel insights that are essential for our analysis: (1) a duality between local and global forms of explanations; and (2) the inherent uniqueness of certain global explanation forms. We then use these insights to evaluate the complexity of computing explanations, across three model types representing the extremes of the interpretability spectrum: (1) linear models; (2) decision trees; and (3) neural networks. Our findings offer insights into both the local and global interpretability of these models. 
For instance, under standard complexity assumptions such as P != NP, we prove that selecting *global* sufficient subsets in linear models is computationally harder than selecting *local* subsets. Interestingly, with neural networks and decision trees, the opposite is true: it is harder to carry out this task locally than globally. We believe that our findings demonstrate how examining explainability through a computational complexity lens can help us develop a more rigorous grasp of the inherent interpretability of ML models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shahaf Bassan;Guy Amir;Guy Katz", "authorids": "~Shahaf_Bassan1;~Guy_Amir1;~Guy_Katz1", "gender": ";M;M", "homepage": ";https://guyam2.github.io/;http://www.katz-lab.com", "dblp": ";277/9596;23/10321", "google_scholar": ";CSJEObYAAAAJ;https://scholar.google.com.tw/citations?user=3nYG5BMAAAAJ", "orcid": ";;", "linkedin": ";https://linkedin.com/in/guy-amir-a335a3ba;", "or_profile": "~Shahaf_Bassan1;~Guy_Amir1;~Guy_Katz1", "aff": ";Cornell University;Hebrew University of Jerusalem", "aff_domain": ";cornell.edu;huji.ac.il", "position": ";Postdoc;Associate Professor", "bibtex": "@inproceedings{\nbassan2024local,\ntitle={Local vs. Global Interpretability: A Computational Complexity Perspective},\nauthor={Shahaf Bassan and Guy Amir and Guy Katz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=veEjiN2w9F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 787646, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11317724896642425989&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": ";cornell.edu;huji.ac.il", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Cornell University;Hebrew University of Jerusalem", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.huji.ac.il", "aff_unique_abbr": "Cornell;HUJI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Jerusalem", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Israel" }, { "title": "DiffDA: a Diffusion model for weather-scale Data Assimilation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32775", "id": "vhMq3eAB34", "proceeding": "https://proceedings.mlr.press/v235/huang24h.html", "pdf": "https://openreview.net/pdf?id=vhMq3eAB34", "openreview": "https://openreview.net/forum?id=vhMq3eAB34", "author_site": "Langwen Huang, Lukas Gianinazzi, Yuejiang Yu, Peter Dueben, Torsten Hoefler", "tldr": "", "abstract": "The generation of initial conditions via accurate data assimilation is crucial for weather forecasting and climate modeling. We propose DiffDA as a denoising diffusion model capable of assimilating atmospheric variables using predicted states and sparse observations. Acknowledging the similarity between a weather forecast model and a denoising diffusion model dedicated to weather applications, we adapt the pretrained GraphCast neural network as the backbone of the diffusion model. Through experiments based on simulated observations from the ERA5 reanalysis dataset, our method can produce assimilated global atmospheric data consistent with observations at 0.25$^\\circ$ ($\\approx$30km) resolution globally. This marks the highest resolution achieved by ML data assimilation models. 
The experiments also show that the initial conditions assimilated from sparse observations (less than 0.96% of gridded data) and a 48-hour forecast can be used for forecast models with a loss of lead time of at most 24 hours compared to initial conditions from state-of-the-art data assimilation in ERA5. This enables the use of the method in real-world applications, such as creating reanalysis datasets with autoregressive data assimilation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Langwen Huang;Lukas Gianinazzi;Yuejiang Yu;Peter Dominik Dueben;Torsten Hoefler", "authorids": "~Langwen_Huang1;~Lukas_Gianinazzi1;yuejyu@student.ethz.ch;~Peter_Dominik_Dueben1;~Torsten_Hoefler1", "gender": "M;M;;M;", "homepage": ";https://people.inf.ethz.ch/glukas/;;https://www.ecmwf.int/en/about/who-we-are/staff-profiles/peter-dueben;", "dblp": ";199/6118.html;;;16/3869", "google_scholar": ";FphhrCoAAAAJ;;;", "orcid": "0000-0002-9204-0346;;;;", "linkedin": ";;;;", "or_profile": "~Langwen_Huang1;~Lukas_Gianinazzi1;yuejyu@student.ethz.ch;~Peter_Dominik_Dueben1;~Torsten_Hoefler1", "aff": "ETHZ - ETH Zurich;Swiss Federal Institute of Technology;;;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;;;ethz.ch", "position": "PhD student;PhD student;;;Professor", "bibtex": "@inproceedings{\nhuang2024diffda,\ntitle={Diff{DA}: a Diffusion model for weather-scale Data Assimilation},\nauthor={Langwen Huang and Lukas Gianinazzi and Yuejiang Yu and Peter Dominik Dueben and Torsten Hoefler},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vhMq3eAB34}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9108445, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11151717122720283309&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "ethz.ch;ethz.ch;;;ethz.ch", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Graph Automorphism Group Equivariant Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32774", "id": "vjkq5fwsj3", "proceeding": "https://proceedings.mlr.press/v235/pearce-crump24a.html", "pdf": "https://openreview.net/pdf?id=vjkq5fwsj3", "openreview": "https://openreview.net/forum?id=vjkq5fwsj3", "author_site": "Edward Pearce-Crump, William J. Knottenbelt", "tldr": "", "abstract": "Permutation equivariant neural networks are typically used to learn from data that lives on a graph. However, for any graph $G$ that has $n$ vertices, using the symmetric group $S_n$ as its group of symmetries does not take into account the relations that exist between the vertices. Given that the actual group of symmetries is the automorphism group Aut$(G)$, we show how to construct neural networks that are equivariant to Aut$(G)$ by obtaining a full characterisation of the learnable, linear, Aut$(G)$-equivariant functions between layers that are some tensor power of $\\mathbb{R}^{n}$. In particular, we find a spanning set of matrices for these layer functions in the standard basis of $\\mathbb{R}^{n}$. 
This result has important consequences for learning from data whose group of symmetries is a finite group because a theorem by Frucht (1938) showed that any finite group is isomorphic to the automorphism group of a graph.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Edward Pearce-Crump;William Knottenbelt", "authorids": "~Edward_Pearce-Crump1;~William_Knottenbelt1", "gender": "M;Not Specified", "homepage": "https://www.epearcecrump.co.uk;http://www.imperial.ac.uk/people/w.knottenbelt/", "dblp": "318/8954;37/1901", "google_scholar": "75o1Jy4AAAAJ;https://scholar.google.com.tw/citations?user=Vz8XWRoAAAAJ", "orcid": "0000-0001-8275-177X;", "linkedin": "epearcecrump;", "or_profile": "~Edward_Pearce-Crump1;~William_Knottenbelt1", "aff": "Imperial College London;Imperial College London, Imperial College London", "aff_domain": "imperial.ac.uk;imperial.ac.uk", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\npearce-crump2024graph,\ntitle={Graph Automorphism Group Equivariant Neural Networks},\nauthor={Edward Pearce-Crump and William Knottenbelt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vjkq5fwsj3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 606043, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11453736746117275466&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "imperial.ac.uk;imperial.ac.uk", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Neural Operators with Localized Integral and Differential Kernels", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32773", "id": "vl9GB3fbht", "proceeding": "https://proceedings.mlr.press/v235/liu-schiaffini24a.html", "pdf": "https://openreview.net/pdf?id=vl9GB3fbht", "openreview": "https://openreview.net/forum?id=vl9GB3fbht", "author_site": "Miguel Liu-Schiaffini, Julius Berner, Boris Bonev, Thorsten Kurth, Kamyar Azizzadenesheli, Anima Anandkumar", "tldr": "", "abstract": "Neural operators learn mappings between function spaces, which is practical for learning solution operators of PDEs and other scientific modeling applications. Among them, the Fourier neural operator (FNO) is a popular architecture that performs global convolutions in the Fourier space. However, such global operations are often prone to over-smoothing and may fail to capture local details. In contrast, convolutional neural networks (CNN) can capture local features but are limited to training and inference at a single resolution. In this work, we present a principled approach to operator learning that can capture local features under two frameworks by learning differential operators and integral operators with locally supported kernels. Specifically, inspired by stencil methods, we prove that we obtain differential operators under an appropriate scaling of the kernel values of CNNs. To obtain local integral operators, we utilize suitable basis representations for the kernels based on discrete-continuous convolutions. 
Both these approaches preserve the properties of operator learning and, hence, the ability to predict at any resolution. Adding our layers to FNOs significantly improves their performance, reducing the relative L2-error by 34-72% in our experiments, which include turbulent 2D Navier-Stokes and the spherical shallow water equations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Miguel Liu-Schiaffini;Julius Berner;Boris Bonev;Thorsten Kurth;Kamyar Azizzadenesheli;Anima Anandkumar", "authorids": "~Miguel_Liu-Schiaffini1;~Julius_Berner1;~Boris_Bonev1;tkurth@nvidia.com;~Kamyar_Azizzadenesheli1;~Anima_Anandkumar1", "gender": ";M;M;;M;", "homepage": "https://mliuschi.github.io/;https://jberner.info/;https://bonevbs.github.io;;https://kamyar.page/;", "dblp": "332/5619;227/2217;199/1689;;176/5584;", "google_scholar": "LebtA84AAAAJ;73-D2jgAAAAJ;sYo-KS4AAAAJ;;CxAS4SQAAAAJ;", "orcid": "0000-0001-9685-8383;0000-0002-5648-648X;0000-0002-4041-238X;;;", "linkedin": ";julius-berner/;;;;", "or_profile": "~Miguel_Liu-Schiaffini1;~Julius_Berner1;~Boris_Bonev1;tkurth@nvidia.com;~Kamyar_Azizzadenesheli1;~Anima_Anandkumar1", "aff": "California Institute of Technology;California Institute of Technology;NVIDIA;;NVIDIA;", "aff_domain": "caltech.edu;caltech.edu;nvidia.com;;nvidia.com;", "position": "Undergrad student;Postdoc;Researcher;;Researcher;", "bibtex": "@inproceedings{\nliu-schiaffini2024neural,\ntitle={Neural Operators with Localized Integral and Differential Kernels},\nauthor={Miguel Liu-Schiaffini and Julius Berner and Boris Bonev and Thorsten Kurth and Kamyar Azizzadenesheli and Anima Anandkumar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vl9GB3fbht}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3263428, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7161784565816506661&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "caltech.edu;caltech.edu;nvidia.com;;nvidia.com;", "author_num": 6, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "California Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.caltech.edu;https://www.nvidia.com", "aff_unique_abbr": "Caltech;NVIDIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Two Heads are Actually Better than One: Towards Better Adversarial Robustness via Transduction and Rejection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32772", "id": "vn92qYjL1F", "proceeding": "https://proceedings.mlr.press/v235/palumbo24a.html", "pdf": "https://openreview.net/pdf?id=vn92qYjL1F", "openreview": "https://openreview.net/forum?id=vn92qYjL1F", "author_site": "Nils Palumbo, Yang Guo, Xi Wu, Jiefeng Chen, Yingyu Liang, Somesh Jha", "tldr": "", "abstract": "Both transduction and rejection have emerged as important techniques for defending against adversarial perturbations. A recent work by Goldwasser et al. showed that rejection combined with transduction can give *provable* guarantees (for certain problems) that cannot be achieved otherwise. Nevertheless, under recent strong adversarial attacks (GMSA), Goldwasser et al.'s work was shown to have low performance in a practical deep-learning setting. 
In this paper, we take a step towards realizing the promise of transduction+rejection in more realistic scenarios. Our key observation is that a novel application of a reduction technique by Tram\u00e8r, which was until now only used to demonstrate the vulnerability of certain defenses, can be used to actually construct effective defenses. Theoretically, we show that a careful application of this technique in the transductive setting can give significantly improved sample complexity for robust generalization. Our theory guides us to design a new transductive algorithm for learning a selective model; extensive experiments using state-of-the-art attacks (AutoAttack, GMSA) show that our approach provides significantly better robust accuracy (81.6% on CIFAR-10 and 57.9% on CIFAR-100 under $l_\\infty$ with budget 8/255) than existing techniques. The implementation is available at https://github.com/nilspalumbo/transduction-rejection.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nils Palumbo;Yang Guo;Xi Wu;Jiefeng Chen;Yingyu Liang;Somesh Jha", "authorids": "~Nils_Palumbo1;~Yang_Guo4;~Xi_Wu1;~Jiefeng_Chen2;~Yingyu_Liang1;~Somesh_Jha1", "gender": ";M;M;M;;M", "homepage": ";;http://andrewxiwu.github.io/;https://jfc43.github.io/;;", "dblp": "258/3557;;37/4465-1;199/3381;;j/SomeshJha", "google_scholar": "vXBD3Q8AAAAJ;BbQQEPcAAAAJ;OmmxazMAAAAJ;5mOfQfAAAAAJ;;BaI7l8QAAAAJ", "orcid": ";;;;;", "linkedin": ";;;jiefeng-chen-aa1769122/;;", "or_profile": "~Nils_Palumbo1;~Yang_Guo4;~Xi_Wu1;~Jiefeng_Chen2;~Yingyu_Liang1;~Somesh_Jha1", "aff": "University of Wisconsin - Madison;;Google;Amazon;;Department of Computer Science, University of Wisconsin, Madison", "aff_domain": "wisc.edu;;google.com;amazon.com;;cs.wisc.edu", "position": "PhD student;;Software Engineer;Applied Scientist;;Full Professor", "bibtex": "@inproceedings{\npalumbo2024two,\ntitle={Two Heads are Actually Better than One: Towards Better Adversarial Robustness via Transduction and Rejection},\nauthor={Nils Palumbo and Yang Guo and Xi Wu and Jiefeng Chen and Yingyu Liang and Somesh Jha},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vn92qYjL1F}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 646099, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Am2gUwgHl0AJ:scholar.google.com/&scioq=Two+Heads+are+Actually+Better+than+One:+Towards+Better+Adversarial+Robustness+via+Transduction+and+Rejection&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "wisc.edu;;google.com;amazon.com;;cs.wisc.edu", "author_num": 6, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Wisconsin-Madison;Google;Amazon", "aff_unique_dep": ";Google;Amazon.com, Inc.", "aff_unique_url": "https://www.wisc.edu;https://www.google.com;https://www.amazon.com", "aff_unique_abbr": "UW-Madison;Google;Amazon", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Madison;Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Kernel Debiased Plug-in Estimation: Simultaneous, Automated Debiasing without Influence Functions for Many Target Parameters", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32771", "id": "vq7ITv8a49", "proceeding": "https://proceedings.mlr.press/v235/cho24c.html", "pdf": "https://openreview.net/pdf?id=vq7ITv8a49", "openreview": 
"https://openreview.net/forum?id=vq7ITv8a49", "author_site": "Brian Cho, Yaroslav Mukhin, Kyra Gan, Ivana Malenica", "tldr": "", "abstract": "When estimating target parameters in nonparametric models with nuisance parameters, substituting the unknown nuisances with nonparametric estimators can introduce \"plug-in bias.\" Traditional methods addressing this suboptimal bias-variance trade-off rely on the influence function (IF) of the target parameter. When estimating multiple target parameters, these methods require debiasing the nuisance parameter multiple times using the corresponding IFs, which poses analytical and computational challenges. In this work, we leverage the targeted maximum likelihood estimation (TMLE) framework to propose a novel method named kernel debiased plug-in estimation (KDPE). KDPE refines an initial estimate through regularized likelihood maximization steps, employing a nonparametric model based on reproducing kernel Hilbert spaces. We show that KDPE: (i) simultaneously debiases all pathwise differentiable target parameters that satisfy our regularity conditions, (ii) does not require the IF for implementation, and (iii) remains computationally tractable. We numerically illustrate the use of KDPE and validate our theoretical results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Brian M Cho;Yaroslav Mukhin;Kyra Gan;Ivana Malenica", "authorids": "~Brian_M_Cho1;~Yaroslav_Mukhin1;~Kyra_Gan1;~Ivana_Malenica1", "gender": "M;;;F", "homepage": "https://bcho.page/;;;", "dblp": "32/3261-1.html;;;242/3862", "google_scholar": "https://scholar.google.co.jp/citations?user=9k0bfB0AAAAJ;;;", "orcid": "0000-0003-3558-0415;;;", "linkedin": "brian-cho-5a7876172/;;;", "or_profile": "~Brian_M_Cho1;~Yaroslav_Mukhin1;~Kyra_Gan1;~Ivana_Malenica1", "aff": "Cornell University;;;Harvard University", "aff_domain": "cornell.edu;;;harvard.edu", "position": "PhD student;;;Postdoc", "bibtex": "@inproceedings{\ncho2024kernel,\ntitle={Kernel Debiased Plug-in Estimation: Simultaneous, Automated Debiasing without Influence Functions for Many Target Parameters},\nauthor={Brian M Cho and Yaroslav Mukhin and Kyra Gan and Ivana Malenica},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vq7ITv8a49}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3185383, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16609824248463258689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cornell.edu;;;harvard.edu", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Cornell University;Harvard University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.harvard.edu", "aff_unique_abbr": "Cornell;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "What is the Long-Run Distribution of Stochastic Gradient Descent? 
A Large Deviations Analysis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32770", "id": "vsOF7qDNhl", "proceeding": "https://proceedings.mlr.press/v235/azizian24a.html", "pdf": "https://openreview.net/pdf?id=vsOF7qDNhl", "openreview": "https://openreview.net/forum?id=vsOF7qDNhl", "author_site": "Wa\u00efss Azizian, Franck Iutzeler, J\u00e9r\u00f4me Malick, Panayotis Mertikopoulos", "tldr": "", "abstract": "In this paper, we examine the long-run distribution of stochastic gradient descent (SGD) in general, non-convex problems. Specifically, we seek to understand which regions of the problem's state space are more likely to be visited by SGD, and by how much. Using an approach based on the theory of large deviations and randomly perturbed dynamical systems, we show that the long-run distribution of SGD resembles the Boltzmann-Gibbs distribution of equilibrium thermodynamics with temperature equal to the method's step-size and energy levels determined by the problem's objective and the statistics of the noise. In particular, we show that, in the long run, (*a*) the problem's critical region is visited exponentially more often than any non-critical region; (*b*) the iterates of SGD are exponentially concentrated around the problem's minimum energy state (which does not always coincide with the global minimum of the objective); (*c*) all other connected components of critical points are visited with frequency that is exponentially proportional to their energy level; and, finally, (*d*) any component of local maximizers or saddle points is \"dominated\" by a component of local minimizers which is visited exponentially more often.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wa\u00efss Azizian;Franck Iutzeler;Jerome Malick;Panayotis Mertikopoulos", "authorids": "~Wa\u00efss_Azizian1;~Franck_Iutzeler1;~Jerome_Malick1;~Panayotis_Mertikopoulos1", "gender": ";M;M;M", "homepage": "https://wazizian.fr;http://www.iutzeler.org;https://membres-ljk.imag.fr/Jerome.Malick/;http://polaris.imag.fr/panayotis.mertikopoulos/", "dblp": "243/3135;119/4896;61/1089;49/6721", "google_scholar": "https://scholar.google.fr/citations?user=oXxTTe8AAAAJ;https://scholar.google.fr/citations?user=De82J1EAAAAJ;https://scholar.google.fr/citations?user=ETJGb1gAAAAJ;xsusqPYAAAAJ", "orcid": ";0000-0003-2537-380X;;0000-0003-2026-9616", "linkedin": ";;;", "or_profile": "~Wa\u00efss_Azizian1;~Franck_Iutzeler1;~Jerome_Malick1;~Panayotis_Mertikopoulos1", "aff": "Universit\u00e9 Grenobel Alpes;Universit\u00e9 de Toulouse;CNRS;French National Center for Scientific Research", "aff_domain": "univ-grenoble-alpes.fr;univ-toulouse.fr;cnrs.fr;imag.fr", "position": "PhD student;Full Professor;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nazizian2024what,\ntitle={What is the Long-Run Distribution of Stochastic Gradient Descent? 
A Large Deviations Analysis},\nauthor={Wa{\\\"\\i}ss Azizian and Franck Iutzeler and Jerome Malick and Panayotis Mertikopoulos},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vsOF7qDNhl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3750434, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3373035422121973022&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 23, "email": "univ-grenoble-alpes.fr;univ-toulouse.fr;cnrs.fr;imag.fr", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Universit\u00e9 Grenoble Alpes;Universit\u00e9 de Toulouse;Centre National de la Recherche Scientifique;French National Center for Scientific Research", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.univ-grenoble-alpes.fr;https://www.univ-toulouse.fr;https://www.cnrs.fr;https://www.cnrs.fr", "aff_unique_abbr": "UGA;UT;CNRS;CNRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Efficient Contrastive Learning for Fast and Accurate Inference on Graphs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32769", "id": "vsy21Xodrt", "proceeding": "https://proceedings.mlr.press/v235/xiao24g.html", "pdf": "https://openreview.net/pdf?id=vsy21Xodrt", "openreview": "https://openreview.net/forum?id=vsy21Xodrt", "author_site": "Teng Xiao, Huaisheng Zhu, Zhiwei Zhang, Zhimeng Guo, Charu Aggarwal, Suhang Wang, Vasant Honavar", "tldr": "", "abstract": "Graph contrastive learning has made remarkable advances in settings where there is a scarcity of task-specific labels. Despite these advances, the significant computational overhead for representation inference incurred by existing methods that rely on intensive message passing makes them unsuitable for latency-constrained applications. In this paper, we present GraphECL, a simple and efficient contrastive learning method for fast inference on graphs. GraphECL does away with the need for expensive message passing during inference. Specifically, it introduces a novel coupling of the MLP and GNN models, where the former learns to efficiently mimic the computations performed by the latter. We provide a theoretical analysis showing why MLP can capture essential structural information in neighbors well enough to match the performance of GNN in downstream tasks. Extensive experiments on widely used real-world benchmarks show that GraphECL achieves superior performance and inference efficiency compared to state-of-the-art graph contrastive learning (GCL) methods on homophilous and heterophilous graphs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Teng Xiao;Huaisheng Zhu;Zhiwei Zhang;Zhimeng Guo;Charu C. 
Aggarwal;Suhang Wang;Vasant G Honavar", "authorids": "~Teng_Xiao2;~Huaisheng_Zhu1;~Zhiwei_Zhang10;~Zhimeng_Guo1;~Charu_C._Aggarwal2;~Suhang_Wang1;~Vasant_G_Honavar1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://zzwjames.github.io/zhiweizhang.github.io/;;http://www.charuaggarwal.net;https://faculty.ist.psu.edu/szw494/;http://faculty.ist.psu.edu/vhonavar;https://tengxiao1.github.io/", "dblp": "264/2622.html;68/1980-1.html;304/3478;a/CharuCAggarwal;136/9440;https://dblp.uni-trier.de/pid/h/VasantHonavar.html;", "google_scholar": ";bT8RwQMAAAAJ;Du6bnGQAAAAJ;x_wsduUAAAAJ;cdT_WMMAAAAJ;GPqMVRkAAAAJ;ld3OKXwAAAAJ", "orcid": ";0009-0007-6153-2739;;0000-0003-2579-7581;0000-0003-3448-4878;0000-0001-5399-3489;", "linkedin": ";;;;;vhonavar/;", "or_profile": "~Huaisheng_Zhu1;~Zhiwei_Zhang10;~Zhimeng_Guo1;~Charu_C._Aggarwal2;~Suhang_Wang1;~Vasant_G_Honavar1;~Teng_Xiao1", "aff": "Pennsylvania State University;Pennsylvania State University;Pennsylvania State University;International Business Machines;Pennsylvania State University;Pennsylvania State University;The Pennsylvania State University", "aff_domain": "psu.edu;psu.edu;psu.edu;ibm.com;psu.edu;ist.psu.edu;psu.edu", "position": "PhD student;PhD student;PhD student;Distinguished Research Staff Member;Assistant Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nxiao2024efficient,\ntitle={Efficient Contrastive Learning for Fast and Accurate Inference on Graphs},\nauthor={Teng Xiao and Huaisheng Zhu and Zhiwei Zhang and Zhimeng Guo and Charu C. Aggarwal and Suhang Wang and Vasant G Honavar},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vsy21Xodrt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1576471, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4345301192049511400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "psu.edu;psu.edu;psu.edu;ibm.com;psu.edu;ist.psu.edu;psu.edu", "author_num": 7, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "Pennsylvania State University;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://www.ibm.com", "aff_unique_abbr": "PSU;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Can We Remove the Square-Root in Adaptive Gradient Methods? A Second-Order Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32768", "id": "vuMD71R20q", "proceeding": "https://proceedings.mlr.press/v235/lin24e.html", "pdf": "https://openreview.net/pdf?id=vuMD71R20q", "openreview": "https://openreview.net/forum?id=vuMD71R20q", "author_site": "Wu Lin, Felix Dangel, Runa Eschenhagen, Juhan Bae, Richard E Turner, Alireza Makhzani", "tldr": "", "abstract": "Adaptive gradient optimizers like Adam(W) are the default training algorithms for many deep learning architectures, such as transformers. Their diagonal preconditioner is based on the gradient outer product which is incorporated into the parameter update via a square root. While these methods are often motivated as approximate second-order methods, the square root represents a fundamental difference. In this work, we investigate how the behavior of adaptive methods changes when we remove the root, i.e. strengthen their second-order motivation. 
Surprisingly, we find that such square-root-free adaptive methods close the generalization gap to SGD on convolutional architectures, while maintaining their root-based counterpart's performance on transformers. The second-order perspective also has practical benefits for the development of non-diagonal adaptive methods through the concept of preconditioner invariance. In contrast to root-based methods like Shampoo, the root-free counterparts do not require numerically unstable matrix root decompositions and inversions, thus work well in half precision. Our findings provide new insights into the development of adaptive methods and raise important questions regarding the currently overlooked role of adaptivity for their success.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wu Lin;Felix Dangel;Runa Eschenhagen;Juhan Bae;Richard E. Turner;Alireza Makhzani", "authorids": "~Wu_Lin2;~Felix_Dangel1;~Runa_Eschenhagen1;~Juhan_Bae2;~Richard_E_Turner1;~Alireza_Makhzani1", "gender": "M;;M;M;;M", "homepage": "https://f-dangel.com;https://runame.github.io;http://www.juhanbae.com/;https://rich-turner-group.github.io/;http://www.alireza.ai/;https://yorkerlin.github.io/", "dblp": "236/4218;242/9235;158/9492;40/5352;122/5126.html;70/10338", "google_scholar": "9hlJ9W0AAAAJ;Ribmq4oAAAAJ;https://scholar.google.ca/citations?user=9RFr4usAAAAJ;https://scholar.google.co.uk/citations?user=DgLEyZgAAAAJ;B0KVWJEAAAAJ;https://scholar.google.ca/citations?user=sGl6muoAAAAJ", "orcid": "0000-0002-1414-8554;;;;;", "linkedin": ";;;;;", "or_profile": "~Felix_Dangel1;~Runa_Eschenhagen1;~Juhan_Bae2;~Richard_E_Turner1;~Alireza_Makhzani1;~Wu_Lin1", "aff": "Vector Institute, Toronto;University of Cambridge;University of Toronto;Microsoft Research;Vector Institute;Vector Institute", "aff_domain": "vectorinstitute.ai;cam.ac.uk;cs.toronto.edu;research.microsoft.com;vectorinstitute.ai;vectorinstitute.ai", "position": "Postdoc;PhD student;PhD student;Researcher;Researcher;Postdoc", "bibtex": "@inproceedings{\nlin2024can,\ntitle={Can We Remove the Square-Root in Adaptive Gradient Methods? A Second-Order Perspective},\nauthor={Wu Lin and Felix Dangel and Runa Eschenhagen and Juhan Bae and Richard E. 
Turner and Alireza Makhzani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vuMD71R20q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 788812, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4921448415279133693&as_sdt=5,48&sciodt=0,48&hl=en", "gs_version_total": 8, "email": "vectorinstitute.ai;cam.ac.uk;cs.toronto.edu;research.microsoft.com;vectorinstitute.ai;vectorinstitute.ai", "author_num": 6, "aff_unique_index": "0;1;2;3;0;0", "aff_unique_norm": "Vector Institute;University of Cambridge;University of Toronto;Microsoft", "aff_unique_dep": ";;;Microsoft Research", "aff_unique_url": "https://vectorinstitute.ai;https://www.cam.ac.uk;https://www.utoronto.ca;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Vector Institute;Cambridge;U of T;MSR", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Toronto;Cambridge;", "aff_country_unique_index": "0;1;0;2;0;0", "aff_country_unique": "Canada;United Kingdom;United States" }, { "title": "Information Flow in Self-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32767", "id": "vxDjeeBnTu", "proceeding": "https://proceedings.mlr.press/v235/tan24e.html", "pdf": "https://openreview.net/pdf?id=vxDjeeBnTu", "openreview": "https://openreview.net/forum?id=vxDjeeBnTu", "author_site": "Zhiquan Tan, Jingqin Yang, Weiran Huang, Yang Yuan, Yifan Zhang", "tldr": "", "abstract": "In this paper, we conduct a comprehensive analysis of two dual-branch (Siamese architecture) self-supervised learning approaches, namely Barlow Twins and spectral contrastive learning, through the lens of matrix mutual information. We prove that the loss functions of these methods implicitly optimize both matrix mutual information and matrix joint entropy. This insight prompts us to further explore the category of single-branch algorithms, specifically MAE and U-MAE, for which mutual information and joint entropy become the entropy. Building on this intuition, we introduce the Matrix Variational Masked Auto-Encoder (M-MAE), a novel method that leverages the matrix-based estimation of entropy as a regularizer and subsumes U-MAE as a special case. 
The empirical evaluations underscore the effectiveness of M-MAE compared with the state-of-the-art methods, including a 3.9% improvement in linear probing ViT-Base, and a 1% improvement in fine-tuning ViT-Large, both on ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiquan Tan;Jingqin Yang;Weiran Huang;Yang Yuan;Yifan Zhang", "authorids": "~Zhiquan_Tan1;~Jingqin_Yang2;~Weiran_Huang1;~Yang_Yuan4;~Yifan_Zhang16", "gender": "M;M;M;M;", "homepage": ";https://github.com/yjqqqaq;https://www.weiranhuang.com;http://people.iiis.tsinghua.edu.cn/~yuanyang/index.html;", "dblp": "326/0177;267/1955;170/0073-1;;", "google_scholar": ";;AjJ2rf8AAAAJ;;", "orcid": ";;;;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAAC1A8_QBFX8OlchWmVI_pNXN4zm_t6vPKCs;;;;", "or_profile": "~Zhiquan_Tan1;~Jingqin_Yang2;~Weiran_Huang1;~Yang_Yuan4;~Yifan_Zhang16", "aff": "Tsinghua University;Tsinghua University;Shanghai AI Laboratory;Tsinghua University;", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;pjlab.org.cn;tsinghua.edu.cn;", "position": "PhD student;PhD student;Consultant;Assistant Professor;", "bibtex": "@inproceedings{\ntan2024information,\ntitle={Information Flow in Self-Supervised Learning},\nauthor={Zhiquan Tan and Jingqin Yang and Weiran Huang and Yang Yuan and Yifan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vxDjeeBnTu}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 629907, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12255284306292145309&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "tsinghua.edu.cn;tsinghua.edu.cn;pjlab.org.cn;tsinghua.edu.cn;", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tsinghua University;Shanghai AI Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "THU;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "On the Hardness of Probabilistic Neurosymbolic Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32766", "id": "vxPmrxKe0J", "proceeding": "https://proceedings.mlr.press/v235/maene24a.html", "pdf": "https://openreview.net/pdf?id=vxPmrxKe0J", "openreview": "https://openreview.net/forum?id=vxPmrxKe0J", "author_site": "Jaron Maene, Vincent Derkinderen, Luc De Raedt", "tldr": "", "abstract": "The limitations of purely neural learning have sparked an interest in probabilistic neurosymbolic models, which combine neural networks with probabilistic logical reasoning. As these neurosymbolic models are trained with gradient descent, we study the complexity of differentiating probabilistic reasoning. We prove that although approximating these gradients is intractable in general, it becomes tractable during training. Furthermore, we introduce *WeightME*, an unbiased gradient estimator based on model sampling. Under mild assumptions, WeightME approximates the gradient with probabilistic guarantees using a logarithmic number of calls to a SAT solver. Lastly, we evaluate the necessity of these guarantees on the gradient. 
Our experiments indicate that the existing biased approximations indeed struggle to optimize even when exact solving is still feasible.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jaron Maene;Vincent Derkinderen;Luc De Raedt", "authorids": "~Jaron_Maene1;~Vincent_Derkinderen1;~Luc_De_Raedt1", "gender": ";M;M", "homepage": ";;https://people.cs.kuleuven.be/~luc.deraedt/", "dblp": ";273/9798;r/LucDeRaedt", "google_scholar": ";0TC7EGQAAAAJ;https://scholar.google.com.tw/citations?user=dgobB6AAAAAJ", "orcid": ";0000-0002-8894-270X;0000-0002-6860-6303", "linkedin": ";;", "or_profile": "~Jaron_Maene1;~Vincent_Derkinderen1;~Luc_De_Raedt1", "aff": ";KU Leuven;KU Leuven, Belgium", "aff_domain": ";kuleuven.be;cs.kuleuven.be", "position": ";Postdoc;Full Professor", "bibtex": "@inproceedings{\nmaene2024on,\ntitle={On the Hardness of Probabilistic Neurosymbolic Learning},\nauthor={Jaron Maene and Vincent Derkinderen and Luc De Raedt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vxPmrxKe0J}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 479595, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18342784635099049146&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";kuleuven.be;cs.kuleuven.be", "author_num": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Katholieke Universiteit Leuven;KU Leuven", "aff_unique_dep": ";", "aff_unique_url": "https://www.kuleuven.be;https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Belgium" }, { "title": "FlashST: A Simple and Universal Prompt-Tuning Framework for Traffic Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32765", "id": "vye4OgLaTy", "proceeding": "https://proceedings.mlr.press/v235/li24bw.html", "pdf": "https://openreview.net/pdf?id=vye4OgLaTy", "openreview": "https://openreview.net/forum?id=vye4OgLaTy", "author_site": "Zhonghang Li, Lianghao Xia, Yong Xu, Chao Huang", "tldr": "", "abstract": "The objective of traffic prediction is to accurately forecast and analyze the dynamics of transportation patterns, considering both space and time. However, the presence of distribution shift poses a significant challenge in this field, as existing models struggle to generalize well when faced with test data that significantly differs from the training distribution. To tackle this issue, this paper introduces a simple and universal spatio-temporal prompt-tuning framework, FlashST, which adapts pre-trained models to the specific characteristics of diverse downstream datasets, improving generalization in diverse traffic prediction scenarios. Specifically, the FlashST framework employs a lightweight spatio-temporal prompt network for in-context learning, capturing spatio-temporal invariant knowledge and facilitating effective adaptation to diverse scenarios. Additionally, we incorporate a distribution mapping mechanism to align the data distributions of pre-training and downstream data, facilitating effective knowledge transfer in spatio-temporal forecasting. Empirical evaluations demonstrate the effectiveness of our FlashST across different spatio-temporal prediction tasks using diverse urban datasets.
Code is available at [https://github.com/HKUDS/FlashST](https://github.com/HKUDS/FlashST).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhonghang Li;Lianghao Xia;Yong Xu;Chao Huang", "authorids": "~Zhonghang_Li1;~Lianghao_Xia1;~Yong_Xu2;~Chao_Huang7", "gender": "M;M;M;M", "homepage": "https://github.com/LZH-YS1998;https://akaxlh.github.io/;;", "dblp": "258/0356;270/6586;07/4630-7;", "google_scholar": "__9uvQkAAAAJ;fDDjoUEAAAAJ;;Zkv9FqwAAAAJ", "orcid": "0000-0002-3977-1334;0000-0003-0725-2211;;", "linkedin": ";;;", "or_profile": "~Zhonghang_Li1;~Lianghao_Xia1;~Yong_Xu2;~Chao_Huang7", "aff": "South China University of Technology;University of Hong Kong;South China University of Technology;University of Hong Kong", "aff_domain": "scut.edu.cn;hku.hk;scut.edu.cn;hku.hk", "position": "PhD student;Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024flashst,\ntitle={Flash{ST}: A Simple and Universal Prompt-Tuning Framework for Traffic Prediction},\nauthor={Zhonghang Li and Lianghao Xia and Yong Xu and Chao Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vye4OgLaTy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 674627, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3873106160601843849&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 7, "email": "scut.edu.cn;hku.hk;scut.edu.cn;hku.hk", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "South China University of Technology;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.scut.edu.cn;https://www.hku.hk", "aff_unique_abbr": "SCUT;HKU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "PICLe: Eliciting Diverse Behaviors from Large Language Models with Persona In-Context Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32764", "id": "w1HdBXSJXn", "proceeding": "https://proceedings.mlr.press/v235/choi24e.html", "pdf": "https://openreview.net/pdf?id=w1HdBXSJXn", "openreview": "https://openreview.net/forum?id=w1HdBXSJXn", "author_site": "Hyeong Kyu Choi, Sharon Li", "tldr": "", "abstract": "Large Language Models (LLMs) are trained on massive text corpora, which are encoded with diverse personality traits. This triggers an interesting goal of eliciting a desired personality trait from the LLM, and probing its behavioral preferences. Accordingly, we formalize the persona elicitation task, aiming to customize LLM behaviors to align with a target persona. We present Persona In-Context Learning (PICLe), a novel persona elicitation framework grounded in Bayesian inference. At the core, PICLe introduces a new ICL example selection criterion based on likelihood ratio, which is designed to optimally guide the model in eliciting a specific target persona. We demonstrate the effectiveness of PICLe through extensive comparisons against baseline methods across three contemporary LLMs. 
Code is available at https://github.com/deeplearning-wisc/picle.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyeong Kyu Choi;Yixuan Li", "authorids": "~Hyeong_Kyu_Choi1;~Yixuan_Li1", "gender": "M;F", "homepage": "https://sites.google.com/view/froilanchoi;http://pages.cs.wisc.edu/~sharonli/", "dblp": "225/4796;144/6087-1", "google_scholar": "https://scholar.google.co.kr/citations?hl=en;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-2090-9273;", "linkedin": "https://linkedin.com/in/hyeonggyufroilanchoi;liyixuan", "or_profile": "~Hyeong_Kyu_Choi1;~Yixuan_Li1", "aff": "Department of Computer Science, University of Wisconsin - Madison;Cornell University", "aff_domain": "cs.wisc.edu;cornell.edu", "position": "PhD student;Graduate Student", "bibtex": "@inproceedings{\nchoi2024picle,\ntitle={{PICL}e: Eliciting Diverse Behaviors from Large Language Models with Persona In-Context Learning},\nauthor={Hyeong Kyu Choi and Yixuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w1HdBXSJXn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1124878, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1487264556754350676&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.wisc.edu;cornell.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;Cornell University", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.wisc.edu;https://www.cornell.edu", "aff_unique_abbr": "UW-Madison;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Position: Social Choice Should Guide AI Alignment in Dealing with Diverse Human Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32763", "id": "w1d9DOGymR", "proceeding": "https://proceedings.mlr.press/v235/conitzer24a.html", "pdf": "https://openreview.net/pdf?id=w1d9DOGymR", "openreview": "https://openreview.net/forum?id=w1d9DOGymR", "author_site": "Vincent Conitzer, Rachel Freedman, Jobstq Heitzig, Wesley H. Holliday, Bob Jacobs, Nathan Lambert, Milan Mosse, Eric Pacuit, Stuart Russell, Hailey Schoelkopf, Emanuel Tewolde, William Zwicker", "tldr": "", "abstract": "Foundation models such as GPT-4 are fine-tuned to avoid unsafe or otherwise problematic behavior, such as helping to commit crimes or producing racist text. One approach to fine-tuning, called reinforcement learning from human feedback, learns from humans\u2019 expressed preferences over multiple outputs. Another approach is constitutional AI, in which the input from humans is a list of high-level principles. But how do we deal with potentially diverging input from humans? How can we aggregate the input into consistent data about \u201ccollective\u201d preferences or otherwise use it to make collective choices about model behavior? In this paper, we argue that the field of social choice is well positioned to address these questions, and we discuss ways forward for this agenda, drawing on discussions in a recent workshop on Social Choice for AI Ethics and Safety held in Berkeley, CA, USA in December 2023.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Conitzer;Rachel Freedman;Jobst Heitzig;Wesley H. 
Holliday;Bob M. Jacobs;Nathan Lambert;Milan Moss\u00e9;Eric Pacuit;Stuart Russell;Hailey Schoelkopf;Emanuel Tewolde;William S. Zwicker", "authorids": "~Vincent_Conitzer2;~Rachel_Freedman1;~Jobst_Heitzig1;~Wesley_H._Holliday1;~Bob_M._Jacobs1;~Nathan_Lambert1;milan.mosse@gmail.com;epacuit@umd.edu;~Stuart_Russell1;~Hailey_Schoelkopf1;~Emanuel_Tewolde1;zwickerw@union.edu", "gender": "M;F;M;M;M;M;;;M;F;M;", "homepage": "https://www.cs.cmu.edu/~conitzer/;https://rachelfreedman.github.io/;https://www.pik-potsdam.de/members/heitzig;https://sites.google.com/site/wesholliday/;https://bobjacobs.substack.com/;https://natolambert.com;;;https://people.eecs.berkeley.edu/~russell/;;https://emanueltewolde.com/;", "dblp": "c/VincentConitzer;218/7198;;30/7409;;228/9584.html;;;;;305/4404.html;", "google_scholar": "juRk4lQAAAAJ;Mj1fmhsAAAAJ;Wpcuyl4AAAAJ;MApFn9wAAAAJ;;O4jW7BsAAAAJ;;;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;XLahYIYAAAAJ;LpZkIogAAAAJ;", "orcid": "0000-0003-1899-7884;0000-0003-3299-4313;0000-0002-0442-8077;0000-0001-6054-9052;;0000-0002-9997-6817;;;;;0000-0003-4128-7872;", "linkedin": "vincent-conitzer-2563082/;rachelalexfreedman/;;wesleyholliday/;bob-jacobs-31a6b32a6/;nathan-lambert-55093468/;;;;;;", "or_profile": "~Vincent_Conitzer2;~Rachel_Freedman1;~Jobst_Heitzig1;~Wesley_H._Holliday1;~Bob_M._Jacobs1;~Nathan_Lambert1;milan.mosse@gmail.com;epacuit@umd.edu;~Stuart_Russell1;~Hailey_Schoelkopf1;~Emanuel_Tewolde1;zwickerw@union.edu", "aff": "University of Oxford;University of California, Berkeley;Potsdam Institute for Climate Impact Research;University of California, Berkeley;Universiteit Gent;Allen Institute for Artificial Intelligence;;;University of California, Berkeley;EleutherAI;Carnegie Mellon University;", "aff_domain": "oxford.ac.uk;berkeley.edu;pik-potsdam.de;berkeley.edu;ugent.be;allenai.org;;;berkeley.edu;eleuther.ai;cmu.edu;", "position": "Full Professor;PhD student;Lab Lead;Full Professor;Undergrad student;Researcher;;;Full Professor;Researcher;PhD student;", "bibtex": "@inproceedings{\nconitzer2024position,\ntitle={Position: Social Choice Should Guide {AI} Alignment in Dealing with Diverse Human Feedback},\nauthor={Vincent Conitzer and Rachel Freedman and Jobst Heitzig and Wesley H. Holliday and Bob M. Jacobs and Nathan Lambert and Milan Moss{\\'e} and Eric Pacuit and Stuart Russell and Hailey Schoelkopf and Emanuel Tewolde and William S. 
Zwicker},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w1d9DOGymR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 866156, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8293996441908297032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "email": "oxford.ac.uk;berkeley.edu;pik-potsdam.de;berkeley.edu;ugent.be;allenai.org;;;berkeley.edu;eleuther.ai;cmu.edu;", "author_num": 12, "aff_unique_index": "0;1;2;1;3;4;1;5;6", "aff_unique_norm": "University of Oxford;University of California, Berkeley;Potsdam Institute for Climate Impact Research;University of Ghent;Allen Institute for Artificial Intelligence;EleutherAI;Carnegie Mellon University", "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://www.ox.ac.uk;https://www.berkeley.edu;https://www.pik-potsdam.de;https://www.ugent.be/en;https://allenai.org;https://www.eleuther.ai;https://www.cmu.edu", "aff_unique_abbr": "Oxford;UC Berkeley;PIK;UGent;AI2;EleutherAI;CMU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;2;1;3;1;1;1;1", "aff_country_unique": "United Kingdom;United States;Germany;Belgium" }, { "title": "Recurrent Early Exits for Federated Learning with Heterogeneous Clients", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32762", "id": "w4B42sxNq3", "proceeding": "https://proceedings.mlr.press/v235/lee24h.html", "pdf": "https://openreview.net/pdf?id=w4B42sxNq3", "openreview": "https://openreview.net/forum?id=w4B42sxNq3", "author_site": "Royson Lee, Javier Fernandez-Marques, Xu Hu, Da Li, Stefanos Laskaridis, \u0141ukasz Dudziak, Timothy Hospedales, Ferenc Husz\u00e1r, Nicholas Lane", "tldr": "", "abstract": "Federated learning (FL) has enabled distributed learning of a model across multiple clients in a privacy-preserving manner. One of the main challenges of FL is to accommodate clients with varying hardware capacities; clients have differing compute and memory requirements. To tackle this challenge, recent state-of-the-art approaches leverage the use of early exits. Nonetheless, these approaches fall short of mitigating the challenges of jointly learning multiple exit classifiers, often relying on hand-picked heuristic solutions for knowledge distillation among classifiers and/or utilizing additional layers for weaker classifiers. In this work, instead of utilizing multiple classifiers, we propose a recurrent early exit approach named ReeFL that fuses features from different sub-models into a single shared classifier. Specifically, we use a transformer-based early-exit module shared among sub-models to i) better exploit multi-layer feature representations for task-specific prediction and ii) modulate the feature representation of the backbone model for subsequent predictions. We additionally present a per-client self-distillation approach where the best sub-model is automatically selected as the teacher of the other sub-models at each client.
Our experiments on standard image and speech classification benchmarks across various emerging federated fine-tuning baselines demonstrate ReeFL's effectiveness over previous works.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Royson Lee;Javier Fernandez-Marques;Shell Xu Hu;Da Li;Stefanos Laskaridis;\u0141ukasz Dudziak;Timothy Hospedales;Ferenc Husz\u00e1r;Nicholas Donald Lane", "authorids": "~Royson_Lee1;~Javier_Fernandez-Marques1;~Shell_Xu_Hu1;~Da_Li3;~Stefanos_Laskaridis1;~\u0141ukasz_Dudziak1;~Timothy_Hospedales1;~Ferenc_Husz\u00e1r1;~Nicholas_Donald_Lane1", "gender": ";M;;M;M;M;M;M;M", "homepage": ";https://dali-dl.github.io/;https://stefanos.cc;;http://homepages.inf.ed.ac.uk/thospeda/;;;http://niclane.org;http://hushell.github.io/", "dblp": "247/5940;43/4804-1;241/6273;228/7987;32/3545;http://dblp.uni-trier.de/pers/hd/h/Huszar:Ferenc;171/7908;03/2663.html;", "google_scholar": "qNu3yNoAAAAJ;RPvaE3oAAAAJ;https://scholar.google.co.uk/citations?user=TcVC--IAAAAJ;R47NvpoAAAAJ;https://scholar.google.fr/citations?user=nHhtvqkAAAAJ;https://scholar.google.co.uk/citations?user=koQCVT4AAAAJ;Htu1YhIAAAAJ;https://scholar.google.co.uk/citations?hl=en;https://scholar.google.fr/citations?user=jU7nGnEAAAAJ", "orcid": ";0000-0002-2101-2989;;;0000-0003-4867-7486;;;0000-0002-2728-8273;", "linkedin": "royson-lee-025a09169/;;stevelaskaridis/;;timothyhospedales/;;jafermarq/;niclane;", "or_profile": "~Royson_Lee1;~Da_Li3;~Stefanos_Laskaridis1;~\u0141ukasz_Dudziak1;~Timothy_Hospedales1;~Ferenc_Huszar1;~Javier_Fern\u00e1ndez_Marqu\u00e9s1;~Nic_Lane2;~Xu_Shell_Hu1", "aff": "Samsung AI Center, Cambridge;University of Edinburgh;Brave Software;Samsung;Samsung AI Research Centre;University of Cambridge;Flower Labs;University of Cambridge;Samsung", "aff_domain": "samsung.com;ed.ac.uk;brave.com;samsung.com;samsung.com;cam.ac.uk;flower.ai;cst.cam.ac.uk;samsung.com", "position": "Research Engineer;Visiting Scholar;Researcher;Software Engineer;Principal Researcher;Associate Professor;Principal Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nlee2024recurrent,\ntitle={Recurrent Early Exits for Federated Learning with Heterogeneous Clients},\nauthor={Royson Lee and Javier Fernandez-Marques and Shell Xu Hu and Da Li and Stefanos Laskaridis and {\\L}ukasz Dudziak and Timothy Hospedales and Ferenc Husz{\\'a}r and Nicholas Donald Lane},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w4B42sxNq3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8270898, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12146764880513279178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "samsung.com;ed.ac.uk;brave.com;samsung.com;samsung.com;cam.ac.uk;flower.ai;cst.cam.ac.uk;samsung.com", "author_num": 9, "aff_unique_index": "0;1;2;0;0;3;4;3;0", "aff_unique_norm": "Samsung;University of Edinburgh;Brave Software;University of Cambridge;Flower Labs", "aff_unique_dep": "AI Center;;;;", "aff_unique_url": "https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/;https://www.ed.ac.uk;https://www.brave.com;https://www.cam.ac.uk;", "aff_unique_abbr": "SAC;Edinburgh;Brave;Cambridge;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;1;2;2;0;1;0;2", "aff_country_unique": "United Kingdom;United States;South
Korea" }, { "title": "Kernel Semi-Implicit Variational Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32761", "id": "w5oUo0LhO1", "proceeding": "https://proceedings.mlr.press/v235/cheng24l.html", "pdf": "https://openreview.net/pdf?id=w5oUo0LhO1", "openreview": "https://openreview.net/forum?id=w5oUo0LhO1", "author_site": "Ziheng Cheng, Longlin Yu, Tianyu Xie, Shiyue Zhang, Cheng Zhang", "tldr": "", "abstract": "Semi-implicit variational inference (SIVI) extends traditional variational families with semi-implicit distributions defined in a hierarchical manner. Due to the intractable densities of semi-implicit distributions, classical SIVI often resorts to surrogates of evidence lower bound (ELBO) that would introduce biases for training. A recent advancement in SIVI, named SIVI-SM, utilizes an alternative score matching objective made tractable via a minimax formulation, albeit requiring an additional lower-level optimization. In this paper, we propose kernel SIVI (KSIVI), a variant of SIVI-SM that eliminates the need for the lower-level optimization through kernel tricks. Specifically, we show that when optimizing over a reproducing kernel Hilbert space (RKHS), the lower-level problem has an explicit solution. This way, the upper-level objective becomes the kernel Stein discrepancy (KSD), which is readily computable for stochastic gradient descent due to the hierarchical structure of semi-implicit variational distributions. An upper bound for the variance of the Monte Carlo gradient estimators of the KSD objective is derived, which allows us to establish novel convergence guarantees of KSIVI. We demonstrate the effectiveness and efficiency of KSIVI on both synthetic distributions and a variety of real data Bayesian inference tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziheng Cheng;Longlin Yu;Tianyu Xie;Shiyue Zhang;Cheng Zhang", "authorids": "~Ziheng_Cheng4;~Longlin_Yu1;~Tianyu_Xie1;~Shiyue_Zhang3;~Cheng_Zhang3", "gender": "M;M;M;M;M", "homepage": "https://alexczh1.github.io/;https://github.com/longinYu;https://tyuxie.github.io;https://github.com/ShiyueZhang66;https://zcrabbit.github.io", "dblp": ";;345/3987-1;;", "google_scholar": "M8Hz2NSNe3QC;;qbJJQ_AAAAAJ;nu6YfFkAAAAJ;PddDrLgAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Ziheng_Cheng4;~Longlin_Yu1;~Tianyu_Xie1;~Shiyue_Zhang3;~Cheng_Zhang3", "aff": "Peking University;Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;PhD student;PhD student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\ncheng2024kernel,\ntitle={Kernel Semi-Implicit Variational Inference},\nauthor={Ziheng Cheng and Longlin Yu and Tianyu Xie and Shiyue Zhang and Cheng Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w5oUo0LhO1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3191120, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4142388055524507907&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", 
"aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Play Atari in a World of Tokens", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32760", "id": "w8BnKGFIYN", "proceeding": "https://proceedings.mlr.press/v235/agarwal24b.html", "pdf": "https://openreview.net/pdf?id=w8BnKGFIYN", "openreview": "https://openreview.net/forum?id=w8BnKGFIYN", "author_site": "Pranav Agarwal, Sheldon Andrews, Samira Ebrahimi Kahou", "tldr": "", "abstract": "Model-based reinforcement learning agents utilizing transformers have shown improved sample efficiency due to their ability to model extended context, resulting in more accurate world models. However, for complex reasoning and planning tasks, these methods primarily rely on continuous representations. This complicates modeling of discrete properties of the real world such as disjoint object classes between which interpolation is not plausible. In this work, we introduce discrete abstract representations for transformer-based learning (DART), a sample-efficient method utilizing discrete representations for modeling both the world and learning behavior. We incorporate a transformer-decoder for auto-regressive world modeling and a transformer-encoder for learning behavior by attending to task-relevant cues in the discrete representation of the world model. For handling partial observability, we aggregate information from past time steps as memory tokens. DART outperforms previous state-of-the-art methods that do not use look-ahead search on the Atari 100k sample efficiency benchmark with a median human-normalized score of 0.790 and beats humans in 9 out of 26 games. 
We release our code at https://pranaval.github.io/DART/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pranav Agarwal;Sheldon Andrews;Samira Ebrahimi Kahou", "authorids": "~Pranav_Agarwal1;~Sheldon_Andrews1;~Samira_Ebrahimi_Kahou1", "gender": "M;M;F", "homepage": "https://pranaval.github.io/;https://profs.etsmtl.ca/sandrews;https://saebrahimi.github.io", "dblp": ";;20/11069", "google_scholar": "QFEzapMAAAAJ;aoAfWEQAAAAJ;https://scholar.google.ca/citations?user=F99FuaAAAAAJ", "orcid": ";0000-0001-9776-117X;", "linkedin": "pranav-agarwal-6b4453114/;sheldonandrews/;", "or_profile": "~Pranav_Agarwal1;~Sheldon_Andrews1;~Samira_Ebrahimi_Kahou1", "aff": "\u00c9cole de technologie sup\u00e9rieure, Universit\u00e9 du Qu\u00e9bec;McGill University;\u00c9cole de technologie sup\u00e9rieure", "aff_domain": "etsmtl.ca;mcgill.ca;etsmtl.ca", "position": "PhD student;Adjunct Professor;Associate Professor", "bibtex": "@inproceedings{\nagarwal2024learning,\ntitle={Learning to Play Atari in a World of Tokens},\nauthor={Pranav Agarwal and Sheldon Andrews and Samira Ebrahimi Kahou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w8BnKGFIYN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 885642, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15656214051583240236&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "etsmtl.ca;mcgill.ca;etsmtl.ca", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universit\u00e9 du Qu\u00e9bec;McGill University;\u00c9cole de technologie sup\u00e9rieure", "aff_unique_dep": ";;", "aff_unique_url": "https://www.etsmtl.ca;https://www.mcgill.ca;https://www.etsmtl.ca", "aff_unique_abbr": "ETS;McGill;ETS", "aff_campus_unique_index": "0", "aff_campus_unique": "\u00c9cole de technologie sup\u00e9rieure;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Sample-Efficient Multiagent Reinforcement Learning with Reset Replay", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32759", "id": "w8ei1o9U5y", "proceeding": "https://proceedings.mlr.press/v235/yang24c.html", "pdf": "https://openreview.net/pdf?id=w8ei1o9U5y", "openreview": "https://openreview.net/forum?id=w8ei1o9U5y", "author_site": "Yaodong Yang, Guangyong Chen, Jianye Hao, Pheng Ann Heng", "tldr": "", "abstract": "The popularity of multiagent reinforcement learning (MARL) is growing rapidly with the demand for real-world tasks that require swarm intelligence. However, a noticeable drawback of MARL is its low sample efficiency, which leads to a huge number of interactions with the environment. Surprisingly, few MARL works focus on this practical problem, especially in the parallel environment setting, which greatly hampers the application of MARL to the real world. In response to this gap, in this paper, we propose Multiagent Reinforcement Learning with Reset Replay (MARR) to greatly improve the sample efficiency of MARL by enabling MARL training at a high replay ratio in the parallel environment setting for the first time. To achieve this, first, a reset strategy is introduced for maintaining the network plasticity to ensure that MARL continually learns with a high replay ratio. Second, MARR incorporates a data augmentation technique to boost the sample efficiency further.
Extensive experiments in SMAC and MPE show that MARR significantly improves the performance of various MARL approaches with much fewer environment interactions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaodong Yang;Guangyong Chen;Jianye HAO;Pheng-Ann Heng", "authorids": "~Yaodong_Yang2;~Guangyong_Chen1;~Jianye_HAO1;~Pheng-Ann_Heng1", "gender": "M;M;M;M", "homepage": ";https://guangyongchen.github.io/;http://www.icdai.org/jianye.html;http://www.cse.cuhk.edu.hk/~pheng", "dblp": "170/1496-2;175/1354;21/7664.html;52/2889", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=AUpqepUAAAAJ;;https://scholar.google.com/citations?sortby=pubdate", "orcid": ";;0000-0002-0422-8235;", "linkedin": ";;;", "or_profile": "~Yaodong_Yang2;~Guangyong_Chen1;~Jianye_HAO1;~Pheng-Ann_Heng1", "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong;Zhejiang Lab;Tianjin University;The Chinese University of Hong Kong", "aff_domain": "cse.cuhk.edu.hk;zju.edu.cn;tju.edu.cn;cuhk.edu.hk", "position": "PhD student;Principal Researcher;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024sampleefficient,\ntitle={Sample-Efficient Multiagent Reinforcement Learning with Reset Replay},\nauthor={Yaodong Yang and Guangyong Chen and Jianye HAO and Pheng-Ann Heng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w8ei1o9U5y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1425502, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oPYlk8mA3zQJ:scholar.google.com/&scioq=Sample-Efficient+Multiagent+Reinforcement+Learning+with+Reset+Replay&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": "cse.cuhk.edu.hk;zju.edu.cn;tju.edu.cn;cuhk.edu.hk", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Chinese University of Hong Kong;Zhejiang Lab;Tianjin University", "aff_unique_dep": "Department of Computer Science and Engineering;;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.zhejianglab.com;http://www.tju.edu.cn", "aff_unique_abbr": "CUHK;;TJU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Position: $C^*$-Algebraic Machine Learning $-$ Moving in a New Direction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32758", "id": "w9nxTXuaCc", "proceeding": "https://proceedings.mlr.press/v235/hashimoto24a.html", "pdf": "https://openreview.net/pdf?id=w9nxTXuaCc", "openreview": "https://openreview.net/forum?id=w9nxTXuaCc", "author_site": "Yuka Hashimoto, Masahiro Ikeda, Hachem Kadri", "tldr": "", "abstract": "Machine learning has a long collaborative tradition with several fields of mathematics, such as statistics, probability and linear algebra. We propose a new direction for machine learning research: $C^*$-algebraic ML $-$ a cross-fertilization between $C^*$-algebra and machine learning. The mathematical concept of $C^*$-algebra is a natural generalization of the space of complex numbers. It enables us to unify existing learning strategies, and construct a new framework for more diverse and information-rich data models. 
We explain why and how to use $C^*$-algebras in machine learning, and provide technical considerations that go into the design of $C^*$-algebraic learning models in the contexts of kernel methods and neural networks. Furthermore, we discuss open questions and challenges in $C^*$-algebraic ML and give our thoughts for future development and applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuka Hashimoto;Masahiro Ikeda;Hachem Kadri", "authorids": "~Yuka_Hashimoto2;~Masahiro_Ikeda1;~Hachem_Kadri3", "gender": ";M;M", "homepage": "https://www.rd.ntt/e/ns/qos/person/hashimoto/index.html;https://sites.google.com/view/masahiroikedaswebpage/home;https://hachem-kadri.pedaweb.univ-amu.fr", "dblp": "220/5306;43/5572;88/9175", "google_scholar": ";https://scholar.google.com.tr/citations?user=6ozp0qMAAAAJ;E2XUpSkAAAAJ", "orcid": "0000-0002-1424-4298;;", "linkedin": ";;hachem-kadri-8266aa1a9", "or_profile": "~Yuka_Hashimoto2;~Masahiro_Ikeda1;~hachem_kadri2", "aff": "NTT;RIKEN;Aix-Marseille University", "aff_domain": "ntt.co.jp;riken.jp;univ-amu.fr", "position": "Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\nhashimoto2024position,\ntitle={Position: \\$C{\\textasciicircum}*\\$-Algebraic Machine Learning \\$-\\$ Moving in a New Direction},\nauthor={Yuka Hashimoto and Masahiro Ikeda and Hachem Kadri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=w9nxTXuaCc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 860342, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z6o2Y3WnQ0UJ:scholar.google.com/&scioq=Position:+%24C%5E*%24-Algebraic+Machine+Learning+%24-%24+Moving+in+a+New+Direction&hl=en&as_sdt=0,5", "gs_version_total": 5, "email": "ntt.co.jp;riken.jp;univ-amu.fr", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "NTT Corporation;RIKEN;Aix-Marseille University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntt.co.jp;https://www.riken.jp;https://www.univ-amu.fr", "aff_unique_abbr": "NTT;RIKEN;AMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Japan;France" }, { "title": "Position: Future Directions in the Theory of Graph Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32757", "id": "wBr5ozDEKp", "proceeding": "https://proceedings.mlr.press/v235/morris24a.html", "pdf": "https://openreview.net/pdf?id=wBr5ozDEKp", "openreview": "https://openreview.net/forum?id=wBr5ozDEKp", "author_site": "Christopher Morris, Fabrizio Frasca, Nadav Dym, Haggai Maron, Ismail Ceylan, Ron Levie, Derek Lim, Michael Bronstein, Martin Grohe, Stefanie Jegelka", "tldr": "", "abstract": "Machine learning on graphs, especially using graph neural networks (GNNs), has seen a surge in interest due to the wide availability of graph data across a broad spectrum of disciplines, from life to social and engineering sciences. Despite their practical success, our theoretical understanding of the properties of GNNs remains highly incomplete. Recent theoretical advancements primarily focus on elucidating the coarse-grained expressive power of GNNs, predominantly employing combinatorial techniques. 
However, these studies do not perfectly align with practice, particularly in understanding the generalization behavior of GNNs when trained with stochastic first-order optimization techniques. In this position paper, we argue that the graph machine learning community needs to shift its attention to developing a balanced theory of graph machine learning, focusing on a more thorough understanding of the interplay of expressive power, generalization, and optimization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christopher Morris;Fabrizio Frasca;Nadav Dym;Haggai Maron;Ismail Ilkan Ceylan;Ron Levie;Derek Lim;Michael M. Bronstein;Martin Grohe;Stefanie Jegelka", "authorids": "~Christopher_Morris1;~Fabrizio_Frasca1;~Nadav_Dym1;~Haggai_Maron1;~Ismail_Ilkan_Ceylan2;~Ron_Levie1;~Derek_Lim1;~Michael_M._Bronstein1;~Martin_Grohe1;~Stefanie_Jegelka3", "gender": "M;M;;M;;;M;M;M;F", "homepage": "http://christophermorris.info;https://noired.github.io;;https://haggaim.github.io/;https://www.cs.ox.ac.uk/people/ismaililkan.ceylan/;;https://cptq.github.io/;http://www.inf.usi.ch/bronstein/;http://www.lics.rwth-aachen.de/~grohe;http://people.csail.mit.edu/stefje/", "dblp": "156/7303;228/1840;;181/6629;147/6111;;267/5433;07/2668;g/MGrohe;38/7003", "google_scholar": ";PT2CDA4AAAAJ;;https://scholar.google.co.il/citations?user=4v8uJrIAAAAJ;avJ5kQcAAAAJ;;y9YTBIsAAAAJ;UU3N6-UAAAAJ;https://scholar.google.com.tw/citations?user=Sou5ih0AAAAJ;gTWUZlsAAAAJ", "orcid": ";0000-0002-5165-1394;;;0000-0003-4118-4689;;;;0000-0002-0292-9142;", "linkedin": ";;;;;;;mbronstein/;;", "or_profile": "~Christopher_Morris1;~Fabrizio_Frasca1;~Nadav_Dym1;~Haggai_Maron1;~Ismail_Ilkan_Ceylan2;~Ron_Levie1;~Derek_Lim1;~Michael_M._Bronstein1;~Martin_Grohe1;~Stefanie_Jegelka3", "aff": "Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;;NVIDIA;University of Oxford;;Liquid AI;University of Oxford;RWTH Aachen University;Massachusetts Institute of Technology", "aff_domain": "rwth-aachen.de;campus.technion.ac.il;;nvidia.com;oxford.ac.uk;;liquid.ai;ox.ac.uk;rwth-aachen.de;mit.edu", "position": "Assistant Professor;Postdoc;;Research Scientist;Assistant Professor;;Researcher;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nmorris2024position,\ntitle={Position: Future Directions in the Theory of Graph Machine Learning},\nauthor={Christopher Morris and Fabrizio Frasca and Nadav Dym and Haggai Maron and Ismail Ilkan Ceylan and Ron Levie and Derek Lim and Michael M. 
Bronstein and Martin Grohe and Stefanie Jegelka},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wBr5ozDEKp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 252860, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15587440882460944527&as_sdt=8005&sciodt=0,7&hl=en", "gs_version_total": 5, "email": "rwth-aachen.de;campus.technion.ac.il;;nvidia.com;oxford.ac.uk;;liquid.ai;ox.ac.uk;rwth-aachen.de;mit.edu", "author_num": 10, "aff_unique_index": "0;1;2;3;4;3;0;5", "aff_unique_norm": "RWTH Aachen University;Technion - Israel Institute of Technology;NVIDIA;University of Oxford;Liquid AI;Massachusetts Institute of Technology", "aff_unique_dep": ";;NVIDIA Corporation;;;", "aff_unique_url": "https://www.rwth-aachen.de;https://www.technion.ac.il/en/;https://www.nvidia.com;https://www.ox.ac.uk;;https://web.mit.edu", "aff_unique_abbr": "RWTH;Technion;NVIDIA;Oxford;;MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Aachen;", "aff_country_unique_index": "0;1;2;3;4;3;0;2", "aff_country_unique": "Germany;Israel;United States;United Kingdom;Unknown" }, { "title": "Fast Adversarial Attacks on Language Models In One GPU Minute", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32756", "id": "wCMNbdshcY", "proceeding": "https://proceedings.mlr.press/v235/sadasivan24a.html", "pdf": "https://openreview.net/pdf?id=wCMNbdshcY", "openreview": "https://openreview.net/forum?id=wCMNbdshcY", "author_site": "Vinu Sankar Sadasivan, Shoumik Saha, Gaurang Sriramanan, Priyatham Kattakinda, Atoosa Malemir Chegini, Soheil Feizi", "tldr": "", "abstract": "In this paper, we introduce a novel class of fast, beam search-based adversarial attacks (BEAST) for Language Models (LMs). BEAST employs interpretable parameters, enabling attackers to balance between attack speed, success rate, and the readability of adversarial prompts. The computational efficiency of BEAST enables us to investigate its applications on LMs for jailbreaking, eliciting hallucinations, and privacy attacks. Our gradient-free targeted attack can jailbreak aligned LMs with high attack success rates within one minute. For instance, BEAST can jailbreak Vicuna-7B-v1.5 in under one minute with a success rate of 89% when compared to a gradient-based baseline that takes over an hour to achieve a 70% success rate using a single Nvidia RTX A6000 48GB GPU. BEAST can also generate adversarial suffixes for successful jailbreaks that can transfer to unseen prompts and unseen models such as GPT-4-Turbo. Additionally, we discover a unique outcome wherein our untargeted attack induces hallucinations in LM chatbots. Through human evaluations, we find that our untargeted attack causes Vicuna-7B-v1.5 to produce $\sim$15% more incorrect outputs when compared to LM outputs in the absence of our attack. We also learn that 22% of the time, BEAST causes Vicuna to generate outputs that are not relevant to the original prompt. Further, we use BEAST to generate adversarial prompts in a few seconds that can boost the performance of existing membership inference attacks for LMs.
We believe that our fast attack, BEAST, has the potential to accelerate research in LM security and privacy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vinu Sankar Sadasivan;Shoumik Saha;Gaurang Sriramanan;Priyatham Kattakinda;Atoosa Chegini;Soheil Feizi", "authorids": "~Vinu_Sankar_Sadasivan1;~Shoumik_Saha1;~Gaurang_Sriramanan1;~Priyatham_Kattakinda1;~Atoosa_Chegini1;~Soheil_Feizi2", "gender": "M;M;M;M;F;M", "homepage": "https://vinusankars.github.io/;https://shoumiksaha.github.io/;https://gaurangsriramanan.github.io/;https://priyathamkat.com/;;https://www.cs.umd.edu/~sfeizi/", "dblp": "244/8052;307/5377;262/3916;;;57/2132", "google_scholar": "y1IKIw0AAAAJ;https://scholar.google.com/citations?view_op=list_works;t76Uk8oAAAAJ;D9ebp-YAAAAJ;5nY9tagAAAAJ;lptAmrMAAAAJ", "orcid": ";0009-0007-7461-5306;;;;", "linkedin": "vinusankars/;shoumik-saha/;gaurang-sriramanan-16141a1a3/;priyathamkat/;atoosa-chegini-6713741a3/;", "or_profile": "~Vinu_Sankar_Sadasivan1;~Shoumik_Saha1;~Gaurang_Sriramanan1;~Priyatham_Kattakinda1;~Atoosa_Chegini1;~Soheil_Feizi2", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;umd.edu", "position": "PhD student;MS student;PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nsadasivan2024fast,\ntitle={Fast Adversarial Attacks on Language Models In One {GPU} Minute},\nauthor={Vinu Sankar Sadasivan and Shoumik Saha and Gaurang Sriramanan and Priyatham Kattakinda and Atoosa Chegini and Soheil Feizi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wCMNbdshcY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4660120, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=921734893857184901&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;umd.edu", "author_num": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "InferCept: Efficient Intercept Support for Augmented Large Language Model Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32755", "id": "wDDGQabYPQ", "proceeding": "https://proceedings.mlr.press/v235/abhyankar24a.html", "pdf": "https://openreview.net/pdf?id=wDDGQabYPQ", "openreview": "https://openreview.net/forum?id=wDDGQabYPQ", "author_site": "Reyna Abhyankar, Zijian He, Vikranth Srivatsa, Hao Zhang, Yiying Zhang", "tldr": "", "abstract": "Large language models are increasingly integrated with external environments, tools, and agents like ChatGPT plugins to extend their capability beyond language-centric tasks. However, today's LLM inference systems are designed for standalone LLMs. 
They treat each external interaction as the end of LLM generation and form a new request when the interaction finishes, causing unnecessary recomputation of already computed contexts, which accounts for 37-40% of total model forwarding time. This paper presents **InferCept, the first LLM inference framework targeting augmented LLMs** and supporting the efficient interception of LLM generation. InferCept minimizes the GPU resource waste caused by LLM interceptions and dedicates the saved memory to serving more requests. InferCept improves the overall serving throughput by **1.6x-2x** and completes 2x more requests per second compared to the state-of-the-art LLM inference systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Reyna Abhyankar;Zijian He;Vikranth Srivatsa;Hao Zhang;Yiying Zhang", "authorids": "~Reyna_Abhyankar1;~Zijian_He5;~Vikranth_Srivatsa1;~Hao_Zhang2;~Yiying_Zhang2", "gender": "F;M;;M;F", "homepage": ";https://github.com/jiange91;https://github.com/vikranth22446;https://cseweb.ucsd.edu/~haozhang/;https://cseweb.ucsd.edu/~yiying/", "dblp": "347/7970;;;55/2270-25;", "google_scholar": ";;CfKaGRQAAAAJ;H1d4BS8AAAAJ;ijn77lsAAAAJ", "orcid": "0009-0005-6763-0108;0009-0004-8827-2664;;;", "linkedin": "reyna-abhyankar/;;;;", "or_profile": "~Reyna_Abhyankar1;~Zijian_He5;~Vikranth_Srivatsa1;~Hao_Zhang2;~Yiying_Zhang2", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego;Carnegie Mellon University;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu;cmu.edu;ucsd.edu", "position": "PhD student;PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nabhyankar2024infercept,\ntitle={InferCept: Efficient Intercept Support for Augmented Large Language Model Inference},\nauthor={Reyna Abhyankar and Zijian He and Vikranth Srivatsa and Hao Zhang and Yiying Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wDDGQabYPQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 504502, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11863195836140238769&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "ucsd.edu;ucsd.edu;ucsd.edu;cmu.edu;ucsd.edu", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of California, San Diego;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.cmu.edu", "aff_unique_abbr": "UCSD;CMU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "xT: Nested Tokenization for Larger Context in Large Images", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32754", "id": "wDDprThYeT", "proceeding": "https://proceedings.mlr.press/v235/gupta24b.html", "pdf": "https://openreview.net/pdf?id=wDDprThYeT", "openreview": "https://openreview.net/forum?id=wDDprThYeT", "author_site": "Ritwik Gupta, Shufan Li, Tyler Zhu, Jitendra Malik, Trevor Darrell, Karttikeya Mangalam", "tldr": "", "abstract": "Modern computer vision pipelines handle large images in one of two sub-optimal ways: down-sampling or cropping. These two methods incur significant losses in the amount of information and context present in an image.
There are many downstream applications in which global context matters as much as high frequency details, such as in real-world satellite imagery; in such cases, researchers have to make the uncomfortable choice of which information to discard. We introduce *xT*, a simple framework for vision transformers which effectively aggregates global context with local details and can model large images end-to-end on contemporary GPUs. We select a set of benchmark datasets across classic vision tasks which accurately reflect a vision model's ability to understand truly large images and incorporate fine details over large scales, and assess our method's improvement on them. *xT* is a streaming, two-stage architecture that adapts existing vision backbones and long sequence language models to effectively model large images without quadratic memory growth. We are able to increase accuracy by up to 8.6% on challenging classification tasks and F1 score by 11.6 on context-dependent segmentation on images as large as 29,000 x 29,000 pixels.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ritwik Gupta;Shufan Li;Tyler Zhu;Jitendra Malik;Trevor Darrell;Karttikeya Mangalam", "authorids": "~Ritwik_Gupta1;~Shufan_Li1;~Tyler_Zhu2;~Jitendra_Malik2;~Trevor_Darrell2;~Karttikeya_Mangalam1", "gender": ";M;M;M;;M", "homepage": ";;https://tylerzhu.com/research;https://people.eecs.berkeley.edu/~malik/;;http://karttikeya.github.io/", "dblp": ";218/8196;;58/2944;;200/8205", "google_scholar": ";;-aoybA4AAAAJ;oY9R5YQAAAAJ;;2l1fWEoAAAAJ", "orcid": ";;;0000-0003-3695-1580;;", "linkedin": ";shufan-li-126b70187/;;;;", "or_profile": "~Ritwik_Gupta1;~Shufan_Li1;~Tyler_Zhu2;~Jitendra_Malik2;~Trevor_Darrell2;~Karttikeya_Mangalam1", "aff": ";UCLA Computer Science Department, University of California, Los Angeles;Department of Computer Science, Princeton University;University of California, Berkeley;;University of California, Berkeley", "aff_domain": ";cs.ucla.edu;cs.princeton.edu;berkeley.edu;;berkeley.edu", "position": ";PhD student;PhD student;Full Professor;;PhD student", "bibtex": "@inproceedings{\ngupta2024xt,\ntitle={xT: Nested Tokenization for Larger Context in Large Images},\nauthor={Ritwik Gupta and Shufan Li and Tyler Zhu and Jitendra Malik and Trevor Darrell and Karttikeya Mangalam},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wDDprThYeT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2885479, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1047685994006666505&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";cs.ucla.edu;cs.princeton.edu;berkeley.edu;;berkeley.edu", "author_num": 6, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "University of California, Los Angeles;Princeton University;University of California, Berkeley", "aff_unique_dep": "Computer Science Department;Department of Computer Science;", "aff_unique_url": "https://www.ucla.edu;https://www.princeton.edu;https://www.berkeley.edu", "aff_unique_abbr": "UCLA;Princeton;UC Berkeley", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Los Angeles;;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: A Call to Action for a Human-Centered AutoML Paradigm", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32753", "id": "wELbEYgnmo",
"proceeding": "https://proceedings.mlr.press/v235/lindauer24a.html", "pdf": "https://openreview.net/pdf?id=wELbEYgnmo", "openreview": "https://openreview.net/forum?id=wELbEYgnmo", "author_site": "Marius Lindauer, Florian Karl, Anne Klier, Julia Moosbauer, Alexander Tornede, Andreas Mueller, Frank Hutter, Matthias Feurer, Bernd Bischl", "tldr": "", "abstract": "Automated machine learning (AutoML) was formed around the fundamental objectives of automatically and efficiently configuring machine learning (ML) workflows, aiding the research of new ML algorithms, and contributing to the democratization of ML by making it accessible to a broader audience. Over the past decade, commendable achievements in AutoML have primarily focused on optimizing predictive performance. This focused progress, while substantial, raises questions about how well AutoML has met its broader, original goals. In this position paper, we argue that a key to unlocking AutoML's full potential lies in addressing the currently underexplored aspect of user interaction with AutoML systems, including their diverse roles, expectations, and expertise. We envision a more human-centered approach in future AutoML research, promoting the collaborative design of ML systems that tightly integrates the complementary strengths of human expertise and AutoML methodologies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marius Lindauer;Florian Karl;Anne Klier;Julia Moosbauer;Alexander Tornede;Andreas C Mueller;Frank Hutter;Matthias Feurer;Bernd Bischl", "authorids": "~Marius_Lindauer1;~Florian_Karl1;anne.klier@iis.fraunhofer.de;~Julia_Moosbauer1;~Alexander_Tornede1;~Andreas_C_Mueller1;~Frank_Hutter1;~Matthias_Feurer2;~Bernd_Bischl1", "gender": "M;M;;F;M;M;M;;M", "homepage": "https://www.ai.uni-hannover.de/de/institut/team/lindauer;https://www.slds.stat.uni-muenchen.de/people/karl/;;http://www.compstat.statistik.uni-muenchen.de/people/moosbauer/;https://www.uni-paderborn.de/person/38209/;https://amueller.github.io;http://ml.informatik.uni-freiburg.de/~hutter/;;https://www.slds.stat.uni-muenchen.de/", "dblp": "28/9142;;;255/7464;252/8859;38/4335-4;89/5383;;48/5326", "google_scholar": "https://scholar.google.de/citations?user=0Sxx7DUAAAAJ;k-ZyY8EAAAAJ;;eoroT-MAAAAJ;https://scholar.google.de/citations?hl=de;8NTO5XIAAAAJ;https://scholar.google.de/citations?user=YUrxwrkAAAAJ;;https://scholar.google.de/citations?user=s34UckkAAAAJ", "orcid": ";0000-0003-0163-2272;;0000-0002-0000-9297;0000-0002-2415-2186;0000-0002-2349-9428;0000-0002-2037-3694;;0000-0001-6002-6980", "linkedin": ";;;juliamoosbauer/;;;frank-hutter-9190b24b/;;", "or_profile": "~Marius_Lindauer1;~Florian_Karl1;anne.klier@iis.fraunhofer.de;~Julia_Moosbauer1;~Alexander_Tornede1;~Andreas_C_Mueller1;~Frank_Hutter1;~Matthias_Feurer2;~Bernd_Bischl1", "aff": "Leibniz Universit\u00e4t Hannover;University of Munich, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;;Department of Statistics;Universit\u00e4t Hannover;Microsoft;Albert-Ludwigs-Universit\u00e4t Freiburg;;LMU", "aff_domain": "uni-hannover.de;campus.lmu.de;;lmu.de;uni-hannover.de;microsoft.com;uni-freiburg.de;;uni-muenchen.de", "position": "Full Professor;PhD student;;PhD student;Postdoc;Principal Researcher;Full Professor;;Full Professor", "bibtex": "@inproceedings{\nlindauer2024position,\ntitle={Position: A Call to Action for a Human-Centered Auto{ML} Paradigm},\nauthor={Marius Lindauer and Florian Karl and Anne Klier and Julia Moosbauer and Alexander Tornede and Andreas C Mueller and Frank Hutter and 
Matthias Feurer and Bernd Bischl},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wELbEYgnmo}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1469879, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12676037479966191288&as_sdt=5,39&sciodt=0,39&hl=en", "gs_version_total": 9, "email": "uni-hannover.de;campus.lmu.de;;lmu.de;uni-hannover.de;microsoft.com;uni-freiburg.de;;uni-muenchen.de", "author_num": 9, "aff_unique_index": "0;1;2;3;4;5;6", "aff_unique_norm": "Leibniz Universit\u00e4t Hannover;Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen;University Affiliation Not Specified;University of Hanover;Microsoft;Albert-Ludwigs-Universit\u00e4t Freiburg;Ludwig Maximilian University of Munich", "aff_unique_dep": ";;Department of Statistics;;Microsoft Corporation;;", "aff_unique_url": "https://www.leibniz.uni-hannover.de/;https://www.lmu.de;;https://www.uni-hannover.de;https://www.microsoft.com;https://www.uni-freiburg.de;https://www.lmu.de", "aff_unique_abbr": "LUH;LMU;;Uni Hanover;Microsoft;Albert-Ludwigs-Universit\u00e4t;LMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Freiburg", "aff_country_unique_index": "0;0;0;2;0;0", "aff_country_unique": "Germany;;United States" }, { "title": "A Unified Framework for Learning with Nonlinear Model Classes from Arbitrary Linear Samples", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32752", "id": "wG2SgnH6Zv", "proceeding": "https://proceedings.mlr.press/v235/adcock24a.html", "pdf": "https://openreview.net/pdf?id=wG2SgnH6Zv", "openreview": "https://openreview.net/forum?id=wG2SgnH6Zv", "author_site": "Ben Adcock, Juan Cardenas, Nick Dexter", "tldr": "", "abstract": "This work considers the fundamental problem of learning an unknown object from training data using a given model class. We introduce a framework that allows for objects in arbitrary Hilbert spaces, general types of (random) linear measurements as training data and general types of nonlinear model classes. We establish a series of learning guarantees for this framework, which provide explicit relations between the amount of training data and the model class to ensure near-best generalization bounds. In doing so, we introduce the key notion of the *variation* of a model class with respect to a distribution of sampling operators. We show that this framework can accommodate many different types of well-known problems of interest, such as matrix sketching by random sampling, compressed sensing with isotropic vectors, active learning in regression and compressed sensing with generative models. In all cases, known results become straightforward corollaries of our general theory. Hence, this work provides a powerful framework for studying and analyzing many different types of learning problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ben Adcock;Juan M. 
Cardenas;Nick Dexter", "authorids": "~Ben_Adcock1;~Juan_M._Cardenas1;~Nick_Dexter1", "gender": ";;M", "homepage": ";;https://sites.google.com/view/ndexter", "dblp": ";;256/9590", "google_scholar": ";;Sgso_3QAAAAJ", "orcid": ";;0000-0002-2418-4735", "linkedin": ";;https://www.linkedin.com/mwlite/in/nick-dexter-7139409", "or_profile": "~Ben_Adcock1;~Juan_M._Cardenas1;~Nick_Dexter1", "aff": ";;Florida State University", "aff_domain": ";;fsu.edu", "position": ";;Assistant Professor", "bibtex": "@inproceedings{\nadcock2024a,\ntitle={A Unified Framework for Learning with Nonlinear Model Classes from Arbitrary Linear Samples},\nauthor={Ben Adcock and Juan M. Cardenas and Nick Dexter},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wG2SgnH6Zv}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 723884, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6513861956293244454&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": ";;fsu.edu", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Florida State University", "aff_unique_dep": "", "aff_unique_url": "https://www.fsu.edu", "aff_unique_abbr": "FSU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "CompeteAI: Understanding the Competition Dynamics of Large Language Model-based Agents", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32751", "id": "wGtzp4ZT1n", "proceeding": "https://proceedings.mlr.press/v235/zhao24q.html", "pdf": "https://openreview.net/pdf?id=wGtzp4ZT1n", "openreview": "https://openreview.net/forum?id=wGtzp4ZT1n", "author_site": "Qinlin Zhao, Jindong Wang, Yixuan Zhang, Yiqiao Jin, Kaijie Zhu, Hao Chen, Xing Xie", "tldr": "", "abstract": "Large language models (LLMs) have been widely used as agents to complete different tasks, such as personal assistance or event planning. Although most of the work has focused on cooperation and collaboration between agents, little work explores *competition*, another important mechanism that promotes the development of society and economy. In this paper, we seek to examine the competition dynamics in LLM-based agents. We first propose a general framework for studying the competition between agents. Then, we implement a practical competitive environment using GPT-4 to simulate a virtual town with two types of agents, including restaurant agents and customer agents. Specifically, the restaurant agents compete with each other to attract more customers, where competition encourages them to transform, such as cultivating new operating strategies. Simulation experiments reveal several interesting findings at the micro and macro levels, which align well with existing market and sociological theories. We hope that the framework and environment can be a promising testbed to study the competition that fosters understanding of society. 
Code is available at: https://github.com/microsoft/competeai.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qinlin Zhao;Jindong Wang;Yixuan Zhang;Yiqiao Jin;Kaijie Zhu;Hao Chen;Xing Xie", "authorids": "~Qinlin_Zhao1;~Jindong_Wang1;~Yixuan_Zhang7;~Yiqiao_Jin1;~Kaijie_Zhu1;~Hao_Chen15;~Xing_Xie3", "gender": "M;F;M;M;M;M;M", "homepage": ";https://zjanice.github.io/;https://ahren09.github.io/;https://github.com/Immortalise;https://hhhhhhao.github.io/;http://research.microsoft.com/en-us/people/xingx/;https://jd92.wang/", "dblp": ";;207/6631.html;56/7058;;08/6809-1;19/2969-1", "google_scholar": ";NJJJ45AAAAAJ;eY85qm4AAAAJ;;tktqkhwAAAAJ;5EQfAFIAAAAJ;hBZ_tKsAAAAJ", "orcid": ";;0000-0002-6974-5970;;;0000-0002-8608-8482;0000-0002-4833-0880", "linkedin": "qinlin-zhao-3a51292b2/;;ahren-jin/;;haochen97/;xingx/;jindong-wang/", "or_profile": "~Qinlin_Zhao1;~Yixuan_Zhang7;~Yiqiao_Jin1;~Kaijie_Zhu1;~Hao_Chen15;~Xing_Xie3;~Jindong_Wang4", "aff": "University of Science and Technology of China;College of William and Mary;Georgia Institute of Technology;Institute of automation, Chinese Academy of Sciences;Carnegie Mellon University;Microsoft Research Asia;Microsoft Research", "aff_domain": "ustc.edu.cn;wm.edu;gatech.edu;ia.ac.cn;andrew.cmu.edu;microsoft.com;microsoft.com", "position": "Undergrad student;Assistant Professor;PhD student;MS student;PhD student;Senior Principal Researcher;Researcher", "bibtex": "@inproceedings{\nzhao2024competeai,\ntitle={Compete{AI}: Understanding the Competition Dynamics of Large Language Model-based Agents},\nauthor={Qinlin Zhao and Jindong Wang and Yixuan Zhang and Yiqiao Jin and Kaijie Zhu and Hao Chen and Xing Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wGtzp4ZT1n}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7182233, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17075203517804203391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "ustc.edu.cn;wm.edu;gatech.edu;ia.ac.cn;andrew.cmu.edu;microsoft.com;microsoft.com", "author_num": 7, "aff_unique_index": "0;1;2;3;4;5;5", "aff_unique_norm": "University of Science and Technology of China;College of William and Mary;Georgia Institute of Technology;Chinese Academy of Sciences;Carnegie Mellon University;Microsoft", "aff_unique_dep": ";;;Institute of Automation;;Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.wm.edu;https://www.gatech.edu;http://www.ia.cas.cn;https://www.cmu.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "USTC;WM;Georgia Tech;CAS;CMU;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;0;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Understanding Heterophily for Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32750", "id": "wK9RvVmi7u", "proceeding": "https://proceedings.mlr.press/v235/wang24u.html", "pdf": "https://openreview.net/pdf?id=wK9RvVmi7u", "openreview": "https://openreview.net/forum?id=wK9RvVmi7u", "author_site": "Junfu Wang, Yuanfang Guo, Liang Yang, Yunhong Wang", "tldr": "", "abstract": "Graphs with heterophily have been regarded as challenging scenarios for Graph Neural Networks (GNNs), where nodes are connected with dissimilar neighbors through various patterns. 
In this paper, we present a theoretical understanding of heterophily for GNNs by incorporating the graph convolution (GC) operations into fully connected networks via the proposed Heterophilous Stochastic Block Models (HSBM), a general random graph model that can accommodate diverse heterophily patterns. Our theoretical investigation comprehensively analyzes the impact of heterophily from three critical aspects. Firstly, for the impact of different heterophily patterns, we show that the separability gains are determined by two factors, i.e., the Euclidean distance of the neighborhood distributions and $\\sqrt{\\mathbb{E}\\left[\\operatorname{deg}\\right]}$, where $\\mathbb{E}\\left[\\operatorname{deg}\\right]$ is the average node degree. Secondly, we show that neighborhood inconsistency has a detrimental impact on separability, which is similar to degrading $\\mathbb{E}\\left[\\operatorname{deg}\\right]$ by a specific factor. Finally, for the impact of stacking multiple layers, we show that the separability gains are determined by the normalized distance of the $l$-powered neighborhood distributions, indicating that nodes still possess separability in various regimes, even when over-smoothing occurs. Extensive experiments on both synthetic and real-world data verify our theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junfu Wang;Yuanfang Guo;Liang Yang;Yunhong Wang", "authorids": "~Junfu_Wang1;~Yuanfang_Guo1;~Liang_Yang2;~Yunhong_Wang1", "gender": ";M;M;", "homepage": ";https://irip.buaa.edu.cn/andyguo/index.html;http://yangliang.github.io/;", "dblp": "276/6628.html;78/8545;05/3933-2;", "google_scholar": ";;7agkJogAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Junfu_Wang1;~Yuanfang_Guo1;~Liang_Yang2;~Yunhong_Wang1", "aff": "Beihang University;Beihang University;Hebei University of Technology;", "aff_domain": "buaa.edu.cn;buaa.edu.cn;hebut.edu.cn;", "position": "PhD student;Associate Professor;Full Professor;", "bibtex": "@inproceedings{\nwang2024understanding,\ntitle={Understanding Heterophily for Graph Neural Networks},\nauthor={Junfu Wang and Yuanfang Guo and Liang Yang and Yunhong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wK9RvVmi7u}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1612021, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10205261759747566091&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "buaa.edu.cn;buaa.edu.cn;hebut.edu.cn;", "author_num": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Beihang University;Hebei University of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;http://www.hbut.edu.cn", "aff_unique_abbr": "BUAA;HUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "What\u2019s the score? 
Automated Denoising Score Matching for Nonlinear Diffusions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32749", "id": "wLoESsgZIq", "proceeding": "https://proceedings.mlr.press/v235/singhal24a.html", "pdf": "https://openreview.net/pdf?id=wLoESsgZIq", "openreview": "https://openreview.net/forum?id=wLoESsgZIq", "author_site": "Raghav Singhal, Mark Goldstein, Rajesh Ranganath", "tldr": "", "abstract": "Reversing a diffusion process by learning its score forms the heart of diffusion-based generative modeling and of estimating properties of scientific systems. The diffusion processes that are tractable center on linear processes with a Gaussian stationary distribution, limiting the kinds of models that can be built to those that target a Gaussian prior and, more generally, limiting the kinds of problems that can be generically solved to those that have conditionally linear score functions. In this work, we introduce a family of tractable denoising score matching objectives, called local-DSM, built using local increments of the diffusion process. We show how local-DSM melded with Taylor expansions enables automated training and score estimation with nonlinear diffusion processes. To demonstrate these ideas, we use automated-DSM to train generative models using non-Gaussian priors on challenging low-dimensional distributions and the CIFAR10 image dataset. Additionally, we use automated-DSM to learn the scores for nonlinear processes studied in statistical physics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Raghav Singhal;Mark Goldstein;Rajesh Ranganath", "authorids": "~Raghav_Singhal1;~Mark_Goldstein1;~Rajesh_Ranganath2", "gender": ";M;", "homepage": ";https://cims.nyu.edu/~mg3479/;", "dblp": ";;97/7057", "google_scholar": "8IWpqtcAAAAJ;https://scholar.google.fr/citations?hl=en;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Raghav_Singhal1;~Mark_Goldstein1;~Rajesh_Ranganath2", "aff": "New York University;Google;New York University", "aff_domain": "nyu.edu;google.com;nyu.edu", "position": "PhD student;Intern;Assistant Professor", "bibtex": "@inproceedings{\nsinghal2024whats,\ntitle={What{\\textquoteright}s the score? 
Automated Denoising Score Matching for Nonlinear Diffusions},\nauthor={Raghav Singhal and Mark Goldstein and Rajesh Ranganath},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wLoESsgZIq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6459123, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16786575050910471967&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "nyu.edu;google.com;nyu.edu", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "New York University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.nyu.edu;https://www.google.com", "aff_unique_abbr": "NYU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Minimax Optimality of Score-based Diffusion Models: Beyond the Density Lower Bound Assumptions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32748", "id": "wTd7dogTsB", "proceeding": "https://proceedings.mlr.press/v235/zhang24bv.html", "pdf": "https://openreview.net/pdf?id=wTd7dogTsB", "openreview": "https://openreview.net/forum?id=wTd7dogTsB", "author_site": "Kaihong Zhang, Heqi Yin, Feng Liang, Jingbo Liu", "tldr": "", "abstract": "We study the asymptotic error of score-based diffusion model sampling in large-sample scenarios from a non-parametric statistics perspective. We show that a kernel-based score estimator achieves an optimal mean square error of $\\widetilde{O}\\left(n^{-1} t^{-\\frac{d+2}{2}}(t^{\\frac{d}{2}} \\vee 1)\\right)$ for the score function of $p_0*\\mathcal{N}(0,t\\boldsymbol{I}_d)$, where $n$ and $d$ represent the sample size and the dimension, $t$ is bounded above and below by polynomials of $n$, and $p_0$ is an arbitrary sub-Gaussian distribution. As a consequence, this yields an $\\widetilde{O}\\left(n^{-1/2} t^{-\\frac{d}{4}}\\right)$ upper bound for the total variation error of the distribution of the sample generated by the diffusion model under a mere sub-Gaussian assumption. If in addition, $p_0$ belongs to the nonparametric family of the $\\beta$-Sobolev space with $\\beta\\le 2$, by adopting an early stopping strategy, we obtain that the diffusion model is nearly (up to log factors) minimax optimal. 
This removes the crucial lower bound assumption on $p_0$ in previous proofs of the minimax optimality of the diffusion model for nonparametric families.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaihong Zhang;Heqi Yin;Feng Liang;Jingbo Liu", "authorids": "~Kaihong_Zhang1;~Heqi_Yin1;~Feng_Liang1;~Jingbo_Liu3", "gender": "M;F;F;M", "homepage": ";https://caitlyn-yin.github.io//;https://publish.illinois.edu/liangf/;https://stat.illinois.edu/directory/profile/jingbol", "dblp": ";371/4284.html;54/6821;", "google_scholar": ";MBKRLlgAAAAJ;EzvMOjkAAAAJ;", "orcid": ";;;", "linkedin": "kaihong-zhang-39423b168/;;;", "or_profile": "~Kaihong_Zhang1;~Heqi_Yin1;~Feng_Liang1;~Jingbo_Liu3", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;;illinois.edu", "position": "PhD student;Undergrad student;;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024minimax,\ntitle={Minimax Optimality of Score-based Diffusion Models: Beyond the Density Lower Bound Assumptions},\nauthor={Kaihong Zhang and Heqi Yin and Feng Liang and Jingbo Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wTd7dogTsB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 632582, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9065288989708542639&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "illinois.edu;illinois.edu;;illinois.edu", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "An Interpretable Evaluation of Entropy-based Novelty of Generative Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32747", "id": "wUgTnf918v", "proceeding": "https://proceedings.mlr.press/v235/zhang24ac.html", "pdf": "https://openreview.net/pdf?id=wUgTnf918v", "openreview": "https://openreview.net/forum?id=wUgTnf918v", "author_site": "Jingwei Zhang, Cheuk Ting Li, Farzan Farnia", "tldr": "", "abstract": "The rapid development of generative model frameworks requires principled methods for the evaluation of a model's novelty compared to a reference dataset. While the literature has extensively studied the evaluation of the quality, diversity, and generalizability of generative models, the assessment of a model's novelty compared to a reference model has not been adequately explored in the machine learning community. In this work, we focus on the novelty assessment for multi-modal distributions and attempt to address the following differential clustering task: Given samples of a generative model $P_\\mathcal{G}$ and a reference model $P_\\mathrm{ref}$, how can we discover the sample types expressed by $P_\\mathcal{G}$ more frequently than in $P_\\mathrm{ref}$? We introduce a spectral approach to the differential clustering task and propose the Kernel-based Entropic Novelty (KEN) score to quantify the mode-based novelty of $P_\\mathcal{G}$ with respect to $P_\\mathrm{ref}$. 
We analyze the KEN score for mixture distributions with well-separable components and develop a kernel-based method to compute the KEN score from empirical data. We support the KEN framework by presenting numerical results on synthetic and real image datasets, indicating the framework's effectiveness in detecting novel modes and comparing generative models. The paper's code is available at: github.com/buyeah1109/KEN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingwei Zhang;Cheuk Ting Li;Farzan Farnia", "authorids": "~Jingwei_Zhang9;~Cheuk_Ting_Li1;~Farzan_Farnia1", "gender": "M;M;M", "homepage": ";https://www.ie.cuhk.edu.hk/people/ctli.shtml;https://www.cse.cuhk.edu.hk/~farnia/", "dblp": ";120/7097;132/7757", "google_scholar": ";;GYPCqcYAAAAJ", "orcid": ";;0000-0002-6049-9232", "linkedin": "anthonzhang/;;farzan-farnia-00798335", "or_profile": "~Jingwei_Zhang9;~Cheuk_Ting_Li1;~Farzan_Farnia1", "aff": "The Chinese University of Hong Kong;The Chinese University of Hong Kong;The Chinese University of Hong Kong", "aff_domain": "cse.cuhk.edu.hk;cuhk.edu.hk;cuhk.edu.hk", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024an,\ntitle={An Interpretable Evaluation of Entropy-based Novelty of Generative Models},\nauthor={Jingwei Zhang and Cheuk Ting Li and Farzan Farnia},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wUgTnf918v}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9240567, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18299548087026981152&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "cse.cuhk.edu.hk;cuhk.edu.hk;cuhk.edu.hk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Improving Equivariant Graph Neural Networks on Large Geometric Graphs via Virtual Nodes Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32746", "id": "wWdkNkUY8k", "proceeding": "https://proceedings.mlr.press/v235/zhang24f.html", "pdf": "https://openreview.net/pdf?id=wWdkNkUY8k", "openreview": "https://openreview.net/forum?id=wWdkNkUY8k", "author_site": "Yuelin Zhang, Jiacheng Cen, Jiaqi Han, Zhiqiang Zhang, JUN ZHOU, Wenbing Huang", "tldr": "", "abstract": "Equivariant Graph Neural Networks (GNNs) have achieved remarkable success in a variety of scientific applications. However, existing equivariant GNNs encounter efficiency issues on large geometric graphs and perform poorly if the input is reduced to a sparse local graph for speed acceleration. In this paper, we propose FastEGNN, an enhanced model of equivariant GNNs on large geometric graphs. The central idea is to leverage a small ordered set of virtual nodes to approximate the large unordered graph of real nodes. In particular, we distinguish the message passing and aggregation for different virtual nodes to encourage their mutual distinctiveness, and minimize the Maximum Mean Discrepancy (MMD) between virtual and real coordinates to realize global distributedness. 
FastEGNN satisfies all necessary E(3) symmetries and offers certain universal expressivity assurances as well. Our experiments on N-body systems (100 nodes), proteins (800 nodes) and water-3D (8000 nodes) demonstrate that FastEGNN achieves a promising balance between accuracy and efficiency, and outperforms EGNN in accuracy even after dropping all edges in real systems like proteins and water-3D.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuelin Zhang;Jiacheng Cen;Jiaqi Han;Zhiqiang Zhang;JUN ZHOU;Wenbing Huang", "authorids": "~Yuelin_Zhang2;~Jiacheng_Cen1;~Jiaqi_Han2;~Zhiqiang_Zhang4;~JUN_ZHOU6;~Wenbing_Huang1", "gender": "M;;M;M;M;M", "homepage": "https://github.com/dhcpack/;;https://hanjq17.github.io;;https://scholar.google.com/citations?user=mCVvloEAAAAJ&hl=en;https://gsai.ruc.edu.cn/english/wenbing_huang", "dblp": ";;235/0412;67/2010-12;99/3847-11;155/3181-1.html", "google_scholar": ";;AKppgMAAAAAJ;TMx0g8kAAAAJ;mCVvloEAAAAJ;0yNkmO4AAAAJ", "orcid": ";;;0000-0002-2321-7259;0000-0001-6033-6102;", "linkedin": ";;;;;", "or_profile": "~Yuelin_Zhang2;~Jiacheng_Cen1;~Jiaqi_Han2;~Zhiqiang_Zhang4;~JUN_ZHOU6;~Wenbing_Huang1", "aff": "Beihang University;;Computer Science Department, Stanford University;Ant Group;Ant Group;Renmin University of China", "aff_domain": "buaa.edu.cn;;cs.stanford.edu;antfin.com;antgroup.com;ruc.edu.cn", "position": "Undergrad student;;PhD student;Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nzhang2024improving,\ntitle={Improving Equivariant Graph Neural Networks on Large Geometric Graphs via Virtual Nodes Learning},\nauthor={Yuelin Zhang and Jiacheng Cen and Jiaqi Han and Zhiqiang Zhang and JUN ZHOU and Wenbing Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wWdkNkUY8k}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3975211, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6091557263679405123&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "buaa.edu.cn;;cs.stanford.edu;antfin.com;antgroup.com;ruc.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "Beihang University;Stanford University;Ant Group;Renmin University of China", "aff_unique_dep": ";Computer Science Department;;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.stanford.edu;https://www.antgroup.com;http://www.ruc.edu.cn", "aff_unique_abbr": "BUAA;Stanford;Ant Group;RUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Feature Reuse and Scaling: Understanding Transfer Learning with Protein Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32745", "id": "wdTiuvd0fR", "proceeding": "https://proceedings.mlr.press/v235/li24a.html", "pdf": "https://openreview.net/pdf?id=wdTiuvd0fR", "openreview": "https://openreview.net/forum?id=wdTiuvd0fR", "author_site": "Francesca-Zhoufan Li, Ava Amini, Yisong Yue, Kevin Yang, Alex Lu", "tldr": "", "abstract": "Large pretrained protein language models (PLMs) have improved protein property and structure prediction from sequences via transfer learning, in which weights and representations from PLMs are repurposed for downstream tasks. 
Although PLMs have shown great promise, currently there is little understanding of how the features learned by pretraining relate to and are useful for downstream tasks. We perform a systematic analysis of transfer learning using PLMs, conducting 370 experiments across a comprehensive suite of factors including different downstream tasks, architectures, model sizes, model depths, and pretraining time. We observe that while almost all downstream tasks do benefit from pretrained models compared to naive sequence representations, for the majority of tasks performance does not scale with pretraining, and instead relies on low-level features learned early in pretraining. Our results point to a mismatch between current PLM pretraining paradigms and most applications of these models, indicating a need for better pretraining methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesca-Zhoufan Li;Ava P Amini;Yisong Yue;Kevin K Yang;Alex Xijie Lu", "authorids": "~Francesca-Zhoufan_Li1;~Ava_P_Amini1;~Yisong_Yue1;~Kevin_K_Yang1;~Alex_Xijie_Lu1", "gender": ";M;;M;F", "homepage": "https://www.francescazfl.com/;http://www.yisongyue.com;;http://alexluresearch.com/;https://avaamini.com/", "dblp": ";28/1244;216/0400;;", "google_scholar": "W0DR9UMAAAAJ;tEk4qo8AAAAJ;mq-Vzk8AAAAJ;https://scholar.google.ca/citations?user=gz7gLggAAAAJ;w_wosd4AAAAJ", "orcid": "0000-0002-5710-9512;0000-0001-9127-1989;;0000-0001-9568-3155;0000-0002-8601-6040", "linkedin": "francescazhoufanli/;yisongyue/;;;", "or_profile": "~Francesca-Zhoufan_Li1;~Yisong_Yue1;~Kevin_K_Yang1;~Alex_Xijie_Lu1;~Ava_Soleimany1", "aff": "California Institute of Technology;California Institute of Technology;;Microsoft Research;Microsoft", "aff_domain": "caltech.edu;caltech.edu;;microsoft.com;microsoft.com", "position": "PhD student;Full Professor;;Senior Researcher;Researcher", "bibtex": "@inproceedings{\nli2024feature,\ntitle={Feature Reuse and Scaling: Understanding Transfer Learning with Protein Language Models},\nauthor={Francesca-Zhoufan Li and Ava P Amini and Yisong Yue and Kevin K Yang and Alex Xijie Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wdTiuvd0fR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2156830, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18392408365299947043&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 8, "email": "caltech.edu;caltech.edu;;microsoft.com;microsoft.com", "author_num": 5, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "California Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.caltech.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Caltech;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Perfect Alignment May be Poisonous to Graph Contrastive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32744", "id": "wdezvnc9EG", "proceeding": "https://proceedings.mlr.press/v235/liu24bd.html", "pdf": "https://openreview.net/pdf?id=wdezvnc9EG", "openreview": "https://openreview.net/forum?id=wdezvnc9EG", "author_site": "Jingyu Liu, Huayi Tang, Yong Liu", "tldr": "", "abstract": "Graph Contrastive Learning (GCL) aims to learn node representations by 
aligning positive pairs and separating negative ones. However, few researchers have focused on the inner laws behind the specific augmentations used in graph-based learning: what kind of augmentation helps downstream performance, how contrastive learning actually influences downstream tasks, and why the magnitude of augmentation matters so much. This paper seeks to address these questions by establishing a connection between augmentation and downstream performance. Our findings reveal that GCL contributes to downstream tasks mainly by separating different classes rather than gathering nodes of the same class, so perfect alignment and augmentation overlap, which make all intra-class samples identical, cannot fully explain the success of contrastive learning. Therefore, to understand how augmentation aids the contrastive learning process, we conduct further investigations into generalization, finding that perfect alignment, which makes positive pairs identical, can help the contrastive loss but is poisonous to generalization; as a result, perfect alignment may not lead to the best downstream performance, and specifically designed augmentation is needed to achieve an appropriate level of alignment and improve downstream accuracy. We further analyze these results through information theory and graph spectral theory, and propose two simple but effective methods to verify the theory. The two methods can easily be applied to various GCL algorithms, and extensive experiments are conducted to prove their effectiveness. The code is available at https://github.com/somebodyhh1/GRACEIS", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jingyu Liu;Huayi Tang;Yong Liu", "authorids": "~Jingyu_Liu4;~Huayi_Tang1;~Yong_Liu7", "gender": "M;;M", "homepage": "https://github.com/somebodyhh1;;https://iie-liuyong.github.io", "dblp": ";;29/4867-18", "google_scholar": ";;vVhmzbAAAAAJ", "orcid": ";;0000-0002-6739-621X", "linkedin": ";;", "or_profile": "~Jingyu_Liu4;~Huayi_Tang1;~Yong_Liu7", "aff": "Renmin University of China;;Renmin University of China", "aff_domain": "ruc.edu.cn;;ruc.edu.cn", "position": "PhD student;;Associate Professor", "bibtex": "@inproceedings{\nliu2024perfect,\ntitle={Perfect Alignment May be Poisonous to Graph Contrastive Learning},\nauthor={Jingyu Liu and Huayi Tang and Yong Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wdezvnc9EG}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7295876, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12307682829642070275&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ruc.edu.cn;;ruc.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Incremental Topological Ordering and Cycle Detection with Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32743", "id": "wea7nsJdMc", "proceeding": "https://proceedings.mlr.press/v235/mccauley24a.html", "pdf": "https://openreview.net/pdf?id=wea7nsJdMc", "openreview": "https://openreview.net/forum?id=wea7nsJdMc", "author_site": "Samuel McCauley, Benjamin Moseley, Aidin 
Niaparast, Shikha Singh", "tldr": "", "abstract": "This paper leverages the framework of algorithms-with-predictions to design data structures for two fundamental dynamic graph problems: incremental topological ordering and cycle detection. In these problems, the input is a directed graph on $n$ nodes, and the $m$ edges arrive one by one. The data structure must maintain a topological ordering of the vertices at all times and detect if the newly inserted edge creates a cycle. The theoretically best worst-case algorithms for these problems have high update cost (polynomial in $n$ and $m$). In practice, greedy heuristics (that recompute the solution from scratch each time) perform well but can have high update cost in the worst case. In this paper, we bridge this gap by leveraging predictions to design a new learned data structure for these problems. Our data structure guarantees consistency, robustness, and smoothness with respect to predictions---that is, it has the best possible running time under perfect predictions, never performs worse than the best-known worst-case methods, and its running time degrades smoothly with the prediction error. Moreover, we demonstrate empirically that predictions, learned from a very small training dataset, are sufficient to provide significant speed-ups on real datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuel McCauley;Benjamin Moseley;Aidin Niaparast;Shikha Singh", "authorids": "~Samuel_McCauley1;~Benjamin_Moseley1;~Aidin_Niaparast1;~Shikha_Singh2", "gender": ";M;;F", "homepage": "http://dept.cs.williams.edu/~sam/;http://www.andrew.cmu.edu/user/moseleyb/;;https://www.cs.williams.edu/~shikha/", "dblp": "09/11461;28/5638;;124/3768-2", "google_scholar": ";qq-SXN8AAAAJ;;0aLpDg4AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Samuel_McCauley1;~Benjamin_Moseley1;~Aidin_Niaparast1;~Shikha_Singh2", "aff": "Williams College;RelationalAI;;Williams College", "aff_domain": "cs.williams.edu;relational.ai;;williams.edu", "position": "Assistant Professor;Researcher;;Assistant Professor", "bibtex": "@inproceedings{\nmccauley2024incremental,\ntitle={Incremental Topological Ordering and Cycle Detection with Predictions},\nauthor={Samuel McCauley and Benjamin Moseley and Aidin Niaparast and Shikha Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wea7nsJdMc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 513125, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10770760228043128436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "cs.williams.edu;relational.ai;;williams.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Williams College;RelationalAI", "aff_unique_dep": ";", "aff_unique_url": "https://www.williams.edu;https://www.relationalai.com", "aff_unique_abbr": "Williams;RelationalAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On The Fairness Impacts of Hardware Selection in Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32742", "id": "weixEb6Wjd", "proceeding": "https://proceedings.mlr.press/v235/nelaturu24a.html", "pdf": "https://openreview.net/pdf?id=weixEb6Wjd", "openreview": "https://openreview.net/forum?id=weixEb6Wjd", 
"author_site": "Sree Harsha Nelaturu, Nishaanth Kanna, Cuong Tran, Sara Hooker, Ferdinando Fioretto", "tldr": "", "abstract": "In the machine learning ecosystem, hardware selection is often regarded as a mere utility, overshadowed by the spotlight on algorithms and data. This is especially relevant in contexts like ML-as-a-service platforms, where users often lack control over the hardware used for model deployment. This paper investigates the influence of hardware on the delicate balance between model performance and fairness. We demonstrate that hardware choices can exacerbate existing disparities, attributing these discrepancies to variations in gradient flows and loss surfaces across different demographic groups. Through both theoretical and empirical analysis, the paper not only identifies the underlying factors but also proposes an effective strategy for mitigating hardware-induced performance imbalances.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sree Harsha Nelaturu;Nishaanth Kanna Ravichandran;Cuong Tran;Sara Hooker;Ferdinando Fioretto", "authorids": "~Sree_Harsha_Nelaturu1;~Nishaanth_Kanna_Ravichandran1;~Cuong_Tran1;~Sara_Hooker2;~Ferdinando_Fioretto1", "gender": "M;M;M;M;", "homepage": ";;;http://nandofioretto.com;https://www.sarahooker.me/", "dblp": "247/1104;360/4955;275/3885;119/6404;210/2611", "google_scholar": "5OQze6gAAAAJ;vT3LDgwAAAAJ;RiYBF7sAAAAJ;ASf9Q04AAAAJ;2xy6h3sAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Sree_Harsha_Nelaturu1;~Nishaanth_Kanna_Ravichandran1;~Cuong_Tran1;~Ferdinando_Fioretto1;~Sara_Hooker1", "aff": "CISPA Helmholtz Center for Information Security;;University of Virginia, Charlottesville;University of Virginia, Charlottesville;Cohere For AI", "aff_domain": "cispa.de;;virginia.edu;virginia.edu;cohere.com", "position": "Hiwi;;Postdoc;Assistant Professor;Principal Researcher", "bibtex": "@inproceedings{\nnelaturu2024on,\ntitle={On The Fairness Impacts of Hardware Selection in Machine Learning},\nauthor={Sree Harsha Nelaturu and Nishaanth Kanna Ravichandran and Cuong Tran and Sara Hooker and Ferdinando Fioretto},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=weixEb6Wjd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 760293, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9038882036785094816&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cispa.de;;virginia.edu;virginia.edu;cohere.com", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "CISPA Helmholtz Center for Information Security;University of Virginia;Cohere", "aff_unique_dep": ";;Cohere AI", "aff_unique_url": "https://www.cispa.de/;https://www.virginia.edu;https://cohere.ai", "aff_unique_abbr": "CISPA;UVA;Cohere", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Charlottesville", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Germany;United States" }, { "title": "InterLUDE: Interactions between Labeled and Unlabeled Data to Enhance Semi-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32741", "id": "wilej5VnqL", "proceeding": "https://proceedings.mlr.press/v235/huang24af.html", "pdf": "https://openreview.net/pdf?id=wilej5VnqL", "openreview": "https://openreview.net/forum?id=wilej5VnqL", "author_site": "Zhe Huang, Xiaowei Yu, Dajiang Zhu, Michael 
Hughes", "tldr": "", "abstract": "Semi-supervised learning (SSL) seeks to enhance task performance by training on both labeled and unlabeled data. Mainstream SSL image classification methods mostly optimize a loss that additively combines a supervised classification objective with a regularization term derived *solely* from unlabeled data. This formulation often neglects the potential for interaction between labeled and unlabeled images. In this paper, we introduce InterLUDE, a new approach to enhance SSL made of two parts that each benefit from labeled-unlabeled interaction. The first part, embedding fusion, interpolates between labeled and unlabeled embeddings to improve representation learning. The second part is a new loss, grounded in the principle of consistency regularization, that aims to minimize discrepancies in the model's predictions between labeled versus unlabeled inputs. Experiments on standard closed-set SSL benchmarks and a medical SSL task with an uncurated unlabeled set show clear benefits to our approach. On the STL-10 dataset with only 40 labels, InterLUDE achieves **3.2%** error rate, while the best previous method reports 6.3%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhe Huang;Xiaowei Yu;Dajiang Zhu;Michael C Hughes", "authorids": "~Zhe_Huang2;~Xiaowei_Yu1;~Dajiang_Zhu1;~Michael_C_Hughes1", "gender": "M;M;M;M", "homepage": "https://hzhz2020.github.io/;http://shawey94.github.io/;https://mentis.uta.edu/explore/profile/dajiang-zhu;https://www.michaelchughes.com", "dblp": "30/8073;;https://dblp.uni-trier.de/pers/hd/z/Zhu:Dajiang;117/8186", "google_scholar": "https://scholar.google.com/citations?hl=en;Kc1FjToAAAAJ;cFgudIYAAAAJ;https://scholar.google.ca/citations?user=ugSmcnoAAAAJ", "orcid": ";;;", "linkedin": "zhe-huang-7aa065113/;shawey94;;", "or_profile": "~Zhe_Huang2;~Xiaowei_Yu1;~Dajiang_Zhu1;~Michael_C_Hughes1", "aff": "Tufts University;University of Texas at Arlington, University of Texas at Arlington;University of Texas at Arlington;Tufts University", "aff_domain": "tufts.edu;mavs.uta.edu;uta.edu;tufts.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024interlude,\ntitle={Inter{LUDE}: Interactions between Labeled and Unlabeled Data to Enhance Semi-Supervised Learning},\nauthor={Zhe Huang and Xiaowei Yu and Dajiang Zhu and Michael C Hughes},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wilej5VnqL}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 593406, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4416376335456877513&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "tufts.edu;mavs.uta.edu;uta.edu;tufts.edu", "author_num": 4, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Tufts University;University of Texas at Arlington", "aff_unique_dep": ";", "aff_unique_url": "https://www.tufts.edu;https://www.uta.edu", "aff_unique_abbr": "Tufts;UTA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Arlington", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Joint Composite Latent Space Bayesian Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32740", "id": "wkCUmO7oi2", "proceeding": "https://proceedings.mlr.press/v235/maus24a.html", "pdf": 
"https://openreview.net/pdf?id=wkCUmO7oi2", "openreview": "https://openreview.net/forum?id=wkCUmO7oi2", "author_site": "Natalie Maus, Zhiyuan Jerry Lin, Maximilian Balandat, Eytan Bakshy", "tldr": "", "abstract": "Bayesian Optimization (BO) is a technique for sample-efficient black-box optimization that employs probabilistic models to identify promising input for evaluation. When dealing with composite-structured functions, such as $f=g \\circ h$, evaluating a specific location $x$ yields observations of both the final outcome $f(x) = g(h(x))$ as well as the intermediate output(s) $h(x)$. Previous research has shown that integrating information from these intermediate outputs can enhance BO performance substantially. However, existing methods struggle if the outputs $h(x)$ are high-dimensional. Many relevant problems fall into this setting, including in the context of generative AI, molecular design, or robotics. To effectively tackle these challenges, we introduce Joint Composite Latent Space Bayesian Optimization (JoCo), a novel framework that jointly trains neural network encoders and probabilistic models to adaptively compress high-dimensional input and output spaces into manageable latent representations. This enables effective BO on these compressed representations, allowing JoCo to outperform other state-of-the-art methods in high-dimensional BO on a wide variety of simulated and real-world problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Natalie Maus;Zhiyuan Jerry Lin;Maximilian Balandat;Eytan Bakshy", "authorids": "~Natalie_Maus1;~Zhiyuan_Jerry_Lin1;~Maximilian_Balandat1;~Eytan_Bakshy1", "gender": "F;M;;M", "homepage": "https://sites.google.com/seas.upenn.edu/natalie-maus/;https://itsmrlin.com/;https://research.facebook.com/people/balandat-max/;http://eytan.github.io", "dblp": "264/7932;132/1565-1.html;41/9185;58/2226", "google_scholar": "hNRd6lsAAAAJ;ajmk9mAAAAAJ;N0iLicUAAAAJ;8y9rrq0AAAAJ", "orcid": ";0000-0003-3739-769X;0000-0002-8214-8935;", "linkedin": "natalie-maus-14b936178/;;maximilian-balandat-b5843946/;", "or_profile": "~Natalie_Maus1;~Zhiyuan_Jerry_Lin1;~Maximilian_Balandat1;~Eytan_Bakshy1", "aff": "University of Pennsylvania;Meta;Meta;Meta", "aff_domain": "upenn.edu;meta.com;meta.com;meta.com", "position": "PhD student;Research Scientist;Research Scientist Manager;Principal Researcher", "bibtex": "@inproceedings{\nmaus2024joint,\ntitle={Joint Composite Latent Space Bayesian Optimization},\nauthor={Natalie Maus and Zhiyuan Jerry Lin and Maximilian Balandat and Eytan Bakshy},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wkCUmO7oi2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5790318, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14574369048649139618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "upenn.edu;meta.com;meta.com;meta.com", "author_num": 4, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Pennsylvania;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.upenn.edu;https://meta.com", "aff_unique_abbr": "UPenn;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Better Locally Private Sparse Estimation Given Multiple Samples Per User", "status": "Poster", "track": 
"main", "site": "https://icml.cc/virtual/2024/poster/32739", "id": "wlBtHP8KqS", "proceeding": "https://proceedings.mlr.press/v235/ma24c.html", "pdf": "https://openreview.net/pdf?id=wlBtHP8KqS", "openreview": "https://openreview.net/forum?id=wlBtHP8KqS", "author_site": "Yuheng Ma, Ke Jia, Hanfang Yang", "tldr": "", "abstract": "Previous studies yielded discouraging results for item-level locally differentially private linear regression with $s$-sparsity assumption, where the minimax rate for $nm$ samples is $\\mathcal{O}(sd / nm\\varepsilon^2)$. This can be challenging for high-dimensional data, where the dimension $d$ is extremely large. In this work, we investigate user-level locally differentially private sparse linear regression. We show that with $n$ users each contributing $m$ samples, the linear dependency of dimension $d$ can be eliminated, yielding an error upper bound of $\\mathcal{O}(s/ nm\\varepsilon^2)$. We propose a framework that first selects candidate variables and then conducts estimation in the narrowed low-dimensional space, which is extendable to general sparse estimation problems with tight error bounds. Experiments on both synthetic and real datasets demonstrate the superiority of the proposed methods. Both the theoretical and empirical results suggest that, with the same number of samples, locally private sparse estimation is better conducted when multiple samples per user are available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuheng Ma;Ke Jia;Hanfang Yang", "authorids": "~Yuheng_Ma1;~Ke_Jia1;~Hanfang_Yang2", "gender": "M;F;M", "homepage": "https://karlmyh.github.io/;;http://stat.ruc.edu.cn/en/teacher_more.php?cid=89248&id=40", "dblp": "258/0645-1.html;;", "google_scholar": "JvMlW0gAAAAJ;;EsSjDdAAAAAJ", "orcid": ";0009-0008-1110-8670;", "linkedin": ";;", "or_profile": "~Yuheng_Ma1;~Ke_Jia1;~Hanfang_Yang2", "aff": "Mohamed bin Zayed University of Artificial Intelligence;Renmin University of China;Renmin University of China", "aff_domain": "mbzuai.ac.ae;ruc.edu.cn;ruc.edu.cn", "position": "PhD student;MS student;Associate Professor", "bibtex": "@inproceedings{\nma2024better,\ntitle={Better Locally Private Sparse Estimation Given Multiple Samples Per User},\nauthor={Yuheng Ma and Ke Jia and Hanfang Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wlBtHP8KqS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1529311, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10263036435012952237&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 6, "email": "mbzuai.ac.ae;ruc.edu.cn;ruc.edu.cn", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Renmin University of China", "aff_unique_dep": ";", "aff_unique_url": "https://mbzuai.ac.ae;http://www.ruc.edu.cn", "aff_unique_abbr": "MBZUAI;RUC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Arab Emirates;China" }, { "title": "The Good, The Bad, and Why: Unveiling Emotions in Generative AI", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32738", "id": "wlOaG9g0uq", "proceeding": "https://proceedings.mlr.press/v235/li24bs.html", "pdf": "https://openreview.net/pdf?id=wlOaG9g0uq", "openreview": 
"https://openreview.net/forum?id=wlOaG9g0uq", "author_site": "CHENG LI, Jindong Wang, Yixuan Zhang, Kaijie Zhu, Xinyi Wang, Wenxin Hou, Jianxun Lian, Fang Luo, Qiang Yang, Xing Xie", "tldr": "", "abstract": "Emotion significantly impacts our daily behaviors and interactions. While recent generative AI models, such as large language models, have shown impressive performance in various tasks, it remains unclear whether they truly comprehend emotions and why. This paper aims to address this gap by incorporating psychological theories to gain a holistic understanding of emotions in generative AI models. Specifically, we propose three approaches: 1) EmotionPrompt to enhance AI model performance, 2) EmotionAttack to impair AI model performance, and 3) EmotionDecode to explain the effects of emotional stimuli, both benign and malignant. Through extensive experiments involving language and multi-modal models on semantic understanding, logical reasoning, and generation tasks, we demonstrate that both textual and visual EmotionPrompt can boost the performance of AI models while EmotionAttack can hinder it. More importantly, EmotionDecode reveals that AI models can comprehend emotional stimuli akin to the mechanism of dopamine in the human brain. Our work heralds a novel avenue for exploring psychology to enhance our understanding of generative AI models, thus boosting the research and development of human-AI collaboration and mitigating potential risks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "CHENG LI;Jindong Wang;Yixuan Zhang;Kaijie Zhu;Xinyi Wang;Wenxin Hou;Jianxun Lian;Fang Luo;Qiang Yang;Xing Xie", "authorids": "~CHENG_LI26;~Jindong_Wang1;~Yixuan_Zhang7;~Kaijie_Zhu1;wxymmhww86@163.com;~Wenxin_Hou1;~Jianxun_Lian1;luof@bnu.edu.cn;~Qiang_Yang1;~Xing_Xie3", "gender": ";;F;M;;M;M;;;M", "homepage": "https://scholar.google.com/citations?user=083GCIwAAAAJ&hl=zh-CN;;https://zjanice.github.io/;https://github.com/Immortalise;;https://houwx.net/;https://www.microsoft.com/en-us/research/people/jialia/;;;http://research.microsoft.com/en-us/people/xingx/", "dblp": ";;;56/7058;;270/4628;161/0030;;;08/6809-1", "google_scholar": "083GCIwAAAAJ;;NJJJ45AAAAAJ;;;https://scholar.google.co.jp/citations?user=EbqaLAEAAAAJ;tSq7dIkAAAAJ;;;5EQfAFIAAAAJ", "orcid": ";;;;;;0000-0003-3108-5601;;;0000-0002-8608-8482", "linkedin": ";;;;;;;;;xingx/", "or_profile": "~CHENG_LI26;~Jindong_Wang1;~Yixuan_Zhang7;~Kaijie_Zhu1;wxymmhww86@163.com;~Wenxin_Hou1;~Jianxun_Lian1;luof@bnu.edu.cn;~Qiang_Yang1;~Xing_Xie3", "aff": "Department of Computer Science, University of Washington;;College of William and Mary;Institute of automation, Chinese Academy of Sciences;;Microsoft;Microsoft Research;;;Microsoft Research Asia", "aff_domain": "cs.washington.edu;;wm.edu;ia.ac.cn;;microsoft.com;microsoft.com;;;microsoft.com", "position": "Intern;;Assistant Professor;MS student;;Applied Scientist;Researcher;;;Senior Principal Researcher", "bibtex": "@inproceedings{\nli2024the,\ntitle={The Good, The Bad, and Why: Unveiling Emotions in Generative {AI}},\nauthor={CHENG LI and Jindong Wang and Yixuan Zhang and Kaijie Zhu and Xinyi Wang and Wenxin Hou and Jianxun Lian and Fang Luo and Qiang Yang and Xing Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wlOaG9g0uq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1032102, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 16, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=5558592794701444423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "cs.washington.edu;;wm.edu;ia.ac.cn;;microsoft.com;microsoft.com;;;microsoft.com", "author_num": 10, "aff_unique_index": "0;1;2;3;3;3", "aff_unique_norm": "University of Washington;College of William and Mary;Chinese Academy of Sciences;Microsoft", "aff_unique_dep": "Department of Computer Science;;Institute of Automation;Microsoft Corporation", "aff_unique_url": "https://www.washington.edu;https://www.wm.edu;http://www.ia.cas.cn;https://www.microsoft.com", "aff_unique_abbr": "UW;WM;CAS;Microsoft", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Seattle;;Asia", "aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Matrix Information Theory for Self-Supervised Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32737", "id": "wleAlsklEh", "proceeding": "https://proceedings.mlr.press/v235/zhang24bi.html", "pdf": "https://openreview.net/pdf?id=wleAlsklEh", "openreview": "https://openreview.net/forum?id=wleAlsklEh", "author_site": "Yifan Zhang, Zhiquan Tan, Jingqin Yang, Weiran Huang, Yang Yuan", "tldr": "", "abstract": "The maximum entropy encoding framework provides a unified perspective for many non-contrastive learning methods like SimSiam, Barlow Twins, and MEC. Inspired by this framework, we introduce Matrix-SSL, a novel approach that leverages matrix information theory to interpret the maximum entropy encoding loss as matrix uniformity loss. Furthermore, Matrix-SSL enhances the maximum entropy encoding method by seamlessly incorporating matrix alignment loss, directly aligning covariance matrices in different branches. Experimental results reveal that Matrix-SSL outperforms state-of-the-art methods on the ImageNet dataset under linear evaluation settings and on MS-COCO for transfer learning tasks. Specifically, when performing transfer learning tasks on MS-COCO, our method outperforms previous SOTA methods such as MoCo v2 and BYOL up to 3.3% with only 400 epochs compared to 800 epochs pre-training. 
We also try to introduce representation learning into the language modeling regime by fine-tuning a 7B model using matrix cross-entropy loss, with a margin of 3.1% on the GSM8K dataset over the standard cross-entropy loss.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yifan Zhang;Zhiquan Tan;Jingqin Yang;Weiran Huang;Yang Yuan", "authorids": "~Yifan_Zhang16;~Zhiquan_Tan1;~Jingqin_Yang2;~Weiran_Huang1;~Yang_Yuan4", "gender": ";M;M;M;M", "homepage": ";;https://github.com/yjqqqaq;https://www.weiranhuang.com;http://people.iiis.tsinghua.edu.cn/~yuanyang/index.html", "dblp": ";326/0177;267/1955;170/0073-1;", "google_scholar": ";;;AjJ2rf8AAAAJ;", "orcid": ";;;;", "linkedin": ";https://www.linkedin.cn/incareer/in/ACoAAC1A8_QBFX8OlchWmVI_pNXN4zm_t6vPKCs;;;", "or_profile": "~Yifan_Zhang16;~Zhiquan_Tan1;~Jingqin_Yang2;~Weiran_Huang1;~Yang_Yuan4", "aff": ";Tsinghua University;Tsinghua University;Shanghai AI Laboratory;Tsinghua University", "aff_domain": ";tsinghua.edu.cn;tsinghua.edu.cn;pjlab.org.cn;tsinghua.edu.cn", "position": ";PhD student;PhD student;Consultant;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024matrix,\ntitle={Matrix Information Theory for Self-Supervised Learning},\nauthor={Yifan Zhang and Zhiquan Tan and Jingqin Yang and Weiran Huang and Yang Yuan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wleAlsklEh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1317093, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17075301280756353501&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": ";tsinghua.edu.cn;tsinghua.edu.cn;pjlab.org.cn;tsinghua.edu.cn", "author_num": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tsinghua University;Shanghai AI Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "THU;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Unsupervised Parameter-free Simplicial Representation Learning with Scattering Transforms", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32736", "id": "wmljUnbjy6", "proceeding": "https://proceedings.mlr.press/v235/madhu24a.html", "pdf": "https://openreview.net/pdf?id=wmljUnbjy6", "openreview": "https://openreview.net/forum?id=wmljUnbjy6", "author_site": "Hiren Madhu, Sravanthi Gurugubelli, Sundeep Prabhakar Chepuri", "tldr": "", "abstract": "Simplicial neural network models are becoming popular for processing and analyzing higher-order graph data, but they suffer from high training complexity and dependence on task-specific labels. To address these challenges, we propose simplicial scattering networks (SSNs), a parameter-free model inspired by scattering transforms designed to extract task-agnostic features from simplicial complex data without labels in a principled manner. Specifically, we propose a simplicial scattering transform based on random walk matrices for various adjacencies underlying a simplicial complex. We then use the simplicial scattering transform to construct a deep filter bank network that captures high-frequency information at multiple scales. 
The proposed simplicial scattering transform possesses properties such as permutation invariance, robustness to perturbations, and expressivity. We theoretically prove that including higher-order information improves the robustness of SSNs to perturbations. Empirical evaluations demonstrate that SSNs outperform existing simplicial or graph neural models in many tasks like node classification, simplicial closure, graph classification, trajectory prediction, and simplex prediction while being computationally efficient.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hiren Madhu;Sravanthi Gurugubelli;Sundeep Prabhakar Chepuri", "authorids": "~Hiren_Madhu1;~Sravanthi_Gurugubelli1;~Sundeep_Prabhakar_Chepuri1", "gender": "M;F;M", "homepage": "http://hirenmadhu.github.io;;https://ece.iisc.ac.in/~spchepuri/", "dblp": ";;72/10237.html", "google_scholar": "Bt8Q-x0AAAAJ;DXyvmJsAAAAJ;Gu8FjdwAAAAJ", "orcid": "0000-0002-6701-6782;;", "linkedin": "hiren-madhu/;;", "or_profile": "~Hiren_Madhu1;~Sravanthi_Gurugubelli1;~Sundeep_Prabhakar_Chepuri1", "aff": "Yale University;Indian Institute of Science, Bangalore;Indian Institute of Science", "aff_domain": "yale.edu;iisc.ac.in;iisc.ac.in", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nmadhu2024unsupervised,\ntitle={Unsupervised Parameter-free Simplicial Representation Learning with Scattering Transforms},\nauthor={Hiren Madhu and Sravanthi Gurugubelli and Sundeep Prabhakar Chepuri},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wmljUnbjy6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1211956, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16774386663299217048&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "yale.edu;iisc.ac.in;iisc.ac.in", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Yale University;Indian Institute of Science", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.iisc.ac.in", "aff_unique_abbr": "Yale;IISc", "aff_campus_unique_index": "1", "aff_campus_unique": ";Bangalore", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;India" }, { "title": "Attention Meets Post-hoc Interpretability: A Mathematical Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32735", "id": "wnkC5T11Z9", "proceeding": "https://proceedings.mlr.press/v235/lopardo24a.html", "pdf": "https://openreview.net/pdf?id=wnkC5T11Z9", "openreview": "https://openreview.net/forum?id=wnkC5T11Z9", "author_site": "Gianluigi Lopardo, Frederic Precioso, Damien Garreau", "tldr": "", "abstract": "Attention-based architectures, in particular transformers, are at the heart of a technological revolution. Interestingly, in addition to helping obtain state-of-the-art results on a wide range of applications, the attention mechanism intrinsically provides meaningful insights on the internal behavior of the model. Can these insights be used as explanations? Debate rages on. In this paper, we mathematically study a simple attention-based architecture and pinpoint the differences between post-hoc and attention-based explanations. 
We show that they provide quite different results, and that, despite their limitations, post-hoc methods are capable of capturing more useful insights than merely examining the attention weights.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gianluigi Lopardo;Frederic Precioso;Damien Garreau", "authorids": "~Gianluigi_Lopardo1;~Frederic_Precioso2;~Damien_Garreau1", "gender": "M;M;M", "homepage": "https://www.gianluigilopardo.science/;https://sites.google.com/view/damien-garreau/home;https://www.i3s.unice.fr/~precioso/", "dblp": ";151/6584;83/1407.html", "google_scholar": "Ddns-QsAAAAJ;https://scholar.google.fr/citations?user=qn4N61QAAAAJ;-0cKTucAAAAJ", "orcid": ";0000-0002-7855-2847;0000-0001-8712-1443", "linkedin": "gianluigilopardo/;damien-garreau-05817858/;fr%C3%A9d%C3%A9ric-precioso-3a37389/", "or_profile": "~Gianluigi_Lopardo1;~Damien_Garreau1;~Frederic_Precioso1", "aff": "Universit\u00e9 de Nice-Sophia Antipolis;Universit\u00e9 C\u00f4te d'Azur;Universit\u00e9 de Nice-Sophia Antipolis", "aff_domain": "unice.fr;unice.fr;unice.fr", "position": "Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nlopardo2024attention,\ntitle={Attention Meets Post-hoc Interpretability: A Mathematical Perspective},\nauthor={Gianluigi Lopardo and Frederic Precioso and Damien Garreau},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wnkC5T11Z9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1378786, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7591320079320922248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "unice.fr;unice.fr;unice.fr", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 de Nice-Sophia Antipolis;Universit\u00e9 C\u00f4te d'Azur", "aff_unique_dep": ";", "aff_unique_url": "https://www.unice.fr;https://www.univ-cotedazur.fr", "aff_unique_abbr": "UNICA;UCA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Sophia Antipolis;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "TimeSiam: A Pre-Training Framework for Siamese Time-Series Modeling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32734", "id": "wrTzLoqbCg", "proceeding": "https://proceedings.mlr.press/v235/dong24e.html", "pdf": "https://openreview.net/pdf?id=wrTzLoqbCg", "openreview": "https://openreview.net/forum?id=wrTzLoqbCg", "author_site": "Jiaxiang Dong, Haixu Wu, Yuxuan Wang, Yun-Zhong Qiu, Li Zhang, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Time series pre-training has recently garnered wide attention for its potential to reduce labeling expenses and benefit various downstream tasks. Prior methods are mainly based on pre-training techniques well-acknowledged in vision or language, such as masked modeling and contrastive learning. However, randomly masking time series or calculating series-wise similarity will distort or neglect inherent temporal correlations crucial in time series data. To emphasize temporal correlation modeling, this paper proposes TimeSiam as a simple but effective self-supervised pre-training framework for Time series based on Siamese networks. Concretely, TimeSiam pre-trains Siamese encoders to capture intrinsic temporal correlations between randomly sampled past and current subseries. 
With a simple data augmentation method (e.g. masking), TimeSiam can benefit from diverse augmented subseries and learn internal time-dependent representations through a past-to-current reconstruction. Moreover, learnable lineage embeddings are also introduced to distinguish temporal distance between sampled series and further foster the learning of diverse temporal correlations. TimeSiam consistently outperforms extensive advanced pre-training baselines, demonstrating superior forecasting and classification capabilities across 13 standard benchmarks in both intra- and cross-domain scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaxiang Dong;Haixu Wu;Yuxuan Wang;Yunzhong Qiu;Li Zhang;Jianmin Wang;Mingsheng Long", "authorids": "~Jiaxiang_Dong1;~Haixu_Wu1;~Yuxuan_Wang5;~Yunzhong_Qiu1;~Li_Zhang37;~Jianmin_Wang1;~Mingsheng_Long5", "gender": ";M;;M;M;M;", "homepage": ";;;;https://www.thss.tsinghua.edu.cn/faculty/zhangli.htm;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;", "dblp": ";286/8115;;;;06/3456-1.html;", "google_scholar": ";oLL_x0wAAAAJ;;0kiLvzkAAAAJ;;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;", "orcid": ";;0000-0002-4899-4716;0009-0003-1034-1140;;0000-0001-6841-7943;", "linkedin": ";;;;;;", "or_profile": "~Jiaxiang_Dong1;~Haixu_Wu1;~Yuxuan_Wang5;~Yunzhong_Qiu1;~Li_Zhang37;~Jianmin_Wang1;~Mingsheng_Long5", "aff": ";Tsinghua University;Tsinghua University;South China University of Technology;;Tsinghua University;", "aff_domain": ";tsinghua.edu.cn;mail.tsinghua.edu.cn;scut.edu.cn;;tsinghua.edu.cn;", "position": ";PhD student;PhD student;Undergrad student;;Full Professor;", "bibtex": "@inproceedings{\ndong2024timesiam,\ntitle={TimeSiam: A Pre-Training Framework for Siamese Time-Series Modeling},\nauthor={Jiaxiang Dong and Haixu Wu and Yuxuan Wang and Yunzhong Qiu and Li Zhang and Jianmin Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wrTzLoqbCg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8238642, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12522336095096910789&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": ";tsinghua.edu.cn;mail.tsinghua.edu.cn;scut.edu.cn;;tsinghua.edu.cn;", "author_num": 7, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Tsinghua University;South China University of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.scut.edu.cn", "aff_unique_abbr": "THU;SCUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Noise-Aware Algorithm for Heterogeneous Differentially Private Federated Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32733", "id": "wuQ2DRPAuy", "proceeding": "https://proceedings.mlr.press/v235/malekmohammadi24a.html", "pdf": "https://openreview.net/pdf?id=wuQ2DRPAuy", "openreview": "https://openreview.net/forum?id=wuQ2DRPAuy", "author_site": "Saber Malekmohammadi, Yaoliang Yu, YANG CAO", "tldr": "", "abstract": "High utility and rigorous data privacy are of the main goals of a federated learning (FL) system, which learns a model from the data distributed among some clients. The latter has been tried to achieve by using differential privacy in FL (DPFL). 
There is often heterogeneity in clients' privacy requirements, and existing DPFL works either assume uniform privacy requirements for clients or are not applicable when the server is not fully trusted (our setting). Furthermore, there is often heterogeneity in batch and/or dataset size of clients, which, as shown, results in extra variation in the DP noise level across clients' model updates. With these sources of heterogeneity, straightforward aggregation strategies, e.g., assigning clients' aggregation weights proportional to their privacy parameters ($\\epsilon$), will lead to lower utility. We propose Robust-HDP, which efficiently estimates the true noise level in clients' model updates and reduces the noise level in the aggregated model updates considerably. Robust-HDP improves utility and convergence speed, while being safe to the clients that may maliciously send a falsified privacy parameter $\\epsilon$ to the server. Extensive experimental results on multiple datasets and our theoretical analysis confirm the effectiveness of Robust-HDP. Our code can be found here: https://github.com/Saber-mm/HDPFL.git", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saber Malekmohammadi;Yaoliang Yu;YANG CAO", "authorids": "~Saber_Malekmohammadi1;~Yaoliang_Yu1;~YANG_CAO10", "gender": "M;M;Unspecified", "homepage": ";https://cs.uwaterloo.ca/~y328yu/;https://yangcao88.github.io/", "dblp": "236/4827;90/4989;https://dblp.uni-trier.de/pid/25/7045-11", "google_scholar": ";https://scholar.google.ca/citations?user=zbXIQMsAAAAJ;https://scholar.google.co.jp/citations?user=S-p4DFMAAAAJ", "orcid": ";0000-0002-3823-0720;0000-0002-6424-8633", "linkedin": "saber-mm;;", "or_profile": "~Saber_Malekmohammadi1;~Yaoliang_Yu1;~YANG_CAO10", "aff": "University of Waterloo;University of Waterloo;Hokkaido University", "aff_domain": "uwaterloo.ca;uwaterloo.ca;hokudai.ac.jp", "position": "PhD Student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nmalekmohammadi2024noiseaware,\ntitle={Noise-Aware Algorithm for Heterogeneous Differentially Private Federated Learning},\nauthor={Saber Malekmohammadi and Yaoliang Yu and YANG CAO},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wuQ2DRPAuy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1881931, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2189799830929108890&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "uwaterloo.ca;uwaterloo.ca;hokudai.ac.jp", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Waterloo;Hokkaido University", "aff_unique_dep": ";", "aff_unique_url": "https://uwaterloo.ca;https://www.hokudai.ac.jp", "aff_unique_abbr": "UW;Hokkaido U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;Japan" }, { "title": "A Computational Framework for Solving Wasserstein Lagrangian Flows", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32732", "id": "wwItuHdus6", "proceeding": "https://proceedings.mlr.press/v235/neklyudov24a.html", "pdf": "https://openreview.net/pdf?id=wwItuHdus6", "openreview": "https://openreview.net/forum?id=wwItuHdus6", "author_site": "Kirill Neklyudov, Rob Brekelmans, Alexander Tong, Lazar Atanackovic, qiang liu, Alireza Makhzani", "tldr": "", "abstract": "The dynamical
formulation of the optimal transport can be extended through various choices of the underlying geometry (*kinetic energy*), and the regularization of density paths (*potential energy*). These combinations yield different variational problems (*Lagrangians*), encompassing many variations of the optimal transport problem such as the Schr\u00f6dinger bridge, unbalanced optimal transport, and optimal transport with physical constraints, among others. In general, the optimal density path is unknown, and solving these variational problems can be computationally challenging. We propose a novel deep learning based framework approaching all of these problems from a unified perspective. Leveraging the dual formulation of the Lagrangians, our method does not require simulating or backpropagating through the trajectories of the learned dynamics, and does not need access to optimal couplings. We showcase the versatility of the proposed framework by outperforming previous approaches for the single-cell trajectory inference, where incorporating prior knowledge into the dynamics is crucial for correct predictions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kirill Neklyudov;Rob Brekelmans;Alexander Tong;Lazar Atanackovic;qiang liu;Alireza Makhzani", "authorids": "~Kirill_Neklyudov1;~Rob_Brekelmans1;~Alexander_Tong1;~Lazar_Atanackovic1;~qiang_liu4;~Alireza_Makhzani1", "gender": "M;M;;M;;M", "homepage": "https://necludov.github.io/;https://brekelma.github.io;https://alextong.net;https://lazaratan.github.io/;http://www.alireza.ai/;https://www.cs.utexas.edu/~lqiang/", "dblp": "195/1093;207/7856.html;153/9296;235/6207;122/5126.html;61/3234-1", "google_scholar": "https://scholar.google.ru/citations?user=eOttYWgAAAAJ;M6ADg_UAAAAJ;CS80pt4AAAAJ;qhTWIh4AAAAJ;B0KVWJEAAAAJ;https://scholar.google.com.tw/citations?user=2qDh4WUAAAAJ", "orcid": ";;0000-0002-2031-4096;;;", "linkedin": ";;atong01/;;;", "or_profile": "~Kirill_Neklyudov1;~Rob_Brekelmans1;~Alexander_Tong1;~Lazar_Atanackovic1;~Alireza_Makhzani1;~Qiang_Liu1", "aff": "Vector Institute;;Universit\u00e9 de Montr\u00e9al;Valence Labs powered by recursion;Vector Institute;University of Texas, Austin", "aff_domain": "vectorinstitute.ai;;umontreal.ca;valencelabs.com;vectorinstitute.ai;utexas.edu", "position": "Postdoc;;Postdoc;Intern;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nneklyudov2024a,\ntitle={A Computational Framework for Solving Wasserstein Lagrangian Flows},\nauthor={Kirill Neklyudov and Rob Brekelmans and Alexander Tong and Lazar Atanackovic and qiang liu and Alireza Makhzani},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wwItuHdus6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4291792, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13718585745659394597&as_sdt=20000005&sciodt=0,21&hl=en", "gs_version_total": 7, "email": "vectorinstitute.ai;;umontreal.ca;valencelabs.com;vectorinstitute.ai;utexas.edu", "author_num": 6, "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Vector Institute;Universit\u00e9 de Montr\u00e9al;Valence Labs;University of Texas at Austin", "aff_unique_dep": ";;;", "aff_unique_url": "https://vectorinstitute.ai/;https://www.umontreal.ca;;https://www.utexas.edu", "aff_unique_abbr": "Vector Institute;UdeM;;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", 
"aff_country_unique_index": "0;0;0;2", "aff_country_unique": "Canada;;United States" }, { "title": "Randomized Confidence Bounds for Stochastic Partial Monitoring", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32731", "id": "x0vLj1S6Wg", "proceeding": "https://proceedings.mlr.press/v235/heuillet24a.html", "pdf": "https://openreview.net/pdf?id=x0vLj1S6Wg", "openreview": "https://openreview.net/forum?id=x0vLj1S6Wg", "author_site": "Maxime Heuillet, Ola Ahmad, Audrey Durand", "tldr": "", "abstract": "The partial monitoring (PM) framework provides a theoretical formulation of sequential learning problems with incomplete feedback. At each round, a learning agent plays an action while the environment simultaneously chooses an outcome. The agent then observes a feedback signal that is only partially informative about the (unobserved) outcome. The agent leverages the received feedback signals to select actions that minimize the (unobserved) cumulative loss. In contextual PM, the outcomes depend on some side information that is observable by the agent before selecting the action. In this paper, we consider the contextual and non-contextual PM settings with stochastic outcomes. We introduce a new class of PM strategies based on the randomization of deterministic confidence bounds. We also extend regret guarantees to settings where existing stochastic strategies are not applicable. Our experiments show that the proposed RandCBP and RandCBPside* strategies have competitive performance against state-of-the-art baselines in multiple PM games. To illustrate how the PM framework can benefit real world applications, we design a use case on the real-world problem of monitoring the error rate of any deployed classification system.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Maxime Heuillet;Ola Ahmad;Audrey Durand", "authorids": "~Maxime_Heuillet1;~Ola_Ahmad1;~Audrey_Durand1", "gender": ";F;F", "homepage": "https://github.com/MaxHeuillet;;https://audur2.ift.ulaval.ca/", "dblp": ";99/10407;70/9804", "google_scholar": "SduUuGQAAAAJ;https://scholar.google.fr/citations?user=MbXCWNMAAAAJ;https://scholar.google.ca/citations?user=Qdm6sEwAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Maxime_Heuillet1;~Ola_Ahmad1;~Audrey_Durand1", "aff": "Universit\u00e9 Laval;Universit\u00e9 Laval;Universit\u00e9 Laval", "aff_domain": "ift.ulaval.ca;ulaval.ca;ulaval.ca", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nheuillet2024randomized,\ntitle={Randomized Confidence Bounds for Stochastic Partial Monitoring},\nauthor={Maxime Heuillet and Ola Ahmad and Audrey Durand},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=x0vLj1S6Wg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1063789, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13687289580969620429&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "ift.ulaval.ca;ulaval.ca;ulaval.ca", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Universit\u00e9 Laval", "aff_unique_dep": "", "aff_unique_url": "https://www.ulaval.ca", "aff_unique_abbr": "ULaval", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "HarmonyDream: Task Harmonization Inside World 
Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32730", "id": "x0yIaw2fgk", "proceeding": "https://proceedings.mlr.press/v235/ma24o.html", "pdf": "https://openreview.net/pdf?id=x0yIaw2fgk", "openreview": "https://openreview.net/forum?id=x0yIaw2fgk", "author_site": "Haoyu Ma, Jialong Wu, Ningya Feng, Chenjun Xiao, Dong Li, Jianye Hao, Jianmin Wang, Mingsheng Long", "tldr": "", "abstract": "Model-based reinforcement learning (MBRL) holds the promise of sample-efficient learning by utilizing a world model, which models how the environment works and typically encompasses components for two tasks: observation modeling and reward modeling. In this paper, through a dedicated empirical investigation, we gain a deeper understanding of the role each task plays in world models and uncover the overlooked potential of sample-efficient MBRL by mitigating the domination of either observation or reward modeling. Our key insight is that while prevalent approaches of explicit MBRL attempt to restore abundant details of the environment via observation models, it is difficult due to the environment's complexity and limited model capacity. On the other hand, reward models, while dominating implicit MBRL and adept at learning compact task-centric dynamics, are inadequate for sample-efficient learning without richer learning signals. Motivated by these insights and discoveries, we propose a simple yet effective approach, HarmonyDream, which automatically adjusts loss coefficients to maintain task harmonization, i.e. a dynamic equilibrium between the two tasks in world model learning. Our experiments show that the base MBRL method equipped with HarmonyDream gains 10%-69% absolute performance boosts on visual robotic tasks and sets a new state-of-the-art result on the Atari 100K benchmark. 
Code is available at https://github.com/thuml/HarmonyDream.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoyu Ma;Jialong Wu;Ningya Feng;Chenjun Xiao;Dong Li;Jianye HAO;Jianmin Wang;Mingsheng Long", "authorids": "~Haoyu_Ma3;~Jialong_Wu1;~Ningya_Feng1;~Chenjun_Xiao1;~Dong_Li10;~Jianye_HAO1;~Jianmin_Wang1;~Mingsheng_Long5", "gender": ";M;M;;M;M;M;", "homepage": ";https://manchery.github.io/;https://github.com/fny21;https://chenjun-x.github.io/;;http://www.icdai.org/jianye.html;https://www.thss.tsinghua.edu.cn/en/faculty/jianminwang.htm;", "dblp": ";73/498-1.html;358/4461;178/8641;47/4826-16;21/7664.html;06/3456-1.html;", "google_scholar": ";FfTZ66gAAAAJ;;;;;https://scholar.google.com.tw/citations?user=MiovcboAAAAJ;", "orcid": ";0009-0008-7846-053X;0009-0006-8448-2570;0000-0002-5493-1500;;0000-0002-0422-8235;0000-0001-6841-7943;", "linkedin": ";;;;;;;", "or_profile": "~Haoyu_Ma3;~Jialong_Wu1;~Ningya_Feng1;~Chenjun_Xiao1;~Dong_Li10;~Jianye_HAO1;~Jianmin_Wang1;~Mingsheng_Long5", "aff": ";Tsinghua University;Tsinghua University;Huawei Technologies Ltd.;Huawei Technologies Ltd.;Tianjin University;Tsinghua University;", "aff_domain": ";tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;huawei.com;tju.edu.cn;tsinghua.edu.cn;", "position": ";PhD student;Undergrad student;Researcher;Principal Researcher;Associate Professor;Full Professor;", "bibtex": "@inproceedings{\nma2024harmonydream,\ntitle={HarmonyDream: Task Harmonization Inside World Models},\nauthor={Haoyu Ma and Jialong Wu and Ningya Feng and Chenjun Xiao and Dong Li and Jianye HAO and Jianmin Wang and Mingsheng Long},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=x0yIaw2fgk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2787174, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7776303018684429750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": ";tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;huawei.com;tju.edu.cn;tsinghua.edu.cn;", "author_num": 8, "aff_unique_index": "0;0;1;1;2;0", "aff_unique_norm": "Tsinghua University;Huawei;Tianjin University", "aff_unique_dep": ";Huawei Technologies;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "THU;Huawei;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Improved Communication-Privacy Trade-offs in $L_2$ Mean Estimation under Streaming Differential Privacy", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32729", "id": "x1G7ieRgRd", "proceeding": "https://proceedings.mlr.press/v235/chen24v.html", "pdf": "https://openreview.net/pdf?id=x1G7ieRgRd", "openreview": "https://openreview.net/forum?id=x1G7ieRgRd", "author_site": "Wei-Ning Chen, Berivan Isik, Peter Kairouz, Albert No, Sewoong Oh, Zheng Xu", "tldr": "", "abstract": "We study $L_2$ mean estimation under central differential privacy and communication constraints, and address two key challenges: firstly, existing mean estimation schemes that simultaneously handle both constraints are usually optimized for $L_\\infty$ geometry and rely on random rotation or Kashin's representation to adapt to $L_2$ geometry, resulting in suboptimal leading constants in mean square errors (MSEs); secondly, schemes achieving order-optimal 
communication-privacy trade-offs do not extend seamlessly to streaming differential privacy (DP) settings (e.g., tree aggregation or matrix factorization), rendering them incompatible with DP-FTRL type optimizers. In this work, we tackle these issues by introducing a novel privacy accounting method for the sparsified Gaussian mechanism that incorporates the randomness inherent in sparsification into the DP noise. Unlike previous approaches, our accounting algorithm directly operates in $L_2$ geometry, yielding MSEs that fast converge to those of the uncompressed Gaussian mechanism. Additionally, we extend the sparsification scheme to the matrix factorization framework under streaming DP and provide a precise accountant tailored for DP-FTRL type optimizers. Empirically, our method demonstrates at least a 100x improvement of compression for DP-SGD across various FL tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei-Ning Chen;Berivan Isik;Peter Kairouz;Albert No;Sewoong Oh;Zheng Xu", "authorids": "~Wei-Ning_Chen1;~Berivan_Isik1;~Peter_Kairouz1;~Albert_No1;~Sewoong_Oh3;~Zheng_Xu2", "gender": ";;M;;;M", "homepage": "https://web.stanford.edu/~wnchen/index.html;https://sites.google.com/view/berivanisik;https://kairouzp.github.io/;http://albert-no.github.io/;https://sites.google.com/site/xuzhustc/;https://homes.cs.washington.edu/~sewoong/", "dblp": "51/2118;265/6197;129/1254;https://dblp.uni-trier.de/pid/23/11268;83/2535-2;80/4366", "google_scholar": "-TqCZLIAAAAJ;GdXOFKoAAAAJ;m8NUgw0AAAAJ;Kzj3HC8AAAAJ;TfWlMTYAAAAJ;55TAOdgAAAAJ", "orcid": "0000-0001-7355-9487;;;;0009-0003-6747-3953;", "linkedin": ";berivan-isik-439a3b122/;kayrouzp;;zheng-xu-0a125236/;", "or_profile": "~Wei-Ning_Chen1;~Berivan_Isik1;~Peter_Kairouz1;~Albert_No1;~Zheng_Xu2;~Sewoong_Oh1", "aff": "Stanford University;Stanford University;Google;Hongik University;Google;University of Washington", "aff_domain": "stanford.edu;stanford.edu;google.com;hongik.ac.kr;google.com;uw.edu", "position": "PhD student;PhD student;Research Scientist;Assistant Professor;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nchen2024improved,\ntitle={Improved Communication-Privacy Trade-offs in \\$L\\_2\\$ Mean Estimation under Streaming Differential Privacy},\nauthor={Wei-Ning Chen and Berivan Isik and Peter Kairouz and Albert No and Sewoong Oh and Zheng Xu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=x1G7ieRgRd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1041450, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11420563695222909437&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "stanford.edu;stanford.edu;google.com;hongik.ac.kr;google.com;uw.edu", "author_num": 6, "aff_unique_index": "0;0;1;2;1;3", "aff_unique_norm": "Stanford University;Google;Hongik University;University of Washington", "aff_unique_dep": ";Google;;", "aff_unique_url": "https://www.stanford.edu;https://www.google.com;https://www.hongik.ac.kr;https://www.washington.edu", "aff_unique_abbr": "Stanford;Google;HU;UW", "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Stanford;Mountain View;", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;South Korea" }, { "title": "FedBAT: Communication-Efficient Federated Learning via Learnable Binarization", "status": "Poster", "track": "main", "site": 
"https://icml.cc/virtual/2024/poster/32728", "id": "x2zxPwCkAZ", "proceeding": "https://proceedings.mlr.press/v235/li24ca.html", "pdf": "https://openreview.net/pdf?id=x2zxPwCkAZ", "openreview": "https://openreview.net/forum?id=x2zxPwCkAZ", "author_site": "Shiwei Li, Wenchao Xu, Haozhao Wang, Xing Tang, Yining Qi, Shijie Xu, weihongluo, Yuhua Li, xiuqiang He, Ruixuan Li", "tldr": "", "abstract": "Federated learning is a promising distributed machine learning paradigm that can effectively exploit large-scale data without exposing users' privacy. However, it may incur significant communication overhead, thereby potentially impairing the training efficiency. To address this challenge, numerous studies suggest binarizing the model updates. Nonetheless, traditional methods usually binarize model updates in a post-training manner, resulting in significant approximation errors and consequent degradation in model accuracy. To this end, we propose **Federated Binarization-Aware Training (FedBAT)**, a novel framework that directly learns binary model updates during the local training process, thus inherently reducing the approximation errors. FedBAT incorporates an innovative binarization operator, along with meticulously designed derivatives to facilitate efficient learning. In addition, we establish theoretical guarantees regarding the convergence of FedBAT. Extensive experiments are conducted on four popular datasets. The results show that FedBAT significantly accelerates the convergence and exceeds the accuracy of baselines by up to 9%, even surpassing that of FedAvg in some cases.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiwei Li;Wenchao Xu;Haozhao Wang;Xing Tang;Yining Qi;Shijie Xu;weihongluo;Yuhua Li;xiuqiang He;Ruixuan Li", "authorids": "~Shiwei_Li3;~Wenchao_Xu1;~Haozhao_Wang1;~Xing_Tang2;~Yining_Qi1;~Shijie_Xu1;~weihongluo1;~Yuhua_Li2;~xiuqiang_He3;~Ruixuan_Li1", "gender": "M;;M;M;F;M;M;F;M;M", "homepage": "https://leopold1423.github.io/;;https://wanghaozhao.mysxl.cn/;https://xingt-tang.github.io;;https://github.com/shijiexu09;https://github.com/lobby66;;https://he-xiuqiang.github.io/;http://idc.hust.edu.cn/rxli/index.html", "dblp": ";;224/4500.html;09/2824-7;67/5044.html;;;79/5796-3;11/5357-1;60/4429.html", "google_scholar": "H0egTIsAAAAJ;;https://scholar.google.com.hk/citations?user=yFrOuMEAAAAJ;rtRexdQAAAAJ;3ebpsI4AAAAJ;;;https://scholar.google.com/citations?hl=zh-CN;3lprwmsAAAAJ;https://scholar.google.com/scholar?q=ruixuan+li", "orcid": "0000-0002-7067-0275;;0000-0002-7591-5315;0000-0003-4360-0754;0009-0001-3685-5989;;;;0000-0002-4115-8205;0000-0002-7791-5511", "linkedin": "%E4%B8%96%E4%BC%9F-%E6%9D%8E-813a1723a/;;;;https://www.linkedin.cn/incareer/in/ACoAAANRzwYBJayrkeNy90FtpSZFtuADWwE6G90;;;;;https://www.linkedin.cn/incareer/in/ruixuan-li-b367319", "or_profile": "~Shiwei_Li3;~Wenchao_Xu1;~Haozhao_Wang1;~Xing_Tang2;~Yining_Qi1;~Shijie_Xu1;~weihongluo1;~Yuhua_Li2;~xiuqiang_He3;~Ruixuan_Li1", "aff": "Huazhong University of Science and Technology;;Huazhong University of Science and Technology;FiT,Tencent;Huazhong University of Science and Technology;;FiT;Huazhong University of Science and Technology;Tencent ;Huazhong University of Science and Technology", "aff_domain": "hust.edu.cn;;hust.edu.cn;tencent.com;hust.edu.cn;;tencent.com;hust.edu.cn;tencent.com;hust.edu.cn", "position": "PhD student;;Postdoc;Researcher;Postdoc;;Researcher;Full Professor;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nli2024fedbat,\ntitle={Fed{BAT}: 
Communication-Efficient Federated Learning via Learnable Binarization},\nauthor={Shiwei Li and Wenchao Xu and Haozhao Wang and Xing Tang and Yining Qi and Shijie Xu and weihongluo and Yuhua Li and xiuqiang He and Ruixuan Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=x2zxPwCkAZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5845581, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14319252924444020740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "hust.edu.cn;;hust.edu.cn;tencent.com;hust.edu.cn;;tencent.com;hust.edu.cn;tencent.com;hust.edu.cn", "author_num": 10, "aff_unique_index": "0;0;1;0;2;0;1;0", "aff_unique_norm": "Huazhong University of Science and Technology;Tencent;Florida Institute of Technology", "aff_unique_dep": ";FiT;", "aff_unique_url": "http://www.hust.edu.cn;https://www.tencent.com;https://www.fit.edu", "aff_unique_abbr": "HUST;Tencent;FIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "RVI-SAC: Average Reward Off-Policy Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32727", "id": "xB6YJZOKyT", "proceeding": "https://proceedings.mlr.press/v235/hisaki24a.html", "pdf": "https://openreview.net/pdf?id=xB6YJZOKyT", "openreview": "https://openreview.net/forum?id=xB6YJZOKyT", "author_site": "Yukinari Hisaki, Isao Ono", "tldr": "", "abstract": "In this paper, we propose an off-policy deep reinforcement learning (DRL) method utilizing the average reward criterion. While most existing DRL methods employ the discounted reward criterion, this can potentially lead to a discrepancy between the training objective and performance metrics in continuing tasks, making the average reward criterion a recommended alternative. We introduce RVI-SAC, an extension of the state-of-the-art off-policy DRL method, Soft Actor-Critic (SAC), to the average reward criterion. Our proposal consists of (1) Critic updates based on RVI Q-learning, (2) Actor updates introduced by the average reward soft policy improvement theorem, and (3) automatic adjustment of Reset Cost enabling the average reward reinforcement learning to be applied to tasks with termination. 
We apply our method to the Gymnasium's Mujoco tasks, a subset of locomotion tasks, and demonstrate that RVI-SAC shows competitive performance compared to existing methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yukinari Hisaki;Isao Ono", "authorids": "~Yukinari_Hisaki1;~Isao_Ono1", "gender": "M;M", "homepage": ";", "dblp": ";93/6976", "google_scholar": ";", "orcid": ";", "linkedin": "yukinari-hisaki-788136213/;", "or_profile": "~Yukinari_Hisaki1;~Isao_Ono1", "aff": "Tokyo Institute of Technology, Tokyo Institute of Technology;Tokyo Institute of Technology", "aff_domain": "titech.ac.jp;titech.ac.jp", "position": "MS student;Associate Professor", "bibtex": "@inproceedings{\nhisaki2024rvisac,\ntitle={{RVI}-{SAC}: Average Reward Off-Policy Deep Reinforcement Learning},\nauthor={Yukinari Hisaki and Isao Ono},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xB6YJZOKyT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9762766, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15605843967507020901&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "titech.ac.jp;titech.ac.jp", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tokyo Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.titech.ac.jp", "aff_unique_abbr": "Titech", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Simultaneous identification of models and parameters of scientific simulators", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32726", "id": "xC7SYAZygF", "proceeding": "https://proceedings.mlr.press/v235/schroder24b.html", "pdf": "https://openreview.net/pdf?id=xC7SYAZygF", "openreview": "https://openreview.net/forum?id=xC7SYAZygF", "author_site": "Cornelius Schr\u00f6der, Jakob Macke", "tldr": "", "abstract": "Many scientific models are composed of multiple discrete components, and scientists often make heuristic decisions about which components to include. Bayesian inference provides a mathematical framework for systematically selecting model components, but defining prior distributions over model components and developing associated inference schemes has been challenging. We approach this problem in a simulation-based inference framework: We define model priors over candidate components and, from model simulations, train neural networks to infer joint probability distributions over both model components and associated parameters. Our method, simulation-based model inference (SBMI), represents distributions over model components as a conditional mixture of multivariate binary distributions in the Grassmann formalism. SBMI can be applied to any compositional stochastic simulator without requiring likelihood evaluations. We evaluate SBMI on a simple time series model and on two scientific models from neuroscience, and show that it can discover multiple data-consistent model configurations, and that it reveals non-identifiable model components and parameters. 
SBMI provides a powerful tool for data-driven scientific inquiry which will allow scientists to identify essential model components and make uncertainty-informed modelling decisions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cornelius Schr\u00f6der;Jakob H. Macke", "authorids": "~Cornelius_Schr\u00f6der1;~Jakob_H._Macke1", "gender": "M;M", "homepage": ";http://www.mackelab.org", "dblp": "255/6950;97/11106", "google_scholar": "https://scholar.google.de/citations?user=HpLWKHEAAAAJ;FKOqtF8AAAAJ", "orcid": "0000-0001-5643-2097;0000-0001-5154-8912", "linkedin": ";", "or_profile": "~Cornelius_Schr\u00f6der1;~Jakob_H_Macke1", "aff": "University T\u00fcbingen;University of Tuebingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nschr{\\\"o}der2024simultaneous,\ntitle={Simultaneous identification of models and parameters of scientific simulators},\nauthor={Cornelius Schr{\\\"o}der and Jakob H. Macke},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xC7SYAZygF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3666471, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1516447786495928233&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "uni-tuebingen.de;uni-tuebingen.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "University of T\u00fcbingen;University of Tuebingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Position: Application-Driven Innovation in Machine Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32725", "id": "xEB2oF3vvb", "proceeding": "https://proceedings.mlr.press/v235/rolnick24a.html", "pdf": "https://openreview.net/pdf?id=xEB2oF3vvb", "openreview": "https://openreview.net/forum?id=xEB2oF3vvb", "author_site": "David Rolnick, Alan Aspuru-Guzik, Sara Beery, Bistra Dilkina, Priya Donti, Marzyeh Ghassemi, Hannah Kerner, Claire Monteleoni, Esther Rolf, Milind Tambe, Adam White", "tldr": "", "abstract": "In this position paper, we argue that application-driven research has been systemically under-valued in the machine learning community. As applications of machine learning proliferate, innovative algorithms inspired by specific real-world challenges have become increasingly important. Such work offers the potential for significant impact not merely in domains of application but also in machine learning itself. In this paper, we describe the paradigm of application-driven research in machine learning, contrasting it with the more standard paradigm of methods-driven research. We illustrate the benefits of application-driven machine learning and how this approach can productively synergize with methods-driven work. Despite these benefits, we find that reviewing, hiring, and teaching practices in machine learning often hold back application-driven innovation. We outline how these processes may be improved.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Rolnick;Alan Aspuru-Guzik;Sara Beery;Bistra Dilkina;Priya L. 
Donti;Marzyeh Ghassemi;Hannah Kerner;Claire Monteleoni;Esther Rolf;Milind Tambe;Adam White", "authorids": "~David_Rolnick1;~Alan_Aspuru-Guzik2;~Sara_Beery1;~Bistra_Dilkina2;~Priya_L._Donti1;~Marzyeh_Ghassemi2;~Hannah_Kerner1;~Claire_Monteleoni1;~Esther_Rolf1;~Milind_Tambe1;~Adam_White1", "gender": "M;M;F;F;F;F;F;F;;;", "homepage": "http://www.davidrolnick.com/;http://matter.toronto.edu;https://beerys.github.io/;;https://priyadonti.com/;https://www.healthyml.org/;https://hannah-rae.github.io/;;;http://teamcore.seas.harvard.edu/tambe;", "dblp": "37/10718;;191/2643;30/5718;198/0500;145/6563;218/2646;;;67/2667;", "google_scholar": "P_luG3cAAAAJ;Ag_6KEgAAAAJ;https://scholar.google.com/citations?hl=en;1jjyaBYAAAAJ;PfRSkfEAAAAJ;;g5CD7dQAAAAJ;FqNPXeoAAAAJ;;YOVZiJkAAAAJ;", "orcid": ";0000-0002-8277-4434;;0000-0002-6784-473X;;;0000-0002-3259-7759;;;;", "linkedin": ";;;;priya-donti/;;hannahkerner/;;;;", "or_profile": "~David_Rolnick1;~Alan_Aspuru-Guzik2;~Sara_Beery1;~Bistra_Dilkina2;~Priya_L._Donti1;~Marzyeh_Ghassemi2;~Hannah_Kerner1;~Claire_Monteleoni1;~Esther_Rolf1;~Milind_Tambe1;~Adam_White1", "aff": "McGill University;University of Toronto;Massachusetts Institute of Technology;University of Southern California;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Arizona State University;INRIA;;Google;", "aff_domain": "cs.mcgill.ca;utoronto.ca;mit.edu;usc.edu;mit.edu;mit.edu;asu.edu;inria.fr;;google.com;", "position": "Assistant Professor;Full Professor;Assistant Professor;Associate Professor;Assistant Professor;Assistant Professor;Assistant Professor;Principal Researcher;;Principal Researcher;", "bibtex": "@inproceedings{\nrolnick2024position,\ntitle={Position: Application-Driven Innovation in Machine Learning},\nauthor={David Rolnick and Alan Aspuru-Guzik and Sara Beery and Bistra Dilkina and Priya L. 
Donti and Marzyeh Ghassemi and Hannah Kerner and Claire Monteleoni and Esther Rolf and Milind Tambe and Adam White},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xEB2oF3vvb}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 609725, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11642715964277811360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.mcgill.ca;utoronto.ca;mit.edu;usc.edu;mit.edu;mit.edu;asu.edu;inria.fr;;google.com;", "author_num": 11, "aff_unique_index": "0;1;2;3;2;2;4;5;6", "aff_unique_norm": "McGill University;University of Toronto;Massachusetts Institute of Technology;University of Southern California;Arizona State University;INRIA;Google", "aff_unique_dep": ";;;;;;Google", "aff_unique_url": "https://www.mcgill.ca;https://www.utoronto.ca;https://web.mit.edu;https://www.usc.edu;https://www.asu.edu;https://www.inria.fr;https://www.google.com", "aff_unique_abbr": "McGill;U of T;MIT;USC;ASU;INRIA;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Angeles;Mountain View", "aff_country_unique_index": "0;0;1;1;1;1;1;2;1", "aff_country_unique": "Canada;United States;France" }, { "title": "Online Algorithms with Uncertainty-Quantified Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32724", "id": "xF656w37Mj", "proceeding": "https://proceedings.mlr.press/v235/sun24f.html", "pdf": "https://openreview.net/pdf?id=xF656w37Mj", "openreview": "https://openreview.net/forum?id=xF656w37Mj", "author_site": "Bo Sun, Jerry Huang, Nicolas Christianson, Mohammad Hajiesmaili, Adam Wierman, Raouf Boutaba", "tldr": "", "abstract": "The burgeoning field of algorithms with predictions studies the problem of using possibly imperfect machine learning predictions to improve online algorithm performance. While nearly all existing algorithms in this framework make no assumptions on prediction quality, a number of methods providing uncertainty quantification (UQ) on machine learning models have been developed in recent years, which could enable additional information about prediction quality at decision time. In this work, we investigate the problem of optimally utilizing uncertainty-quantified predictions in the design of online algorithms. In particular, we study two classic online problems, ski rental and online search, where the decision-maker is provided predictions augmented with UQ describing the likelihood of the ground truth falling within a particular range of values. We demonstrate that non-trivial modifications to algorithm design are needed to fully leverage the UQ predictions. 
Moreover, we consider how to utilize more general forms of UQ, proposing an online learning framework that learns to exploit UQ to make decisions in multi-instance settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bo Sun;Jerry Huang;Nicolas Christianson;Mohammad Hajiesmaili;Adam Wierman;Raouf Boutaba", "authorids": "~Bo_Sun8;jyhuang@caltech.edu;~Nicolas_Christianson1;~Mohammad_Hajiesmaili1;~Adam_Wierman1;~Raouf_Boutaba2", "gender": ";;;M;M;M", "homepage": ";;https://nicochristianson.com/;https://groups.cs.umass.edu/hajiesmaili/;https://adamwierman.com/;https://rboutaba.cs.uwaterloo.ca", "dblp": ";;322/8648;49/7911;56/4447;b/RaoufBoutaba.html", "google_scholar": ";;XS2UFA8AAAAJ;XCGuYKIAAAAJ;4OvOdSgAAAAJ;L1RHDCcAAAAJ", "orcid": ";;0000-0001-8330-8964;;0000-0002-5923-0199;0000-0001-7936-6862", "linkedin": ";;;;adam-wierman-a529474/;raouf-boutaba-7496b0/?originalSubdomain=ca", "or_profile": "~Bo_Sun8;jyhuang@caltech.edu;~Nicolas_Christianson1;~Mohammad_Hajiesmaili1;~Adam_Wierman1;~Raouf_Boutaba2", "aff": ";;California Institute of Technology;College of Information and Computer Science, University of Massachusetts, Amherst;California Institute of Technology;University of Waterloo", "aff_domain": ";;caltech.edu;cics.umass.edu;caltech.edu;cs.uwaterloo.ca", "position": ";;PhD student;Assistant Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nsun2024online,\ntitle={Online Algorithms with Uncertainty-Quantified Predictions},\nauthor={Bo Sun and Jerry Huang and Nicolas Christianson and Mohammad Hajiesmaili and Adam Wierman and Raouf Boutaba},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xF656w37Mj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 592161, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18023597934804148070&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 8, "email": ";;caltech.edu;cics.umass.edu;caltech.edu;cs.uwaterloo.ca", "author_num": 6, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "California Institute of Technology;University of Massachusetts Amherst;University of Waterloo", "aff_unique_dep": ";College of Information and Computer Science;", "aff_unique_url": "https://www.caltech.edu;https://www.umass.edu;https://uwaterloo.ca", "aff_unique_abbr": "Caltech;UMass Amherst;UW", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Pasadena;Amherst;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Canada" }, { "title": "Ai-sampler: Adversarial Learning of Markov kernels with involutive maps", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32723", "id": "xFCA2yWVs4", "proceeding": "https://proceedings.mlr.press/v235/egorov24a.html", "pdf": "https://openreview.net/pdf?id=xFCA2yWVs4", "openreview": "https://openreview.net/forum?id=xFCA2yWVs4", "author_site": "Evgenii Egorov, Riccardo Valperga, Efstratios Gavves", "tldr": "", "abstract": "Markov chain Monte Carlo methods have become popular in statistics as versatile techniques to sample from complicated probability distributions. In this work, we propose a method to parameterize and train transition kernels of Markov chains to achieve efficient sampling and good mixing. 
This training procedure minimizes the total variation distance between the stationary distribution of the chain and the empirical distribution of the data. Our approach leverages involutive Metropolis-Hastings kernels constructed from reversible neural networks that ensure detailed balance by construction. We find that reversibility also implies $C_2$-equivariance of the discriminator function which can be used to restrict its function space.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Evgenii Egorov;Riccardo Valperga;Stratis Gavves", "authorids": "~Evgenii_Egorov1;~Riccardo_Valperga1;~Stratis_Gavves1", "gender": "M;M;M", "homepage": "https://evgenii-egorov.github.io;;https://www.egavves.com", "dblp": ";;03/8693", "google_scholar": "https://scholar.google.ru/citations?user=LwVVunEAAAAJ;IK64D1wAAAAJ;https://scholar.google.nl/citations?user=QqfCvsgAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Evgenii_Egorov1;~Riccardo_Valperga1;~Efstratios_Gavves1", "aff": "University of Amsterdam;University of Amsterdam;University of Amsterdam", "aff_domain": "uva.nl;uva.nl;uva.nl", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\negorov2024aisampler,\ntitle={Ai-sampler: Adversarial Learning of Markov kernels with involutive maps},\nauthor={Evgenii Egorov and Riccardo Valperga and Stratis Gavves},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xFCA2yWVs4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1132182, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6906297636731400314&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "uva.nl;uva.nl;uva.nl", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "CRoFT: Robust Fine-Tuning with Concurrent Optimization for OOD Generalization and Open-Set OOD Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32722", "id": "xFDJBzPhci", "proceeding": "https://proceedings.mlr.press/v235/zhu24n.html", "pdf": "https://openreview.net/pdf?id=xFDJBzPhci", "openreview": "https://openreview.net/forum?id=xFDJBzPhci", "author_site": "Lin Zhu, Yifeng Yang, Qinying Gu, Xinbing Wang, Chenghu Zhou, Nanyang Ye", "tldr": "", "abstract": "Recent vision-language pre-trained models (VL-PTMs) have shown remarkable success in open-vocabulary tasks. However, downstream use cases often involve further fine-tuning of VL-PTMs, which may distort their general knowledge and impair their ability to handle distribution shifts. In real-world scenarios, machine learning systems inevitably encounter both covariate shifts (e.g., changes in image styles) and semantic shifts (e.g., test-time unseen classes). This highlights the importance of enhancing out-of-distribution (OOD) generalization on covariate shifts and simultaneously detecting semantic-shifted unseen classes. Thus a critical but underexplored question arises: How to improve VL-PTMs' generalization ability to closed-set OOD data, while effectively detecting open-set unseen classes during fine-tuning? 
In this paper, we propose a novel objective function of OOD detection that also serves to improve OOD generalization. We show that minimizing the gradient magnitude of energy scores on training data leads to domain-consistent Hessians of classification loss, a strong indicator for OOD generalization revealed by theoretical analysis. Based on this finding, we have developed a unified fine-tuning framework that allows for concurrent optimization of both tasks. Extensive experiments have demonstrated the superiority of our method. The code is available at https://github.com/LinLLLL/CRoFT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lin Zhu;Yifeng Yang;Qinying Gu;Xinbing Wang;Chenghu Zhou;Nanyang Ye", "authorids": "~Lin_Zhu10;~Yifeng_Yang1;~Qinying_Gu1;~Xinbing_Wang1;~Chenghu_Zhou3;~Nanyang_Ye1", "gender": "F;M;;M;M;", "homepage": "https://github.com/LinLLLL;https://zxk1212.github.io/;;http://www.cs.sjtu.edu.cn/~wang-xb/;http://www.igsnrr.cas.cn/gkjj/ysfc/ysfc_zhouchenghu/;", "dblp": ";;;96/1149.html;85/1324.html;175/2581", "google_scholar": "https://scholar.google.com.hk/citations?user=_kAniL4AAAAJ;yourID;;https://scholar.google.com.tw/citations?user=CT5yZbwAAAAJ;;", "orcid": ";;;0000-0002-0357-8356;;", "linkedin": ";;;;;", "or_profile": "~Lin_Zhu10;~Yifeng_Yang1;~Qinying_Gu1;~Xinbing_Wang1;~Chenghu_Zhou3;~Nanyang_Ye1", "aff": "Shanghai Jiaotong University;Tianjin University;;Shanghai Jiaotong University;IGSNRR, Chinese Academy of Sciences, Beijing, China;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;tju.edu.cn;;cs.sjtu.edu.cn;lreis.ac.cn;sjtu.edu", "position": "PhD student;Undergrad student;;Full Professor;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nzhu2024croft,\ntitle={{CR}o{FT}: Robust Fine-Tuning with Concurrent Optimization for {OOD} Generalization and Open-Set {OOD} Detection},\nauthor={Lin Zhu and Yifeng Yang and Qinying Gu and Xinbing Wang and Chenghu Zhou and Nanyang Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xFDJBzPhci}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3170392, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11468844987616939514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sjtu.edu.cn;tju.edu.cn;;cs.sjtu.edu.cn;lreis.ac.cn;sjtu.edu", "author_num": 6, "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Shanghai Jiao Tong University;Tianjin University;Chinese Academy of Sciences", "aff_unique_dep": ";;IGSNRR", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.tju.edu.cn;http://www.cas.cn", "aff_unique_abbr": "SJTU;TJU;CAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "EE-LLM: Large-Scale Training and Inference of Early-Exit Large Language Models with 3D Parallelism", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32721", "id": "xFk0w9zoV3", "proceeding": "https://proceedings.mlr.press/v235/chen24ae.html", "pdf": "https://openreview.net/pdf?id=xFk0w9zoV3", "openreview": "https://openreview.net/forum?id=xFk0w9zoV3", "author_site": "Yanxi Chen, Xuchen Pan, Yaliang Li, Bolin Ding, Jingren Zhou", "tldr": "", "abstract": "We present EE-LLM, a framework for large-scale training and inference of early-exit large language models (LLMs). 
While recent works have shown preliminary evidence for the efficacy of early exiting in accelerating LLM inference, EE-LLM takes a foundational step towards scaling up early-exit LLMs by supporting their training and inference with massive 3D parallelism. Built upon Megatron-LM, EE-LLM implements a variety of algorithmic innovations and performance optimizations tailored to early exiting, including a lightweight method that facilitates backpropagation for the early-exit training objective with pipeline parallelism, techniques for leveraging idle resources in the original pipeline schedule for computation related to early-exit layers, and two approaches to early-exit inference that are compatible with KV caching for autoregressive generation. Our analytical and empirical study shows that EE-LLM achieves great training efficiency with negligible computational overhead compared to standard LLM training, as well as outstanding inference speedup without compromising output quality. To facilitate further research and adoption, we release EE-LLM at https://github.com/pan-x-c/EE-LLM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanxi Chen;Xuchen Pan;Yaliang Li;Bolin Ding;Jingren Zhou", "authorids": "~Yanxi_Chen1;~Xuchen_Pan1;~Yaliang_Li1;~Bolin_Ding3;~Jingren_Zhou1", "gender": ";M;M;M;M", "homepage": "https://yanxi-chen.github.io/;https://github.com/pan-x-c;https://sites.google.com/site/yaliangli/;https://bolinding.github.io/;", "dblp": "40/5750-1.html;323/0621;https://dblp.org/pers/hd/l/Li:Yaliang;46/3522.html;84/2644", "google_scholar": ";;CCPBcdYAAAAJ;AjYkTi8AAAAJ;", "orcid": "0000-0003-0610-8103;;0000-0002-4204-6096;;", "linkedin": "yanxi-chen-476b03179/;;;bolin-ding-50a0119/;", "or_profile": "~Yanxi_Chen1;~Xuchen_Pan1;~Yaliang_Li1;~Bolin_Ding3;~Jingren_Zhou1", "aff": "Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group;Alibaba Group", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "position": "Researcher;Researcher;Staff Engineer;Senior Director;Researcher", "bibtex": "@inproceedings{\nchen2024eellm,\ntitle={{EE}-{LLM}: Large-Scale Training and Inference of Early-Exit Large Language Models with 3D Parallelism},\nauthor={Yanxi Chen and Xuchen Pan and Yaliang Li and Bolin Ding and Jingren Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xFk0w9zoV3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1719665, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10899636243828811077&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Dynamic Survival Analysis with Controlled Latent States", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32720", "id": "xGlVkBSDdt", "proceeding": "https://proceedings.mlr.press/v235/bleistein24a.html", "pdf": "https://openreview.net/pdf?id=xGlVkBSDdt", "openreview": "https://openreview.net/forum?id=xGlVkBSDdt", "author_site": "Linus Bleistein, Van
NGUYEN, Adeline Fermanian, Agathe Guilloux", "tldr": "", "abstract": "We consider the task of learning individual-specific intensities of counting processes from a set of static variables and irregularly sampled time series. We introduce a novel modeling approach in which the intensity is the solution to a controlled differential equation. We first design a neural estimator by building on neural controlled differential equations. Second, we show that our model can be linearized in the signature space under sufficient regularity conditions, yielding a signature-based estimator, which we call CoxSig. We provide theoretical learning guarantees for both estimators, before showcasing the performance of our models on a vast array of simulated and real-world datasets from finance, predictive maintenance and food supply chain management.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Linus Bleistein;Van Tuan NGUYEN;Adeline Fermanian;Agathe Guilloux", "authorids": "~Linus_Bleistein1;~Van_Tuan_NGUYEN1;~Adeline_Fermanian1;~Agathe_Guilloux1", "gender": "M;M;;F", "homepage": "https://linusbleistein.com/;;https://afermanian.github.io;https://sites.google.com/view/agatheguilloux-personalwebsite/", "dblp": "338/9094;;;", "google_scholar": "N5js_UkAAAAJ;;3Mfclk8AAAAJ;", "orcid": ";;;0000-0003-0473-1970", "linkedin": "linus-bleistein-431388114/;van-tuan-nguyen/;;", "or_profile": "~Linus_Bleistein1;~Van_Tuan_NGUYEN1;~Adeline_Fermanian1;~Agathe_Guilloux1", "aff": "INRIA;Universit\u00e9 Paris Cit\u00e9;Califrais;INRIA", "aff_domain": "inria.fr;paris.fr;califrais.fr;inria.fr", "position": "PhD student;PhD student;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nbleistein2024dynamic,\ntitle={Dynamic Survival Analysis with Controlled Latent States},\nauthor={Linus Bleistein and Van Tuan NGUYEN and Adeline Fermanian and Agathe Guilloux},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xGlVkBSDdt}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3526016, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17498941863873751807&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "inria.fr;paris.fr;califrais.fr;inria.fr", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "INRIA;Universit\u00e9 Paris Cit\u00e9;Califrais", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.universite-paris.fr;https://califrais.fr", "aff_unique_abbr": "INRIA;UPC;Califrais", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Mastering Robot Manipulation with Multimodal Prompts through Pretraining and Multi-task Fine-tuning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32719", "id": "xIRKB5nRJl", "proceeding": "https://proceedings.mlr.press/v235/li24x.html", "pdf": "https://openreview.net/pdf?id=xIRKB5nRJl", "openreview": "https://openreview.net/forum?id=xIRKB5nRJl", "author_site": "Jiachen Li, Qiaozi Gao, Michael Johnston, Xiaofeng Gao, Xuehai He, Hangjie Shi, Suhaila Shakiah, Reza Ghanadan, William Wang", "tldr": "", "abstract": "Prompt-based learning has been demonstrated as a compelling paradigm contributing to the tremendous success of large language models (LLMs).
Inspired by their success in language tasks, existing research has leveraged LLMs in embodied instruction following and task planning. In this work, we tackle the problem of training a robot to understand multimodal prompts, interleaving vision signals with text descriptions. This type of task poses a major challenge to robots' capability to understand the interconnection and complementarity between vision and language signals. We introduce an effective framework that learns a policy to perform robot manipulation with multimodal prompts from multi-task expert trajectories. Our method consists of a two-stage training pipeline that performs inverse dynamics pretraining and multi-task finetuning. To facilitate multimodal understanding, we design our multimodal prompt encoder by augmenting a pretrained LM with a residual connection to the visual input and by modeling the dependencies among action dimensions. Empirically, we evaluate the efficacy of our method on VIMA-BENCH and establish a new state-of-the-art (10% improvement in success rate). Moreover, we demonstrate that our model exhibits remarkable in-context learning ability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiachen Li;Qiaozi Gao;Michael Johnston;Xiaofeng Gao;Xuehai He;Hangjie Shi;Suhaila Shakiah;Reza Ghanadan;William Yang Wang", "authorids": "~Jiachen_Li6;~Qiaozi_Gao1;~Michael_Johnston1;~Xiaofeng_Gao1;~Xuehai_He1;~Hangjie_Shi1;~Suhaila_Shakiah1;~Reza_Ghanadan1;~William_Yang_Wang2", "gender": "M;M;M;M;M;M;F;M;M", "homepage": "https://sites.google.com/view/jiachenli/;;;https://xfgao.github.io/;;;https://www.linkedin.com/in/suhailashakiah/;;https://www.cs.ucsb.edu/~william/", "dblp": ";173/1986;77/2529;95/6947-2;251/0763;;271/2364;50/5680;08/9282", "google_scholar": "https://scholar.google.com/citations?hl=en;Ub3LlsgAAAAJ;;AjTfCjEAAAAJ;kDzxOzUAAAAJ;7r5shcMAAAAJ;IP6H8LYAAAAJ;00ncu3cAAAAJ;gf8Ms_8AAAAJ", "orcid": ";;;0000-0003-3331-9846;;;;;", "linkedin": ";;mjrjohnston/;;;hangjie-shi-7565903b;suhailashakiah/;reza-ghanadan-ph-d-mba-820756;", "or_profile": "~Jiachen_Li6;~Qiaozi_Gao1;~Michael_Johnston1;~Xiaofeng_Gao1;~Xuehai_He1;~Hangjie_Shi1;~Suhaila_Shakiah1;~Reza_Ghanadan1;~William_Wang1", "aff": "University of California, Santa Barbara;Amazon;Amazon;Amazon;University of California, Santa Cruz;;Amazon;University of Maryland, College Park;UC Santa Barbara", "aff_domain": "ucsb.edu;amazon.com;amazon.com;amazon.com;ucsc.edu;;amazon.com;umd.edu;ucsb.edu", "position": "PhD student;Scientist;Principal Researcher;Scientist;PhD student;;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nli2024mastering,\ntitle={Mastering Robot Manipulation with Multimodal Prompts through Pretraining and Multi-task Fine-tuning},\nauthor={Jiachen Li and Qiaozi Gao and Michael Johnston and Xiaofeng Gao and Xuehai He and Hangjie Shi and Suhaila Shakiah and Reza Ghanadan and William Yang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xIRKB5nRJl}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4292859, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11429149269156367262&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "email": "ucsb.edu;amazon.com;amazon.com;amazon.com;ucsc.edu;;amazon.com;umd.edu;ucsb.edu", "author_num": 9, "aff_unique_index": "0;1;1;1;2;1;3;0", "aff_unique_norm": "University
of California, Santa Barbara;Amazon;University of California, Santa Cruz;University of Maryland", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://www.ucsb.edu;https://www.amazon.com;https://www.ucsc.edu;https://www.umd.edu", "aff_unique_abbr": "UCSB;Amazon;UCSC;UMD", "aff_campus_unique_index": "0;2;3;0", "aff_campus_unique": "Santa Barbara;;Santa Cruz;College Park", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "LLM-Empowered State Representation for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32718", "id": "xJMZbdiQnf", "proceeding": "https://proceedings.mlr.press/v235/wang24bh.html", "pdf": "https://openreview.net/pdf?id=xJMZbdiQnf", "openreview": "https://openreview.net/forum?id=xJMZbdiQnf", "author_site": "Boyuan Wang, Yun Qu, Yuhang Jiang, Jianzhun Shao, Chang Liu, Wenming Yang, Xiangyang Ji", "tldr": "", "abstract": "Conventional state representations in reinforcement learning often omit critical task-related details, presenting a significant challenge for value networks in establishing accurate mappings from states to task rewards. Traditional methods typically depend on extensive sample learning to enrich state representations with task-specific information, which leads to low sample efficiency and high time costs. Recently, the surge of knowledgeable large language models (LLMs) has provided promising substitutes for prior injection with minimal human intervention. Motivated by this, we propose LLM-Empowered State Representation (LESR), a novel approach that utilizes an LLM to autonomously generate task-related state representation code, which helps to enhance the continuity of network mappings and facilitate efficient training. Experimental results demonstrate that LESR exhibits high sample efficiency and outperforms state-of-the-art baselines by an average of **29%** in accumulated reward in Mujoco tasks and **30%** in success rates in Gym-Robotics tasks.
Codes of LESR are accessible at https://github.com/thu-rllab/LESR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Boyuan Wang;Yun Qu;Yuhang Jiang;Jianzhun Shao;Chang Liu;Wenming Yang;Xiangyang Ji", "authorids": "~Boyuan_Wang1;~Yun_Qu2;~Yuhang_Jiang3;~Jianzhun_Shao1;~Chang_Liu9;~Wenming_Yang1;~Xiangyang_Ji1", "gender": "M;M;;M;M;M;", "homepage": "https://github.com/BoyuanWang-hub;https://github.com/cloud-qu;;https://github.com/qyz55;https://www.au.tsinghua.edu.cn/en/info/1096/3484.htm;https://www.sigs.tsinghua.edu.cn/ywm_en/main.htm;", "dblp": ";80/10774-2;239/4567;263/2309;52/5716-42;75/2339.html;", "google_scholar": ";l9Ky9goAAAAJ;https://scholar.google.com/citations?hl=en;;vsh1WP4AAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": ";0009-0000-1803-8435;;;0000-0001-6747-0646;0000-0002-2506-1286;", "linkedin": ";;;;;;", "or_profile": "~Boyuan_Wang1;~Yun_Qu2;~Yuhang_Jiang3;~Jianzhun_Shao1;~Chang_Liu9;~Wenming_Yang1;~Xiangyang_Ji1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University,;", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;", "position": "MS student;PhD student;PhD student;PhD student;Postdoc;Associate Professor;", "bibtex": "@inproceedings{\nwang2024llmempowered,\ntitle={{LLM}-Empowered State Representation for Reinforcement Learning},\nauthor={Boyuan Wang and Yun Qu and Yuhang Jiang and Jianzhun Shao and Chang Liu and Wenming Yang and Xiangyang Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xJMZbdiQnf}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6047046, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10447968698401634952&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Graph Neural Stochastic Diffusion for Estimating Uncertainty in Node Classification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32717", "id": "xJUhgvM2u8", "proceeding": "https://proceedings.mlr.press/v235/lin24x.html", "pdf": "https://openreview.net/pdf?id=xJUhgvM2u8", "openreview": "https://openreview.net/forum?id=xJUhgvM2u8", "author_site": "Xixun Lin, Wenxiao Zhang, Fengzhao Shi, Chuan Zhou, Lixin Zou, Xiangyu Zhao, Dawei Yin, Shirui Pan, Yanan Cao", "tldr": "", "abstract": "Graph neural networks (GNNs) have advanced the state of the art in various domains. Despite their remarkable success, the uncertainty estimation of GNN predictions remains under-explored, which limits their practical applications especially in risk-sensitive areas. Current works suffer from either intractable posteriors or inflexible prior specifications, leading to sub-optimal empirical results. 
In this paper, we present graph neural stochastic diffusion (GNSD), a novel framework for estimating predictive uncertainty on graphs by establishing theoretical connections between GNNs and stochastic partial differential equations. GNSD represents a GNN-based parameterization of the proposed graph stochastic diffusion equation, which includes a $Q$-Wiener process to model the stochastic evolution of node representations. GNSD introduces a drift network to guarantee accurate prediction and a stochastic forcing network to model the propagation of epistemic uncertainty among nodes. Extensive experiments are conducted on multiple detection tasks, demonstrating that GNSD yields superior performance over existing strong approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xixun Lin;Wenxiao Zhang;Fengzhao Shi;Chuan Zhou;Lixin Zou;Xiangyu Zhao;Dawei Yin;Shirui Pan;Yanan Cao", "authorids": "~Xixun_Lin3;~Wenxiao_Zhang2;~Fengzhao_Shi1;~Chuan_Zhou3;~Lixin_Zou1;~Xiangyu_Zhao1;~Dawei_Yin1;~Shirui_Pan1;~Yanan_Cao1", "gender": "M;;M;M;M;M;M;;F", "homepage": "https://linxixun.github.io/;;;http://www.chuanzhou.online/;https://www.zoulixin.site/;https://zhaoxyai.github.io/;https://www.yindawei.com/;;", "dblp": "190/7231;;;https://dblp.uni-trier.de/pid/52/564-1;193/4216;08/890-1.html;;91/8171;97/5152-1", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=pbh2EnkAAAAJ;4oBUWVEAAAAJ;J8tHYjIAAAAJ;;GuQ9bpAAAAAJ;https://scholar.google.com.au/citations?user=frWRJN4AAAAJ;", "orcid": "0009-0004-6645-0597;0009-0000-6693-1340;;0000-0001-9958-8673;0000-0001-6755-871X;0000-0003-2926-4416;0000-0002-0684-6205;0000-0003-0794-527X;0000-0003-3534-1094", "linkedin": ";;;;;;dwyin/;;", "or_profile": "~Xixun_Lin3;~Wenxiao_Zhang2;~Fengzhao_Shi1;~Chuan_Zhou3;~Lixin_Zou1;~Xiangyu_Zhao1;~Dawei_Yin1;~Shirui_Pan1;~Yanan_Cao1", "aff": "Institute of Information Engineering, Chinese Academy of Sciences;Beijing Jiaotong University;University of Chinese Academy of Sciences;Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences;Wuhan University;City University of Hong Kong;Baidu;Griffith University;Institute of Information Engineering, Chinese Academy of Sciences", "aff_domain": "iie.ac.cn;bjtu.edu.cn;ucas.ac.cn;amss.ac.cn;whu.edu.cn;cityu.edu.hk;baidu.com;griffith.edu.au;iie.ac.cn", "position": "Assistant Professor;PhD student;PhD student;Associate Professor;Associate Professor;Assistant Professor;Principal Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nlin2024graph,\ntitle={Graph Neural Stochastic Diffusion for Estimating Uncertainty in Node Classification},\nauthor={Xixun Lin and Wenxiao Zhang and Fengzhao Shi and Chuan Zhou and Lixin Zou and Xiangyu Zhao and Dawei Yin and Shirui Pan and Yanan Cao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xJUhgvM2u8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2806537, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6343973362820813705&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "iie.ac.cn;bjtu.edu.cn;ucas.ac.cn;amss.ac.cn;whu.edu.cn;cityu.edu.hk;baidu.com;griffith.edu.au;iie.ac.cn", "author_num": 9, "aff_unique_index": "0;1;2;0;3;4;5;6;0", "aff_unique_norm": "Chinese Academy of Sciences;Beijing Jiao Tong
University;University of Chinese Academy of Sciences;Wuhan University;City University of Hong Kong;Baidu;Griffith University", "aff_unique_dep": "Institute of Information Engineering;;;;;Baidu, Inc.;", "aff_unique_url": "http://www.cas.cn;http://www.njtu.edu.cn/en;http://www.ucas.ac.cn;http://www.whu.edu.cn/;https://www.cityu.edu.hk;https://www.baidu.com;https://www.griffith.edu.au", "aff_unique_abbr": "CAS;BJTU;UCAS;WHU;CityU;Baidu;Griffith", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0", "aff_country_unique": "China;Australia" }, { "title": "Do Efficient Transformers Really Save Computation?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32716", "id": "xLikRS9OhW", "proceeding": "https://proceedings.mlr.press/v235/yang24a.html", "pdf": "https://openreview.net/pdf?id=xLikRS9OhW", "openreview": "https://openreview.net/forum?id=xLikRS9OhW", "author_site": "Kai Yang, Jan Ackermann, Zhenyu He, Guhao Feng, Bohang Zhang, Yunzhen Feng, Qiwei Ye, Di He, Liwei Wang", "tldr": "", "abstract": "As transformer-based language models are trained on increasingly large datasets and with vast numbers of parameters, finding more efficient alternatives to the standard Transformer has become very valuable. While many efficient Transformers and Transformer alternatives have been proposed, none provide theoretical guarantees that they are a suitable replacement for the standard Transformer. This makes it challenging to identify when to use a specific model and what directions to prioritize for further investigation. In this paper, we aim to understand the capabilities and limitations of efficient Transformers, specifically the Sparse Transformer and the Linear Transformer. We focus on their reasoning capability as exhibited by Chain-of-Thought (CoT) prompts and follow previous works to model them as Dynamic Programming (DP) problems. Our results show that while these models are expressive enough to solve general DP tasks, contrary to expectations, they require a model size that scales with the problem size. Nonetheless, we identify a class of DP problems for which these models can be more efficient than the standard Transformer. 
We confirm our theoretical results through experiments on representative DP tasks, adding to the understanding of efficient Transformers' practical strengths and weaknesses.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kai Yang;Jan Ackermann;Zhenyu He;Guhao Feng;Bohang Zhang;Yunzhen Feng;Qiwei Ye;Di He;Liwei Wang", "authorids": "~Kai_Yang12;~Jan_Ackermann1;~Zhenyu_He3;~Guhao_Feng1;~Bohang_Zhang1;~Yunzhen_Feng1;~Qiwei_Ye1;~Di_He1;~Liwei_Wang1", "gender": ";;M;M;M;M;M;M;M", "homepage": "https://yk-youngk.github.io/;;https://zhenyuhe00.github.io/;;https://zbh2047.github.io;https://fengyzpku.github.io;;https://dihe-pku.github.io/;http://www.liweiwang-pku.com/", "dblp": ";;355/4626;;276/0156.html;254/4752;50/995;74/184;", "google_scholar": "GODkQEkAAAAJ;;https://scholar.google.co.jp/citations?user=bKwkUO4AAAAJ;wmDqYvUAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;QebzOsIAAAAJ;RJ6SuR8AAAAJ;https://scholar.google.co.jp/citations?user=orVoz4IAAAAJ;VZHxoh8AAAAJ", "orcid": "0009-0006-4848-7382;;;;;;0000-0003-4264-5846;;", "linkedin": ";;;;zhangbohang;;qiwei-ye-15282964/;;", "or_profile": "~Kai_Yang12;~Jan_Ackermann1;~Zhenyu_He3;~Guhao_Feng1;~Bohang_Zhang1;~Yunzhen_Feng1;~Qiwei_Ye1;~Di_He1;~Liwei_Wang1", "aff": "Peking University;;Peking University;Peking University;Peking University;Meta FAIR;Beijing Academy of Artificial Intelligence;Microsoft;Peking University", "aff_domain": "stu.pku.edu.cn;;pku.edu.cn;pku.edu.cn;pku.edu.cn;meta.com;baai.ac.cn;microsoft.com;pku.edu.cn", "position": "Undergrad student;;PhD student;Undergrad student;PhD student;Intern;Principal Researcher;Senior Researcher;Full Professor", "bibtex": "@inproceedings{\nyang2024do,\ntitle={Do Efficient Transformers Really Save Computation?},\nauthor={Kai Yang and Jan Ackermann and Zhenyu He and Guhao Feng and Bohang Zhang and Yunzhen Feng and Qiwei Ye and Di He and Liwei Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xLikRS9OhW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 541389, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16485155307983291066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "stu.pku.edu.cn;;pku.edu.cn;pku.edu.cn;pku.edu.cn;meta.com;baai.ac.cn;microsoft.com;pku.edu.cn", "author_num": 9, "aff_unique_index": "0;0;0;0;1;2;3;0", "aff_unique_norm": "Peking University;Meta;Beijing Academy of Artificial Intelligence;Microsoft", "aff_unique_dep": ";Meta Platforms, Inc.;;Microsoft Corporation", "aff_unique_url": "http://www.pku.edu.cn;https://meta.com;https://www.baaic.cn;https://www.microsoft.com", "aff_unique_abbr": "Peking U;Meta;BAAI;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Critical feature learning in deep neural networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32715", "id": "xMJT4XW468", "proceeding": "https://proceedings.mlr.press/v235/fischer24a.html", "pdf": "https://openreview.net/pdf?id=xMJT4XW468", "openreview": "https://openreview.net/forum?id=xMJT4XW468", "author_site": "Kirsten Fischer, Javed Lindner, David Dahmen, Zohar Ringel, Michael Kr\u00e4mer, Moritz Helias", "tldr": "", "abstract": "A key property of neural networks driving their success is their ability to learn features 
from data. Understanding feature learning from a theoretical viewpoint is an emerging field with many open questions. In this work we capture finite-width effects with a systematic theory of network kernels in deep non-linear neural networks. We show that the Bayesian prior of the network can be written in closed form as a superposition of Gaussian processes, whose kernels are distributed with a variance that depends inversely on the network width $N$. A large deviation approach, which is exact in the proportional limit for the number of data points $P=\\alpha N\\to\\infty$, yields a pair of forward-backward equations for the maximum a posteriori kernels in all layers at once. We study their solutions perturbatively, to demonstrate how the backward propagation across layers aligns kernels with the target. An alternative field-theoretic formulation shows that kernel adaptation of the Bayesian posterior at finite-width results from fluctuations in the prior: larger fluctuations correspond to a more flexible network prior and thus enable stronger adaptation to data. We thus find a bridge between the classical edge-of-chaos NNGP theory and feature learning, exposing an intricate interplay between criticality, response functions, and feature scale.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kirsten Fischer;Javed Lindner;David Dahmen;Zohar Ringel;Michael Kr\u00e4mer;Moritz Helias", "authorids": "~Kirsten_Fischer1;~Javed_Lindner1;~David_Dahmen1;~Zohar_Ringel1;~Michael_Kr\u00e4mer1;~Moritz_Helias1", "gender": "F;M;M;M;M;M", "homepage": ";;https://www.fz-juelich.de/profile/dahmen_d;http://old.phys.huji.ac.il/~zohar.ringel/;https://web.physik.rwth-aachen.de/~mkraemer/;https://www.fz-juelich.de/profile/helias_m", "dblp": ";;;;;79/8535", "google_scholar": ";;https://scholar.google.de/citations?hl=de;https://scholar.google.co.il/citations?user=8-8VIDgAAAAJ;;https://scholar.google.de/citations?user=NZQ_gSAAAAAJ", "orcid": "0000-0001-9973-9953;0009-0008-9913-9040;0000-0002-7664-916X;;0000-0002-3089-6827;0000-0002-0404-8656", "linkedin": ";https://de.linkedin.com/in/javed-lindner-7a0075179;;;;", "or_profile": "~Kirsten_Fischer1;~Javed_Lindner1;~David_Dahmen1;~Zohar_Ringel1;~Michael_Kr\u00e4mer1;~Moritz_Helias1", "aff": "Juelich Research Center;Rheinisch Westf\u00e4lische Technische Hochschule Aachen;Forschungszentrum J\u00fclich;Hebrew University of Jerusalem, Israel;RWTH Aachen University;Forschungszentrum J\u00fclich", "aff_domain": "fz-juelich.de;rwth-aachen.de;fz-juelich.de;huji.ac.il;rwth-aachen.de;fz-juelich.de", "position": "PhD student;PhD student;Postdoc;Associate Professor;Full Professor;Principal Researcher", "bibtex": "@inproceedings{\nfischer2024critical,\ntitle={Critical feature learning in deep neural networks},\nauthor={Kirsten Fischer and Javed Lindner and David Dahmen and Zohar Ringel and Michael Kr{\\\"a}mer and Moritz Helias},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xMJT4XW468}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 672820, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12971799354742990993&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 6, "email": "fz-juelich.de;rwth-aachen.de;fz-juelich.de;huji.ac.il;rwth-aachen.de;fz-juelich.de", "author_num": 6, "aff_unique_index": "0;1;2;3;1;2", "aff_unique_norm": "Juelich Research Center;RWTH Aachen 
University;Forschungszentrum J\u00fclich;Hebrew University of Jerusalem", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.fz-juelich.de/;https://www.rwth-aachen.de;https://www.fz-juelich.de;https://www.huji.ac.il", "aff_unique_abbr": "FZ J\u00fclich;RWTH;FZJ;HUJI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Aachen", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "Germany;Israel" }, { "title": "A New Branch-and-Bound Pruning Framework for $\\ell_0$-Regularized Problems", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32714", "id": "xPmSNLle1w", "proceeding": "https://proceedings.mlr.press/v235/theo24a.html", "pdf": "https://openreview.net/pdf?id=xPmSNLle1w", "openreview": "https://openreview.net/forum?id=xPmSNLle1w", "author_site": "Guyard Theo, C\u00e9dric Herzet, Cl\u00e9ment Elvira, Ayse-Nur Arslan", "tldr": "", "abstract": "We consider the resolution of learning problems involving $\\ell_0$-regularization via Branch-and-Bound (BnB) algorithms. These methods explore regions of the feasible space of the problem and check, through \u201cpruning tests\u201d, whether they contain no solutions. In standard implementations, evaluating a pruning test requires solving a convex optimization problem, which may result in computational bottlenecks. In this paper, we present an alternative way to implement pruning tests for a generic family of $\\ell_0$-regularized problems. Our proposed procedure allows the simultaneous assessment of several regions and can be embedded in standard BnB implementations with negligible computational overhead. We show through numerical simulations that our pruning strategy can improve the solving time of BnB procedures by several orders of magnitude for typical problems encountered in machine-learning applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Theo Guyard;C\u00e9dric Herzet;Cl\u00e9ment Elvira;Ayse-Nur Arslan", "authorids": "~Theo_Guyard1;cedric.herzet@ensai.fr;~Cl\u00e9ment_Elvira1;ayse-nur.arslan@inria.fr", "gender": "M;;M;", "homepage": "http://theoguyard.github.io/;;https://c-elvira.github.io/;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Theo_Guyard1;cedric.herzet@ensai.fr;~Cl\u00e9ment_Elvira1;ayse-nur.arslan@inria.fr", "aff": "Insa and Inria Rennes;;CentraleSupelec;", "aff_domain": "insa-rennes.fr;;centralesupelec.fr;", "position": "PhD student;;Associate Professor;", "bibtex": "@inproceedings{\ntheo2024a,\ntitle={A New Branch-and-Bound Pruning Framework for \\${\\textbackslash}ell\\_0\\$-Regularized Problems},\nauthor={Theo Guyard and C{\\'e}dric Herzet and Cl{\\'e}ment Elvira and Ayse-Nur Arslan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xPmSNLle1w}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 455947, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1070988763065687429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "insa-rennes.fr;;centralesupelec.fr;", "author_num": 4, "aff_unique_index": "0;1", "aff_unique_norm": "Institut National des Sciences Appliqu\u00e9es de Rennes;CentraleSup\u00e9lec", "aff_unique_dep": ";", "aff_unique_url": "https://www.insa-rennes.fr;https://www.centralesupelec.fr", "aff_unique_abbr": "INSA Rennes;CS", "aff_campus_unique_index": "0",
"aff_campus_unique": "Rennes;", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "FrameQuant: Flexible Low-Bit Quantization for Transformers", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32713", "id": "xPypr0kufs", "proceeding": "https://proceedings.mlr.press/v235/adepu24a.html", "pdf": "https://openreview.net/pdf?id=xPypr0kufs", "openreview": "https://openreview.net/forum?id=xPypr0kufs", "author_site": "Harshavardhan Adepu, Zhanpeng Zeng, Li Zhang, Vikas Singh", "tldr": "", "abstract": "Transformers are the backbone of powerful foundation models for many Vision and Natural Language Processing tasks. But their compute and memory/storage footprint is large, and so, serving such models is expensive often requiring high-end hardware. To mitigate this difficulty, Post-Training Quantization seeks to modify a pre-trained model and quantize it to eight bits or lower, significantly boosting compute/memory/latency efficiency. Such models have been successfully quantized to four bits with some performance loss. In this work, we outline a simple scheme to quantize Transformer-based models to just two bits (plus some overhead) with only a small drop in accuracy. Key to our formulation is a concept borrowed from Harmonic analysis called Fusion Frames. Our main finding is that the quantization must take place not in the original weight space, but instead in the Fusion Frame representations. If quantization is interpreted as the addition of noise, our casting of the problem allows invoking an extensive body of known consistent recovery and noise robustness guarantees. Further, if desired, de-noising filters are known in closed form. We show empirically, via a variety of experiments, that (almost) two-bit quantization for Transformer models promises sizable efficiency gains. 
The code is available at https://github.com/vsingh-group/FrameQuant", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harshavardhan Adepu;Zhanpeng Zeng;Li Zhang;Vikas Singh", "authorids": "~Harshavardhan_Adepu1;~Zhanpeng_Zeng1;~Li_Zhang28;~Vikas_Singh1", "gender": "M;M;M;M", "homepage": "https://harshauwm163.github.io/;;https://research.google/people/105588/;http://vsingh-www.cs.wisc.edu/", "dblp": "372/1508;284/9150;;", "google_scholar": "dOUqv1AAAAAJ;P9ctuRUAAAAJ;;d32BmwcAAAAJ", "orcid": ";;;", "linkedin": "adepu-harshavardhan-9a7006ba/;;;", "or_profile": "~Harshavardhan_Adepu1;~Zhanpeng_Zeng1;~Li_Zhang28;~Vikas_Singh1", "aff": "Google;University of Wisconsin, Madison;Google;University of Wisconsin, Madison", "aff_domain": "google.com;wisc.edu;google.com;wisc.edu", "position": "Intern;PhD student;Software engineer;Professor", "bibtex": "@inproceedings{\nadepu2024framequant,\ntitle={FrameQuant: Flexible Low-Bit Quantization for Transformers},\nauthor={Harshavardhan Adepu and Zhanpeng Zeng and Li Zhang and Vikas Singh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xPypr0kufs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7508428, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14033327830665183008&as_sdt=5,34&sciodt=0,34&hl=en", "gs_version_total": 8, "email": "google.com;wisc.edu;google.com;wisc.edu", "author_num": 4, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Google;University of Wisconsin", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.wisc.edu", "aff_unique_abbr": "Google;UW", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Mountain View;Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Temporal Distances: Contrastive Successor Features Can Provide a Metric Structure for Decision-Making", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32712", "id": "xQiYCmDrjp", "proceeding": "https://proceedings.mlr.press/v235/myers24a.html", "pdf": "https://openreview.net/pdf?id=xQiYCmDrjp", "openreview": "https://openreview.net/forum?id=xQiYCmDrjp", "author_site": "Vivek Myers, Chongyi Zheng, Anca Dragan, Sergey Levine, Benjamin Eysenbach", "tldr": "", "abstract": "Temporal distances lie at the heart of many algorithms for planning, control, and reinforcement learning that involve reaching goals, allowing one to estimate the transit time between two states. However, prior attempts to define such temporal distances in stochastic settings have been stymied by an important limitation: these prior approaches do not satisfy the triangle inequality. This is not merely a definitional concern, but translates to an inability to generalize and find shortest paths. In this paper, we build on prior work in contrastive learning and quasimetrics to show how successor features learned by contrastive learning (after a change of variables) form a temporal distance that does satisfy the triangle inequality, even in stochastic settings. Importantly, this temporal distance is computationally efficient to estimate, even in high-dimensional and stochastic settings. 
Experiments in controlled settings and benchmark suites demonstrate that an RL algorithm based on these new temporal distances exhibits combinatorial generalization (i.e., \"stitching\") and can sometimes learn more quickly than prior methods, including those based on quasimetrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vivek Myers;Chongyi Zheng;Anca Dragan;Sergey Levine;Benjamin Eysenbach", "authorids": "~Vivek_Myers1;~Chongyi_Zheng1;~Anca_Dragan1;~Sergey_Levine1;~Benjamin_Eysenbach1", "gender": ";M;F;M;M", "homepage": "https://people.eecs.berkeley.edu/~vmyers/;https://chongyi-zheng.github.io;http://www.ancadragan.com/;https://people.eecs.berkeley.edu/~svlevine/;https://ben-eysenbach.github.io/", "dblp": "270/8694;250/9267;;80/7594;192/1863", "google_scholar": "5NGAbT4AAAAJ;bezWXYcAAAAJ;;8R35rCwAAAAJ;DRnOvU8AAAAJ", "orcid": ";;;;0009-0000-7136-6307", "linkedin": ";;;;benjamin-eysenbach-a7235775/", "or_profile": "~Vivek_Myers1;~Chongyi_Zheng1;~Anca_Dragan1;~Sergey_Levine1;~Benjamin_Eysenbach1", "aff": "University of California, Berkeley;Princeton University;University of California, Berkeley;Google;Princeton University", "aff_domain": "berkeley.edu;princeton.edu;berkeley.edu;google.com;princeton.edu", "position": "PhD student;PhD student;Associate Professor;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nmyers2024learning,\ntitle={Learning Temporal Distances: Contrastive Successor Features Can Provide a Metric Structure for Decision-Making},\nauthor={Vivek Myers and Chongyi Zheng and Anca Dragan and Sergey Levine and Benjamin Eysenbach},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xQiYCmDrjp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1702891, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13617809299268668571&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "berkeley.edu;princeton.edu;berkeley.edu;google.com;princeton.edu", "author_num": 5, "aff_unique_index": "0;1;0;2;1", "aff_unique_norm": "University of California, Berkeley;Princeton University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.princeton.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Princeton;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Achieving Margin Maximization Exponentially Fast via Progressive Norm Rescaling", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32711", "id": "xS2YKQlBIZ", "proceeding": "https://proceedings.mlr.press/v235/wang24ax.html", "pdf": "https://openreview.net/pdf?id=xS2YKQlBIZ", "openreview": "https://openreview.net/forum?id=xS2YKQlBIZ", "author_site": "Mingze Wang, Zeping Min, Lei Wu", "tldr": "", "abstract": "In this work, we investigate the margin-maximization bias exhibited by gradient-based algorithms in classifying linearly separable data. We present an in-depth analysis of the specific properties of the velocity field associated with (normalized) gradients, focusing on their role in margin maximization. Inspired by this analysis, we propose a novel algorithm called Progressive Rescaling Gradient Descent (PRGD) and show that PRGD can maximize the margin at an *exponential rate*. 
This stands in stark contrast to all existing algorithms, which maximize the margin at a slow *polynomial rate*. Specifically, we identify mild conditions on data distribution under which existing algorithms such as gradient descent (GD) and normalized gradient descent (NGD) *provably fail* in maximizing the margin efficiently. To validate our theoretical findings, we present both synthetic and real-world experiments. Notably, PRGD also shows promise in enhancing the generalization performance when applied to linearly non-separable datasets and deep neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingze Wang;Zeping Min;Lei Wu", "authorids": "~Mingze_Wang2;~Zeping_Min1;~Lei_Wu1", "gender": ";;M", "homepage": "https://wmz9.github.io/;;https://leiwu0.github.io/", "dblp": "296/7556;;", "google_scholar": "CkU47X0AAAAJ;;CMweeYcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mingze_Wang2;~Zeping_Min1;~Lei_Wu1", "aff": "Peking University;;Peking University", "aff_domain": "pku.edu.cn;;math.pku.edu.cn", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nwang2024achieving,\ntitle={Achieving Margin Maximization Exponentially Fast via Progressive Norm Rescaling},\nauthor={Mingze Wang and Zeping Min and Lei Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xS2YKQlBIZ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1020876, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17621373031727627894&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "pku.edu.cn;;math.pku.edu.cn", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Translating Subgraphs to Nodes Makes Simple GNNs Strong and Efficient for Subgraph Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32710", "id": "xSizvCoI79", "proceeding": "https://proceedings.mlr.press/v235/kim24aa.html", "pdf": "https://openreview.net/pdf?id=xSizvCoI79", "openreview": "https://openreview.net/forum?id=xSizvCoI79", "author_site": "Dongkwan Kim, Alice Oh", "tldr": "", "abstract": "Subgraph representation learning has emerged as an important problem, but it is by default approached with specialized graph neural networks on a large global graph. These models demand extensive memory and computational resources but challenge modeling hierarchical structures of subgraphs. In this paper, we propose Subgraph-To-Node (S2N) translation, a novel formulation for learning representations of subgraphs. Specifically, given a set of subgraphs in the global graph, we construct a new graph by coarsely transforming subgraphs into nodes. Demonstrating both theoretical and empirical evidence, S2N not only significantly reduces memory and computational costs compared to state-of-the-art models but also outperforms them by capturing both local and global structures of the subgraph. By leveraging graph coarsening methods, our method outperforms baselines even in a data-scarce setting with insufficient subgraphs. 
Our experiments on eight benchmarks demonstrate that fine-tuned models with S2N translation can process 183 -- 711 times more subgraph samples than state-of-the-art models at a better or similar performance level.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dongkwan Kim;Alice Oh", "authorids": "~Dongkwan_Kim1;~Alice_Oh1", "gender": "M;F", "homepage": "https://dongkwan-kim.github.io/;http://uilab.kr", "dblp": "62/10307-1.html;50/7562", "google_scholar": "KgjSE64AAAAJ;https://scholar.google.co.kr/citations?user=B88-xMEAAAAJ", "orcid": ";", "linkedin": "dongkwan-kim/;alice-oh-4677544/", "or_profile": "~Dongkwan_Kim1;~Alice_Oh1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nkim2024translating,\ntitle={Translating Subgraphs to Nodes Makes Simple {GNN}s Strong and Efficient for Subgraph Representation Learning},\nauthor={Dongkwan Kim and Alice Oh},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xSizvCoI79}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1839141, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11161318104989236287&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.ac.kr", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "CF-OPT: Counterfactual Explanations for Structured Prediction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32709", "id": "xSkIxKdO08", "proceeding": "https://proceedings.mlr.press/v235/vivier-ardisson24a.html", "pdf": "https://openreview.net/pdf?id=xSkIxKdO08", "openreview": "https://openreview.net/forum?id=xSkIxKdO08", "author_site": "Germain Vivier-Ardisson, Alexandre Forel, Axel Parmentier, Thibaut Vidal", "tldr": "", "abstract": "Optimization layers in deep neural networks have enjoyed growing popularity in structured learning, improving the state of the art on a variety of applications. Yet, these pipelines lack interpretability since they are made of two opaque layers: a highly non-linear prediction model, such as a deep neural network, and an optimization layer, which is typically a complex black-box solver. Our goal is to improve the transparency of such methods by providing counterfactual explanations. We build upon variational autoencoders as a principled way of obtaining counterfactuals: working in the latent space leads to a natural notion of plausibility of explanations. We finally introduce a variant of the classic loss for VAE training that improves performance in our specific structured context. These provide the foundations of CF-OPT, a first-order optimization algorithm that can find counterfactual explanations for a broad class of structured learning architectures.
Our numerical results show that both close and plausible explanations can be obtained for problems from the recent literature.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Germain Vivier-Ardisson;Alexandre Forel;Axel Parmentier;Thibaut Vidal", "authorids": "~Germain_Vivier-Ardisson1;~Alexandre_Forel1;~Axel_Parmentier1;~Thibaut_Vidal1", "gender": "M;M;M;M", "homepage": ";https://alexforel.github.io/;https://cermics.enpc.fr/~parmenta/;https://w1.cirrelt.ca/~vidalt/en/home-thibaut-vidal.html", "dblp": ";321/1816;150/6250;40/11481", "google_scholar": "IOVEdbAAAAAJ;https://scholar.google.ca/citations?user=DcR5I4cAAAAJ;https://scholar.google.fr/citations?hl=fr;https://scholar.google.com.tw/citations?user=qbO0xwUAAAAJ", "orcid": ";0000-0002-9868-4804;0000-0003-1762-4947;0000-0001-5183-8485", "linkedin": "germain-vivier-ardisson/;alexandre-forel-275267147/;axel-parmentier-466548148/;thibaut-vidal-7a877055/", "or_profile": "~Germain_Vivier-Ardisson1;~Alexandre_Forel1;~Axel_Parmentier1;~Thibaut_Vidal1", "aff": "\u00c9cole Polytechnique;Polytechnique Montr\u00e9al;Ecole Nationale des Ponts et Chausees;Polytechnique Montreal", "aff_domain": "polytechnique.edu;polymtl.ca;enpc.fr;polymtl.ca", "position": "MS student;Postdoc;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nvivier-ardisson2024cfopt,\ntitle={{CF}-{OPT}: Counterfactual Explanations for Structured Prediction},\nauthor={Germain Vivier-Ardisson and Alexandre Forel and Axel Parmentier and Thibaut Vidal},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xSkIxKdO08}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4946325, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:u46hW4LQawEJ:scholar.google.com/&scioq=CF-OPT:+Counterfactual+Explanations+for+Structured+Prediction&hl=en&as_sdt=0,5", "gs_version_total": 7, "email": "polytechnique.edu;polymtl.ca;enpc.fr;polymtl.ca", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Ecole Polytechnique;Polytechnique Montr\u00e9al;Ecole Nationale des Ponts et Chaussees;Polytechnique Montreal", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.polytechnique.edu;https://www.polymtl.ca;https://www.enpc.fr;https://www.polymtl.ca", "aff_unique_abbr": "X;PolyMTL;ENPC;PolyMTL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Montr\u00e9al;Montreal", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "France;Canada" }, { "title": "Out-of-Domain Generalization in Dynamical Systems Reconstruction", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32708", "id": "xTYIAD2NND", "proceeding": "https://proceedings.mlr.press/v235/goring24a.html", "pdf": "https://openreview.net/pdf?id=xTYIAD2NND", "openreview": "https://openreview.net/forum?id=xTYIAD2NND", "author_site": "Niclas G\u00f6ring, Florian Hess, Manuel Brenner, Zahra Monfared, Daniel Durstewitz", "tldr": "", "abstract": "In science we are interested in finding the governing equations, the dynamical rules, underlying empirical phenomena. While traditionally scientific models are derived through cycles of human insight and experimentation, recently deep learning (DL) techniques have been advanced to reconstruct dynamical systems (DS) directly from time series data. 
State-of-the-art dynamical systems reconstruction (DSR) methods show promise in capturing invariant and long-term properties of observed DS, but their ability to generalize to unobserved domains remains an open challenge. Yet, this is a crucial property we would expect from any viable scientific theory. In this work, we provide a formal framework that addresses generalization in DSR. We explain why and how out-of-domain (OOD) generalization (OODG) in DSR profoundly differs from OODG considered elsewhere in machine learning. We introduce mathematical notions based on topological concepts and ergodic theory to formalize the idea of learnability of a DSR model. We formally prove that black-box DL techniques, without adequate structural priors, generally will not be able to learn a generalizing DSR model. We also show this empirically, considering major classes of DSR algorithms proposed so far, and illustrate where and why they fail to generalize across the whole phase space. Our study provides the first comprehensive mathematical treatment of OODG in DSR, and gives a deeper conceptual understanding of where the fundamental problems in OODG lie and how they could possibly be addressed in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Niclas Alexander G\u00f6ring;Florian Hess;Manuel Brenner;Zahra Monfared;Daniel Durstewitz", "authorids": "~Niclas_Alexander_G\u00f6ring1;~Florian_Hess1;~Manuel_Brenner1;~Zahra_Monfared1;~Daniel_Durstewitz1", "gender": "M;M;M;F;", "homepage": ";https://www.zi-mannheim.de/forschung/abteilungen-ags-institute/theoret-neurowissenschaften/infos-theor-neurowiss.html;;;https://durstewitzlab.github.io", "dblp": ";;323/8935;;98/2120", "google_scholar": "SpoGWKgAAAAJ;nOZM-1AAAAAJ;HCUeyg8AAAAJ;https://scholar.google.pl/citations?user=OPUIwIoAAAAJ;https://scholar.google.de/citations?user=2bcbKU0AAAAJ", "orcid": ";;;;0000-0002-9340-3786", "linkedin": ";;manuel-brenner-772261191/;;", "or_profile": "~Niclas_Alexander_G\u00f6ring1;~Florian_Hess1;~Manuel_Brenner1;~Zahra_Monfared1;~Daniel_Durstewitz1", "aff": "University of Oxford;Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University;ZI Mannheim-Heidelberg University;Heidelberg University", "aff_domain": "oxford.ac.uk;uni-heidelberg.de;uni-heidelberg.de;zi-manheim.de;uni-heidelberg.de", "position": "PhD student;PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\ng{\\\"o}ring2024outofdomain,\ntitle={Out-of-Domain Generalization in Dynamical Systems Reconstruction},\nauthor={Niclas Alexander G{\\\"o}ring and Florian Hess and Manuel Brenner and Zahra Monfared and Daniel Durstewitz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xTYIAD2NND}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8773387, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=238989676540203424&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "oxford.ac.uk;uni-heidelberg.de;uni-heidelberg.de;zi-manheim.de;uni-heidelberg.de", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "University of Oxford;Ruprecht-Karls-Universit\u00e4t Heidelberg;Heidelberg University;ZI Mannheim-Heidelberg University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ox.ac.uk;https://www.uni-heidelberg.de/;https://www.uni-heidelberg.de;", "aff_unique_abbr": "Oxford;Uni Heidelberg;Uni 
Heidelberg;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Kingdom;Germany" }, { "title": "A Dense Reward View on Aligning Text-to-Image Diffusion with Preference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32707", "id": "xVXnXk9I3I", "proceeding": "https://proceedings.mlr.press/v235/yang24e.html", "pdf": "https://openreview.net/pdf?id=xVXnXk9I3I", "openreview": "https://openreview.net/forum?id=xVXnXk9I3I", "author_site": "Shentao Yang, Tianqi Chen, Mingyuan Zhou", "tldr": "", "abstract": "Aligning text-to-image diffusion model (T2I) with preference has been gaining increasing research attention. While prior works exist on directly optimizing T2I by preference data, these methods are developed under the bandit assumption of a latent reward on the entire diffusion reverse chain, while ignoring the sequential nature of the generation process. This may harm the efficacy and efficiency of preference alignment. In this paper, we take on a finer dense reward perspective and derive a tractable alignment objective that emphasizes the initial steps of the T2I reverse chain. In particular, we introduce temporal discounting into DPO-style explicit-reward-free objectives, to break the temporal symmetry therein and suit the T2I generation hierarchy. In experiments on single and multiple prompt generation, our method is competitive with strong relevant baselines, both quantitatively and qualitatively. Further investigations are conducted to illustrate the insight of our approach. Source code is available at https://github.com/Shentao-YANG/Dense_Reward_T2I .", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shentao Yang;Tianqi Chen;Mingyuan Zhou", "authorids": "~Shentao_Yang1;~Tianqi_Chen2;~Mingyuan_Zhou1", "gender": "M;;M", "homepage": ";https://tqch.github.io;http://mingyuanzhou.github.io", "dblp": ";94/8023;", "google_scholar": "https://scholar.google.com/citations?hl=en;jucvWbcAAAAJ;LXwCIisAAAAJ", "orcid": "0009-0009-8058-3149;0000-0003-3604-3048;", "linkedin": "shentaoyang/;tianqi-chen-4875671a3;", "or_profile": "~Shentao_Yang1;~Tianqi_Chen2;~Mingyuan_Zhou1", "aff": "University of Texas at Austin;University of Texas at Austin;Google", "aff_domain": "utexas.edu;utexas.edu;google.com", "position": "PhD student;PhD student;Researcher", "bibtex": "@inproceedings{\nyang2024a,\ntitle={A Dense Reward View on Aligning Text-to-Image Diffusion with Preference},\nauthor={Shentao Yang and Tianqi Chen and Mingyuan Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xVXnXk9I3I}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9560943, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3105339794050899533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "utexas.edu;utexas.edu;google.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Texas at Austin;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.utexas.edu;https://www.google.com", "aff_unique_abbr": "UT Austin;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Austin;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Model-based Reinforcement Learning for Parameterized Action Spaces", 
"status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32706", "id": "xW79geE0RA", "proceeding": "https://proceedings.mlr.press/v235/zhang24r.html", "pdf": "https://openreview.net/pdf?id=xW79geE0RA", "openreview": "https://openreview.net/forum?id=xW79geE0RA", "author_site": "Renhao Zhang, Haotian Fu, Yilin Miao, George Konidaris", "tldr": "", "abstract": "We propose a novel model-based reinforcement learning algorithm---Dynamics Learning and predictive control with Parameterized Actions (DLPA)---for Parameterized Action Markov Decision Processes (PAMDPs). The agent learns a parameterized-action-conditioned dynamics model and plans with a modified Model Predictive Path Integral control. We theoretically quantify the difference between the generated trajectory and the optimal trajectory during planning in terms of the value they achieved through the lens of Lipschitz Continuity. Our empirical results on several standard benchmarks show that our algorithm achieves superior sample efficiency and asymptotic performance than state-of-the-art PAMDP methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Renhao Zhang;Haotian Fu;Yilin Miao;George Konidaris", "authorids": "~Renhao_Zhang1;~Haotian_Fu3;~Yilin_Miao1;~George_Konidaris1", "gender": "M;M;M;M", "homepage": "https://renhaoz.github.io/;https://haotianfu.me/;;http://cs.brown.edu/people/gdk/", "dblp": "256/2504;237/9681;;56/6762", "google_scholar": "https://scholar.google.com/citations?hl=en;btaP96wAAAAJ;;9UERvVEAAAAJ", "orcid": ";;0000-0002-5178-7363;", "linkedin": ";;yilin-miao/;", "or_profile": "~Renhao_Zhang1;~Haotian_Fu3;~Yilin_Miao1;~George_Konidaris1", "aff": "Brown University;Brown University;Brown University;Brown University", "aff_domain": "brown.edu;brown.edu;brown.edu;brown.edu", "position": "MS student;PhD student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024modelbased,\ntitle={Model-based Reinforcement Learning for Parameterized Action Spaces},\nauthor={Renhao Zhang and Haotian Fu and Yilin Miao and George Konidaris},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xW79geE0RA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6440972, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11794975877883303160&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "email": "brown.edu;brown.edu;brown.edu;brown.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Brown University", "aff_unique_dep": "", "aff_unique_url": "https://www.brown.edu", "aff_unique_abbr": "Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "How Private are DP-SGD Implementations?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32705", "id": "xWI0MKwJSS", "proceeding": "https://proceedings.mlr.press/v235/chua24a.html", "pdf": "https://openreview.net/pdf?id=xWI0MKwJSS", "openreview": "https://openreview.net/forum?id=xWI0MKwJSS", "author_site": "Lynn Chua, Badih Ghazi, Pritish Kamath, Ravi Kumar, Pasin Manurangsi, Amer Sinha, Chiyuan Zhang", "tldr": "", "abstract": "We demonstrate a substantial gap between the privacy guarantees of the Adaptive Batch Linear Queries (ABLQ) mechanism under different types of batch sampling: (i) Shuffling, and 
(ii) Poisson subsampling; the typical analysis of Differentially Private Stochastic Gradient Descent (DP-SGD) follows by interpreting it as a post-processing of ABLQ. While shuffling-based DP-SGD is more commonly used in practical implementations, it has not been amenable to easy privacy analysis, either analytically or even numerically. On the other hand, Poisson subsampling-based DP-SGD is challenging to scalably implement, but has a well-understood privacy analysis, with multiple open-source numerically tight privacy accountants available. This has led to a common practice of using shuffling-based DP-SGD in practice, but using the privacy analysis for the corresponding Poisson subsampling version. Our result shows that there can be a substantial gap between the privacy analysis when using the two types of batch sampling, and thus advises caution in reporting privacy parameters for DP-SGD.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lynn Chua;Badih Ghazi;Pritish Kamath;Ravi Kumar;Pasin Manurangsi;Amer Sinha;Chiyuan Zhang", "authorids": "~Lynn_Chua1;~Badih_Ghazi1;~Pritish_Kamath2;~Ravi_Kumar1;~Pasin_Manurangsi2;~Amer_Sinha1;~Chiyuan_Zhang1", "gender": "F;;M;M;M;M;M", "homepage": ";https://sites.google.com/view/badihghazi/home;https://pritishkamath.github.io/;https://sites.google.com/site/ravik53/;https://pasin30055.github.io/;;http://pluskid.org", "dblp": "143/4392;125/2134;https://dblp.org/pers/k/Kamath:Pritish.html;k/RaviKumar.html;133/2059;;21/8315", "google_scholar": "D2SXVSYAAAAJ;GBJLTN8AAAAJ;1JFARhUAAAAJ;J_XhIsgAAAAJ;35hM-PkAAAAJ;;l_G2vr0AAAAJ", "orcid": ";;;0000-0002-2203-2586;;;", "linkedin": "chua-lynn/;badih-ghazi-608379132/;;ravi-kumar-a3a9631;;amersinha/;", "or_profile": "~Lynn_Chua1;~Badih_Ghazi1;~Pritish_Kamath2;~Ravi_Kumar1;~Pasin_Manurangsi2;~Amer_Sinha1;~Chiyuan_Zhang1", "aff": "Google;Google;Google Research;Google;Google;Research, Google;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com;research.google.com;google.com", "position": "Researcher;Researcher;Research Scientist;Research Scientist;Research Scientist;Researcher;Research Scientist", "bibtex": "@inproceedings{\nchua2024how,\ntitle={How Private are {DP}-{SGD} Implementations?},\nauthor={Lynn Chua and Badih Ghazi and Pritish Kamath and Ravi Kumar and Pasin Manurangsi and Amer Sinha and Chiyuan Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xWI0MKwJSS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 981753, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2272133375448158477&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "google.com;google.com;google.com;google.com;google.com;research.google.com;google.com", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Envisioning Outlier Exposure by Large Language Models for Out-of-Distribution Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32704", "id": "xZO7SmM12y", "proceeding": "https://proceedings.mlr.press/v235/cao24d.html", "pdf": 
"https://openreview.net/pdf?id=xZO7SmM12y", "openreview": "https://openreview.net/forum?id=xZO7SmM12y", "author_site": "Chentao Cao, Zhun Zhong, Zhanke Zhou, Yang Liu, Tongliang Liu, Bo Han", "tldr": "", "abstract": "Detecting out-of-distribution (OOD) samples is essential when deploying machine learning models in open-world scenarios. Zero-shot OOD detection, requiring no training on in-distribution (ID) data, has been possible with the advent of vision-language models like CLIP. Existing methods build a text-based classifier with only closed-set labels. However, this largely restricts the inherent capability of CLIP to recognize samples from large and open label space. In this paper, we propose to tackle this constraint by leveraging the expert knowledge and reasoning capability of large language models (LLM) to Envision potential Outlier Exposure, termed EOE, without access to any actual OOD data. Owing to better adaptation to open-world scenarios, EOE can be generalized to different tasks, including far, near, and fine-grained OOD detection. Technically, we design (1) LLM prompts based on visual similarity to generate potential outlier class labels specialized for OOD detection, as well as (2) a new score function based on potential outlier penalty to distinguish hard OOD samples effectively. Empirically, EOE achieves state-of-the-art performance across different OOD tasks and can be effectively scaled to the ImageNet-1K dataset. The code is publicly available at: https://github.com/tmlr-group/EOE.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chentao Cao;Zhun Zhong;Zhanke Zhou;Yang Liu;Tongliang Liu;Bo Han", "authorids": "~Chentao_Cao1;~Zhun_Zhong1;~Zhanke_Zhou1;~Yang_Liu3;~Tongliang_Liu1;~Bo_Han1", "gender": "M;M;M;M;M;M", "homepage": "https://github.com/Aboriginer;http://zhunzhong.site;https://andrewzhou924.github.io/;http://www.yliuu.com;https://tongliang-liu.github.io/;https://bhanml.github.io/", "dblp": "320/1115.html;32/6525;285/5311;51/3710-18;150/6667;241/0472-3", "google_scholar": "vZPl_oQAAAAJ;nZizkQ0AAAAJ;GVXErr0AAAAJ;jKrIVCIAAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ;nTNjqHwAAAAJ", "orcid": ";;;0000-0001-8420-6011;;", "linkedin": ";;;;;", "or_profile": "~Chentao_Cao1;~Zhun_Zhong1;~Zhanke_Zhou1;~Yang_Liu3;~Tongliang_Liu1;~bo_han2", "aff": "Hong Kong Baptist University;University of Nottingham;Hong Kong Baptist University;University of California, Santa Cruz;Mohamed bin Zayed University of Artificial Intelligence;MBZUAI", "aff_domain": "hkbu.edu.hk;nottingham.ac.uk;hkbu.edu.hk;ucsc.edu;mbzuai.ac.ae;mbzuai.ac.ae", "position": "PhD student;Assistant Professor;PhD student;Assistant Professor;Affiliated Associate Professor;Researcher", "bibtex": "@inproceedings{\ncao2024envisioning,\ntitle={Envisioning Outlier Exposure by Large Language Models for Out-of-Distribution Detection},\nauthor={Chentao Cao and Zhun Zhong and Zhanke Zhou and Yang Liu and Tongliang Liu and Bo Han},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xZO7SmM12y}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7135878, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7347899670742272864&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "hkbu.edu.hk;nottingham.ac.uk;hkbu.edu.hk;ucsc.edu;mbzuai.ac.ae;mbzuai.ac.ae", "author_num": 6, "aff_unique_index": 
"0;1;0;2;3;3", "aff_unique_norm": "Hong Kong Baptist University;University of Nottingham;University of California, Santa Cruz;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.nottingham.ac.uk;https://www.ucsc.edu;https://mbzuai.ac.ae", "aff_unique_abbr": "HKBU;UoN;UCSC;MBZUAI", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Hong Kong SAR;;Santa Cruz", "aff_country_unique_index": "0;1;0;2;3;3", "aff_country_unique": "China;United Kingdom;United States;United Arab Emirates" }, { "title": "Robust Classification via a Single Diffusion Model", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32703", "id": "xaSpuvNYwS", "proceeding": "https://proceedings.mlr.press/v235/chen24k.html", "pdf": "https://openreview.net/pdf?id=xaSpuvNYwS", "openreview": "https://openreview.net/forum?id=xaSpuvNYwS", "author_site": "Huanran Chen, Yinpeng Dong, Zhengyi Wang, Xiao Yang, Chengqi Duan, Hang Su, Jun Zhu", "tldr": "", "abstract": "Diffusion models have been applied to improve adversarial robustness of image classifiers by purifying the adversarial noises or generating realistic data for adversarial training. However, diffusion-based purification can be evaded by stronger adaptive attacks while adversarial training does not perform well under unseen threats, exhibiting inevitable limitations of these methods. To better harness the expressive power of diffusion models, this paper proposes Robust Diffusion Classifier (RDC), a generative classifier that is constructed from a pre-trained diffusion model to be adversarially robust. RDC first maximizes the data likelihood of a given input and then predicts the class probabilities of the optimized input using the conditional likelihood estimated by the diffusion model through Bayes' theorem. To further reduce the computational cost, we propose a new diffusion backbone called multi-head diffusion and develop efficient sampling strategies. As RDC does not require training on particular adversarial attacks, we demonstrate that it is more generalizable to defend against multiple unseen threats. In particular, RDC achieves $75.67\\%$ robust accuracy against various $\\ell_\\infty$ norm-bounded adaptive attacks with $\\epsilon_\\infty=8/255$ on CIFAR-10, surpassing the previous state-of-the-art adversarial training models by $+4.77\\%$. 
The results highlight the potential of generative classifiers built on pre-trained diffusion models for adversarial robustness, compared with the commonly studied discriminative classifiers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huanran Chen;Yinpeng Dong;Zhengyi Wang;Xiao Yang;Chengqi Duan;Hang Su;Jun Zhu", "authorids": "~Huanran_Chen1;~Yinpeng_Dong2;~Zhengyi_Wang1;~Xiao_Yang4;~Chengqi_Duan1;~Hang_Su3;~Jun_Zhu2", "gender": "M;M;M;M;M;M;M", "homepage": "https://huanranchen.github.io/;https://dongyp13.github.io;https://thuwzy.github.io;https://ml.cs.tsinghua.edu.cn/~xiaoyang/;;http://ml.cs.tsinghua.edu.cn/~jun;", "dblp": "329/6558;183/0980;;57/33851;336/2001;50/2644-1;26/5371-6", "google_scholar": "https://scholar.google.co.jp/citations?user=QYsKXccAAAAJ;6_4ad84AAAAJ;dtuPuRQAAAAJ;bwkwp0MAAAAJ;r9qb4ZwAAAAJ;axsP38wAAAAJ;dxN1_X0AAAAJ", "orcid": ";;;0000-0001-9502-9962;;;", "linkedin": ";;;;;;", "or_profile": "~Huanran_Chen1;~Yinpeng_Dong2;~Zhengyi_Wang1;~Xiao_Yang4;~Chengqi_Duan1;~Jun_Zhu2;~Hang_Su2", "aff": ";Tsinghua University;Tsinghua University;Tsinghua University;University of Hong Kong;Tsinghua University;Tsinghua University", "aff_domain": ";tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;hku.hk;mail.tsinghua.edu.cn;tsinghua.edu.cn", "position": ";Postdoc;PhD student;Postdoc;PhD student;Professor;Associate Professor", "bibtex": "@inproceedings{\nchen2024robust,\ntitle={Robust Classification via a Single Diffusion Model},\nauthor={Huanran Chen and Yinpeng Dong and Zhengyi Wang and Xiao Yang and Chengqi Duan and Hang Su and Jun Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xaSpuvNYwS}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2761660, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1883020923716387299&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": ";tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;hku.hk;mail.tsinghua.edu.cn;tsinghua.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Tsinghua University;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hku.hk", "aff_unique_abbr": "THU;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Positive and Unlabeled Learning with Controlled Probability Boundary Fence", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32702", "id": "xbQqhojHTg", "proceeding": "https://proceedings.mlr.press/v235/li24p.html", "pdf": "https://openreview.net/pdf?id=xbQqhojHTg", "openreview": "https://openreview.net/forum?id=xbQqhojHTg", "author_site": "Changchun Li, Yuanchao Dai, Lei Feng, Ximing Li, Bing Wang, Jihong Ouyang", "tldr": "", "abstract": "Positive and Unlabeled (PU) learning refers to a special case of binary classification; technically, it aims to induce a binary classifier from a few labeled positive training instances and a large number of unlabeled instances. In this paper, we derive a theorem indicating that the probability boundary of the asymmetric disambiguation-free expected risk of PU learning is controlled by its asymmetric penalty, and we further evaluate this theorem empirically.
Inspired by the theorem and its empirical evaluations, we propose an easy-to-implement two-stage PU learning method, namely **P**ositive and **U**nlabeled **L**earning with **C**ontrolled **P**robability **B**oundary **F**ence (**PULCPBF**). In the first stage, we train a set of weak binary classifiers corresponding to different probability boundaries by minimizing the asymmetric disambiguation-free empirical risks with specific asymmetric penalty values. We can interpret these induced weak binary classifiers as a probability boundary fence. For each unlabeled instance, we use these predictions to locate its class posterior probability and generate a stochastic label. In the second stage, we train a strong binary classifier over labeled positive training instances and all unlabeled instances with stochastic labels in a self-training manner. Extensive empirical results demonstrate that PULCPBF can achieve competitive performance compared with the existing PU learning baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Changchun Li;Yuanchao Dai;Lei Feng;Ximing Li;Bing Wang;Jihong Ouyang", "authorids": "~Changchun_Li1;~Yuanchao_Dai1;~Lei_Feng1;~Ximing_Li1;~Bing_Wang12;~Jihong_Ouyang2", "gender": "M;F;M;M;M;F", "homepage": ";https://github.com/daidai1118;https://lfeng1995.github.io/;https://ccst.jlu.edu.cn/info/1367/19282.htm;;http://ccst.jlu.edu.cn/info/1186/2081.htm", "dblp": "73/7819;;76/847-6;130/1013-2;;46/3783", "google_scholar": "https://scholar.google.com.hk/citations?user=tO6IqzAAAAAJ;;https://scholar.google.com.sg/citations?user=KomQOFkAAAAJ;2WQ--c4AAAAJ;uqXWZBYAAAAJ;", "orcid": ";;0000-0003-2839-5799;0000-0001-8190-5087;0000-0002-1304-3718;", "linkedin": ";;;;;", "or_profile": "~Changchun_Li1;~Yuanchao_Dai1;~Lei_Feng1;~Ximing_Li1;~Bing_Wang12;~Jihong_Ouyang2", "aff": "Jilin University;Jilin University;Singapore University of Technology and Design;Jilin University;Jilin University;Jilin University", "aff_domain": "jlu.edu.cn;mails.jlu.edu.cn;sutd.edu.sg;jlu.edu.cn;jlu.edu.cn;jlu.edu.cn", "position": "Postdoc;PhD student;Assistant Professor;Full Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nli2024positive,\ntitle={Positive and Unlabeled Learning with Controlled Probability Boundary Fence},\nauthor={Changchun Li and Yuanchao Dai and Lei Feng and Ximing Li and Bing Wang and Jihong Ouyang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xbQqhojHTg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 602553, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6221455946952835697&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "email": "jlu.edu.cn;mails.jlu.edu.cn;sutd.edu.sg;jlu.edu.cn;jlu.edu.cn;jlu.edu.cn", "author_num": 6, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Jilin University;Singapore University of Technology and Design", "aff_unique_dep": ";", "aff_unique_url": "http://www.jlu.edu.cn;https://www.sutd.edu.sg", "aff_unique_abbr": "JLU;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;Singapore" }, { "title": "CHAI: Clustered Head Attention for Efficient LLM Inference", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32701", "id": "xcDRx8vzCa", "proceeding": "https://proceedings.mlr.press/v235/agarwal24a.html",
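An illustrative aside on the PULCPBF entry above: a minimal numpy sketch of how a probability boundary fence can bracket an unlabeled instance's class posterior and produce a stochastic label. The fence here is faked by thresholding a posterior-like score; in the paper each weak classifier is trained by minimizing the asymmetric disambiguation-free risk with a specific asymmetric penalty.

```python
import numpy as np

rng = np.random.default_rng(0)

# Stage 1 (sketch): nine weak binary classifiers whose decision
# thresholds sit at posterior levels 0.1, ..., 0.9 -- the "fence".
boundaries = np.linspace(0.1, 0.9, 9)

def stochastic_label(score):
    """Bracket the class posterior between adjacent fence boundaries
    using the weak classifiers' votes, then draw a stochastic label."""
    k = int(np.sum(score > boundaries))          # number of fences crossed
    lo = 0.0 if k == 0 else float(boundaries[k - 1])
    hi = 1.0 if k == len(boundaries) else float(boundaries[k])
    p_hat = 0.5 * (lo + hi)                      # midpoint of the bracket
    return int(rng.random() < p_hat)

# Stage 2 would train a strong classifier on the labeled positives plus
# the unlabeled instances carrying these stochastic labels (self-training).
print([stochastic_label(s) for s in rng.random(5)])
```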
"pdf": "https://openreview.net/pdf?id=xcDRx8vzCa", "openreview": "https://openreview.net/forum?id=xcDRx8vzCa", "author_site": "Saurabh Agarwal, Bilge Acun, Basil Hosmer, Mostafa Elhoushi, Yejin Lee, Shivaram Venkataraman, Dimitris Papailiopoulos, Carole-Jean Wu", "tldr": "", "abstract": "Large Language Models (LLMs) with hundreds of billions of parameters have transformed the field of machine learning. However, serving these models at inference time is both compute and memory intensive, where a single request can require multiple GPUs and tens of Gigabytes of memory. Multi-head attention is one of the key components of LLMs, which can for over 50% of LLMs memory and compute requirement. We observe that there is a high amount of redundancy across heads on which tokens they pay attention to. Based on this insight, we propose Clustered HeadAttention ( CHAI ). CHAI combines heads with a high amount of correlation for self-attention at runtime, thus reducing both memory and compute. In our experiments, we show that CHAI is able to reduce the memory requirements for storing K,V cache by up to 21.4% and inference time latency by up to 1.73\u00d7 without any fine-tuning required. CHAI achieves this with a maximum 3.2% deviation in accuracy across 3 different models (i.e. OPT-66B, LLAMA-7B, LLAMA-33B) and 5 different evaluation datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saurabh Agarwal;Bilge Acun;Basil Hosmer;Mostafa Elhoushi;Yejin Lee;Shivaram Venkataraman;Dimitris Papailiopoulos;Carole-Jean Wu", "authorids": "~Saurabh_Agarwal1;~Bilge_Acun1;~Basil_Hosmer1;~Mostafa_Elhoushi1;~Yejin_Lee4;~Shivaram_Venkataraman1;~Dimitris_Papailiopoulos1;~Carole-Jean_Wu2", "gender": "M;;;M;F;;M;F", "homepage": "https://www.saurabh.dev;;https://github.com/bhosmer;;https://yjyjlee.github.io/;https://pages.cs.wisc.edu/~shivaram/;http://papail.io;", "dblp": "37/2620;;;157/6350;185/6602-1;65/8569;;26/9655", "google_scholar": ";;fOKkaE0AAAAJ;https://scholar.google.ca/citations?user=y_cwSKAAAAAJ;;5LLV29oAAAAJ;hYi6i9sAAAAJ;S1szbyAAAAAJ", "orcid": ";;;0000-0001-6172-4510;;0000-0001-9575-7935;;", "linkedin": ";;basilhosmer/;mostafaelhoushi/;;;;", "or_profile": "~Saurabh_Agarwal1;~Bilge_Acun1;~Basil_Hosmer1;~Mostafa_Elhoushi1;~Yejin_Lee4;~Shivaram_Venkataraman1;~Dimitris_Papailiopoulos1;~Carole-Jean_Wu2", "aff": "University of Wisconsin, Madison;;Meta Facebook;Meta;META;University of Wisconsin, Madison;University of Wisconsin - Madison;Meta", "aff_domain": "wisc.edu;;meta.com;meta.com;meta.com;wisc.edu;wisc.edu;meta.com", "position": "Student;;Researcher;Researcher;Postdoc;Assistant Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nagarwal2024chai,\ntitle={{CHAI}: Clustered Head Attention for Efficient {LLM} Inference},\nauthor={Saurabh Agarwal and Bilge Acun and Basil Hosmer and Mostafa Elhoushi and Yejin Lee and Shivaram Venkataraman and Dimitris Papailiopoulos and Carole-Jean Wu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xcDRx8vzCa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3441347, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7254133979675802040&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "wisc.edu;;meta.com;meta.com;meta.com;wisc.edu;wisc.edu;meta.com", "author_num": 8, "aff_unique_index": "0;1;1;0;3;1", "aff_unique_norm": "University 
of Wisconsin;Meta;;University of Wisconsin-Madison", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.wisc.edu;https://meta.com;;https://www.wisc.edu", "aff_unique_abbr": "UW;Meta;;UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "S3O: A Dual-Phase Approach for Reconstructing Dynamic Shape and Skeleton of Articulated Objects from Single Monocular Video", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32700", "id": "xcyKKACmSd", "proceeding": "https://proceedings.mlr.press/v235/zhang24ae.html", "pdf": "https://openreview.net/pdf?id=xcyKKACmSd", "openreview": "https://openreview.net/forum?id=xcyKKACmSd", "author_site": "Hao Zhang, Fang Li, Samyak Rawlekar, Narendra Ahuja", "tldr": "", "abstract": "Reconstructing dynamic articulated objects from a single monocular video is challenging, requiring joint estimation of shape, motion, and camera parameters from limited views. Current methods typically demand extensive computational resources and training time, and require additional human annotations such as predefined parametric models, camera poses, and key points, limiting their generalizability. We propose Synergistic Shape and Skeleton Optimization (S3O), a novel two-phase method that forgoes these prerequisites and efficiently learns parametric models including visible shapes and underlying skeletons. Conventional strategies typically learn all parameters simultaneously, leading to interdependencies where a single incorrect prediction can result in significant errors. In contrast, S3O adopts a phased approach: it first focuses on learning coarse parametric models, then progresses to motion learning and detail addition. This method substantially lowers computational complexity and enhances robustness in reconstruction from limited viewpoints, all without requiring additional annotations. To address the current inadequacies in 3D reconstruction from monocular video benchmarks, we collected the PlanetZoo dataset.
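An illustrative aside on the S3O entry above: a minimal numpy sketch of a dual-phase optimization schedule that updates only the active parameter groups per phase. The group names and stand-in gradients are assumptions for exposition, not the authors' code.

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical parameter groups for a dual-phase schedule in the spirit
# of S3O: coarse shape and skeleton first, motion and detail second.
params = {
    "coarse_shape": np.zeros(64),
    "skeleton":     np.zeros(32),
    "motion":       np.zeros(128),
    "detail":       np.zeros(256),
}

def optimize(groups, steps, lr=1e-2):
    """Update only the listed parameter groups, freezing the rest."""
    for _ in range(steps):
        for name in groups:
            grad = rng.normal(size=params[name].shape)  # stand-in gradient
            params[name] -= lr * grad

# Phase 1: learn a coarse parametric model first, avoiding the
# interdependencies of fitting every parameter jointly.
optimize(["coarse_shape", "skeleton"], steps=100)
# Phase 2: with a stable coarse estimate in place, learn motion and detail.
optimize(["motion", "detail"], steps=100)
```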
Our experimental evaluations on standard benchmarks and the PlanetZoo dataset affirm that S3O provides more accurate 3D reconstructions and plausible skeletons, and reduces training time by approximately 60% compared to the state of the art, thus advancing dynamic object reconstruction.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Zhang;Fang Li;Samyak Rawlekar;Narendra Ahuja", "authorids": "~Hao_Zhang47;~Fang_Li8;~Samyak_Rawlekar1;~Narendra_Ahuja1", "gender": "M;M;;M", "homepage": "https://haoz19.github.io/;https://fangli333.github.io/;http://vision.ai.illinois.edu/ahuja.html;https://samyakr99.github.io", "dblp": ";;;", "google_scholar": "KeDuEtcAAAAJ;WybiEu0AAAAJ;dY7OSl0AAAAJ;4Jp_SN4AAAAJ", "orcid": ";;;", "linkedin": ";fang-li-8ab696223/;;", "or_profile": "~Hao_Zhang47;~Fang_Li8;~Narendra_Ahuja1;~Samyak_Kabir_Rawlekar1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign", "aff_domain": "illinois.edu;illinois.edu;illinois.edu;uiuc.edu", "position": "PhD student;PhD student;Research Professor;PhD student", "bibtex": "@inproceedings{\nzhang2024so,\ntitle={S3O: A Dual-Phase Approach for Reconstructing Dynamic Shape and Skeleton of Articulated Objects from Single Monocular Video},\nauthor={Hao Zhang and Fang Li and Samyak Rawlekar and Narendra Ahuja},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xcyKKACmSd}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9689081, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16259570056535395861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "illinois.edu;illinois.edu;illinois.edu;uiuc.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Langevin Policy for Safe Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32699", "id": "xgoilgLPGD", "proceeding": "https://proceedings.mlr.press/v235/lei24a.html", "pdf": "https://openreview.net/pdf?id=xgoilgLPGD", "openreview": "https://openreview.net/forum?id=xgoilgLPGD", "author_site": "Fenghao Lei, Long Yang, Shiting Wen, Zhixiong Huang, Zhiwang Zhang, Chaoyi Pang", "tldr": "", "abstract": "Optimization-based and sampling-based algorithms are two branches of methods in machine learning. While existing safe reinforcement learning (RL) algorithms are mainly based on optimization, it is still unclear whether sampling-based methods can lead to desirable performance with safe policies. This paper formulates the Langevin policy for safe RL, and proposes Langevin Actor-Critic (LAC) to accelerate the process of policy inference. Concretely, instead of a parametric policy, the proposed Langevin policy provides a stochastic process that directly infers actions, serving as a numerical solver for the Langevin dynamics of actions in continuous time.
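An illustrative aside on the Langevin policy just described: a minimal numpy sketch of inferring an action by discretized Langevin dynamics on the action space, ascending a critic's action gradient under injected noise. The critic gradient here is a toy stand-in, and the step sizes are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)

def grad_q(state, action):
    # Stand-in for the action gradient of a learned critic Q(s, a);
    # this toy landscape peaks at a = tanh(s).
    return -(action - np.tanh(state))

def langevin_action(state, n_steps=100, step=1e-2):
    """Infer an action by discretized Langevin dynamics on actions:
    a <- a + (step / 2) * grad_a Q(s, a) + sqrt(step) * noise."""
    a = rng.normal(size=state.shape)
    for _ in range(n_steps):
        a += 0.5 * step * grad_q(state, a) + np.sqrt(step) * rng.normal(size=a.shape)
    return a

print(langevin_action(np.array([0.3, -0.7])))
```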
Furthermore, to make the Langevin policy practical on RL tasks, the proposed LAC accumulates the transitions induced by the Langevin policy and reproduces them with a generator. Finally, extensive empirical results show the effectiveness and superiority of LAC on the MuJoCo-based and Safety Gym tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fenghao Lei;Long Yang;Shiting Wen;Zhixiong Huang;Zhiwang Zhang;Chaoyi Pang", "authorids": "~Fenghao_Lei1;~Long_Yang4;~Shiting_Wen1;~Zhixiong_Huang1;~Zhiwang_Zhang1;~Chaoyi_Pang2", "gender": ";M;M;M;;M", "homepage": ";https://person.zju.edu.cn/longyang;;https://github.com/Wyatt-Huang;;", "dblp": ";;89/9078.html;;;https://dblp.uni-trier.de/pers/p/Pang:Chaoyi.html", "google_scholar": ";;;;;https://scholar.google.com.au/citations?user=PZZ9jIEAAAAJ", "orcid": "0009-0009-8539-3790;;0000-0002-2055-2553;;;0000-0001-7038-3789", "linkedin": ";;;;;", "or_profile": "~Fenghao_Lei1;~Long_Yang4;~Shiting_Wen1;~Zhixiong_Huang1;~Zhiwang_Zhang1;~Chaoyi_Pang2", "aff": "Zhejiang University;;NingboTech University;Zhejiang University;;Zhejiang University NIT", "aff_domain": "zju.edu.cn;;nbt.edu.cn;zju.edu.cn;;nit.zju.edu.cn", "position": "MS student;;Full Professor;MS student;;Full Professor/Dean", "bibtex": "@inproceedings{\nlei2024langevin,\ntitle={Langevin Policy for Safe Reinforcement Learning},\nauthor={Fenghao Lei and Long Yang and Shiting Wen and Zhixiong Huang and Zhiwang Zhang and Chaoyi Pang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xgoilgLPGD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3026130, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7016619221902208774&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 4, "email": "zju.edu.cn;;nbt.edu.cn;zju.edu.cn;;nit.zju.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Zhejiang University;NingboTech University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.ningbo-tech.edu.cn", "aff_unique_abbr": "ZJU;", "aff_campus_unique_index": "1", "aff_campus_unique": ";NIT (Ningbo Institute of Technology)", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Improved Differentially Private and Lazy Online Convex Optimization: Lower Regret without Smoothness Requirements", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32698", "id": "xl2yU3dsHK", "proceeding": "https://proceedings.mlr.press/v235/agarwal24d.html", "pdf": "https://openreview.net/pdf?id=xl2yU3dsHK", "openreview": "https://openreview.net/forum?id=xl2yU3dsHK", "author_site": "Naman Agarwal, Satyen Kale, Karan Singh, Abhradeep Guha Thakurta", "tldr": "", "abstract": "We design differentially private regret-minimizing algorithms in the online convex optimization (OCO) framework. Unlike recent results, our algorithms and analyses do not require smoothness, thus yielding the first private regret bounds with an optimal leading-order term for non-smooth loss functions. Additionally, even for smooth losses, the resulting regret guarantees improve upon previous results in terms of their dependence on dimension. Our results provide the best known rates for DP-OCO in all practical regimes of the privacy parameter, barring when it is exceptionally small.
The principal innovation in our algorithm design is the use of sampling from strongly log-concave densities which satisfy the Log-Sobolev Inequality. The resulting concentration of measure allows us to obtain a better trade-off for the dimension factors than prior work, leading to improved results. Following previous works on DP-OCO, the proposed algorithm explicitly limits the number of switches via rejection sampling. Thus, independently of privacy constraints, the algorithm also provides improved results for online convex optimization with a switching budget.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Naman Agarwal;Satyen Kale;Karan Singh;Abhradeep Guha Thakurta", "authorids": "~Naman_Agarwal1;~Satyen_Kale2;~Karan_Singh1;~Abhradeep_Guha_Thakurta1", "gender": "M;;M;M", "homepage": "https://naman33k.github.io;https://www.satyenkale.com;https://i-am-karan-singh.github.io/;https://athakurta.squarespace.com/", "dblp": "72/3910;52/4768;00/505;31/8315", "google_scholar": "sEMrGicAAAAJ;https://scholar.google.com/citations?hl=en;PZJIgZUAAAAJ;1rV69hMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Naman_Agarwal1;~Satyen_Kale2;~Karan_Singh1;~Abhradeep_Guha_Thakurta1", "aff": "Google;Google;Carnegie Mellon University;Google", "aff_domain": "google.com;google.com;cmu.edu;google.com", "position": "Researcher;Research Scientist;Assistant Professor;Senior Research Scientist", "bibtex": "@inproceedings{\nagarwal2024improved,\ntitle={Improved Differentially Private and Lazy Online Convex Optimization: Lower Regret without Smoothness Requirements},\nauthor={Naman Agarwal and Satyen Kale and Karan Singh and Abhradeep Guha Thakurta},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xl2yU3dsHK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 462580, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11756749134760769364&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "email": "google.com;google.com;cmu.edu;google.com", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;Carnegie Mellon University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.cmu.edu", "aff_unique_abbr": "Google;CMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "An Analysis of Linear Time Series Forecasting Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32697", "id": "xl82CcbYaT", "proceeding": "https://proceedings.mlr.press/v235/toner24a.html", "pdf": "https://openreview.net/pdf?id=xl82CcbYaT", "openreview": "https://openreview.net/forum?id=xl82CcbYaT", "author_site": "William Toner, Luke Darlow", "tldr": "", "abstract": "Despite their simplicity, linear models perform well at time series forecasting, even when pitted against deeper and more expensive models. A number of variations to the linear model have been proposed, often including some form of feature normalisation that improves model generalisation. In this paper we analyse the sets of functions expressible using these linear model architectures. 
In so doing, we show that several popular variants of linear models for time series forecasting are equivalent and functionally indistinguishable from standard, unconstrained linear regression. We characterise the model classes for each linear variant. We demonstrate that each model can be reinterpreted as unconstrained linear regression over a suitably augmented feature set, and therefore admits closed-form solutions when using a mean-squared loss function. We provide experimental evidence that the models under inspection learn nearly identical solutions, and finally demonstrate that the simpler closed-form solutions are superior forecasters across 72% of dataset-horizon settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Toner;Luke Nicholas Darlow", "authorids": "~William_Toner1;~Luke_Nicholas_Darlow1", "gender": "M;", "homepage": "http://www.inf.ed.ac.uk/people/students/William_Toner.html;", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": "william-toner-4647a5212/;", "or_profile": "~William_Toner1;~Luke_Nicholas_Darlow1", "aff": "University of Edinburgh;", "aff_domain": "edinburgh.org;", "position": "PhD student;", "bibtex": "@inproceedings{\ntoner2024an,\ntitle={An Analysis of Linear Time Series Forecasting Models},\nauthor={William Toner and Luke Nicholas Darlow},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xl82CcbYaT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 956934, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15176229412932960321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "edinburgh.org;", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "InstructSpeech: Following Speech Editing Instructions via Large Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32696", "id": "xlWcdtCyOC", "proceeding": "https://proceedings.mlr.press/v235/huang24k.html", "pdf": "https://openreview.net/pdf?id=xlWcdtCyOC", "openreview": "https://openreview.net/forum?id=xlWcdtCyOC", "author_site": "Rongjie Huang, Ruofan Hu, Yongqi Wang, Zehan Wang, xize cheng, Ziyue Jiang, Zhenhui Ye, Dongchao Yang, Luping Liu, Peng Gao, Zhou Zhao", "tldr": "", "abstract": "Instruction-guided speech editing aims to follow the user's natural language instruction to manipulate the semantic and acoustic attributes of speech. In this work, we construct triplet paired data (instruction, input speech, output speech) to alleviate data scarcity and train a multi-task large language model named InstructSpeech. To mitigate the challenges of accurately executing the user's instructions, we 1) introduce the learned task embeddings with a fine-tuned Flan-T5-XL to guide the generation process towards the correct generative task; 2) include an extensive and diverse set of speech editing and processing tasks to enhance model capabilities; 3) investigate chain-of-thought reasoning for free-form semantic content editing; and 4) propose a hierarchical adapter that effectively updates a small portion of parameters for generalization to new tasks.
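An illustrative aside on "An Analysis of Linear Time Series Forecasting Models" above: a minimal numpy sketch of the closed-form view, fitting a direct linear forecaster over a bias-augmented feature set via the normal equations. The windowing, toy series, and ridge term are illustrative assumptions, not the paper's experimental setup.

```python
import numpy as np

def fit_linear_forecaster(X, Y, ridge=1e-3):
    """Closed-form least-squares fit of a direct linear forecaster.

    X: (n_windows, context_len) past windows; Y: (n_windows, horizon)
    future values. The appended bias column stands in for the augmented
    feature set; normalisation-style variants would add further columns.
    """
    Xa = np.hstack([X, np.ones((X.shape[0], 1))])     # augment with bias
    A = Xa.T @ Xa + ridge * np.eye(Xa.shape[1])
    return np.linalg.solve(A, Xa.T @ Y)               # normal equations

def forecast(W, x):
    return np.concatenate([x, [1.0]]) @ W

t = np.sin(np.arange(500) / 8.0)                      # toy series
X = np.stack([t[i:i + 96] for i in range(280)])       # 96-step contexts
Y = np.stack([t[i + 96:i + 120] for i in range(280)]) # 24-step horizons
W = fit_linear_forecaster(X, Y)
print(np.abs(forecast(W, t[300:396]) - t[396:420]).mean())
```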
To assess instruction speech editing in greater depth, we introduce a benchmark evaluation with contrastive instruction-speech pre-training (CISP) to test the speech quality and instruction-speech alignment faithfulness. Experimental results demonstrate that InstructSpeech achieves state-of-the-art results in eleven tasks, for the first time unlocking the ability to edit speech's acoustic and semantic attributes following a user's instruction. Audio samples are available at https://InstructSpeech.github.io", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rongjie Huang;Ruofan Hu;Yongqi Wang;Zehan Wang;Xize Cheng;Ziyue Jiang;Zhenhui Ye;Dongchao Yang;Luping Liu;Peng Gao;Zhou Zhao", "authorids": "~Rongjie_Huang1;~Ruofan_Hu2;~Yongqi_Wang1;~Zehan_Wang2;~Xize_Cheng1;~Ziyue_Jiang1;~Zhenhui_Ye1;~Dongchao_Yang1;~Luping_Liu2;~Peng_Gao3;~Zhou_Zhao3", "gender": "M;;M;M;M;M;M;M;;;", "homepage": ";;;https://github.com/12zehan17;https://exgc.github.io/;;https://yerfor.github.io;http://dongchaoyang.top;;;", "dblp": "212/8936-1;;;126/7826-1;334/2167;258/6865;265/6375;;;;", "google_scholar": "iRHBUsgAAAAJ;;9_79D6IAAAAJ;euXK0lkAAAAJ;https://scholar.google.com/citations?hl=zh-CN;wDgSBssAAAAJ;;WNiojyAAAAAJ;;;", "orcid": ";;0000-0003-4695-3440;0009-0007-7509-7563;0000-0001-9708-3225;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": "~Rongjie_Huang1;~Ruofan_Hu2;~Yongqi_Wang1;~Zehan_Wang2;~Xize_Cheng1;~Ziyue_Jiang1;~Zhenhui_Ye1;~Dongchao_Yang1;~Luping_Liu2;~Peng_Gao3;~Zhou_Zhao3", "aff": "Zhejiang University;;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Zhejiang University;Chinese University of Hong Kong;;;", "aff_domain": "zju.edu.cn;;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;cuhk.hk;;;", "position": "MS student;;MS student;PhD student;PhD student;PhD student;PhD student;PhD student;;;", "bibtex": "@inproceedings{\nhuang2024instructspeech,\ntitle={InstructSpeech: Following Speech Editing Instructions via Large Language Models},\nauthor={Rongjie Huang and Ruofan Hu and Yongqi Wang and Zehan Wang and Xize Cheng and Ziyue Jiang and Zhenhui Ye and Dongchao Yang and Luping Liu and Peng Gao and Zhou Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xlWcdtCyOC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3351389, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 11, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16502846156364932331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "zju.edu.cn;;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;zju.edu.cn;cuhk.hk;;;", "author_num": 11, "aff_unique_index": "0;0;0;0;0;0;1", "aff_unique_norm": "Zhejiang University;Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "ZJU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "The WMDP Benchmark: Measuring and Reducing Malicious Use with Unlearning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32695", "id": "xlr6AUDuJz", "proceeding": "https://proceedings.mlr.press/v235/li24bc.html", "pdf": "https://openreview.net/pdf?id=xlr6AUDuJz", "openreview": "https://openreview.net/forum?id=xlr6AUDuJz", "author_site": "Nathaniel Li, Alexander Pan, Anjali Gopal, Summer Yue, 
Daniel Berrios, Alice Gatti, Justin Li, Ann-Kathrin Dombrowski, Shashwat Goel, Gabriel Mukobi, Nathan Helm-Burger, Rassin Lababidi, Lennart Justen, Andrew Liu, Michael Chen, Isabelle Barrass, Oliver Zhang, Xiaoyuan Zhu, Rishub Tamirisa, Bhrugu Bharathi, Ariel Herbert-Voss, Cort Breuer, Andy Zou, Mantas Mazeika, Zifan Wang, Palash Oswal, Weiran Lin, Adam Hunt, Justin Tienken-Harder, Kevin Shih, Kemper Talley, John Guan, Ian Steneker, David Campbell, Brad Jokubaitis, Steven Basart, Stephen Fitz, Ponnurangam Kumaraguru, Kallol Karmakar, Uday Tupakula, Vijay Varadharajan, Yan Shoshitaishvili, Jimmy Ba, Kevin Esvelt, Alexandr Wang, Dan Hendrycks", "tldr": "", "abstract": "The White House Executive Order on Artificial Intelligence highlights the risks of large language models (LLMs) empowering malicious actors in developing biological, cyber, and chemical weapons. To measure these risks, government institutions and major AI labs are developing evaluations for hazardous capabilities in LLMs. However, current evaluations are private and restricted to a narrow range of malicious use scenarios, which limits further research into reducing malicious use. To fill these gaps, we release the Weapons of Mass Destruction Proxy (WMDP) benchmark, a dataset of 3,668 multiple-choice questions that serve as a proxy measurement of hazardous knowledge in biosecurity, cybersecurity, and chemical security. To guide progress on unlearning, we develop RMU, a state-of-the-art unlearning method based on controlling model representations. RMU reduces model performance on WMDP while maintaining general capabilities in areas such as biology and computer science, suggesting that unlearning may be a concrete path towards reducing malicious use from LLMs. We release our benchmark and code publicly at https://wmdp.ai.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nathaniel Li;Alexander Pan;Anjali Gopal;Summer Yue;Daniel Berrios;Alice Gatti;Justin D. Li;Ann-Kathrin Dombrowski;Shashwat Goel;Gabriel Mukobi;Nathan Helm-Burger;Rassin Lababidi;Lennart Justen;Andrew Bo Liu;Michael Chen;Isabelle Barrass;Oliver Zhang;Xiaoyuan Zhu;Rishub Tamirisa;Bhrugu Bharathi;Ariel Herbert-Voss;Cort B Breuer;Andy Zou;Mantas Mazeika;Zifan Wang;Palash Oswal;Weiran Lin;Adam Alfred Hunt;Justin Tienken-Harder;Kevin Y. Shih;Kemper Talley;John Guan;Ian Steneker;David Campbell;Brad Jokubaitis;Steven Basart;Stephen Fitz;Ponnurangam Kumaraguru;Kallol Krishna Karmakar;Uday Tupakula;Vijay Varadharajan;Yan Shoshitaishvili;Jimmy Ba;Kevin M. 
Esvelt;Alexandr Wang;Dan Hendrycks", "authorids": "~Nathaniel_Li1;~Alexander_Pan1;~Anjali_Gopal1;~Summer_Yue2;~Daniel_Berrios1;~Alice_Gatti1;~Justin_D._Li1;~Ann-Kathrin_Dombrowski1;~Shashwat_Goel1;~Gabriel_Mukobi1;~Nathan_Helm-Burger1;~Rassin_Lababidi1;~Lennart_Justen1;~Andrew_Bo_Liu1;~Michael_Chen3;~Isabelle_Barrass1;~Oliver_Zhang1;~Xiaoyuan_Zhu2;~Rishub_Tamirisa1;~Bhrugu_Bharathi2;~Ariel_Herbert-Voss1;~Cort_B_Breuer1;~Andy_Zou1;~Mantas_Mazeika3;~Zifan_Wang1;~Palash_Oswal1;~Weiran_Lin1;~Adam_Alfred_Hunt1;~Justin_Tienken-Harder1;~Kevin_Y._Shih1;~Kemper_Talley1;~John_Guan1;~Ian_Steneker1;~David_Campbell2;~Brad_Jokubaitis1;~Steven_Basart1;~Stephen_Fitz1;~Ponnurangam_Kumaraguru3;~Kallol_Krishna_Karmakar1;~Uday_Tupakula1;~Vijay_Varadharajan1;~Yan_Shoshitaishvili1;~Jimmy_Ba1;~Kevin_M._Esvelt1;~Alexandr_Wang1;~Dan_Hendrycks1", "gender": "M;M;F;F;M;F;;;M;M;Not Specified;M;Not Specified;;M;F;M;;M;M;;;;M;M;M;M;;M;M;;;M;;;M;;;M;;;M;M;M;M;", "homepage": "https://nli0.github.io;https://aypan17.github.io;;;;https://github.com/alga-hopf;http://mjt.cs.illinois.edu/;;https://shash42.github.io/;https://gabrielmukobi.com/;;;;https://abliu.github.io/;https://miclchen.com/;;;;https://rishub-tamirisa.github.io/research/;;;;;https://github.com/mmazeika;https://www.zifanw.net;https://oswalpalash.com;https://linweiran.github.io/;;;https://dinnenylab.me/;;;https://iansteneker.com;;;http://stevenbas.art;;https://precog.iiit.ac.in/;https://www.newcastle.edu.au/profile/kallolkrishna-karmakar;https://www.une.edu.au/staff-profiles/science-and-technology/uday-tupakula;https://www.newcastle.edu.au/profile/vijay-varadharajan#career;http://www.yancomm.net;http://jimmylba.github.io;https://www.sculptingevolution.org;https://scale.com/;", "dblp": ";304/3394;;;;289/7531;202/6669;;300/8333.html;;;;;;;;;;;;;;274/2362;215/4447;;;68/4713;;;;;;;;;245/2547;;97/5147.html;;;;;https://dblp.org/pers/b/Ba:Jimmy.html;;;182/2504", "google_scholar": "2XmBzbcAAAAJ;PaltSA0AAAAJ;76zB4T0AAAAJ;;;Wm-EioQAAAAJ;;YoNVKCYAAAAJ;exaNV-0AAAAJ;;;;https://scholar.google.co.jp/citations?user=rqPyhJYAAAAJ;2f2AIg4AAAAJ;FFCe9-0AAAAJ;;;;mwqUAxwAAAAJ;;Rw3h82QAAAAJ;H6EsqV0AAAAJ;;;HJOP3wMAAAAJ;0erxV34AAAAJ;oHxu2LsAAAAJ;;;;3K-IOLAAAAAJ;;;;;MzKvJhAAAAAJ;;MfzQyP8AAAAJ;fcS17jIAAAAJ;https://scholar.google.com.mx/citations?user=DtcxFY4AAAAJ;;https://scholar.google.com.tw/citations?user=ff1RkwcAAAAJ;https://scholar.google.ca/citations?user=ymzxRhAAAAAJ;eJ9qDHMAAAAJ;;", "orcid": ";;;;;0000-0001-5692-3996;;;;;;;0000-0001-7250-1099;0000-0003-2222-9423;0009-0001-2383-5380;;;;;;;;;;;0009-0000-0713-3488;;0009-0001-1111-9272;;;;;;;;;;;;;;;;0000-0001-8797-3945;;", "linkedin": "nli0/;alexander-pan-0567a2102/;;yutingyue/;danielxberrios/;gattialice/;;;shashwatgoel42/;gabrielmukobi/;https://linkedin.com/in/nathanhelmburger;rassin-lababidi/;lenni-justen/;andrew-liu-phd-8bb07929/;https://linkedin.com/in/miclchen;isabelle-barrass-44a36210a/;oliver-z-30a16812a/;xiaoyuan-zhu-38005a224/;rishubtamirisa/;bhrugu-bharathi;;;andy-zou-09ba3616a/;;zifan-wang-sail/;0xf00df00d/;;adam-hunt-8b5982122/;justin-tienken-harder-237b801a7;;kemper-talley/;johnnguan/;;https://linkedin.com/in/ddcam;brad-jokubaitis-30a18514b/;xksteven/;;ponguru/;;;;;;kevin-esvelt/;alexandrwang/;", "or_profile": 
"~Nathaniel_Li1;~Alexander_Pan1;~Anjali_Gopal1;~Summer_Yue2;~Daniel_Berrios1;~Alice_Gatti1;~Justin_D._Li1;~Ann-Kathrin_Dombrowski1;~Shashwat_Goel1;~Gabriel_Mukobi1;~Nathan_Helm-Burger1;~Rassin_Lababidi1;~Lennart_Justen1;~Andrew_Bo_Liu1;~Michael_Chen3;~Isabelle_Barrass1;~Oliver_Zhang1;~Xiaoyuan_Zhu2;~Rishub_Tamirisa1;~Bhrugu_Bharathi2;~Ariel_Herbert-Voss1;~Cort_B_Breuer1;~Andy_Zou1;~Mantas_Mazeika3;~Zifan_Wang1;~Palash_Oswal1;~Weiran_Lin1;~Adam_Alfred_Hunt1;~Justin_Tienken-Harder1;~Kevin_Y._Shih1;~Kemper_Talley1;~John_Guan1;~Ian_Steneker1;~David_Campbell2;~Brad_Jokubaitis1;~Steven_Basart1;~Stephen_Fitz1;~Ponnurangam_Kumaraguru3;~Kallol_Krishna_Karmakar1;~Uday_Tupakula1;~Vijay_Varadharajan1;~Yan_Shoshitaishvili1;~Jimmy_Ba1;~Kevin_M._Esvelt1;~Alexandr_Wang1;~Dan_Hendrycks1", "aff": "University of California, Berkeley;University of California, Berkeley;Massachusetts Institute of Technology;Scale AI;;Center for AI Safety;New York University;FAR.AI;IIIT Hyderabad;Computer Science Department, Stanford University;SecureBio;;Massachusetts Institute of Technology;;Stripe;Center for AI Safety;Stanford University;University of Southern California;University of Illinois, Urbana Champaign;University of California, Los Angeles;Harvard University;Stanford University;Carnegie Mellon University;University of Illinois, Urbana-Champaign;Center for AI Safety;;Carnegie Mellon University;Carnegie Mellon University;;Stanford University;;;;Drake University;;Center for AI Safety ;;International Institute of Information Technology Hyderabad ;University of Newcastle;University of New England;University of Newcastle;Arizona State University;Department of Computer Science, University of Toronto;Massachusetts Institute of Technology;;Center for AI Safety", "aff_domain": "berkeley.edu;berkeley.edu;mit.edu;scale.ai;;safe.ai;nyu.edu;far.ai;iiit.ac.in;cs.stanford.edu;securebio.org;;mit.edu;;stripe.com;safe.ai;stanford.edu;usc.edu;cs.illinois.edu;ucla.edu;harvard.edu;stanford.edu;andrew.cmu.edu;uiuc.edu;safe.ai;;andrew.cmu.edu;cmu.edu;;stanford.edu;;;;drake.edu;;safe.ai;;iiit.ac.in;newcastle.edu.au;une.edu;newcastle.edu.au;asu.edu;cs.toronto.edu;mit.edu;;safe.ai", "position": "Undergrad student;PhD student;Researcher;Researcher;;Researcher;PhD student;Researcher;MS student;MS student;Researcher;;MS student;;Software Engineer;Researcher;Undergrad student;Undergrad student;Undergrad student;Undergrad student;PhD student;PhD student;PhD student;PhD student;Researcher;;PhD student;MS student;;PhD student;;;;Undergrad student;;Researcher;;Full Professor;Lecturer;Associate Professor;Full Professor;Associate Professor;Assistant Professor;Associate Professor;;Executive and Research Director", "bibtex": "@inproceedings{\nli2024the,\ntitle={The {WMDP} Benchmark: Measuring and Reducing Malicious Use with Unlearning},\nauthor={Nathaniel Li and Alexander Pan and Anjali Gopal and Summer Yue and Daniel Berrios and Alice Gatti and Justin D. Li and Ann-Kathrin Dombrowski and Shashwat Goel and Gabriel Mukobi and Nathan Helm-Burger and Rassin Lababidi and Lennart Justen and Andrew Bo Liu and Michael Chen and Isabelle Barrass and Oliver Zhang and Xiaoyuan Zhu and Rishub Tamirisa and Bhrugu Bharathi and Ariel Herbert-Voss and Cort B Breuer and Andy Zou and Mantas Mazeika and Zifan Wang and Palash Oswal and Weiran Lin and Adam Alfred Hunt and Justin Tienken-Harder and Kevin Y. 
Shih and Kemper Talley and John Guan and Ian Steneker and David Campbell and Brad Jokubaitis and Steven Basart and Stephen Fitz and Ponnurangam Kumaraguru and Kallol Krishna Karmakar and Uday Tupakula and Vijay Varadharajan and Yan Shoshitaishvili and Jimmy Ba and Kevin M. Esvelt and Alexandr Wang and Dan Hendrycks},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xlr6AUDuJz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 669554, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 46, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=428969006009722343&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "berkeley.edu;berkeley.edu;mit.edu;scale.ai;;safe.ai;nyu.edu;far.ai;iiit.ac.in;cs.stanford.edu;securebio.org;;mit.edu;;stripe.com;safe.ai;stanford.edu;usc.edu;cs.illinois.edu;ucla.edu;harvard.edu;stanford.edu;andrew.cmu.edu;uiuc.edu;safe.ai;;andrew.cmu.edu;cmu.edu;;stanford.edu;;;;drake.edu;;safe.ai;;iiit.ac.in;newcastle.edu.au;une.edu;newcastle.edu.au;asu.edu;cs.toronto.edu;mit.edu;;safe.ai", "author_num": 46, "aff_unique_index": "0;0;1;2;3;4;5;6;7;8;1;9;3;7;10;11;12;13;7;14;15;3;14;14;7;16;3;17;18;19;18;20;21;1;3", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology;Scale AI;Center for AI Safety;New York University;FAR.AI;International Institute of Information Technology, Hyderabad;Stanford University;SecureBio;Stripe;University of Southern California;University of Illinois Urbana-Champaign;University of California, Los Angeles;Harvard University;Carnegie Mellon University;University of Illinois;Drake University;International Institute of Information Technology;University of Newcastle;University of New England;Arizona State University;University of Toronto", "aff_unique_dep": ";;;;;;;Computer Science Department;;;;;;;;;;;;;;Department of Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu;https://scale.ai;https://www.centerforaisafety.org;https://www.nyu.edu;https://www.far.ai;https://iiit Hyderabad.ac.in;https://www.stanford.edu;;https://stripe.com;https://www.usc.edu;https://illinois.edu;https://www.ucla.edu;https://www.harvard.edu;https://www.cmu.edu;https://illinois.edu;https://www.drake.edu;https://iiit Hyderabad.ac.in;https://www.newcastle.edu.au;https://www.une.edu;https://www.asu.edu;https://www.utoronto.ca", "aff_unique_abbr": "UC Berkeley;MIT;Scale AI;;NYU;FAR.AI;IIIT-H;Stanford;;Stripe;USC;UIUC;UCLA;Harvard;CMU;UIUC;Drake;IIIT Hyderabad;UON;UNE;ASU;U of T", "aff_campus_unique_index": "0;0;2;3;3;4;5;4;3;5;3;2;6", "aff_campus_unique": "Berkeley;;Hyderabad;Stanford;Los Angeles;Urbana-Champaign;Toronto", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1;3;0;3;0;4;0;0", "aff_country_unique": "United States;India;;Australia;Canada" }, { "title": "Transformers Learn Nonlinear Features In Context: Nonconvex Mean-field Dynamics on the Attention Landscape", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32694", "id": "xm2lU7tteQ", "proceeding": "https://proceedings.mlr.press/v235/kim24af.html", "pdf": "https://openreview.net/pdf?id=xm2lU7tteQ", "openreview": "https://openreview.net/forum?id=xm2lU7tteQ", "author_site": "Juno Kim, Taiji Suzuki", "tldr": "", "abstract": "Large language models based on the Transformer architecture have demonstrated impressive capabilities to learn in context. 
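An illustrative aside on the WMDP entry above: a minimal numpy sketch of an RMU-style representation-control loss, steering hidden states on hazardous ("forget") inputs toward a fixed random control vector while pinning hidden states on benign ("retain") inputs to those of the frozen original model. Dimensions, scales, and coefficients are illustrative assumptions, not the released implementation.

```python
import numpy as np

rng = np.random.default_rng(0)
d = 512
u = rng.uniform(size=d)
control_vec = 20.0 * u / np.linalg.norm(u)   # fixed, scaled random direction

def rmu_loss(h_forget, h_retain, h_retain_frozen, alpha=100.0):
    """Representation-control unlearning loss (sketch): push forget-set
    activations toward the control vector; keep retain-set activations
    close to the frozen model's."""
    forget_term = np.mean((h_forget - control_vec) ** 2)
    retain_term = np.mean((h_retain - h_retain_frozen) ** 2)
    return forget_term + alpha * retain_term

h_f = rng.normal(size=(8, d))                 # updated model, forget batch
h_r = rng.normal(size=(8, d))                 # updated model, retain batch
h_r0 = h_r + 0.01 * rng.normal(size=(8, d))   # frozen model, retain batch
print(rmu_loss(h_f, h_r, h_r0))
```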
However, existing theoretical studies on how this phenomenon arises are limited to the dynamics of a single layer of attention trained on linear regression tasks. In this paper, we study the optimization of a Transformer consisting of a fully connected layer followed by a linear attention layer. The MLP acts as a common nonlinear representation or feature map, greatly enhancing the power of in-context learning. We prove in the mean-field and two-timescale limit that the infinite-dimensional loss landscape for the distribution of parameters, while highly nonconvex, becomes quite benign. We also analyze the second-order stability of mean-field dynamics and show that Wasserstein gradient flow almost always avoids saddle points. Furthermore, we establish novel methods for obtaining concrete improvement rates both away from and near critical points. This represents the first saddle point analysis of mean-field dynamics in general and the techniques are of independent interest.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Juno Kim;Taiji Suzuki", "authorids": "~Juno_Kim1;~Taiji_Suzuki1", "gender": "M;M", "homepage": "https://junokim1.github.io/;http://ibis.t.u-tokyo.ac.jp/suzuki/", "dblp": "59/8200;08/312", "google_scholar": "PEHQlgkAAAAJ;x8osrBsAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Juno_Kim1;~Taiji_Suzuki1", "aff": "RIKEN;The University of Tokyo", "aff_domain": "riken.jp;tokyo.ac.jp", "position": "Researcher;Associate Professor", "bibtex": "@inproceedings{\nkim2024transformers,\ntitle={Transformers Learn Nonlinear Features In Context: Nonconvex Mean-field Dynamics on the Attention Landscape},\nauthor={Juno Kim and Taiji Suzuki},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xm2lU7tteQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 676328, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6142444491284027819&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "riken.jp;tokyo.ac.jp", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "RIKEN;University of Tokyo", "aff_unique_dep": ";", "aff_unique_url": "https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "RIKEN;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "RoboCodeX: Multimodal Code Generation for Robotic Behavior Synthesis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32693", "id": "xnQ1qoly7Q", "proceeding": "https://proceedings.mlr.press/v235/mu24a.html", "pdf": "https://openreview.net/pdf?id=xnQ1qoly7Q", "openreview": "https://openreview.net/forum?id=xnQ1qoly7Q", "author_site": "Yao Mu, Junting Chen, Qing-Long Zhang, Shoufa Chen, Qiaojun Yu, Chongjian GE, Runjian Chen, Zhixuan Liang, Mengkang Hu, Chaofan Tao, Peize Sun, Haibao Yu, Chao Yang, WENQI SHAO, Wenhai Wang, Jifeng Dai, Yu Qiao, Mingyu Ding, Ping Luo", "tldr": "", "abstract": "Robotic behavior synthesis, the problem of understanding multimodal inputs and generating precise physical control for robots, is an important part of Embodied AI. 
Despite successes in applying multimodal large language models for high-level understanding, it remains challenging to translate these conceptual understandings into detailed robotic actions while achieving generalization across various scenarios. In this paper, we propose a tree-structured multimodal code generation framework for generalized robotic behavior synthesis, termed RoboCodeX. RoboCodeX decomposes high-level human instructions into multiple object-centric manipulation units consisting of physical preferences such as affordance and safety constraints, and applies code generation to introduce generalization ability across various robotics platforms. To further enhance the capability to map conceptual and perceptual understanding into control commands, a specialized multimodal reasoning dataset is collected for pre-training and an iterative self-updating methodology is introduced for supervised fine-tuning. Extensive experiments demonstrate that RoboCodeX achieves state-of-the-art performance in both simulators and real robots on four different kinds of manipulation tasks and one embodied navigation task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yao Mu;Junting Chen;Qing-Long Zhang;Shoufa Chen;Qiaojun Yu;Chongjian GE;Runjian Chen;Zhixuan Liang;Mengkang Hu;Chaofan Tao;Peize Sun;Haibao Yu;Chao Yang;Wenqi Shao;Wenhai Wang;Jifeng Dai;Yu Qiao;Mingyu Ding;Ping Luo", "authorids": "~Yao_Mu1;~Junting_Chen2;~Qing-Long_Zhang1;~Shoufa_Chen1;~Qiaojun_Yu1;~Chongjian_GE1;~Runjian_Chen1;~Zhixuan_Liang2;~Mengkang_Hu1;~Chaofan_Tao1;~Peize_Sun1;~Haibao_Yu2;~Chao_Yang3;~Wenqi_Shao2;~Wenhai_Wang2;~Jifeng_Dai1;~Yu_Qiao1;~Mingyu_Ding1;~Ping_Luo2", "gender": "M;M;;M;M;M;M;M;M;M;M;M;;M;;M;;M;", "homepage": "https://yaomarkmu.github.io/;https://sgtvincent.github.io/;;https://www.shoufachen.com;https://github.com/qiaojunyu/qiaojunyu.github.io;https://chongjiange.github.io;https://runjian-chen.github.io;https://liang-zx.github.io/;https://aaron617.github.io/;;https://peizesun.github.io/;;;https://wqshao126.github.io/;;https://jifengdai.org/;;https://dingmyu.github.io/;", "dblp": "260/0674;;;187/4654;;287/4197;257/4647;212/8952;321/0644;239/5831;249/2345;246/4643;;227/3122;;14/9399;;188/5243;", "google_scholar": ";;;ogoCvHEAAAAJ;hOxT8QUAAAAJ;https://scholar.google.com.hk/citations?user=7DA_vcUAAAAJ;_USUMdAAAAAJ;KEGTmmIAAAAJ;FhVRimUAAAAJ;gjmfLroAAAAJ;Grkp5AQAAAAJ;JW4F5HoAAAAJ;;Bs9mrwwAAAAJ;;SH_-B_AAAAAJ;;w4yTWwoAAAAJ;", "orcid": ";;;0000-0002-6126-2595;;;0000-0003-0519-496X;0009-0008-6815-9866;0009-0009-3779-3378;;;;;;;;;0000-0001-6556-8359;", "linkedin": ";;;;;chongjian-ge-%EF%BC%88%E8%91%9B%E5%B4%87%E5%89%91%EF%BC%89-3b393310b/;;;;;;;;;;;;dingmyu/;", "or_profile": "~Yao_Mu1;~Junting_Chen2;~Qing-Long_Zhang1;~Shoufa_Chen1;~Qiaojun_Yu1;~Chongjian_GE1;~Runjian_Chen1;~Zhixuan_Liang2;~Mengkang_Hu1;~Chaofan_Tao1;~Peize_Sun1;~Haibao_Yu2;~Chao_Yang3;~Wenqi_Shao2;~Wenhai_Wang2;~Jifeng_Dai1;~Yu_Qiao1;~Mingyu_Ding1;~Ping_Luo2", "aff": "The University of Hong Kong;National University of Singapore;;The University of Hong Kong;Shanghai Jiaotong University;The University of Hong Kong;University of Hong Kong;The University of Hong Kong;University of Hong Kong;The University of Hong Kong;The University of Hong Kong;The University of Hong Kong;;Shanghai AI Laboratory;;Tsinghua University;;University of California, Berkeley;", "aff_domain": "hku.hk;u.nus.edu;;hku.hk;sjtu.edu.cn;hku.hk;hku.hk;hku.hk;hku.hk;hku.hk;hku.hk;hku.hk;;pjlab.org.cn;;tsinghua.edu.cn;;berkeley.edu;", "position": "PhD 
student;PhD student;;PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;PhD Student;PhD student;PhD student;;Researcher;;Associate Professor;;Postdoc;", "bibtex": "@inproceedings{\nmu2024robocodex,\ntitle={RoboCodeX: Multimodal Code Generation for Robotic Behavior Synthesis},\nauthor={Yao Mu and Junting Chen and Qing-Long Zhang and Shoufa Chen and Qiaojun Yu and Chongjian GE and Runjian Chen and Zhixuan Liang and Mengkang Hu and Chaofan Tao and Peize Sun and Haibao Yu and Chao Yang and Wenqi Shao and Wenhai Wang and Jifeng Dai and Yu Qiao and Mingyu Ding and Ping Luo},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xnQ1qoly7Q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 10021722, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 19, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6667727382630266665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "hku.hk;u.nus.edu;;hku.hk;sjtu.edu.cn;hku.hk;hku.hk;hku.hk;hku.hk;hku.hk;hku.hk;hku.hk;;pjlab.org.cn;;tsinghua.edu.cn;;berkeley.edu;", "author_num": 19, "aff_unique_index": "0;1;0;2;0;0;0;0;0;0;0;3;4;5", "aff_unique_norm": "University of Hong Kong;National University of Singapore;Shanghai Jiao Tong University;Shanghai AI Laboratory;Tsinghua University;University of California, Berkeley", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.hku.hk;https://www.nus.edu.sg;https://www.sjtu.edu.cn;https://www.shanghai-ai-lab.com;https://www.tsinghua.edu.cn;https://www.berkeley.edu", "aff_unique_abbr": "HKU;NUS;SJTU;SAIL;THU;UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;2", "aff_campus_unique": "Hong Kong SAR;;Berkeley", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0;0;0;0;0;2", "aff_country_unique": "China;Singapore;United States" }, { "title": "Evaluating and Analyzing Relationship Hallucinations in Large Vision-Language Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32692", "id": "xpSlt67vxQ", "proceeding": "https://proceedings.mlr.press/v235/wu24l.html", "pdf": "https://openreview.net/pdf?id=xpSlt67vxQ", "openreview": "https://openreview.net/forum?id=xpSlt67vxQ", "author_site": "Mingrui Wu, Jiayi Ji, Oucheng Huang, Jiale Li, Yuhang Wu, Xiaoshuai Sun, Rongrong Ji", "tldr": "", "abstract": "The issue of hallucinations is a prevalent concern in existing Large Vision-Language Models (LVLMs). Previous efforts have primarily focused on investigating object hallucinations, which can be easily alleviated by introducing object detectors. However, these efforts neglect hallucinations in inter-object relationships, which is essential for visual comprehension. In this work, we introduce R-Bench, a novel benchmark for evaluating Vision Relationship Hallucination. R-Bench features image-level questions that focus on the existence of relationships and instance-level questions that assess local visual comprehension. We identify three types of relationship co-occurrences that lead to hallucinations: relationship-relationship, subject-relationship, and relationship-object. The visual instruction tuning dataset's long-tail distribution significantly impacts LVLMs' understanding of visual relationships. 
Additionally, our analysis reveals that current LVLMs tend to overlook visual content, overly rely on the common sense knowledge of Large Language Models (LLMs), and struggle with spatial relationship reasoning based on contextual information.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingrui Wu;Jiayi Ji;Oucheng Huang;Jiale Li;Yuhang Wu;Xiaoshuai Sun;Rongrong Ji", "authorids": "~Mingrui_Wu2;~Jiayi_Ji1;~Oucheng_Huang1;~Jiale_Li5;~Yuhang_Wu4;~Xiaoshuai_Sun3;~Rongrong_Ji5", "gender": "Not Specified;M;M;M;M;M;M", "homepage": ";https://scholar.google.com/citations?user=xp_rICcAAAAJ&hl=zh-CN;https://i.csdn.net/#/user-center/profile?spm=1001.2101.3001.5111;;https://sites.google.com/view/xssun;http://mac.xmu.edu.cn/rrji-en.html;https://scholar.google.com/citations?hl=zh-CN&user=fED_ASAAAAAJ", "dblp": ";250/9459;;41/7732-4;26/5787.html;86/5681;", "google_scholar": "sbCKwnYAAAAJ;xp_rICcAAAAJ;https://scholar.google.com/citations?view_op=list_works;lSmY99gAAAAJ;KPMK3B4AAAAJ;;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";0000-0002-9956-6308;;;0000-0003-3912-9306;;", "linkedin": ";;;;;;", "or_profile": "~Mingrui_Wu2;~Jiayi_Ji1;~Oucheng_Huang1;~Yuhang_Wu4;~Xiaoshuai_Sun3;~Rongrong_Ji5;~JialeLi2", "aff": "Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University;Xiamen University", "aff_domain": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "position": "PhD student;Postdoc;MS student;PhD student;Associate Professor;Full Professor;Undergrad student", "bibtex": "@inproceedings{\nwu2024evaluating,\ntitle={Evaluating and Analyzing Relationship Hallucinations in Large Vision-Language Models},\nauthor={Mingrui Wu and Jiayi Ji and Oucheng Huang and Jiale Li and Yuhang Wu and Xiaoshuai Sun and Rongrong Ji},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xpSlt67vxQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1975852, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9285619971179190182&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn;xmu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Xiamen University", "aff_unique_dep": "", "aff_unique_url": "https://www.xmu.edu.cn", "aff_unique_abbr": "XMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Membership Inference Attacks on Diffusion Models via Quantile Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32691", "id": "xqqccG7gf1", "proceeding": "https://proceedings.mlr.press/v235/tang24g.html", "pdf": "https://openreview.net/pdf?id=xqqccG7gf1", "openreview": "https://openreview.net/forum?id=xqqccG7gf1", "author_site": "Shuai Tang, Steven Wu, Sergul Aydore, Michael Kearns, Aaron Roth", "tldr": "", "abstract": "Recently, diffusion models have become popular tools for image synthesis due to their high-quality outputs. However, like other large models, they may leak private information about their training data. 
Here, we demonstrate a privacy vulnerability of diffusion models through a *membership inference (MI) attack*, which aims to identify whether a target example belongs to the training set when given the trained diffusion model. Our proposed MI attack learns quantile regression models that predict (a quantile of) the distribution of reconstruction loss on examples not used in training. This allows us to define a granular hypothesis test for determining the membership of a point in the training set, based on thresholding the reconstruction loss of that point using a custom threshold tailored to the example. We also provide a simple bootstrap technique that takes a majority membership prediction over ''a bag of weak attackers'' which improves the accuracy over individual quantile regression models. We show that our attack outperforms the prior state-of-the-art attack while being substantially less computationally expensive --- prior attacks required training multiple ''shadow models'' with the same architecture as the model under attack, whereas our attack requires training only much smaller models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuai Tang;Steven Wu;Sergul Aydore;Michael Kearns;Aaron Roth", "authorids": "~Shuai_Tang1;~Steven_Wu1;~Sergul_Aydore1;~Michael_Kearns2;~Aaron_Roth1", "gender": "M;M;;F;M", "homepage": "http://shuaitang.github.io;http://www.cis.upenn.edu/~aaroth/;https://www.cis.upenn.edu/~mkearns/;https://sergulaydore.github.io/;https://zstevenwu.com/", "dblp": ";80/3311;78/6858;80/11511;137/8350", "google_scholar": "fJVeBrAAAAAJ;https://scholar.google.com.tw/citations?user=kLUQrrYAAAAJ;8iQk0DIAAAAJ;;MbF6rTEAAAAJ", "orcid": ";;;;", "linkedin": ";;;;zstevenwu/", "or_profile": "~Shuai_Tang1;~Aaron_Roth1;~Michael_J._Kearns1;~Serg\u00fcl_Ayd\u00f6re1;~Zhiwei_Steven_Wu1", "aff": "Amazon Web Services;University of Pennsylvania;University of Pennsylvania;Amazon;Carnegie Mellon University", "aff_domain": "amazon.com;upenn.edu;upenn.edu;amazon.com;cmu.edu", "position": "Applied Scientist;Full Professor;Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\ntang2024membership,\ntitle={Membership Inference Attacks on Diffusion Models via Quantile Regression},\nauthor={Shuai Tang and Steven Wu and Sergul Aydore and Michael Kearns and Aaron Roth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xqqccG7gf1}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 923970, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3091671437852823191&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "amazon.com;upenn.edu;upenn.edu;amazon.com;cmu.edu", "author_num": 5, "aff_unique_index": "0;1;1;0;2", "aff_unique_norm": "Amazon;University of Pennsylvania;Carnegie Mellon University", "aff_unique_dep": "Amazon Web Services;;", "aff_unique_url": "https://aws.amazon.com;https://www.upenn.edu;https://www.cmu.edu", "aff_unique_abbr": "AWS;UPenn;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position: Reinforcement Learning in Dynamic Treatment Regimes Needs Critical Reexamination", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32690", "id": "xtKWwB6lzT", "proceeding": "https://proceedings.mlr.press/v235/luo24f.html", 
"pdf": "https://openreview.net/pdf?id=xtKWwB6lzT", "openreview": "https://openreview.net/forum?id=xtKWwB6lzT", "author_site": "Zhiyao Luo, Yangchen Pan, Peter Watkinson, Tingting Zhu", "tldr": "", "abstract": "In the rapidly changing healthcare landscape, the implementation of offline reinforcement learning (RL) in dynamic treatment regimes (DTRs) presents a mix of unprecedented opportunities and challenges. This position paper offers a critical examination of the current status of offline RL in the context of DTRs. We argue for a reassessment of applying RL in DTRs, citing concerns such as inconsistent and potentially inconclusive evaluation metrics, the absence of naive and supervised learning baselines, and the diverse choice of RL formulation in existing research. Through a case study with more than 17,000 evaluation experiments using a publicly available Sepsis dataset, we demonstrate that the performance of RL algorithms can significantly vary with changes in evaluation metrics and Markov Decision Process (MDP) formulations. Surprisingly, it is observed that in some instances, RL algorithms can be surpassed by random baselines subjected to policy evaluation methods and reward design. This calls for more careful policy evaluation and algorithm development in future DTR works. Additionally, we discussed potential enhancements toward more reliable development of RL-based dynamic treatment regimes and invited further discussion within the community. Code is available at https://github.com/GilesLuo/ReassessDTR.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhiyao Luo;Yangchen Pan;Peter Watkinson;Tingting Zhu", "authorids": "~Zhiyao_Luo1;~Yangchen_Pan2;~Peter_Watkinson1;~Tingting_Zhu1", "gender": "M;M;M;F", "homepage": ";https://yannickycpan.github.io/yangchenpan/;https://www.ndcn.ox.ac.uk/team/peter-watkinson;https://eng.ox.ac.uk/people/tingting-zhu/", "dblp": ";183/0925;;29/7666-1", "google_scholar": "g65Ry78AAAAJ;4M4pOp4AAAAJ;https://scholar.google.co.uk/citations?user=pbIn-FUAAAAJ;https://scholar.google.com.vn/citations?user=fjGMIl0AAAAJ", "orcid": ";;0000-0003-1023-3927;0000-0002-1552-5630", "linkedin": ";;;", "or_profile": "~Zhiyao_Luo1;~Yangchen_Pan2;~Peter_Watkinson1;~Tingting_Zhu1", "aff": "University of Oxford, University of Oxford;University of Oxford;University of Oxford;University of Oxford", "aff_domain": "eng.ox.ac.uk;eng.ox.ac.uk;ox.ac.uk;eng.ox.ac.uk", "position": "PhD student;Lecturer;Full Professor;RAEng Research Fellow", "bibtex": "@inproceedings{\nluo2024position,\ntitle={Position: Reinforcement Learning in Dynamic Treatment Regimes Needs Critical Reexamination},\nauthor={Zhiyao Luo and Yangchen Pan and Peter Watkinson and Tingting Zhu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xtKWwB6lzT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1816846, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6369624580006680445&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "eng.ox.ac.uk;eng.ox.ac.uk;ox.ac.uk;eng.ox.ac.uk", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { 
"title": "Memory Efficient Neural Processes via Constant Memory Attention Block", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32689", "id": "xtwCf7iAs2", "proceeding": "https://proceedings.mlr.press/v235/feng24i.html", "pdf": "https://openreview.net/pdf?id=xtwCf7iAs2", "openreview": "https://openreview.net/forum?id=xtwCf7iAs2", "author_site": "Leo Feng, Frederick Tung, Hossein Hajimirsadeghi, Yoshua Bengio, Mohamed Osama Ahmed", "tldr": "", "abstract": "Neural Processes (NPs) are popular meta-learning methods for efficiently modelling predictive uncertainty. Recent state-of-the-art methods, however, leverage expensive attention mechanisms, limiting their applications, particularly in low-resource settings. In this work, we propose Constant Memory Attentive Neural Processes (CMANPs), an NP variant that only requires **constant** memory. To do so, we first propose an efficient update operation for Cross Attention. Leveraging the update operation, we propose Constant Memory Attention Block (CMAB), a novel attention block that (i) is permutation invariant, (ii) computes its output in constant memory, and (iii) performs constant computation updates. Finally, building on CMAB, we detail Constant Memory Attentive Neural Processes. Empirically, we show CMANPs achieve state-of-the-art results on popular NP benchmarks while being significantly more memory efficient than prior methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Leo Feng;Frederick Tung;Hossein Hajimirsadeghi;Yoshua Bengio;Mohamed Osama Ahmed", "authorids": "~Leo_Feng1;~Frederick_Tung1;~Hossein_Hajimirsadeghi1;~Yoshua_Bengio1;~Mohamed_Osama_Ahmed2", "gender": "M;M;M;M;M", "homepage": "https://leofeng-ca.github.io/;;;http://yoshuabengio.org;", "dblp": "255/9367;10/7697;64/8131;56/953;https://dblp.org/pers/hd/a/Ahmed:Mohamed_Osama", "google_scholar": "WsRunnEAAAAJ;https://scholar.google.ca/citations?user=T4EeZ9gAAAAJ;;kukA0LcAAAAJ;https://scholar.google.ca/citations?user=jyVyVj4AAAAJ", "orcid": ";;;;0000-0001-6758-1178", "linkedin": "leo-feng/;;;yoshuabengio/?originalSubdomain=ca;mohamed-osama-ahmed-91439a154/", "or_profile": "~Leo_Feng1;~Frederick_Tung1;~Hossein_Hajimirsadeghi1;~Yoshua_Bengio1;~Mohamed_Osama_Ahmed2", "aff": "Mila - Quebec Artificial Intelligence Institute;Borealis AI;Borealis AI;University of Montreal;", "aff_domain": "mila.quebec;borealisai.com;borealisai.com;umontreal.ca;", "position": "PhD student;Researcher;Principal Researcher;Full Professor;", "bibtex": "@inproceedings{\nfeng2024memory,\ntitle={Memory Efficient Neural Processes via Constant Memory Attention Block},\nauthor={Leo Feng and Frederick Tung and Hossein Hajimirsadeghi and Yoshua Bengio and Mohamed Osama Ahmed},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xtwCf7iAs2}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2028368274085747897&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "mila.quebec;borealisai.com;borealisai.com;umontreal.ca;", "author_num": 5, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Quebec Artificial Intelligence Institute;Borealis AI;University of Montreal", "aff_unique_dep": "Artificial Intelligence;;", "aff_unique_url": "https://mila.quebec;https://www.borealisai.com;https://wwwumontreal.ca", 
"aff_unique_abbr": "Mila;Borealis AI;UM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Drug Discovery with Dynamic Goal-aware Fragments", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32688", "id": "xuX2rDSSco", "proceeding": "https://proceedings.mlr.press/v235/lee24o.html", "pdf": "https://openreview.net/pdf?id=xuX2rDSSco", "openreview": "https://openreview.net/forum?id=xuX2rDSSco", "author_site": "Seul Lee, Seanie Lee, Kenji Kawaguchi, Sung Ju Hwang", "tldr": "", "abstract": "Fragment-based drug discovery is an effective strategy for discovering drug candidates in the vast chemical space, and has been widely employed in molecular generative models. However, many existing fragment extraction methods in such models do not take the target chemical properties into account or rely on heuristic rules. Additionally, the existing fragment-based generative models cannot update the fragment vocabulary with goal-aware fragments newly discovered during the generation. To this end, we propose a molecular generative framework for drug discovery, named *Goal-aware fragment Extraction, Assembly, and Modification* (GEAM). GEAM consists of three modules, each responsible for goal-aware fragment extraction, fragment assembly, and fragment modification. The fragment extraction module identifies important fragments contributing to the desired target properties with the information bottleneck principle, thereby constructing an effective goal-aware fragment vocabulary. Moreover, GEAM can explore beyond the initial vocabulary with the fragment modification module, and the exploration is further enhanced through the dynamic goal-aware vocabulary update. We experimentally demonstrate that GEAM effectively discovers drug candidates through the generative cycle of the three modules in various drug discovery tasks. 
Our code is available at https://github.com/SeulLee05/GEAM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seul Lee;Seanie Lee;Kenji Kawaguchi;Sung Ju Hwang", "authorids": "~Seul_Lee1;~Seanie_Lee1;~Kenji_Kawaguchi1;~Sung_Ju_Hwang1", "gender": "Not Specified;M;;", "homepage": "https://seullee05.github.io;https://seanie12.github.io/;https://ml.comp.nus.edu.sg/#members;", "dblp": "159/0357;219/6771;;", "google_scholar": "Ek0N9YYAAAAJ;zrZu6GkAAAAJ;aLl3rYoAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Seul_Lee1;~Seanie_Lee1;~Kenji_Kawaguchi1;~Sung_Ju_Hwang1", "aff": "Korea Advanced Institute of Science & Technology;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;National University of Singapore;", "aff_domain": "kaist.ac.kr;mila.umontreal.ca;nus.edu;", "position": "PhD student;Intern;Presidential Young Professor;", "bibtex": "@inproceedings{\nlee2024drug,\ntitle={Drug Discovery with Dynamic Goal-aware Fragments},\nauthor={Seul Lee and Seanie Lee and Kenji Kawaguchi and Sung Ju Hwang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xuX2rDSSco}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2928954, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12467445023885024926&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "kaist.ac.kr;mila.umontreal.ca;nus.edu;", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Montreal;National University of Singapore", "aff_unique_dep": ";Montreal Institute for Learning Algorithms;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.mila.quebec;https://www.nus.edu.sg", "aff_unique_abbr": "KAIST;MILA;NUS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;1;2", "aff_country_unique": "South Korea;Canada;Singapore" }, { "title": "Learning Graph Representation via Graph Entropy Maximization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32687", "id": "xwOENWCo46", "proceeding": "https://proceedings.mlr.press/v235/sun24i.html", "pdf": "https://openreview.net/pdf?id=xwOENWCo46", "openreview": "https://openreview.net/forum?id=xwOENWCo46", "author_site": "Ziheng Sun, Xudong Wang, Chris Ding, Jicong Fan", "tldr": "", "abstract": "Graph representation learning aims to represent graphs as vectors that can be utilized in downstream tasks such as graph classification. In this work, we focus on learning diverse representations that can capture the graph information as much as possible. We propose quantifying graph information using graph entropy, where we define a probability distribution of a graph based on its nodes' representations and global-graph representation. However, the computation of graph entropy is NP-hard due to the complex vertex-packing polytope involved in its definition. To address this challenge, we provide an approximation method leveraging orthonormal representations for graph entropy maximization. The proposed method is implemented via graph neural networks, resulting in informative node-level and graph-level representations. Experimental results demonstrate the effectiveness of our method in comparison to many baselines in unsupervised learning and semi-supervised learning tasks. 
The code of our method is available at https://github.com/MathAdventurer/GeMax.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziheng Sun;Xudong Wang;Chris Ding;Jicong Fan", "authorids": "~Ziheng_Sun1;~Xudong_Wang10;~Chris_Ding1;~Jicong_Fan2", "gender": "M;M;M;M", "homepage": ";https://xd-w.github.io;http://ranger.uta.edu/~chqding/;https://jicongfan.github.io/", "dblp": ";;https://dblp.uni-trier.de/pers/hd/d/Ding:Chris;139/1570", "google_scholar": "https://scholar.google.com/citations?hl=en;;q7FfnjgAAAAJ;vdJsnhIAAAAJ", "orcid": ";;;0000-0001-9665-0355", "linkedin": ";;;", "or_profile": "~Ziheng_Sun1;~Xudong_Wang10;~Chris_Ding1;~Jicong_Fan2", "aff": "Chinese University of HongKong;The Chinese University of Hong Kong (Shenzhen);University of Texas at Arlington;The Chinese University of Hong Kong, Shenzhen", "aff_domain": "cuhk.edu.cn;cuhk.edu.cn;cse.uta.edu;cuhk.edu.cn", "position": "PhD student;PhD student;Professor;Assistant Professor", "bibtex": "@inproceedings{\nsun2024learning,\ntitle={Learning Graph Representation via Graph Entropy Maximization},\nauthor={Ziheng Sun and Xudong Wang and Chris Ding and Jicong Fan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xwOENWCo46}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2413096, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1164052794404602400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "cuhk.edu.cn;cuhk.edu.cn;cse.uta.edu;cuhk.edu.cn", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;University of Texas at Arlington", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.uta.edu", "aff_unique_abbr": "CUHK;UTA", "aff_campus_unique_index": "0;1;2;1", "aff_campus_unique": "Hong Kong SAR;Shenzhen;Arlington", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Convergence Guarantees for the DeepWalk Embedding on Block Models", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32686", "id": "xwxUbBHC1q", "proceeding": "https://proceedings.mlr.press/v235/harker24a.html", "pdf": "https://openreview.net/pdf?id=xwxUbBHC1q", "openreview": "https://openreview.net/forum?id=xwxUbBHC1q", "author_site": "Christopher Harker, Aditya Bhaskara", "tldr": "", "abstract": "Graph embeddings have emerged as a powerful tool for understanding the structure of graphs. Unlike classical spectral methods, recent methods such as DeepWalk, Node2Vec, etc. are based on solving nonlinear optimization problems on the graph, using local information obtained by performing random walks. These techniques have empirically been shown to produce ``better'' embeddings than their classical counterparts. However, due to their reliance on solving a nonconvex optimization problem, obtaining theoretical guarantees on the properties of the solution has remained a challenge, even for simple classes of graphs. In this work, we show convergence properties for the DeepWalk algorithm on graphs obtained from the Stochastic Block Model (SBM). Despite being simplistic, the SBM has proved to be a classic model for analyzing the behavior of algorithms on large graphs. 
Our results mirror the existing ones for spectral embeddings on SBMs, showing that even in the case of one-dimensional embeddings, the output of the DeepWalk algorithm provably recovers the cluster structure with high probability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christopher Harker;Aditya Bhaskara", "authorids": "~Christopher_Harker1;~Aditya_Bhaskara1", "gender": ";M", "homepage": ";http://www.cs.utah.edu/~bhaskara/", "dblp": ";47/7801.html", "google_scholar": ";tqxTaiAAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Christopher_Harker1;~Aditya_Bhaskara1", "aff": ";University of Utah", "aff_domain": ";utah.edu", "position": ";Associate Professor", "bibtex": "@inproceedings{\nharker2024convergence,\ntitle={Convergence Guarantees for the DeepWalk Embedding on Block Models},\nauthor={Christopher Harker and Aditya Bhaskara},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xwxUbBHC1q}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 683898, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gNqCZpPMqdAJ:scholar.google.com/&scioq=Convergence+Guarantees+for+the+DeepWalk+Embedding+on+Block+Models&hl=en&as_sdt=0,5", "gs_version_total": 6, "email": ";utah.edu", "author_num": 2, "aff_unique_index": "0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "Utah", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Exploring Intrinsic Dimension for Vision-Language Model Pruning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32685", "id": "xxL7CEWuxz", "proceeding": "https://proceedings.mlr.press/v235/wang24cp.html", "pdf": "https://openreview.net/pdf?id=xxL7CEWuxz", "openreview": "https://openreview.net/forum?id=xxL7CEWuxz", "author_site": "Hanzhang Wang, Jiawen Zhang, Qingyuan Ma", "tldr": "", "abstract": "The intrinsic dimension (ID) represents the minimum dimension needed to describe data on a lower-dimensional manifold within high-dimensional spaces. Network pruning aims to reduce the complexity of high-dimensional networks while minimizing performance trade-offs. This symmetry motivates the exploration of ID as a metric for effective pruning. For vision-language models, we investigate whether different modalities exist on separate manifolds, indicating varying complexity and prunability. We empirically study ID variations in large-scale vision-language pre-trained models and examine the contributions of different modalities to model prunability. We propose a layer importance metric based on ID, which can conveniently integrate with current metrics and enhance performance in vision-language model pruning. The experimental results show a high correlation between ID and modality prunability. Visual representations are more sensitive and crucial to model performance, while language representations are more robust and offer greater prunability. Our findings suggest an asymmetric pruning strategy for vision and language modalities, guided by the ID metric. 
The code is available at https://github.com/Nofear18/ID_VL_Pruning", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanzhang Wang;Jiawen Zhang;Qingyuan Ma", "authorids": "~Hanzhang_Wang1;~Jiawen_Zhang4;~Qingyuan_Ma1", "gender": "F;F;", "homepage": "https://hanzhang-wang.github.io;;https://github.com/Nofear18", "dblp": ";;", "google_scholar": ";;", "orcid": "0000-0002-4649-7361;;", "linkedin": ";\u6e05\u6e90-\u9a6c-003078260;", "or_profile": "~Hanzhang_Wang1;~Qingyuan_Ma1;~jiawen_zhang3", "aff": "Shanghai University;Shanghai University;Shanghai University", "aff_domain": "shu.edu.cn;shu.edu.cn;shu.edu.cn", "position": "Assistant Professor;Undergrad student;MS student", "bibtex": "@inproceedings{\nwang2024exploring,\ntitle={Exploring Intrinsic Dimension for Vision-Language Model Pruning},\nauthor={Hanzhang Wang and Jiawen Zhang and Qingyuan Ma},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xxL7CEWuxz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 550638, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C63QhadiHrgJ:scholar.google.com/&scioq=Exploring+Intrinsic+Dimension+for+Vision-Language+Model+Pruning&hl=en&as_sdt=0,3", "gs_version_total": 4, "email": "shu.edu.cn;shu.edu.cn;shu.edu.cn", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai University", "aff_unique_dep": "", "aff_unique_url": "https://www.shu.edu.cn", "aff_unique_abbr": "SHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32684", "id": "xye7iNsgXn", "proceeding": "https://proceedings.mlr.press/v235/zhai24a.html", "pdf": "https://openreview.net/pdf?id=xye7iNsgXn", "openreview": "https://openreview.net/forum?id=xye7iNsgXn", "author_site": "Jiaqi Zhai, Yunxing Liao, Xing Liu, Yueming Wang, Rui Li, Xuan Cao, Yazhi Gao, Zhaojie Gong, Fangda Gu, Michael He, Yinghai Lu, Yu Shi", "tldr": "", "abstract": "Large-scale recommendation systems are characterized by their reliance on high cardinality, heterogeneous features and the need to handle tens of billions of user actions on a daily basis. Despite being trained on huge volume of data with thousands of features, most Deep Learning Recommendation Models (DLRMs) in industry fail to scale with compute. Inspired by success achieved by Transformers in language and vision domains, we revisit fundamental design choices in recommendation systems. We reformulate recommendation problems as sequential transduction tasks within a generative modeling framework (``Generative Recommenders''), and propose a new architecture, HSTU, designed for high cardinality, non-stationary streaming recommendation data. HSTU outperforms baselines over synthetic and public datasets by up to 65.8% in NDCG, and is 5.3x to 15.2x faster than FlashAttention2-based Transformers on 8192 length sequences. HSTU-based Generative Recommenders, with 1.5 trillion parameters, improve metrics in online A/B tests by 12.4% and have been deployed on multiple surfaces of a large internet platform with billions of users. 
More importantly, the model quality of Generative Recommenders empirically scales as a power-law of training compute across three orders of magnitude, up to GPT-3/LLaMa-2 scale, which reduces carbon footprint needed for future model developments, and further paves the way for the first foundation models in recommendations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaqi Zhai;Lucy Liao;Xing Liu;Yueming Wang;Rui Li;Xuan Cao;Leon Gao;Zhaojie Gong;Fangda Gu;Jiayuan He;Yinghai Lu;Yu Shi", "authorids": "~Jiaqi_Zhai1;~Lucy_Liao1;~Xing_Liu5;~Yueming_Wang4;~Rui_Li32;~Xuan_Cao2;~Leon_Gao1;~Zhaojie_Gong1;~Fangda_Gu1;~Jiayuan_He2;~Yinghai_Lu1;~Yu_Shi1", "gender": ";M;F;M;M;;M;M;M;M;M;", "homepage": ";;;https://www.linkedin.com/in/rui-li-1108a117tduuufgdbjrfnfkrdufufreuvnuretfnutduutcjjrnnvrvffgchdichkrrblggi;;;;https://michael-jy-he.github.io/;;https://yu-shi-homepage.github.io;https://github.com/YazhiGao;", "dblp": "95/9726;;;96/4282-49.html;;348/9701;222/1888;;;55/4736-2;;140/0641", "google_scholar": "E9wn7LUAAAAJ;ewRsKt4AAAAJ;;m2bfcScAAAAJ;-brP7VEAAAAJ;;ZuFwafoAAAAJ;;;bKveCp4AAAAJ;;", "orcid": "0009-0004-7279-3318;;;0009-0002-3894-0471;0000-0001-7241-7188;0009-0004-1761-7530;;;;0000-0003-4931-7976;;0009-0007-1945-4132", "linkedin": "jiaqizhai/;xing-liu-56529710/;yuemingw;rui-li-1108a117tduuufgdbjrfnfkrdufufreuvnuretfnutduutcjjrnnvrvffgchdichkrrblggi;xuan-cao-23176577/;zhaojie-gong;;michael-jiayuan-he-a36b79101/;yinghai83/;yushi2/;;yunxing-lucy-liao-2806bb158/", "or_profile": "~Jiaqi_Zhai1;~Xing_Liu5;~Yueming_Wang4;~Rui_Li32;~Xuan_Cao2;~Zhaojie_Gong1;~Fangda_Gu1;~Jiayuan_He2;~Yinghai_Lu1;~Yu_Shi1;~Yazhi_Gao1;~Yunxing_Liao1", "aff": "Meta Facebook;Meta;;Meta;Meta Facebook;Meta Facebook;University of California, Berkeley;Meta Platform Inc.;OpenAI;Meta;Meta Facebook;Meta Facebook", "aff_domain": "fb.com;meta.com;;meta.com;meta.com;meta.com;berkeley.edu;meta.com;openai.com;meta.com;meta.com;meta.com", "position": "Distinguished Engineer;Researcher;;Researcher;Researcher;Researcher;PhD student;Researcher;Member of Technical Staff;Research Scientist;Researcher;Researcher", "bibtex": "@inproceedings{\nzhai2024actions,\ntitle={Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations},\nauthor={Jiaqi Zhai and Lucy Liao and Xing Liu and Yueming Wang and Rui Li and Xuan Cao and Leon Gao and Zhaojie Gong and Fangda Gu and Jiayuan He and Yinghai Lu and Yu Shi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xye7iNsgXn}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1404296, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17165962116778901299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "fb.com;meta.com;;meta.com;meta.com;meta.com;berkeley.edu;meta.com;openai.com;meta.com;meta.com;meta.com", "author_num": 12, "aff_unique_index": "0;0;0;0;0;1;0;2;0;0;0", "aff_unique_norm": "Meta;University of California, Berkeley;OpenAI", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.berkeley.edu;https://openai.com", "aff_unique_abbr": "Meta;UC Berkeley;OpenAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Diffusion Models", "status": "Poster", "track": "main", 
"site": "https://icml.cc/virtual/2024/poster/32683", "id": "xzX7kf486K", "proceeding": "https://proceedings.mlr.press/v235/bartosh24a.html", "pdf": "https://openreview.net/pdf?id=xzX7kf486K", "openreview": "https://openreview.net/forum?id=xzX7kf486K", "author_site": "Grigory Bartosh, Dmitry Vetrov, Christian Andersson Naesseth", "tldr": "", "abstract": "Diffusion models have shown remarkable performance on many generative tasks. Despite recent success, most diffusion models are restricted in that they only allow linear transformation of the data distribution. In contrast, broader family of transformations can help train generative distributions more efficiently, simplifying the reverse process and closing the gap between the true negative log-likelihood and the variational approximation. In this paper, we present Neural Diffusion Models (NDMs), a generalization of conventional diffusion models that enables defining and learning time-dependent non-linear transformations of data. We show how to optimise NDMs using a variational bound in a simulation-free setting. Moreover, we derive a time-continuous formulation of NDMs, which allows fast and reliable inference using off-the-shelf numerical ODE and SDE solvers. Finally, we demonstrate the utility of NDMs through experiments on many image generation benchmarks, including MNIST, CIFAR-10, downsampled versions of ImageNet and CelebA-HQ. NDMs outperform conventional diffusion models in terms of likelihood, achieving state-of-the-art results on ImageNet and CelebA-HQ, and produces high-quality samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Grigory Bartosh;Dmitry Vetrov;Christian A. Naesseth", "authorids": "~Grigory_Bartosh1;~Dmitry_P._Vetrov1;~Christian_A._Naesseth1", "gender": "M;M;M", "homepage": ";https://constructor.university/faculty-member/dmitry-vetrov;https://naesseth.github.io/", "dblp": ";89/3348;146/0902", "google_scholar": ";https://scholar.google.ru/citations?user=7HU0UoUAAAAJ;GQ6rOssAAAAJ", "orcid": ";;", "linkedin": "grigory-bartosh-76004a163/;;", "or_profile": "~Grigory_Bartosh1;~Dmitry_P._Vetrov1;~Christian_A_Naesseth1", "aff": "University of Amsterdam;National Research University Higher School of Economics;University of Amsterdam", "aff_domain": "uva.nl;hse.ru;uva.nl", "position": "PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nbartosh2024neural,\ntitle={Neural Diffusion Models},\nauthor={Grigory Bartosh and Dmitry Vetrov and Christian A. 
Naesseth},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xzX7kf486K}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6018180, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3759804013750445777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "uva.nl;hse.ru;uva.nl", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Amsterdam;National Research University Higher School of Economics", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;https://hse.ru", "aff_unique_abbr": "UvA;HSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Netherlands;Russian Federation" }, { "title": "Evolution-Inspired Loss Functions for Protein Representation Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32682", "id": "y5L8W0KRUX", "proceeding": "https://proceedings.mlr.press/v235/gong24e.html", "pdf": "https://openreview.net/pdf?id=y5L8W0KRUX", "openreview": "https://openreview.net/forum?id=y5L8W0KRUX", "author_site": "Chengyue Gong, Adam Klivans, James Loy, Tianlong Chen, qiang liu, Danny Diaz", "tldr": "", "abstract": "AI-based frameworks for protein engineering use self-supervised learning (SSL) to obtain representations for downstream mutation effect predictions. The most common training objective for these methods is wildtype accuracy: given a sequence or structure where a wildtype residue has been masked, predict the missing amino acid. Wildtype accuracy, however, does not align with the primary goal of protein engineering, which is to suggest a mutation rather than to identify what already appears in nature. Here we present Evolutionary Ranking (EvoRank), a training objective that incorporates evolutionary information derived from multiple sequence alignments (MSAs) to learn more diverse protein representations. EvoRank corresponds to ranking amino-acid likelihoods in the probability distribution induced by an MSA. This objective forces models to learn the underlying evolutionary dynamics of a protein. Across a variety of phenotypes and datasets, we demonstrate that EvoRank leads to dramatic improvements in zero-shot performance and can compete with models fine-tuned on experimental data. 
This is particularly important in protein engineering, where it is expensive to obtain data for fine-tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chengyue Gong;Adam Klivans;James Madigan Loy;Tianlong Chen;qiang liu;Daniel Jesus Diaz", "authorids": "~Chengyue_Gong1;~Adam_Klivans1;~James_Madigan_Loy1;~Tianlong_Chen1;~qiang_liu4;~Daniel_Jesus_Diaz1", "gender": "M;M;M;M;M;M", "homepage": ";http://www.cs.utexas.edu/~klivans;;https://tianlong-chen.github.io;;https://www.cs.utexas.edu/~lqiang/", "dblp": "209/4862;k/AdamRKlivans;;;;61/3234-1", "google_scholar": "AscakBgAAAAJ;;;LE3ctn0AAAAJ;lVD0CNEAAAAJ;https://scholar.google.com.tw/citations?user=2qDh4WUAAAAJ", "orcid": ";;0000-0001-8519-9482;0000-0001-7774-8197;0000-0002-7891-2128;", "linkedin": ";;;tianlong-chen-783862167/;aiproteins/;", "or_profile": "~Chengyue_Gong1;~Adam_Klivans1;~James_Madigan_Loy1;~Tianlong_Chen1;~Daniel_Jesus_Diaz1;~Qiang_Liu1", "aff": "University of Texas at Austin;University of Texas, Austin;;Harvard University;University of Texas at Austin;University of Texas, Austin", "aff_domain": "cs.utexas.edu;cs.utexas.edu;;harvard.edu;utexas.edu;utexas.edu", "position": "grad student;Professor;;Postdoc;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\ngong2024evolutioninspired,\ntitle={Evolution-Inspired Loss Functions for Protein Representation Learning},\nauthor={Chengyue Gong and Adam Klivans and James Madigan Loy and Tianlong Chen and qiang liu and Daniel Jesus Diaz},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=y5L8W0KRUX}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3227222, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10583231952638937916&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "cs.utexas.edu;cs.utexas.edu;;harvard.edu;utexas.edu;utexas.edu", "author_num": 6, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Texas at Austin;Harvard University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.harvard.edu", "aff_unique_abbr": "UT Austin;Harvard", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Roping in Uncertainty: Robustness and Regularization in Markov Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32681", "id": "y6y2HauOpR", "proceeding": "https://proceedings.mlr.press/v235/mcmahan24a.html", "pdf": "https://openreview.net/pdf?id=y6y2HauOpR", "openreview": "https://openreview.net/forum?id=y6y2HauOpR", "author_site": "Jeremy McMahan, Giovanni Artiglio, Qiaomin Xie", "tldr": "", "abstract": "We study robust Markov games (RMG) with $s$-rectangular uncertainty. We show a general equivalence between computing a robust Nash equilibrium (RNE) of a $s$-rectangular RMG and computing a Nash equilibrium (NE) of an appropriately constructed regularized MG. The equivalence result yields a planning algorithm for solving $s$-rectangular RMGs, as well as provable robustness guarantees for policies computed using regularized methods. However, we show that even for just reward-uncertain two-player zero-sum matrix games, computing an RNE is PPAD-hard. 
Consequently, we derive a special uncertainty structure called efficient player-decomposability and show that RNE for two-player zero-sum RMG in this class can be provably solved in polynomial time. This class includes commonly used uncertainty sets such as $L_1$ and $L_\\infty$ ball uncertainty sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeremy McMahan;Giovanni Artiglio;Qiaomin Xie", "authorids": "~Jeremy_McMahan1;artiglio@wisc.edu;~Qiaomin_Xie1", "gender": "M;;F", "homepage": "http://jeremymmcmahan.com;;https://qiaominxie.github.io/", "dblp": "299/1330;;37/10269", "google_scholar": "Ujya6FIAAAAJ;;RVNcy4EAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jeremy_McMahan1;artiglio@wisc.edu;~Qiaomin_Xie1", "aff": "University of Wisconsin - Madison;;University of Wisconsin - Madison", "aff_domain": "wisc.edu;;wisc.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nmcmahan2024roping,\ntitle={Roping in Uncertainty: Robustness and Regularization in Markov Games},\nauthor={Jeremy McMahan and Giovanni Artiglio and Qiaomin Xie},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=y6y2HauOpR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 532845, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9017559329799951737&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "wisc.edu;;wisc.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neural Collapse in Multi-label Learning with Pick-all-label Loss", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32680", "id": "y8NevOhrnW", "proceeding": "https://proceedings.mlr.press/v235/li24ai.html", "pdf": "https://openreview.net/pdf?id=y8NevOhrnW", "openreview": "https://openreview.net/forum?id=y8NevOhrnW", "author_site": "Pengyu Li, Xiao Li, Yutong Wang, Qing Qu", "tldr": "", "abstract": "We study deep neural networks for the multi-label classification (MLab) task through the lens of neural collapse (NC). Previous works have been restricted to the multi-class classification setting and discovered a prevalent NC phenomenon comprising the following properties for the last-layer features: (i) the variability of features within every class collapses to zero, (ii) the set of feature means form an equi-angular tight frame (ETF), and (iii) the last layer classifiers collapse to the feature mean upon some scaling. We generalize the study to multi-label learning, and prove for the first time that a generalized NC phenomenon holds with the \"pick-all-label'' formulation, which we term as MLab NC. While the ETF geometry remains consistent for features with a single label, multi-label scenarios introduce a unique combinatorial aspect we term the \"tag-wise average\" property, where the means of features with multiple labels are the scaled averages of means for single-label instances. Theoretically, under proper assumptions on the features, we establish that the only global optimizer of the pick-all-label cross-entropy loss satisfies the multi-label NC. 
In practice, we demonstrate that our findings can lead to better test performance with more efficient training techniques for MLab learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengyu Li;Xiao Li;Yutong Wang;Qing Qu", "authorids": "~Pengyu_Li6;~Xiao_Li8;~Yutong_Wang1;~Qing_Qu2", "gender": "M;;M;M", "homepage": "https://scholar.google.com/citations?user=QKNif2sAAAAJ&hl=en;https://heimine.github.io/;https://yutongwang.me/;https://qingqu.engin.umich.edu/", "dblp": ";66/2069-26.html;90/3631;127/6874-1", "google_scholar": ";aAX0au8AAAAJ;GH7ryE4AAAAJ;JfblW3MAAAAJ", "orcid": ";;0000-0001-7472-6750;0000-0001-9136-558X", "linkedin": ";;;qing-q-1a0b9746/", "or_profile": "~Pengyu_Li6;~Xiao_Li8;~Yutong_Wang1;~Qing_Qu2", "aff": "University of Michigan - Ann Arbor;University of Michigan;University of Michigan - Ann Arbor;University of Michigan", "aff_domain": "umich.edu;umich.edu;umich.edu;umich.edu", "position": "PhD student;PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nli2024neural,\ntitle={Neural Collapse in Multi-label Learning with Pick-all-label Loss},\nauthor={Pengyu Li and Xiao Li and Yutong Wang and Qing Qu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=y8NevOhrnW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3612497, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15538795410374697025&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "umich.edu;umich.edu;umich.edu;umich.edu", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Implicit Bias of Adam", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32679", "id": "y8YovS0lOg", "proceeding": "https://proceedings.mlr.press/v235/cattaneo24a.html", "pdf": "https://openreview.net/pdf?id=y8YovS0lOg", "openreview": "https://openreview.net/forum?id=y8YovS0lOg", "author_site": "Matias Cattaneo, Jason Klusowski, Boris Shigida", "tldr": "", "abstract": "In previous literature, backward error analysis was used to find ordinary differential equations (ODEs) approximating the gradient descent trajectory. It was found that finite step sizes implicitly regularize solutions because terms appearing in the ODEs penalize the two-norm of the loss gradients. We prove that the existence of similar implicit regularization in RMSProp and Adam depends on their hyperparameters and the training stage, but with a different \"norm\" involved: the corresponding ODE terms either penalize the (perturbed) one-norm of the loss gradients or, conversely, impede its reduction (the latter case being typical). We also conduct numerical experiments and discuss how the proven facts can influence generalization.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matias D. 
Cattaneo;Jason Matthew Klusowski;Boris Shigida", "authorids": "~Matias_D._Cattaneo1;~Jason_Matthew_Klusowski1;~Boris_Shigida1", "gender": "M;M;M", "homepage": "https://klusowski.princeton.edu/;;https://cattaneo.princeton.edu/", "dblp": ";356/2718.html;", "google_scholar": "4HkhCjsAAAAJ;KImeZo8AAAAJ;TNdmZVEAAAAJ", "orcid": "0000-0001-6484-8682;;0000-0003-0493-7506", "linkedin": "jklusowski/;boris-shigida/;matias-d-cattaneo/", "or_profile": "~Jason_Matthew_Klusowski1;~Boris_Shigida1;~Matias_Cattaneo1", "aff": "Princeton University;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu", "position": "Assistant Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\ncattaneo2024on,\ntitle={On the Implicit Bias of Adam},\nauthor={Matias D. Cattaneo and Jason Matthew Klusowski and Boris Shigida},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=y8YovS0lOg}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1802763, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17420330623754347191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "email": "princeton.edu;princeton.edu;princeton.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Variational Partial Group Convolutions for Input-Aware Partial Equivariance of Rotations and Color-Shifts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32678", "id": "yDXnXJE1RK", "proceeding": "https://proceedings.mlr.press/v235/kim24q.html", "pdf": "https://openreview.net/pdf?id=yDXnXJE1RK", "openreview": "https://openreview.net/forum?id=yDXnXJE1RK", "author_site": "Hyunsu Kim, Ye Gon Kim, Hongseok Yang, Juho Lee", "tldr": "", "abstract": "Group Equivariant CNNs (G-CNNs) have shown promising efficacy in various tasks, owing to their ability to capture hierarchical features in an equivariant manner. However, their equivariance is fixed to the symmetry of the whole group, limiting adaptability to diverse partial symmetries in real-world datasets, such as limited rotation symmetry of handwritten digit images and limited color-shift symmetry of flower images. Recent efforts address this limitation, one example being Partial G-CNN which restricts the output group space of convolution layers to break full equivariance. However, such an approach still fails to adjust equivariance levels across data. In this paper, we propose a novel approach, Variational Partial G-CNN (VP G-CNN), to capture varying levels of partial equivariance specific to each data instance. VP G-CNN redesigns the distribution of the output group elements to be conditioned on input data, leveraging variational inference to avoid overfitting. This enables the model to adjust its equivariance levels according to the needs of individual data points. Additionally, we address training instability inherent in discrete group equivariance models by redesigning the reparametrizable distribution. We demonstrate the effectiveness of VP G-CNN on both toy and real-world datasets, including MNIST67-180, CIFAR10, ColorMNIST, and Flowers102. 
Our results show robust performance, even in uncertainty metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyunsu Kim;Yegon Kim;Hongseok Yang;Juho Lee", "authorids": "~Hyunsu_Kim2;~Yegon_Kim1;~Hongseok_Yang2;~Juho_Lee2", "gender": "M;M;M;M", "homepage": "https://kim-hyunsu.github.io/;https://yegonkim.github.io/;https://juho.lee.github.io;https://sites.google.com/view/hongseokyang/home", "dblp": ";;55/3410-1;82/5808", "google_scholar": ";;Py4URJUAAAAJ;cLuwH14AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Hyunsu_Kim2;~Yegon_Kim1;~Juho_Lee2;~Hongseok_Yang1", "aff": "Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "PhD student;MS student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nkim2024variational,\ntitle={Variational Partial Group Convolutions for Input-Aware Partial Equivariance of Rotations and Color-Shifts},\nauthor={Hyunsu Kim and Yegon Kim and Hongseok Yang and Juho Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yDXnXJE1RK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7975796, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:njoOR2-tOKcJ:scholar.google.com/&scioq=Variational+Partial+Group+Convolutions+for+Input-Aware+Partial+Equivariance+of+Rotations+and+Color-Shifts&hl=en&as_sdt=0,44", "gs_version_total": 7, "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Domain Generalisation via Imprecise Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32677", "id": "yFUdZfbEme", "proceeding": "https://proceedings.mlr.press/v235/singh24a.html", "pdf": "https://openreview.net/pdf?id=yFUdZfbEme", "openreview": "https://openreview.net/forum?id=yFUdZfbEme", "author_site": "Anurag Singh, Siu Lun Chau, Shahine Bouabid, Krikamol Muandet", "tldr": "", "abstract": "Out-of-distribution (OOD) generalisation is challenging because it involves not only learning from empirical data, but also deciding among various notions of generalisation, e.g. optimise based on the average-case risk, worst-case risk, or interpolations thereof. While this decision should in principle be made by the model operator, such as a medical doctor in practice, this information might not always be available at training time. This deployment uncertainty leads machine learners to make arbitrary commitments to specific generalisation strategies. We introduce the Imprecise Domain Generalisation framework to mitigate this, featuring an imprecise risk optimisation that allows learners to stay imprecise by optimising against a continuous spectrum of generalisation strategies during training, and a model framework that allows operators to specify their generalisation preference at deployment.
Our work, supported by theoretical and empirical evidence, showcases the benefits of integrating imprecision into domain generalisation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anurag Singh;Siu Lun Chau;Shahine Bouabid;Krikamol Muandet", "authorids": "~Anurag_Singh3;~Siu_Lun_Chau1;~Shahine_Bouabid1;~Krikamol_Muandet1", "gender": "M;M;M;M", "homepage": "https://anurag14.github.io/;https://chau999.github.io/;http://krikamol.org;https://shahineb.github.io/", "dblp": ";264/9823;34/1240;260/0855.html", "google_scholar": ";e7ZBlIsAAAAJ;E2z5uYsAAAAJ;bK-xIMcAAAAJ", "orcid": ";;0000-0002-4182-5282;", "linkedin": "anuragsinghnsit/;;krikamol-muandet/;", "or_profile": "~Anurag_Singh3;~Siu_Lun_Chau1;~Krikamol_Muandet1;~Shahine_Abderrahim_Bouabid1", "aff": "CISPA, saarland university, saarland informatics campus;CISPA \u2013 Helmholtz Center for Information Security;CISPA Helmholtz Center for Information Security;University of Oxford", "aff_domain": "cispa.saarland;cispa.de;cispa.saarland;ox.ac.uk", "position": "PhD student;Postdoc;Associate Professor;PhD student", "bibtex": "@inproceedings{\nsingh2024domain,\ntitle={Domain Generalisation via Imprecise Learning},\nauthor={Anurag Singh and Siu Lun Chau and Shahine Bouabid and Krikamol Muandet},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yFUdZfbEme}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1408418, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16661514705031628830&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "email": "cispa.saarland;cispa.de;cispa.saarland;ox.ac.uk", "author_num": 4, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Saarland University;Helmholtz Center for Information Security;CISPA Helmholtz Center for Information Security;University of Oxford", "aff_unique_dep": "CISPA;;;", "aff_unique_url": "https://www.uni-saarland.de;https://www.cispa.de/;https://www.cispa.de/;https://www.ox.ac.uk", "aff_unique_abbr": "Saarland U;CISPA;CISPA;Oxford", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarland Informatics Campus;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Provable Benefits of Local Steps in Heterogeneous Federated Learning for Neural Networks: A Feature Learning Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32676", "id": "yHRxnhKyEJ", "proceeding": "https://proceedings.mlr.press/v235/bao24a.html", "pdf": "https://openreview.net/pdf?id=yHRxnhKyEJ", "openreview": "https://openreview.net/forum?id=yHRxnhKyEJ", "author_site": "Yajie Bao, Michael Crawshaw, Mingrui Liu", "tldr": "", "abstract": "Local steps are crucial for Federated Learning (FL) algorithms and have witnessed great empirical success in reducing communication costs and improving the generalization performance of deep neural networks. However, there are limited studies on the effect of local steps on heterogeneous FL. A few works investigate this problem from the optimization perspective. Woodworth et al. (2020a) showed that the iteration complexity of Local SGD, the most popular FL algorithm, is dominated by the baseline mini-batch SGD, which does not show the benefits of local steps. In addition, Levy (2023) proposed a new local update method that provably benefits over mini-batch SGD. 
However, there is still no work analyzing the effect of local steps on generalization in the heterogeneous FL setting. Motivated by our experimental findings where Local SGD learns more distinguishing features than parallel SGD, this paper studies the generalization benefits of local steps from a feature learning perspective. We propose a novel federated data model that exhibits a new form of data heterogeneity, under which we show that a convolutional neural network (CNN) trained by GD with *global* updates will miss some pattern-related features, while the network trained by GD with *local* updates can learn all features in polynomial time. Consequently, local steps help the CNN generalize better in our data model. In a different parameter setting, we also prove that Local GD with *one-shot* model averaging can learn all features and generalize well in all clients. Our experimental results also confirm the benefits of local steps in improving test accuracy on real-world data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yajie Bao;Michael Crawshaw;Mingrui Liu", "authorids": "~Yajie_Bao2;~Michael_Crawshaw1;~Mingrui_Liu2", "gender": "M;M;", "homepage": "https://yajiebao.github.io/;;https://mingrliu.github.io", "dblp": "254/8290;274/7164;", "google_scholar": "1n_aUsIAAAAJ;XVrMZ_4AAAAJ;KFoEnFQAAAAJ", "orcid": "0000-0003-3843-7016;;", "linkedin": ";;mingrui-liu-447a2aab/", "or_profile": "~Yajie_Bao2;~Michael_Crawshaw1;~Mingrui_Liu2", "aff": "Shanghai Jiaotong University;George Mason University;George Mason University", "aff_domain": "sjtu.edu.cn;gmu.edu;gmu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbao2024provable,\ntitle={Provable Benefits of Local Steps in Heterogeneous Federated Learning for Neural Networks: A Feature Learning Perspective},\nauthor={Yajie Bao and Michael Crawshaw and Mingrui Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yHRxnhKyEJ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 854027, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5526518297257808677&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "sjtu.edu.cn;gmu.edu;gmu.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Shanghai Jiao Tong University;George Mason University", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.gmu.edu", "aff_unique_abbr": "SJTU;GMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Performative Prediction with Bandit Feedback: Learning through Reparameterization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32675", "id": "yHs3jIPgaF", "proceeding": "https://proceedings.mlr.press/v235/chen24al.html", "pdf": "https://openreview.net/pdf?id=yHs3jIPgaF", "openreview": "https://openreview.net/forum?id=yHs3jIPgaF", "author_site": "Yatong Chen, Wei Tang, Chien-Ju Ho, Yang Liu", "tldr": "", "abstract": "Performative prediction, as introduced by Perdomo et al., is a framework for studying social prediction in which the data distribution itself changes in response to the deployment of a model.
Existing work in this field usually hinges on three assumptions that are easily violated in practice: that the performative risk is convex over the deployed model, that the mapping from the model to the data distribution is known to the model designer in advance, and that first-order information of the performative risk is available. In this paper, we initiate the study of performative prediction problems that do not require these assumptions. Specifically, we develop a parameterization framework that parameterizes the performative prediction objective as a function of the induced data distribution. We also develop a two-level zeroth-order optimization procedure, where the first level performs iterative optimization on the distribution parameter space, and the second level learns the model that induces a particular target distribution parameter at each iteration. Under mild conditions, this reparameterization allows us to transform the non-convex objective into a convex one and achieve provable regret guarantees. In particular, we provide a regret bound that is sublinear in the total number of performative samples taken and is only polynomial in the dimension of the model parameter.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yatong Chen;Wei Tang;Chien-Ju Ho;Yang Liu", "authorids": "~Yatong_Chen1;~Wei_Tang1;~Chien-Ju_Ho1;~Yang_Liu3", "gender": "F;M;M;M", "homepage": "https://yatongchen.github.io/;https://wtang.org/;http://chienjuho.com;http://www.yliuu.com", "dblp": "202/8466;;85/4929;51/3710-18", "google_scholar": "yoExm_UAAAAJ;;https://scholar.google.com.tw/citations?user=DWKoeW0AAAAJ;jKrIVCIAAAAJ", "orcid": ";;;0000-0001-8420-6011", "linkedin": ";;;", "or_profile": "~Yatong_Chen1;~Wei_Tang1;~Chien-Ju_Ho1;~Yang_Liu3", "aff": "University of California, Santa Cruz;Columbia University;Washington University in St. Louis;University of California, Santa Cruz", "aff_domain": "ucsc.edu;columbia.edu;wustl.edu;ucsc.edu", "position": "PhD student;Postdoc;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024performative,\ntitle={Performative Prediction with Bandit Feedback: Learning through Reparameterization},\nauthor={Yatong Chen and Wei Tang and Chien-Ju Ho and Yang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yHs3jIPgaF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 861092, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16584755346959730942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ucsc.edu;columbia.edu;wustl.edu;ucsc.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Santa Cruz;Columbia University;Washington University in St. Louis", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsc.edu;https://www.columbia.edu;https://wustl.edu", "aff_unique_abbr": "UCSC;Columbia;WashU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Santa Cruz;;St. Louis",
Louis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Towards Efficient Spiking Transformer: a Token Sparsification Framework for Training and Inference Acceleration", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32674", "id": "yL6hljtjW4", "proceeding": "https://proceedings.mlr.press/v235/zhuge24b.html", "pdf": "https://openreview.net/pdf?id=yL6hljtjW4", "openreview": "https://openreview.net/forum?id=yL6hljtjW4", "author_site": "Zhengyang Zhuge, Peisong Wang, Xingting Yao, Jian Cheng", "tldr": "", "abstract": "Nowadays Spiking Transformers have exhibited remarkable performance close to Artificial Neural Networks (ANNs), while enjoying the inherent energy-efficiency of Spiking Neural Networks (SNNs). However, training Spiking Transformers on GPUs is considerably more time-consuming compared to the ANN counterparts, despite the energy-efficient inference through neuromorphic computation. In this paper, we investigate the token sparsification technique for efficient training of Spiking Transformer and find conventional methods suffer from noticeable performance degradation. We analyze the issue and propose our Sparsification with Timestep-wise Anchor Token and dual Alignments (STATA). Timestep-wise Anchor Token enables precise identification of important tokens across timesteps based on standardized criteria. Additionally, dual Alignments incorporate both Intra and Inter Alignment of the attention maps, fostering the learning of inferior attention. Extensive experiments show the effectiveness of STATA thoroughly, which demonstrates up to $\\sim$1.53$\\times$ training speedup and $\\sim$48% energy reduction with comparable performance on various datasets and architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhengyang Zhuge;Peisong Wang;Xingting Yao;Jian Cheng", "authorids": "~Zhengyang_Zhuge1;~Peisong_Wang1;~Xingting_Yao1;~Jian_Cheng7", "gender": "M;M;M;M", "homepage": ";;https://github.com/Ikarosy;https://people.ucas.ac.cn/~chengjian?language=en", "dblp": "333/0529.html;187/5474;332/0012;14/6145-1", "google_scholar": ";UYFZpk4AAAAJ;;ZGCIUJ8AAAAJ", "orcid": ";;;0000-0003-1289-2758", "linkedin": "https://www.linkedin.cn/injobs/in/%E6%AD%A3-%E8%91%9B-489259176;;;", "or_profile": "~Zhengyang_Zhuge1;~Peisong_Wang1;~Xingting_Yao1;~Jian_Cheng7", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation of\uff0cChinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences;Institute of Automation, Chinese Academy of Sciences", "aff_domain": "ia.ac.cn;ia.ac.cn;ia.ac.cn;ia.ac.cn", "position": "PhD student;Associate Professor;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhuge2024towards,\ntitle={Towards Efficient Spiking Transformer: a Token Sparsification Framework for Training and Inference Acceleration},\nauthor={Zhengyang Zhuge and Peisong Wang and Xingting Yao and Jian Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yL6hljtjW4}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1427429, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:whR9GmcoAkQJ:scholar.google.com/&scioq=Towards+Efficient+Spiking+Transformer:+a+Token+Sparsification+Framework+for+Training+and+Inference+Acceleration&hl=en&as_sdt=0,5", "gs_version_total": 
5, "email": "ia.ac.cn;ia.ac.cn;ia.ac.cn;ia.ac.cn", "author_num": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation", "aff_unique_url": "http://www.ia.cas.cn", "aff_unique_abbr": "CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "RODEO: Robust Outlier Detection via Exposing Adaptive Out-of-Distribution Samples", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32673", "id": "yOe5lqDPvM", "proceeding": "https://proceedings.mlr.press/v235/mirzaei24a.html", "pdf": "https://openreview.net/pdf?id=yOe5lqDPvM", "openreview": "https://openreview.net/forum?id=yOe5lqDPvM", "author_site": "Hossein Mirzaei, Mohammad Jafari Varnousfaderani, Hamid Reza Dehbashi, Ali Ansari, Sepehr Ghobadi, Masoud Hadi, Arshia Soltani Moakhar, Mohammad Azizmalayeri, Mahdieh Soleymani Baghshah, Mohammad H Rohban", "tldr": "", "abstract": "In recent years, there have been significant improvements in various forms of image outlier detection. However, outlier detection performance under adversarial settings lags far behind that in standard settings. This is due to the lack of effective exposure to adversarial scenarios during training, especially on unseen outliers, leading detection models failing to learn robust features. To bridge this gap, we introduce RODEO, a data-centric approach that generates effective outliers for robust outlier detection. More specifically, we show that incorporating outlier exposure (OE) and adversarial training could be an effective strategy for this purpose, as long as the exposed training outliers meet certain characteristics, including diversity, and both conceptual differentiability and analogy to the inlier samples. We leverage a text-to-image model to achieve this goal. We demonstrate both quantitatively and qualitatively that our adaptive OE method effectively generates ''diverse'' and ''near-distribution'' outliers, leveraging information from both text and image domains. 
Moreover, our experimental results show that utilizing our synthesized outliers significantly enhances the performance of the outlier detector, particularly in adversarial settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hossein Mirzaei;Mohammad Jafari;Hamid Reza Dehbashi;Ali Ansari;Sepehr Ghobadi;Masoud Hadi;Arshia Soltani Moakhar;Mohammad Azizmalayeri;Mahdieh Soleymani Baghshah;Mohammad Hossein Rohban", "authorids": "~Hossein_Mirzaei1;~Mohammad_Jafari1;~Hamid_Reza_Dehbashi1;~Ali_Ansari2;~Sepehr_Ghobadi1;~Masoud_Hadi1;~Arshia_Soltani_Moakhar1;~Mohammad_Azizmalayeri1;~Mahdieh_Soleymani_Baghshah1;~Mohammad_Hossein_Rohban1", "gender": "M;M;M;M;;M;;;;M", "homepage": ";https://mohammadjafari80.github.io/;https://github.com/hamiddeboo8;https://allliance.github.io;;https://ex3ploiter.github.io/;;;;http://sharif.edu/~rohban/", "dblp": ";;;200/9876-1;;;;;;43/8108", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;WEMgPWgAAAAJ;WYxYPXUAAAAJ;;https://scholar.google.com/citations?hl=en;;;;pRyJ6FkAAAAJ", "orcid": ";;;0000-0002-9798-6966;;;;;;", "linkedin": "hossein-mirzaei-6bb2301aa;mohammadjafari01/;;https://linkedin.com/in/ali-ansari-195999213;;masoud-hadi-a382601b4/;;;;", "or_profile": "~Hossein_Mirzaei1;~Mohammad_Jafari1;~Hamid_Reza_Dehbashi1;~Ali_Ansari2;~Sepehr_Ghobadi1;~Masoud_Hadi1;~Arshia_Soltani_Moakhar1;~Mohammad_Azizmalayeri1;~Mahdieh_Soleymani_Baghshah1;~Mohammad_Hossein_Rohban1", "aff": "EPFL - EPF Lausanne;Sharif University of Technology;Sharif University of Technology, Sharif University of Technology;Sharif University of Technology;;Isfahan University of Technology;;;;Sharif University of Technology", "aff_domain": "epfl.ch;sharif.edu;ce.sharif.edu;sharif.edu;;iut.ac.ir;;;;sharif.edu", "position": "PhD student;Undergrad student;Undergrad student;Undergrad student;;Undergrad student;;;;Associate Professor", "bibtex": "@inproceedings{\nmirzaei2024rodeo,\ntitle={{RODEO}: Robust Outlier Detection via Exposing Adaptive Out-of-Distribution Samples},\nauthor={Hossein Mirzaei and Mohammad Jafari and Hamid Reza Dehbashi and Ali Ansari and Sepehr Ghobadi and Masoud Hadi and Arshia Soltani Moakhar and Mohammad Azizmalayeri and Mahdieh Soleymani Baghshah and Mohammad Hossein Rohban},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yOe5lqDPvM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5500624, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 10, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3070586828241472955&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "email": "epfl.ch;sharif.edu;ce.sharif.edu;sharif.edu;;iut.ac.ir;;;;sharif.edu", "author_num": 10, "aff_unique_index": "0;1;1;1;2;1", "aff_unique_norm": "EPFL;Sharif University of Technology;Isfahan University of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.sharif.edu;https://www.iut.ac.ir", "aff_unique_abbr": "EPFL;SUT;IUT", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Switzerland;Iran" }, { "title": "ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane Reflections", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32672", "id": "yPDTXQwUPy", "proceeding": "https://proceedings.mlr.press/v235/bini24a.html", "pdf": 
"https://openreview.net/pdf?id=yPDTXQwUPy", "openreview": "https://openreview.net/forum?id=yPDTXQwUPy", "author_site": "Massimo Bini, Karsten Roth, Zeynep Akata, Anna Khoreva", "tldr": "", "abstract": "Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt foundation models to downstream task requirements while retaining their generalization ability. However, the amount of additionally introduced parameters and compute for successful adaptation and hyperparameter searches can explode quickly, especially when deployed at scale to serve numerous individual requests. To ensure effective, parameter-efficient, and hyperparameter-robust adaptation, we propose the *ETHER* transformation family, which performs Efficient fineTuning via HypErplane Reflections. By design, *ETHER* transformations require *a minimal number of parameters*, are *less likely to deteriorate model performance*, and exhibit *robustness to hyperparameter and learning rate choices*. In particular, we introduce *ETHER* and its relaxation *ETHER+*, which match or outperform existing PEFT methods with significantly fewer parameters ($\\sim$$10$-$100$ times lower than LoRA or OFT) across multiple image synthesis and natural language tasks without *exhaustive hyperparameter tuning*. Finally, we investigate the recent emphasis on Hyperspherical Energy retention for adaptation and raise questions on its practical utility. The code is available at https://github.com/mwbini/ether.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Massimo Bini;Karsten Roth;Zeynep Akata;Anna Khoreva", "authorids": "~Massimo_Bini1;~Karsten_Roth1;~Zeynep_Akata1;~Anna_Khoreva1", "gender": ";Not Specified;F;F", "homepage": ";https://karroth.com/;https://eml-unitue.de/people/zeynep-akata;", "dblp": ";234/7803;117/4838;152/5005", "google_scholar": ";93ZjIs0AAAAJ;jQl9RtkAAAAJ;https://scholar.google.de/citations?user=ILgZT7MAAAAJ", "orcid": ";;0000-0002-1432-7747;", "linkedin": ";;zeynep-akata-36182045/?ppe=1;", "or_profile": "~Massimo_Bini1;~Karsten_Roth1;~Zeynep_Akata1;~Anna_Khoreva1", "aff": ";University of Tuebingen;Helmholtz Munich;Bosch Center for Artificial Intelligence", "aff_domain": ";uni-tuebingen.de;helmholtz-munich.de;bosch.com", "position": ";PhD student;Researcher;Research Group Leader", "bibtex": "@inproceedings{\nbini2024ether,\ntitle={{ETHER}: Efficient Finetuning of Large-Scale Models with Hyperplane Reflections},\nauthor={Massimo Bini and Karsten Roth and Zeynep Akata and Anna Khoreva},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yPDTXQwUPy}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7459171, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12146258525078881955&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": ";uni-tuebingen.de;helmholtz-munich.de;bosch.com", "author_num": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Tuebingen;Helmholtz Zentrum M\u00fcnchen;Bosch Center for Artificial Intelligence", "aff_unique_dep": ";;Center for Artificial Intelligence", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.helmholtz-muenchen.de;https://www.bosch-ai.com", "aff_unique_abbr": "Uni T\u00fcbingen;HMGU;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "High-Dimensional 
Geometric Streaming for Nearly Low Rank Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32671", "id": "yQfA0etfB7", "proceeding": "https://proceedings.mlr.press/v235/esfandiari24a.html", "pdf": "https://openreview.net/pdf?id=yQfA0etfB7", "openreview": "https://openreview.net/forum?id=yQfA0etfB7", "author_site": "Hossein Esfandiari, Praneeth Kacham, Vahab Mirrokni, David Woodruff, Peilin Zhong", "tldr": "", "abstract": "We study streaming algorithms for the $\\ell_p$ subspace approximation problem. Given points $a_1, \\ldots, a_n$ as an insertion-only stream and a rank parameter $k$, the $\\ell_p$ subspace approximation problem is to find a $k$-dimensional subspace $V$ such that $(\\sum_{i=1}^n d(a_i, V)^p)^{1/p}$ is minimized, where $d(a, V)$ denotes the Euclidean distance between $a$ and $V$ defined as $\\min_{v \\in V} ||a - v||$. When $p = \\infty$, we need to find a subspace $V$ that minimizes $\\max_i d(a_i, V)$. For $\\ell_{\\infty}$ subspace approximation, we give a deterministic strong coreset construction algorithm and show that it can be used to compute a $\\mathrm{poly}(k, \\log n)$ approximate solution. We show that the distortion obtained by our coreset is nearly tight for any sublinear space algorithm. For $\\ell_p$ subspace approximation, we show that suitably scaling the points and then using our $\\ell_{\\infty}$ coreset construction, we can compute a $\\mathrm{poly}(k, \\log n)$ approximation. Our algorithms are easy to implement and run very fast on large datasets. We also use our strong coreset construction to improve the results in a recent work of Woodruff and Yasuda (FOCS 2022) which gives streaming algorithms for high-dimensional geometric problems such as width estimation, convex hull estimation, and volume estimation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hossein Esfandiari;Praneeth Kacham;Vahab Mirrokni;David Woodruff;Peilin Zhong", "authorids": "~Hossein_Esfandiari1;~Praneeth_Kacham1;~Vahab_Mirrokni2;~David_Woodruff1;~Peilin_Zhong1", "gender": ";M;M;M;M", "homepage": "https://sites.google.com/corp/view/hossein-esfandiari;https://www.praneethkacham.com;https://people.csail.mit.edu/mirrokni/Welcome.html;http://www.cs.cmu.edu/~dwoodruf/;http://www.cs.columbia.edu/~peilin/", "dblp": "146/7746;255/5684;m/VahabSMirrokni;w/DPWoodruff;148/9632", "google_scholar": "Rt8ppJsAAAAJ;hKhPmTkAAAAJ;opbZfw0AAAAJ;https://scholar.google.com.tw/citations?user=0G2t-6sAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0000-0001-8130-6631;;;;", "linkedin": "hossein-esfandiari-10bb0281;;;;", "or_profile": "~Hossein_Esfandiari1;~Praneeth_Kacham1;~Vahab_Mirrokni2;~David_Woodruff1;~Peilin_Zhong1", "aff": "Google;Carnegie Mellon University;Google Research;Carnegie Mellon University;Google", "aff_domain": "google.com;cmu.edu;google.com;cmu.edu;google.com", "position": "Researcher;PhD student;VP, Google Fellow;Full Professor;Researcher", "bibtex": "@inproceedings{\nesfandiari2024highdimensional,\ntitle={High-Dimensional Geometric Streaming for Nearly Low Rank Data},\nauthor={Hossein Esfandiari and Praneeth Kacham and Vahab Mirrokni and David Woodruff and Peilin Zhong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yQfA0etfB7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1137379, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:yBCFaXBOuMIJ:scholar.google.com/&scioq=High-Dimensional+Geometric+Streaming+for+Nearly+Low+Rank+Data&hl=en&as_sdt=0,33", "gs_version_total": 7, "email": "google.com;cmu.edu;google.com;cmu.edu;google.com", "author_num": 5, "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "Google;Carnegie Mellon University", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.cmu.edu", "aff_unique_abbr": "Google;CMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "${\\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32670", "id": "yShA4VPYZB", "proceeding": "https://proceedings.mlr.press/v235/chen24az.html", "pdf": "https://openreview.net/pdf?id=yShA4VPYZB", "openreview": "https://openreview.net/forum?id=yShA4VPYZB", "author_site": "Dingyang Chen, Qi Zhang", "tldr": "", "abstract": "Identification and analysis of symmetrical patterns in the natural world have led to significant discoveries across various scientific fields, such as the formulation of gravitational laws in physics and advancements in the study of chemical structures. In this paper, we focus on exploiting Euclidean symmetries inherent in certain cooperative multi-agent reinforcement learning (MARL) problems and prevalent in many applications. We begin by formally characterizing a subclass of Markov games with a general notion of symmetries that admits the existence of symmetric optimal values and policies. Motivated by these properties, we design neural network architectures with symmetric constraints embedded as an inductive bias for multi-agent actor-critic methods. 
This inductive bias results in superior performance in various cooperative MARL benchmarks and impressive generalization capabilities such as zero-shot learning and transfer learning in unseen scenarios with repeated symmetric patterns.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dingyang Chen;Qi Zhang", "authorids": "~Dingyang_Chen1;~Qi_Zhang12", "gender": "M;M", "homepage": "https://dchen48.github.io/;https://qizhg.github.io/", "dblp": "212/7542-1.html;https://dblp.uni-trier.de/pers/hd/z/Zhang_0038:Qi", "google_scholar": "vSdOGREAAAAJ;wJNQVS0AAAAJ", "orcid": ";", "linkedin": "dingyang-chen-97512712a/;", "or_profile": "~Dingyang_Chen1;~Qi_Zhang12", "aff": "University of South Carolina;University of South Carolina", "aff_domain": "sc.edu;sc.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nchen2024rm,\ntitle={\\$\\{{\\textbackslash}rm E\\}(3)\\$-Equivariant Actor-Critic Methods for Cooperative Multi-Agent Reinforcement Learning},\nauthor={Dingyang Chen and Qi Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yShA4VPYZB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 933921, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3839645531495840011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "sc.edu;sc.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of South Carolina", "aff_unique_dep": "", "aff_unique_url": "https://www.sc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Performance Bounds for Active Binary Testing with Information Maximization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32669", "id": "yTXv8KDD1P", "proceeding": "https://proceedings.mlr.press/v235/chattopadhyay24a.html", "pdf": "https://openreview.net/pdf?id=yTXv8KDD1P", "openreview": "https://openreview.net/forum?id=yTXv8KDD1P", "author_site": "Aditya Chattopadhyay, Benjamin Haeffele, Rene Vidal, Donald Geman", "tldr": "", "abstract": "In many applications like experimental design, group testing, and medical diagnosis, the state of a random variable $Y$ is revealed by successively observing the outcomes of binary tests about $Y$. New tests are selected adaptively based on the history of outcomes observed so far. If the number of states of $Y$ is finite, the process ends when $Y$ can be predicted with a desired level of confidence or all available tests have been used. Finding the strategy that minimizes the expected number of tests needed to predict $Y$ is virtually impossible in most real applications. Therefore, the commonly used strategy is the greedy heuristic of Information Maximization (InfoMax) that selects tests sequentially in order of information gain. Despite its widespread use, existing guarantees on its performance are often vacuous when compared to its empirical efficiency. In this paper, for the first time to the best of our knowledge, we establish tight non-vacuous bounds on InfoMax's performance. Our analysis is based on the assumption that at any iteration of the greedy strategy, there is always a binary test available whose conditional probability of being 'true', given the history, is within $\\delta$ units of one-half. 
This assumption is motivated by practical applications where the available set of tests often satisfies this property for modest values of $\\delta$, say, ${0.1 \\leq \\delta \\leq 0.4}$. Specifically, we analyze two distinct scenarios: (i) all tests are functions of $Y$, and (ii) test outcomes are corrupted by a binary symmetric channel. For both cases, our bounds guarantee the near-optimal performance of InfoMax for modest $\\delta$ values. In particular, the average number of tests needed to make accurate predictions is only a small multiplicative factor of the entropy of $Y$.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aditya Chattopadhyay;Benjamin David Haeffele;Rene Vidal;Donald Geman", "authorids": "~Aditya_Chattopadhyay1;~Benjamin_David_Haeffele1;~Rene_Vidal1;~Donald_Geman2", "gender": "M;;;M", "homepage": ";;http://www.vision.jhu.edu;http://www.cis.jhu.edu/people/faculty/geman/", "dblp": "207/8574;;v/ReneVidal;", "google_scholar": "aekzv1gAAAAJ;;https://scholar.google.com/citations?hl=en;", "orcid": ";;;", "linkedin": ";;rene-vidal-74844928/;", "or_profile": "~Aditya_Chattopadhyay1;~Benjamin_David_Haeffele1;~Rene_Vidal1;~Donald_Geman2", "aff": "Johns Hopkins University;;Amazon;Johns Hopkins University", "aff_domain": "jhu.edu;;amazon.com;jh.edu", "position": "PhD student;;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nchattopadhyay2024performance,\ntitle={Performance Bounds for Active Binary Testing with Information Maximization},\nauthor={Aditya Chattopadhyay and Benjamin David Haeffele and Rene Vidal and Donald Geman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yTXv8KDD1P}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4331543, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11160799959504172503&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "jhu.edu;;amazon.com;jh.edu", "author_num": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Johns Hopkins University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.jhu.edu;https://www.amazon.com", "aff_unique_abbr": "JHU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Memoria: Resolving Fateful Forgetting Problem through Human-Inspired Memory Architecture", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32668", "id": "yTz0u4B8ug", "proceeding": "https://proceedings.mlr.press/v235/park24a.html", "pdf": "https://openreview.net/pdf?id=yTz0u4B8ug", "openreview": "https://openreview.net/forum?id=yTz0u4B8ug", "author_site": "Sangjun Park, JinYeong Bak", "tldr": "", "abstract": "Making neural networks remember over the long term has been a longstanding issue. Although several external memory techniques have been introduced, most focus on retaining recent information in the short term. Regardless of its importance, information tends to be fatefully forgotten over time. We present Memoria, a memory system for artificial neural networks, drawing inspiration from humans and applying various neuroscientific and psychological theories. The experimental results prove the effectiveness of Memoria in the diverse tasks of sorting, language modeling, and classification, surpassing conventional techniques.
Engram analysis reveals that Memoria exhibits the primacy, recency, and temporal contiguity effects, which are characteristic of human memory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sangjun Park;JinYeong Bak", "authorids": "~Sangjun_Park1;~JinYeong_Bak2", "gender": "M;M", "homepage": "https://cosmoquester.github.io/;https://nosyu.kr", "dblp": ";22/11519", "google_scholar": "lb_8YyMAAAAJ;https://scholar.google.co.kr/citations?user=oYK9Z_IAAAAJ", "orcid": "0000-0002-1838-9259;0000-0002-3212-5241", "linkedin": "cosmoquester/;jybak/", "or_profile": "~Sangjun_Park1;~JinYeong_Bak2", "aff": "SungKyunKwan University;Sungkyunkwan University", "aff_domain": "skku.edu;skku.edu", "position": "Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\npark2024memoria,\ntitle={Memoria: Resolving Fateful Forgetting Problem through Human-Inspired Memory Architecture},\nauthor={Sangjun Park and JinYeong Bak},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yTz0u4B8ug}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 7149032, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2080593400384567263&as_sdt=5,24&sciodt=0,24&hl=en", "gs_version_total": 8, "email": "skku.edu;skku.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Sungkyunkwan University", "aff_unique_dep": "", "aff_unique_url": "https://www.skku.edu", "aff_unique_abbr": "SKKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Adapting Static Fairness to Sequential Decision-Making: Bias Mitigation Strategies towards Equal Long-term Benefit Rate", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32667", "id": "yUPBkPKzHw", "proceeding": "https://proceedings.mlr.press/v235/xu24g.html", "pdf": "https://openreview.net/pdf?id=yUPBkPKzHw", "openreview": "https://openreview.net/forum?id=yUPBkPKzHw", "author_site": "Yuancheng Xu, Chenghao Deng, Yanchao Sun, Ruijie Zheng, xiyao wang, Jieyu Zhao, Furong Huang", "tldr": "", "abstract": "Decisions made by machine learning models can have lasting impacts, making long-term fairness a critical consideration. It has been observed that ignoring the long-term effect and directly applying a fairness criterion in static settings can actually worsen bias over time. To address biases in sequential decision-making, we introduce a long-term fairness concept named Equal Long-term Benefit Rate (ELBERT). This concept is seamlessly integrated into a Markov Decision Process (MDP) to consider the future effects of actions on long-term fairness, thus providing a unified framework for fair sequential decision-making problems. ELBERT effectively addresses the temporal discrimination issues found in previous long-term fairness notions. Additionally, we demonstrate that the policy gradient of Long-term Benefit Rate can be analytically simplified to standard policy gradients. This simplification makes conventional policy optimization methods viable for reducing bias, leading to our bias mitigation approach ELBERT-PO. Extensive experiments across diverse sequential decision-making environments consistently reveal that ELBERT-PO significantly diminishes bias while maintaining high utility.
Code is available at https://github.com/umd-huang-lab/ELBERT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuancheng Xu;Chenghao Deng;Yanchao Sun;Ruijie Zheng;Xiyao Wang;Jieyu Zhao;Furong Huang", "authorids": "~Yuancheng_Xu1;~Chenghao_Deng1;~Yanchao_Sun1;~Ruijie_Zheng1;~Xiyao_Wang1;~Jieyu_Zhao1;~Furong_Huang1", "gender": "M;M;F;;M;F;F", "homepage": "https://yuancheng-xu.github.io;https://deng-chenghao.com;https://ycsun2017.github.io/home/index.html;http://www.ruijiezheng.com;;http://jyzhao.net/;https://furong-huang.com", "dblp": ";;132/6840;294/8474;;59/2379-1;72/8513", "google_scholar": "OPB0QgwAAAAJ;AcGw1hcAAAAJ;bloBY_QAAAAJ;;puVqfbwAAAAJ;9VaGBCQAAAAJ;13yyuCcAAAAJ", "orcid": ";;0000-0002-1137-9939;;;;", "linkedin": "yuancheng-xu/;chenghao-deng-326444182/;;;;;", "or_profile": "~Yuancheng_Xu1;~Chenghao_Deng1;~Yanchao_Sun1;~Ruijie_Zheng1;~Xiyao_Wang1;~Jieyu_Zhao1;~Furong_Huang1", "aff": "University of Maryland, College Park;University of Maryland, College Park;J.P. Morgan AI Research;University of Maryland, College Park;University of Maryland, College Park;University of Southern California;University of Maryland", "aff_domain": "umd.edu;umd.edu;jpmchase.com;cs.umd.edu;umd.edu;usc.edu;cs.umd.edu", "position": "PhD student;PhD student;Researcher;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024adapting,\ntitle={Adapting Static Fairness to Sequential Decision-Making: Bias Mitigation Strategies towards Equal Long-term Benefit Rate},\nauthor={Yuancheng Xu and Chenghao Deng and Yanchao Sun and Ruijie Zheng and Xiyao Wang and Jieyu Zhao and Furong Huang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yUPBkPKzHw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1880254, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4575208676138656987&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "umd.edu;umd.edu;jpmchase.com;cs.umd.edu;umd.edu;usc.edu;cs.umd.edu", "author_num": 7, "aff_unique_index": "0;0;1;0;0;2;0", "aff_unique_norm": "University of Maryland;J.P. Morgan;University of Southern California", "aff_unique_dep": ";AI Research;", "aff_unique_url": "https://www/umd.edu;https://www.jpmorgan.com;https://www.usc.edu", "aff_unique_abbr": "UMD;JPM;USC", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "College Park;;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "COLD-Attack: Jailbreaking LLMs with Stealthiness and Controllability", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32666", "id": "yUxdk32TU6", "proceeding": "https://proceedings.mlr.press/v235/guo24i.html", "pdf": "https://openreview.net/pdf?id=yUxdk32TU6", "openreview": "https://openreview.net/forum?id=yUxdk32TU6", "author_site": "Xingang Guo, Fangxu Yu, Huan Zhang, Lianhui Qin, Bin Hu", "tldr": "", "abstract": "Jailbreaks on large language models (LLMs) have recently received increasing attention. For a comprehensive assessment of LLM safety, it is essential to consider jailbreaks with diverse attributes, such as contextual coherence and sentiment/stylistic variations, and hence it is beneficial to study controllable jailbreaking, i.e. how to enforce control on LLM attacks. 
In this paper, we formally formulate the controllable attack generation problem, and build a novel connection between this problem and controllable text generation, a well-explored topic of natural language processing. Based on this connection, we adapt the Energy-based Constrained Decoding with Langevin Dynamics (COLD), a state-of-the-art, highly efficient algorithm in controllable text generation, and introduce the COLD-Attack framework which unifies and automates the search of adversarial LLM attacks under a variety of control requirements such as fluency, stealthiness, sentiment, and left-right-coherence. The controllability enabled by COLD-Attack leads to diverse new jailbreak scenarios which not only cover the standard setting of generating fluent (suffix) attacks with a continuation constraint, but also allow us to address new controllable attack settings such as revising a user query adversarially with a paraphrasing constraint, and inserting stealthy attacks in context with a position constraint. Our extensive experiments on various LLMs (Llama-2, Mistral, Vicuna, Guanaco, GPT-3.5, and GPT-4) show COLD-Attack's broad applicability, strong controllability, high success rate, and attack transferability. Our code is available at https://github.com/Yu-Fangxu/COLD-Attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingang Guo;Fangxu Yu;Huan Zhang;Lianhui Qin;Bin Hu", "authorids": "~Xingang_Guo1;~Fangxu_Yu1;~Huan_Zhang1;~Lianhui_Qin1;~Bin_Hu2", "gender": "M;M;M;F;M", "homepage": "https://sites.google.com/view/guoxingang;https://yu-fangxu.github.io/;http://huan-zhang.com;https://lianhui.ucsd.edu/;", "dblp": ";299/2017.html;23/1797-1.html;184/3753;", "google_scholar": "8HmMeD8AAAAJ;https://scholar.google.com/citations?hl=zh-CN;LTa3GzEAAAAJ;smd19iIAAAAJ;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Xingang_Guo1;~Fangxu_Yu1;~Huan_Zhang1;~Lianhui_Qin1;~Bin_Hu2", "aff": "University of Illinois, Urbana-Champaign;Nanjing University;University of Illinois, Urbana Champaign;Allen Institute for Artificial Intelligence;University of Illinois, Urbana Champaign", "aff_domain": "uiuc.edu;nju.edu;uiuc.edu;allenai.org;illinois.edu", "position": "PhD student;MS student;Assistant Professor;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nguo2024coldattack,\ntitle={{COLD}-Attack: Jailbreaking {LLM}s with Stealthiness and Controllability},\nauthor={Xingang Guo and Fangxu Yu and Huan Zhang and Lianhui Qin and Bin Hu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yUxdk32TU6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1903607, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=806977407777287303&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "uiuc.edu;nju.edu;uiuc.edu;allenai.org;illinois.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "University of Illinois;Nanjing University;University of Illinois Urbana-Champaign;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://illinois.edu;https://www.nju.edu.cn;https://illinois.edu;https://allenai.org", "aff_unique_abbr": "UIUC;Nanjing U;UIUC;AI2", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Towards 
Generalization beyond Pointwise Learning: A Unified Information-theoretic Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32665", "id": "yXlQL9goY8", "proceeding": "https://proceedings.mlr.press/v235/dong24a.html", "pdf": "https://openreview.net/pdf?id=yXlQL9goY8", "openreview": "https://openreview.net/forum?id=yXlQL9goY8", "author_site": "Yuxin Dong, Tieliang Gong, Hong Chen, Zhongjiang He, Shiquan Wang, Shuangyong Song, Chen Li", "tldr": "", "abstract": "The recent surge in contrastive learning has intensified the interest in understanding the generalization of non-pointwise learning paradigms. While information-theoretic analysis achieves remarkable success in characterizing the generalization behavior of learning algorithms, its applicability is largely confined to pointwise learning, with extensions to the simplest pairwise settings remaining unexplored due to the challenges of non-i.i.d losses and dimensionality explosion. In this paper, we develop the first series of information-theoretic bounds extending beyond pointwise scenarios, encompassing pointwise, pairwise, triplet, quadruplet, and higher-order scenarios, all within a unified framework. Specifically, our hypothesis-based bounds elucidate the generalization behavior of iterative and noisy learning algorithms via gradient covariance analysis, and our prediction-based bounds accurately estimate the generalization gap with computationally tractable low-dimensional information metrics. Comprehensive numerical studies then demonstrate the effectiveness of our bounds in capturing the generalization dynamics across diverse learning scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxin Dong;Tieliang Gong;Hong Chen;Zhongjiang He;Mengxiang Li;Shuangyong Song;Chen Li", "authorids": "~Yuxin_Dong1;~Tieliang_Gong2;~Hong_Chen1;~Zhongjiang_He1;~Mengxiang_Li2;~Shuangyong_Song2;~Chen_Li19", "gender": "M;;;M;;;M", "homepage": "https://yuxin-dong.github.io/;;https://chenhongml.github.io/;;;;http://chenli.group", "dblp": ";;https://dblp.uni-trier.de/pers/hd/c/Chen_0004:Hong;348/6925;;;l/ChenLi32", "google_scholar": "yFJv-2kAAAAJ;;;;;;", "orcid": "0000-0002-4475-5056;;;;;;0000-0002-0079-3106", "linkedin": "yuxin-dong-939a03349;;;%E5%BF%A0%E6%B1%9F-%E4%BD%95-7bb92391/;;;", "or_profile": "~Yuxin_Dong1;~Tieliang_Gong2;~Hong_Chen1;~Zhongjiang_He1;~Mengxiang_Li2;~Shuangyong_Song2;~Chen_Li19", "aff": "Xi'an Jiaotong University;;Huazhong Agricultural University;ChinaTelecom;;;Xi'an Jiaotong University", "aff_domain": "xjtu.edu.cn;;hzau.edu.cn;chinatelecom.cn;;;xjtu.edu.cn", "position": "PhD student;;Full Professor;Researcher;;;Full Professor", "bibtex": "@inproceedings{\ndong2024towards,\ntitle={Towards Generalization beyond Pointwise Learning: A Unified Information-theoretic Perspective},\nauthor={Yuxin Dong and Tieliang Gong and Hong Chen and Zhongjiang He and Mengxiang Li and Shuangyong Song and Chen Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yXlQL9goY8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1035287, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18225061269000991865&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 5, "email": "xjtu.edu.cn;;hzau.edu.cn;chinatelecom.cn;;;xjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;0", 
"aff_unique_norm": "Xi'an Jiao Tong University;Huazhong Agricultural University;China Telecom", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xjtu.edu.cn;http://www.hzau.edu.cn/;https://www.chinatelecom.com.cn", "aff_unique_abbr": "XJTU;HAU;China Telecom", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "CLIF: Complementary Leaky Integrate-and-Fire Neuron for Spiking Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32664", "id": "yY6N89IlHa", "proceeding": "https://proceedings.mlr.press/v235/huang24n.html", "pdf": "https://openreview.net/pdf?id=yY6N89IlHa", "openreview": "https://openreview.net/forum?id=yY6N89IlHa", "author_site": "Yulong Huang, Xiaopeng LIN, Hongwei Ren, Haotian FU, Yue Zhou, Zunchang LIU, biao pan, Bojun Cheng", "tldr": "", "abstract": "Spiking neural networks (SNNs) are promising brain-inspired energy-efficient models. Compared to conventional deep Artificial Neural Networks (ANNs), SNNs exhibit superior efficiency and capability to process temporal information. However, it remains a challenge to train SNNs due to their undifferentiable spiking mechanism. The surrogate gradients method is commonly used to train SNNs, but often comes with an accuracy disadvantage over ANNs counterpart. We link the degraded accuracy to the vanishing of gradient on the temporal dimension through the analytical and experimental study of the training process of Leaky Integrate-and-Fire (LIF) Neuron-based SNNs. Moreover, we propose the Complementary Leaky Integrate-and-Fire (CLIF) Neuron. CLIF creates extra paths to facilitate the backpropagation in computing temporal gradient while keeping binary output. CLIF is hyperparameter-free and features broad applicability. Extensive experiments on a variety of datasets demonstrate CLIF's clear performance advantage over other neuron models. Furthermore, the CLIF's performance even slightly surpasses superior ANNs with identical network structure and training conditions. 
The code is available at https://github.com/HuuYuLong/Complementary-LIF.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yulong Huang;Xiaopeng LIN;Hongwei Ren;Haotian FU;Yue Zhou;Zunchang LIU;biao pan;Bojun Cheng", "authorids": "~Yulong_Huang2;~Xiaopeng_LIN1;~Hongwei_Ren2;~Haotian_FU4;~Yue_Zhou8;~Zunchang_LIU2;~biao_pan1;~Bojun_Cheng1", "gender": ";;M;M;F;;M;M", "homepage": ";https://github.com/xplin13;http://rhwdmx.github.io;;;;https://shi.buaa.edu.cn/panbiao/en/index.htm;https://personal.hkust-gz.edu.cn/bojuncheng/index.html", "dblp": ";;;;;;;285/0564", "google_scholar": ";;https://scholar.google.com.hk/citations?user=eD60q1YAAAAJ;;;;;https://scholar.google.ch/citations?user=Zisp-_IAAAAJ", "orcid": ";;;0000-0001-5445-4487;0000-0001-9323-4524;;;", "linkedin": ";;;;;;;", "or_profile": "~Yulong_Huang2;~Xiaopeng_LIN1;~Hongwei_Ren2;~Haotian_FU4;~Yue_Zhou8;~Zunchang_LIU2;~biao_pan1;~Bojun_Cheng1", "aff": ";Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;;Beihang University;The Hong Kong University of Science and Technology (Guangzhou)", "aff_domain": ";hkust.edu;connect.hkust-gz.edu.cn;hkust.edu;hkust.edu;;buaa.edu.cn;hkust-gz.edu.cn", "position": ";PhD student;PhD student;PhD student;PhD student;;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024clif,\ntitle={{CLIF}: Complementary Leaky Integrate-and-Fire Neuron for Spiking Neural Networks},\nauthor={Yulong Huang and Xiaopeng LIN and Hongwei Ren and Haotian FU and Yue Zhou and Zunchang LIU and biao pan and Bojun Cheng},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yY6N89IlHa}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1051599, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7797613458807942376&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": ";hkust.edu;connect.hkust-gz.edu.cn;hkust.edu;hkust.edu;;buaa.edu.cn;hkust-gz.edu.cn", "author_num": 8, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Beihang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;http://www.buaa.edu.cn/", "aff_unique_abbr": "HKUST;BUAA", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Hong Kong SAR;;Guangzhou", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Refined Coreset Selection: Towards Minimal Coreset Size under Model Performance Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32663", "id": "yb5xV8LFDq", "proceeding": "https://proceedings.mlr.press/v235/xia24b.html", "pdf": "https://openreview.net/pdf?id=yb5xV8LFDq", "openreview": "https://openreview.net/forum?id=yb5xV8LFDq", "author_site": "Xiaobo Xia, Jiale Liu, Shaokun Zhang, Qingyun Wu, Hongxin Wei, Tongliang Liu", "tldr": "", "abstract": "Coreset selection is powerful in reducing computational costs and accelerating data processing for deep learning algorithms. It strives to identify a small subset from large-scale data, so that training only on the subset practically performs on par with full data. 
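As a toy illustration of the coreset idea just described (keep a small subset whose training behavior approximates the full data), the sketch below selects a subset by per-example importance scores; the random scoring rule is a placeholder assumption, not the selection method of this or any cited paper.

```python
import numpy as np

def select_coreset(scores, k):
    # Keep the k examples with the highest importance scores.
    return np.argsort(scores)[-k:]

rng = np.random.default_rng(0)
scores = rng.random(10_000)               # placeholder importance scores
subset = select_coreset(scores, k=1_000)  # a 10% "coreset"
print(subset.shape)                       # (1000,)
```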
Practitioners regularly desire to identify the smallest possible coreset in realistic scenarios while maintaining comparable model performance, to minimize costs and maximize acceleration. Motivated by this desideratum, for the first time, we pose the problem of refined coreset selection, in which the minimal coreset size under model performance constraints is explored. Moreover, to address this problem, we propose an innovative method that maintains a priority order between model performance and coreset size during optimization and efficiently optimizes both in the coreset selection procedure. Theoretically, we provide a convergence guarantee for the proposed method. Empirically, extensive experiments confirm its superiority compared with previous strategies, often yielding better model performance with smaller coreset sizes. The implementation is available at https://github.com/xiaoboxia/LBCS.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaobo Xia;Jiale Liu;Shaokun Zhang;Qingyun Wu;Hongxin Wei;Tongliang Liu", "authorids": "~Xiaobo_Xia1;~Jiale_Liu2;~Shaokun_Zhang2;~Qingyun_Wu2;~Hongxin_Wei1;~Tongliang_Liu1", "gender": "M;;;;M;M", "homepage": "https://xiaoboxia.github.io/;;;;https://hongxin001.github.io/;https://tongliang-liu.github.io/", "dblp": "242/8072;;;;150/6350;150/6667", "google_scholar": "jRsugY0AAAAJ;;;;cABH034AAAAJ;https://scholar.google.com.au/citations?user=EiLdZ_YAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Xiaobo_Xia1;~Jiale_Liu2;~Shaokun_Zhang2;~Qingyun_Wu2;~Hongxin_Wei1;~Tongliang_Liu1", "aff": "The University of Sydney;;;;Southern University of Science and Technology;Mohamed bin Zayed University of Artificial Intelligence", "aff_domain": "sydney.edu.au;;;;sustech.edu.cn;mbzuai.ac.ae", "position": "PhD student;;;;Assistant Professor;Affiliated Associate Professor", "bibtex": "@inproceedings{\nxia2024refined,\ntitle={Refined Coreset Selection: Towards Minimal Coreset Size under Model Performance Constraints},\nauthor={Xiaobo Xia and Jiale Liu and Shaokun Zhang and Qingyun Wu and Hongxin Wei and Tongliang Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yb5xV8LFDq}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 639175, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5289768640643188273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "sydney.edu.au;;;;sustech.edu.cn;mbzuai.ac.ae", "author_num": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Sydney;Southern University of Science and Technology;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sydney.edu.au;https://www.sustech.edu.cn;https://mbzuai.ac.ae", "aff_unique_abbr": "USYD;SUSTech;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Australia;China;United Arab Emirates" }, { "title": "Better Safe than Sorry: Pre-training CLIP against Targeted Data Poisoning and Backdoor Attacks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32662", "id": "ycLHJuLYuD", "proceeding": "https://proceedings.mlr.press/v235/yang24i.html", "pdf": "https://openreview.net/pdf?id=ycLHJuLYuD", "openreview": "https://openreview.net/forum?id=ycLHJuLYuD", "author_site": "Wenhan Yang, Jingdong Gao, Baharan 
Mirzasoleiman", "tldr": "", "abstract": "Contrastive Language-Image Pre-training (CLIP) on large image-caption datasets has achieved remarkable success in zero-shot classification and enabled transferability to new domains. However, CLIP is extremely more vulnerable to targeted data poisoning and backdoor attacks compared to supervised learning. Perhaps surprisingly, poisoning 0.0001% of CLIP pre-training data is enough to make targeted data poisoning attacks successful. This is four orders of magnitude smaller than what is required to poison supervised models. Despite this vulnerability, existing methods are very limited in defending CLIP models during pre-training. In this work, we propose a strong defense, SAFECLIP, to safely pre-train CLIP against targeted data poisoning and backdoor attacks. SAFECLIP warms up the model by applying unimodal contrastive learning (CL) on image and text modalities separately. Then, it divides the data into safe and risky sets by applying a Gaussian Mixture Model to the cosine similarity of image-caption pair representations. SAFECLIP pre-trains the model by applying the CLIP loss to the safe set and applying unimodal CL to image and text modalities of the risky set separately. By gradually increasing the size of the safe set during pre-training, SAFECLIP effectively breaks targeted data poisoning and backdoor attacks without harming the CLIP performance. Our extensive experiments on CC3M, Visual Genome, and MSCOCO demonstrate that SAFECLIP significantly reduces the success rate of targeted data poisoning attacks from 93.75% to 0% and that of various backdoor attacks from up to 100% to 0%, without harming CLIP\u2019s performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenhan Yang;Jingdong Gao;Baharan Mirzasoleiman", "authorids": "~Wenhan_Yang5;~Jingdong_Gao1;~Baharan_Mirzasoleiman1", "gender": "M;;F", "homepage": ";https://github.com/mxuan0;http://web.cs.ucla.edu/~baharan/", "dblp": ";;52/10075", "google_scholar": ";;x63j7HEAAAAJ", "orcid": ";;", "linkedin": "wenhan-yang-6413981b4/;;", "or_profile": "~Wenhan_Yang5;~Jingdong_Gao1;~Baharan_Mirzasoleiman1", "aff": "University of California, Los Angeles;;University of California, Los Angeles", "aff_domain": "ucla.edu;;ucla.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nyang2024better,\ntitle={Better Safe than Sorry: Pre-training {CLIP} against Targeted Data Poisoning and Backdoor Attacks},\nauthor={Wenhan Yang and Jingdong Gao and Baharan Mirzasoleiman},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ycLHJuLYuD}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1113468, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17503491736576494934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "ucla.edu;;ucla.edu", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Shadow Variable Representation for Treatment Effect Estimation under Collider Bias", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32661", 
"id": "ycXo4tQIpN", "proceeding": "https://proceedings.mlr.press/v235/li24am.html", "pdf": "https://openreview.net/pdf?id=ycXo4tQIpN", "openreview": "https://openreview.net/forum?id=ycXo4tQIpN", "author_site": "Baohong Li, Haoxuan Li, Ruoxuan Xiong, Anpeng Wu, Fei Wu, Kun Kuang", "tldr": "", "abstract": "One of the significant challenges in treatment effect estimation is collider bias, a specific form of sample selection bias induced by the common causes of both the treatment and outcome. Identifying treatment effects under collider bias requires well-defined shadow variables in observational data, which are assumed to be related to the outcome and independent of the sample selection mechanism, conditional on the other observed variables. However, finding a valid shadow variable is not an easy task in real-world scenarios and requires domain-specific knowledge from experts. Therefore, in this paper, we propose a novel method that can automatically learn shadow-variable representations from observational data without prior knowledge. To ensure the learned representations satisfy the assumptions of the shadow variable, we introduce a tester to perform hypothesis testing in the representation learning process. We iteratively generate representations and test whether they satisfy the shadow-variable assumptions until they pass the test. With the help of the learned shadow-variable representations, we propose a novel treatment effect estimator to address collider bias. Experiments show that the proposed methods outperform existing treatment effect estimation methods under collider bias and prove their potential application value.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Baohong Li;Haoxuan Li;Ruoxuan Xiong;Anpeng Wu;Fei Wu;Kun Kuang", "authorids": "~Baohong_Li1;~Haoxuan_Li6;~Ruoxuan_Xiong1;~Anpeng_Wu1;~Fei_Wu1;~Kun_Kuang1", "gender": "M;M;;M;M;M", "homepage": ";https://haoxuanli-pku.github.io/;http://www.ruoxuanxiong.com/;https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ&hl=zh-CN&oi=sra;https://person.zju.edu.cn/wufei;http://kunkuang.github.io", "dblp": "83/3116;145/4965-1.html;222/2927;267/5637;84/3254-1;194/4245", "google_scholar": "M08DvYsAAAAJ;gtDqiucAAAAJ;lg_0u-0AAAAJ;https://scholar.google.com.hk/citations?user=VQ4m6zQAAAAJ;XJLn4MYAAAAJ;https://scholar.google.com.hk/citations?user=FOsNiMQAAAAJ", "orcid": "0000-0002-3222-002X;0000-0003-3620-3769;;0000-0003-3898-7122;;0009-0000-7528-8131", "linkedin": ";;;;;", "or_profile": "~Baohong_Li1;~Haoxuan_Li6;~Ruoxuan_Xiong1;~Anpeng_Wu1;~Fei_Wu1;~Kun_Kuang1", "aff": "Zhejiang University;Peking University;Emory University;Mohamed bin Zayed University of Artificial Intelligence;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;pku.edu.cn;emory.edu;mbzuai.ac.ae;zju.edu.cn;zju.edu.cn", "position": "PhD student;PhD student;Assistant Professor;Researcher;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nli2024learning,\ntitle={Learning Shadow Variable Representation for Treatment Effect Estimation under Collider Bias},\nauthor={Baohong Li and Haoxuan Li and Ruoxuan Xiong and Anpeng Wu and Fei Wu and Kun Kuang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ycXo4tQIpN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 0, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 4, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=17884543821089514765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "zju.edu.cn;pku.edu.cn;emory.edu;mbzuai.ac.ae;zju.edu.cn;zju.edu.cn", "author_num": 6, "aff_unique_index": "0;1;2;3;0;0", "aff_unique_norm": "Zhejiang University;Peking University;Emory University;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;http://www.pku.edu.cn;https://www.emory.edu;https://mbzuai.ac.ae", "aff_unique_abbr": "ZJU;Peking U;Emory;MBZUAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0;0", "aff_country_unique": "China;United States;United Arab Emirates" }, { "title": "Accelerating Legacy Numerical Solvers by Non-intrusive Gradient-based Meta-solving", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32660", "id": "yh6Y7ppf46", "proceeding": "https://proceedings.mlr.press/v235/arisaka24a.html", "pdf": "https://openreview.net/pdf?id=yh6Y7ppf46", "openreview": "https://openreview.net/forum?id=yh6Y7ppf46", "author_site": "Sohei Arisaka, Qianxiao Li", "tldr": "", "abstract": "Scientific computing is an essential tool for scientific discovery and engineering design, and its computational cost is always a main concern in practice. To accelerate scientific computing, it is a promising approach to use machine learning (especially meta-learning) techniques for selecting hyperparameters of traditional numerical methods. There have been numerous proposals to this direction, but many of them require automatic-differentiable numerical methods. However, in reality, many practical applications still depend on well-established but non-automatic-differentiable legacy codes, which prevents practitioners from applying the state-of-the-art research to their own problems. To resolve this problem, we propose a non-intrusive methodology with a novel gradient estimation technique to combine machine learning and legacy numerical codes without any modification. 
We theoretically and numerically show the advantage of the proposed method over other baselines and present applications to accelerating established non-automatically-differentiable numerical solvers implemented in PETSc, a widely used open-source numerical software library.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sohei Arisaka;Qianxiao Li", "authorids": "~Sohei_Arisaka1;~Qianxiao_Li1", "gender": "M;M", "homepage": "https://github.com/arisakaso;https://blog.nus.edu.sg/qianxiaoli/", "dblp": "322/5322;172/0930.html", "google_scholar": ";https://scholar.google.com.sg/citations?user=zLgReYoAAAAJ", "orcid": ";0000-0002-3903-3737", "linkedin": ";", "or_profile": "~Sohei_Arisaka1;~Qianxiao_Li1", "aff": "National University of Singapore;National University of Singapore", "aff_domain": "nus.edu;nus.edu.sg", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\narisaka2024accelerating,\ntitle={Accelerating Legacy Numerical Solvers by Non-intrusive Gradient-based Meta-solving},\nauthor={Sohei Arisaka and Qianxiao Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yh6Y7ppf46}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9992881, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11149702471186857469&as_sdt=4005&sciodt=0,6&hl=en", "gs_version_total": 6, "email": "nus.edu;nus.edu.sg", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "Provably Robust DPO: Aligning Language Models with Noisy Feedback", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32659", "id": "yhpDKSw7yA", "proceeding": "https://proceedings.mlr.press/v235/ray-chowdhury24a.html", "pdf": "https://openreview.net/pdf?id=yhpDKSw7yA", "openreview": "https://openreview.net/forum?id=yhpDKSw7yA", "author_site": "Sayak Ray Chowdhury, Anush Kini, Nagarajan Natarajan", "tldr": "", "abstract": "Learning from preference-based feedback has recently gained traction as a promising approach to align language models with human interests. While these aligned generative models have demonstrated impressive capabilities across various tasks, their dependence on high-quality human preference data poses a bottleneck in practical applications. Specifically, noisy (incorrect and ambiguous) preference pairs in the dataset might restrict the language models from capturing human intent accurately. While practitioners have recently proposed heuristics to mitigate the effect of noisy preferences, a complete theoretical understanding of their workings remains elusive. In this work, we aim to bridge this gap by introducing a general framework for policy optimization in the presence of random preference flips. We focus on the direct preference optimization (DPO) algorithm in particular since it assumes that preferences adhere to the Bradley-Terry-Luce (BTL) model, raising concerns about the impact of noisy data on the learned policy. We design a novel loss function that de-biases the effect of noise on average, making a policy trained by minimizing that loss robust to the noise. 
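To make "de-biasing on average" concrete, the sketch below shows the standard unbiased loss correction for symmetric label flips (in the style of Natarajan et al., 2013) on a plain binary label; this is an assumption-laden analogy only, not the rDPO loss itself, which is derived for preference pairs.

```python
import math

def debiased_loss(loss_fn, pred, observed_label, eps):
    # Unbiased surrogate under symmetric label flips with rate eps < 1/2:
    # in expectation over the flip, this equals the clean-label loss.
    assert 0.0 <= eps < 0.5
    flipped = 1 - observed_label
    return ((1 - eps) * loss_fn(pred, observed_label)
            - eps * loss_fn(pred, flipped)) / (1 - 2 * eps)

# Binary cross-entropy on a predicted probability, as a toy base loss.
bce = lambda p, y: -(y * math.log(p) + (1 - y) * math.log(1 - p))
print(debiased_loss(bce, pred=0.9, observed_label=1, eps=0.2))
```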
Under log-linear parameterization of the policy class and assuming good feature coverage of the SFT policy, we prove that the sub-optimality gap of the proposed robust DPO (rDPO) policy compared to the optimal policy is of the order $O(\\frac{1}{1-2\\epsilon}\\sqrt{\\frac{d}{n}})$, where $\\epsilon < 1/2$ is the label flip rate, $d$ is the policy parameter dimension, and $n$ is the dataset size. Our experiments on IMDb sentiment generation and Anthropic's helpful-harmless dataset show that rDPO is robust to noise in preference labels compared to vanilla DPO and other heuristics proposed by practitioners.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sayak Ray Chowdhury;Anush Kini;Nagarajan Natarajan", "authorids": "~Sayak_Ray_Chowdhury1;t-anushkini@microsoft.com;~Nagarajan_Natarajan1", "gender": "M;;", "homepage": "https://sites.google.com/view/sayakraychowdhury/home;;", "dblp": "195/8152;;", "google_scholar": "Q0_CaxYAAAAJ;;", "orcid": ";;", "linkedin": "sayak-ray-chowdhury-54878154/;;", "or_profile": "~Sayak_Ray_Chowdhury1;t-anushkini@microsoft.com;~Nagarajan_Natarajan1", "aff": "Microsoft Research;;", "aff_domain": "microsoft.com;;", "position": "Postdoc;;", "bibtex": "@inproceedings{\nchowdhury2024provably,\ntitle={Provably Robust {DPO}: Aligning Language Models with Noisy Feedback},\nauthor={Sayak Ray Chowdhury and Anush Kini and Nagarajan Natarajan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yhpDKSw7yA}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 474767, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17107994932123032153&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "microsoft.com;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Amortizing Pragmatic Program Synthesis with Rankings", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32658", "id": "yj8h567Ia7", "proceeding": "https://proceedings.mlr.press/v235/pu24c.html", "pdf": "https://openreview.net/pdf?id=yj8h567Ia7", "openreview": "https://openreview.net/forum?id=yj8h567Ia7", "author_site": "Yewen Pu, Saujas Vaduguru, Priyan Vaithilingam, Elena Glassman, Daniel Fried", "tldr": "", "abstract": "The Rational Speech Acts (RSA) framework has been used successfully to build *pragmatic* program synthesizers that return programs which, in addition to being logically consistent with user-generated examples, account for the fact that a user chooses their examples informatively. We present a general method of amortizing the slow, exact RSA synthesizer. Our method first compiles a communication dataset of partially ranked programs by querying the exact RSA synthesizer. It then distills a *global ranking* -- a single, total ordering of all programs -- to approximate the partial rankings from this dataset. This global ranking is then used at inference time to rank multiple logically consistent candidate programs generated from a fast, non-pragmatic synthesizer. 
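At inference, the distilled ranking reduces to a lookup-and-argmax over the fast synthesizer's consistent candidates; a minimal sketch follows, with hypothetical program strings and scores.

```python
def pick_program(consistent_candidates, global_rank):
    # Rank the fast synthesizer's logically consistent candidates by a
    # single global score distilled offline; no slow RSA call at inference.
    return max(consistent_candidates,
               key=lambda prog: global_rank.get(prog, float("-inf")))

global_rank = {"sort(xs)": 2.0, "reverse(sort(xs))": 1.5, "xs": 0.1}
print(pick_program(["xs", "sort(xs)"], global_rank))  # -> sort(xs)
```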
In experiments on two program synthesis domains, our ranking method yields orders-of-magnitude speedups compared to the exact RSA synthesizer while being more accurate than a non-pragmatic synthesizer. Finally, we prove that in the special case of synthesis from a single example, this approximation is exact.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yewen Pu;Saujas Vaduguru;Priyan Vaithilingam;Elena Glassman;Daniel Fried", "authorids": "~Yewen_Pu1;~Saujas_Vaduguru1;~Priyan_Vaithilingam1;~Elena_Glassman1;~Daniel_Fried1", "gender": "M;M;M;F;M", "homepage": "http://www.mit.edu/~yewenpu;https://saujasv.github.io/;https://priyan.info;http://glassmanlab.seas.harvard.edu/;https://dpfried.github.io/", "dblp": "53/10322;294/8886;;118/6231.html;117/4804", "google_scholar": "LJnNKXMAAAAJ;U2MUXuMAAAAJ;;C_r8d0AAAAAJ;sJDqACEAAAAJ", "orcid": ";;; 0000-0001-5178-3496;", "linkedin": ";;;;", "or_profile": "~Yewen_Pu1;~Saujas_Vaduguru1;~Priyan_Vaithilingam1;~Elena_Glassman1;~Daniel_Fried1", "aff": "Autodesk;Autodesk;Harvard University, Harvard University;Harvard University;Carnegie Mellon University", "aff_domain": "autodesk.com;autodesk.com;g.harvard.edu;harvard.edu;cmu.edu", "position": "Principal Researcher;Intern;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\npu2024amortizing,\ntitle={Amortizing Pragmatic Program Synthesis with Rankings},\nauthor={Yewen Pu and Saujas Vaduguru and Priyan Vaithilingam and Elena Glassman and Daniel Fried},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yj8h567Ia7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3874676, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14365784593433803614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "email": "autodesk.com;autodesk.com;g.harvard.edu;harvard.edu;cmu.edu", "author_num": 5, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "Autodesk;Harvard University;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.autodesk.com;https://www.harvard.edu;https://www.cmu.edu", "aff_unique_abbr": "Autodesk;Harvard;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bootstrap AutoEncoders With Contrastive Paradigm for Self-supervised Gaze Estimation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32657", "id": "ykRY34kL3j", "proceeding": "https://proceedings.mlr.press/v235/wang24ah.html", "pdf": "https://openreview.net/pdf?id=ykRY34kL3j", "openreview": "https://openreview.net/forum?id=ykRY34kL3j", "author_site": "Yaoming Wang, Jin Li, Wenrui Dai, Bowen Shi, xiaopeng zhang, Chenglin Li, Hongkai Xiong", "tldr": "", "abstract": "Existing self-supervised methods for gaze estimation, which follow the dominant contrastive and generative streams, are restricted to eye images and can fail in general full-face settings. In this paper, we reveal that the data augmentations of contrastive methods are ineffective for self-supervised full-face gaze estimation, while generative methods are prone to trivial solutions due to the absence of explicit regularization on semantic representations. 
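The remedy described next combines both streams; schematically, that amounts to an autoencoder objective with an explicit contrastive regularizer, as in the generic sketch below (an InfoNCE-style term on two views, not BeCa's exact InfoMSE loss).

```python
import torch
import torch.nn.functional as F

def regularized_ae_loss(x, x_hat, z1, z2, weight=0.1, temperature=0.5):
    # Reconstruction term (the generative part)...
    recon = F.mse_loss(x_hat, x)
    # ...plus an explicit InfoNCE-style regularizer on representations of
    # two views of the same batch (positives on the diagonal).
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature
    targets = torch.arange(z1.size(0))
    return recon + weight * F.cross_entropy(logits, targets)

x = torch.randn(8, 16); x_hat = x + 0.1 * torch.randn_like(x)
z1, z2 = torch.randn(8, 4), torch.randn(8, 4)
print(regularized_ae_loss(x, x_hat, z1, z2))
```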
To address this challenge, we propose a novel approach called **B**ootstrap auto-**e**ncoders with **C**ontrastive p**a**radigm (**BeCa**), which combines the strengths of both generative and contrastive methods. Specifically, we revisit the Auto-Encoder used in generative approaches and incorporate the contrastive paradigm to introduce explicit regularization on gaze representation. Furthermore, we design the InfoMSE loss as an alternative to the vanilla MSE loss for Auto-Encoder to mitigate the inconsistency between reconstruction and representation learning. Experimental results demonstrate that the proposed approaches outperform state-of-the-art unsupervised gaze approaches on extensive datasets (including wild scenes) under both within-dataset and cross-dataset protocols.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yaoming Wang;Jin Li;Wenrui Dai;Bowen Shi;XIAOPENG ZHANG;Chenglin Li;Hongkai Xiong", "authorids": "~Yaoming_Wang1;~Jin_Li10;~Wenrui_Dai1;~Bowen_Shi2;~XIAOPENG_ZHANG7;~Chenglin_Li2;~Hongkai_Xiong1", "gender": ";;;M;M;M;M", "homepage": ";;;;https://sites.google.com/site/zxphistory/;https://min.sjtu.edu.cn/En/FacultyShow/4?Vid=17;http://min.sjtu.edu.cn", "dblp": ";;16/5135.html;;;;21/3569", "google_scholar": ";;Xg8MhyAAAAAJ;lJHbpY0AAAAJ;Ud6aBAcAAAAJ;ltW2JMcAAAAJ;bB16iN4AAAAJ", "orcid": ";;;;;;0000-0003-4552-0029", "linkedin": ";;;;;;", "or_profile": "~Yaoming_Wang1;~Jin_Li10;~Wenrui_Dai1;~Bowen_Shi2;~XIAOPENG_ZHANG7;~Chenglin_Li2;~Hongkai_Xiong1", "aff": ";;Shanghai Jiaotong University;Shanghai Jiaotong University;Huawei Technologies Ltd.;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": ";;sjtu.edu.cn;sjtu.edu.cn;huawei.com;sjtu.edu.cn;sjtu.edu.cn", "position": ";;Associate Professor;PhD student;Principal Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024bootstrap,\ntitle={Bootstrap AutoEncoders With Contrastive Paradigm for Self-supervised Gaze Estimation},\nauthor={Yaoming Wang and Jin Li and Wenrui Dai and Bowen Shi and XIAOPENG ZHANG and Chenglin Li and Hongkai Xiong},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ykRY34kL3j}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 786595, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FSMeaEpvl98J:scholar.google.com/&scioq=Bootstrap+AutoEncoders+With+Contrastive+Paradigm+for+Self-supervised+Gaze+Estimation&hl=en&as_sdt=0,5", "gs_version_total": 4, "email": ";;sjtu.edu.cn;sjtu.edu.cn;huawei.com;sjtu.edu.cn;sjtu.edu.cn", "author_num": 7, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", "aff_unique_dep": ";Huawei Technologies", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Learning with Complementary Labels Revisited: The Selected-Completely-at-Random Setting Is More Practical", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32656", "id": "ykZYLBcA9g", "proceeding": "https://proceedings.mlr.press/v235/wang24ac.html", "pdf": "https://openreview.net/pdf?id=ykZYLBcA9g", "openreview": "https://openreview.net/forum?id=ykZYLBcA9g", "author_site": "Wei Wang, Takashi Ishida, Yu-Jie Zhang, 
Gang Niu, Masashi Sugiyama", "tldr": "", "abstract": "Complementary-label learning is a weakly supervised learning problem in which each training example is associated with one or multiple complementary labels indicating the classes to which it does not belong. Existing consistent approaches have relied on the uniform distribution assumption to model the generation of complementary labels, or on an ordinary-label training set to estimate the transition matrix in non-uniform cases. However, either condition may not be satisfied in real-world scenarios. In this paper, we propose a novel consistent approach that does not rely on these conditions. Inspired by the positive-unlabeled (PU) learning literature, we propose an unbiased risk estimator based on the Selected-Completely-at-Random assumption for complementary-label learning. We then introduce a risk-correction approach to address overfitting problems. Furthermore, we find that complementary-label learning can be expressed as a set of negative-unlabeled binary classification problems when using the one-versus-rest strategy. Extensive experimental results on both synthetic and real-world benchmark datasets validate the superiority of our proposed approach over state-of-the-art methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Wang;Takashi Ishida;Yu-Jie Zhang;Gang Niu;Masashi Sugiyama", "authorids": "~Wei_Wang68;~Takashi_Ishida1;~Yu-Jie_Zhang1;~Gang_Niu1;~Masashi_Sugiyama1", "gender": "M;M;M;M;M", "homepage": "https://wwangwitsel.github.io/;https://takashiishida.github.io/;https://yujie-zhang96.github.io/;https://niug1984.github.io;http://www.ms.k.u-tokyo.ac.jp/sugi/", "dblp": "35/7092-373.html;84/2290-1;234/6681;26/3367-1;35/1228", "google_scholar": "a38jZkwAAAAJ;IzoyKyUAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.co.jp/citations?user=HOkcy00AAAAJ;https://scholar.google.co.jp/citations?user=GkYIrlIAAAAJ", "orcid": "0000-0002-8860-0494;;;;0000-0001-6658-6743", "linkedin": ";;;;", "or_profile": "~Wei_Wang68;~Takashi_Ishida1;~Yu-Jie_Zhang1;~Gang_Niu1;~Masashi_Sugiyama1", "aff": "The University of Tokyo;The University of Tokyo;The University of Tokyo;Southeast University;The University of Tokyo", "aff_domain": "u-tokyo.ac.jp;tokyo.ac.jp;u-tokyo.ac.jp;seu.edu.cn;u-tokyo.ac.jp", "position": "PhD student;Lecturer;PhD student;Adjunct Full Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024learning,\ntitle={Learning with Complementary Labels Revisited: The Selected-Completely-at-Random Setting Is More Practical},\nauthor={Wei Wang and Takashi Ishida and Yu-Jie Zhang and Gang Niu and Masashi Sugiyama},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ykZYLBcA9g}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 538844, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5803612661222475310&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "email": "u-tokyo.ac.jp;tokyo.ac.jp;u-tokyo.ac.jp;seu.edu.cn;u-tokyo.ac.jp", "author_num": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Tokyo;Southeast University", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.seu.edu.cn/", "aff_unique_abbr": "UTokyo;SEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Japan;China" }, 
{ "title": "Incentivized Learning in Principal-Agent Bandit Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32655", "id": "ykgZk6vFrh", "proceeding": "https://proceedings.mlr.press/v235/scheid24a.html", "pdf": "https://openreview.net/pdf?id=ykgZk6vFrh", "openreview": "https://openreview.net/forum?id=ykgZk6vFrh", "author_site": "Antoine Scheid, Daniil Tiapkin, Etienne Boursier, Aymeric Capitaine, Eric Moulines, Michael Jordan, El-Mahdi El-Mhamdi, Alain Oliviero Durmus", "tldr": "", "abstract": "This work considers a repeated principal-agent bandit game, where the principal can only interact with her environment through the agent. The principal and the agent have misaligned objectives and the choice of action is only left to the agent. However, the principal can influence the agent's decisions by offering incentives which add up to his rewards. The principal aims to iteratively learn an incentive policy to maximize her own total utility. This framework extends usual bandit problems and is motivated by several practical applications, such as healthcare or ecological taxation, where traditionally used mechanism design theories often overlook the learning aspect of the problem. We present nearly optimal (with respect to a horizon $T$) learning algorithms for the principal's regret in both multi-armed and linear contextual settings. Finally, we support our theoretical guarantees through numerical experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antoine Scheid;Daniil Tiapkin;Etienne Boursier;Aymeric Capitaine;Eric Moulines;Michael Jordan;El-Mahdi El-Mhamdi;Alain Oliviero Durmus", "authorids": "~Antoine_Scheid1;~Daniil_Tiapkin1;~Etienne_Boursier1;~Aymeric_Capitaine1;~Eric_Moulines1;~Michael_Jordan1;~El-Mahdi_El-Mhamdi1;~Alain_Oliviero_Durmus1", "gender": "M;M;M;M;M;M;M;M", "homepage": ";https://d-tiapkin.github.io/;https://eboursier.github.io/;https://fr.linkedin.com/in/aymeric-capitaine-ab00a818b;;http://www.cs.berkeley.edu/~jordan/;;https://elmahdielmhamdi.com", "dblp": ";267/5445;203/8633;;54/2358;j/MichaelIJordan;01/11275;198/0984", "google_scholar": "M9zQVwgAAAAJ;https://scholar.google.ru/citations?user=AB23PXQAAAAJ;https://scholar.google.fr/citations?user=-9todDUAAAAJ;;https://scholar.google.fr/citations?user=_XE1LvQAAAAJ;https://scholar.google.com.tw/citations?user=yxUduqMAAAAJ;;https://scholar.google.ch/citations?user=kNA-WLQAAAAJ", "orcid": ";0000-0002-8832-7926;;;0000-0002-2058-0693;0000-0001-8935-817X;;", "linkedin": "antoine-scheid-687735239/;daniil-tiapkin-049714240/;;;;;;mahdielmhamdi/", "or_profile": "~Antoine_Scheid1;~Daniil_Tiapkin1;~Etienne_Boursier1;~Aymeric_Capitaine1;~Eric_Moulines1;~Michael_Jordan1;~Alain_Durmus1;~El_Mahdi_El_Mhamdi1", "aff": "\u00c9cole Polytechnique;Google Deepmind;INRIA;\u00c9cole Polytechnique;Ecole polytechnique;University of California, Berkeley;\u00c9cole Polytechnique;Ecole polytechnique", "aff_domain": "polytechnique.edu;google.com;inria.fr;polytechnique.fr;polytechnique.edu;berkeley.edu;polytechnique.fr;polytechnique.edu", "position": "PhD student;Intern;Researcher;PhD student;Full Professor;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nscheid2024incentivized,\ntitle={Incentivized Learning in Principal-Agent Bandit Games},\nauthor={Antoine Scheid and Daniil Tiapkin and Etienne Boursier and Aymeric Capitaine and Eric Moulines and Michael Jordan and El-Mahdi El-Mhamdi and Alain Oliviero Durmus},\nbooktitle={Forty-first International 
Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ykgZk6vFrh}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1111295, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14751409399811785705&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 13, "email": "polytechnique.edu;google.com;inria.fr;polytechnique.fr;polytechnique.edu;berkeley.edu;polytechnique.fr;polytechnique.edu", "author_num": 8, "aff_unique_index": "0;1;2;0;0;3;0;0", "aff_unique_norm": "Ecole Polytechnique;DeepMind;INRIA;University of California, Berkeley", "aff_unique_dep": ";DeepMind;;", "aff_unique_url": "https://www.polytechnique.edu;https://deepmind.com;https://www.inria.fr;https://www.berkeley.edu", "aff_unique_abbr": "X;DeepMind;INRIA;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;0;0;2;0;0", "aff_country_unique": "France;United Kingdom;United States" }, { "title": "Estimating Barycenters of Distributions with Neural Optimal Transport", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32654", "id": "ymgcTqrZLT", "proceeding": "https://proceedings.mlr.press/v235/kolesov24a.html", "pdf": "https://openreview.net/pdf?id=ymgcTqrZLT", "openreview": "https://openreview.net/forum?id=ymgcTqrZLT", "author_site": "Alexander Kolesov, Petr Mokrov, Igor Udovichenko, Milena Gazdieva, Gudmund Pammer, Evgeny Burnaev, Alexander Korotin", "tldr": "", "abstract": "Given a collection of probability measures, a practitioner sometimes needs to find an \"average\" distribution that adequately aggregates the reference distributions. A theoretically appealing notion of such an average is the Wasserstein barycenter, which is the primal focus of our work. By building upon the dual formulation of Optimal Transport (OT), we propose a new scalable approach for solving the Wasserstein barycenter problem. Our methodology is based on the recent Neural OT solver: it has a bi-level adversarial learning objective and works for general cost functions. These are key advantages of our method, since typical adversarial algorithms for barycenter tasks utilize tri-level optimization and focus mostly on the quadratic cost. We also establish theoretical error bounds for our proposed approach and showcase its applicability and effectiveness in illustrative scenarios and image data setups. 
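The neural solver targets general costs in high dimension; as a sanity check for any barycenter code, the 1-D Gaussian case under quadratic cost has a closed form (a standard optimal transport fact, not a result of this paper): the W2 barycenter of Gaussians is again Gaussian, with weighted-average mean and standard deviation.

```python
import numpy as np

def gaussian_w2_barycenter_1d(means, stds, weights=None):
    # In 1-D with quadratic cost, the W2 barycenter of N(mu_i, sigma_i^2)
    # with weights w_i is N(sum w_i mu_i, (sum w_i sigma_i)^2).
    n = len(means)
    w = np.full(n, 1.0 / n) if weights is None else np.asarray(weights)
    return float(w @ np.asarray(means)), float(w @ np.asarray(stds))

print(gaussian_w2_barycenter_1d([0.0, 4.0], [1.0, 3.0]))  # (2.0, 2.0)
```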
Our source code is available at https://github.com/justkolesov/NOTBarycenters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexander Kolesov;Petr Mokrov;Igor Udovichenko;Milena Gazdieva;Gudmund Pammer;Evgeny Burnaev;Alexander Korotin", "authorids": "~Alexander_Kolesov1;~Petr_Mokrov1;~Igor_Udovichenko1;~Milena_Gazdieva1;~Gudmund_Pammer1;~Evgeny_Burnaev1;~Alexander_Korotin2", "gender": "M;M;;F;M;M;", "homepage": "https://github.com/Kolessov;https://github.com/PetrMokrov;;;https://people.math.ethz.ch/~gpammer/;http://faculty.skoltech.ru/people/evgenyburnaev;", "dblp": "287/4380;;;309/6585;;144/7845;", "google_scholar": "WyAI_wUAAAAJ;CRsi4IkAAAAJ;;h52_Zx8AAAAJ;ipItetYAAAAJ;https://scholar.google.ru/citations?user=pCRdcOwAAAAJ;", "orcid": ";;;0000-0003-0047-1577;0000-0003-2494-8739;0000-0001-8424-0690;", "linkedin": ";;;;;;", "or_profile": "~Alexander_Kolesov1;~Petr_Mokrov1;~Igor_Udovichenko1;~Milena_Gazdieva1;~Gudmund_Pammer1;~Evgeny_Burnaev1;~Alexander_Korotin2", "aff": "The Skolkovo Institute of Science and Technology;Skolkovo Institute of Science and Technology;;Skolkovo Institute of Science and Technology;ETHZ - ETH Zurich;Skolkovo Institute of Science and Technology;", "aff_domain": "skoltech.ru;skolkovotech.ru;;skoltech.ru;ethz.ch;skoltech.ru;", "position": "PhD student;PhD student;;PhD student;Postdoc;Full Professor;", "bibtex": "@inproceedings{\nkolesov2024estimating,\ntitle={Estimating Barycenters of Distributions with Neural Optimal Transport},\nauthor={Alexander Kolesov and Petr Mokrov and Igor Udovichenko and Milena Gazdieva and Gudmund Pammer and Evgeny Burnaev and Alexander Korotin},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ymgcTqrZLT}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3021247, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8185992999946145886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "skoltech.ru;skolkovotech.ru;;skoltech.ru;ethz.ch;skoltech.ru;", "author_num": 7, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Skolkovo Institute of Science and Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.skoltech.ru;https://www.ethz.ch", "aff_unique_abbr": "Skoltech;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Russian Federation;Switzerland" }, { "title": "Position: AI/ML Influencers Have a Place in the Academic Process", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32653", "id": "yo9Jyt3XCY", "proceeding": "https://proceedings.mlr.press/v235/weissburg24a.html", "pdf": "https://openreview.net/pdf?id=yo9Jyt3XCY", "openreview": "https://openreview.net/forum?id=yo9Jyt3XCY", "author_site": "Iain Xie Weissburg, Mehir Arora, Xinyi Wang, Liangming Pan, William Wang", "tldr": "", "abstract": "As the number of accepted papers at AI and ML conferences reaches into the thousands, it has become unclear how researchers access and read research publications. In this paper, we investigate the role of social media influencers in enhancing the visibility of machine learning research, particularly the citation counts of papers they share. 
We have compiled a comprehensive dataset of over 8,000 papers, spanning tweets from December 2018 to October 2023, alongside controls precisely matched by 9 key covariates. Our statistical and causal inference analysis reveals a significant increase in citations for papers endorsed by these influencers, with median citation counts 2-3 times higher than those of the control group. Additionally, the study delves into the geographic, gender, and institutional diversity of highlighted authors. Given these findings, we advocate for a responsible approach to curation, encouraging influencers to uphold the journalistic standard that includes showcasing diverse research topics, authors, and institutions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Iain Weissburg;Mehir Arora;Xinyi Wang;Liangming Pan;William Yang Wang", "authorids": "~Iain_Weissburg1;mehir@ucsb.edu;~Xinyi_Wang2;~Liangming_Pan1;~William_Yang_Wang2", "gender": "M;;F;M;", "homepage": ";;https://wangxinyilinda.github.io/;https://liangmingpan.bio;", "dblp": ";;;186/9707;", "google_scholar": ";;3vvbplcAAAAJ;JcjjOTUAAAAJ;", "orcid": ";;;;", "linkedin": "iain-weissburg;;xinyi-wang-444385133/;;", "or_profile": "~Iain_Weissburg1;mehir@ucsb.edu;~Xinyi_Wang2;~Liangming_Pan1;~William_Yang_Wang2", "aff": "University of California, Santa Barbara;;International Business Machines;University of California, Santa Barbara;", "aff_domain": "ucsb.edu;;ibm.com;ucsb.edu;", "position": "Undergrad student;;Intern;Postdoc;", "bibtex": "@inproceedings{\nweissburg2024position,\ntitle={Position: {AI}/{ML} Influencers Have a Place in the Academic Process},\nauthor={Iain Weissburg and Mehir Arora and Xinyi Wang and Liangming Pan and William Yang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yo9Jyt3XCY}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8175468, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14008796772771087821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "ucsb.edu;;ibm.com;ucsb.edu;", "author_num": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Santa Barbara;International Business Machines Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsb.edu;https://www.ibm.com", "aff_unique_abbr": "UCSB;IBM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Rethinking Guidance Information to Utilize Unlabeled Samples: A Label Encoding Perspective", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32652", "id": "yoTCwNqQS6", "proceeding": "https://proceedings.mlr.press/v235/zhang24bt.html", "pdf": "https://openreview.net/pdf?id=yoTCwNqQS6", "openreview": "https://openreview.net/forum?id=yoTCwNqQS6", "author_site": "Yulong Zhang, Yuan Yao, Shuhao Chen, Pengrong Jin, Yu Zhang, Jian Jin, Jiangang Lu", "tldr": "", "abstract": "Empirical Risk Minimization (ERM) is fragile in scenarios with insufficient labeled samples. A vanilla extension of ERM to unlabeled samples is Entropy Minimization (EntMin), which employs the soft-labels of unlabeled samples to guide their learning. However, EntMin emphasizes prediction discriminability while neglecting prediction diversity. 
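To make the discriminability/diversity distinction concrete, the sketch below contrasts the EntMin objective with a simple diagnostic for prediction diversity; both functions are illustrative assumptions, not the paper's implementation of label-encoding alignment.

```python
import torch

def entmin_loss(probs):
    # Entropy minimization: sharpen each unlabeled prediction
    # (discriminability), with no control over which class is chosen.
    return -(probs * probs.clamp_min(1e-8).log()).sum(dim=1).mean()

def diversity_gap(probs):
    # How far the batch-mean prediction drifts from uniform; EntMin can
    # drive this up (class collapse), which diversity-aware objectives
    # explicitly counteract.
    mean_pred = probs.mean(dim=0)
    return (mean_pred - 1.0 / probs.size(1)).abs().sum()

probs = torch.softmax(torch.randn(32, 10), dim=1)
print(entmin_loss(probs), diversity_gap(probs))
```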
To alleviate this issue, in this paper, we rethink the guidance information to utilize unlabeled samples. By analyzing the learning objective of ERM, we find that the guidance information for labeled samples in a specific category is the corresponding *label encoding*. Inspired by this finding, we propose a Label-Encoding Risk Minimization (LERM). It first estimates the label encodings through prediction means of unlabeled samples and then aligns them with their corresponding ground-truth label encodings. As a result, the LERM ensures both prediction discriminability and diversity, and it can be integrated into existing methods as a plugin. Theoretically, we analyze the relationships between LERM and ERM as well as EntMin. Empirically, we verify the superiority of the LERM under several label insufficient scenarios. The codes are available at https://github.com/zhangyl660/LERM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yulong Zhang;Yuan Yao;Shuhao Chen;Pengrong Jin;Yu Zhang;Jian Jin;Jiangang Lu", "authorids": "~Yulong_Zhang2;~Yuan_Yao15;~Shuhao_Chen1;~Pengrong_Jin2;~Yu_Zhang3;~Jian_Jin4;~Jiangang_Lu1", "gender": ";M;M;M;M;M;M", "homepage": ";https://yyyaoyuan.github.io/;https://scholar.google.com/citations?user=YqX_IbAAAAAJ&hl=zh-CN;http://cse.sustech.edu.cn/faculty/~zhangy/;http://www.caict.ac.cn;https://person.zju.edu.cn/lujg/;", "dblp": "32/9374;25/4120-16;43/2127;50/671-6;;20/6851;", "google_scholar": "https://scholar.google.com.hk/citations?user=2hY14LYAAAAJ;https://scholar.google.com.hk/citations?user=jD1u0hAAAAAJ;YqX_IbAAAAAJ;https://scholar.google.com.hk/citations?user=jaRS5w4AAAAJ;;;gageqRgAAAAJ", "orcid": "0000-0002-4038-1616;;0009-0002-0410-5961;;0000-0002-4128-966X;0000-0002-1551-6179;", "linkedin": ";;;;;;", "or_profile": "~Yulong_Zhang2;~Yuan_Yao15;~Shuhao_Chen1;~Yu_Zhang3;~Jian_Jin4;~Jiangang_Lu1;~Pengrong_JIN1", "aff": "Zhejiang University;Beijing Teleinfo Technology Co., LTD;Southern University of Science and Technology;Southern University of Science and Technology;Pengcheng Labs;;Southern University of Science and Technology", "aff_domain": "zju.edu.cn;teleinfo.cn;sustech.edu;sustc.edu.cn;pcl.ac.cn;;sustech.edu.cn", "position": "PhD student;Researcher;MS student;Associate Professor;Full Professor;;Researcher", "bibtex": "@inproceedings{\nzhang2024rethinking,\ntitle={Rethinking Guidance Information to Utilize Unlabeled Samples: A Label Encoding Perspective},\nauthor={Yulong Zhang and Yuan Yao and Shuhao Chen and Pengrong Jin and Yu Zhang and Jian Jin and Jiangang Lu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yoTCwNqQS6}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 612586, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12957300970993692298&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "zju.edu.cn;teleinfo.cn;sustech.edu;sustc.edu.cn;pcl.ac.cn;;sustech.edu.cn", "author_num": 7, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "Zhejiang University;Beijing Teleinfo Technology Co., LTD;Southern University of Science and Technology;Pengcheng Labs", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;;https://www.sustech.edu.cn;", "aff_unique_abbr": "ZJU;;SUSTech;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { 
"title": "Scaling Laws for Fine-Grained Mixture of Experts", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32651", "id": "yoqdlynCRs", "proceeding": "https://proceedings.mlr.press/v235/ludziejewski24a.html", "pdf": "https://openreview.net/pdf?id=yoqdlynCRs", "openreview": "https://openreview.net/forum?id=yoqdlynCRs", "author_site": "Jan Ludziejewski, Jakub Krajewski, Kamil Adamczewski, Maciej Pi\u00f3ro, Micha\u0142 Krutul, Szymon Antoniak, Kamil Ciebiera, Krystian Kr\u00f3l, Tomasz Odrzyg\u00f3\u017ad\u017a, Piotr Sankowski, Marek Cygan, Sebastian Jaszczur", "tldr": "", "abstract": "Mixture of Experts (MoE) models have emerged as a primary solution for reducing the computational cost of Large Language Models. In this work, we analyze their scaling properties, highlighting certain arbitrary assumptions present in the existing literature. In particular, we introduce a new hyperparameter, granularity, the modification of which allows for the optimal adjustment of the size of experts. Subsequently, we present scaling laws for fine-grained MoE, taking into account the number of training tokens, model size, and granularity. Using these scaling laws, we derive the optimal training configuration for a given computational budget. Furthermore, in contrast with previous works, we demonstrate that the gap in efficiency between dense and MoE models grows as we scale up the model size and training budget.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jan Ludziejewski;Jakub Krajewski;Kamil Adamczewski;Maciej Pi\u00f3ro;Micha\u0142 Krutul;Szymon Antoniak;Kamil Ciebiera;Krystian Kr\u00f3l;Tomasz Odrzyg\u00f3\u017ad\u017a;Piotr Sankowski;Marek Cygan;Sebastian Jaszczur", "authorids": "~Jan_Ludziejewski1;~Jakub_Krajewski1;~Kamil_Adamczewski1;~Maciej_Pi\u00f3ro1;~Micha\u0142_Krutul1;~Szymon_Antoniak1;~Kamil_Ciebiera1;~Krystian_Kr\u00f3l2;~Tomasz_Odrzyg\u00f3\u017ad\u017a1;~Piotr_Sankowski1;~Marek_Cygan1;~Sebastian_Jaszczur1", "gender": "M;;M;M;M;Not Specified;M;M;M;;;M", "homepage": ";;;;;;;;;https://www.mimuw.edu.pl/~sank;;", "dblp": "276/0228;;150/5954;;;;;;;80/4282;76/819;206/3302", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;;https://scholar.google.pl/citations?user=O30Xj14AAAAJ;;;;;;J2ERJ7cAAAAJ;https://scholar.google.pl/citations?user=-5LVg-0AAAAJ;df8TSy4AAAAJ;KF__0WgAAAAJ", "orcid": "0000-0002-6556-6801;;;;;;;;;;;0000-0003-1628-7176", "linkedin": "jan-ludziejewski-034959223/;jakub-krj;;maciej-pi%C3%B3ro-0ab6201a2/;micha%C5%82-krutul-405596150/;szymon-antoniak-022998258/;kamil-ciebiera-9439a0252/;krystiankrol;tomasz-odrzygozdz/;;marek-cygan-b9a316140/;sebastian-jaszczur-129866a3", "or_profile": "~Jan_Ludziejewski1;~Jakub_Krajewski1;~Kamil_Adamczewski1;~Maciej_Pi\u00f3ro1;~Micha\u0142_Krutul1;~Szymon_Antoniak1;~Kamil_Ciebiera1;~Krystian_Kr\u00f3l2;~Tomasz_Odrzyg\u00f3\u017ad\u017a1;~Piotr_Sankowski1;~Marek_Cygan1;~Sebastian_Jaszczur1", "aff": "University of Warsaw;University of Warsaw;IDEAS NCBR Sp.;IDEAS NCBR Sp.;IDEAS NCBR Sp.;;IDEAS NCBR Sp.;University of Warsaw;IDEAS NCBR;IDEAS NCBR;Nomagic;University of Warsaw", "aff_domain": "mimuw.edu.pl;mimuw.edu.pl;ideas-ncbr.pl;ideas-ncbr.pl;ideas-ncbr.pl;;ideas-ncbr.pl;mimuw.edu.pl;ideas-ncbr.pl;ideas-ncbr.pl;nomagic.ai;uw.edu.pl", "position": "PhD student;PhD student;Postdoc;Researcher;PhD student;;Intern;MS student;Postdoc;CEO;Founder / CTO;PhD student", "bibtex": "@inproceedings{\nludziejewski2024scaling,\ntitle={Scaling Laws for Fine-Grained Mixture of 
Experts},\nauthor={Jan Ludziejewski and Jakub Krajewski and Kamil Adamczewski and Maciej Pi{\\'o}ro and Micha{\\l} Krutul and Szymon Antoniak and Kamil Ciebiera and Krystian Kr{\\'o}l and Tomasz Odrzyg{\\'o}{\\'z}d{\\'z} and Piotr Sankowski and Marek Cygan and Sebastian Jaszczur},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yoqdlynCRs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 544219, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 12, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=38484587262236720&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "email": "mimuw.edu.pl;mimuw.edu.pl;ideas-ncbr.pl;ideas-ncbr.pl;ideas-ncbr.pl;;ideas-ncbr.pl;mimuw.edu.pl;ideas-ncbr.pl;ideas-ncbr.pl;nomagic.ai;uw.edu.pl", "author_num": 12, "aff_unique_index": "0;0;1;1;1;1;0;2;2;3;0", "aff_unique_norm": "University of Warsaw;IDEAS NCBR;Institute for Development, Economic Analysis, and Simulation (IDEAS);Nomagic", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uw.edu.pl;;https://www.ideas-ncbr.gov.pl;", "aff_unique_abbr": "UW;;IDEAS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "Poland;" }, { "title": "Revisiting Scalable Hessian Diagonal Approximations for Applications in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32650", "id": "yrFUJzcTsk", "proceeding": "https://proceedings.mlr.press/v235/elsayed24a.html", "pdf": "https://openreview.net/pdf?id=yrFUJzcTsk", "openreview": "https://openreview.net/forum?id=yrFUJzcTsk", "author_site": "Mohamed Elsayed, Homayoon Farrahi, Felix Dangel, Rupam Mahmood", "tldr": "", "abstract": "Second-order information is valuable for many applications but challenging to compute. Several works focus on computing or approximating Hessian diagonals, but even this simplification introduces significant additional costs compared to computing a gradient. In the absence of efficient exact computation schemes for Hessian diagonals, we revisit an early approximation scheme proposed by Becker and LeCun (1989, BL89), which has a cost similar to gradients and appears to have been overlooked by the community. We introduce HesScale, an improvement over BL89, which adds negligible extra computation. On small networks, we find that this improvement is of higher quality than all alternatives, even those with theoretical guarantees, such as unbiasedness, while being much cheaper to compute. We use this insight in reinforcement learning problems where small networks are used and demonstrate HesScale in second-order optimization and scaling the step-size parameter. In our experiments, HesScale optimizes faster than existing methods and improves stability through step-size scaling. These findings are promising for scaling second-order methods in larger models in the future.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohamed Elsayed;Homayoon Farrahi;Felix Dangel;A. 
Rupam Mahmood", "authorids": "~Mohamed_Elsayed2;~Homayoon_Farrahi1;~Felix_Dangel1;~A._Rupam_Mahmood1", "gender": "M;;M;", "homepage": "https://mohmdelsayed.github.io;;https://f-dangel.com;", "dblp": "224/8735-3;;236/4218;120/6935", "google_scholar": "https://scholar.google.ca/citations?user=gShveMAAAAAJ;;9hlJ9W0AAAAJ;https://scholar.google.ca/citations?user=YwB8XM4AAAAJ", "orcid": ";;0000-0002-1414-8554;", "linkedin": "mohamedelsayed95/;homayoonfarrahi/;;", "or_profile": "~Mohamed_Elsayed2;~Homayoon_Farrahi1;~Felix_Dangel1;~Rupam_Mahmood1", "aff": "University of Alberta;University of Alberta;Vector Institute, Toronto;University of Alberta", "aff_domain": "ualberta.ca;ualberta.ca;vectorinstitute.ai;ualberta.ca", "position": "PhD student;MS student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nelsayed2024revisiting,\ntitle={Revisiting Scalable Hessian Diagonal Approximations for Applications in Reinforcement Learning},\nauthor={Mohamed Elsayed and Homayoon Farrahi and Felix Dangel and A. Rupam Mahmood},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yrFUJzcTsk}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 4033041, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1706746759217710288&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "ualberta.ca;ualberta.ca;vectorinstitute.ai;ualberta.ca", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Alberta;Vector Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.ualberta.ca;https://vectorinstitute.ai", "aff_unique_abbr": "UAlberta;Vector Institute", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Stabilizing Policy Gradients for Stochastic Differential Equations via Consistency with Perturbation Process", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32649", "id": "ytz2naZoDB", "proceeding": "https://proceedings.mlr.press/v235/zhou24q.html", "pdf": "https://openreview.net/pdf?id=ytz2naZoDB", "openreview": "https://openreview.net/forum?id=ytz2naZoDB", "author_site": "Xiangxin Zhou, Liang Wang, Yichi Zhou", "tldr": "", "abstract": "Considering generating samples with high rewards, we focus on optimizing deep neural networks parameterized stochastic differential equations (SDEs), the advanced generative models with high expressiveness, with policy gradient, the leading algorithm in reinforcement learning. Nevertheless, when applying policy gradients to SDEs, since the policy gradient is estimated on a finite set of trajectories, it can be ill-defined, and the policy behavior in data-scarce regions may be uncontrolled. This challenge compromises the stability of policy gradients and negatively impacts sample complexity. To address these issues, we propose constraining the SDE to be consistent with its associated perturbation process. Since the perturbation process covers the entire space and is easy to sample, we can mitigate the aforementioned problems. Our framework offers a general approach allowing for a versatile selection of policy gradient methods to effectively and efficiently train SDEs. We evaluate our algorithm on the task of structure-based drug design and optimize the binding affinity of generated ligand molecules. 
Our method achieves the best Vina score (-9.07) on the CrossDocked2020 dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiangxin Zhou;Liang Wang;Yichi Zhou", "authorids": "~Xiangxin_Zhou1;~Liang_Wang3;~Yichi_Zhou2", "gender": "Not Specified;M;", "homepage": ";;https://www.microsoft.com/en-us/research/people/yiczho/", "dblp": "247/9275;56/4499-1;203/4453", "google_scholar": "eQgIWcQAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xiangxin_Zhou1;~Liang_Wang3;~Yichi_Zhou2", "aff": "Institute of Automation, Chinese Academy of Sciences;Institute of Automation, CAS, China;Microsoft", "aff_domain": "ia.ac.cn;ia.ac.cn;microsoft.com", "position": "PhD student;Full Professor;Microsoft research", "bibtex": "@inproceedings{\nzhou2024stabilizing,\ntitle={Stabilizing Policy Gradients for Stochastic Differential Equations via Consistency with Perturbation Process},\nauthor={Xiangxin Zhou and Liang Wang and Yichi Zhou},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ytz2naZoDB}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8812593, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8605356463271737423&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "email": "ia.ac.cn;ia.ac.cn;microsoft.com", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "Chinese Academy of Sciences;Microsoft", "aff_unique_dep": "Institute of Automation;Microsoft Corporation", "aff_unique_url": "http://www.ia.cas.cn;https://www.microsoft.com", "aff_unique_abbr": "CAS;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;United States" }, { "title": "By Tying Embeddings You Are Assuming the Distributional Hypothesis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32648", "id": "yyYMAprcAR", "proceeding": "https://proceedings.mlr.press/v235/bertolotti24a.html", "pdf": "https://openreview.net/pdf?id=yyYMAprcAR", "openreview": "https://openreview.net/forum?id=yyYMAprcAR", "author_site": "Bertolotti Francesco, Walter Cazzola", "tldr": "", "abstract": "In this work, we analyze both theoretically and empirically the effect of tied input-output embeddings\u2014a popular technique that reduces the model size while often improving training. Interestingly, we found that this technique is connected to Harris\u2019s (1954) distributional hypothesis\u2014often portrayed by Firth\u2019s (1957) famous quote \u201ca word is characterized by the company it keeps\u201d. Specifically, our findings indicate that words (or, more broadly, symbols) with similar semantics tend to be encoded in similar input embeddings, while words that appear in similar contexts are encoded in similar output embeddings (thus explaining the semantic space arising in the input and output embeddings of foundation language models). As a consequence of these findings, the tying of the input and output embeddings is encouraged only when the distributional hypothesis holds for the underlying data. These results also provide insight into the embeddings of foundation language models (which are known to be semantically organized). 
Further, we complement the theoretical findings with several experiments supporting the claims.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Francesco Bertolotti;Walter Cazzola", "authorids": "~Francesco_Bertolotti1;~Walter_Cazzola1", "gender": "M;", "homepage": "https://f14-bertolotti.github.io/;https://cazzola.di.unimi.it", "dblp": "282/1942;88/5167", "google_scholar": "mFYoE-4AAAAJ;https://scholar.google.it/citations?user=kQ7FsKMAAAAJ", "orcid": "0000-0002-3867-6175;0000-0002-4652-8113", "linkedin": ";", "or_profile": "~Francesco_Bertolotti1;~Walter_Cazzola1", "aff": "University of Milan;University of Milan", "aff_domain": "unimi.it;unimi.it", "position": "Postdoc;Full Professor", "bibtex": "@inproceedings{\nbertolotti2024by,\ntitle={By Tying Embeddings You Are Assuming the Distributional Hypothesis},\nauthor={Francesco Bertolotti and Walter Cazzola},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yyYMAprcAR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1576804, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1267190451443294909&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "email": "unimi.it;unimi.it", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "University of Milan", "aff_unique_dep": "", "aff_unique_url": "https://www.unimi.it", "aff_unique_abbr": "UniMi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Italy" }, { "title": "Triple Changes Estimator for Targeted Policies", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32647", "id": "yzNEkTmcoF", "proceeding": "https://proceedings.mlr.press/v235/akbari24a.html", "pdf": "https://openreview.net/pdf?id=yzNEkTmcoF", "openreview": "https://openreview.net/forum?id=yzNEkTmcoF", "author_site": "Sina Akbari, Negar Kiyavash", "tldr": "", "abstract": "The renowned difference-in-differences (DiD) estimator relies on the assumption of 'parallel trends,' which may not hold in many practical applications. To address this issue, economists are increasingly considering the triple difference estimator as a more credible alternative. Both DiD and triple difference are limited to assessing average effects exclusively. An alternative avenue is offered by the changes-in-changes (CiC) estimator, which provides an estimate of the entire counterfactual distribution by relying on assumptions imposed on the distribution of potential outcomes. In this work, we extend the triple difference estimator to accommodate the CiC framework, presenting the `triple changes estimator' and its identification assumptions, thereby expanding the scope of the CiC paradigm. 
Subsequently, we empirically evaluate the proposed framework and apply it to a study examining the impact of Medicaid expansion on children's preventive care.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sina Akbari;Negar Kiyavash", "authorids": "~Sina_Akbari1;~Negar_Kiyavash1", "gender": "M;F", "homepage": "https://sinaakbarii.github.io;https://people.epfl.ch/negar.kiyavash?lang=en", "dblp": ";85/4976", "google_scholar": "-kNnS1AAAAAJ;7tBDvOwAAAAJ", "orcid": ";0000-0002-8545-7709", "linkedin": "sina-akbari/;", "or_profile": "~Sina_Akbari1;~Negar_Kiyavash1", "aff": "Swiss Federal Institute of Technology Lausanne;EPFL - EPF Lausanne", "aff_domain": "epfl.ch;epfl.ch", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nakbari2024triple,\ntitle={Triple Changes Estimator for Targeted Policies},\nauthor={Sina Akbari and Negar Kiyavash},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yzNEkTmcoF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 535088, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9472251823734917400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "epfl.ch;epfl.ch", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Swiss Federal Institute of Technology Lausanne;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.epfl.ch", "aff_unique_abbr": "EPFL;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Demystifying SGD with Doubly Stochastic Gradients", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32646", "id": "z373OXJXWU", "proceeding": "https://proceedings.mlr.press/v235/kim24r.html", "pdf": "https://openreview.net/pdf?id=z373OXJXWU", "openreview": "https://openreview.net/forum?id=z373OXJXWU", "author_site": "Kyurae Kim, Joohwan Ko, Yian Ma, Jacob Gardner", "tldr": "", "abstract": "Optimization objectives in the form of a sum of intractable expectations are rising in importance (*e.g.,* diffusion models, variational autoencoders, and many more), a setting also known as \"finite sum with infinite data.\" For these problems, a popular strategy is to employ SGD with *doubly stochastic gradients* (doubly SGD): the expectations are estimated using the gradient estimator of each component, while the sum is estimated by subsampling over these estimators. Despite its popularity, little is known about the convergence properties of doubly SGD, except under strong assumptions such as bounded variance. In this work, we establish the convergence of doubly SGD with independent minibatching and random reshuffling under general conditions, which encompass dependent component gradient estimators. In particular, for dependent estimators, our analysis allows a fine-grained analysis of the effect of correlations. As a result, under a per-iteration computational budget of $b \\times m$, where $b$ is the minibatch size and $m$ is the number of Monte Carlo samples, our analysis suggests where one should invest most of the budget in general. Furthermore, we prove that random reshuffling (RR) improves the complexity dependence on the subsampling noise.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kyurae Kim;Joohwan Ko;Yian Ma;Jacob R. 
Gardner", "authorids": "~Kyurae_Kim1;~Joohwan_Ko2;~Yian_Ma1;~Jacob_R._Gardner1", "gender": "M;M;M;M", "homepage": "https://joohwanko.com/;https://sites.google.com/view/yianma;;https://krkim.me", "dblp": "358/5976;;144/7773;322/4034", "google_scholar": ";A0TFlacAAAAJ;0gkajvEAAAAJ;pKGsQ1cAAAAJ", "orcid": ";;;0000-0003-2063-0889", "linkedin": ";;;red-portal/", "or_profile": "~Joohwan_Ko2;~Yian_Ma1;~Jacob_R_Gardner1;~Khurai_Kim1", "aff": "Korea Advanced Institute of Science & Technology;University of California, San Diego;University of Pennsylvania;University of Pennsylvania", "aff_domain": "kaist.edu;ucsd.edu;upenn.edu;seas.upenn.edu", "position": "MS student;Assistant Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nkim2024demystifying,\ntitle={Demystifying {SGD} with Doubly Stochastic Gradients},\nauthor={Kyurae Kim and Joohwan Ko and Yian Ma and Jacob R. Gardner},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=z373OXJXWU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 618553, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YVru_mKVr_gJ:scholar.google.com/&scioq=Demystifying+SGD+with+Doubly+Stochastic+Gradients&hl=en&as_sdt=0,11", "gs_version_total": 10, "email": "kaist.edu;ucsd.edu;upenn.edu;seas.upenn.edu", "author_num": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of California, San Diego;University of Pennsylvania", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.ucsd.edu;https://www.upenn.edu", "aff_unique_abbr": "KAIST;UCSD;UPenn", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Dynamic Metric Embedding into lp Space", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32645", "id": "z3PUNzdmGs", "proceeding": "https://proceedings.mlr.press/v235/banihashem24b.html", "pdf": "https://openreview.net/pdf?id=z3PUNzdmGs", "openreview": "https://openreview.net/forum?id=z3PUNzdmGs", "author_site": "Kiarash Banihashem, MohammadTaghi Hajiaghayi, Dariusz Kowalski, Jan Olkowski, Max Springer", "tldr": "", "abstract": "We give the first non-trivial decremental dynamic embedding of a weighted, undirected graph $G$ into $\\ell_p$ space. Given a weighted graph $G$ undergoing a sequence of edge weight increases, the goal of this problem is to maintain a (randomized) mapping $\\phi: (G,d) \\to (X,\\ell_p)$ from the set of vertices of the graph to the $\\ell_p$ space such that for every pair of vertices $u$ and $v$, the expected distance between $\\phi(u)$ and $\\phi(v)$ in the $\\ell_p$ metric is within a small multiplicative factor, referred to as the distortion, of their distance in $G$. Our main result is a dynamic algorithm with expected distortion $O(\\log^2 n)$ and total update time $O\\left((m^{1+o(1)} \\log^2 W + Q)\\log(nW) \\right)$, where $W$ is the maximum weight of the edges, $Q$ is the total number of updates and $n, m$ denote the number of vertices and edges in $G$ respectively. This is the first result of its kind, extending the seminal result of Bourgain '85 to the expanding field of dynamic algorithms. 
Moreover, we demonstrate that in the fully dynamic regime, where we tolerate edge insertions as well as deletions, no algorithm can explicitly maintain an embedding into $\\ell_p$ space that has a low distortion with high probability.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kiarash Banihashem;MohammadTaghi Hajiaghayi;Dariusz Rafal Kowalski;Jan Olkowski;Max Springer", "authorids": "~Kiarash_Banihashem1;~MohammadTaghi_Hajiaghayi1;~Dariusz_Rafal_Kowalski1;~Jan_Olkowski1;~Max_Springer1", "gender": "M;M;M;M;M", "homepage": ";http://www.cs.umd.edu/~hajiagha/;;https://www.cs.umd.edu/people/olkowski;https://www.maxspringer.me", "dblp": "285/5061;334/4488;43/6109;;292/2716", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=SQ1eGN4AAAAJ;https://scholar.google.com/citations?hl=en;;x9NBFhwAAAAJ", "orcid": ";0000-0003-4842-0533;0000-0002-1316-7788;;0000-0001-9291-6574", "linkedin": ";mohammad-hajiaghayi-2139a913a;;;mss423/", "or_profile": "~Kiarash_Banihashem1;~MohammadTaghi_Hajiaghayi1;~Dariusz_Rafal_Kowalski1;~Jan_Olkowski1;~Max_Springer1", "aff": "University of Maryland, College Park;University of Maryland, College Park;Augusta University;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;augusta.edu;umd.edu;umd.edu", "position": "PhD student;Full Professor;Full Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nbanihashem2024dynamic,\ntitle={Dynamic Metric Embedding into lp Space},\nauthor={Kiarash Banihashem and MohammadTaghi Hajiaghayi and Dariusz Rafal Kowalski and Jan Olkowski and Max Springer},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=z3PUNzdmGs}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 572710, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:juqJzJi29fkJ:scholar.google.com/&scioq=Dynamic+Metric+Embedding+into+lp+Space&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "umd.edu;umd.edu;augusta.edu;umd.edu;umd.edu", "author_num": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Maryland;Augusta University", "aff_unique_dep": ";", "aff_unique_url": "https://www.umd.edu;https://www.augusta.edu", "aff_unique_abbr": "UMD;AU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "DITTO: Diffusion Inference-Time T-Optimization for Music Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32644", "id": "z5Ux2u6t7U", "proceeding": "https://proceedings.mlr.press/v235/novack24a.html", "pdf": "https://openreview.net/pdf?id=z5Ux2u6t7U", "openreview": "https://openreview.net/forum?id=z5Ux2u6t7U", "author_site": "Zachary Novack, Julian McAuley, Taylor Berg-Kirkpatrick, Nicholas Bryan", "tldr": "", "abstract": "We propose Diffusion Inference-Time T-Optimization (DITTO), a general-purpose framework for controlling pre-trained text-to-music diffusion models at inference-time via optimizing initial noise latents. Our method can be used to optimize through any differentiable feature matching loss to achieve a target (stylized) output and leverages gradient checkpointing for memory efficiency. 
We demonstrate a surprisingly wide range of applications for music generation, including inpainting, outpainting, and looping, as well as intensity, melody, and musical structure control \u2013 all without ever fine-tuning the underlying model. When we compare our approach against related training, guidance, and optimization-based methods, we find DITTO achieves state-of-the-art performance on nearly all tasks, including outperforming comparable approaches on controllability, audio quality, and computational efficiency, thus opening the door for high-quality, flexible, training-free control of diffusion models. Sound examples can be found at https://ditto-music.github.io/web/.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zachary Novack;Julian McAuley;Taylor Berg-Kirkpatrick;Nicholas J. Bryan", "authorids": "~Zachary_Novack1;~Julian_McAuley1;~Taylor_Berg-Kirkpatrick1;~Nicholas_J._Bryan1", "gender": "M;M;M;M", "homepage": "https://zacharynovack.github.io/;http://cseweb.ucsd.edu/~jmcauley/;https://cseweb.ucsd.edu/~tberg/;https://njb.github.io", "dblp": "334/7662;29/3483;22/8160;07/8697", "google_scholar": "fZKJdb0AAAAJ;icbo4M0AAAAJ;mN6_BKAAAAAJ;65zgjxQAAAAJ", "orcid": ";0000-0003-0955-7588;;0000-0003-1469-7278", "linkedin": "zachary-novack/;;;", "or_profile": "~Zachary_Novack1;~Julian_McAuley1;~Taylor_Berg-Kirkpatrick1;~Nicholas_J._Bryan1", "aff": "University of California, San Diego;University of California, San Diego, University of California, San Diego;University of California, San Diego;Adobe Systems", "aff_domain": "ucsd.edu;eng.ucsd.edu;ucsd.edu;adobe.com", "position": "PhD student;Full Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nnovack2024ditto,\ntitle={{DITTO}: Diffusion Inference-Time T-Optimization for Music Generation},\nauthor={Zachary Novack and Julian McAuley and Taylor Berg-Kirkpatrick and Nicholas J. Bryan},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=z5Ux2u6t7U}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9314433, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8255528827602992760&as_sdt=80005&sciodt=0,11&hl=en", "gs_version_total": 5, "email": "ucsd.edu;eng.ucsd.edu;ucsd.edu;adobe.com", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, San Diego;Adobe", "aff_unique_dep": ";Adobe Systems Incorporated", "aff_unique_url": "https://www.ucsd.edu;https://www.adobe.com", "aff_unique_abbr": "UCSD;Adobe", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Characterizing ResNet's Universal Approximation Capability", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32643", "id": "z7zHsNFXHc", "proceeding": "https://proceedings.mlr.press/v235/liu24am.html", "pdf": "https://openreview.net/pdf?id=z7zHsNFXHc", "openreview": "https://openreview.net/forum?id=z7zHsNFXHc", "author_site": "Chenghao LIU, Enming Liang, Minghua Chen", "tldr": "", "abstract": "Since its debut in 2016, ResNet has become arguably the most favored architecture in deep neural network (DNN) design. 
It effectively addresses the gradient vanishing/exploding issue in DNN training, allowing engineers to fully unleash DNN's potential in tackling challenging problems in various domains. Despite its practical success, an essential theoretical question remains largely open: how well/best can ResNet approximate functions? In this paper, we answer this question for several important function classes, including polynomials and smooth functions. In particular, we show that ResNet with constant width can approximate a Lipschitz continuous function with Lipschitz constant $\\mu$ using $\\mathcal{O}(c(d)(\\varepsilon/\\mu)^{-d/2})$ tunable weights, where $c(d)$ is a constant depending on the input dimension $d$ and $\\varepsilon>0$ is the target approximation error. Further, we extend such a result to Lebesgue-integrable functions with the upper bound characterized by the modulus of continuity. These results indicate a factor of $d$ reduction in the number of tunable weights compared with the classical results for ReLU networks. Our results are also order-optimal in $\\varepsilon$, thus achieving the optimal approximation rate, as they match a generalized lower bound derived in this paper. This work adds to the theoretical justifications for ResNet's stellar practical performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chenghao Liu;Enming Liang;Minghua Chen", "authorids": "~Chenghao_Liu5;~Enming_Liang1;~Minghua_Chen1", "gender": "M;M;M", "homepage": ";https://emliang.github.io/;https://www.mhchen.com", "dblp": ";;12/4395-1.html", "google_scholar": ";https://scholar.google.com.sg/citations?user=Todfu6AAAAAJ;https://scholar.google.com.hk/citations?user=WzEQ9QwAAAAJ", "orcid": "0009-0001-5374-2767;;0000-0003-4763-0037", "linkedin": ";enming-liang-95b5b216a/;", "or_profile": "~Chenghao_Liu5;~Enming_Liang1;~Minghua_Chen1", "aff": "City University of Hong Kong;City University of Hong Kong;City University of Hong Kong", "aff_domain": "cityu.edu.hk;cityu.edu.hk;cityu.edu.hk", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nliu2024characterizing,\ntitle={Characterizing ResNet's Universal Approximation Capability},\nauthor={Chenghao Liu and Enming Liang and Minghua Chen},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=z7zHsNFXHc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1238348, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=280982346352879172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "cityu.edu.hk;cityu.edu.hk;cityu.edu.hk", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "City University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cityu.edu.hk", "aff_unique_abbr": "CityU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "What is Dataset Distillation Learning?", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32642", "id": "z8sYc334fU", "proceeding": "https://proceedings.mlr.press/v235/yang24am.html", "pdf": "https://openreview.net/pdf?id=z8sYc334fU", "openreview": "https://openreview.net/forum?id=z8sYc334fU", "author_site": "William Yang, Ye Zhu, Zhiwei Deng, Olga Russakovsky", "tldr": "", "abstract": "Dataset distillation has emerged as a 
strategy to overcome the hurdles associated with large datasets by learning a compact set of synthetic data that retains essential information from the original dataset. While distilled data can be used to train high-performing models, little is understood about how the information is stored. In this study, we posit and answer three questions about the behavior, representativeness, and point-wise information content of distilled data. We reveal that distilled data cannot serve as a substitute for real data during training outside the standard evaluation setting for dataset distillation. Additionally, the distillation process retains high task performance by compressing information related to the early training dynamics of real models. Finally, we provide a framework for interpreting distilled data and reveal that individual distilled data points contain meaningful semantic information. This investigation sheds light on the intricate nature of distilled data, providing a better understanding of how they can be effectively utilized.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Yang;Ye Zhu;Zhiwei Deng;Olga Russakovsky", "authorids": "~William_Yang1;~Ye_Zhu3;~Zhiwei_Deng3;~Olga_Russakovsky1", "gender": "M;F;M;F", "homepage": ";https://l-yezhu.github.io/;http://www.zhiweideng.com;http://cs.princeton.edu/~olgarus", "dblp": ";;160/3578;52/6883", "google_scholar": ";uk5WuyIAAAAJ;tWBPUHwAAAAJ;TB5OwW8AAAAJ", "orcid": ";;;0000-0001-5272-3241", "linkedin": "william-y-b218a6168/;;;", "or_profile": "~William_Yang1;~Ye_Zhu3;~Zhiwei_Deng3;~Olga_Russakovsky1", "aff": "Princeton University;Princeton University;Google Deepmind;Princeton University", "aff_domain": "princeton.edu;princeton.edu;google.com;princeton.edu", "position": "PhD student;Postdoc;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nyang2024what,\ntitle={What is Dataset Distillation Learning?},\nauthor={William Yang and Ye Zhu and Zhiwei Deng and Olga Russakovsky},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=z8sYc334fU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5452036, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15849366565602999551&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "princeton.edu;princeton.edu;google.com;princeton.edu", "author_num": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Princeton University;DeepMind", "aff_unique_dep": ";DeepMind", "aff_unique_url": "https://www.princeton.edu;https://deepmind.com", "aff_unique_abbr": "Princeton;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Overcoming the Optimizer's Curse: Obtaining Realistic Prescriptions from Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32641", "id": "zB6VQzDmK8", "proceeding": "https://proceedings.mlr.press/v235/tsiourvas24a.html", "pdf": "https://openreview.net/pdf?id=zB6VQzDmK8", "openreview": "https://openreview.net/forum?id=zB6VQzDmK8", "author_site": "Asterios Tsiourvas, Georgia Perakis", "tldr": "", "abstract": "We study the problem of obtaining optimal and realistic prescriptions when using ReLU networks for data-driven decision-making. 
In this setting, the network is used to predict a quantity of interest and then is optimized to retrieve the decisions that maximize the quantity (e.g. find the best prices that maximize revenue). However, optimizing over-parameterized models often produces unrealistic prescriptions, far from the data manifold. This phenomenon is known as the Optimizer's Curse. To tackle this problem, we model the requirement for the resulting decisions to align with the data manifold as a tractable optimization constraint. This is achieved by reformulating the highly nonlinear Local Outlier Factor (LOF) metric as a single linear or quadratic constraint. To solve the problem efficiently for large networks, we propose an adaptive sampling algorithm that reduces the initial hard-to-solve optimization problem into a small number of significantly easier-to-solve problems by restricting the decision space to realistic polytopes, i.e. polytopes of the decision space that contain at least one realistic data point. Experiments on publicly available networks demonstrate the efficacy and scalability of our approach.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Asterios Tsiourvas;Georgia Perakis", "authorids": "~Asterios_Tsiourvas1;~Georgia_Perakis1", "gender": "M;F", "homepage": "https://www.linkedin.com/in/asterios-tsiourvas/;https://mitmgmtfaculty.mit.edu/gperakis/", "dblp": ";", "google_scholar": ";SUwM5jUAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Asterios_Tsiourvas1;~Georgia_Perakis1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\ntsiourvas2024overcoming,\ntitle={Overcoming the Optimizer's Curse: Obtaining Realistic Prescriptions from Neural Networks},\nauthor={Asterios Tsiourvas and Georgia Perakis},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zB6VQzDmK8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3940456, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ccotXKnoTc4J:scholar.google.com/&scioq=Overcoming+the+Optimizer%27s+Curse:+Obtaining+Realistic+Prescriptions+from+Neural+Networks&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "mit.edu;mit.edu", "author_num": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Individual Contributions as Intrinsic Exploration Scaffolds for Multi-agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32640", "id": "zCmMkWK4Ly", "proceeding": "https://proceedings.mlr.press/v235/li24aw.html", "pdf": "https://openreview.net/pdf?id=zCmMkWK4Ly", "openreview": "https://openreview.net/forum?id=zCmMkWK4Ly", "author_site": "Xinran Li, Zifan LIU, Shibo Chen, Jun Zhang", "tldr": "", "abstract": "In multi-agent reinforcement learning (MARL), effective exploration is critical, especially in sparse reward environments. Although introducing global intrinsic rewards can foster exploration in such settings, it often complicates credit assignment among agents. 
To address this difficulty, we propose Individual Contributions as intrinsic Exploration Scaffolds (ICES), a novel approach to motivate exploration by assessing each agent's contribution from a global view. In particular, ICES constructs exploration scaffolds with Bayesian surprise, leveraging global transition information during centralized training. These scaffolds, used only in training, help to guide individual agents towards actions that significantly impact the global latent state transitions. Additionally, ICES separates exploration policies from exploitation policies, enabling the former to utilize privileged global information during training. Extensive experiments on cooperative benchmark tasks with sparse rewards, including Google Research Football (GRF) and StarCraft Multi-agent Challenge (SMAC), demonstrate that ICES exhibits superior exploration capabilities compared with baselines. The code is publicly available at https://github.com/LXXXXR/ICES.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinran Li;Zifan LIU;Shibo Chen;Jun Zhang", "authorids": "~Xinran_Li3;~Zifan_LIU1;~Shibo_Chen2;~Jun_Zhang25", "gender": "F;M;M;", "homepage": "https://lxxxxr.github.io/;https://ziffer-byakuya.github.io/zifferliu.github.io/;http://faculty.sustech.edu.cn/profiles/chensb;https://eejzhang.people.ust.hk/", "dblp": ";;;z/JunZhang4", "google_scholar": "6fYlKXgAAAAJ;;;1Is687QAAAAJ", "orcid": "0000-0003-0245-9459;0000-0001-7948-5124;0000-0001-7329-428X;0000-0002-5222-1898", "linkedin": ";;;", "or_profile": "~Xinran_Li3;~Zifan_LIU1;~Shibo_Chen2;~Jun_Zhang25", "aff": "Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;;Hong Kong University of Science and Technology", "aff_domain": "hkust.edu;hkust.edu.hk;;ust.hk", "position": "PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nli2024individual,\ntitle={Individual Contributions as Intrinsic Exploration Scaffolds for Multi-agent Reinforcement Learning},\nauthor={Xinran Li and Zifan LIU and Shibo Chen and Jun Zhang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zCmMkWK4Ly}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9622902, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2407135969953811606&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "hkust.edu;hkust.edu.hk;;ust.hk", "author_num": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On a Neural Implementation of Brenier's Polar Factorization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32639", "id": "zDCwJQY3eI", "proceeding": "https://proceedings.mlr.press/v235/vesseron24a.html", "pdf": "https://openreview.net/pdf?id=zDCwJQY3eI", "openreview": "https://openreview.net/forum?id=zDCwJQY3eI", "author_site": "Nina Vesseron, Marco Cuturi", "tldr": "", "abstract": "In 1991, Brenier proved a theorem that generalizes the polar decomposition for square matrices -- factored as PSD $\\times$ unitary -- to any vector field $F:\\mathbb{R}^d\\rightarrow \\mathbb{R}^d$. 
The theorem, known as the polar factorization theorem, states that any field $F$ can be recovered as the composition of the gradient of a convex function $u$ with a measure-preserving map $M$, namely $F=\\nabla u \\circ M$. We propose a practical implementation of this far-reaching theoretical result, and explore possible uses within machine learning. The theorem is closely related to optimal transport (OT) theory, and we borrow from recent advances in the field of neural optimal transport to parameterize the potential $u$ as an input convex neural network. The map $M$ can be either evaluated pointwise using $u^*$, the convex conjugate of $u$, through the identity $M=\\nabla u^* \\circ F$, or learned as an auxiliary network. Because $M$ is, in general, not injective, we consider the additional task of estimating the ill-posed inverse map that can approximate the pre-image measure $M^{-1}$ using a stochastic generator. We illustrate possible applications of Brenier's polar factorization to non-convex optimization problems, as well as sampling of densities that are not log-concave.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nina Vesseron;marco cuturi", "authorids": "~Nina_Vesseron1;~marco_cuturi2", "gender": "F;M", "homepage": ";http://marcocuturi.net", "dblp": "277/0850;85/5102", "google_scholar": "Vff0VnoAAAAJ;https://scholar.google.fr/citations?user=kQEydDMAAAAJ", "orcid": ";", "linkedin": "nina-vesseron-879925170/;", "or_profile": "~Nina_Vesseron1;~marco_cuturi2", "aff": "Ecole Nationale de la Statistique et de l'Administration Economique;Ensae ParisTech", "aff_domain": "ensae.fr;ensae.fr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nvesseron2024on,\ntitle={On a Neural Implementation of Brenier's Polar Factorization},\nauthor={Nina Vesseron and marco cuturi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zDCwJQY3eI}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8354494, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4353322844491522559&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "ensae.fr;ensae.fr", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Ecole Nationale de la Statistique et de l'Administration Economique;ENSAE ParisTech", "aff_unique_dep": ";", "aff_unique_url": "https://ensae.fr;https://www.ensae.fr", "aff_unique_abbr": "ENSAE;Ensae", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "SignSGD with Federated Defense: Harnessing Adversarial Attacks through Gradient Sign Decoding", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32638", "id": "zEqeNEuiJr", "proceeding": "https://proceedings.mlr.press/v235/park24h.html", "pdf": "https://openreview.net/pdf?id=zEqeNEuiJr", "openreview": "https://openreview.net/forum?id=zEqeNEuiJr", "author_site": "Chanho Park, Namyoon Lee", "tldr": "", "abstract": "Distributed learning is an effective approach to accelerate model training by using parallel computing power of multiple workers. However, substantial communication delays arise between workers and a parameter server due to the massive costs associated with communicating gradients. 
SignSGD with majority voting (signSGD-MV) is a simple yet effective optimizer that reduces communication costs through sign quantization, but its convergence rate significantly decreases when adversarial workers arbitrarily manipulate datasets or local gradient updates. In this paper, we consider a distributed learning problem where the workforce comprises a mixture of honest and adversarial workers. In this setting, we show that the convergence rate can remain invariant as long as the number of honest workers providing trustworthy local updates to the parameter server exceeds the number of adversarial workers. The key idea behind this counter-intuitive result is our novel aggregation method, signSGD with federated defense (signSGD-FD). Unlike traditional approaches, signSGD-FD utilizes the gradient information sent by adversarial workers with appropriate weights, obtained through gradient sign decoding. Experimental results demonstrate that signSGD-FD achieves superior convergence rates compared to traditional algorithms in various adversarial attack scenarios.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chanho Park;Namyoon Lee", "authorids": "~Chanho_Park1;~Namyoon_Lee1", "gender": "M;M", "homepage": ";", "dblp": ";", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;gTPowyYAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Chanho_Park1;~Namyoon_Lee1", "aff": "Pohang University of Science and Technology;Korea University", "aff_domain": "postech.edu;korea.ac.kr", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\npark2024signsgd,\ntitle={Sign{SGD} with Federated Defense: Harnessing Adversarial Attacks through Gradient Sign Decoding},\nauthor={Chanho Park and Namyoon Lee},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zEqeNEuiJr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 561281, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6685720368723477072&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "postech.edu;korea.ac.kr", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Pohang University of Science and Technology;Korea University", "aff_unique_dep": ";", "aff_unique_url": "https://www.postech.ac.kr;https://www.korea.ac.kr", "aff_unique_abbr": "POSTECH;KU", "aff_campus_unique_index": "0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Guarantees for Nonlinear Representation Learning: Non-identical Covariates, Dependent Data, Fewer Samples", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32637", "id": "zFHaB7KESM", "proceeding": "https://proceedings.mlr.press/v235/zhang24ab.html", "pdf": "https://openreview.net/pdf?id=zFHaB7KESM", "openreview": "https://openreview.net/forum?id=zFHaB7KESM", "author_site": "Thomas T. Zhang, Bruce Lee, Ingvar Ziemann, George J. Pappas, Nikolai Matni", "tldr": "", "abstract": "A driving force behind the diverse applicability of modern machine learning is the ability to extract meaningful features across many sources. 
However, many practical domains involve data that are non-identically distributed across sources, and possibly statistically dependent within each source, violating vital assumptions in existing theoretical studies of representation learning. Toward addressing these issues, we establish statistical guarantees for learning general *nonlinear* representations from multiple data sources that admit different input distributions and possibly dependent data. Specifically, we study the sample complexity of learning $T+1$ functions $f_\\star^{(t)} \\circ g_\\star$ from a function class $\\mathcal{F} \\times \\mathcal{G}$, where $f_\\star^{(t)}$ are task-specific linear functions and $g_\\star$ is a shared non-linear representation. An approximate representation $\\hat g$ is estimated using $N$ samples from each of $T$ source tasks, and a fine-tuning function $\\hat f^{(0)}$ is fit using $N'$ samples from a target task passed through $\\hat g$. Our results show that the excess risk of the estimate $\\hat f^{(0)} \\circ \\hat g$ on the target task decays as $\\tilde{\\mathcal{O}}\\Big(\\frac{\\mathrm{C}(\\mathcal{G})}{N T} + \\frac{\\text{dim}(\\mathcal{F})}{N'}\\Big)$, where $\\mathrm{C}(\\mathcal{G})$ denotes the complexity of $\\mathcal{G}$. Notably, our rates match those of the iid setting, while requiring fewer samples per task than prior analysis and admitting *no dependence on the mixing time*. We support our analysis with numerical experiments performing imitation learning over non-linear dynamical systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas TCK Zhang;Bruce D Lee;Ingvar Ziemann;George J. Pappas;Nikolai Matni", "authorids": "~Thomas_TCK_Zhang1;~Bruce_D_Lee1;~Ingvar_Ziemann1;~George_J._Pappas1;~Nikolai_Matni2", "gender": "M;M;M;M;M", "homepage": "https://brucedlee.github.io;https://www.kth.se/profile/ziemann;http://www.georgejpappas.org/;https://nikolaimatni.github.io;http://thomaszh3.github.io", "dblp": ";247/4222;p/GeorgeJPappas;52/8135;143/7488", "google_scholar": "CRNmfCAAAAAJ;https://scholar.google.se/citations?user=_RBAS2IAAAAJ;https://scholar.google.com.tw/citations?user=Kia-4B0AAAAJ;ZDPCh_EAAAAJ;0ZSqAe0AAAAJ", "orcid": ";;0000-0001-9081-0637;;", "linkedin": ";;;;", "or_profile": "~Bruce_D_Lee1;~Ingvar_Ziemann1;~George_Pappas1;~Nikolai_Matni1;~Thomas_Tian_Cheng_Kaiming_Zhang1", "aff": "University of Pennsylvania;University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;upenn.edu;seas.upenn.edu;seas.upenn.edu;seas.upenn.edu", "position": "PhD student;Postdoc;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nzhang2024guarantees,\ntitle={Guarantees for Nonlinear Representation Learning: Non-identical Covariates, Dependent Data, Fewer Samples},\nauthor={Thomas TCK Zhang and Bruce D Lee and Ingvar Ziemann and George J. 
Pappas and Nikolai Matni},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zFHaB7KESM}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 606349, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5384781563425392353&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "upenn.edu;upenn.edu;seas.upenn.edu;seas.upenn.edu;seas.upenn.edu", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Transitional Uncertainty with Layered Intermediate Predictions", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32636", "id": "zII3Olw7cr", "proceeding": "https://proceedings.mlr.press/v235/benkert24a.html", "pdf": "https://openreview.net/pdf?id=zII3Olw7cr", "openreview": "https://openreview.net/forum?id=zII3Olw7cr", "author_site": "Ryan Benkert, Mohit Prabhushankar, Ghassan AlRegib", "tldr": "", "abstract": "In this paper, we discuss feature engineering for single-pass uncertainty estimation. For accurate uncertainty estimates, neural networks must extract differences in the feature space that quantify uncertainty. This could be achieved by current single-pass approaches that maintain feature distances between data points as they traverse the network. While initial results are promising, maintaining feature distances within the network representations frequently inhibits information compression and opposes the learning objective. We study this effect theoretically and empirically to arrive at a simple conclusion: preserving feature distances in the output is beneficial when the preserved features contribute to learning the label distribution and act in opposition otherwise. We then propose Transitional Uncertainty with Layered Intermediate Predictions (TULIP) as a simple approach to address the shortcomings of current single-pass estimators. Specifically, we implement feature preservation by extracting features from intermediate representations before information is collapsed by subsequent layers. We refer to the underlying preservation mechanism as transitional feature preservation. 
We show that TULIP matches or outperforms current single-pass methods on standard benchmarks and in practical settings where these methods are less reliable (imbalances, complex architectures, medical modalities).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ryan Benkert;Mohit Prabhushankar;Ghassan AlRegib", "authorids": "~Ryan_Benkert1;~Mohit_Prabhushankar1;~Ghassan_AlRegib1", "gender": "M;M;M", "homepage": ";https://sites.google.com/view/mohit-prabhushankar;http://www.ghassanalregib.info", "dblp": ";185/7435;83/1655", "google_scholar": ";https://scholar.google.com/scholar?hl=en;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-9634-9359;0000-0002-8743-7058;", "linkedin": ";;ghassan-alregib-0602131/", "or_profile": "~Ryan_Benkert1;~Mohit_Prabhushankar1;~Ghassan_AlRegib1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nbenkert2024transitional,\ntitle={Transitional Uncertainty with Layered Intermediate Predictions},\nauthor={Ryan Benkert and Mohit Prabhushankar and Ghassan AlRegib},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zII3Olw7cr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8458259, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5191218187650326232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "email": "gatech.edu;gatech.edu;gatech.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "GALA3D: Towards Text-to-3D Complex Scene Generation via Layout-guided Generative Gaussian Splatting", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32635", "id": "zL9q2JD1dC", "proceeding": "https://proceedings.mlr.press/v235/zhou24p.html", "pdf": "https://openreview.net/pdf?id=zL9q2JD1dC", "openreview": "https://openreview.net/forum?id=zL9q2JD1dC", "author_site": "Xiaoyu Zhou, Xingjian Ran, Yajiao Xiong, Jinlin He, Zhiwei Lin, Yongtao Wang, Deqing Sun, Ming-Hsuan Yang", "tldr": "", "abstract": "We present GALA3D, generative 3D GAussians with LAyout-guided control, for effective compositional text-to-3D generation. We first utilize large language models (LLMs) to generate the initial layout and introduce a layout-guided 3D Gaussian representation for 3D content generation with adaptive geometric constraints. We then propose an instance-scene compositional optimization mechanism with conditioned diffusion to collaboratively generate realistic 3D scenes with consistent geometry, texture, scale, and accurate interactions among multiple objects while simultaneously adjusting the coarse layout priors extracted from the LLMs to align with the generated scene. Experiments show that GALA3D is a user-friendly, end-to-end framework for state-of-the-art scene-level 3D content generation and controllable editing while ensuring the high fidelity of object-level entities within the scene. 
The source codes and models will be available at gala3d.github.io.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaoyu Zhou;Xingjian Ran;Yajiao Xiong;Jinlin He;Zhiwei Lin;Yongtao Wang;Deqing Sun;Ming-Hsuan Yang", "authorids": "~Xiaoyu_Zhou1;~Xingjian_Ran1;~Yajiao_Xiong1;~Jinlin_He1;~Zhiwei_Lin1;~Yongtao_Wang1;~Deqing_Sun2;~Ming-Hsuan_Yang1", "gender": "M;M;F;;M;M;M;M", "homepage": ";;;;;https://www.icst.pku.edu.cn/xztd/1298696.htm;https://deqings.github.io/;https://faculty.ucmerced.edu/mhyang/", "dblp": ";368/5985.html;;;;48/4720;69/4250;79/3711.html", "google_scholar": "WLE4TUoAAAAJ;8AubXI4AAAAJ;;;ClU7ua0AAAAJ;Zna90HQAAAAJ;t4rgICIAAAAJ;p9-ohHsAAAAJ", "orcid": "0000-0002-6562-5752;0009-0002-4014-0705;0009-0001-9464-5314;0009-0001-7547-3692;;;;0000-0003-4848-2304", "linkedin": ";;;;;;;minghsuanyang/", "or_profile": "~Xiaoyu_Zhou1;~Xingjian_Ran1;~Yajiao_Xiong1;~Jinlin_He1;~Zhiwei_Lin1;~Yongtao_Wang1;~Deqing_Sun2;~Ming-Hsuan_Yang1", "aff": "Peking University;University of Electronic Science and Technology of China;Peking University;University of Electronic Science and Technology of China;Peking University;Peking University;Google DeepMind;University of California at Merced", "aff_domain": "pku.edu.cn;uestc.edu.cn;pku.edu.cn;uestc.edu.cn;pku.edu.cn;pku.edu.cn;google.com;umcerced.edu", "position": "PhD student;Undergrad student;Undergrad student;Undergrad student;PhD student;Associate Professor;Research Scientist;Professor", "bibtex": "@inproceedings{\nzhou2024galad,\ntitle={{GALA}3D: Towards Text-to-3D Complex Scene Generation via Layout-guided Generative Gaussian Splatting},\nauthor={Xiaoyu Zhou and Xingjian Ran and Yajiao Xiong and Jinlin He and Zhiwei Lin and Yongtao Wang and Deqing Sun and Ming-Hsuan Yang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zL9q2JD1dC}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 9363598, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16132469630794833393&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "pku.edu.cn;uestc.edu.cn;pku.edu.cn;uestc.edu.cn;pku.edu.cn;pku.edu.cn;google.com;umcerced.edu", "author_num": 8, "aff_unique_index": "0;1;0;1;0;0;2;3", "aff_unique_norm": "Peking University;University of Electronic Science and Technology of China;Google;University of California, Merced", "aff_unique_dep": ";;Google DeepMind;", "aff_unique_url": "http://www.pku.edu.cn;https://www.uestc.edu.cn;https://deepmind.com;https://www.ucmerced.edu", "aff_unique_abbr": "Peking U;UESTC;DeepMind;UC Merced", "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0;0;0;0;1;2", "aff_country_unique": "China;United Kingdom;United States" }, { "title": "A Rate-Distortion View of Uncertainty Quantification", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32634", "id": "zMGUDsPopK", "proceeding": "https://proceedings.mlr.press/v235/apostolopoulou24a.html", "pdf": "https://openreview.net/pdf?id=zMGUDsPopK", "openreview": "https://openreview.net/forum?id=zMGUDsPopK", "author_site": "Ifigeneia Apostolopoulou, Benjamin Eysenbach, Frank Nielsen, Artur Dubrawski", "tldr": "", "abstract": "In supervised learning, understanding an input\u2019s proximity to the training data can help a model decide whether it has sufficient evidence for reaching a reliable prediction. 
While powerful probabilistic models such as Gaussian Processes naturally have this property, deep neural networks often lack it. In this paper, we introduce Distance Aware Bottleneck (DAB), i.e., a new method for enriching deep neural networks with this property. Building on prior information bottleneck approaches, our method learns a codebook that stores a compressed representation of all inputs seen during training. The distance of a new example from this codebook can serve as an uncertainty estimate for the example. The resulting model is simple to train and provides deterministic uncertainty estimates by a single forward pass. Finally, our method achieves better out-of-distribution (OOD) detection and misclassification prediction than prior methods, including expensive ensemble methods, deep kernel Gaussian Processes, and approaches based on the standard information bottleneck.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ifigeneia Apostolopoulou;Benjamin Eysenbach;Frank Nielsen;Artur Dubrawski", "authorids": "~Ifigeneia_Apostolopoulou1;~Benjamin_Eysenbach1;~Frank_Nielsen1;~Artur_Dubrawski2", "gender": ";M;M;M", "homepage": ";https://ben-eysenbach.github.io/;https://franknielsen.github.io/;https://www.autonlab.org", "dblp": "145/9415.html;192/1863;http://dblp.uni-trier.de/pers/hd/n/Nielsen:Frank;76/48", "google_scholar": "xiJGHuwAAAAJ;DRnOvU8AAAAJ;c-cuO9cAAAAJ;O3gezzcAAAAJ", "orcid": ";0009-0000-7136-6307;0000-0001-5728-0726;0000-0002-2372-0831", "linkedin": ";benjamin-eysenbach-a7235775/;;artur-dubrawski-33a2a87/", "or_profile": "~Ifigeneia_Apostolopoulou1;~Benjamin_Eysenbach1;~Frank_Nielsen1;~Artur_Dubrawski2", "aff": "Carnegie Mellon University;Princeton University;Sony Computer Science Laboratories Inc (Tokyo);Carnegie Mellon University", "aff_domain": "cmu.edu;princeton.edu;sonycsl.co.jp;cmu.edu", "position": "PhD student;Assistant Professor;Fellow;Research Professor", "bibtex": "@inproceedings{\napostolopoulou2024a,\ntitle={A Rate-Distortion View of Uncertainty Quantification},\nauthor={Ifigeneia Apostolopoulou and Benjamin Eysenbach and Frank Nielsen and Artur Dubrawski},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zMGUDsPopK}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 825747, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11287082915133960597&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9, "email": "cmu.edu;princeton.edu;sonycsl.co.jp;cmu.edu", "author_num": 4, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;Princeton University;Sony Computer Science Laboratories Inc", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.princeton.edu;https://www.sony.net/", "aff_unique_abbr": "CMU;Princeton;Sony CSL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Japan" }, { "title": "Impact of Decentralized Learning on Player Utilities in Stackelberg Games", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32633", "id": "zMsMQJraEj", "proceeding": "https://proceedings.mlr.press/v235/donahue24a.html", "pdf": "https://openreview.net/pdf?id=zMsMQJraEj", "openreview": "https://openreview.net/forum?id=zMsMQJraEj", "author_site": "Kate Donahue, Nicole Immorlica, Meena Jagadeesan, Brendan 
Lucier, Alex Slivkins", "tldr": "", "abstract": "When deployed in the world, a learning agent such as a recommender system or a chatbot often repeatedly interacts with another learning agent (such as a user) over time. In many such two-agent systems, each agent learns separately and the rewards of the two agents are not perfectly aligned. To better understand such cases, we examine the learning dynamics of the two-agent system and the implications for each agent's objective. We model these systems as Stackelberg games with decentralized learning and show that standard regret benchmarks (such as Stackelberg equilibrium payoffs) result in worst-case linear regret for at least one player. To better capture these systems, we construct a relaxed regret benchmark that is tolerant to small learning errors by agents. We show that standard learning algorithms fail to provide sublinear regret, and we develop algorithms to achieve near-optimal $\\mathcal{O}(T^{2/3})$ regret for both players with respect to these benchmarks. We further design relaxed environments under which faster learning ($\\mathcal{O}(\\sqrt{T})$) is possible. Altogether, our results take a step towards assessing how two-agent interactions in sequential and decentralized learning environments affect the utility of both agents.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kate Donahue;Nicole Immorlica;Meena Jagadeesan;Brendan Lucier;Aleksandrs Slivkins", "authorids": "~Kate_Donahue1;~Nicole_Immorlica3;~Meena_Jagadeesan1;~Brendan_Lucier1;~Aleksandrs_Slivkins1", "gender": "F;;F;;M", "homepage": "https://www.katedonahue.me/;https://immorlica.com/;https://mjagadeesan.github.io;;https://www.microsoft.com/en-us/research/people/slivkins/", "dblp": "243/3358;43/3631;205/2407;09/6191;91/4014", "google_scholar": "c9SPOdwAAAAJ;;XW62DrcAAAAJ;;f2x233wAAAAJ", "orcid": ";;;;", "linkedin": "kate-donahue-795bb162/;;;;", "or_profile": "~Kate_Donahue1;~Nicole_Immorlica3;~Meena_Jagadeesan1;~Brendan_Lucier1;~Aleksandrs_Slivkins1", "aff": "Cornell University;Microsoft;University of California, Berkeley;Microsoft;Microsoft", "aff_domain": "cornell.edu;microsoft.com;berkeley.edu;microsoft.com;microsoft.com", "position": "PhD student;Principal Researcher;PhD student;Researcher;Researcher", "bibtex": "@inproceedings{\ndonahue2024impact,\ntitle={Impact of Decentralized Learning on Player Utilities in Stackelberg Games},\nauthor={Kate Donahue and Nicole Immorlica and Meena Jagadeesan and Brendan Lucier and Aleksandrs Slivkins},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zMsMQJraEj}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 661580, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10769796445061011183&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cornell.edu;microsoft.com;berkeley.edu;microsoft.com;microsoft.com", "author_num": 5, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Cornell University;Microsoft;University of California, Berkeley", "aff_unique_dep": ";Microsoft Corporation;", "aff_unique_url": "https://www.cornell.edu;https://www.microsoft.com;https://www.berkeley.edu", "aff_unique_abbr": "Cornell;Microsoft;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deep Networks 
Always Grok and Here is Why", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32632", "id": "zMue490KMr", "proceeding": "https://proceedings.mlr.press/v235/humayun24a.html", "pdf": "https://openreview.net/pdf?id=zMue490KMr", "openreview": "https://openreview.net/forum?id=zMue490KMr", "author_site": "Ahmed Imtiaz Humayun, Randall Balestriero, Richard Baraniuk", "tldr": "", "abstract": "Grokking, or delayed generalization, is a phenomenon where generalization in a deep neural network (DNN) occurs long after achieving near zero training error. Previous studies have reported the occurrence of grokking in specific controlled settings, such as DNNs initialized with large-norm parameters or transformers trained on algorithmic datasets. We demonstrate that grokking is actually much more widespread and materializes in a wide range of practical settings, such as training of a convolutional neural network (CNN) on CIFAR10 or a Resnet on Imagenette. We introduce the new concept of delayed robustness, whereby a DNN groks adversarial examples and becomes robust, long after interpolation and/or generalization. We develop an analytical explanation for the emergence of both delayed generalization and delayed robustness based on the local complexity of a DNN's input-output mapping. Our local complexity measures the density of so-called ``linear regions\u2019\u2019 (aka, spline partition regions) that tile the DNN input space and serves as a utile progress measure for training. We provide the first evidence that, for classification problems, the linear regions undergo a phase transition during training whereafter they migrate away from the training samples (making the DNN mapping smoother there) and towards the decision boundary (making the DNN mapping less smooth there). Grokking occurs post phase transition as a robust partition of the input space thanks to the linearization of the DNN mapping around the training points. Web: https://bit.ly/grok-adversarial.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ahmed Imtiaz Humayun;Randall Balestriero;Richard Baraniuk", "authorids": "~Ahmed_Imtiaz_Humayun1;~Randall_Balestriero1;~Richard_Baraniuk1", "gender": "M;M;", "homepage": "https://imtiazhumayun.github.io;https://randallbalestriero.github.io/;http://richb.rice.edu/", "dblp": "222/1771;175/5364;32/2804", "google_scholar": "wJ2HUn4AAAAJ;S1x_xqcAAAAJ;https://scholar.google.com.tw/citations?user=N-BBA20AAAAJ", "orcid": ";;", "linkedin": ";randallbalestriero/;richard-baraniuk", "or_profile": "~Ahmed_Imtiaz_Humayun1;~Randall_Balestriero1;~Richard_Baraniuk1", "aff": "Google;Citadel;William Marsh Rice University", "aff_domain": "google.com;citadel.com;rice.edu", "position": "Student Researcher;Researcher;C. 
Sidney Burrus Professor", "bibtex": "@inproceedings{\nhumayun2024deep,\ntitle={Deep Networks Always Grok and Here is Why},\nauthor={Ahmed Imtiaz Humayun and Randall Balestriero and Richard Baraniuk},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zMue490KMr}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6917634, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17993880889403352488&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "google.com;citadel.com;rice.edu", "author_num": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;Citadel;Rice University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.citadel.edu;https://www.rice.edu", "aff_unique_abbr": "Google;Citadel;Rice", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Agent Instructs Large Language Models to be General Zero-Shot Reasoners", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32631", "id": "zMwFvxr6CV", "proceeding": "https://proceedings.mlr.press/v235/crispino24a.html", "pdf": "https://openreview.net/pdf?id=zMwFvxr6CV", "openreview": "https://openreview.net/forum?id=zMwFvxr6CV", "author_site": "Nicholas Crispino, Kyle Montgomery, Fankun Zeng, Dawn Song, Chenguang Wang", "tldr": "", "abstract": "We introduce a method to improve the zero-shot reasoning abilities of large language models on general language understanding tasks. Specifically, we build an autonomous agent to instruct the reasoning process of large language models. To enable this, our agent only needs to generate a single set of instructions for each task. These instructions turn out to be extremely effective for improving the reasoning process of different large language models across all task instances. We show this approach further unleashes the zero-shot reasoning abilities of large language models to more tasks. We study the performance of our method on a wide set of datasets spanning generation, classification, and reasoning. We show that our method generalizes to most tasks and obtains state-of-the-art zero-shot performance on 20 of the 29 datasets that we evaluate. For instance, our method boosts the performance of state-of-the-art large language models by a large margin, including Vicuna-13b, Llama-2-70b-chat, and GPT-3.5 Turbo. Compared to zero-shot chain of thought, our improvement in reasoning is striking. 
With our method, Llama-2-70b-chat outperforms zero-shot GPT-3.5 Turbo significantly.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicholas Crispino;Kyle Montgomery;Fankun Zeng;Dawn Song;Chenguang Wang", "authorids": "~Nicholas_Crispino1;~Kyle_Montgomery1;~Fankun_Zeng1;~Dawn_Song1;~Chenguang_Wang1", "gender": "M;M;F;M;", "homepage": "https://kylemontgomery1.github.io/;https://zengfankun.com/;;https://cgraywang.github.io/;https://github.com/ncrispino", "dblp": "261/4798;;s/DXSong;62/3432-1.html;", "google_scholar": "O8tnCagAAAAJ;;;hsZ2aj0AAAAJ;", "orcid": "0009-0004-0563-347X;0009-0007-0625-0499;;;", "linkedin": "kyle-montgomery-/;fankun-zeng;;;", "or_profile": "~Kyle_Montgomery1;~Fankun_Zeng1;~Dawn_Song1;~Chenguang_Wang1;~Nicholas_R_Crispino1", "aff": "Washington University, Saint Louis;;University of California, Berkeley;Washington University, Saint Louis;Washington University, Saint Louis", "aff_domain": "wustl.edu;;berkeley.edu;wustl.edu;cse.wustl.edu", "position": "MS student;;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\ncrispino2024agent,\ntitle={Agent Instructs Large Language Models to be General Zero-Shot Reasoners},\nauthor={Nicholas Crispino and Kyle Montgomery and Fankun Zeng and Dawn Song and Chenguang Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zMwFvxr6CV}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6903856, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3791774223503853880&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "email": "wustl.edu;;berkeley.edu;wustl.edu;cse.wustl.edu", "author_num": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Washington University in St. Louis;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://wustl.edu;https://www.berkeley.edu", "aff_unique_abbr": "WUSTL;UC Berkeley", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Saint Louis;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Homomorphism Counts for Graph Neural Networks: All About That Basis", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32630", "id": "zRrzSLwNHQ", "proceeding": "https://proceedings.mlr.press/v235/jin24a.html", "pdf": "https://openreview.net/pdf?id=zRrzSLwNHQ", "openreview": "https://openreview.net/forum?id=zRrzSLwNHQ", "author_site": "Emily Jin, Michael Bronstein, Ismail Ceylan, Matthias Lanzinger", "tldr": "", "abstract": "A large body of work has investigated the properties of graph neural networks and identified several limitations, particularly pertaining to their expressive power. Their inability to count certain *patterns* (e.g., cycles) in a graph lies at the heart of such limitations, since many functions to be learned rely on the ability of counting such patterns. Two prominent paradigms aim to address this limitation by enriching the graph features with *subgraph* or *homomorphism* pattern counts. In this work, we show that both of these approaches are sub-optimal in a certain sense and argue for a more *fine-grained* approach, which incorporates the homomorphism counts of *all* structures in the ``basis'' of the target pattern. 
This yields strictly more expressive architectures without incurring any additional overhead in terms of computational complexity compared to existing approaches. We prove a series of theoretical results on node-level and graph-level *motif parameters* and empirically validate them on standard benchmark datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Emily Jin;Michael M. Bronstein;Ismail Ilkan Ceylan;Matthias Lanzinger", "authorids": "~Emily_Jin2;~Michael_M._Bronstein1;~Ismail_Ilkan_Ceylan2;~Matthias_Lanzinger1", "gender": "F;M;;M", "homepage": "https://www.cs.ox.ac.uk/people/emily.jin/;http://www.inf.usi.ch/bronstein/;https://www.cs.ox.ac.uk/people/ismaililkan.ceylan/;https://www.cs.ox.ac.uk/people/matthias.lanzinger/", "dblp": "346/1033;07/2668;147/6111;232/1851", "google_scholar": "BI4R8j4AAAAJ;UU3N6-UAAAAJ;avJ5kQcAAAAJ;XAR1lVUAAAAJ", "orcid": ";;0000-0003-4118-4689;0000-0002-7601-3727", "linkedin": ";mbronstein/;;matthias-lanzinger/", "or_profile": "~Emily_Jin2;~Michael_M._Bronstein1;~Ismail_Ilkan_Ceylan2;~Matthias_Lanzinger1", "aff": "University of Oxford;University of Oxford;University of Oxford;Technische Universit\u00e4t Wien", "aff_domain": "cs.ox.ac.uk;ox.ac.uk;oxford.ac.uk;tuwien.ac.at", "position": "PhD student;Full Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njin2024homomorphism,\ntitle={Homomorphism Counts for Graph Neural Networks: All About That Basis},\nauthor={Emily Jin and Michael M. Bronstein and Ismail Ilkan Ceylan and Matthias Lanzinger},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zRrzSLwNHQ}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 630449, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 4, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13337717117272743674&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "cs.ox.ac.uk;ox.ac.uk;oxford.ac.uk;tuwien.ac.at", "author_num": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Oxford;Technische Universit\u00e4t Wien", "aff_unique_dep": ";", "aff_unique_url": "https://www.ox.ac.uk;https://www.tuwien.ac.at", "aff_unique_abbr": "Oxford;TU Wien", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United Kingdom;Austria" }, { "title": "DSD-DA: Distillation-based Source Debiasing for Domain Adaptive Object Detection", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32629", "id": "zS8zUuAU8T", "proceeding": "https://proceedings.mlr.press/v235/feng24d.html", "pdf": "https://openreview.net/pdf?id=zS8zUuAU8T", "openreview": "https://openreview.net/forum?id=zS8zUuAU8T", "author_site": "Yongchao Feng, Shiwei Li, Yingjie Gao, Ziyue Huang, Yanan Zhang, Qingjie Liu, Yunhong Wang", "tldr": "", "abstract": "Though feature-alignment based Domain Adaptive Object Detection (DAOD) methods have achieved remarkable progress, they ignore the source bias issue, i.e., the detector tends to acquire more source-specific knowledge, impeding its generalization capabilities in the target domain. Furthermore, these methods face a more formidable challenge in achieving consistent classification and localization in the target domain compared to the source domain. 
To overcome these challenges, we propose a novel Distillation-based Source Debiasing (DSD) framework for DAOD, which can distill domain-agnostic knowledge from a pre-trained teacher model, improving the detector's performance on both domains. In addition, we design a Target-Relevant Object Localization Network (TROLN), which can mine target-related localization information from source and target-style mixed data. Accordingly, we present a Domain-aware Consistency Enhancing (DCE) strategy, in which this information is formulated into a new localization representation to further refine classification scores in the testing stage, achieving a harmonization between classification and localization. Extensive experiments have been conducted to demonstrate the effectiveness of this method, which consistently improves the strong baseline by large margins, outperforming existing alignment-based works.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongchao Feng;Shiwei Li;Yingjie Gao;Ziyue Huang;Yanan Zhang;Qingjie Liu;Yunhong Wang", "authorids": "~Yongchao_Feng1;~Shiwei_Li5;~Yingjie_Gao1;~Ziyue_Huang2;~Yanan_Zhang4;~Qingjie_Liu2;~Yunhong_Wang1", "gender": "M;M;M;M;M;;", "homepage": ";;;https://scholar.google.com/citations?view_op=list_works&hl=en&user=UCfUofcAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN&pli=1&user=0_nun80AAAAJ;;", "dblp": "235/4366;;;;20/8874-5;;", "google_scholar": "ODShuI0AAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com.hk/citations?hl=zh-CN;;", "orcid": ";0000-0002-8765-9283;;;0000-0003-1592-1067;;", "linkedin": ";;;;;;", "or_profile": "~Yongchao_Feng1;~Shiwei_Li5;~Yingjie_Gao1;~Ziyue_Huang2;~Yanan_Zhang4;~Qingjie_Liu2;~Yunhong_Wang1", "aff": "Beihang University ;Beihang University;Beihang University;Beihang University;Beihang University;;", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;;", "position": "PhD student;Postdoc;PhD student;PhD student;PhD student;;", "bibtex": "@inproceedings{\nfeng2024dsdda,\ntitle={{DSD}-{DA}: Distillation-based Source Debiasing for Domain Adaptive Object Detection},\nauthor={Yongchao Feng and Shiwei Li and Yingjie Gao and Ziyue Huang and Yanan Zhang and Qingjie Liu and Yunhong Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zS8zUuAU8T}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5139284, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 7, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8432856833839950579&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;;", "author_num": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Handling Heterogeneous Curvatures in Bandit LQR Control", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32628", "id": "zWIS8I9G9B", "proceeding": "https://proceedings.mlr.press/v235/yan24f.html", "pdf": "https://openreview.net/pdf?id=zWIS8I9G9B", "openreview": "https://openreview.net/forum?id=zWIS8I9G9B", 
"author_site": "Yu-Hu Yan, Jing Wang, Peng Zhao", "tldr": "", "abstract": "We investigate online Linear Quadratic Regulator (LQR) with bandit feedback and semi-adversarial disturbances. Previous works assume costs with *homogeneous* curvatures (i.e., with a uniform strong convexity lower bound), which can be hard to satisfy in many real scenarios and prohibits adapting to true curvatures for better performance. In this paper, we initiate the study of bandit LQR control with *heterogeneous* cost curvatures, aiming to strengthen the algorithm's adaptivity. To achieve this, we reduce the problem to bandit convex optimization with memory via a ``with-history'' reduction to avoid hard-to-control truncation errors. Then we provide a novel analysis for an important *stability* term that appeared in both regret and memory, using *Newton decrement* developed in interior-point methods. The analysis enables us to guarantee memory-related terms introduced in the reduction and also provide a simplified analysis for handling heterogeneous curvatures in bandit convex optimization. Finally, we achieve interpolated guarantees that can not only recover existing bounds for convex and quadratic costs but also attain new implications for cases of corrupted and decaying quadraticity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu-Hu Yan;Jing Wang;Peng Zhao", "authorids": "~Yu-Hu_Yan1;~Jing_Wang32;~Peng_Zhao1", "gender": "M;M;", "homepage": "https://www.lamda.nju.edu.cn/yanyh;http://www.lamda.nju.edu.cn/wangjing/;", "dblp": "271/0054;02/736;", "google_scholar": "NdaoylQAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yu-Hu_Yan1;~Jing_Wang32;~Peng_Zhao1", "aff": "Nanjing University;Nanjing University;", "aff_domain": "nju.edu.cn;nju.edu.cn;", "position": "PhD student;PhD student;", "bibtex": "@inproceedings{\nyan2024handling,\ntitle={Handling Heterogeneous Curvatures in Bandit {LQR} Control},\nauthor={Yu-Hu Yan and Jing Wang and Peng Zhao},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zWIS8I9G9B}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 483488, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13054636812618260746&as_sdt=80000005&sciodt=0,23&hl=en", "gs_version_total": 7, "email": "nju.edu.cn;nju.edu.cn;", "author_num": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "", "aff_unique_url": "https://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "A Distributional Analogue to the Successor Representation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32627", "id": "zajsXCxMgW", "proceeding": "https://proceedings.mlr.press/v235/wiltzer24a.html", "pdf": "https://openreview.net/pdf?id=zajsXCxMgW", "openreview": "https://openreview.net/forum?id=zajsXCxMgW", "author_site": "Harley Wiltzer, Jesse Farebrother, Arthur Gretton, Yunhao Tang, Andre Barreto, Will Dabney, Marc Bellemare, Mark Rowland", "tldr": "", "abstract": "This paper contributes a new approach for distributional reinforcement learning which elucidates a clean separation of transition structure and reward in the learning process. 
Analogous to how the successor representation (SR) describes the expected consequences of behaving according to a given policy, our distributional successor measure (SM) describes the distributional consequences of this behaviour. We formulate the distributional SM as a distribution over distributions and provide theory connecting it with distributional and model-based reinforcement learning. Moreover, we propose an algorithm that learns the distributional SM from data by minimizing a two-level maximum mean discrepancy. Key to our method are a number of algorithmic techniques that are independently valuable for learning generative models of state. As an illustration of the usefulness of the distributional SM, we show that it enables zero-shot risk-sensitive policy evaluation in a way that was not previously possible.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harley Wiltzer;Jesse Farebrother;Arthur Gretton;Yunhao Tang;Andre Barreto;Will Dabney;Marc G Bellemare;Mark Rowland", "authorids": "~Harley_Wiltzer1;~Jesse_Farebrother1;~Arthur_Gretton1;~Yunhao_Tang1;~Andre_Barreto1;~Will_Dabney1;~Marc_G_Bellemare1;~Mark_Rowland1", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://harwiltz.github.io/about;https://brosa.ca;http://www.gatsby.ucl.ac.uk/~gretton/;https://robintyh1.github.io;https://sites.google.com/corp/view/andrebarreto/about;;http://www.marcgbellemare.info;http://sites.google.com/view/markrowland", "dblp": "321/0992;228/6862;56/2574;210/2229;72/953;https://dblp.uni-trier.de/pers/hd/d/Dabney:Will;38/4525;86/4090", "google_scholar": ";cA12XHcAAAAJ;OUv7J6QAAAAJ;;https://scholar.google.co.uk/citations?user=H-xtdV4AAAAJ;https://scholar.google.co.uk/citations?user=dR-7QW8AAAAJ;https://scholar.google.co.uk/citations?user=uyYPun0AAAAJ;https://scholar.google.co.uk/citations?user=-0U84zMAAAAJ", "orcid": ";0000-0002-5178-4947;;;;;;", "linkedin": "harley-wiltzer-4998547a;jessefarebro/;;;;;;", "or_profile": "~Harley_Wiltzer1;~Jesse_Farebrother1;~Arthur_Gretton1;~Yunhao_Tang1;~Andre_Barreto1;~Will_Dabney1;~Marc_G_Bellemare1;~Mark_Rowland1", "aff": "Mila;Google DeepMind;University College London;Google DeepMind;Google DeepMind;Google DeepMind;Google;Google DeepMind", "aff_domain": "mila.quebec;google.com;ucl.ac.uk;deepmind.com;google.com;google.com;google.com;google.com", "position": "PhD student;Student Researcher;Professor;Research Scientist;Research Scientist;Research Scientist;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nwiltzer2024a,\ntitle={A Distributional Analogue to the Successor Representation},\nauthor={Harley Wiltzer and Jesse Farebrother and Arthur Gretton and Yunhao Tang and Andre Barreto and Will Dabney and Marc G Bellemare and Mark Rowland},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zajsXCxMgW}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1441976, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 8, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9561131691410033906&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 7, "email": "mila.quebec;google.com;ucl.ac.uk;deepmind.com;google.com;google.com;google.com;google.com", "author_num": 8, "aff_unique_index": "0;1;2;1;1;1;1;1", "aff_unique_norm": "Mila;Google;University College London", "aff_unique_dep": "Quebec Artificial Intelligence Institute;Google DeepMind;", "aff_unique_url": 
"https://mila.quebec;https://deepmind.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Mila;DeepMind;UCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1;1;2;1", "aff_country_unique": "Canada;United Kingdom;United States" }, { "title": "Contrastive Predict-and-Search for Mixed Integer Linear Programs", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32626", "id": "zatLnLvbs8", "proceeding": "https://proceedings.mlr.press/v235/huang24f.html", "pdf": "https://openreview.net/pdf?id=zatLnLvbs8", "openreview": "https://openreview.net/forum?id=zatLnLvbs8", "author_site": "Taoan Huang, Aaron Ferber, Arman Zharmagambetov, Yuandong Tian, Bistra Dilkina", "tldr": "", "abstract": "Mixed integer linear programs (MILP) are flexible and powerful tools for modeling and solving many difficult real-world combinatorial optimization problems. In this paper, we propose a novel machine learning (ML)-based framework ConPaS that learns to predict solutions to MILPs with contrastive learning. For training, we collect high-quality solutions as positive samples. We also collect low-quality or infeasible solutions as negative samples using novel optimization-based or sampling approaches. We then learn to make discriminative predictions by contrasting the positive and negative samples. During testing, we predict and fix the assignments for a subset of integer variables and then solve the resulting reduced MILP to find high-quality solutions. Empirically, ConPaS achieves state-of-the-art results compared to other ML-based approaches in terms of the quality of and the speed at which solutions are found.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taoan Huang;Aaron M Ferber;Arman Zharmagambetov;Yuandong Tian;Bistra Dilkina", "authorids": "~Taoan_Huang2;~Aaron_M_Ferber1;~Arman_Zharmagambetov1;~Yuandong_Tian1;~Bistra_Dilkina2", "gender": "M;M;M;M;F", "homepage": ";https://aaron-ferber.github.io/;https://arman-z.github.io/;http://yuandong-tian.com;", "dblp": "241/7690;163/7788;252/5004;t/YuandongTian;30/5718", "google_scholar": ";TuVq07oAAAAJ;D6QocXMAAAAJ;0mgEF28AAAAJ;1jjyaBYAAAAJ", "orcid": ";;;0000-0003-4202-4847;0000-0002-6784-473X", "linkedin": ";aaron-ferber-64a73980/;;yuandongtian;", "or_profile": "~Taoan_Huang2;~Aaron_M_Ferber1;~Arman_Zharmagambetov1;~Yuandong_Tian1;~Bistra_Dilkina2", "aff": "University of Southern California;Cornell University;Meta AI (FAIR);Meta AI (FAIR);University of Southern California", "aff_domain": "usc.edu;cornell.edu;meta.com;meta.com;usc.edu", "position": "PhD student;Postdoc;Postdoc;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nhuang2024contrastive,\ntitle={Contrastive Predict-and-Search for Mixed Integer Linear Programs},\nauthor={Taoan Huang and Aaron M Ferber and Arman Zharmagambetov and Yuandong Tian and Bistra Dilkina},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zatLnLvbs8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 887631, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13697986095892352198&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "email": "usc.edu;cornell.edu;meta.com;meta.com;usc.edu", "author_num": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Southern California;Cornell University;Meta", 
"aff_unique_dep": ";;Facebook AI Research (FAIR)", "aff_unique_url": "https://www.usc.edu;https://www.cornell.edu;https://ai.facebook.com", "aff_unique_abbr": "USC;Cornell;Meta AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Differentially Private Sum-Product Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32625", "id": "zc3bAEI5lp", "proceeding": "https://proceedings.mlr.press/v235/heilmann24a.html", "pdf": "https://openreview.net/pdf?id=zc3bAEI5lp", "openreview": "https://openreview.net/forum?id=zc3bAEI5lp", "author_site": "Xenia Heilmann, Mattia Cerrato, Ernst Althaus", "tldr": "", "abstract": "Differentially private ML approaches seek to learn models which may be publicly released while guaranteeing that the input data is kept private. One issue with this construction is that further model releases based on the same training data (e.g. for a new task) incur a further privacy budget cost. Privacy-preserving synthetic data generation is one possible solution to this conundrum. However, models trained on synthetic private data struggle to approach the performance of private, ad-hoc models. In this paper, we present a novel method based on sum-product networks that is able to perform both privacy-preserving classification and privacy-preserving data generation with a single model. To the best of our knowledge, ours is the first approach that provides both discriminative and generative capabilities to differentially private ML. We show that our approach outperforms the state of the art in terms of stability (i.e. number of training runs required for convergence) and utility of the generated data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xenia Heilmann;Mattia Cerrato;Ernst Althaus", "authorids": "~Xenia_Heilmann1;~Mattia_Cerrato1;ernst.althaus@uni-mainz.de", "gender": "F;M;", "homepage": "https://www.algorithmics.informatik.uni-mainz.de/people/xenia-heilmann/;https://github.com/Pibborn;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xenia_Heilmann1;~Mattia_Cerrato1;ernst.althaus@uni-mainz.de", "aff": "Johannes-Gutenberg Universit\u00e4t Mainz;;", "aff_domain": "uni-mainz.de;;", "position": "PhD student;;", "bibtex": "@inproceedings{\nheilmann2024differentially,\ntitle={Differentially Private Sum-Product Networks},\nauthor={Xenia Heilmann and Mattia Cerrato and Ernst Althaus},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zc3bAEI5lp}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1226930, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:F_a8IPev8xsJ:scholar.google.com/&scioq=Differentially+Private+Sum-Product+Networks&hl=en&as_sdt=0,44", "gs_version_total": 4, "email": "uni-mainz.de;;", "author_num": 3, "aff_unique_index": "0", "aff_unique_norm": "Johannes Gutenberg University Mainz", "aff_unique_dep": "", "aff_unique_url": "https://www.jgu.de", "aff_unique_abbr": "JGU", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "ODIN: Disentangled Reward Mitigates Hacking in RLHF", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32624", "id": "zcIV8OQFVF", "proceeding": 
"https://proceedings.mlr.press/v235/chen24bn.html", "pdf": "https://openreview.net/pdf?id=zcIV8OQFVF", "openreview": "https://openreview.net/forum?id=zcIV8OQFVF", "author_site": "Lichang Chen, Chen Zhu, Jiuhai Chen, Davit Soselia, Tianyi Zhou, Tom Goldstein, Heng Huang, Mohammad Shoeybi, Bryan Catanzaro", "tldr": "", "abstract": "In this work, we study the issue of reward hacking on the response length, a challenge emerging in Reinforcement Learning from Human Feedback (RLHF) on LLMs. A well-formatted, verbose but less helpful response from the LLMs can often deceive LLMs or even human evaluators and achieve high scores. The same issue also holds for some reward models in RL. To address the challenges in both training and evaluation, we establish a more reliable evaluation protocol for comparing different training configurations, which inspects the trade-off between LLM evaluation score and response length obtained by varying training hyperparameters. Based on this evaluation, we conduct large-scale studies, where the results shed insights into the efficacy of hyperparameters and tricks used in RL on mitigating length bias. We further propose to improve the reward model by jointly training two linear heads to predict the preference, one trained to correlate with length and the other trained to decorrelate with length and therefore focusing more on the actual content. We then discard the length head in RL to ignore the spurious length reward. Experiments demonstrate that our approach eliminates the reward correlation with length, and improves the obtained policy by a significant margin.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lichang Chen;Chen Zhu;Jiuhai Chen;Davit Soselia;Tianyi Zhou;Tom Goldstein;Heng Huang;Mohammad Shoeybi;Bryan Catanzaro", "authorids": "~Lichang_Chen2;~Chen_Zhu2;~Jiuhai_Chen1;~Davit_Soselia1;~Tianyi_Zhou1;~Tom_Goldstein1;~Heng_Huang1;~Mohammad_Shoeybi1;~Bryan_Catanzaro1", "gender": "M;M;M;M;M;M;M;M;M", "homepage": "http://www.cs.umd.edu/~chenzhu/;https://www.linkedin.com/in/jiuhai-chen-6a486715a/;https://davitsoselia.com/;https://tianyizhou.github.io/;https://www.cs.umd.edu/~tomg/;https://www.cs.umd.edu/~heng/;;https://ctnzr.io;", "dblp": "59/10522-1.html;;232/2103;88/8205-1;25/8184;03/281;53/9742;14/4826;151/6212", "google_scholar": "m-om5O8AAAAJ;;rHit2vIAAAAJ;OKvgizMAAAAJ;KmSuVtgAAAAJ;4OqLaDwAAAAJ;62ElavIAAAAJ;UZ6kI2AAAAAJ;https://scholar.google.com/citations?hl=zh-CN", "orcid": ";;0000-0002-0500-7442;0000-0001-5348-0632;;;;0000-0003-0034-7728;", "linkedin": ";;davitsoselia/;tianyizhou;;;shoeybi/;bryancatanzaro/;lichang-chen-b7a506173/", "or_profile": "~Chen_Zhu2;~Jiuhai_Chen1;~Davit_Soselia1;~Tianyi_Zhou1;~Tom_Goldstein1;~Heng_Huang1;~Mohammad_Shoeybi1;~Bryan_Catanzaro1;~LICHANG_CHEN1", "aff": "NVIDIA;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;NVIDIA;NVIDIA;Department of Computer Science, University of Maryland, College Park", "aff_domain": "nvidia.com;umd.edu;umd.edu;umd.edu;umd.edu;cs.umd.edu;nvidia.com;nvidia.com;cs.umd.edu", "position": "Research Scientist;PhD student;PhD student;Assistant Professor;Full Professor;Full Professor;Director of Applied Resesrch;Vice President;PhD student", "bibtex": "@inproceedings{\nchen2024odin,\ntitle={{ODIN}: Disentangled Reward Mitigates Hacking in {RLHF}},\nauthor={Lichang Chen and Chen Zhu and Jiuhai Chen and Davit 
Soselia and Tianyi Zhou and Tom Goldstein and Heng Huang and Mohammad Shoeybi and Bryan Catanzaro},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zcIV8OQFVF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1098168, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 9, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=151356864447242668&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "nvidia.com;umd.edu;umd.edu;umd.edu;umd.edu;cs.umd.edu;nvidia.com;nvidia.com;cs.umd.edu", "author_num": 9, "aff_unique_index": "0;1;1;1;1;2;0;0;2", "aff_unique_norm": "NVIDIA;University of Maryland;University of Maryland, College Park", "aff_unique_dep": "NVIDIA Corporation;;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;https://www.umd.edu;https://www.umd.edu", "aff_unique_abbr": "NVIDIA;UMD;UMD", "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "TIC-TAC: A Framework For Improved Covariance Estimation In Deep Heteroscedastic Regression", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32623", "id": "zdNTiTs5gU", "proceeding": "https://proceedings.mlr.press/v235/shukla24a.html", "pdf": "https://openreview.net/pdf?id=zdNTiTs5gU", "openreview": "https://openreview.net/forum?id=zdNTiTs5gU", "author_site": "Megh Shukla, Mathieu Salzmann, Alexandre Alahi", "tldr": "", "abstract": "Deep heteroscedastic regression involves jointly optimizing the mean and covariance of the predicted distribution using the negative log-likelihood. However, recent works show that this may result in sub-optimal convergence due to the challenges associated with covariance estimation. While the literature addresses this by proposing alternate formulations to mitigate the impact of the predicted covariance, we focus on improving the predicted covariance itself. We study two questions: (1) Does the predicted covariance truly capture the randomness of the predicted mean? (2) In the absence of supervision, how can we quantify the accuracy of covariance estimation? We address (1) with a _Taylor Induced Covariance (TIC)_, which captures the randomness of the predicted mean by incorporating its gradient and curvature through the second order Taylor polynomial. Furthermore, we tackle (2) by introducing a _Task Agnostic Correlations (TAC)_ metric, which combines the notion of correlations and absolute error to evaluate the covariance. We evaluate TIC-TAC across multiple experiments spanning synthetic and real-world datasets. Our results show that not only does TIC accurately learn the covariance, it additionally facilitates an improved convergence of the negative log-likelihood. 
Our code is available at [https://github.com/vita-epfl/TIC-TAC](https://github.com/vita-epfl/TIC-TAC)", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Megh Shukla;Mathieu Salzmann;Alexandre Alahi", "authorids": "~Megh_Shukla1;~Mathieu_Salzmann1;~Alexandre_Alahi3", "gender": "M;M;M", "homepage": "https://meghshukla.github.io;https://people.epfl.ch/mathieu.salzmann;https://vita.epfl.ch/", "dblp": "251/3159;18/4533;48/3455", "google_scholar": "1_yu5DAAAAAJ;https://scholar.google.ch/citations?user=n-B0jr4AAAAJ;UIhXQ64AAAAJ", "orcid": "0000-0001-7189-2278;;", "linkedin": "megh-shukla/;;", "or_profile": "~Megh_Shukla1;~Mathieu_Salzmann1;~Alexandre_Alahi3", "aff": "EPFL - EPF Lausanne;CSIRO;EPFL", "aff_domain": "epfl.ch;data61.csiro.au;epfl.ch", "position": "PhD student;Collaborator;Associate Professor", "bibtex": "@inproceedings{\nshukla2024tictac,\ntitle={{TIC}-{TAC}: A Framework For Improved Covariance Estimation In Deep Heteroscedastic Regression},\nauthor={Megh Shukla and Mathieu Salzmann and Alexandre Alahi},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zdNTiTs5gU}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5851023, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8388827144842703209&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "email": "epfl.ch;data61.csiro.au;epfl.ch", "author_num": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;Commonwealth Scientific and Industrial Research Organisation", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.csiro.au", "aff_unique_abbr": "EPFL;CSIRO", "aff_campus_unique_index": "0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;Australia" }, { "title": "Individualized Privacy Accounting via Subsampling with Applications in Combinatorial Optimization", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32622", "id": "zfmwAaB9Nw", "proceeding": "https://proceedings.mlr.press/v235/ghazi24a.html", "pdf": "https://openreview.net/pdf?id=zfmwAaB9Nw", "openreview": "https://openreview.net/forum?id=zfmwAaB9Nw", "author_site": "Badih Ghazi, Pritish Kamath, Ravi Kumar, Pasin Manurangsi, Adam Sealfon", "tldr": "", "abstract": "In this work, we give a new technique for analyzing individualized privacy accounting via the following simple observation: if an algorithm is one-sided add-DP, then its subsampled variant satisfies two-sided DP. From this, we obtain several improved algorithms for private combinatorial optimization problems, including decomposable submodular maximization and set cover. Our error guarantees are asymptotically tight and our algorithm satisfies pure-DP while previously known algorithms (Gupta et al., 2010; Chaturvedi et al., 2021) are approximate-DP. 
We also show an application of our technique beyond combinatorial optimization by giving a pure-DP algorithm for the shifting heavy hitter problem in a stream; previously, only an approximate-DP algorithm was known (Kaplan et al., 2021; Cohen & Lyu, 2023).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Badih Ghazi;Pritish Kamath;Ravi Kumar;Pasin Manurangsi;Adam Sealfon", "authorids": "~Badih_Ghazi1;~Pritish_Kamath2;~Ravi_Kumar1;~Pasin_Manurangsi2;~Adam_Sealfon1", "gender": ";M;M;M;", "homepage": "https://sites.google.com/view/badihghazi/home;https://pritishkamath.github.io/;https://sites.google.com/site/ravik53/;https://pasin30055.github.io/;https://asealfon.github.io/", "dblp": "125/2134;https://dblp.org/pers/k/Kamath:Pritish.html;k/RaviKumar.html;133/2059;150/6253", "google_scholar": "GBJLTN8AAAAJ;1JFARhUAAAAJ;J_XhIsgAAAAJ;35hM-PkAAAAJ;nrlhJMcAAAAJ", "orcid": ";;0000-0002-2203-2586;;", "linkedin": "badih-ghazi-608379132/;;ravi-kumar-a3a9631;;", "or_profile": "~Badih_Ghazi1;~Pritish_Kamath2;~Ravi_Kumar1;~Pasin_Manurangsi2;~Adam_Sealfon1", "aff": "Google;Google Research;Google;Google;Google", "aff_domain": "google.com;google.com;google.com;google.com;google.com", "position": "Researcher;Research Scientist;Research Scientist;Research Scientist;Researcher", "bibtex": "@inproceedings{\nghazi2024individualized,\ntitle={Individualized Privacy Accounting via Subsampling with Applications in Combinatorial Optimization},\nauthor={Badih Ghazi and Pritish Kamath and Ravi Kumar and Pasin Manurangsi and Adam Sealfon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zfmwAaB9Nw}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 489335, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=387210836198676526&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "email": "google.com;google.com;google.com;google.com;google.com", "author_num": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Conditionally-Conjugate Gaussian Process Factor Analysis for Spike Count Data via Data Augmentation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32621", "id": "zgiT3uxvCF", "proceeding": "https://proceedings.mlr.press/v235/nadew24a.html", "pdf": "https://openreview.net/pdf?id=zgiT3uxvCF", "openreview": "https://openreview.net/forum?id=zgiT3uxvCF", "author_site": "Yididiya Nadew, Xuhui Fan, Christopher J Quinn", "tldr": "", "abstract": "Gaussian process factor analysis (GPFA) is a latent variable modeling technique commonly used to identify smooth, low-dimensional latent trajectories underlying high-dimensional neural recordings. Specifically, researchers model spiking rates as Gaussian observations, resulting in tractable inference. Recently, GPFA has been extended to model spike count data. However, due to the non-conjugacy of the likelihood, the inference becomes intractable. Prior works rely on either black-box inference techniques, numerical integration or polynomial approximations of the likelihood to handle intractability. 
To overcome this challenge, we propose a conditionally-conjugate Gaussian process factor analysis (ccGPFA) resulting in both analytically and computationally tractable inference for modeling neural activity from spike count data. In particular, we develop a novel data augmentation based method that renders the model conditionally conjugate. Consequently, our model enjoys the advantage of simple closed-form updates using a variational EM algorithm. Furthermore, due to its conditional conjugacy, we show our model can be readily scaled using sparse Gaussian Processes and accelerated inference via natural gradients. To validate our method, we empirically demonstrate its efficacy through experiments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yididiya Y. Nadew;Xuhui Fan;Christopher John Quinn", "authorids": "~Yididiya_Y._Nadew1;~Xuhui_Fan1;~Christopher_John_Quinn1", "gender": "M;M;M", "homepage": "https://xuhuifan.github.io/;https://www.cs.iastate.edu/people/christopher-quinn;", "dblp": "117/4874;50/8822;339/0727", "google_scholar": "https://scholar.google.com.au/citations?user=NSc42eUAAAAJ;oXWIgXcAAAAJ;qfO0624AAAAJ", "orcid": "0000-0002-7558-7200;0000-0002-9053-1504;", "linkedin": ";;", "or_profile": "~Xuhui_Fan1;~Christopher_John_Quinn1;~Yididiya_Y_Nadew1", "aff": "Macquarie University;Iowa State University;Iowa State University", "aff_domain": "mq.edu.au;iastate.edu;iastate.edu", "position": "Lecturer;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nnadew2024conditionallyconjugate,\ntitle={Conditionally-Conjugate Gaussian Process Factor Analysis for Spike Count Data via Data Augmentation},\nauthor={Yididiya Y. Nadew and Xuhui Fan and Christopher John Quinn},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zgiT3uxvCF}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 553976, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AQfTbtsnVPMJ:scholar.google.com/&scioq=Conditionally-Conjugate+Gaussian+Process+Factor+Analysis+for+Spike+Count+Data+via+Data+Augmentation&hl=en&as_sdt=0,5", "gs_version_total": 9, "email": "mq.edu.au;iastate.edu;iastate.edu", "author_num": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Macquarie University;Iowa State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.mq.edu.au;https://www.iastate.edu", "aff_unique_abbr": "MQ;ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Australia;United States" }, { "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32620", "id": "zj7YuTE4t8", "proceeding": "https://proceedings.mlr.press/v235/du24e.html", "pdf": "https://openreview.net/pdf?id=zj7YuTE4t8", "openreview": "https://openreview.net/forum?id=zj7YuTE4t8", "author_site": "Yilun Du, Shuang Li, Antonio Torralba, Josh Tenenbaum, Igor Mordatch", "tldr": "", "abstract": "Large language models (LLMs) have demonstrated remarkable capabilities in language generation, understanding, and few-shot learning in recent years. An extensive body of work has explored how their performance may be further improved through the tools of prompting, ranging from verification, self-consistency, or intermediate scratchpads. 
In this paper, we present a complementary approach to improve language responses where multiple language model instances propose and debate their individual responses and reasoning processes over multiple rounds to arrive at a common final answer. Our findings indicate that this approach significantly enhances mathematical and strategic reasoning across a number of tasks. We also demonstrate that our approach improves the factual validity of generated content, reducing fallacious answers and hallucinations that contemporary models are prone to. Our approach may be directly applied to existing black-box models and uses an identical procedure and prompts for all tasks we investigate. Overall, our findings suggest that such a \"society of minds\" approach has the potential to significantly advance the capabilities of LLMs and pave the way for further breakthroughs in language generation and understanding.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yilun Du;Shuang Li;Antonio Torralba;Joshua B. Tenenbaum;Igor Mordatch", "authorids": "~Yilun_Du1;~Shuang_Li5;~Antonio_Torralba1;~Joshua_B._Tenenbaum1;~Igor_Mordatch4", "gender": ";;M;;", "homepage": "https://yilundu.github.io;;http://web.mit.edu/torralba/www//;;", "dblp": "204/4379;;t/AntonioBTorralba;t/JoshuaBTenenbaum;", "google_scholar": ";;https://scholar.google.com.tw/citations?user=8cxDHS4AAAAJ;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yilun_Du1;~Shuang_Li5;~Antonio_Torralba1;~Joshua_B._Tenenbaum1;~Igor_Mordatch4", "aff": "Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;", "aff_domain": "mit.edu;;mit.edu;mit.edu;", "position": "PhD student;;Full Professor;Professor;", "bibtex": "@inproceedings{\ndu2024improving,\ntitle={Improving Factuality and Reasoning in Language Models through Multiagent Debate},\nauthor={Yilun Du and Shuang Li and Antonio Torralba and Joshua B. Tenenbaum and Igor Mordatch},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zj7YuTE4t8}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1886161, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 620, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4306390936519288835&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "email": "mit.edu;;mit.edu;mit.edu;", "author_num": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Flexible Residual Binarization for Image Super-Resolution", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32619", "id": "zji9DLksTz", "proceeding": "https://proceedings.mlr.press/v235/zhang24bb.html", "pdf": "https://openreview.net/pdf?id=zji9DLksTz", "openreview": "https://openreview.net/forum?id=zji9DLksTz", "author_site": "Yulun Zhang, Haotong Qin, Zixiang Zhao, Xianglong Liu, Martin Danelljan, Fisher Yu", "tldr": "", "abstract": "Binarized image super-resolution (SR) has attracted much research attention due to its potential to drastically reduce parameters and operations. However, most binary SR works binarize network weights directly, which hinders high-frequency information extraction. 
Furthermore, since SR is a pixel-wise reconstruction task, binarization often results in heavy distortion of the representation content. To address these issues, we propose a flexible residual binarization (FRB) method for image SR. We first propose a second-order residual binarization (SRB) to counter the information loss caused by binarization. In addition to the primary weight binarization, we also binarize the reconstruction error, which is added as a residual term in the prediction. Furthermore, to narrow the representation content gap between the binarized and full-precision networks, we propose Distillation-guided Binarization Training (DBT). We uniformly align the contents of different bit widths by constructing a normalized attention form. Finally, we generalize our method by applying our FRB to binarize convolution and Transformer-based SR networks, resulting in two binary baselines: FRBC and FRBT. We conduct extensive experiments and comparisons with recent leading binarization methods. Our proposed baselines, FRBC and FRBT, achieve superior performance both quantitatively and visually. The code and model will be released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yulun Zhang;Haotong Qin;Zixiang Zhao;Xianglong Liu;Martin Danelljan;Fisher Yu", "authorids": "~Yulun_Zhang1;~Haotong_Qin1;~Zixiang_Zhao1;~Xianglong_Liu3;~Martin_Danelljan4;~Fisher_Yu2", "gender": "M;M;M;;M;M", "homepage": "http://yulunzhang.com/;https://htqin.github.io/;https://zhaozixiang1228.github.io/;;https://martin-danelljan.github.io/;https://www.yf.io/", "dblp": "166/2763-1.html;262/3626.html;65/5420;;151/8848;117/6314", "google_scholar": "ORmLjWoAAAAJ;mK6n-KgAAAAJ;tUv_X8cAAAAJ;;NCSSpMkAAAAJ;-XCiamcAAAAJ", "orcid": "0000-0002-2288-5079;;;;;", "linkedin": "yulun-zhang-1116b5b9/;;;;;", "or_profile": "~Yulun_Zhang1;~Haotong_Qin1;~Zixiang_Zhao1;~Xianglong_Liu3;~Martin_Danelljan4;~Fisher_Yu2", "aff": "Swiss Federal Institute of Technology;ETHZ - ETH Zurich;ETHZ - ETH Zurich;;ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;;vision.ee.ethz.ch;ethz.ch", "position": "Postdoc;Postdoc;Postdoc;;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024flexible,\ntitle={Flexible Residual Binarization for Image Super-Resolution},\nauthor={Yulun Zhang and Haotong Qin and Zixiang Zhao and Xianglong Liu and Martin Danelljan and Fisher Yu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zji9DLksTz}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 1836191, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2208874550807841978&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 5, "email": "ethz.ch;ethz.ch;ethz.ch;;vision.ee.ethz.ch;ethz.ch", "author_num": 6, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Swiss Federal Institute of Technology;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "ETH Zurich;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Don\u2019t Label Twice: Quantity Beats Quality when Comparing Binary Classifiers on a Budget", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32618", "id": "zkcya47Sq5", "proceeding": 
"https://proceedings.mlr.press/v235/dorner24a.html", "pdf": "https://openreview.net/pdf?id=zkcya47Sq5", "openreview": "https://openreview.net/forum?id=zkcya47Sq5", "author_site": "Florian Dorner, Moritz Hardt", "tldr": "", "abstract": "We study how to best spend a budget of noisy labels to compare the accuracy of two binary classifiers. It\u2019s common practice to collect and aggregate multiple noisy labels for a given data point into a less noisy label via a majority vote. We prove a theorem that runs counter to conventional wisdom. If the goal is to identify the better of two classifiers, we show it\u2019s best to spend the budget on collecting a single label for more samples. Our result follows from a non-trivial application of Cram\u00e9r\u2019s theorem, a staple in the theory of large deviations. We discuss the implications of our work for the design of machine learning benchmarks, where they overturn some time-honored recommendations. In addition, our results provide sample size bounds superior to what follows from Hoeffding\u2019s bound.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Florian E. Dorner;Moritz Hardt", "authorids": "~Florian_E._Dorner1;~Moritz_Hardt1", "gender": "M;Not Specified", "homepage": "https://flodorner.github.io/;http://mrtz.org/", "dblp": "285/5327;26/4683", "google_scholar": "aYHq31IAAAAJ;adnTgaAAAAAJ", "orcid": ";", "linkedin": "florian-dorner-242b48172/;", "or_profile": "~Florian_E._Dorner1;~Moritz_Hardt1", "aff": "ETHZ - ETH Zurich;Max-Planck-Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "ethz.ch;is.mpg.de", "position": "PhD student;Principal Researcher", "bibtex": "@inproceedings{\ndorner2024dont,\ntitle={Don{\\textquoteright}t Label Twice: Quantity Beats Quality when Comparing Binary Classifiers on a Budget},\nauthor={Florian E. Dorner and Moritz Hardt},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zkcya47Sq5}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 511353, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8691906839514762273&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "email": "ethz.ch;is.mpg.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "ETH Zurich;Max-Planck-Institute for Intelligent Systems", "aff_unique_dep": ";Intelligent Systems", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ETHZ;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "Trustworthy Actionable Perturbations", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32617", "id": "zkjGpZrIX3", "proceeding": "https://proceedings.mlr.press/v235/friedbaum24a.html", "pdf": "https://openreview.net/pdf?id=zkjGpZrIX3", "openreview": "https://openreview.net/forum?id=zkjGpZrIX3", "author_site": "Jesse Friedbaum, Sudarshan Adiga, Ravi Tandon", "tldr": "", "abstract": "*Counterfactuals*, or modified inputs that lead to a different outcome, are an important tool for understanding the logic used by machine learning classifiers and how to change an undesirable classification. Even if a counterfactual changes a classifier's decision, however, it may not affect the true underlying class probabilities, i.e. 
the counterfactual may act like an adversarial attack and \"fool\" the classifier. We propose a new framework for creating modified inputs that change the true underlying probabilities in a beneficial way, which we call *Trustworthy Actionable Perturbations* (TAP). This includes a novel verification procedure to ensure that TAP change the true class probabilities instead of acting adversarially. Our framework also includes new cost, reward, and goal definitions that are better suited to effectuating change in the real world. We present PAC-learnability results for our verification procedure and theoretically analyze our new method for measuring reward. We also develop a methodology for creating TAP and compare our results to those achieved by previous counterfactual methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jesse Friedbaum;Sudarshan Adiga;Ravi Tandon", "authorids": "~Jesse_Friedbaum1;~Sudarshan_Adiga2;~Ravi_Tandon1", "gender": "M;M;", "homepage": "https://sites.google.com/math.arizona.edu/friedbaum;https://www2.engr.arizona.edu/~adiga/;https://tandonravi.github.io/", "dblp": ";;19/543.html", "google_scholar": ";https://scholar.google.co.in/citations?user=YfYMcrkAAAAJ;u-lTHjcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Jesse_Friedbaum1;~Sudarshan_Adiga2;~Ravi_Tandon1", "aff": "University of Arizona;University of Arizona;University of Arizona", "aff_domain": "arizona.edu;arizona.edu;arizona.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nfriedbaum2024trustworthy,\ntitle={Trustworthy Actionable Perturbations},\nauthor={Jesse Friedbaum and Sudarshan Adiga and Ravi Tandon},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zkjGpZrIX3}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 5868432, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 3, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11005339962093650367&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "email": "arizona.edu;arizona.edu;arizona.edu", "author_num": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Arizona", "aff_unique_dep": "", "aff_unique_url": "https://www.arizona.edu", "aff_unique_abbr": "UA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "S3GCL: Spectral, Swift, Spatial Graph Contrastive Learning", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32616", "id": "znKAWRZSF9", "proceeding": "https://proceedings.mlr.press/v235/wan24g.html", "pdf": "https://openreview.net/pdf?id=znKAWRZSF9", "openreview": "https://openreview.net/forum?id=znKAWRZSF9", "author_site": "Guancheng Wan, Yijun Tian, Wenke Huang, Nitesh Chawla, Mang Ye", "tldr": "", "abstract": "Graph Contrastive Learning (GCL) has emerged as a highly effective self-supervised approach in graph representation learning. However, prevailing GCL methods confront two primary challenges: 1) They predominantly operate under homophily assumptions, focusing on low-frequency signals in node features while neglecting heterophilic edges that connect nodes with dissimilar features. 2) Their reliance on neighborhood aggregation for inference leads to scalability challenges and hinders deployment in real-time applications.
In this paper, we introduce S3GCL, an innovative framework designed to tackle these challenges. Inspired by spectral GNNs, we initially demonstrate the correlation between frequency and homophily levels. Then, we propose a novel cosine-parameterized Chebyshev polynomial as low/high-pass filters to generate biased graph views. To resolve the inference dilemma, we incorporate an MLP encoder and enhance its awareness of graph context by introducing structurally and semantically neighboring nodes as positive pairs in the spatial domain. Finally, we formulate a cross-pass GCL objective between full-pass MLP and biased-pass GNN filtered features, eliminating the need for augmentation. Extensive experiments on real-world tasks validate S3GCL's proficiency in generalizing to diverse homophily levels and its superior inference efficiency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guancheng Wan;Yijun Tian;Wenke Huang;Nitesh V Chawla;Mang Ye", "authorids": "~Guancheng_Wan1;~Yijun_Tian1;~Wenke_Huang1;~Nitesh_V_Chawla1;~Mang_Ye1", "gender": ";;M;;M", "homepage": ";https://www.yijuntian.com/;https://wenkehuang.github.io/;;https://marswhu.github.io/", "dblp": ";234/9123-1;330/1664;;156/0610", "google_scholar": ";dbaBgV0AAAAJ;https://scholar.google.com/citations?hl=zh-CN;;j-HxRy0AAAAJ", "orcid": ";0000-0003-2795-6080;0000-0003-4819-293X;;0000-0003-3989-7655", "linkedin": ";yijun-tian/;;;", "or_profile": "~Guancheng_Wan1;~Yijun_Tian1;~Wenke_Huang1;~Nitesh_V_Chawla1;~Mang_Ye1", "aff": ";University of Notre Dame;Wuhan University;;Wuhan University", "aff_domain": ";nd.edu;whu.edu.cn;;whu.edu.cn", "position": ";PhD student;PhD student;;Professor", "bibtex": "@inproceedings{\nwan2024sgcl,\ntitle={S3{GCL}: Spectral, Swift, Spatial Graph Contrastive Learning},\nauthor={Guancheng Wan and Yijun Tian and Wenke Huang and Nitesh V Chawla and Mang Ye},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=znKAWRZSF9}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 6090638, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7000914117643966597&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 4, "email": ";nd.edu;whu.edu.cn;;whu.edu.cn", "author_num": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Notre Dame;Wuhan University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nd.edu;http://www.whu.edu.cn/", "aff_unique_abbr": "Notre Dame;WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;China" }, { "title": "Bias of Stochastic Gradient Descent or the Architecture: Disentangling the Effects of Overparameterization of Neural Networks", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32615", "id": "znz261CQK7", "proceeding": "https://proceedings.mlr.press/v235/peleg24a.html", "pdf": "https://openreview.net/pdf?id=znz261CQK7", "openreview": "https://openreview.net/forum?id=znz261CQK7", "author_site": "Amit Peleg, Matthias Hein", "tldr": "", "abstract": "Neural networks typically generalize well when fitting the data perfectly, even though they are heavily overparameterized.
Many factors have been pointed out as the reason for this phenomenon, including an implicit bias of stochastic gradient descent (SGD) and a possible simplicity bias arising from the neural network architecture. The goal of this paper is to disentangle the factors that influence generalization stemming from optimization and architectural choices by studying *random* and *SGD-optimized* networks that achieve zero training error. We experimentally show, in the low sample regime, that overparameterization in terms of increasing width is beneficial for generalization, and this benefit is due to the bias of SGD and not due to an architectural bias. In contrast, for increasing depth, overparameterization is detrimental to generalization, but random and SGD-optimized networks behave similarly, so this can be attributed to an architectural bias.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Amit Peleg;Matthias Hein", "authorids": "~Amit_Peleg1;~Matthias_Hein2", "gender": ";M", "homepage": ";https://uni-tuebingen.de/de/164260", "dblp": "297/5541;97/1213-1", "google_scholar": ";0ZAb3tsAAAAJ", "orcid": ";", "linkedin": "amit-peleg-4bb506212/;", "or_profile": "~Amit_Peleg1;~Matthias_Hein2", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of T\u00fcbingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\npeleg2024bias,\ntitle={Bias of Stochastic Gradient Descent or the Architecture: Disentangling the Effects of Overparameterization of Neural Networks},\nauthor={Amit Peleg and Matthias Hein},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=znz261CQK7}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 8752048, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DT1u8Rkx4DAJ:scholar.google.com/&scioq=Bias+of+Stochastic+Gradient+Descent+or+the+Architecture:+Disentangling+the+Effects+of+Overparameterization+of+Neural+Networks&hl=en&as_sdt=0,47", "gs_version_total": 7, "email": "uni-tuebingen.de;uni-tuebingen.de", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;University of T\u00fcbingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Rethinking Independent Cross-Entropy Loss For Graph-Structured Data", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32614", "id": "zrQIc9mQQN", "proceeding": "https://proceedings.mlr.press/v235/miao24c.html", "pdf": "https://openreview.net/pdf?id=zrQIc9mQQN", "openreview": "https://openreview.net/forum?id=zrQIc9mQQN", "author_site": "Rui Miao, Kaixiong Zhou, Yili Wang, Ninghao Liu, Ying Wang, Xin Wang", "tldr": "", "abstract": "Graph neural networks (GNNs) have exhibited prominent performance in learning graph-structured data. Considering the node classification task, based on the i.i.d. assumption among node labels, traditional supervised learning simply sums up the cross-entropy losses of the independent training nodes and applies the average loss to optimize the GNNs' weights.
But unlike other data formats, the nodes are naturally connected. It is found that the independent distribution modeling of node labels restricts GNNs' capability to generalize over the entire graph and defend against adversarial attacks. In this work, we propose a new framework, termed joint-cluster supervised learning, to model the joint distribution of each node with its corresponding cluster. We learn the joint distribution of node and cluster labels conditioned on their representations, and train GNNs with the obtained joint loss. In this way, the data-label reference signals extracted from the local cluster explicitly strengthen the discrimination ability on the target node. Extensive experiments demonstrate that our joint-cluster supervised learning can effectively bolster GNNs' node classification accuracy. Furthermore, benefiting from the reference signals, which may be free from malicious interference, our learning paradigm significantly protects node classification from being affected by adversarial attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rui Miao;Kaixiong Zhou;Yili Wang;Ninghao Liu;Ying Wang;Xin Wang", "authorids": "~Rui_Miao2;~Kaixiong_Zhou1;~Yili_Wang2;~Ninghao_Liu2;~Ying_Wang13;~Xin_Wang54", "gender": "M;M;M;F;M;M", "homepage": ";https://kaixiong-zhou.github.io/;https://yl-wang.github.io//;https://ccst.jlu.edu.cn/info/1367/19675.htm;https://xinwangjlu.github.io/;https://cobweb.cs.uga.edu/~ninghaoliu/", "dblp": "62/675-3;178/7315;48/6261-4;94/3104-9.html;10/5630-35;145/4489", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;zMspIjIAAAAJ;https://scholar.google.com/citations?view_op=list_works;;https://scholar.google.com.hk/citations?user=Eh9n5VAAAAAJ;Nir-EDYAAAAJ", "orcid": "0000-0002-2917-2311;0000-0001-5226-8736;0000-0003-0845-9521;0000-0002-3288-5195;0000-0001-9448-7689;0000-0002-9170-2424", "linkedin": ";;;;;", "or_profile": "~Rui_Miao2;~Kaixiong_Zhou1;~Yili_Wang2;~Ying_Wang13;~Xin_Wang54;~Ninghao_Liu1", "aff": "Jilin University;Massachusetts Institute of Technology;Jilin University;Jilin University;Jilin University;University of Georgia", "aff_domain": "jlu.edu.cn;mit.edu;jlu.edu.cn;jlu.edu.cn;jlu.edu.cn;uga.edu", "position": "PhD student;Postdoc;PhD student;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nmiao2024rethinking,\ntitle={Rethinking Independent Cross-Entropy Loss For Graph-Structured Data},\nauthor={Rui Miao and Kaixiong Zhou and Yili Wang and Ninghao Liu and Ying Wang and Xin Wang},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zrQIc9mQQN}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 876111, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13743410795862586802&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 7, "email": "jlu.edu.cn;mit.edu;jlu.edu.cn;jlu.edu.cn;jlu.edu.cn;uga.edu", "author_num": 6, "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "Jilin University;Massachusetts Institute of Technology;University of Georgia", "aff_unique_dep": ";;", "aff_unique_url": "http://www.jlu.edu.cn;https://web.mit.edu;https://www.uga.edu", "aff_unique_abbr": "JLU;MIT;UGA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Transformers are SSMs:
Generalized Models and Efficient Algorithms Through Structured State Space Duality", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32613", "id": "ztn8FCR1td", "proceeding": "https://proceedings.mlr.press/v235/dao24a.html", "pdf": "https://openreview.net/pdf?id=ztn8FCR1td", "openreview": "https://openreview.net/forum?id=ztn8FCR1td", "author_site": "Tri Dao, Albert Gu", "tldr": "", "abstract": "While Transformers have been the main architecture behind deep learning's success in language modeling, state-space models (SSMs) such as Mamba have recently been shown to match or outperform Transformers at small to medium scale. We show that these families of models are actually quite closely related, and develop a rich framework of theoretical connections between SSMs and variants of attention, connected through various decompositions of a well-studied class of structured *semiseparable matrices*. Our state space duality (SSD) framework allows us to design a new architecture (**Mamba-2**) whose core layer is a refinement of Mamba's selective SSM that is 2-8$\\times$ faster, while continuing to be competitive with Transformers on language modeling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tri Dao;Albert Gu", "authorids": "~Tri_Dao1;~Albert_Gu1", "gender": ";M", "homepage": "https://tridao.me/;", "dblp": "206/7018;130/0612", "google_scholar": "NQRw0bQAAAAJ;DVCHv1kAAAAJ", "orcid": ";0000-0002-4946-6042", "linkedin": ";", "or_profile": "~Tri_Dao1;~Albert_Gu1", "aff": "Princeton University;Carnegie Mellon University", "aff_domain": "princeton.edu;cmu.edu", "position": "Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ndao2024transformers,\ntitle={Transformers are {SSM}s: Generalized Models and Efficient Algorithms Through Structured State Space Duality},\nauthor={Tri Dao and Albert Gu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ztn8FCR1td}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 2120465, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 2, "gs_citation": 487, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16362672365556250342&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "email": "princeton.edu;cmu.edu", "author_num": 2, "aff_unique_index": "0;1", "aff_unique_norm": "Princeton University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu", "aff_unique_abbr": "Princeton;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Understanding Server-Assisted Federated Learning in the Presence of Incomplete Client Participation", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/35294", "id": "zwUEk9WpsR", "proceeding": "https://proceedings.mlr.press/v235/yang24r.html", "pdf": "https://openreview.net/pdf?id=zwUEk9WpsR", "openreview": "https://openreview.net/forum?id=zwUEk9WpsR", "author_site": "Haibo Yang, Peiwen Qiu, Prashant Khanduri, Minghong Fang, Jia (Kevin) Liu", "tldr": "", "abstract": "Existing works in federated learning (FL) often assume either full client or uniformly distributed client participation. However, in reality, some clients may never participate in FL training (aka incomplete client participation) due to various system heterogeneity factors.
A popular solution is the server-assisted federated learning (SA-FL) framework, where the server uses an auxiliary dataset. Despite empirical evidence of SA-FL's effectiveness in addressing incomplete client participation, a theoretical understanding of SA-FL is lacking. Furthermore, the effects of incomplete client participation in conventional FL are poorly understood. This motivates us to rigorously investigate SA-FL. Toward this end, we first show that conventional FL is *not* PAC-learnable under incomplete client participation in the worst case. Then, we show that the PAC-learnability of FL with incomplete client participation can indeed be revived by SA-FL, which theoretically justifies the use of SA-FL for the first time. Lastly, to provide practical guidance for SA-FL training under *incomplete client participation*, we propose the SAFARI (server-assisted federated averaging) algorithm that enjoys the same linear convergence speedup guarantees as classic FL with ideal client participation assumptions, offering the first SA-FL algorithm with a convergence guarantee. Extensive experiments on different datasets show that SAFARI significantly improves performance under incomplete client participation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haibo Yang;Peiwen Qiu;Prashant Khanduri;Minghong Fang;Jia Liu", "authorids": "~Haibo_Yang1;~Peiwen_Qiu1;~Prashant_Khanduri1;~Minghong_Fang1;~Jia_Liu1", "gender": "M;F;M;M;M", "homepage": "https://haibo-yang-osu.github.io/homepage/;;https://sites.google.com/view/khanduri-prashant/home?authuser=0;https://minghongfang.com/;https://kevinliu-osu.github.io/index.html", "dblp": "43/7829-1;287/6757;158/4888;157/0863;", "google_scholar": "eyy22VoAAAAJ;LzaQe5sAAAAJ;;L6vkkC8AAAAJ;Ofx3dScAAAAJ", "orcid": "0000-0002-3245-2728;;;0000-0002-1365-3911;", "linkedin": ";peiwen-qiu/;prashant-khanduri-0497894b/;;", "or_profile": "~Haibo_Yang1;~Peiwen_Qiu1;~Prashant_Khanduri1;~Minghong_Fang1;~Jia_Liu1", "aff": "Rochester Institute of Technology;Ohio State University, Columbus;Wayne State University;Duke University;The Ohio State University", "aff_domain": "rit.edu;osu.edu;wayne.edu;duke.edu;osu.edu", "position": "Assistant Professor;PhD student;Assistant Professor;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nyang2024understanding,\ntitle={Understanding Server-Assisted Federated Learning in the Presence of Incomplete Client Participation},\nauthor={Haibo Yang and Peiwen Qiu and Prashant Khanduri and Minghong Fang and Jia Liu},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zwUEk9WpsR}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 631407, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8723411176483585880&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "email": "rit.edu;osu.edu;wayne.edu;duke.edu;osu.edu", "author_num": 5, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Rochester Institute of Technology;Ohio State University;Wayne State University;Duke University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.rit.edu;https://www.osu.edu;https://wayne.edu;https://www.duke.edu", "aff_unique_abbr": "RIT;OSU;WSU;Duke", "aff_campus_unique_index": "1", "aff_campus_unique": ";Columbus", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Graph is Worth $K$ Words: Euclideanizing
Graph using Pure Transformer", "status": "Poster", "track": "main", "site": "https://icml.cc/virtual/2024/poster/32612", "id": "zxxSJAVQPc", "proceeding": "https://proceedings.mlr.press/v235/gao24e.html", "pdf": "https://openreview.net/pdf?id=zxxSJAVQPc", "openreview": "https://openreview.net/forum?id=zxxSJAVQPc", "author_site": "Zhangyang Gao, Daize Dong, Cheng Tan, Jun Xia, Bozhen Hu, Stan Z Li", "tldr": "", "abstract": "Can we model Non-Euclidean graphs as pure language or even Euclidean vectors while retaining their inherent information? The Non-Euclidean property has posed a long-term challenge in graph modeling. Despite recent efforts of graph neural networks and graph transformers to encode graphs as Euclidean vectors, recovering the original graph from those vectors remains a challenge. In this paper, we introduce GraphsGPT, featuring a Graph2Seq encoder that transforms Non-Euclidean graphs into learnable Graph Words in the Euclidean space, along with a GraphGPT decoder that reconstructs the original graph from Graph Words to ensure information equivalence. We pretrain GraphsGPT on $100$M molecules and yield some interesting findings: (1) The pretrained Graph2Seq excels in graph representation learning, achieving state-of-the-art results on $8/9$ graph classification and regression tasks. (2) The pretrained GraphGPT serves as a strong graph generator, demonstrated by its strong ability to perform both few-shot and conditional graph generation. (3) Graph2Seq+GraphGPT enables effective graph mixup in the Euclidean space, overcoming previously known Non-Euclidean challenges. (4) The edge-centric pretraining framework GraphsGPT demonstrates its efficacy in graph domain tasks, excelling in both representation and generation. Code is available at https://github.com/A4Bio/GraphsGPT.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhangyang Gao;Daize Dong;Cheng Tan;Jun Xia;Bozhen Hu;Stan Z. Li", "authorids": "~Zhangyang_Gao1;~Daize_Dong1;~Cheng_Tan1;~Jun_Xia1;~Bozhen_Hu1;~Stan_Z._Li2", "gender": "M;;M;M;M;", "homepage": ";;https://chengtan9907.github.io/;http://junxia97.github.io/;;", "dblp": "275/3266;;70/1533-12.html;;279/8665;", "google_scholar": "4SclT-QAAAAJ;;6kTV6aMAAAAJ;aPKKpSYAAAAJ;https://scholar.google.com/citations?hl=zh-CN;", "orcid": "0000-0003-1026-6083;;;;0000-0002-3428-0114;", "linkedin": ";;;;;", "or_profile": "~Zhangyang_Gao1;~Daize_Dong1;~Cheng_Tan1;~Jun_Xia1;~Bozhen_Hu1;~Stan_Z._Li2", "aff": "Westlake University, China;;Zhejiang University & Westlake University;Westlake University, China;Westlake University;", "aff_domain": "westlake.edu.cn;;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;", "position": "PhD student;;PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\ngao2024a,\ntitle={A Graph is Worth \\$K\\$ Words: Euclideanizing Graph using Pure Transformer},\nauthor={Zhangyang Gao and Daize Dong and Cheng Tan and Jun Xia and Bozhen Hu and Stan Z.
Li},\nbooktitle={Forty-first International Conference on Machine Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zxxSJAVQPc}\n}", "github": "", "project": "", "reviewers": "", "pdf_size": 3065159, "rating": "", "rating_avg": 0, "replies_avg": 0, "authors#_avg": 6, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16183370446849029969&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 7, "email": "westlake.edu.cn;;westlake.edu.cn;westlake.edu.cn;westlake.edu.cn;", "author_num": 6, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Westlake University;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.westlake.edu.cn;http://www.zju.edu.cn", "aff_unique_abbr": "WU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" } ]